diff options
| author | Yigit Sever | 2019-09-16 20:04:39 +0300 |
|---|---|---|
| committer | Yigit Sever | 2019-09-16 20:04:39 +0300 |
| commit | c65b25db3bdd7fb9e32c9fa252195e2d24bbffff (patch) | |
| tree | 93fd21eb51d1665955083cbbb603ae8267d8e304 | |
| parent | 970c1010a6edaab8d9cf72f38e0a497e0d7d2415 (diff) | |
| download | Evaluating-Dictionary-Alignment-c65b25db3bdd7fb9e32c9fa252195e2d24bbffff.tar.gz Evaluating-Dictionary-Alignment-c65b25db3bdd7fb9e32c9fa252195e2d24bbffff.tar.bz2 Evaluating-Dictionary-Alignment-c65b25db3bdd7fb9e32c9fa252195e2d24bbffff.zip | |
Initial get_data and get_embedding scripts
| -rwxr-xr-x | get_data.sh | 65 | ||||
| -rwxr-xr-x | get_embeddings.sh | 43 |
2 files changed, 108 insertions, 0 deletions
diff --git a/get_data.sh b/get_data.sh new file mode 100755 index 0000000..f6298b1 --- /dev/null +++ b/get_data.sh | |||
| @@ -0,0 +1,65 @@ | |||
| 1 | #!/bin/bash | ||
| 2 | # | ||
| 3 | # Copyright © 2019 Yiğit Sever <yigit.sever@tedu.edu.tr> | ||
| 4 | # | ||
| 5 | # Permission is hereby granted, free of charge, to any person obtaining | ||
| 6 | # a copy of this software and associated documentation files (the "Software"), | ||
| 7 | # to deal in the Software without restriction, including without limitation | ||
| 8 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, | ||
| 9 | # and/or sell copies of the Software, and to permit persons to whom the | ||
| 10 | # Software is furnished to do so, subject to the following conditions: | ||
| 11 | # | ||
| 12 | # The above copyright notice and this permission notice shall be included | ||
| 13 | # in all copies or substantial portions of the Software. | ||
| 14 | # | ||
| 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
| 16 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES | ||
| 17 | # OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. | ||
| 18 | # IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, | ||
| 19 | # DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, | ||
| 20 | # TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE | ||
| 21 | # OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | ||
| 22 | # | ||
| 23 | # | ||
| 24 | |||
| 25 | ROOT="$(pwd)" # every output directory is created under the directory the script is run from | ||
| 26 | WNET="${ROOT}/data" | ||
| 27 | mkdir -p "${WNET}" | ||
| 28 | |||
| 29 | echo "Downloading wordnet data" | ||
| 30 | # wget flags: -nc skips files that already exist, -q silences output, -P sets the target directory | ||
| 31 | wget -nc -q http://compling.hss.ntu.edu.sg/omw/wns/als.zip -P "${WNET}" | ||
| 32 | wget -nc -q http://compling.hss.ntu.edu.sg/omw/wns/bul.zip -P "${WNET}" | ||
| 33 | wget -nc -q http://compling.hss.ntu.edu.sg/omw/wns/ell.zip -P "${WNET}" | ||
| 34 | wget -nc -q http://compling.hss.ntu.edu.sg/omw/wns/ita.zip -P "${WNET}" | ||
| 35 | wget -nc -q http://compling.hss.ntu.edu.sg/omw/wns/ron.zip -P "${WNET}" | ||
| 36 | wget -nc -q http://compling.hss.ntu.edu.sg/omw/wns/slv.zip -P "${WNET}" | ||
| 37 | |||
| 38 | echo "Unzipping wordnet data" | ||
| 39 | # The loop variable is deliberately not called LANG: assigning LANG would change the locale seen by unzip and rm | ||
| 40 | for wn in als bul ell ita ron slv; do | ||
| 41 | unzip -oq "${WNET}/${wn}.zip" -d "${WNET}" # -o overwrites without prompting; -f (freshen) would extract nothing into an empty directory | ||
| 42 | rm -f "${WNET}/${wn}.zip" | ||
| 43 | done | ||
| 44 | |||
| 45 | rm -rf "${WNET}/ita" # comes alongside iwn, not useful for us | ||
| 46 | |||
| 47 | echo "Downloading dictionaries" | ||
| 48 | |||
| 49 | DICT="${ROOT}/dictionaries" | ||
| 50 | mkdir -p "${DICT}" | ||
| 51 | wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/en-sq.dic.gz -P "${DICT}" # English - Albanian | ||
| 52 | wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/bg-en.dic.gz -P "${DICT}" # Bulgarian - English | ||
| 53 | wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/el-en.dic.gz -P "${DICT}" # Greek - English | ||
| 54 | wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/en-it.dic.gz -P "${DICT}" # English - Italian | ||
| 55 | wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/en-ro.dic.gz -P "${DICT}" # English - Romanian | ||
| 56 | wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/en-sl.dic.gz -P "${DICT}" # English - Slovenian | ||
| 57 | wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/bg-el.dic.gz -P "${DICT}" # Bulgarian - Greek | ||
| 58 | wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/bg-it.dic.gz -P "${DICT}" # Bulgarian - Italian | ||
| 59 | wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/bg-ro.dic.gz -P "${DICT}" # Bulgarian - Romanian | ||
| 60 | wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/el-it.dic.gz -P "${DICT}" # Greek - Italian | ||
| 61 | wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/el-ro.dic.gz -P "${DICT}" # Greek - Romanian | ||
| 62 | wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/el-sq.dic.gz -P "${DICT}" # Greek - Albanian | ||
| 63 | wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/it-ro.dic.gz -P "${DICT}" # Italian - Romanian | ||
| 64 | wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/ro-sl.dic.gz -P "${DICT}" # Romanian - Slovenian | ||
| 65 | wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/ro-sq.dic.gz -P "${DICT}" # Romanian - Albanian | ||
diff --git a/get_embeddings.sh b/get_embeddings.sh new file mode 100755 index 0000000..0c5d918 --- /dev/null +++ b/get_embeddings.sh | |||
| @@ -0,0 +1,43 @@ | |||
| 1 | #!/bin/bash | ||
| 2 | # | ||
| 3 | # Copyright © 2019 Yiğit Sever <yigit.sever@tedu.edu.tr> | ||
| 4 | # | ||
| 5 | # Permission is hereby granted, free of charge, to any person obtaining | ||
| 6 | # a copy of this software and associated documentation files (the "Software"), | ||
| 7 | # to deal in the Software without restriction, including without limitation | ||
| 8 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, | ||
| 9 | # and/or sell copies of the Software, and to permit persons to whom the | ||
| 10 | # Software is furnished to do so, subject to the following conditions: | ||
| 11 | # | ||
| 12 | # The above copyright notice and this permission notice shall be included | ||
| 13 | # in all copies or substantial portions of the Software. | ||
| 14 | # | ||
| 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
| 16 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES | ||
| 17 | # OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. | ||
| 18 | # IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, | ||
| 19 | # DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, | ||
| 20 | # TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE | ||
| 21 | # OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | ||
| 22 | # | ||
| 23 | |||
| 24 | ROOT="$(pwd)" # the embeddings directory is created under the directory the script is run from | ||
| 25 | EMBS="${ROOT}/embeddings" | ||
| 26 | mkdir -p "${EMBS}" | ||
| 27 | |||
| 28 | echo "Downloading embeddings" | ||
| 29 | # wget flags: -nc skips files that already exist, -q silences output, -P sets the target directory | ||
| 30 | wget -nc -q https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sq.300.vec.gz -P "${EMBS}" # Albanian | ||
| 31 | wget -nc -q https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bg.300.vec.gz -P "${EMBS}" # Bulgarian | ||
| 32 | wget -nc -q https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip -P "${EMBS}" # English (zip archive; not unpacked by the loop below) | ||
| 33 | wget -nc -q https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.el.300.vec.gz -P "${EMBS}" # Greek | ||
| 34 | wget -nc -q https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.it.300.vec.gz -P "${EMBS}" # Italian | ||
| 35 | wget -nc -q https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ro.300.vec.gz -P "${EMBS}" # Romanian | ||
| 36 | wget -nc -q https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sl.300.vec.gz -P "${EMBS}" # Slovenian | ||
| 37 | |||
| 38 | echo "Extracting embeddings" | ||
| 39 | # Decompress each cc.<code>.300.vec.gz into <code>.1M.vec, then delete the archive | ||
| 40 | for code in sq bg el it ro sl; do # not named LANG: assigning LANG would change the locale seen by gunzip and rm | ||
| 41 | gunzip -fc "${EMBS}/cc.${code}.300.vec.gz" > "${EMBS}/${code}.1M.vec" | ||
| 42 | rm -f "${EMBS}/cc.${code}.300.vec.gz" | ||
| 43 | done | ||
