diff options
| -rw-r--r-- | demo.sh | 85 | ||||
| -rwxr-xr-x | get_data.sh | 1 | ||||
| -rwxr-xr-x | get_embeddings.sh | 7 |
3 files changed, 90 insertions, 3 deletions
| @@ -0,0 +1,85 @@ | |||
| 1 | #!/bin/bash | ||
| 2 | # | ||
| 3 | # Copyright © 2019 Yiğit Sever <yigit.sever@tedu.edu.tr> | ||
| 4 | # | ||
| 5 | # Permission is hereby granted, free of charge, to any person obtaining | ||
| 6 | # a copy of this software and associated documentation files (the "Software"), | ||
| 7 | # to deal in the Software without restriction, including without limitation | ||
| 8 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, | ||
| 9 | # and/or sell copies of the Software, and to permit persons to whom the | ||
| 10 | # Software is furnished to do so, subject to the following conditions: | ||
| 11 | # | ||
| 12 | # The above copyright notice and this permission notice shall be included | ||
| 13 | # in all copies or substantial portions of the Software. | ||
| 14 | # | ||
| 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
| 16 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES | ||
| 17 | # OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. | ||
| 18 | # IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, | ||
| 19 | # DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, | ||
| 20 | # TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE | ||
| 21 | # OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | ||
| 22 | # | ||
| 23 | # | ||
| 24 | |||
# Demo pipeline for the English (en) / Bulgarian (bg) pair. Run it from the
# repository root: every path below is derived from the current directory.
#
# Steps:
#   1. Download the Bulgarian wordnet and run the preparation scripts.
#   2. Download the bg-en OpenSubtitles dictionary and build train/test files.
#   3. Download the fastText embeddings and trim them to 500000 vectors each.
#   4. Map both embedding spaces with VecMap in supervised mode.

# Abort on the first failing command, on a failing pipeline stage and on any
# use of an unset variable, instead of carrying on with missing inputs.
set -o errexit -o pipefail -o nounset

ROOT="$(pwd)"
SCRIPTS="${ROOT}/scripts"
WNET="${ROOT}/wordnets"
EMBS="${ROOT}/embeddings"
DICT="${ROOT}/dictionaries"
MAP_TO="${ROOT}/bilingual_embeddings"

TRAIN_DIR="${DICT}/train"
TEST_DIR="${DICT}/test"

TAB_DIR="${WNET}/tab_files"
READY="${WNET}/ready"

# --- 1. Wordnet -------------------------------------------------------------
mkdir -p "${WNET}"

wget -nc -q http://compling.hss.ntu.edu.sg/omw/wns/bul.zip -P "${WNET}"
unzip -o -q "${WNET}/bul.zip" -d "${WNET}"

mkdir -p "${TAB_DIR}"
"${SCRIPTS}/tab_creator.pl" "${WNET}/bul/wn-data-bul.tab" "${TAB_DIR}"

python "${SCRIPTS}/prep_lookup.py" -s "en" -t "bg"

# Collect the *.def files found in the repository root (presumably written
# there by prep_lookup.py -- confirm against that script).
mkdir -p "${READY}"
mv "${ROOT}"/*.def "${READY}"

# --- 2. Dictionaries --------------------------------------------------------
mkdir -p "${DICT}"

wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/bg-en.dic.gz -P "${DICT}" # Bulgarian - English
gunzip -q "${DICT}/bg-en.dic.gz"

export LC_CTYPE=en_US.UTF-8
export LC_ALL=en_US.UTF-8

perl "${SCRIPTS}/train_dic_creator.pl" "en" "bg" "${DICT}"

mkdir -p "${TRAIN_DIR}"
mkdir -p "${TEST_DIR}"

mv "${DICT}"/*.train "${TRAIN_DIR}"
mv "${DICT}"/*.test "${TEST_DIR}"
rm -f "${DICT}"/*.dic

# --- 3. Monolingual embeddings ----------------------------------------------
mkdir -p "${EMBS}"

wget -nc -q https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bg.300.vec.gz -P "${EMBS}" # Bulgarian
wget -nc -q https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip -P "${EMBS}" # English

# Decompress first, then rename: the .vec file only exists after gunzip.
gunzip "${EMBS}/cc.bg.300.vec.gz"
mv "${EMBS}/cc.bg.300.vec" "${EMBS}/bg.vec"
unzip -q "${EMBS}/crawl-300d-2M.vec.zip" -d "${EMBS}"
mv "${EMBS}/crawl-300d-2M.vec" "${EMBS}/en.vec"

# Keep only the first 500001 lines of each file and rewrite line 1 to
# "500000 300". Both expressions run in one sed pass, so each file is read
# and rewritten once.
for lang_code in bg en; do
  sed -i \
    -e '1,500001!d' \
    -e '1 s/^.*$/500000 300/' \
    "${EMBS}/${lang_code}.vec"
done

# --- 4. Cross-lingual mapping -----------------------------------------------
mkdir -p "${MAP_TO}"

# All output of the mapper is discarded; with errexit its exit status still
# stops the script if the mapping fails.
python "${ROOT}/vecmap/map_embeddings.py" --supervised \
  "${TRAIN_DIR}/en_bg.train" \
  "${EMBS}/en.vec" \
  "${EMBS}/bg.vec" \
  "${MAP_TO}/en_to_bg.vec" \
  "${MAP_TO}/bg_to_en.vec" > /dev/null 2>&1
diff --git a/get_data.sh b/get_data.sh index 2d04678..4f97ad5 100755 --- a/get_data.sh +++ b/get_data.sh | |||
| @@ -67,6 +67,7 @@ mv "${ROOT}"/*.def "${READY}" | |||
| 67 | echo "Downloading dictionaries" | 67 | echo "Downloading dictionaries" |
| 68 | 68 | ||
| 69 | DICT="${ROOT}/dictionaries" | 69 | DICT="${ROOT}/dictionaries" |
| 70 | mkdir -p "${DICT}" | ||
| 70 | 71 | ||
| 71 | wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/en-sq.dic.gz -P "${DICT}" # English - Albanian | 72 | wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/en-sq.dic.gz -P "${DICT}" # English - Albanian |
| 72 | wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/bg-en.dic.gz -P "${DICT}" # Bulgarian - English | 73 | wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/bg-en.dic.gz -P "${DICT}" # Bulgarian - English |
diff --git a/get_embeddings.sh b/get_embeddings.sh index daf839b..47a7a8e 100755 --- a/get_embeddings.sh +++ b/get_embeddings.sh | |||
| @@ -25,6 +25,7 @@ set -o errexit -o pipefail -o noclobber -o nounset | |||
| 25 | 25 | ||
| 26 | ROOT="$(pwd)" | 26 | ROOT="$(pwd)" |
| 27 | EMBS="${ROOT}/embeddings" | 27 | EMBS="${ROOT}/embeddings" |
| 28 | DICT="${ROOT}/dictionaries" | ||
| 28 | mkdir -p "${EMBS}" | 29 | mkdir -p "${EMBS}" |
| 29 | 30 | ||
| 30 | echo "Downloading embeddings" | 31 | echo "Downloading embeddings" |
| @@ -58,15 +59,15 @@ if [ ! "$(ls -A "${ROOT}/vecmap/")" ]; then | |||
| 58 | echo "VecMap directory seems empty, did you run git submodule init && git submodule update?"; exit | 59 | echo "VecMap directory seems empty, did you run git submodule init && git submodule update?"; exit |
| 59 | fi | 60 | fi |
| 60 | 61 | ||
| 61 | if [ ! -d "${ROOT}/dictionaries" ]; then | 62 | if [ ! -d "${DICT}" ]; then |
| 62 | echo "Dictionaries directory does not exist, did you run ./get_data.sh?"; exit | 63 | echo "Dictionaries directory does not exist, did you run ./get_data.sh?"; exit |
| 63 | fi | 64 | fi |
| 64 | 65 | ||
| 65 | if [ ! "$(ls -A "${ROOT}/dictionaries/")" ]; then | 66 | if [ ! "$(ls -A "${DICT}")" ]; then |
| 66 | echo "Dictionaries directory seems empty, did you run ./get_data.sh?"; exit | 67 | echo "Dictionaries directory seems empty, did you run ./get_data.sh?"; exit |
| 67 | fi | 68 | fi |
| 68 | 69 | ||
| 69 | TRAIN_DIC_DIR="${ROOT}/dictionaries/train" | 70 | TRAIN_DIC_DIR="${DICT}/train" |
| 70 | MAP_TO="${ROOT}/bilingual_embeddings" | 71 | MAP_TO="${ROOT}/bilingual_embeddings" |
| 71 | 72 | ||
| 72 | mkdir -p "${MAP_TO}" | 73 | mkdir -p "${MAP_TO}" |
