diff options
| -rw-r--r-- | demo.sh | 21 | 
1 file changed, 11 insertions, 10 deletions
| @@ -30,53 +30,54 @@ DICT="${ROOT}/dictionaries" | |||
| 30 | 30 | ||
| 31 | TRAIN_DIR="${DICT}/train" | 31 | TRAIN_DIR="${DICT}/train" | 
| 32 | TEST_DIR="${DICT}/test" | 32 | TEST_DIR="${DICT}/test" | 
| 33 | |||
| 34 | TAB_DIR="${WNET}/tab_files" | 33 | TAB_DIR="${WNET}/tab_files" | 
| 35 | READY="${WNET}/ready" | 34 | READY="${WNET}/ready" | 
| 36 | mkdir -p "${WNET}" | ||
| 37 | 35 | ||
| 36 | # create wordnets directory and download a single wordnet | ||
| 37 | mkdir -p "${WNET}" | ||
| 38 | wget -nc -q http://compling.hss.ntu.edu.sg/omw/wns/bul.zip -P "${WNET}" | 38 | wget -nc -q http://compling.hss.ntu.edu.sg/omw/wns/bul.zip -P "${WNET}" | 
| 39 | unzip -o -q "${WNET}/bul.zip" -d "${WNET}" | 39 | unzip -o -q "${WNET}/bul.zip" -d "${WNET}" | 
| 40 | 40 | ||
| 41 | # create tab directory and export a single .tab file | ||
| 41 | mkdir -p "${TAB_DIR}" | 42 | mkdir -p "${TAB_DIR}" | 
| 42 | "${SCRIPTS}/tab_creator.pl" "${WNET}/bul/wn-data-bul.tab" "${TAB_DIR}" | 43 | "${SCRIPTS}/tab_creator.pl" "${WNET}/bul/wn-data-bul.tab" "${TAB_DIR}" | 
| 43 | 44 | ||
| 44 | python "${SCRIPTS}/prep_lookup.py" -s "en" -t "bg" | 45 | # create ready directory and create two .def files | 
| 45 | |||
| 46 | mkdir -p "${READY}" | 46 | mkdir -p "${READY}" | 
| 47 | python "${SCRIPTS}/prep_lookup.py" -s "en" -t "bg" | ||
| 47 | mv "${ROOT}"/*.def "${READY}" | 48 | mv "${ROOT}"/*.def "${READY}" | 
| 48 | 49 | ||
| 50 | # create dictionaries directory and download a single dictionary | ||
| 49 | mkdir -p "${DICT}" | 51 | mkdir -p "${DICT}" | 
| 50 | |||
| 51 | wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/bg-en.dic.gz -P "${DICT}" # Bulgarian - English | 52 | wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/bg-en.dic.gz -P "${DICT}" # Bulgarian - English | 
| 52 | gunzip -q "${DICT}/bg-en.dic.gz" | 53 | gunzip -q "${DICT}/bg-en.dic.gz" | 
| 53 | 54 | ||
| 54 | export LC_CTYPE=en_US.UTF-8 | 55 | export LC_CTYPE=en_US.UTF-8 | 
| 55 | export LC_ALL=en_US.UTF-8 | 56 | export LC_ALL=en_US.UTF-8 | 
| 56 | 57 | ||
| 58 | # create a train and a test seed lexicon | ||
| 57 | perl "${SCRIPTS}/train_dic_creator.pl" "en" "bg" "${DICT}" | 59 | perl "${SCRIPTS}/train_dic_creator.pl" "en" "bg" "${DICT}" | 
| 58 | |||
| 59 | mkdir -p "${TRAIN_DIR}" | 60 | mkdir -p "${TRAIN_DIR}" | 
| 60 | mkdir -p "${TEST_DIR}" | 61 | mkdir -p "${TEST_DIR}" | 
| 61 | |||
| 62 | mv "${DICT}"/*.train "${TRAIN_DIR}" | 62 | mv "${DICT}"/*.train "${TRAIN_DIR}" | 
| 63 | mv "${DICT}"/*.test "${TEST_DIR}" | 63 | mv "${DICT}"/*.test "${TEST_DIR}" | 
| 64 | rm -f "${DICT}"/*.dic | 64 | rm -f "${DICT}"/*.dic | 
| 65 | 65 | ||
| 66 | # download two monolingual embeddings | ||
| 66 | wget -nc -q https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bg.300.vec.gz -P "${EMBS}" # Bulgarian | 67 | wget -nc -q https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bg.300.vec.gz -P "${EMBS}" # Bulgarian | 
| 67 | wget -nc -q https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip -P "${EMBS}" # English | 68 | wget -nc -q https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip -P "${EMBS}" # English | 
| 68 | mv "${EMBS}/cc.bg.300.vec" "${EMBS}/bg.vec" | ||
| 69 | gunzip "${EMBS}/cc.bg.300.vec.gz" | 69 | gunzip "${EMBS}/cc.bg.300.vec.gz" | 
| 70 | mv "${EMBS}/cc.bg.300.vec" "${EMBS}/bg.vec" | 70 | mv "${EMBS}/cc.bg.300.vec" "${EMBS}/bg.vec" | 
| 71 | unzip -q "${EMBS}/crawl-300d-2M.vec.zip" -d "${EMBS}" | 71 | unzip -q "${EMBS}/crawl-300d-2M.vec.zip" -d "${EMBS}" | 
| 72 | mv "${EMBS}/crawl-300d-2M.vec" "${EMBS}/en.vec" | 72 | mv "${EMBS}/crawl-300d-2M.vec" "${EMBS}/en.vec" | 
| 73 | 73 | ||
| 74 | 74 | # truncate two embeddings | |
| 75 | for lang_code in bg en; do | 75 | for lang_code in bg en; do | 
| 76 | sed -i '1,500001!d' "${EMBS}/${lang_code}.vec" | 76 | sed -i '1,500001!d' "${EMBS}/${lang_code}.vec" # keep 500001 lines: the header line (<number of tokens> <dimensions>) plus the first 500000 vectors |
| 77 | sed -i '1 s/^.*$/500000 300/' "${EMBS}/${lang_code}.vec" | 77 | sed -i '1 s/^.*$/500000 300/' "${EMBS}/${lang_code}.vec" | 
| 78 | done | 78 | done | 
| 79 | 79 | ||
| 80 | # map two embeddings | ||
| 80 | python "${ROOT}/vecmap/map_embeddings.py" --supervised \ | 81 | python "${ROOT}/vecmap/map_embeddings.py" --supervised \ | 
| 81 | "${TRAIN_DIC_DIR}/en_bg.train" \ | 82 | "${TRAIN_DIC_DIR}/en_bg.train" \ | 
| 82 | "${EMBS}/en.vec" \ | 83 | "${EMBS}/en.vec" \ | 
