| author | Yigit Sever | 2019-09-19 21:03:20 +0300 |
|---|---|---|
| committer | Yigit Sever | 2019-09-19 21:03:20 +0300 |
| commit | 8fd301da3aea62abab058a95e3c348b4102e8510 (patch) | |
| tree | 23b302b7375e5462f1de8b76e68ad4cb0282ede1 | |
| parent | a02fdcc60675fb63dd67b2fc05b3d286b19abe78 (diff) | |
| download | Evaluating-Dictionary-Alignment-8fd301da3aea62abab058a95e3c348b4102e8510.tar.gz, Evaluating-Dictionary-Alignment-8fd301da3aea62abab058a95e3c348b4102e8510.tar.bz2, Evaluating-Dictionary-Alignment-8fd301da3aea62abab058a95e3c348b4102e8510.zip | |
Clean-up demo.sh
| -rw-r--r-- | demo.sh | 21 |
1 file changed, 11 insertions, 10 deletions
```diff
--- a/demo.sh
+++ b/demo.sh
@@ -30,53 +30,54 @@ DICT="${ROOT}/dictionaries"
 
 TRAIN_DIR="${DICT}/train"
 TEST_DIR="${DICT}/test"
-
 TAB_DIR="${WNET}/tab_files"
 READY="${WNET}/ready"
-mkdir -p "${WNET}"
 
+# create wordnets directory and download a single wordnet
+mkdir -p "${WNET}"
 wget -nc -q http://compling.hss.ntu.edu.sg/omw/wns/bul.zip -P "${WNET}"
 unzip -o -q "${WNET}/bul.zip" -d "${WNET}"
 
+# create tab directory and export a single .tab file
 mkdir -p "${TAB_DIR}"
 "${SCRIPTS}/tab_creator.pl" "${WNET}/bul/wn-data-bul.tab" "${TAB_DIR}"
 
-python "${SCRIPTS}/prep_lookup.py" -s "en" -t "bg"
-
+# create ready directory and create two .def files
 mkdir -p "${READY}"
+python "${SCRIPTS}/prep_lookup.py" -s "en" -t "bg"
 mv "${ROOT}"/*.def "${READY}"
 
+# create dictionaries directory and download a single dictionary
 mkdir -p "${DICT}"
-
 wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/bg-en.dic.gz -P "${DICT}" # Bulgarian - English
 gunzip -q "${DICT}/bg-en.dic.gz"
 
 export LC_CTYPE=en_US.UTF-8
 export LC_ALL=en_US.UTF-8
 
+# create a train and a test seed lexicon
 perl "${SCRIPTS}/train_dic_creator.pl" "en" "bg" "${DICT}"
-
 mkdir -p "${TRAIN_DIR}"
 mkdir -p "${TEST_DIR}"
-
 mv "${DICT}"/*.train "${TRAIN_DIR}"
 mv "${DICT}"/*.test "${TEST_DIR}"
 rm -f "${DICT}"/*.dic
 
+# download two monolingual embeddings
 wget -nc -q https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bg.300.vec.gz -P "${EMBS}" # Bulgarian
 wget -nc -q https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip -P "${EMBS}" # English
-mv "${EMBS}/cc.bg.300.vec" "${EMBS}/bg.vec"
 gunzip "${EMBS}/cc.bg.300.vec.gz"
 mv "${EMBS}/cc.bg.300.vec" "${EMBS}/bg.vec"
 unzip -q "${EMBS}/crawl-300d-2M.vec.zip" -d "${EMBS}"
 mv "${EMBS}/crawl-300d-2M.vec" "${EMBS}/en.vec"
 
-
+# truncate two embeddings
 for lang_code in bg en; do
-    sed -i '1,500001!d' "${EMBS}/${lang_code}.vec"
+    sed -i '1,500001!d' "${EMBS}/${lang_code}.vec" # one line on top for the <number of tokens> <dimensions>
     sed -i '1 s/^.*$/500000 300/' "${EMBS}/${lang_code}.vec"
 done
 
+# map two embeddings
 python "${ROOT}/vecmap/map_embeddings.py" --supervised \
     "${TRAIN_DIC_DIR}/en_bg.train" \
    "${EMBS}/en.vec" \
```
