diff options
Diffstat (limited to 'demo.sh')
-rw-r--r-- | demo.sh | 21 |
1 file changed, 11 insertions, 10 deletions
@@ -30,53 +30,54 @@ DICT="${ROOT}/dictionaries" | |||
30 | 30 | ||
31 | TRAIN_DIR="${DICT}/train" | 31 | TRAIN_DIR="${DICT}/train" |
32 | TEST_DIR="${DICT}/test" | 32 | TEST_DIR="${DICT}/test" |
33 | |||
34 | TAB_DIR="${WNET}/tab_files" | 33 | TAB_DIR="${WNET}/tab_files" |
35 | READY="${WNET}/ready" | 34 | READY="${WNET}/ready" |
36 | mkdir -p "${WNET}" | ||
37 | 35 | ||
36 | # create wordnets directory and download a single wordnet | ||
37 | mkdir -p "${WNET}" | ||
38 | wget -nc -q http://compling.hss.ntu.edu.sg/omw/wns/bul.zip -P "${WNET}" | 38 | wget -nc -q http://compling.hss.ntu.edu.sg/omw/wns/bul.zip -P "${WNET}" |
39 | unzip -o -q "${WNET}/bul.zip" -d "${WNET}" | 39 | unzip -o -q "${WNET}/bul.zip" -d "${WNET}" |
40 | 40 | ||
41 | # create tab directory and export a single .tab file | ||
41 | mkdir -p "${TAB_DIR}" | 42 | mkdir -p "${TAB_DIR}" |
42 | "${SCRIPTS}/tab_creator.pl" "${WNET}/bul/wn-data-bul.tab" "${TAB_DIR}" | 43 | "${SCRIPTS}/tab_creator.pl" "${WNET}/bul/wn-data-bul.tab" "${TAB_DIR}" |
43 | 44 | ||
44 | python "${SCRIPTS}/prep_lookup.py" -s "en" -t "bg" | 45 | # create ready directory and create two .def files |
45 | |||
46 | mkdir -p "${READY}" | 46 | mkdir -p "${READY}" |
47 | python "${SCRIPTS}/prep_lookup.py" -s "en" -t "bg" | ||
47 | mv "${ROOT}"/*.def "${READY}" | 48 | mv "${ROOT}"/*.def "${READY}" |
48 | 49 | ||
50 | # create dictionaries directory and download a single dictionary | ||
49 | mkdir -p "${DICT}" | 51 | mkdir -p "${DICT}" |
50 | |||
51 | wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/bg-en.dic.gz -P "${DICT}" # Bulgarian - English | 52 | wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/bg-en.dic.gz -P "${DICT}" # Bulgarian - English |
52 | gunzip -q "${DICT}/bg-en.dic.gz" | 53 | gunzip -q "${DICT}/bg-en.dic.gz" |
53 | 54 | ||
54 | export LC_CTYPE=en_US.UTF-8 | 55 | export LC_CTYPE=en_US.UTF-8 |
55 | export LC_ALL=en_US.UTF-8 | 56 | export LC_ALL=en_US.UTF-8 |
56 | 57 | ||
58 | # create a train and a test seed lexicon | ||
57 | perl "${SCRIPTS}/train_dic_creator.pl" "en" "bg" "${DICT}" | 59 | perl "${SCRIPTS}/train_dic_creator.pl" "en" "bg" "${DICT}" |
58 | |||
59 | mkdir -p "${TRAIN_DIR}" | 60 | mkdir -p "${TRAIN_DIR}" |
60 | mkdir -p "${TEST_DIR}" | 61 | mkdir -p "${TEST_DIR}" |
61 | |||
62 | mv "${DICT}"/*.train "${TRAIN_DIR}" | 62 | mv "${DICT}"/*.train "${TRAIN_DIR}" |
63 | mv "${DICT}"/*.test "${TEST_DIR}" | 63 | mv "${DICT}"/*.test "${TEST_DIR}" |
64 | rm -f "${DICT}"/*.dic | 64 | rm -f "${DICT}"/*.dic |
65 | 65 | ||
66 | # download two monolingual embeddings | ||
66 | wget -nc -q https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bg.300.vec.gz -P "${EMBS}" # Bulgarian | 67 | wget -nc -q https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bg.300.vec.gz -P "${EMBS}" # Bulgarian |
67 | wget -nc -q https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip -P "${EMBS}" # English | 68 | wget -nc -q https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip -P "${EMBS}" # English |
68 | mv "${EMBS}/cc.bg.300.vec" "${EMBS}/bg.vec" | ||
69 | gunzip "${EMBS}/cc.bg.300.vec.gz" | 69 | gunzip "${EMBS}/cc.bg.300.vec.gz" |
70 | mv "${EMBS}/cc.bg.300.vec" "${EMBS}/bg.vec" | 70 | mv "${EMBS}/cc.bg.300.vec" "${EMBS}/bg.vec" |
71 | unzip -q "${EMBS}/crawl-300d-2M.vec.zip" -d "${EMBS}" | 71 | unzip -q "${EMBS}/crawl-300d-2M.vec.zip" -d "${EMBS}" |
72 | mv "${EMBS}/crawl-300d-2M.vec" "${EMBS}/en.vec" | 72 | mv "${EMBS}/crawl-300d-2M.vec" "${EMBS}/en.vec" |
73 | 73 | ||
74 | 74 | # truncate two embeddings | |
75 | for lang_code in bg en; do | 75 | for lang_code in bg en; do |
76 | sed -i '1,500001!d' "${EMBS}/${lang_code}.vec" | 76 | sed -i '1,500001!d' "${EMBS}/${lang_code}.vec" # keep the header line (<number of tokens> <dimensions>) plus the first 500000 vectors |
77 | sed -i '1 s/^.*$/500000 300/' "${EMBS}/${lang_code}.vec" | 77 | sed -i '1 s/^.*$/500000 300/' "${EMBS}/${lang_code}.vec" |
78 | done | 78 | done |
79 | 79 | ||
80 | # map two embeddings | ||
80 | python "${ROOT}/vecmap/map_embeddings.py" --supervised \ | 81 | python "${ROOT}/vecmap/map_embeddings.py" --supervised \ |
81 | "${TRAIN_DIC_DIR}/en_bg.train" \ | 82 | "${TRAIN_DIC_DIR}/en_bg.train" \ |
82 | "${EMBS}/en.vec" \ | 83 | "${EMBS}/en.vec" \ |