 demo.sh | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)
diff --git a/demo.sh b/demo.sh
index 51346f3..acb7c2f 100644
--- a/demo.sh
+++ b/demo.sh
@@ -30,53 +30,54 @@ DICT="${ROOT}/dictionaries"
 
 TRAIN_DIR="${DICT}/train"
 TEST_DIR="${DICT}/test"
-
 TAB_DIR="${WNET}/tab_files"
 READY="${WNET}/ready"
-mkdir -p "${WNET}"
 
+# create wordnets directory and download a single wordnet
+mkdir -p "${WNET}"
 wget -nc -q http://compling.hss.ntu.edu.sg/omw/wns/bul.zip -P "${WNET}"
 unzip -o -q "${WNET}/bul.zip" -d "${WNET}"
 
+# create tab directory and export a single .tab file
 mkdir -p "${TAB_DIR}"
 "${SCRIPTS}/tab_creator.pl" "${WNET}/bul/wn-data-bul.tab" "${TAB_DIR}"
 
-python "${SCRIPTS}/prep_lookup.py" -s "en" -t "bg"
-
+# create ready directory and create two .def files
 mkdir -p "${READY}"
+python "${SCRIPTS}/prep_lookup.py" -s "en" -t "bg"
 mv "${ROOT}"/*.def "${READY}"
 
+# create dictionaries directory and download a single dictionary
 mkdir -p "${DICT}"
-
 wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/bg-en.dic.gz -P "${DICT}" # Bulgarian - English
 gunzip -q "${DICT}/bg-en.dic.gz"
 
 export LC_CTYPE=en_US.UTF-8
 export LC_ALL=en_US.UTF-8
 
+# create a train and a test seed lexicon
 perl "${SCRIPTS}/train_dic_creator.pl" "en" "bg" "${DICT}"
-
 mkdir -p "${TRAIN_DIR}"
 mkdir -p "${TEST_DIR}"
-
 mv "${DICT}"/*.train "${TRAIN_DIR}"
 mv "${DICT}"/*.test "${TEST_DIR}"
 rm -f "${DICT}"/*.dic
 
+# download two monolingual embeddings
 wget -nc -q https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bg.300.vec.gz -P "${EMBS}" # Bulgarian
 wget -nc -q https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip -P "${EMBS}" # English
-mv "${EMBS}/cc.bg.300.vec" "${EMBS}/bg.vec"
 gunzip "${EMBS}/cc.bg.300.vec.gz"
 mv "${EMBS}/cc.bg.300.vec" "${EMBS}/bg.vec"
 unzip -q "${EMBS}/crawl-300d-2M.vec.zip" -d "${EMBS}"
 mv "${EMBS}/crawl-300d-2M.vec" "${EMBS}/en.vec"
 
-
+# truncate two embeddings
 for lang_code in bg en; do
- sed -i '1,500001!d' "${EMBS}/${lang_code}.vec"
+ sed -i '1,500001!d' "${EMBS}/${lang_code}.vec" # one line on top for the <number of tokens> <dimensions>
  sed -i '1 s/^.*$/500000 300/' "${EMBS}/${lang_code}.vec"
 done
 
+# map two embeddings
 python "${ROOT}/vecmap/map_embeddings.py" --supervised \
  "${TRAIN_DIC_DIR}/en_bg.train" \
  "${EMBS}/en.vec" \