diff options
author | Yigit Sever | 2019-09-19 00:22:46 +0300 |
---|---|---|
committer | Yigit Sever | 2019-09-19 00:22:46 +0300 |
commit | f930fb325ec68e9522c689132d20ea335b30dbbe (patch) | |
tree | 37917602b2de3f251a23b195e4be689fa9ed18f4 /demo.sh | |
parent | 1890976ed1eee59eda92ceabdcb1c966d6707269 (diff) | |
parent | 4ffdd84d704d9040cd493b0ca8e53fb15278be26 (diff) | |
download | Evaluating-Dictionary-Alignment-f930fb325ec68e9522c689132d20ea335b30dbbe.tar.gz Evaluating-Dictionary-Alignment-f930fb325ec68e9522c689132d20ea335b30dbbe.tar.bz2 Evaluating-Dictionary-Alignment-f930fb325ec68e9522c689132d20ea335b30dbbe.zip |
Merge branch 'master' of github.com:yigitsever/Evaluating-Dictionary-Alignment
Diffstat (limited to 'demo.sh')
-rw-r--r-- | demo.sh | 85 |
1 files changed, 85 insertions, 0 deletions
@@ -0,0 +1,85 @@ | |||
#!/bin/bash
#
# Copyright © 2019 Yiğit Sever <yigit.sever@tedu.edu.tr>
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
# OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
#

# Demo pipeline for the English (en) / Bulgarian (bg) language pair:
#   1. download the Bulgarian wordnet and run the tab/lookup preparation scripts
#   2. download the bg-en OpenSubtitles dictionary and split it into train/test
#   3. download fastText embeddings for both languages and trim them
#   4. map the two embedding spaces with vecmap, supervised by the train split
#
# Usage: run from the repository root -- every path below is resolved
# against the current working directory.

# Abort on any reference to an unset variable instead of silently expanding
# it to an empty string (which would turn "${VAR}/file" into "/file").
set -u

ROOT="$(pwd)"
SCRIPTS="${ROOT}/scripts"
WNET="${ROOT}/wordnets"
EMBS="${ROOT}/embeddings"
# Output directory for the mapped (cross-lingual) embeddings.
# NOTE(review): the directory name is chosen here; confirm it matches what
# the rest of the project expects.
MAP_TO="${ROOT}/bilingual_embeddings"
DICT="${ROOT}/dictionaries"

TRAIN_DIR="${DICT}/train"
TEST_DIR="${DICT}/test"

TAB_DIR="${WNET}/tab_files"
READY="${WNET}/ready"

# --- Wordnet ---------------------------------------------------------------
mkdir -p "${WNET}"

# -nc: skip the download when the archive is already present.
wget -nc -q http://compling.hss.ntu.edu.sg/omw/wns/bul.zip -P "${WNET}"
unzip -o -q "${WNET}/bul.zip" -d "${WNET}"

mkdir -p "${TAB_DIR}"
# Invoked through perl (like train_dic_creator.pl below) so the script does
# not need its executable bit set.
perl "${SCRIPTS}/tab_creator.pl" "${WNET}/bul/wn-data-bul.tab" "${TAB_DIR}"

python "${SCRIPTS}/prep_lookup.py" -s "en" -t "bg"

# Collect the .def files found in the repository root (presumably written
# there by prep_lookup.py -- TODO confirm) into the "ready" directory.
mkdir -p "${READY}"
mv "${ROOT}"/*.def "${READY}"

# --- Dictionaries ----------------------------------------------------------
mkdir -p "${DICT}"

wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/bg-en.dic.gz -P "${DICT}" # Bulgarian - English
gunzip -q "${DICT}/bg-en.dic.gz"

# Force a UTF-8 locale for the dictionary-processing script below.
export LC_CTYPE=en_US.UTF-8
export LC_ALL=en_US.UTF-8

perl "${SCRIPTS}/train_dic_creator.pl" "en" "bg" "${DICT}"

mkdir -p "${TRAIN_DIR}"
mkdir -p "${TEST_DIR}"

# Sort the generated splits into their directories and drop the raw .dic
# files, which are no longer needed.
mv "${DICT}"/*.train "${TRAIN_DIR}"
mv "${DICT}"/*.test "${TEST_DIR}"
rm -f "${DICT}"/*.dic

# --- Monolingual embeddings ------------------------------------------------
wget -nc -q https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bg.300.vec.gz -P "${EMBS}" # Bulgarian
wget -nc -q https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip -P "${EMBS}" # English

# Decompress first, then rename to the short <lang>.vec form.
gunzip "${EMBS}/cc.bg.300.vec.gz"
mv "${EMBS}/cc.bg.300.vec" "${EMBS}/bg.vec"
unzip -q "${EMBS}/crawl-300d-2M.vec.zip" -d "${EMBS}"
mv "${EMBS}/crawl-300d-2M.vec" "${EMBS}/en.vec"

# Trim each embedding file to its first 500001 lines (line 1 plus 500000
# entries) and rewrite line 1 to "500000 300" so that it agrees with the
# truncated contents.
for lang_code in bg en; do
  sed -i '1,500001!d' "${EMBS}/${lang_code}.vec"
  sed -i '1 s/^.*$/500000 300/' "${EMBS}/${lang_code}.vec"
done

# --- Cross-lingual mapping -------------------------------------------------
# The mapped embeddings are written into MAP_TO, so it has to exist.
mkdir -p "${MAP_TO}"

# Only stdout is silenced; stderr stays visible so that a failing mapping
# run is not hidden.
python "${ROOT}/vecmap/map_embeddings.py" --supervised \
  "${TRAIN_DIR}/en_bg.train" \
  "${EMBS}/en.vec" \
  "${EMBS}/bg.vec" \
  "${MAP_TO}/en_to_bg.vec" \
  "${MAP_TO}/bg_to_en.vec" > /dev/null