diff options
author | Yigit Sever | 2019-09-16 22:21:38 +0300 |
---|---|---|
committer | Yigit Sever | 2019-09-16 22:21:38 +0300 |
commit | 6430ff4318ef2350cce3ae6d5fbe9e0a551d9c4f (patch) | |
tree | 3b2ad3de7b9e2e0e0ce142a6052a183c282409e6 /get_data.sh | |
parent | ff425f488d12d4ecf41a81c65784d32224404aec (diff) | |
download | Evaluating-Dictionary-Alignment-6430ff4318ef2350cce3ae6d5fbe9e0a551d9c4f.tar.gz Evaluating-Dictionary-Alignment-6430ff4318ef2350cce3ae6d5fbe9e0a551d9c4f.tar.bz2 Evaluating-Dictionary-Alignment-6430ff4318ef2350cce3ae6d5fbe9e0a551d9c4f.zip |
Dictionary creation done
Diffstat (limited to 'get_data.sh')
-rwxr-xr-x | get_data.sh | 26 |
1 file changed, 25 insertions, 1 deletion
diff --git a/get_data.sh b/get_data.sh index f6298b1..71b2c65 100755 --- a/get_data.sh +++ b/get_data.sh | |||
@@ -23,7 +23,7 @@ | |||
23 | # | 23 | # |
24 | 24 | ||
25 | ROOT="$(pwd)" | 25 | ROOT="$(pwd)" |
26 | WNET="${ROOT}/data" | 26 | WNET="${ROOT}/wordnets" |
27 | mkdir -p "${WNET}" | 27 | mkdir -p "${WNET}" |
28 | 28 | ||
29 | echo "Downloading wordnet data" | 29 | echo "Downloading wordnet data" |
@@ -63,3 +63,27 @@ wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/el-sq.dic.g | |||
63 | wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/it-ro.dic.gz -P "${DICT}" # Italian - Romanian | 63 | wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/it-ro.dic.gz -P "${DICT}" # Italian - Romanian |
64 | wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/ro-sl.dic.gz -P "${DICT}" # Romanian - Slovenian | 64 | wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/ro-sl.dic.gz -P "${DICT}" # Romanian - Slovenian |
65 | wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/ro-sq.dic.gz -P "${DICT}" # Romanian - Albanian | 65 | wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/ro-sq.dic.gz -P "${DICT}" # Romanian - Albanian |
66 | |||
67 | for FILE in ${DICT}/*; do | ||
68 | gunzip -q "${FILE}" | ||
69 | done | ||
70 | |||
71 | export LC_CTYPE=en_US.UTF-8 | ||
72 | export LC_ALL=en_US.UTF-8 | ||
73 | |||
74 | echo "Creating dictionaries" | ||
75 | |||
76 | for PAIR in en,bg en,el en,it, en,ro, en,sl en,sq, bg,el bg,it bg,ro el,it el,ro el,sq it,ro ro,sl ro,sq; do | ||
77 | IFS=',' read -r source_lang target_lang <<< "${PAIR}" | ||
78 | perl "${ROOT}/train_dic_creator.pl" "${source_lang}" "${target_lang}" | ||
79 | done | ||
80 | |||
81 | TRAIN_DIR="${DICT}/train" | ||
82 | TEST_DIR="${DICT}/test" | ||
83 | |||
84 | mkdir -p "${TRAIN_DIR}" | ||
85 | mkdir -p "${TEST_DIR}" | ||
86 | |||
87 | mv ${DICT}/*.train ${TRAIN_DIR} | ||
88 | mv ${DICT}/*.test ${TEST_DIR} | ||
89 | rm -f ${DICT}/*.dic | ||