diff options
| author | Yigit Sever | 2019-09-16 22:21:38 +0300 |
|---|---|---|
| committer | Yigit Sever | 2019-09-16 22:21:38 +0300 |
| commit | 6430ff4318ef2350cce3ae6d5fbe9e0a551d9c4f (patch) | |
| tree | 3b2ad3de7b9e2e0e0ce142a6052a183c282409e6 /get_data.sh | |
| parent | ff425f488d12d4ecf41a81c65784d32224404aec (diff) | |
| download | Evaluating-Dictionary-Alignment-6430ff4318ef2350cce3ae6d5fbe9e0a551d9c4f.tar.gz Evaluating-Dictionary-Alignment-6430ff4318ef2350cce3ae6d5fbe9e0a551d9c4f.tar.bz2 Evaluating-Dictionary-Alignment-6430ff4318ef2350cce3ae6d5fbe9e0a551d9c4f.zip | |
Dictionary creation done
Diffstat (limited to 'get_data.sh')
| -rwxr-xr-x | get_data.sh | 26 |
1 files changed, 25 insertions, 1 deletions
diff --git a/get_data.sh b/get_data.sh index f6298b1..71b2c65 100755 --- a/get_data.sh +++ b/get_data.sh | |||
| @@ -23,7 +23,7 @@ | |||
| 23 | # | 23 | # |
| 24 | 24 | ||
| 25 | ROOT="$(pwd)" | 25 | ROOT="$(pwd)" |
| 26 | WNET="${ROOT}/data" | 26 | WNET="${ROOT}/wordnets" |
| 27 | mkdir -p "${WNET}" | 27 | mkdir -p "${WNET}" |
| 28 | 28 | ||
| 29 | echo "Downloading wordnet data" | 29 | echo "Downloading wordnet data" |
| @@ -63,3 +63,27 @@ wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/el-sq.dic.g | |||
| 63 | wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/it-ro.dic.gz -P "${DICT}" # Italian - Romanian | 63 | wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/it-ro.dic.gz -P "${DICT}" # Italian - Romanian |
| 64 | wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/ro-sl.dic.gz -P "${DICT}" # Romanian - Slovenian | 64 | wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/ro-sl.dic.gz -P "${DICT}" # Romanian - Slovenian |
| 65 | wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/ro-sq.dic.gz -P "${DICT}" # Romanian - Albanian | 65 | wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/ro-sq.dic.gz -P "${DICT}" # Romanian - Albanian |
| 66 | |||
| 67 | for FILE in ${DICT}/*; do | ||
| 68 | gunzip -q "${FILE}" | ||
| 69 | done | ||
| 70 | |||
| 71 | export LC_CTYPE=en_US.UTF-8 | ||
| 72 | export LC_ALL=en_US.UTF-8 | ||
| 73 | |||
| 74 | echo "Creating dictionaries" | ||
| 75 | |||
| 76 | for PAIR in en,bg en,el en,it, en,ro, en,sl en,sq, bg,el bg,it bg,ro el,it el,ro el,sq it,ro ro,sl ro,sq; do | ||
| 77 | IFS=',' read -r source_lang target_lang <<< "${PAIR}" | ||
| 78 | perl "${ROOT}/train_dic_creator.pl" "${source_lang}" "${target_lang}" | ||
| 79 | done | ||
| 80 | |||
| 81 | TRAIN_DIR="${DICT}/train" | ||
| 82 | TEST_DIR="${DICT}/test" | ||
| 83 | |||
| 84 | mkdir -p "${TRAIN_DIR}" | ||
| 85 | mkdir -p "${TEST_DIR}" | ||
| 86 | |||
| 87 | mv ${DICT}/*.train ${TRAIN_DIR} | ||
| 88 | mv ${DICT}/*.test ${TEST_DIR} | ||
| 89 | rm -f ${DICT}/*.dic | ||
