aboutsummaryrefslogtreecommitdiffstats
path: root/get_data.sh
diff options
context:
space:
mode:
Diffstat (limited to 'get_data.sh')
-rwxr-xr-xget_data.sh26
1 files changed, 25 insertions, 1 deletions
diff --git a/get_data.sh b/get_data.sh
index f6298b1..71b2c65 100755
--- a/get_data.sh
+++ b/get_data.sh
@@ -23,7 +23,7 @@
23# 23#
24 24
25ROOT="$(pwd)" 25ROOT="$(pwd)"
26WNET="${ROOT}/data" 26WNET="${ROOT}/wordnets"
27mkdir -p "${WNET}" 27mkdir -p "${WNET}"
28 28
29echo "Downloading wordnet data" 29echo "Downloading wordnet data"
@@ -63,3 +63,27 @@ wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/el-sq.dic.g
63wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/it-ro.dic.gz -P "${DICT}" # Italian - Romanian 63wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/it-ro.dic.gz -P "${DICT}" # Italian - Romanian
64wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/ro-sl.dic.gz -P "${DICT}" # Romanian - Slovenian 64wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/ro-sl.dic.gz -P "${DICT}" # Romanian - Slovenian
65wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/ro-sq.dic.gz -P "${DICT}" # Romanian - Albanian 65wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/ro-sq.dic.gz -P "${DICT}" # Romanian - Albanian
66
# Decompress every downloaded dictionary archive in place.
for FILE in "${DICT}"/*.gz; do
  # Without nullglob an unmatched pattern stays literal; skip it instead of
  # handing a non-existent path to gunzip.
  [ -e "${FILE}" ] || continue
  gunzip -q -- "${FILE}"
done

# Force a UTF-8 locale for the commands that follow. LC_ALL overrides
# LC_CTYPE; both are exported, as before.
export LC_CTYPE=en_US.UTF-8
export LC_ALL=en_US.UTF-8

echo "Creating dictionaries"

# Language pairs written as "source,target" two-letter codes. Each pair is
# passed to train_dic_creator.pl as two separate arguments.
for PAIR in \
  en,bg en,el en,it en,ro en,sl en,sq \
  bg,el bg,it bg,ro \
  el,it el,ro el,sq \
  it,ro \
  ro,sl ro,sq; do
  source_lang="${PAIR%%,*}" # text before the first comma
  target_lang="${PAIR#*,}"  # text after the first comma
  # NOTE(review): presumably writes *.train / *.test files into ${DICT};
  # confirm against train_dic_creator.pl.
  perl "${ROOT}/train_dic_creator.pl" "${source_lang}" "${target_lang}"
done

TRAIN_DIR="${DICT}/train"
TEST_DIR="${DICT}/test"

mkdir -p "${TRAIN_DIR}"
mkdir -p "${TEST_DIR}"

# Sort the generated splits into their own directories, then drop the
# decompressed source dictionaries. Variables are quoted so paths containing
# whitespace survive; the glob itself stays outside the quotes so it expands.
mv -- "${DICT}"/*.train "${TRAIN_DIR}"
mv -- "${DICT}"/*.test "${TEST_DIR}"
rm -f -- "${DICT}"/*.dic