diff options
-rwxr-xr-x | get_data.sh | 26 | ||||
-rwxr-xr-x | train_dic_creator.pl | 4 |
2 files changed, 27 insertions, 3 deletions
diff --git a/get_data.sh b/get_data.sh index f6298b1..71b2c65 100755 --- a/get_data.sh +++ b/get_data.sh | |||
@@ -23,7 +23,7 @@ | |||
23 | # | 23 | # |
24 | 24 | ||
25 | ROOT="$(pwd)" | 25 | ROOT="$(pwd)" |
26 | WNET="${ROOT}/data" | 26 | WNET="${ROOT}/wordnets" |
27 | mkdir -p "${WNET}" | 27 | mkdir -p "${WNET}" |
28 | 28 | ||
29 | echo "Downloading wordnet data" | 29 | echo "Downloading wordnet data" |
@@ -63,3 +63,27 @@ wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/el-sq.dic.g | |||
63 | wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/it-ro.dic.gz -P "${DICT}" # Italian - Romanian | 63 | wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/it-ro.dic.gz -P "${DICT}" # Italian - Romanian |
64 | wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/ro-sl.dic.gz -P "${DICT}" # Romanian - Slovenian | 64 | wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/ro-sl.dic.gz -P "${DICT}" # Romanian - Slovenian |
65 | wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/ro-sq.dic.gz -P "${DICT}" # Romanian - Albanian | 65 | wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/ro-sq.dic.gz -P "${DICT}" # Romanian - Albanian |
66 | |||
67 | for FILE in ${DICT}/*; do | ||
68 | gunzip -q "${FILE}" | ||
69 | done | ||
70 | |||
71 | export LC_CTYPE=en_US.UTF-8 | ||
72 | export LC_ALL=en_US.UTF-8 | ||
73 | |||
74 | echo "Creating dictionaries" | ||
75 | |||
76 | for PAIR in en,bg en,el en,it, en,ro, en,sl en,sq, bg,el bg,it bg,ro el,it el,ro el,sq it,ro ro,sl ro,sq; do | ||
77 | IFS=',' read -r source_lang target_lang <<< "${PAIR}" | ||
78 | perl "${ROOT}/train_dic_creator.pl" "${source_lang}" "${target_lang}" | ||
79 | done | ||
80 | |||
81 | TRAIN_DIR="${DICT}/train" | ||
82 | TEST_DIR="${DICT}/test" | ||
83 | |||
84 | mkdir -p "${TRAIN_DIR}" | ||
85 | mkdir -p "${TEST_DIR}" | ||
86 | |||
87 | mv ${DICT}/*.train ${TRAIN_DIR} | ||
88 | mv ${DICT}/*.test ${TEST_DIR} | ||
89 | rm -f ${DICT}/*.dic | ||
diff --git a/train_dic_creator.pl b/train_dic_creator.pl index a8de6ea..1921a85 100755 --- a/train_dic_creator.pl +++ b/train_dic_creator.pl | |||
@@ -109,8 +109,8 @@ my @tail = @result[-$size..-1]; | |||
109 | my $train_file_name = $source_lang . '_' . $target_lang . '.train'; | 109 | my $train_file_name = $source_lang . '_' . $target_lang . '.train'; |
110 | my $test_file_name = $source_lang . '_' . $target_lang . '.test'; | 110 | my $test_file_name = $source_lang . '_' . $target_lang . '.test'; |
111 | 111 | ||
112 | open my $train_fh, '>', $train_file_name; | 112 | open my $train_fh, '>', $dict_dir . $train_file_name; |
113 | open my $test_fh, '>', $test_file_name; | 113 | open my $test_fh, '>', $dict_dir . $test_file_name; |
114 | 114 | ||
115 | print $train_fh join("\n", @head); | 115 | print $train_fh join("\n", @head); |
116 | print $test_fh join("\n", @tail); | 116 | print $test_fh join("\n", @tail); |