diff options
| -rwxr-xr-x | get_data.sh | 26 | ||||
| -rwxr-xr-x | train_dic_creator.pl | 4 |
2 files changed, 27 insertions, 3 deletions
diff --git a/get_data.sh b/get_data.sh index f6298b1..71b2c65 100755 --- a/get_data.sh +++ b/get_data.sh | |||
| @@ -23,7 +23,7 @@ | |||
| 23 | # | 23 | # |
| 24 | 24 | ||
| 25 | ROOT="$(pwd)" | 25 | ROOT="$(pwd)" |
| 26 | WNET="${ROOT}/data" | 26 | WNET="${ROOT}/wordnets" |
| 27 | mkdir -p "${WNET}" | 27 | mkdir -p "${WNET}" |
| 28 | 28 | ||
| 29 | echo "Downloading wordnet data" | 29 | echo "Downloading wordnet data" |
| @@ -63,3 +63,27 @@ wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/el-sq.dic.g | |||
| 63 | wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/it-ro.dic.gz -P "${DICT}" # Italian - Romanian | 63 | wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/it-ro.dic.gz -P "${DICT}" # Italian - Romanian |
| 64 | wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/ro-sl.dic.gz -P "${DICT}" # Romanian - Slovenian | 64 | wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/ro-sl.dic.gz -P "${DICT}" # Romanian - Slovenian |
| 65 | wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/ro-sq.dic.gz -P "${DICT}" # Romanian - Albanian | 65 | wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/ro-sq.dic.gz -P "${DICT}" # Romanian - Albanian |
| 66 | |||
| 67 | for FILE in ${DICT}/*; do | ||
| 68 | gunzip -q "${FILE}" | ||
| 69 | done | ||
| 70 | |||
| 71 | export LC_CTYPE=en_US.UTF-8 | ||
| 72 | export LC_ALL=en_US.UTF-8 | ||
| 73 | |||
| 74 | echo "Creating dictionaries" | ||
| 75 | |||
| 76 | for PAIR in en,bg en,el en,it, en,ro, en,sl en,sq, bg,el bg,it bg,ro el,it el,ro el,sq it,ro ro,sl ro,sq; do | ||
| 77 | IFS=',' read -r source_lang target_lang <<< "${PAIR}" | ||
| 78 | perl "${ROOT}/train_dic_creator.pl" "${source_lang}" "${target_lang}" | ||
| 79 | done | ||
| 80 | |||
| 81 | TRAIN_DIR="${DICT}/train" | ||
| 82 | TEST_DIR="${DICT}/test" | ||
| 83 | |||
| 84 | mkdir -p "${TRAIN_DIR}" | ||
| 85 | mkdir -p "${TEST_DIR}" | ||
| 86 | |||
| 87 | mv ${DICT}/*.train ${TRAIN_DIR} | ||
| 88 | mv ${DICT}/*.test ${TEST_DIR} | ||
| 89 | rm -f ${DICT}/*.dic | ||
diff --git a/train_dic_creator.pl b/train_dic_creator.pl index a8de6ea..1921a85 100755 --- a/train_dic_creator.pl +++ b/train_dic_creator.pl | |||
| @@ -109,8 +109,8 @@ my @tail = @result[-$size..-1]; | |||
| 109 | my $train_file_name = $source_lang . '_' . $target_lang . '.train'; | 109 | my $train_file_name = $source_lang . '_' . $target_lang . '.train'; |
| 110 | my $test_file_name = $source_lang . '_' . $target_lang . '.test'; | 110 | my $test_file_name = $source_lang . '_' . $target_lang . '.test'; |
| 111 | 111 | ||
| 112 | open my $train_fh, '>', $train_file_name; | 112 | open my $train_fh, '>', $dict_dir . $train_file_name; |
| 113 | open my $test_fh, '>', $test_file_name; | 113 | open my $test_fh, '>', $dict_dir . $test_file_name; |
| 114 | 114 | ||
| 115 | print $train_fh join("\n", @head); | 115 | print $train_fh join("\n", @head); |
| 116 | print $test_fh join("\n", @tail); | 116 | print $test_fh join("\n", @tail); |
