aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rwxr-xr-x get_data.sh 26
-rwxr-xr-x train_dic_creator.pl 4
2 files changed, 27 insertions, 3 deletions
diff --git a/get_data.sh b/get_data.sh
index f6298b1..71b2c65 100755
--- a/get_data.sh
+++ b/get_data.sh
@@ -23,7 +23,7 @@
23# 23#
24 24
25ROOT="$(pwd)" 25ROOT="$(pwd)"
26WNET="${ROOT}/data" 26WNET="${ROOT}/wordnets"
27mkdir -p "${WNET}" 27mkdir -p "${WNET}"
28 28
29echo "Downloading wordnet data" 29echo "Downloading wordnet data"
@@ -63,3 +63,27 @@ wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/el-sq.dic.g
63wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/it-ro.dic.gz -P "${DICT}" # Italian - Romanian 63wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/it-ro.dic.gz -P "${DICT}" # Italian - Romanian
64wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/ro-sl.dic.gz -P "${DICT}" # Romanian - Slovenian 64wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/ro-sl.dic.gz -P "${DICT}" # Romanian - Slovenian
65wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/ro-sq.dic.gz -P "${DICT}" # Romanian - Albanian 65wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/ro-sq.dic.gz -P "${DICT}" # Romanian - Albanian
66
67for FILE in ${DICT}/*; do
68 gunzip -q "${FILE}"
69done
70
71export LC_CTYPE=en_US.UTF-8
72export LC_ALL=en_US.UTF-8
73
74echo "Creating dictionaries"
75
76for PAIR in en,bg en,el en,it, en,ro, en,sl en,sq, bg,el bg,it bg,ro el,it el,ro el,sq it,ro ro,sl ro,sq; do
77 IFS=',' read -r source_lang target_lang <<< "${PAIR}"
78 perl "${ROOT}/train_dic_creator.pl" "${source_lang}" "${target_lang}"
79done
80
81TRAIN_DIR="${DICT}/train"
82TEST_DIR="${DICT}/test"
83
84mkdir -p "${TRAIN_DIR}"
85mkdir -p "${TEST_DIR}"
86
87mv ${DICT}/*.train ${TRAIN_DIR}
88mv ${DICT}/*.test ${TEST_DIR}
89rm -f ${DICT}/*.dic
diff --git a/train_dic_creator.pl b/train_dic_creator.pl
index a8de6ea..1921a85 100755
--- a/train_dic_creator.pl
+++ b/train_dic_creator.pl
@@ -109,8 +109,8 @@ my @tail = @result[-$size..-1];
109my $train_file_name = $source_lang . '_' . $target_lang . '.train'; 109my $train_file_name = $source_lang . '_' . $target_lang . '.train';
110my $test_file_name = $source_lang . '_' . $target_lang . '.test'; 110my $test_file_name = $source_lang . '_' . $target_lang . '.test';
111 111
112open my $train_fh, '>', $train_file_name; 112open my $train_fh, '>', $dict_dir . $train_file_name;
113open my $test_fh, '>', $test_file_name; 113open my $test_fh, '>', $dict_dir . $test_file_name;
114 114
115print $train_fh join("\n", @head); 115print $train_fh join("\n", @head);
116print $test_fh join("\n", @tail); 116print $test_fh join("\n", @tail);