From 6430ff4318ef2350cce3ae6d5fbe9e0a551d9c4f Mon Sep 17 00:00:00 2001
From: Yigit Sever
Date: Mon, 16 Sep 2019 22:21:38 +0300
Subject: Dictionary creation done

---
 get_data.sh          | 26 +++++++++++++++++++++++++-
 train_dic_creator.pl |  4 ++--
 2 files changed, 27 insertions(+), 3 deletions(-)

diff --git a/get_data.sh b/get_data.sh
index f6298b1..71b2c65 100755
--- a/get_data.sh
+++ b/get_data.sh
@@ -23,7 +23,7 @@
 #
 
 ROOT="$(pwd)"
-WNET="${ROOT}/data"
+WNET="${ROOT}/wordnets"
 mkdir -p "${WNET}"
 
 echo "Downloading wordnet data"
@@ -63,3 +63,27 @@ wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/el-sq.dic.g
 wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/it-ro.dic.gz -P "${DICT}" # Italian - Romanian
 wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/ro-sl.dic.gz -P "${DICT}" # Romanian - Albanian
 wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/ro-sq.dic.gz -P "${DICT}" # Romanian - Albanian
+
+for FILE in "${DICT}"/*.gz; do
+    [ -e "${FILE}" ] || continue  # glob matched nothing; skip the literal pattern
+    gunzip -q "${FILE}"
+done
+
+# LC_ALL takes precedence over LC_CTYPE; both are still exported as before.
+export LC_CTYPE=en_US.UTF-8 LC_ALL=en_US.UTF-8
+
+echo "Creating dictionaries"
+# Each PAIR is "source,target"; split on the comma and pass both to the creator script.
+for PAIR in en,bg en,el en,it en,ro en,sl en,sq bg,el bg,it bg,ro el,it el,ro el,sq it,ro ro,sl ro,sq; do
+    IFS=',' read -r source_lang target_lang <<< "${PAIR}"
+    perl "${ROOT}/train_dic_creator.pl" "${source_lang}" "${target_lang}"
+done
+
+TRAIN_DIR="${DICT}/train"
+TEST_DIR="${DICT}/test"
+mkdir -p "${TRAIN_DIR}" "${TEST_DIR}"
+
+# Move the generated splits into place, then remove the unpacked dictionaries.
+mv "${DICT}"/*.train "${TRAIN_DIR}"
+mv "${DICT}"/*.test "${TEST_DIR}"
+rm -f "${DICT}"/*.dic
diff --git a/train_dic_creator.pl b/train_dic_creator.pl
index a8de6ea..1921a85 100755
--- a/train_dic_creator.pl
+++ b/train_dic_creator.pl
@@ -109,8 +109,8 @@ my @tail = @result[-$size..-1];
 my $train_file_name = $source_lang . '_' . $target_lang . '.train';
 my $test_file_name = $source_lang . '_' . $target_lang . '.test';
 
-open my $train_fh, '>', $train_file_name;
-open my $test_fh, '>', $test_file_name;
+open my $train_fh, '>', $dict_dir . $train_file_name or die "Cannot open $dict_dir$train_file_name: $!"; # abort instead of printing to an unopened handle
+open my $test_fh, '>', $dict_dir . $test_file_name or die "Cannot open $dict_dir$test_file_name: $!";
 
 print $train_fh join("\n", @head);
 print $test_fh join("\n", @tail);
-- 
cgit v1.2.3-70-g09d2