From 6430ff4318ef2350cce3ae6d5fbe9e0a551d9c4f Mon Sep 17 00:00:00 2001 From: Yigit Sever Date: Mon, 16 Sep 2019 22:21:38 +0300 Subject: Dictionary creation done --- get_data.sh | 26 +++++++++++++++++++++++++- train_dic_creator.pl | 4 ++-- 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/get_data.sh b/get_data.sh index f6298b1..71b2c65 100755 --- a/get_data.sh +++ b/get_data.sh @@ -23,7 +23,7 @@ # ROOT="$(pwd)" -WNET="${ROOT}/data" +WNET="${ROOT}/wordnets" mkdir -p "${WNET}" echo "Downloading wordnet data" @@ -63,3 +63,27 @@ wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/el-sq.dic.g wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/it-ro.dic.gz -P "${DICT}" # Italian - Romanian wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/ro-sl.dic.gz -P "${DICT}" # Romanian - Slovenian wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/ro-sq.dic.gz -P "${DICT}" # Romanian - Albanian + +for FILE in ${DICT}/*; do + gunzip -q "${FILE}" +done + +export LC_CTYPE=en_US.UTF-8 +export LC_ALL=en_US.UTF-8 + +echo "Creating dictionaries" + +for PAIR in en,bg en,el en,it, en,ro, en,sl en,sq, bg,el bg,it bg,ro el,it el,ro el,sq it,ro ro,sl ro,sq; do + IFS=',' read -r source_lang target_lang <<< "${PAIR}" + perl "${ROOT}/train_dic_creator.pl" "${source_lang}" "${target_lang}" +done + +TRAIN_DIR="${DICT}/train" +TEST_DIR="${DICT}/test" + +mkdir -p "${TRAIN_DIR}" +mkdir -p "${TEST_DIR}" + +mv ${DICT}/*.train ${TRAIN_DIR} +mv ${DICT}/*.test ${TEST_DIR} +rm -f ${DICT}/*.dic diff --git a/train_dic_creator.pl b/train_dic_creator.pl index a8de6ea..1921a85 100755 --- a/train_dic_creator.pl +++ b/train_dic_creator.pl @@ -109,8 +109,8 @@ my @tail = @result[-$size..-1]; my $train_file_name = $source_lang . '_' . $target_lang . '.train'; my $test_file_name = $source_lang . '_' . $target_lang . 
'.test'; -open my $train_fh, '>', $train_file_name; -open my $test_fh, '>', $test_file_name; +open my $train_fh, '>', $dict_dir . $train_file_name; +open my $test_fh, '>', $dict_dir . $test_file_name; print $train_fh join("\n", @head); print $test_fh join("\n", @tail); -- cgit v1.2.3-70-g09d2