From f9bc1db3284f2bf05f7b1cf55ab912363c6e8440 Mon Sep 17 00:00:00 2001 From: Yigit Sever Date: Thu, 19 Sep 2019 00:17:16 +0300 Subject: Use variables as much as possible --- get_data.sh | 1 + get_embeddings.sh | 7 ++++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/get_data.sh b/get_data.sh index 2d04678..4f97ad5 100755 --- a/get_data.sh +++ b/get_data.sh @@ -67,6 +67,7 @@ mv "${ROOT}"/*.def "${READY}" echo "Downloading dictionaries" DICT="${ROOT}/dictionaries" +mkdir -p "${DICT}" wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/en-sq.dic.gz -P "${DICT}" # English - Albanian wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/bg-en.dic.gz -P "${DICT}" # Bulgarian - English diff --git a/get_embeddings.sh b/get_embeddings.sh index daf839b..47a7a8e 100755 --- a/get_embeddings.sh +++ b/get_embeddings.sh @@ -25,6 +25,7 @@ set -o errexit -o pipefail -o noclobber -o nounset ROOT="$(pwd)" EMBS="${ROOT}/embeddings" +DICT="${ROOT}/dictionaries" mkdir -p "${EMBS}" echo "Downloading embeddings" @@ -58,15 +59,15 @@ if [ ! "$(ls -A "${ROOT}/vecmap/")" ]; then echo "VecMap directory seems empty, did you run git submodule init && git submodule update?"; exit fi -if [ ! -d "${ROOT}/dictionaries" ]; then +if [ ! -d "${DICT}" ]; then echo "Dictionaries directory does not exist, did you run ./get_data.sh?"; exit fi -if [ ! "$(ls -A "${ROOT}/dictionaries/")" ]; then +if [ ! 
"$(ls -A "${DICT}")" ]; then echo "Dictionaries directory seems empty, did you run ./get_data.sh?"; exit fi -TRAIN_DIC_DIR="${ROOT}/dictionaries/train" +TRAIN_DIC_DIR="${DICT}/train" MAP_TO="${ROOT}/bilingual_embeddings" mkdir -p "${MAP_TO}" -- cgit v1.2.3-70-g09d2 From 4ffdd84d704d9040cd493b0ca8e53fb15278be26 Mon Sep 17 00:00:00 2001 From: Yigit Sever Date: Thu, 19 Sep 2019 00:17:45 +0300 Subject: Include demo script, untested --- demo.sh | 85 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) create mode 100644 demo.sh diff --git a/demo.sh b/demo.sh new file mode 100644 index 0000000..51346f3 --- /dev/null +++ b/demo.sh @@ -0,0 +1,85 @@ +#!/bin/bash +# +# Copyright © 2019 Yiğit Sever +# +# Permission is hereby granted, free of charge, to any person obtaining +# a copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE +# OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+#
+# Demo: runs the whole pipeline for the English-Bulgarian pair only.
+
+set -o errexit -o pipefail -o noclobber -o nounset # stop at the first failure
+
+ROOT="$(pwd)"
+SCRIPTS="${ROOT}/scripts"
+WNET="${ROOT}/wordnets"
+EMBS="${ROOT}/embeddings"
+DICT="${ROOT}/dictionaries"
+MAP_TO="${ROOT}/bilingual_embeddings"
+
+TRAIN_DIR="${DICT}/train"
+TEST_DIR="${DICT}/test"
+TAB_DIR="${WNET}/tab_files"
+READY="${WNET}/ready"
+
+# Wordnet: fetch the archive, run the prep scripts, collect *.def in ${READY}.
+mkdir -p "${WNET}"
+wget -nc -q http://compling.hss.ntu.edu.sg/omw/wns/bul.zip -P "${WNET}"
+unzip -o -q "${WNET}/bul.zip" -d "${WNET}"
+mkdir -p "${TAB_DIR}"
+"${SCRIPTS}/tab_creator.pl" "${WNET}/bul/wn-data-bul.tab" "${TAB_DIR}"
+python "${SCRIPTS}/prep_lookup.py" -s "en" -t "bg"
+mkdir -p "${READY}"
+mv "${ROOT}"/*.def "${READY}"
+
+# Dictionary: fetch bg-en, run train_dic_creator.pl, sort *.train / *.test.
+mkdir -p "${DICT}"
+wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/bg-en.dic.gz -P "${DICT}" # Bulgarian - English
+gunzip -q "${DICT}/bg-en.dic.gz"
+
+# NOTE(review): the UTF-8 locale is presumably for the Perl step below; confirm.
+export LC_CTYPE=en_US.UTF-8
+export LC_ALL=en_US.UTF-8
+perl "${SCRIPTS}/train_dic_creator.pl" "en" "bg" "${DICT}"
+mkdir -p "${TRAIN_DIR}"
+mkdir -p "${TEST_DIR}"
+mv "${DICT}"/*.train "${TRAIN_DIR}"
+mv "${DICT}"/*.test "${TEST_DIR}"
+rm -f "${DICT}"/*.dic
+
+# Embeddings: decompress first, only then rename to <lang>.vec.
+wget -nc -q https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bg.300.vec.gz -P "${EMBS}" # Bulgarian
+wget -nc -q https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip -P "${EMBS}" # English
+gunzip "${EMBS}/cc.bg.300.vec.gz"
+mv "${EMBS}/cc.bg.300.vec" "${EMBS}/bg.vec"
+unzip -q "${EMBS}/crawl-300d-2M.vec.zip" -d "${EMBS}"
+mv "${EMBS}/crawl-300d-2M.vec" "${EMBS}/en.vec"
+
+# Keep line 1 plus the next 500000 lines, then rewrite line 1 as "500000 300".
+for lang_code in bg en; do
+    sed -i '1,500001!d' "${EMBS}/${lang_code}.vec"
+    sed -i '1 s/^.*$/500000 300/' "${EMBS}/${lang_code}.vec"
+done
+
+# Supervised mapping with VecMap; both mapped outputs are written to ${MAP_TO}.
+mkdir -p "${MAP_TO}"
+python "${ROOT}/vecmap/map_embeddings.py" --supervised \
+    "${TRAIN_DIR}/en_bg.train" \
+    "${EMBS}/en.vec" \
+    "${EMBS}/bg.vec" \
+    "${MAP_TO}/en_to_bg.vec" \
+    "${MAP_TO}/bg_to_en.vec" > /dev/null 2>&1
-- 
cgit v1.2.3-70-g09d2