From c42d42e9eb450343f8077c953bc85fadf2a657fa Mon Sep 17 00:00:00 2001
From: Yigit Sever
Date: Wed, 18 Sep 2019 18:30:42 +0300
Subject: Add initial instructions for mapping vectors

---
 README.md         | 12 +++++++++++-
 get_embeddings.sh | 38 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 49 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index cf75522..6e246a9 100644
--- a/README.md
+++ b/README.md
@@ -24,7 +24,17 @@ Linewise aligned definition files are in `wordnets/ready`.
 
 We use [VecMap](https://github.com/artetxem/vecmap) on [fastText](https://fasttext.cc/) embeddings.
 You can skip this step if you are providing your own polylingual embeddings.
-Otherwise simply run;
+Otherwise;
+
+* initialize and update the VecMap submodule;
+
+```bash
+git submodule init && git submodule update
+```
+
+* make sure `./get_data.sh` has already been run and the `dictionaries` folder is present.
+
+* run;
 
 ```bash
 ./get_embeddings.sh
diff --git a/get_embeddings.sh b/get_embeddings.sh
index 66af5af..225122f 100755
--- a/get_embeddings.sh
+++ b/get_embeddings.sh
@@ -21,6 +21,8 @@
 # OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #
 
+set -o errexit -o pipefail -o noclobber -o nounset
+
 ROOT="$(pwd)"
 EMBS="${ROOT}/embeddings"
 mkdir -p "${EMBS}"
@@ -51,3 +53,39 @@ for lang_code in bg en el it ro sl sq; do
     sed -i '1,500001!d' "${EMBS}/${lang_code}.vec"
     sed -i '1 s/^.*$/500000 300/' "${EMBS}/${lang_code}.vec"
 done
+
+# Preconditions for the mapping step: the VecMap submodule must be populated
+# and the dictionaries directory must exist and be non-empty. Each failed
+# check reports on stderr and exits non-zero (a bare `exit` would return 0).
+if [ -z "$(ls -A "${ROOT}/vecmap/")" ]; then
+    echo "VecMap directory seems empty, did you run git submodule init && git submodule update?" >&2
+    exit 1
+fi
+
+if [ ! -d "${ROOT}/dictionaries" ]; then
+    echo "Dictionaries directory does not exist, did you run ./get_data.sh?" >&2
+    exit 1
+fi
+
+if [ -z "$(ls -A "${ROOT}/dictionaries/")" ]; then
+    echo "Dictionaries directory seems empty, did you run ./get_data.sh?" >&2
+    exit 1
+fi
+
+TRAIN_DIC_DIR="${ROOT}/dictionaries/train"
+MAP_TO="${ROOT}/bilingual_embeddings"
+
+mkdir -p "${MAP_TO}"
+
+# Run VecMap in supervised mode for each "source,target" language pair. Every
+# pair yields two files in ${MAP_TO}: <source>_to_<target>.vec and
+# <target>_to_<source>.vec. The standard output of the mapper is discarded.
+for i in en,bg en,el en,it en,ro en,sl en,sq bg,el bg,it bg,ro el,it el,ro el,sq it,ro ro,sl ro,sq; do
+    IFS=',' read -r source_lang target_lang <<< "${i}"
+    python "${ROOT}/vecmap/map_embeddings.py" --supervised \
+        "${TRAIN_DIC_DIR}/${source_lang}_${target_lang}.dic" \
+        "${EMBS}/${source_lang}.vec" \
+        "${EMBS}/${target_lang}.vec" \
+        "${MAP_TO}/${source_lang}_to_${target_lang}.vec" \
+        "${MAP_TO}/${target_lang}_to_${source_lang}.vec" > /dev/null
+done
-- 
cgit v1.2.3-70-g09d2