From c42d42e9eb450343f8077c953bc85fadf2a657fa Mon Sep 17 00:00:00 2001
From: Yigit Sever
Date: Wed, 18 Sep 2019 18:30:42 +0300
Subject: Add initial instructions for mapping vectors

---
 README.md         | 12 +++++++++++-
 get_embeddings.sh | 29 +++++++++++++++++++++++++++++
 2 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index cf75522..6e246a9 100644
--- a/README.md
+++ b/README.md
@@ -24,7 +24,17 @@ Linewise aligned definition files are in `wordnets/ready`.
 
 We use [VecMap](https://github.com/artetxem/vecmap) on [fastText](https://fasttext.cc/) embeddings.
 You can skip this step if you are providing your own polylingual embeddings.
-Otherwise simply run;
+Otherwise;
+
+* initialize and update the VecMap submodule;
+
+```bash
+git submodule init && git submodule update
+```
+
+* make sure `./get_data.sh` has already been run and the `dictionaries` folder is present.
+
+* run;
 
 ```bash
 ./get_embeddings.sh
diff --git a/get_embeddings.sh b/get_embeddings.sh
index 66af5af..225122f 100755
--- a/get_embeddings.sh
+++ b/get_embeddings.sh
@@ -21,6 +21,8 @@
 # OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #
 
+set -o errexit -o pipefail -o noclobber -o nounset  # fail fast on errors, failed pipeline stages and unset variables; noclobber stops '>' from overwriting existing files
+
 ROOT="$(pwd)"
 EMBS="${ROOT}/embeddings"
 mkdir -p "${EMBS}"
@@ -51,3 +53,30 @@ for lang_code in bg en el it ro sl sq; do
     sed -i '1,500001!d' "${EMBS}/${lang_code}.vec"
     sed -i '1 s/^.*$/500000 300/' "${EMBS}/${lang_code}.vec"
 done
+
+# Preconditions for the mapping step: report on stderr and exit non-zero so callers see the failure.
+if [ ! "$(ls -A "${ROOT}/vecmap/")" ]; then
+    echo "VecMap directory seems empty, did you run git submodule init && git submodule update?" >&2; exit 1
+fi
+if [ ! -d "${ROOT}/dictionaries" ]; then
+    echo "Dictionaries directory does not exist, did you run ./get_data.sh?" >&2; exit 1
+fi
+# The -d test above already established the directory exists, so an empty listing means no entries.
+if [ ! "$(ls -A "${ROOT}/dictionaries/")" ]; then
+    echo "Dictionaries directory seems empty, did you run ./get_data.sh?" >&2; exit 1
+fi
+
+# Run VecMap in supervised mode per language pair, reading ${TRAIN_DIC_DIR} and writing into ${MAP_TO}.
+TRAIN_DIC_DIR="${ROOT}/dictionaries/train"
+MAP_TO="${ROOT}/bilingual_embeddings"
+mkdir -p "${MAP_TO}"
+# Each item is "source,target"; no trailing commas, so the split never depends on how read treats a dangling delimiter.
+for i in en,bg en,el en,it en,ro en,sl en,sq bg,el bg,it bg,ro el,it el,ro el,sq it,ro ro,sl ro,sq; do
+    IFS=',' read -r source_lang target_lang <<< "${i}"
+    python "${ROOT}/vecmap/map_embeddings.py" --supervised \
+        "${TRAIN_DIC_DIR}/${source_lang}_${target_lang}.dic" \
+        "${EMBS}/${source_lang}.vec" \
+        "${EMBS}/${target_lang}.vec" \
+        "${MAP_TO}/${source_lang}_to_${target_lang}.vec" \
+        "${MAP_TO}/${target_lang}_to_${source_lang}.vec" > /dev/null
+done
-- 
cgit v1.2.3-70-g09d2