From 3843cc5f372f4264bb3cffff9d0c0b7deb703b32 Mon Sep 17 00:00:00 2001 From: Yigit Sever Date: Wed, 18 Sep 2019 01:21:20 +0300 Subject: Truncate embeddings --- README.md | 12 ++++++++++++ get_embeddings.sh | 11 +++++++++-- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index c87fe29..cf75522 100644 --- a/README.md +++ b/README.md @@ -20,3 +20,15 @@ git clone https://github.com/yigitsever/Evaluating-Dictionary-Alignment.git && c This will create two directories; `dictionaries` and `wordnets`. Linewise aligned definition files are in `wordnets/ready`. +## Acquiring The Embeddings + +We use [VecMap](https://github.com/artetxem/vecmap) on [fastText](https://fasttext.cc/) embeddings. +You can skip this step if you are providing your own polylingual embeddings. +Otherwise simply run: + +```bash +./get_embeddings.sh +``` + +Bear in mind that this will require around 30 GB free space. + diff --git a/get_embeddings.sh b/get_embeddings.sh index aba2078..be354a6 100755 --- a/get_embeddings.sh +++ b/get_embeddings.sh @@ -39,8 +39,15 @@ echo "Extracting embeddings" for lang_code in sq bg el it ro sl; do gunzip "${EMBS}/cc.${lang_code}.300.vec.gz" - mv "${EMBS}/cc.${lang_code}.300.vec" "${EMBS}/${lang_code}.1M.vec" + mv "${EMBS}/cc.${lang_code}.300.vec" "${EMBS}/${lang_code}.vec" done unzip -q "${EMBS}/crawl-300d-2M.vec.zip" -d "${EMBS}" -mv "${EMBS}/crawl-300d-2M.vec" "${EMBS}/en.1M.vec" +mv "${EMBS}/crawl-300d-2M.vec" "${EMBS}/en.vec" +rm -f "${EMBS}/crawl-300d-2M.vec.zip" + +# truncate to top 500k tokens for efficiency +for lang_code in bg en el it ro sl sq; do + sed -in '1,500001!d' "${EMBS}/${lang_code}.vec" + sed -in '1 s/^.*$/500000 300/' "${EMBS}/${lang_code}.vec" +done -- cgit v1.2.3-70-g09d2