aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--README.md12
-rwxr-xr-xget_embeddings.sh11
2 files changed, 21 insertions, 2 deletions
diff --git a/README.md b/README.md
index c87fe29..cf75522 100644
--- a/README.md
+++ b/README.md
@@ -20,3 +20,15 @@ git clone https://github.com/yigitsever/Evaluating-Dictionary-Alignment.git && c
20This will create two directories: `dictionaries` and `wordnets`. 20This will create two directories: `dictionaries` and `wordnets`.
21Linewise aligned definition files are in `wordnets/ready`. 21Linewise aligned definition files are in `wordnets/ready`.
22 22
23## Acquiring The Embeddings
24
25We use [VecMap](https://github.com/artetxem/vecmap) on [fastText](https://fasttext.cc/) embeddings.
26You can skip this step if you are providing your own polylingual embeddings.
27Otherwise, simply run:
28
29```bash
30./get_embeddings.sh
31```
32
33Bear in mind that this will require around 30 GB of free space.
34
diff --git a/get_embeddings.sh b/get_embeddings.sh
index aba2078..be354a6 100755
--- a/get_embeddings.sh
+++ b/get_embeddings.sh
@@ -39,8 +39,15 @@ echo "Extracting embeddings"
39 39
40for lang_code in sq bg el it ro sl; do 40for lang_code in sq bg el it ro sl; do
41 gunzip "${EMBS}/cc.${lang_code}.300.vec.gz" 41 gunzip "${EMBS}/cc.${lang_code}.300.vec.gz"
42 mv "${EMBS}/cc.${lang_code}.300.vec" "${EMBS}/${lang_code}.1M.vec" 42 mv "${EMBS}/cc.${lang_code}.300.vec" "${EMBS}/${lang_code}.vec"
43done 43done
44 44
45unzip -q "${EMBS}/crawl-300d-2M.vec.zip" -d "${EMBS}" 45unzip -q "${EMBS}/crawl-300d-2M.vec.zip" -d "${EMBS}"
46mv "${EMBS}/crawl-300d-2M.vec" "${EMBS}/en.1M.vec" 46mv "${EMBS}/crawl-300d-2M.vec" "${EMBS}/en.vec"
47rm -f "${EMBS}/crawl-300d-2M.vec.zip"
48
49# truncate to top 500k tokens for efficiency
50for lang_code in bg en el it ro sl sq; do
51 sed -in '1,500001!d' "${EMBS}/${lang_code}.vec"
52 sed -in '1 s/^.*$/500000 300/' "${EMBS}/${lang_code}.vec"
53done