From 3843cc5f372f4264bb3cffff9d0c0b7deb703b32 Mon Sep 17 00:00:00 2001 From: Yigit Sever Date: Wed, 18 Sep 2019 01:21:20 +0300 Subject: Truncate embeddings --- README.md | 12 ++++++++++++ get_embeddings.sh | 11 +++++++++-- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index c87fe29..cf75522 100644 --- a/README.md +++ b/README.md @@ -20,3 +20,15 @@ git clone https://github.com/yigitsever/Evaluating-Dictionary-Alignment.git && c This will create two directories; `dictionaries` and `wordnets`. Linewise aligned definition files are in `wordnets/ready`. +## Acquiring The Embeddings + +We use [VecMap](https://github.com/artetxem/vecmap) on [fastText](https://fasttext.cc/) embeddings. +You can skip this step if you are providing your own polylingual embeddings. +Otherwise simply run: + +```bash +./get_embeddings.sh +``` + +Bear in mind that this will require around 30 GB free space. + diff --git a/get_embeddings.sh b/get_embeddings.sh index aba2078..be354a6 100755 --- a/get_embeddings.sh +++ b/get_embeddings.sh @@ -39,8 +39,15 @@ echo "Extracting embeddings" for lang_code in sq bg el it ro sl; do gunzip "${EMBS}/cc.${lang_code}.300.vec.gz" - mv "${EMBS}/cc.${lang_code}.300.vec" "${EMBS}/${lang_code}.1M.vec" + mv "${EMBS}/cc.${lang_code}.300.vec" "${EMBS}/${lang_code}.vec" done unzip -q "${EMBS}/crawl-300d-2M.vec.zip" -d "${EMBS}" -mv "${EMBS}/crawl-300d-2M.vec" "${EMBS}/en.1M.vec" +mv "${EMBS}/crawl-300d-2M.vec" "${EMBS}/en.vec" +rm -f "${EMBS}/crawl-300d-2M.vec.zip" + +# truncate to top 500k tokens for efficiency +for lang_code in bg en el it ro sl sq; do + sed -in '1,500001!d' "${EMBS}/${lang_code}.vec" + sed -in '1 s/^.*$/500000 300/' "${EMBS}/${lang_code}.vec" +done -- cgit v1.2.3-70-g09d2