diff options
| -rw-r--r-- | README.md | 12 | ||||
| -rwxr-xr-x | get_embeddings.sh | 11 |
2 files changed, 21 insertions, 2 deletions
| @@ -20,3 +20,15 @@ git clone https://github.com/yigitsever/Evaluating-Dictionary-Alignment.git && c | |||
| 20 | This will create two directories; `dictionaries` and `wordnets`. | 20 | This will create two directories; `dictionaries` and `wordnets`. |
| 21 | Linewise aligned definition files are in `wordnets/ready`. | 21 | Linewise aligned definition files are in `wordnets/ready`. |
| 22 | 22 | ||
| 23 | ## Acquiring The Embeddings | ||
| 24 | |||
| 25 | We use [VecMap](https://github.com/artetxem/vecmap) on [fastText](https://fasttext.cc/) embeddings. | ||
| 26 | You can skip this step if you are providing your own polylingual embeddings. | ||
| 27 | Otherwise, simply run: | ||
| 28 | |||
| 29 | ```bash | ||
| 30 | ./get_embeddings.sh | ||
| 31 | ``` | ||
| 32 | |||
| 33 | Bear in mind that this will require around 30 GB of free disk space. | ||
| 34 | |||
diff --git a/get_embeddings.sh b/get_embeddings.sh index aba2078..be354a6 100755 --- a/get_embeddings.sh +++ b/get_embeddings.sh | |||
| @@ -39,8 +39,15 @@ echo "Extracting embeddings" | |||
| 39 | 39 | ||
| 40 | for lang_code in sq bg el it ro sl; do | 40 | for lang_code in sq bg el it ro sl; do |
| 41 | gunzip "${EMBS}/cc.${lang_code}.300.vec.gz" | 41 | gunzip "${EMBS}/cc.${lang_code}.300.vec.gz" |
| 42 | mv "${EMBS}/cc.${lang_code}.300.vec" "${EMBS}/${lang_code}.1M.vec" | 42 | mv "${EMBS}/cc.${lang_code}.300.vec" "${EMBS}/${lang_code}.vec" |
| 43 | done | 43 | done |
| 44 | 44 | ||
| 45 | unzip -q "${EMBS}/crawl-300d-2M.vec.zip" -d "${EMBS}" | 45 | unzip -q "${EMBS}/crawl-300d-2M.vec.zip" -d "${EMBS}" |
| 46 | mv "${EMBS}/crawl-300d-2M.vec" "${EMBS}/en.1M.vec" | 46 | mv "${EMBS}/crawl-300d-2M.vec" "${EMBS}/en.vec" |
| 47 | rm -f "${EMBS}/crawl-300d-2M.vec.zip" | ||
| 48 | |||
| 49 | # keep the header line plus the top 500k tokens, then rewrite the header count; use plain -i because '-in' means "-i with backup suffix n" and leaves a full-size *.vecn copy of every file | ||
| 50 | for lang_code in bg en el it ro sl sq; do | ||
| 51 | sed -i '1,500001!d' "${EMBS}/${lang_code}.vec" | ||
| 52 | sed -i '1 s/^.*$/500000 300/' "${EMBS}/${lang_code}.vec" | ||
| 53 | done | ||
