diff options
author | Yigit Sever | 2019-09-18 01:21:20 +0300 |
---|---|---|
committer | Yigit Sever | 2019-09-18 01:21:20 +0300 |
commit | 3843cc5f372f4264bb3cffff9d0c0b7deb703b32 (patch) | |
tree | 3d3b542730196fb35c068f7048896e241144158e | |
parent | ce45fffd0941d97897283a1d5f9f4f0790a54616 (diff) | |
download | Evaluating-Dictionary-Alignment-3843cc5f372f4264bb3cffff9d0c0b7deb703b32.tar.gz Evaluating-Dictionary-Alignment-3843cc5f372f4264bb3cffff9d0c0b7deb703b32.tar.bz2 Evaluating-Dictionary-Alignment-3843cc5f372f4264bb3cffff9d0c0b7deb703b32.zip |
Truncate embeddings
-rw-r--r-- | README.md | 12 | ||||
-rwxr-xr-x | get_embeddings.sh | 11 |
2 files changed, 21 insertions, 2 deletions
@@ -20,3 +20,15 @@ git clone https://github.com/yigitsever/Evaluating-Dictionary-Alignment.git && c | |||
20 | This will create two directories; `dictionaries` and `wordnets`. | 20 | This will create two directories; `dictionaries` and `wordnets`. |
21 | Linewise aligned definition files are in `wordnets/ready`. | 21 | Linewise aligned definition files are in `wordnets/ready`. |
22 | 22 | ||
23 | ## Acquiring The Embeddings | ||
24 | |||
25 | We use [VecMap](https://github.com/artetxem/vecmap) on [fastText](https://fasttext.cc/) embeddings. | ||
26 | You can skip this step if you are providing your own polylingual embeddings. | ||
27 | Otherwise, simply run: | ||
28 | |||
29 | ```bash | ||
30 | ./get_embeddings.sh | ||
31 | ``` | ||
32 | |||
33 | Bear in mind that this will require around 30 GB of free space. | ||
34 | |||
diff --git a/get_embeddings.sh b/get_embeddings.sh index aba2078..be354a6 100755 --- a/get_embeddings.sh +++ b/get_embeddings.sh | |||
@@ -39,8 +39,15 @@ echo "Extracting embeddings" | |||
39 | 39 | ||
40 | for lang_code in sq bg el it ro sl; do | 40 | for lang_code in sq bg el it ro sl; do |
41 | gunzip "${EMBS}/cc.${lang_code}.300.vec.gz" | 41 | gunzip "${EMBS}/cc.${lang_code}.300.vec.gz" |
42 | mv "${EMBS}/cc.${lang_code}.300.vec" "${EMBS}/${lang_code}.1M.vec" | 42 | mv "${EMBS}/cc.${lang_code}.300.vec" "${EMBS}/${lang_code}.vec" |
43 | done | 43 | done |
44 | 44 | ||
45 | unzip -q "${EMBS}/crawl-300d-2M.vec.zip" -d "${EMBS}" | 45 | unzip -q "${EMBS}/crawl-300d-2M.vec.zip" -d "${EMBS}" |
46 | mv "${EMBS}/crawl-300d-2M.vec" "${EMBS}/en.1M.vec" | 46 | mv "${EMBS}/crawl-300d-2M.vec" "${EMBS}/en.vec" |
47 | rm -f "${EMBS}/crawl-300d-2M.vec.zip" | ||
48 | |||
49 | # truncate each embedding file in place to line 1 plus the next 500000 lines (top 500k tokens) for efficiency | ||
50 | for lang_code in bg en el it ro sl sq; do | ||
51 | sed -i '1,500001!d' "${EMBS}/${lang_code}.vec" # plain -i: GNU sed reads "-in" as -i with backup suffix "n", leaving a full-size .vecn copy behind | ||
52 | sed -i '1 s/^.*$/500000 300/' "${EMBS}/${lang_code}.vec" # rewrite line 1 to "500000 300" so it matches the 500000 rows kept above | ||
53 | done | ||