diff options
author | Yigit Sever | 2019-09-18 01:21:20 +0300 |
---|---|---|
committer | Yigit Sever | 2019-09-18 01:21:20 +0300 |
commit | 3843cc5f372f4264bb3cffff9d0c0b7deb703b32 (patch) | |
tree | 3d3b542730196fb35c068f7048896e241144158e /get_embeddings.sh | |
parent | ce45fffd0941d97897283a1d5f9f4f0790a54616 (diff) | |
download | Evaluating-Dictionary-Alignment-3843cc5f372f4264bb3cffff9d0c0b7deb703b32.tar.gz Evaluating-Dictionary-Alignment-3843cc5f372f4264bb3cffff9d0c0b7deb703b32.tar.bz2 Evaluating-Dictionary-Alignment-3843cc5f372f4264bb3cffff9d0c0b7deb703b32.zip |
Truncate embeddings
Diffstat (limited to 'get_embeddings.sh')
-rwxr-xr-x | get_embeddings.sh | 11 |
1 file changed, 9 insertions, 2 deletions
diff --git a/get_embeddings.sh b/get_embeddings.sh index aba2078..be354a6 100755 --- a/get_embeddings.sh +++ b/get_embeddings.sh | |||
@@ -39,8 +39,15 @@ echo "Extracting embeddings" | |||
39 | 39 | ||
40 | for lang_code in sq bg el it ro sl; do | 40 | for lang_code in sq bg el it ro sl; do |
41 | gunzip "${EMBS}/cc.${lang_code}.300.vec.gz" | 41 | gunzip "${EMBS}/cc.${lang_code}.300.vec.gz" |
42 | mv "${EMBS}/cc.${lang_code}.300.vec" "${EMBS}/${lang_code}.1M.vec" | 42 | mv "${EMBS}/cc.${lang_code}.300.vec" "${EMBS}/${lang_code}.vec" |
43 | done | 43 | done |
44 | 44 | ||
45 | unzip -q "${EMBS}/crawl-300d-2M.vec.zip" -d "${EMBS}" | 45 | unzip -q "${EMBS}/crawl-300d-2M.vec.zip" -d "${EMBS}" |
46 | mv "${EMBS}/crawl-300d-2M.vec" "${EMBS}/en.1M.vec" | 46 | mv "${EMBS}/crawl-300d-2M.vec" "${EMBS}/en.vec" |
47 | rm -f "${EMBS}/crawl-300d-2M.vec.zip" | ||
48 | |||
49 | # truncate to top 500k tokens for efficiency | ||
50 | for lang_code in bg en el it ro sl sq; do | ||
51 | sed -in '1,500001!d' "${EMBS}/${lang_code}.vec" | ||
52 | sed -in '1 s/^.*$/500000 300/' "${EMBS}/${lang_code}.vec" | ||
53 | done | ||