aboutsummaryrefslogtreecommitdiffstats
path: root/get_embeddings.sh
diff options
context:
space:
mode:
author Yigit Sever 2019-09-18 01:21:20 +0300
committer Yigit Sever 2019-09-18 01:21:20 +0300
commit 3843cc5f372f4264bb3cffff9d0c0b7deb703b32 (patch)
tree 3d3b542730196fb35c068f7048896e241144158e /get_embeddings.sh
parent ce45fffd0941d97897283a1d5f9f4f0790a54616 (diff)
download Evaluating-Dictionary-Alignment-3843cc5f372f4264bb3cffff9d0c0b7deb703b32.tar.gz
Evaluating-Dictionary-Alignment-3843cc5f372f4264bb3cffff9d0c0b7deb703b32.tar.bz2
Evaluating-Dictionary-Alignment-3843cc5f372f4264bb3cffff9d0c0b7deb703b32.zip
Truncate embeddings
Diffstat (limited to 'get_embeddings.sh')
-rwxr-xr-x get_embeddings.sh 11
1 files changed, 9 insertions, 2 deletions
diff --git a/get_embeddings.sh b/get_embeddings.sh
index aba2078..be354a6 100755
--- a/get_embeddings.sh
+++ b/get_embeddings.sh
@@ -39,8 +39,15 @@ echo "Extracting embeddings"
39 39
# Decompress each per-language archive in place (gunzip replaces the .gz with
# the uncompressed file), then rename the result to "<lang_code>.vec".
for lang_code in sq bg el it ro sl; do
  gunzip "${EMBS}/cc.${lang_code}.300.vec.gz"
  mv "${EMBS}/cc.${lang_code}.300.vec" "${EMBS}/${lang_code}.vec"
done

# The English vectors come as a zip archive: unpack it into ${EMBS}, rename the
# extracted file to match the other languages, and remove the archive.
unzip -q "${EMBS}/crawl-300d-2M.vec.zip" -d "${EMBS}"
mv "${EMBS}/crawl-300d-2M.vec" "${EMBS}/en.vec"
rm -f "${EMBS}/crawl-300d-2M.vec.zip"
48
#######################################
# Truncate an embedding text file in place to a header line plus the first
# MAX_TOKENS entries, and overwrite the header with "<MAX_TOKENS> <DIM>".
#
# The earlier form used `sed -in`, which sed parses as `-i` with backup
# suffix "n" (not as `-i -n`), so every run left a "<file>n" backup copy
# behind. This version writes through a temporary file instead, so no backup
# is created, and it stops reading as soon as the last kept line is printed.
#
# NOTE(review): the header is rewritten unconditionally, so the file is
# assumed to hold at least MAX_TOKENS entries of DIM dimensions - confirm.
#
# Arguments: $1 - path to the .vec file
#            $2 - number of entries to keep (default 500000)
#            $3 - vector dimensionality written to the header (default 300)
# Returns:   0 on success, 1 if the file is missing or sed fails
#######################################
truncate_vec_file() {
  local vec_file=$1
  local max_tokens=${2:-500000}
  local dim=${3:-300}
  local tmp_file="${vec_file}.tmp"

  if [ ! -f "${vec_file}" ]; then
    printf 'truncate_vec_file: no such file: %s\n' "${vec_file}" >&2
    return 1
  fi

  # Line 1 is the header; lines 2..max_tokens+1 are the entries to keep.
  # `q` prints the current line and then quits, so nothing past it is read.
  if sed -e "1s/^.*\$/${max_tokens} ${dim}/" -e "$((max_tokens + 1))q" \
    "${vec_file}" > "${tmp_file}"; then
    mv -- "${tmp_file}" "${vec_file}"
  else
    rm -f -- "${tmp_file}"
    return 1
  fi
}

# truncate to top 500k tokens for efficiency
for lang_code in bg en el it ro sl sq; do
  truncate_vec_file "${EMBS}/${lang_code}.vec"
done