about | summary | refs | log | tree | commit | diff | stats
path: root/get_embeddings.sh
diff options
context:
space:
mode:
Diffstat (limited to 'get_embeddings.sh')
-rwxr-xr-x get_embeddings.sh 11
1 files changed, 9 insertions, 2 deletions
diff --git a/get_embeddings.sh b/get_embeddings.sh
index aba2078..be354a6 100755
--- a/get_embeddings.sh
+++ b/get_embeddings.sh
@@ -39,8 +39,15 @@ echo "Extracting embeddings"
39 39
# Side-by-side diff rows: each row carries the old line (left) and the new line (right),
# each prefixed with its line number in the script.
# For every language code in the list, gunzip "${EMBS}/cc.<lang>.300.vec.gz" (gunzip replaces
# the .gz with the uncompressed .vec) and then rename the result.
# The only change in these rows is the rename target: "<lang>.1M.vec" (old) -> "<lang>.vec" (new).
40for lang_code in sq bg el it ro sl; do 40for lang_code in sq bg el it ro sl; do
41 gunzip "${EMBS}/cc.${lang_code}.300.vec.gz" 41 gunzip "${EMBS}/cc.${lang_code}.300.vec.gz"
42 mv "${EMBS}/cc.${lang_code}.300.vec" "${EMBS}/${lang_code}.1M.vec" 42 mv "${EMBS}/cc.${lang_code}.300.vec" "${EMBS}/${lang_code}.vec"
43done 43done
44 44
# Side-by-side diff rows (old on the left, new on the right; the last row exists only on the new side).
# Quietly (-q) extract crawl-300d-2M.vec.zip into ${EMBS} and rename the extracted file.
# Changes in these rows: the rename target goes from "en.1M.vec" (old) to "en.vec" (new),
# and the new revision additionally deletes the zip archive afterwards (rm -f: no error if absent).
45unzip -q "${EMBS}/crawl-300d-2M.vec.zip" -d "${EMBS}" 45unzip -q "${EMBS}/crawl-300d-2M.vec.zip" -d "${EMBS}"
46mv "${EMBS}/crawl-300d-2M.vec" "${EMBS}/en.1M.vec" 46mv "${EMBS}/crawl-300d-2M.vec" "${EMBS}/en.vec"
47rm -f "${EMBS}/crawl-300d-2M.vec.zip"
48
# Truncate each embedding file to its first 500k tokens for efficiency.
#
# NOTE: the previous form `sed -in '...'` was not "-i -n": both GNU and BSD sed
# parse it as -i with backup suffix "n", which left a stray "<lang>.vecn" copy
# next to every (multi-GB) embedding file. The helper below avoids `sed -i`
# altogether (its syntax differs between GNU and BSD sed) and does the work in
# a single pass that stops reading as soon as the last wanted line is printed.

#######################################
# Keep line 1 plus the first <max_tokens> following lines of a .vec file and
# rewrite line 1 to "<max_tokens> <dim>".
# Arguments:
#   $1 - path of the .vec file to truncate in place
#   $2 - number of token lines to keep (default: 500000)
#   $3 - dimension written into the first line (default: 300)
# Returns:
#   0 on success; non-zero if sed or mv fails (the input file is then left
#   untouched and the temporary file is removed).
# Assumes the file holds at least <max_tokens> token lines; a shorter file is
# kept whole but its first line will still claim <max_tokens>.
#######################################
# The body is a subshell "( ... )" so the helper's variables stay private
# without relying on the non-POSIX `local` keyword.
truncate_vec() (
  vec_file=$1
  max_tokens=${2:-500000}
  dim=${3:-300}
  tmp_file="${vec_file}.tmp"

  # 1s/.../  -> overwrite the first line with the new count and dimension
  # <N+1>q   -> print line N+1 (line 1 + N tokens) and stop reading the file
  if sed -e "1s/^.*\$/${max_tokens} ${dim}/" \
         -e "$((max_tokens + 1))q" \
         -- "${vec_file}" > "${tmp_file}"; then
    mv -- "${tmp_file}" "${vec_file}"
  else
    rm -f -- "${tmp_file}"
    return 1
  fi
)

for lang_code in bg en el it ro sl sq; do
  # A failure yields a non-zero status; under `set -e` this aborts the script.
  truncate_vec "${EMBS}/${lang_code}.vec" 500000 300
done