diff options
Diffstat (limited to 'get_embeddings.sh')
-rwxr-xr-x | get_embeddings.sh | 11 |
1 files changed, 9 insertions, 2 deletions
diff --git a/get_embeddings.sh b/get_embeddings.sh index aba2078..be354a6 100755 --- a/get_embeddings.sh +++ b/get_embeddings.sh | |||
@@ -39,8 +39,15 @@ echo "Extracting embeddings" | |||
39 | 39 | ||
#######################################
# Truncate a word-embedding text file in place to its first N vectors.
#
# The file is expected to start with a "<count> <dim>" header line followed
# by one vector per line. The header is rewritten so that <count> is the
# smaller of the declared count and N; <dim> is carried over unchanged.
#
# The function body is a subshell, so its variables do not leak to the caller.
#
# Arguments: $1 - path to the .vec file
#            $2 - maximum number of vectors to keep (default: 500000)
# Returns:   0 on success, non-zero if the file is missing or rewriting fails
#######################################
truncate_embeddings() (
  vec_file=$1
  max_tokens=${2:-500000}
  tmp_file="${vec_file}.tmp.$$"

  if [ ! -f "${vec_file}" ]; then
    printf 'truncate_embeddings: no such file: %s\n' "${vec_file}" >&2
    return 1
  fi

  # Single pass: rewrite the header, copy the first max_tokens vectors and
  # stop reading as soon as they are written. The result goes to a temporary
  # file that replaces the original only if awk succeeded, so no backup copy
  # is left behind. (The previous `sed -in` was parsed as `-i` with backup
  # suffix "n" and left a full-size "<file>n" copy next to every embedding.)
  if awk -v max="${max_tokens}" '
    NR == 1 {
      kept = ($1 + 0 < max + 0) ? $1 + 0 : max + 0
      print kept, $2
      next
    }
    NR > max + 1 { exit }
    { print }
  ' "${vec_file}" > "${tmp_file}"; then
    mv -- "${tmp_file}" "${vec_file}"
  else
    rm -f -- "${tmp_file}"
    return 1
  fi
)

# Decompress each per-language archive and give the result a short
# "<lang>.vec" name. The mv relies on gunzip having produced
# cc.<lang>.300.vec next to the archive.
for lang_code in sq bg el it ro sl; do
  gunzip "${EMBS}/cc.${lang_code}.300.vec.gz"
  mv "${EMBS}/cc.${lang_code}.300.vec" "${EMBS}/${lang_code}.vec"
done

# The English vectors come from a zip archive; unpack, rename, then delete
# the archive.
unzip -q "${EMBS}/crawl-300d-2M.vec.zip" -d "${EMBS}"
mv "${EMBS}/crawl-300d-2M.vec" "${EMBS}/en.vec"
rm -f "${EMBS}/crawl-300d-2M.vec.zip"

# truncate to top 500k tokens for efficiency
for lang_code in bg en el it ro sl sq; do
  truncate_embeddings "${EMBS}/${lang_code}.vec" 500000
done