From 3843cc5f372f4264bb3cffff9d0c0b7deb703b32 Mon Sep 17 00:00:00 2001
From: Yigit Sever
Date: Wed, 18 Sep 2019 01:21:20 +0300
Subject: Truncate embeddings

---
Review notes (text here is ignored when the patch is applied):
  * The truncation loop originally called `sed -in ...` twice per file. sed
    reads `-in` as `-i` with backup suffix "n", not as `-i -n`, so every call
    left a `<lang>.vecn` backup copy beside the file it edited. The loop now
    does a single awk pass through a temp file instead.
  * The header is rewritten from the file's own "<count> <dim>" line rather
    than a hard-coded "500000 300".
  * The hunk was amended after the original commit, so the blob ids on the
    `index` line below still refer to the original revision.
  * Indentation of the context lines is assumed to be 4 spaces; apply with
    `patch -l` / `git apply --ignore-whitespace` if the target uses tabs.

 get_embeddings.sh | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

(limited to 'get_embeddings.sh')

diff --git a/get_embeddings.sh b/get_embeddings.sh
index aba2078..be354a6 100755
--- a/get_embeddings.sh
+++ b/get_embeddings.sh
@@ -39,8 +39,21 @@ echo "Extracting embeddings"
 
 for lang_code in sq bg el it ro sl; do
     gunzip "${EMBS}/cc.${lang_code}.300.vec.gz"
-    mv "${EMBS}/cc.${lang_code}.300.vec" "${EMBS}/${lang_code}.1M.vec"
+    mv "${EMBS}/cc.${lang_code}.300.vec" "${EMBS}/${lang_code}.vec"
 done
 
 unzip -q "${EMBS}/crawl-300d-2M.vec.zip" -d "${EMBS}"
-mv "${EMBS}/crawl-300d-2M.vec" "${EMBS}/en.1M.vec"
+mv "${EMBS}/crawl-300d-2M.vec" "${EMBS}/en.vec"
+rm -f "${EMBS}/crawl-300d-2M.vec.zip"
+
+# truncate to top 500k tokens for efficiency: rewrite the "<count> <dim>"
+# header, keep the first 500k vectors and stop reading; awk writes to a
+# temp file so no sed -i backup copies are left behind
+for lang_code in bg en el it ro sl sq; do
+    vec="${EMBS}/${lang_code}.vec"
+    awk -v limit=500000 '
+        NR == 1 { n = ($1 + 0 < limit) ? $1 + 0 : limit; print n, $2; next }
+        NR > limit + 1 { exit }
+        { print }
+    ' "${vec}" > "${vec}.tmp" && mv "${vec}.tmp" "${vec}"
+done
-- 
cgit v1.2.3-70-g09d2