aboutsummaryrefslogtreecommitdiffstats
path: root/get_embeddings.sh
diff options
context:
space:
mode:
Diffstat (limited to 'get_embeddings.sh')
-rwxr-xr-xget_embeddings.sh43
1 files changed, 43 insertions, 0 deletions
diff --git a/get_embeddings.sh b/get_embeddings.sh
new file mode 100755
index 0000000..0c5d918
--- /dev/null
+++ b/get_embeddings.sh
@@ -0,0 +1,43 @@
1#!/bin/bash
2#
3# Copyright © 2019 Yiğit Sever <yigit.sever@tedu.edu.tr>
4#
5# Permission is hereby granted, free of charge, to any person obtaining
6# a copy of this software and associated documentation files (the "Software"),
7# to deal in the Software without restriction, including without limitation
8# the rights to use, copy, modify, merge, publish, distribute, sublicense,
9# and/or sell copies of the Software, and to permit persons to whom the
10# Software is furnished to do so, subject to the following conditions:
11#
12# The above copyright notice and this permission notice shall be included
13# in all copies or substantial portions of the Software.
14#
15# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
17# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
18# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
19# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
20# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
21# OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22#
23
# Fetch the pre-trained fastText word embeddings into ./embeddings (relative
# to the directory the script is started from) and unpack the Common Crawl
# vectors to "<lang>.1M.vec".

# Stop at the first failing command, unset variable or broken pipeline so a
# failed step cannot silently leave a bad embedding file behind.
set -euo pipefail

ROOT="$(pwd)"
EMBS="${ROOT}/embeddings"

readonly FASTTEXT_URL="https://dl.fbaipublicfiles.com/fasttext"

# Language codes of the Common Crawl vectors to fetch:
# Albanian, Bulgarian, Greek, Italian, Romanian, Slovenian.
readonly CC_LANGS=(sq bg el it ro sl)

#######################################
# Print an error message to stderr and abort the script.
# Arguments: $* - message text
#######################################
die() {
  printf 'get_embeddings: %s\n' "$*" >&2
  exit 1
}

#######################################
# Download one URL into ${EMBS}, keeping the remote file name.
# An archive that is already present is left untouched.
# Globals:   EMBS (read)
# Arguments: $1 - URL to download
# Returns:   0 on success; aborts the script when the download fails
#######################################
fetch() {
  local url=$1
  local target="${EMBS}/${url##*/}"

  if [[ -e "${target}" ]]; then
    return 0
  fi

  # Download under a temporary name and rename on success, so an interrupted
  # transfer is never mistaken for a complete archive on the next run.
  if ! wget -q -O "${target}.part" "${url}"; then
    rm -f -- "${target}.part"
    die "download failed: ${url}"
  fi
  mv -f -- "${target}.part" "${target}"
}

command -v wget >/dev/null || die "wget is required but was not found"
mkdir -p "${EMBS}"

echo "Downloading embeddings"

# The loop variable is deliberately not called LANG: assigning to LANG would
# change the locale of every command started afterwards.
for lang in "${CC_LANGS[@]}"; do
  # The archive is deleted after extraction, so its absence alone does not
  # mean the vectors are missing; skip languages that are already unpacked.
  if [[ -s "${EMBS}/${lang}.1M.vec" ]]; then
    continue
  fi
  fetch "${FASTTEXT_URL}/vectors-crawl/cc.${lang}.300.vec.gz"
done

# English vectors.
# NOTE(review): this archive is a .zip and nothing in this script unpacks
# it; confirm that a later step handles it.
fetch "${FASTTEXT_URL}/vectors-english/crawl-300d-2M.vec.zip"

echo "Extracting embeddings"

for lang in "${CC_LANGS[@]}"; do
  archive="${EMBS}/cc.${lang}.300.vec.gz"
  vectors="${EMBS}/${lang}.1M.vec"

  # No archive means this language was already unpacked by an earlier run.
  if [[ ! -e "${archive}" ]]; then
    continue
  fi

  # gunzip runs without -f so that input which is not gzip data (for example
  # a saved error page) is rejected instead of being copied through as-is.
  # The output goes to a temporary file first, so a failed extraction never
  # leaves an empty or truncated .vec under its final name.
  if ! gunzip -c "${archive}" > "${vectors}.tmp"; then
    rm -f -- "${vectors}.tmp"
    die "extraction failed: ${archive}"
  fi
  mv -f -- "${vectors}.tmp" "${vectors}"
  rm -f -- "${archive}"
done