diff options
Diffstat (limited to 'get_embeddings.sh')
-rwxr-xr-x | get_embeddings.sh | 43 |
1 files changed, 43 insertions, 0 deletions
diff --git a/get_embeddings.sh b/get_embeddings.sh new file mode 100755 index 0000000..0c5d918 --- /dev/null +++ b/get_embeddings.sh | |||
@@ -0,0 +1,43 @@ | |||
1 | #!/bin/bash | ||
2 | # | ||
3 | # Copyright © 2019 Yiğit Sever <yigit.sever@tedu.edu.tr> | ||
4 | # | ||
5 | # Permission is hereby granted, free of charge, to any person obtaining | ||
6 | # a copy of this software and associated documentation files (the "Software"), | ||
7 | # to deal in the Software without restriction, including without limitation | ||
8 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, | ||
9 | # and/or sell copies of the Software, and to permit persons to whom the | ||
10 | # Software is furnished to do so, subject to the following conditions: | ||
11 | # | ||
12 | # The above copyright notice and this permission notice shall be included | ||
13 | # in all copies or substantial portions of the Software. | ||
14 | # | ||
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
16 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES | ||
17 | # OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. | ||
18 | # IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, | ||
19 | # DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, | ||
20 | # TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE | ||
21 | # OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | ||
22 | # | ||
23 | |||
# Download the fastText word embeddings used by this project into
# "<current directory>/embeddings" and unpack the gzip-compressed ones.
#
# Outputs (all inside the embeddings directory):
#   <code>.1M.vec            one per language code listed in CC_LANG_CODES;
#                            the matching cc.<code>.300.vec.gz is removed
#                            once it has been unpacked successfully
#   crawl-300d-2M.vec.zip    English archive, kept exactly as downloaded
#
# The script stops with a non-zero status as soon as a download or an
# extraction fails, instead of leaving empty or truncated .vec files behind.

set -euo pipefail

ROOT="$(pwd)"
EMBS="${ROOT}/embeddings"

readonly FASTTEXT_URL="https://dl.fbaipublicfiles.com/fasttext"

# Common Crawl vectors to fetch and unpack:
# sq = Albanian, bg = Bulgarian, el = Greek, it = Italian,
# ro = Romanian, sl = Slovenian.
readonly -a CC_LANG_CODES=(sq bg el it ro sl)

# Print an error message to stderr and abort the script.
die() {
  printf 'get_embeddings: %s\n' "$*" >&2
  exit 1
}

# Download one archive into ${EMBS} unless it is already present.
# Arguments: $1 - URL of the archive
fetch() {
  local url=$1
  local archive="${EMBS}/${url##*/}"

  # Explicit "already there" check so the result does not depend on how
  # wget reports a skipped download under -nc.
  if [[ -e "${archive}" ]]; then
    return 0
  fi

  if ! wget -nc -q "${url}" -P "${EMBS}"; then
    # The file did not exist before this call, so anything left on disk is
    # a partial download that a later run would mistake for a complete one.
    rm -f -- "${archive}"
    die "download failed: ${url}"
  fi
}

# Unpack cc.<code>.300.vec.gz to <code>.1M.vec and delete the archive.
# Arguments: $1 - language code
extract() {
  local lang_code=$1
  local archive="${EMBS}/cc.${lang_code}.300.vec.gz"
  local target="${EMBS}/${lang_code}.1M.vec"
  local partial="${target}.part"

  # Unpack next to the target and rename afterwards, so an interrupted or
  # failed gunzip never leaves a truncated file under the final name.
  if ! gunzip -fc "${archive}" > "${partial}"; then
    rm -f -- "${partial}"
    die "extraction failed: ${archive}"
  fi
  mv -f -- "${partial}" "${target}"
  rm -f -- "${archive}"
}

mkdir -p "${EMBS}"

echo "Downloading embeddings"

# The loop variable is deliberately not called LANG: that name is the
# locale environment variable, and assigning to it would change the locale
# of every child process started by this script.
for lang_code in "${CC_LANG_CODES[@]}"; do
  fetch "${FASTTEXT_URL}/vectors-crawl/cc.${lang_code}.300.vec.gz"
done

# NOTE(review): the English archive is only downloaded; nothing in this
# script unpacks it. Confirm that a later step handles the .zip.
fetch "${FASTTEXT_URL}/vectors-english/crawl-300d-2M.vec.zip"

echo "Extracting embeddings"

# NOTE(review): the output name says "1M", but the archive is unpacked in
# full here. Presumably a later step limits the vocabulary; confirm.
for lang_code in "${CC_LANG_CODES[@]}"; do
  extract "${lang_code}"
done