aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rwxr-xr-xget_data.sh65
-rwxr-xr-xget_embeddings.sh43
2 files changed, 108 insertions, 0 deletions
diff --git a/get_data.sh b/get_data.sh
new file mode 100755
index 0000000..f6298b1
--- /dev/null
+++ b/get_data.sh
@@ -0,0 +1,65 @@
1#!/bin/bash
2#
3# Copyright © 2019 Yiğit Sever <[email protected]>
4#
5# Permission is hereby granted, free of charge, to any person obtaining
6# a copy of this software and associated documentation files (the "Software"),
7# to deal in the Software without restriction, including without limitation
8# the rights to use, copy, modify, merge, publish, distribute, sublicense,
9# and/or sell copies of the Software, and to permit persons to whom the
10# Software is furnished to do so, subject to the following conditions:
11#
12# The above copyright notice and this permission notice shall be included
13# in all copies or substantial portions of the Software.
14#
15# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
17# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
18# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
19# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
20# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
21# OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22#
23#
24
25ROOT="$(pwd)"
26WNET="${ROOT}/data"
27mkdir -p "${WNET}"
28
29echo "Downloading wordnet data"
30
31wget -nc -q http://compling.hss.ntu.edu.sg/omw/wns/als.zip -P "${WNET}"
32wget -nc -q http://compling.hss.ntu.edu.sg/omw/wns/bul.zip -P "${WNET}"
33wget -nc -q http://compling.hss.ntu.edu.sg/omw/wns/ell.zip -P "${WNET}"
34wget -nc -q http://compling.hss.ntu.edu.sg/omw/wns/ita.zip -P "${WNET}"
35wget -nc -q http://compling.hss.ntu.edu.sg/omw/wns/ron.zip -P "${WNET}"
36wget -nc -q http://compling.hss.ntu.edu.sg/omw/wns/slv.zip -P "${WNET}"
37
38echo "Unzipping wordnet data"
39
40for LANG in als bul ell ita ron slv; do
41 unzip -ofq "${WNET}/${LANG}" -d "${WNET}"
42 rm -f "${WNET}/${LANG}.zip"
43done
44
45rm -rf "${WNET}/ita" # comes alongside iwn, not useful for us
46
47echo "Downloading dictionaries"
48
49DICT="${ROOT}/dictionaries"
50
51wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/en-sq.dic.gz -P "${DICT}" # English - Albanian
52wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/bg-en.dic.gz -P "${DICT}" # Bulgarian - English
53wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/el-en.dic.gz -P "${DICT}" # Greek - English
54wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/en-it.dic.gz -P "${DICT}" # English - Italian
55wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/en-ro.dic.gz -P "${DICT}" # English - Romanian
56wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/en-sl.dic.gz -P "${DICT}" # English - Slovenian
57wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/bg-el.dic.gz -P "${DICT}" # Bulgarian - Greek
58wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/bg-it.dic.gz -P "${DICT}" # Bulgarian - Italian
59wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/bg-ro.dic.gz -P "${DICT}" # Bulgarian - Romanian
60wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/el-it.dic.gz -P "${DICT}" # Greek - Italian
61wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/el-ro.dic.gz -P "${DICT}" # Greek - Romanian
62wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/el-sq.dic.gz -P "${DICT}" # Greek - Albanian
63wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/it-ro.dic.gz -P "${DICT}" # Italian - Romanian
64wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/ro-sl.dic.gz -P "${DICT}" # Romanian - Albanian
65wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/ro-sq.dic.gz -P "${DICT}" # Romanian - Albanian
diff --git a/get_embeddings.sh b/get_embeddings.sh
new file mode 100755
index 0000000..0c5d918
--- /dev/null
+++ b/get_embeddings.sh
@@ -0,0 +1,43 @@
1#!/bin/bash
2#
3# Copyright © 2019 Yiğit Sever <[email protected]>
4#
5# Permission is hereby granted, free of charge, to any person obtaining
6# a copy of this software and associated documentation files (the "Software"),
7# to deal in the Software without restriction, including without limitation
8# the rights to use, copy, modify, merge, publish, distribute, sublicense,
9# and/or sell copies of the Software, and to permit persons to whom the
10# Software is furnished to do so, subject to the following conditions:
11#
12# The above copyright notice and this permission notice shall be included
13# in all copies or substantial portions of the Software.
14#
15# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
17# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
18# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
19# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
20# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
21# OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22#
23
24ROOT="$(pwd)"
25EMBS="${ROOT}/embeddings"
26mkdir -p "${EMBS}"
27
28echo "Downloading embeddings"
29
30wget -nc -q https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sq.300.vec.gz -P "${EMBS}" # Albanian
31wget -nc -q https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bg.300.vec.gz -P "${EMBS}" # Bulgarian
32wget -nc -q https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip -P "${EMBS}" # English
33wget -nc -q https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.el.300.vec.gz -P "${EMBS}" # Greek
34wget -nc -q https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.it.300.vec.gz -P "${EMBS}" # Italian
35wget -nc -q https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ro.300.vec.gz -P "${EMBS}" # Romanian
36wget -nc -q https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sl.300.vec.gz -P "${EMBS}" # Slovenian
37
38echo "Extracting embeddings"
39
40for LANG in sq bg el it ro sl; do
41 gunzip -fc "${EMBS}/cc.${LANG}.300.vec.gz" > "${EMBS}/${LANG}.1M.vec"
42 rm -f "${EMBS}/cc.${LANG}.300.vec.gz"
43done