aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorYigit Sever2019-09-18 18:30:42 +0300
committerYigit Sever2019-09-18 18:30:42 +0300
commitc42d42e9eb450343f8077c953bc85fadf2a657fa (patch)
tree3963206811f40b7c6b3354e26df3666e78e93db4
parente70d946dba8fe8c145f18273930f28d84f144e0c (diff)
downloadEvaluating-Dictionary-Alignment-c42d42e9eb450343f8077c953bc85fadf2a657fa.tar.gz
Evaluating-Dictionary-Alignment-c42d42e9eb450343f8077c953bc85fadf2a657fa.tar.bz2
Evaluating-Dictionary-Alignment-c42d42e9eb450343f8077c953bc85fadf2a657fa.zip
Add initial instructions for mapping vectors
-rw-r--r--README.md12
-rwxr-xr-xget_embeddings.sh29
2 files changed, 40 insertions, 1 deletions
diff --git a/README.md b/README.md
index cf75522..6e246a9 100644
--- a/README.md
+++ b/README.md
@@ -24,7 +24,17 @@ Linewise aligned definition files are in `wordnets/ready`.
24 24
25We use [VecMap](https://github.com/artetxem/vecmap) on [fastText](https://fasttext.cc/) embeddings. 25We use [VecMap](https://github.com/artetxem/vecmap) on [fastText](https://fasttext.cc/) embeddings.
26You can skip this step if you are providing your own polylingual embeddings. 26You can skip this step if you are providing your own polylingual embeddings.
27Otherwise simply run; 27Otherwise;
28
29* initialize and update the VecMap submodule;
30
31```bash
32git submodule init && git submodule update
33```
34
35* make sure `./get_data.sh` has already been run and the `dictionaries` folder is present.
36
37* run;
28 38
29```bash 39```bash
30./get_embeddings.sh 40./get_embeddings.sh
diff --git a/get_embeddings.sh b/get_embeddings.sh
index 66af5af..225122f 100755
--- a/get_embeddings.sh
+++ b/get_embeddings.sh
@@ -21,6 +21,8 @@
21# OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21# OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22# 22#
23 23
24set -o errexit -o pipefail -o noclobber -o nounset
25
24ROOT="$(pwd)" 26ROOT="$(pwd)"
25EMBS="${ROOT}/embeddings" 27EMBS="${ROOT}/embeddings"
26mkdir -p "${EMBS}" 28mkdir -p "${EMBS}"
@@ -51,3 +53,30 @@ for lang_code in bg en el it ro sl sq; do
51 sed -i '1,500001!d' "${EMBS}/${lang_code}.vec" 53 sed -i '1,500001!d' "${EMBS}/${lang_code}.vec"
52 sed -i '1 s/^.*$/500000 300/' "${EMBS}/${lang_code}.vec" 54 sed -i '1 s/^.*$/500000 300/' "${EMBS}/${lang_code}.vec"
53done 55done
56
# Preconditions for the mapping step: the VecMap submodule must be checked
# out and the dictionaries must already be downloaded. Every failed check
# prints a hint to stderr and aborts with a non-zero status, so a calling
# script or CI job can tell that the mapping never ran.
fail() {
  printf '%s\n' "$1" >&2
  exit 1
}

# A missing directory produces no `ls -A` output, so it is reported the same
# way as an empty one; the raw `ls` error is silenced in favour of the hint.
if [ -z "$(ls -A "${ROOT}/vecmap/" 2> /dev/null)" ]; then
  fail "VecMap directory seems empty, did you run git submodule init && git submodule update?"
fi

if [ ! -d "${ROOT}/dictionaries" ]; then
  fail "Dictionaries directory does not exist, did you run ./get_data.sh?"
fi

if [ -z "$(ls -A "${ROOT}/dictionaries/")" ]; then
  fail "Dictionaries directory seems empty, did you run ./get_data.sh?"
fi
68
# Map every monolingual embedding pair into a shared space with VecMap.
#   TRAIN_DIC_DIR - directory holding the <source>_<target>.dic training dictionaries
#   MAP_TO        - output directory for the mapped embeddings
TRAIN_DIC_DIR="${ROOT}/dictionaries/train"
MAP_TO="${ROOT}/bilingual_embeddings"

mkdir -p "${MAP_TO}"

# Language pairs to align, written as "source,target".
lang_pairs=(
  en,bg en,el en,it en,ro en,sl en,sq
  bg,el bg,it bg,ro
  el,it el,ro el,sq
  it,ro
  ro,sl ro,sq
)

for lang_pair in "${lang_pairs[@]}"; do
  IFS=',' read -r source_lang target_lang <<< "${lang_pair}"
  # Supervised mapping: the training dictionary anchors the two spaces.
  # VecMap's stdout is discarded; errors still reach stderr.
  python "${ROOT}/vecmap/map_embeddings.py" --supervised \
    "${TRAIN_DIC_DIR}/${source_lang}_${target_lang}.dic" \
    "${EMBS}/${source_lang}.vec" \
    "${EMBS}/${target_lang}.vec" \
    "${MAP_TO}/${source_lang}_to_${target_lang}.vec" \
    "${MAP_TO}/${target_lang}_to_${source_lang}.vec" > /dev/null
done
82done