diff options
author | Yigit Sever | 2019-09-18 18:30:42 +0300 |
---|---|---|
committer | Yigit Sever | 2019-09-18 18:30:42 +0300 |
commit | c42d42e9eb450343f8077c953bc85fadf2a657fa (patch) | |
tree | 3963206811f40b7c6b3354e26df3666e78e93db4 | |
parent | e70d946dba8fe8c145f18273930f28d84f144e0c (diff) | |
download | Evaluating-Dictionary-Alignment-c42d42e9eb450343f8077c953bc85fadf2a657fa.tar.gz Evaluating-Dictionary-Alignment-c42d42e9eb450343f8077c953bc85fadf2a657fa.tar.bz2 Evaluating-Dictionary-Alignment-c42d42e9eb450343f8077c953bc85fadf2a657fa.zip |
Add initial instructions for mapping vectors
-rw-r--r-- | README.md | 12 | ||||
-rwxr-xr-x | get_embeddings.sh | 29 |
2 files changed, 40 insertions, 1 deletion
@@ -24,7 +24,17 @@ Linewise aligned definition files are in `wordnets/ready`. | |||
24 | 24 | ||
25 | We use [VecMap](https://github.com/artetxem/vecmap) on [fastText](https://fasttext.cc/) embeddings. | 25 | We use [VecMap](https://github.com/artetxem/vecmap) on [fastText](https://fasttext.cc/) embeddings. |
26 | You can skip this step if you are providing your own polylingual embeddings. | 26 | You can skip this step if you are providing your own polylingual embeddings. |
27 | Otherwise simply run; | 27 | Otherwise; |
28 | |||
29 | * initialize and update the VecMap submodule; | ||
30 | |||
31 | ```bash | ||
32 | git submodule init && git submodule update | ||
33 | ``` | ||
34 | |||
35 | * make sure `./get_data.sh` has already been run and the `dictionaries` folder is present. | ||
36 | |||
37 | * run; | ||
28 | 38 | ||
29 | ```bash | 39 | ```bash |
30 | ./get_embeddings.sh | 40 | ./get_embeddings.sh |
diff --git a/get_embeddings.sh b/get_embeddings.sh index 66af5af..225122f 100755 --- a/get_embeddings.sh +++ b/get_embeddings.sh | |||
@@ -21,6 +21,8 @@ | |||
21 | # OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | 21 | # OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. |
22 | # | 22 | # |
23 | 23 | ||
24 | set -o errexit -o pipefail -o noclobber -o nounset | ||
25 | |||
24 | ROOT="$(pwd)" | 26 | ROOT="$(pwd)" |
25 | EMBS="${ROOT}/embeddings" | 27 | EMBS="${ROOT}/embeddings" |
26 | mkdir -p "${EMBS}" | 28 | mkdir -p "${EMBS}" |
@@ -51,3 +53,30 @@ for lang_code in bg en el it ro sl sq; do | |||
51 | sed -i '1,500001!d' "${EMBS}/${lang_code}.vec" | 53 | sed -i '1,500001!d' "${EMBS}/${lang_code}.vec" |
52 | sed -i '1 s/^.*$/500000 300/' "${EMBS}/${lang_code}.vec" | 54 | sed -i '1 s/^.*$/500000 300/' "${EMBS}/${lang_code}.vec" |
53 | done | 55 | done |
56 | |||
57 | if [ ! "$(ls -A "${ROOT}/vecmap/")" ]; then | ||
58 | echo "VecMap directory seems empty, did you run git submodule init && git submodule update?"; exit | ||
59 | fi | ||
60 | |||
61 | if [ ! -d "${ROOT}/dictionaries" ]; then | ||
62 | echo "Dictionaries directory does not exist, did you run ./get_data.sh?"; exit | ||
63 | fi | ||
64 | |||
65 | if [ ! "$(ls -A "${ROOT}/dictionaries/")" ]; then | ||
66 | echo "Dictionaries directory seems empty, did you run ./get_data.sh?"; exit | ||
67 | fi | ||
68 | |||
69 | TRAIN_DIC_DIR="${ROOT}/dictionaries/train" | ||
70 | MAP_TO="${ROOT}/bilingual_embeddings" | ||
71 | |||
72 | mkdir -p "${MAP_TO}" | ||
73 | |||
74 | for i in en,bg en,el en,it, en,ro, en,sl en,sq, bg,el bg,it bg,ro el,it el,ro el,sq it,ro ro,sl ro,sq; do | ||
75 | IFS=',' read -r source_lang target_lang <<< "${i}" | ||
76 | python "${ROOT}/vecmap/map_embeddings.py" --supervised \ | ||
77 | "${TRAIN_DIC_DIR}/${source_lang}_${target_lang}.dic" \ | ||
78 | "${EMBS}/${source_lang}.vec" \ | ||
79 | "${EMBS}/${target_lang}.vec" \ | ||
80 | "${MAP_TO}/${source_lang}_to_${target_lang}.vec" \ | ||
81 | "${MAP_TO}/${target_lang}_to_${source_lang}.vec" > /dev/null | ||
82 | done | ||