diff options
| -rw-r--r-- | README.md | 12 | ||||
| -rwxr-xr-x | get_embeddings.sh | 29 |
2 files changed, 40 insertions, 1 deletions
| @@ -24,7 +24,17 @@ Linewise aligned definition files are in `wordnets/ready`. | |||
| 24 | 24 | ||
| 25 | We use [VecMap](https://github.com/artetxem/vecmap) on [fastText](https://fasttext.cc/) embeddings. | 25 | We use [VecMap](https://github.com/artetxem/vecmap) on [fastText](https://fasttext.cc/) embeddings. |
| 26 | You can skip this step if you are providing your own polylingual embeddings. | 26 | You can skip this step if you are providing your own polylingual embeddings. |
| 27 | Otherwise simply run; | 27 | Otherwise; |
| 28 | |||
| 29 | * initialize and update the VecMap submodule; | ||
| 30 | |||
| 31 | ```bash | ||
| 32 | git submodule init && git submodule update | ||
| 33 | ``` | ||
| 34 | |||
| 35 | * make sure `./get_data.sh` has already been run and the `dictionaries` folder is present. | ||
| 36 | |||
| 37 | * run; | ||
| 28 | 38 | ||
| 29 | ```bash | 39 | ```bash |
| 30 | ./get_embeddings.sh | 40 | ./get_embeddings.sh |
diff --git a/get_embeddings.sh b/get_embeddings.sh index 66af5af..225122f 100755 --- a/get_embeddings.sh +++ b/get_embeddings.sh | |||
| @@ -21,6 +21,8 @@ | |||
| 21 | # OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | 21 | # OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. |
| 22 | # | 22 | # |
| 23 | 23 | ||
# Strict mode, one option per line:
#   errexit   - stop at the first command that fails
#   pipefail  - a pipeline fails if any of its stages fails
#   noclobber - `>` refuses to overwrite an existing regular file
#   nounset   - expanding an unset variable is an error
set -o errexit
set -o pipefail
set -o noclobber
set -o nounset

# All paths are resolved against the directory the script is invoked from.
ROOT="$(pwd)"

# Directory holding the per-language embedding files; created if absent.
EMBS="${ROOT}/embeddings"
mkdir -p "${EMBS}"
| @@ -51,3 +53,30 @@ for lang_code in bg en el it ro sl sq; do | |||
| 51 | sed -i '1,500001!d' "${EMBS}/${lang_code}.vec" | 53 | sed -i '1,500001!d' "${EMBS}/${lang_code}.vec" |
| 52 | sed -i '1 s/^.*$/500000 300/' "${EMBS}/${lang_code}.vec" | 54 | sed -i '1 s/^.*$/500000 300/' "${EMBS}/${lang_code}.vec" |
| 53 | done | 55 | done |
| 56 | |||
# ---------------------------------------------------------------------------
# Map the monolingual embeddings into shared spaces with VecMap.
#
# Reads:   ROOT, EMBS (set earlier in this script)
# Needs:   ${ROOT}/vecmap        - VecMap checkout (git submodule)
#          ${ROOT}/dictionaries  - non-empty; the messages below point the
#                                  user at ./get_data.sh to create it
# Writes:  ${ROOT}/bilingual_embeddings/<src>_to_<trg>.vec and
#          ${ROOT}/bilingual_embeddings/<trg>_to_<src>.vec for every pair
#
# Every precondition failure is reported on stderr and exits with status 1,
# so a calling script or CI job can tell that the mapping did not happen.
# ---------------------------------------------------------------------------

# The -d test comes first so `ls` is never run on a missing directory.
if [ ! -d "${ROOT}/vecmap" ] || [ -z "$(ls -A "${ROOT}/vecmap/")" ]; then
  echo "VecMap directory seems empty, did you run git submodule init && git submodule update?" >&2
  exit 1
fi

if [ ! -d "${ROOT}/dictionaries" ]; then
  echo "Dictionaries directory does not exist, did you run ./get_data.sh?" >&2
  exit 1
fi

if [ -z "$(ls -A "${ROOT}/dictionaries/")" ]; then
  echo "Dictionaries directory seems empty, did you run ./get_data.sh?" >&2
  exit 1
fi

TRAIN_DIC_DIR="${ROOT}/dictionaries/train"
MAP_TO="${ROOT}/bilingual_embeddings"

mkdir -p "${MAP_TO}"

# Each item is "<source>,<target>"; the pair is split on the comma below.
for lang_pair in \
  en,bg en,el en,it en,ro en,sl en,sq \
  bg,el bg,it bg,ro \
  el,it el,ro el,sq \
  it,ro \
  ro,sl ro,sq; do
  IFS=',' read -r source_lang target_lang <<< "${lang_pair}"
  # Supervised mapping using the training dictionary for this pair.
  # Only stdout is discarded; errors from VecMap still reach the terminal.
  python "${ROOT}/vecmap/map_embeddings.py" --supervised \
    "${TRAIN_DIC_DIR}/${source_lang}_${target_lang}.dic" \
    "${EMBS}/${source_lang}.vec" \
    "${EMBS}/${target_lang}.vec" \
    "${MAP_TO}/${source_lang}_to_${target_lang}.vec" \
    "${MAP_TO}/${target_lang}_to_${source_lang}.vec" > /dev/null
done
