Truncate embeddings

author: Yigit Sever 2019-09-18 01:21:20 +0300
committer: Yigit Sever 2019-09-18 01:21:20 +0300
commit: 3843cc5f372f4264bb3cffff9d0c0b7deb703b32 (patch)
tree: 3d3b542730196fb35c068f7048896e241144158e
parent: ce45fffd0941d97897283a1d5f9f4f0790a54616 (diff)
download: Evaluating-Dictionary-Alignment-3843cc5f372f4264bb3cffff9d0c0b7deb703b32.tar.gz
Evaluating-Dictionary-Alignment-3843cc5f372f4264bb3cffff9d0c0b7deb703b32.tar.bz2
Evaluating-Dictionary-Alignment-3843cc5f372f4264bb3cffff9d0c0b7deb703b32.zip
2 files changed, 21 insertions, 2 deletions
diff --git a/README.md b/README.md
index c87fe29..cf75522 100644
--- a/README.md
+++ b/README.md
@@ -20,3 +20,15 @@ git clone https://github.com/yigitsever/Evaluating-Dictionary-Alignment.git && c
 This will create two directories; `dictionaries` and `wordnets`.
 Linewise aligned definition files are in `wordnets/ready`.
 
+## Acquiring The Embeddings
+
+We use [VecMap](https://github.com/artetxem/vecmap) on [fastText](https://fasttext.cc/) embeddings.
+You can skip this step if you are providing your own polylingual embeddings.
+Otherwise simply run;
+
+```bash
+./get_embeddings.sh
+```
+
+Bear in mind that this will require around 30 GB free space.
+
diff --git a/get_embeddings.sh b/get_embeddings.sh
index aba2078..be354a6 100755
--- a/get_embeddings.sh
+++ b/get_embeddings.sh
@@ -39,8 +39,15 @@ echo "Extracting embeddings"
 
 for lang_code in sq bg el it ro sl; do
         gunzip "${EMBS}/cc.${lang_code}.300.vec.gz"
-        mv "${EMBS}/cc.${lang_code}.300.vec" "${EMBS}/${lang_code}.1M.vec"
+        mv "${EMBS}/cc.${lang_code}.300.vec" "${EMBS}/${lang_code}.vec"
 done
 
 unzip -q "${EMBS}/crawl-300d-2M.vec.zip" -d "${EMBS}"
-mv "${EMBS}/crawl-300d-2M.vec" "${EMBS}/en.1M.vec"
+mv "${EMBS}/crawl-300d-2M.vec" "${EMBS}/en.vec"
+rm -f "${EMBS}/crawl-300d-2M.vec.zip"
+
+# truncate to top 500k tokens for efficiency
+for lang_code in bg en el it ro sl sq; do
+    sed -in '1,500001!d' "${EMBS}/${lang_code}.vec"
+    sed -in '1 s/^.*$/500000 300/' "${EMBS}/${lang_code}.vec"
+done
author	Yigit Sever	2019-09-18 01:21:20 +0300
committer	Yigit Sever	2019-09-18 01:21:20 +0300
commit	3843cc5f372f4264bb3cffff9d0c0b7deb703b32 (patch)
tree	3d3b542730196fb35c068f7048896e241144158e
parent	ce45fffd0941d97897283a1d5f9f4f0790a54616 (diff)
download	Evaluating-Dictionary-Alignment-3843cc5f372f4264bb3cffff9d0c0b7deb703b32.tar.gz Evaluating-Dictionary-Alignment-3843cc5f372f4264bb3cffff9d0c0b7deb703b32.tar.bz2 Evaluating-Dictionary-Alignment-3843cc5f372f4264bb3cffff9d0c0b7deb703b32.zip

diff --git a/README.md b/README.md
index c87fe29..cf75522 100644
--- a/README.md
+++ b/README.md
@@ -20,3 +20,15 @@ git clone https://github.com/yigitsever/Evaluating-Dictionary-Alignment.git && c
20	This will create two directories; `dictionaries` and `wordnets`.	20	This will create two directories; `dictionaries` and `wordnets`.
21	Linewise aligned definition files are in `wordnets/ready`.	21	Linewise aligned definition files are in `wordnets/ready`.
22		22
		23	## Acquiring The Embeddings
		24
		25	We use [VecMap](https://github.com/artetxem/vecmap) on [fastText](https://fasttext.cc/) embeddings.
		26	You can skip this step if you are providing your own polylingual embeddings.
		27	Otherwise simply run;
		28
		29	```bash
		30	./get_embeddings.sh
		31	```
		32
		33	Bear in mind that this will require around 30 GB free space.
		34


diff --git a/get_embeddings.sh b/get_embeddings.sh
index aba2078..be354a6 100755
--- a/get_embeddings.sh
+++ b/get_embeddings.sh
@@ -39,8 +39,15 @@ echo "Extracting embeddings"
39		39
40	for lang_code in sq bg el it ro sl; do	40	for lang_code in sq bg el it ro sl; do
41	gunzip "${EMBS}/cc.${lang_code}.300.vec.gz"	41	gunzip "${EMBS}/cc.${lang_code}.300.vec.gz"
42	mv "${EMBS}/cc.${lang_code}.300.vec" "${EMBS}/${lang_code}.1M.vec"	42	mv "${EMBS}/cc.${lang_code}.300.vec" "${EMBS}/${lang_code}.vec"
43	done	43	done
44		44
45	unzip -q "${EMBS}/crawl-300d-2M.vec.zip" -d "${EMBS}"	45	unzip -q "${EMBS}/crawl-300d-2M.vec.zip" -d "${EMBS}"
46	mv "${EMBS}/crawl-300d-2M.vec" "${EMBS}/en.1M.vec"	46	mv "${EMBS}/crawl-300d-2M.vec" "${EMBS}/en.vec"
		47	rm -f "${EMBS}/crawl-300d-2M.vec.zip"
		48
		49	# truncate to top 500k tokens for efficiency
		50	for lang_code in bg en el it ro sl sq; do
		51	sed -in '1,500001!d' "${EMBS}/${lang_code}.vec"
		52	sed -in '1 s/^.*$/500000 300/' "${EMBS}/${lang_code}.vec"
		53	done