1 file changed, 11 insertions, 10 deletions
diff --git a/demo.sh b/demo.sh
index 51346f3..acb7c2f 100644
--- a/demo.sh
+++ b/demo.sh
@@ -30,53 +30,54 @@ DICT="${ROOT}/dictionaries"
 TRAIN_DIR="${DICT}/train"
 TEST_DIR="${DICT}/test"
 TAB_DIR="${WNET}/tab_files"
 READY="${WNET}/ready"
-mkdir -p "${WNET}"
+# create wordnets directory and download a single wordnet
+mkdir -p "${WNET}"
 wget -nc -q http://compling.hss.ntu.edu.sg/omw/wns/bul.zip -P "${WNET}"
 unzip -o -q "${WNET}/bul.zip" -d "${WNET}"
+# create tab directory and export a single .tab file
 mkdir -p "${TAB_DIR}"
 "${SCRIPTS}/tab_creator.pl" "${WNET}/bul/wn-data-bul.tab" "${TAB_DIR}"
-python "${SCRIPTS}/prep_lookup.py" -s "en" -t "bg"
+# create ready directory and create two .def files
 mkdir -p "${READY}"
+python "${SCRIPTS}/prep_lookup.py" -s "en" -t "bg"
 mv "${ROOT}"/*.def "${READY}"
+# create dictionaries directory and download a single dictionary
 mkdir -p "${DICT}"
 wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/bg-en.dic.gz -P "${DICT}" # Bulgarian - English
 gunzip -q "${DICT}/bg-en.dic.gz"
 export LC_CTYPE=en_US.UTF-8
 export LC_ALL=en_US.UTF-8
+# create a train and a test seed lexicon
 perl "${SCRIPTS}/train_dic_creator.pl" "en" "bg" "${DICT}"
 mkdir -p "${TRAIN_DIR}"
 mkdir -p "${TEST_DIR}"
 mv "${DICT}"/*.train "${TRAIN_DIR}"
 mv "${DICT}"/*.test "${TEST_DIR}"
 rm -f "${DICT}"/*.dic
+# download two monolingual embeddings
 wget -nc -q https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bg.300.vec.gz -P "${EMBS}" # Bulgarian
 wget -nc -q https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip -P "${EMBS}" # English
-mv "${EMBS}/cc.bg.300.vec" "${EMBS}/bg.vec"
 gunzip "${EMBS}/cc.bg.300.vec.gz"
 mv "${EMBS}/cc.bg.300.vec" "${EMBS}/bg.vec"
 unzip -q "${EMBS}/crawl-300d-2M.vec.zip" -d "${EMBS}"
 mv "${EMBS}/crawl-300d-2M.vec" "${EMBS}/en.vec"
+# truncate two embeddings
 for lang_code in bg en; do
-    sed -i '1,500001!d' "${EMBS}/${lang_code}.vec"
+    sed -i '1,500001!d' "${EMBS}/${lang_code}.vec" # one line on top for the <number of tokens> <dimensions>
    sed -i '1 s/^.*$/500000 300/' "${EMBS}/${lang_code}.vec"
 done
+# map two embeddings
 python "${ROOT}/vecmap/map_embeddings.py" --supervised \
    "${TRAIN_DIC_DIR}/en_bg.train" \
    "${EMBS}/en.vec" \

diff --git a/demo.sh b/demo.sh
index 51346f3..acb7c2f 100644
--- a/demo.sh
+++ b/demo.sh
@@ -30,53 +30,54 @@ DICT="${ROOT}/dictionaries"
30		30
31	TRAIN_DIR="${DICT}/train"	31	TRAIN_DIR="${DICT}/train"
32	TEST_DIR="${DICT}/test"	32	TEST_DIR="${DICT}/test"
33
34	TAB_DIR="${WNET}/tab_files"	33	TAB_DIR="${WNET}/tab_files"
35	READY="${WNET}/ready"	34	READY="${WNET}/ready"
36	mkdir -p "${WNET}"
37		35
		36	# create wordnets directory and download a single wordnet
		37	mkdir -p "${WNET}"
38	wget -nc -q http://compling.hss.ntu.edu.sg/omw/wns/bul.zip -P "${WNET}"	38	wget -nc -q http://compling.hss.ntu.edu.sg/omw/wns/bul.zip -P "${WNET}"
39	unzip -o -q "${WNET}/bul.zip" -d "${WNET}"	39	unzip -o -q "${WNET}/bul.zip" -d "${WNET}"
40		40
		41	# create tab directory and export a single .tab file
41	mkdir -p "${TAB_DIR}"	42	mkdir -p "${TAB_DIR}"
42	"${SCRIPTS}/tab_creator.pl" "${WNET}/bul/wn-data-bul.tab" "${TAB_DIR}"	43	"${SCRIPTS}/tab_creator.pl" "${WNET}/bul/wn-data-bul.tab" "${TAB_DIR}"
43		44
44	python "${SCRIPTS}/prep_lookup.py" -s "en" -t "bg"	45	# create ready directory and create two .def files
45
46	mkdir -p "${READY}"	46	mkdir -p "${READY}"
		47	python "${SCRIPTS}/prep_lookup.py" -s "en" -t "bg"
47	mv "${ROOT}"/*.def "${READY}"	48	mv "${ROOT}"/*.def "${READY}"
48		49
		50	# create dictionaries directory and download a single dictionary
49	mkdir -p "${DICT}"	51	mkdir -p "${DICT}"
50
51	wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/bg-en.dic.gz -P "${DICT}" # Bulgarian - English	52	wget -nc -q https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/dic/bg-en.dic.gz -P "${DICT}" # Bulgarian - English
52	gunzip -q "${DICT}/bg-en.dic.gz"	53	gunzip -q "${DICT}/bg-en.dic.gz"
53		54
54	export LC_CTYPE=en_US.UTF-8	55	export LC_CTYPE=en_US.UTF-8
55	export LC_ALL=en_US.UTF-8	56	export LC_ALL=en_US.UTF-8
56		57
		58	# create a train and a test seed lexicon
57	perl "${SCRIPTS}/train_dic_creator.pl" "en" "bg" "${DICT}"	59	perl "${SCRIPTS}/train_dic_creator.pl" "en" "bg" "${DICT}"
58
59	mkdir -p "${TRAIN_DIR}"	60	mkdir -p "${TRAIN_DIR}"
60	mkdir -p "${TEST_DIR}"	61	mkdir -p "${TEST_DIR}"
61
62	mv "${DICT}"/*.train "${TRAIN_DIR}"	62	mv "${DICT}"/*.train "${TRAIN_DIR}"
63	mv "${DICT}"/*.test "${TEST_DIR}"	63	mv "${DICT}"/*.test "${TEST_DIR}"
64	rm -f "${DICT}"/*.dic	64	rm -f "${DICT}"/*.dic
65		65
		66	# download two monolingual embeddings
66	wget -nc -q https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bg.300.vec.gz -P "${EMBS}" # Bulgarian	67	wget -nc -q https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bg.300.vec.gz -P "${EMBS}" # Bulgarian
67	wget -nc -q https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip -P "${EMBS}" # English	68	wget -nc -q https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip -P "${EMBS}" # English
68	mv "${EMBS}/cc.bg.300.vec" "${EMBS}/bg.vec"
69	gunzip "${EMBS}/cc.bg.300.vec.gz"	69	gunzip "${EMBS}/cc.bg.300.vec.gz"
70	mv "${EMBS}/cc.bg.300.vec" "${EMBS}/bg.vec"	70	mv "${EMBS}/cc.bg.300.vec" "${EMBS}/bg.vec"
71	unzip -q "${EMBS}/crawl-300d-2M.vec.zip" -d "${EMBS}"	71	unzip -q "${EMBS}/crawl-300d-2M.vec.zip" -d "${EMBS}"
72	mv "${EMBS}/crawl-300d-2M.vec" "${EMBS}/en.vec"	72	mv "${EMBS}/crawl-300d-2M.vec" "${EMBS}/en.vec"
73		73
74		74	# truncate two embeddings
75	for lang_code in bg en; do	75	for lang_code in bg en; do
76	sed -i '1,500001!d' "${EMBS}/${lang_code}.vec"	76	sed -i '1,500001!d' "${EMBS}/${lang_code}.vec" # one line on top for the <number of tokens> <dimensions>
77	sed -i '1 s/^.*$/500000 300/' "${EMBS}/${lang_code}.vec"	77	sed -i '1 s/^.*$/500000 300/' "${EMBS}/${lang_code}.vec"
78	done	78	done
79		79
		80	# map two embeddings
80	python "${ROOT}/vecmap/map_embeddings.py" --supervised \	81	python "${ROOT}/vecmap/map_embeddings.py" --supervised \
81	"${TRAIN_DIC_DIR}/en_bg.train" \	82	"${TRAIN_DIC_DIR}/en_bg.train" \
82	"${EMBS}/en.vec" \	83	"${EMBS}/en.vec" \