Diffstat (limited to 'sentence_embedding.py')
 sentence_embedding.py | 179 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 179 insertions(+), 0 deletions(-)
diff --git a/sentence_embedding.py b/sentence_embedding.py
new file mode 100644
index 0000000..0cd5361
--- /dev/null
+++ b/sentence_embedding.py
@@ -0,0 +1,179 @@
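"""Align a pair of dictionaries using sentence-embedding representations.

Each definition is embedded as the tf-idf-weighted combination of its word
vectors; source and target definitions are then aligned by one-to-one
matching (Jonker-Volgenant assignment via lapjv), by nearest-neighbour
retrieval over the similarity matrix, or both.

Example invocation (file names are illustrative):
    python sentence_embedding.py en it en.vec it.vec en.defs it.defs -n 1000 matching
"""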
import argparse
import csv
import random

import numpy as np
from lapjv import lapjv
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import normalize

from Wasserstein_Distance import load_embeddings, process_corpus


def main(args):

    run_paradigm = []

    if args.paradigm == "all":
        run_paradigm.extend(["matching", "retrieval"])
    else:
        run_paradigm.append(args.paradigm)

    source_lang = args.source_lang
    target_lang = args.target_lang
    batch = args.batch

    source_vectors_filename = args.source_vector
    target_vectors_filename = args.target_vector

    vectors_source = load_embeddings(source_vectors_filename)
    vectors_target = load_embeddings(target_vectors_filename)

    source_defs_filename = args.source_defs
    target_defs_filename = args.target_defs
    with open(source_defs_filename, encoding="utf8") as f:
        defs_source = [line.rstrip("\n") for line in f]
    with open(target_defs_filename, encoding="utf8") as f:
        defs_target = [line.rstrip("\n") for line in f]

    clean_source_corpus, clean_source_vectors, source_keys = process_corpus(
        set(vectors_source.keys()), defs_source, vectors_source, source_lang
    )

    clean_target_corpus, clean_target_vectors, target_keys = process_corpus(
        set(vectors_target.keys()), defs_target, vectors_target, target_lang
    )

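    # Keep only definitions whose index survives cleaning in both languages,
    # so the sampled source and target corpora stay aligned row-for-row.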
    take = args.instances
    common_keys = set(source_keys).intersection(set(target_keys))
    take = min(len(common_keys), take)  # you can't sample more than length
    experiment_keys = random.sample(sorted(common_keys), take)

    instances = len(experiment_keys)

    clean_source_corpus = [clean_source_corpus[key] for key in experiment_keys]
    clean_target_corpus = [clean_target_corpus[key] for key in experiment_keys]

    if not batch:
        print(
            f"{source_lang} - {target_lang} "
            f"document sizes: {len(clean_source_corpus)}, {len(clean_target_corpus)}"
        )

    del vectors_source, vectors_target, defs_source, defs_target

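    # Shared vocabulary: words seen in either corpus that have a vector in at
    # least one of the two embedding spaces; each gets one row in W_common.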
    vocab_counter = CountVectorizer().fit(clean_source_corpus + clean_target_corpus)
    common = [
        w
        for w in vocab_counter.get_feature_names_out()  # get_feature_names() on scikit-learn < 1.0
        if w in clean_source_vectors or w in clean_target_vectors
    ]
    W_common = []

    for w in common:
        if w in clean_source_vectors:
            W_common.append(np.array(clean_source_vectors[w]))
        else:
            W_common.append(np.array(clean_target_vectors[w]))

    W_common = np.array(W_common)
    W_common = normalize(W_common)  # default norm is l2

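    # Sentence embedding: each definition is represented as the tf-idf-weighted
    # sum of its (l2-normalised) word vectors, restricted to the shared
    # vocabulary.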
    vect_tfidf = TfidfVectorizer(vocabulary=common, dtype=np.double, norm="l2")
    vect_tfidf.fit(clean_source_corpus + clean_target_corpus)
    X_idf_source = vect_tfidf.transform(clean_source_corpus)
    X_idf_target = vect_tfidf.transform(clean_target_corpus)

    X_idf_source_array = X_idf_source.toarray()
    X_idf_target_array = X_idf_target.toarray()
    S_emb_source = np.matmul(X_idf_source_array, W_common)
    S_emb_target = np.matmul(X_idf_target_array, W_common)

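    # Pairwise similarities: entry (i, j) is the dot product between source
    # definition i and target definition j.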
    S_emb_target_transpose = np.transpose(S_emb_target)

    cost_matrix = np.matmul(S_emb_source, S_emb_target_transpose)

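    # Evaluate each requested paradigm on the same similarity matrix.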
    for paradigm in run_paradigm:
        if paradigm == "matching":

            # lapjv minimises cost, so negate (and scale) the similarities;
            # work on a copy so the retrieval paradigm still sees the
            # original similarity matrix
            matching_costs = cost_matrix * -1000
            row_ind, col_ind, _ = lapjv(matching_costs, verbose=False)

            result = zip(row_ind, col_ind)
            hit_at_one = len([x for x, y in result if x == y])
            percentage = hit_at_one / instances * 100

            if not batch:
                print(f"{hit_at_one} definitions have been matched correctly")

            if batch:
                fields = [
                    f"{source_lang}",
                    f"{target_lang}",
                    f"{instances}",
                    f"{hit_at_one}",
                    f"{percentage}",
                ]

                with open("semb_matching_results.csv", "a") as f:
                    writer = csv.writer(f)
                    writer.writerow(fields)

        if paradigm == "retrieval":

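            # Retrieval: for each source definition take the most similar
            # target; a hit is an argmax that lands on the diagonal.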
            hit_at_one = len([x for x, y in enumerate(cost_matrix.argmax(axis=1)) if x == y])
            percentage = hit_at_one / instances * 100

            if not batch:
                print(f"{hit_at_one} definitions have been retrieved correctly")

            if batch:
                fields = [
                    f"{source_lang}",
                    f"{target_lang}",
                    f"{instances}",
                    f"{hit_at_one}",
                    f"{percentage}",
                ]

                with open("semb_retrieval_results.csv", "a") as f:
                    writer = csv.writer(f)
                    writer.writerow(fields)


if __name__ == "__main__":

    parser = argparse.ArgumentParser(
        description="align dictionaries using sentence embedding representation"
    )
    parser.add_argument("source_lang", help="source language short name")
    parser.add_argument("target_lang", help="target language short name")
    parser.add_argument("source_vector", help="path of the source vector")
    parser.add_argument("target_vector", help="path of the target vector")
    parser.add_argument("source_defs", help="path of the source definitions")
    parser.add_argument("target_defs", help="path of the target definitions")
    parser.add_argument(
        "-n",
        "--instances",
        help="number of instances in each language to retrieve",
        default=1000,
        type=int,
    )
    parser.add_argument(
        "-b",
        "--batch",
        action="store_true",
        help="running in batch (store results in csv) or "
        "running a single instance (output the results)",
    )
    parser.add_argument(
        "paradigm",
        choices=["all", "retrieval", "matching"],
        default="all",
        nargs="?",
        help="which paradigm(s) to align with",
    )

    args = parser.parse_args()
    main(args)