1 files changed, 46 insertions, 51 deletions
diff --git a/WMD_matching.py b/WMD_matching.py
index 2755d15..69ea10e 100644
--- a/WMD_matching.py
+++ b/WMD_matching.py
@@ -13,7 +13,7 @@ from Wasserstein_Distance import (WassersteinMatcher,
 def main(args):
-    np.seterr(divide='ignore')  # POT has issues with divide by zero errors
+    np.seterr(divide="ignore")  # POT has issues with divide by zero errors
    source_lang = args.source_lang
    target_lang = args.target_lang
@@ -29,32 +29,24 @@ def main(args):
    mode = args.mode
    runfor = list()
-    if mode == 'all':
+    if mode == "all":
-        runfor.extend(['wmd', 'snk'])
+        runfor.extend(["wmd", "snk"])
    else:
        runfor.append(mode)
    defs_source = [
-        line.rstrip('\n')
+        line.rstrip("\n") for line in open(source_defs_filename, encoding="utf8")
-        for line in open(source_defs_filename, encoding='utf8')
    ]
    defs_target = [
-        line.rstrip('\n')
+        line.rstrip("\n") for line in open(target_defs_filename, encoding="utf8")
-        for line in open(target_defs_filename, encoding='utf8')
    ]
    clean_src_corpus, clean_src_vectors, src_keys = clean_corpus_using_embeddings_vocabulary(
-        set(vectors_source.keys()),
+        set(vectors_source.keys()), defs_source, vectors_source, source_lang
-        defs_source,
-        vectors_source,
-        source_lang,
    )
    clean_target_corpus, clean_target_vectors, target_keys = clean_corpus_using_embeddings_vocabulary(
-        set(vectors_target.keys()),
+        set(vectors_target.keys()), defs_target, vectors_target, target_lang
-        defs_target,
-        vectors_target,
-        target_lang,
    )
    take = args.instances
@@ -70,14 +62,15 @@ def main(args):
    if not batch:
        print(
-            f'{source_lang} - {target_lang} : document sizes: {len(clean_src_corpus)}, {len(clean_target_corpus)}'
+            f"{source_lang} - {target_lang} : document sizes: {len(clean_src_corpus)}, {len(clean_target_corpus)}"
        )
    del vectors_source, vectors_target, defs_source, defs_target
    vec = CountVectorizer().fit(clean_src_corpus + clean_target_corpus)
    common = [
-        word for word in vec.get_feature_names()
+        word
+        for word in vec.get_feature_names()
        if word in clean_src_vectors or word in clean_target_vectors
    ]
    W_common = []
@@ -88,9 +81,7 @@ def main(args):
            W_common.append(np.array(clean_target_vectors[w]))
    if not batch:
-        print(
+        print(f"{source_lang} - {target_lang}: the vocabulary size is {len(W_common)}")
-            f'{source_lang} - {target_lang}: the vocabulary size is {len(W_common)}'
-        )
    W_common = np.array(W_common)
    W_common = normalize(W_common)
@@ -101,24 +92,25 @@ def main(args):
    for metric in runfor:
        if not batch:
-            print(f'{metric}: {source_lang} - {target_lang}')
+            print(f"{metric}: {source_lang} - {target_lang}")
-        clf = WassersteinMatcher(W_embed=W_common,
+        clf = WassersteinMatcher(
-                                 n_neighbors=5,
+            W_embed=W_common, n_neighbors=5, n_jobs=14, sinkhorn=(metric == "snk")
-                                 n_jobs=14,
+        )
-                                 sinkhorn=(metric == 'snk'))
        clf.fit(X_train_idf[:instances], np.ones(instances))
-        p_at_one, percentage = clf.align(X_test_idf[:instances],
+        p_at_one, percentage = clf.align(X_test_idf[:instances], n_neighbors=instances)
-                                         n_neighbors=instances)
        if not batch:
-            print(f'P @ 1: {p_at_one}\ninstances: {instances}\n{percentage}%')
+            print(f"P @ 1: {p_at_one}\ninstances: {instances}\n{percentage}%")
        else:
            fields = [
-                f'{source_lang}', f'{target_lang}', f'{instances}',
+                f"{source_lang}",
-                f'{p_at_one}', f'{percentage}'
+                f"{target_lang}",
+                f"{instances}",
+                f"{p_at_one}",
+                f"{percentage}",
            ]
-            with open(f'{metric}_matching_results.csv', 'a') as f:
+            with open(f"{metric}_matching_results.csv", "a") as f:
                writer = csv.writer(f)
                writer.writerow(fields)
@@ -126,30 +118,33 @@ def main(args):
 if __name__ == "__main__":
    parser = argparse.ArgumentParser(
-        description='matching using wmd and wasserstein distance')
+        description="matching using wmd and wasserstein distance"
-    parser.add_argument('source_lang', help='source language short name')
+    )
-    parser.add_argument('target_lang', help='target language short name')
+    parser.add_argument("source_lang", help="source language short name")
-    parser.add_argument('source_vector', help='path of the source vector')
+    parser.add_argument("target_lang", help="target language short name")
-    parser.add_argument('target_vector', help='path of the target vector')
+    parser.add_argument("source_vector", help="path of the source vector")
-    parser.add_argument('source_defs', help='path of the source definitions')
+    parser.add_argument("target_vector", help="path of the target vector")
-    parser.add_argument('target_defs', help='path of the target definitions')
+    parser.add_argument("source_defs", help="path of the source definitions")
+    parser.add_argument("target_defs", help="path of the target definitions")
    parser.add_argument(
-        '-b',
+        "-b",
-        '--batch',
+        "--batch",
-        action='store_true',
+        action="store_true",
-        help=
+        help="running in batch (store results in csv) or running a single instance (output the results)",
-        'running in batch (store results in csv) or running a single instance (output the results)'
    )
-    parser.add_argument('mode',
-                        choices=['all', 'wmd', 'snk'],
-                        default='all',
-                        help='which methods to run')
    parser.add_argument(
-        '-n',
+        "mode",
-        '--instances',
+        choices=["all", "wmd", "snk"],
-        help='number of instances in each language to retrieve',
+        default="all",
+        help="which methods to run",
+    )
+    parser.add_argument(
+        "-n",
+        "--instances",
+        help="number of instances in each language to retrieve",
        default=1000,
-        type=int)
+        type=int,
+    )
    args = parser.parse_args()