From 49c6f58e51e12af691f7a1322137c64f46043b15 Mon Sep 17 00:00:00 2001
From: Yigit Sever
Date: Tue, 24 Sep 2019 21:26:34 +0300
Subject: Use black linter for WMD

---
 WMD_matching.py         |  97 ++++++++++++++++------------------
 WMD_retrieval.py        |  97 ++++++++++++++++------------------
 Wasserstein_Distance.py | 136 +++++++++++++++++++++++++-----------------------
 3 files changed, 163 insertions(+), 167 deletions(-)
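Reviewer note (git-am drops everything between the "---" marker above and the
first "diff --git" line, so this note is not part of the commit): the hunks
below are mechanical `black` output. Single quotes become double quotes, long
call sites are exploded to one argument per line with trailing commas, and
blank lines are normalized; there are no runtime behaviour changes. A minimal
sketch of reproducing the reformat, assuming a 2019-era black release with its
default 88-column line length:

    # Reformat the three touched modules in place; check=True raises
    # CalledProcessError if black exits non-zero (e.g. on a parse error).
    import subprocess

    subprocess.run(
        ["black", "WMD_matching.py", "WMD_retrieval.py", "Wasserstein_Distance.py"],
        check=True,
    )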
diff --git a/WMD_matching.py b/WMD_matching.py
index 2755d15..69ea10e 100644
--- a/WMD_matching.py
+++ b/WMD_matching.py
@@ -13,7 +13,7 @@ from Wasserstein_Distance import (WassersteinMatcher,
 
 
 def main(args):
-    np.seterr(divide='ignore')  # POT has issues with divide by zero errors
+    np.seterr(divide="ignore")  # POT has issues with divide by zero errors
 
     source_lang = args.source_lang
     target_lang = args.target_lang
@@ -29,32 +29,24 @@ def main(args):
     mode = args.mode
 
     runfor = list()
-    if mode == 'all':
-        runfor.extend(['wmd', 'snk'])
+    if mode == "all":
+        runfor.extend(["wmd", "snk"])
     else:
         runfor.append(mode)
 
     defs_source = [
-        line.rstrip('\n')
-        for line in open(source_defs_filename, encoding='utf8')
+        line.rstrip("\n") for line in open(source_defs_filename, encoding="utf8")
     ]
     defs_target = [
-        line.rstrip('\n')
-        for line in open(target_defs_filename, encoding='utf8')
+        line.rstrip("\n") for line in open(target_defs_filename, encoding="utf8")
     ]
 
     clean_src_corpus, clean_src_vectors, src_keys = clean_corpus_using_embeddings_vocabulary(
-        set(vectors_source.keys()),
-        defs_source,
-        vectors_source,
-        source_lang,
+        set(vectors_source.keys()), defs_source, vectors_source, source_lang
     )
 
     clean_target_corpus, clean_target_vectors, target_keys = clean_corpus_using_embeddings_vocabulary(
-        set(vectors_target.keys()),
-        defs_target,
-        vectors_target,
-        target_lang,
+        set(vectors_target.keys()), defs_target, vectors_target, target_lang
     )
 
     take = args.instances
@@ -70,14 +62,15 @@ def main(args):
 
     if not batch:
         print(
-            f'{source_lang} - {target_lang} : document sizes: {len(clean_src_corpus)}, {len(clean_target_corpus)}'
+            f"{source_lang} - {target_lang} : document sizes: {len(clean_src_corpus)}, {len(clean_target_corpus)}"
         )
 
     del vectors_source, vectors_target, defs_source, defs_target
 
     vec = CountVectorizer().fit(clean_src_corpus + clean_target_corpus)
     common = [
-        word for word in vec.get_feature_names()
+        word
+        for word in vec.get_feature_names()
         if word in clean_src_vectors or word in clean_target_vectors
     ]
     W_common = []
@@ -88,9 +81,7 @@ def main(args):
             W_common.append(np.array(clean_target_vectors[w]))
 
     if not batch:
-        print(
-            f'{source_lang} - {target_lang}: the vocabulary size is {len(W_common)}'
-        )
+        print(f"{source_lang} - {target_lang}: the vocabulary size is {len(W_common)}")
 
     W_common = np.array(W_common)
     W_common = normalize(W_common)
@@ -101,24 +92,25 @@ def main(args):
 
     for metric in runfor:
         if not batch:
-            print(f'{metric}: {source_lang} - {target_lang}')
+            print(f"{metric}: {source_lang} - {target_lang}")
 
-        clf = WassersteinMatcher(W_embed=W_common,
-                                 n_neighbors=5,
-                                 n_jobs=14,
-                                 sinkhorn=(metric == 'snk'))
+        clf = WassersteinMatcher(
+            W_embed=W_common, n_neighbors=5, n_jobs=14, sinkhorn=(metric == "snk")
+        )
         clf.fit(X_train_idf[:instances], np.ones(instances))
-        p_at_one, percentage = clf.align(X_test_idf[:instances],
-                                         n_neighbors=instances)
+        p_at_one, percentage = clf.align(X_test_idf[:instances], n_neighbors=instances)
 
         if not batch:
-            print(f'P @ 1: {p_at_one}\ninstances: {instances}\n{percentage}%')
+            print(f"P @ 1: {p_at_one}\ninstances: {instances}\n{percentage}%")
         else:
             fields = [
-                f'{source_lang}', f'{target_lang}', f'{instances}',
-                f'{p_at_one}', f'{percentage}'
+                f"{source_lang}",
+                f"{target_lang}",
+                f"{instances}",
+                f"{p_at_one}",
+                f"{percentage}",
             ]
-            with open(f'{metric}_matching_results.csv', 'a') as f:
+            with open(f"{metric}_matching_results.csv", "a") as f:
                 writer = csv.writer(f)
                 writer.writerow(fields)
@@ -126,30 +118,33 @@ def main(args):
 
 if __name__ == "__main__":
 
     parser = argparse.ArgumentParser(
-        description='matching using wmd and wasserstein distance')
-    parser.add_argument('source_lang', help='source language short name')
-    parser.add_argument('target_lang', help='target language short name')
-    parser.add_argument('source_vector', help='path of the source vector')
-    parser.add_argument('target_vector', help='path of the target vector')
-    parser.add_argument('source_defs', help='path of the source definitions')
-    parser.add_argument('target_defs', help='path of the target definitions')
+        description="matching using wmd and wasserstein distance"
+    )
+    parser.add_argument("source_lang", help="source language short name")
+    parser.add_argument("target_lang", help="target language short name")
+    parser.add_argument("source_vector", help="path of the source vector")
+    parser.add_argument("target_vector", help="path of the target vector")
+    parser.add_argument("source_defs", help="path of the source definitions")
+    parser.add_argument("target_defs", help="path of the target definitions")
     parser.add_argument(
-        '-b',
-        '--batch',
-        action='store_true',
-        help=
-        'running in batch (store results in csv) or running a single instance (output the results)'
+        "-b",
+        "--batch",
+        action="store_true",
+        help="running in batch (store results in csv) or running a single instance (output the results)",
     )
-    parser.add_argument('mode',
-                        choices=['all', 'wmd', 'snk'],
-                        default='all',
-                        help='which methods to run')
     parser.add_argument(
-        '-n',
-        '--instances',
-        help='number of instances in each language to retrieve',
+        "mode",
+        choices=["all", "wmd", "snk"],
+        default="all",
+        help="which methods to run",
+    )
+    parser.add_argument(
+        "-n",
+        "--instances",
+        help="number of instances in each language to retrieve",
         default=1000,
-        type=int)
+        type=int,
+    )
 
     args = parser.parse_args()
diff --git a/WMD_retrieval.py b/WMD_retrieval.py
index 02f35be..cb72079 100644
--- a/WMD_retrieval.py
+++ b/WMD_retrieval.py
@@ -13,7 +13,7 @@ from Wasserstein_Distance import (WassersteinRetriever,
 
 
 def main(args):
-    np.seterr(divide='ignore')  # POT has issues with divide by zero errors
+    np.seterr(divide="ignore")  # POT has issues with divide by zero errors
 
     source_lang = args.source_lang
     target_lang = args.target_lang
@@ -29,32 +29,24 @@ def main(args):
     mode = args.mode
 
     runfor = list()
-    if mode == 'all':
-        runfor.extend(['wmd', 'snk'])
+    if mode == "all":
+        runfor.extend(["wmd", "snk"])
     else:
         runfor.append(mode)
 
     defs_source = [
-        line.rstrip('\n')
-        for line in open(source_defs_filename, encoding='utf8')
+        line.rstrip("\n") for line in open(source_defs_filename, encoding="utf8")
     ]
     defs_target = [
-        line.rstrip('\n')
-        for line in open(target_defs_filename, encoding='utf8')
+        line.rstrip("\n") for line in open(target_defs_filename, encoding="utf8")
     ]
 
     clean_src_corpus, clean_src_vectors, src_keys = clean_corpus_using_embeddings_vocabulary(
-        set(vectors_source.keys()),
-        defs_source,
-        vectors_source,
-        source_lang,
+        set(vectors_source.keys()), defs_source, vectors_source, source_lang
     )
 
     clean_target_corpus, clean_target_vectors, target_keys = clean_corpus_using_embeddings_vocabulary(
-        set(vectors_target.keys()),
-        defs_target,
-        vectors_target,
-        target_lang,
+        set(vectors_target.keys()), defs_target, vectors_target, target_lang
     )
 
     take = args.instances
@@ -70,14 +62,15 @@ def main(args):
 
     if not batch:
         print(
-            f'{source_lang} - {target_lang} : document sizes: {len(clean_src_corpus)}, {len(clean_target_corpus)}'
+            f"{source_lang} - {target_lang} : document sizes: {len(clean_src_corpus)}, {len(clean_target_corpus)}"
        )
 
     del vectors_source, vectors_target, defs_source, defs_target
 
     vec = CountVectorizer().fit(clean_src_corpus + clean_target_corpus)
     common = [
-        word for word in vec.get_feature_names()
+        word
+        for word in vec.get_feature_names()
         if word in clean_src_vectors or word in clean_target_vectors
     ]
     W_common = []
@@ -88,9 +81,7 @@ def main(args):
             W_common.append(np.array(clean_target_vectors[w]))
 
     if not batch:
-        print(
-            f'{source_lang} - {target_lang}: the vocabulary size is {len(W_common)}'
-        )
+        print(f"{source_lang} - {target_lang}: the vocabulary size is {len(W_common)}")
 
     W_common = np.array(W_common)
     W_common = normalize(W_common)
@@ -101,55 +92,57 @@ def main(args):
 
     for metric in runfor:
         if not batch:
-            print(f'{metric}: {source_lang} - {target_lang}')
+            print(f"{metric}: {source_lang} - {target_lang}")
 
-        clf = WassersteinRetriever(W_embed=W_common,
-                                   n_neighbors=5,
-                                   n_jobs=14,
-                                   sinkhorn=(metric == 'snk'))
+        clf = WassersteinRetriever(
+            W_embed=W_common, n_neighbors=5, n_jobs=14, sinkhorn=(metric == "snk")
+        )
         clf.fit(X_train_idf[:instances], np.ones(instances))
-        p_at_one, percentage = clf.align(X_test_idf[:instances],
-                                         n_neighbors=instances)
+        p_at_one, percentage = clf.align(X_test_idf[:instances], n_neighbors=instances)
 
         if not batch:
-            print(f'P @ 1: {p_at_one}\ninstances: {instances}\n{percentage}%')
+            print(f"P @ 1: {p_at_one}\ninstances: {instances}\n{percentage}%")
         else:
             fields = [
-                f'{source_lang}', f'{target_lang}', f'{instances}',
-                f'{p_at_one}', f'{percentage}'
+                f"{source_lang}",
+                f"{target_lang}",
+                f"{instances}",
+                f"{p_at_one}",
+                f"{percentage}",
             ]
-            with open(f'{metric}_retrieval_result.csv', 'a') as f:
+            with open(f"{metric}_retrieval_result.csv", "a") as f:
                 writer = csv.writer(f)
                 writer.writerow(fields)
 
 
 if __name__ == "__main__":
 
-    parser = argparse.ArgumentParser(
-        description='run retrieval using wmd or snk')
-    parser.add_argument('source_lang', help='source language short name')
-    parser.add_argument('target_lang', help='target language short name')
-    parser.add_argument('source_vector', help='path of the source vector')
-    parser.add_argument('target_vector', help='path of the target vector')
-    parser.add_argument('source_defs', help='path of the source definitions')
-    parser.add_argument('target_defs', help='path of the target definitions')
+    parser = argparse.ArgumentParser(description="run retrieval using wmd or snk")
+    parser.add_argument("source_lang", help="source language short name")
+    parser.add_argument("target_lang", help="target language short name")
+    parser.add_argument("source_vector", help="path of the source vector")
+    parser.add_argument("target_vector", help="path of the target vector")
+    parser.add_argument("source_defs", help="path of the source definitions")
+    parser.add_argument("target_defs", help="path of the target definitions")
     parser.add_argument(
-        '-b',
-        '--batch',
-        action='store_true',
-        help=
-        'running in batch (store results in csv) or running a single instance (output the results)'
+        "-b",
+        "--batch",
+        action="store_true",
+        help="running in batch (store results in csv) or running a single instance (output the results)",
    )
-    parser.add_argument('mode',
-                        choices=['all', 'wmd', 'snk'],
-                        default='all',
-                        help='which methods to run')
     parser.add_argument(
-        '-n',
-        '--instances',
-        help='number of instances in each language to retrieve',
+        "mode",
+        choices=["all", "wmd", "snk"],
+        default="all",
+        help="which methods to run",
+    )
+    parser.add_argument(
+        "-n",
+        "--instances",
+        help="number of instances in each language to retrieve",
         default=1000,
-        type=int)
+        type=int,
+    )
 
     args = parser.parse_args()
diff --git a/Wasserstein_Distance.py b/Wasserstein_Distance.py
index 78bf9cf..60991b9 100644
--- a/Wasserstein_Distance.py
+++ b/Wasserstein_Distance.py
@@ -11,17 +11,20 @@ from sklearn.utils import check_array
 
 class WassersteinMatcher(KNeighborsClassifier):
     """
-    Implements a nearest neighbors classifier for input distributions using the Wasserstein distance as metric.
-    Source and target distributions are l_1 normalized before computing the Wasserstein distance.
-    Wasserstein is parametrized by the distances between the individual points of the distributions.
+    Source and target distributions are l_1 normalized before computing the Wasserstein
+    distance. Wasserstein is parametrized by the distances between the individual
+    points of the distributions.
     """
-    def __init__(self,
-                 W_embed,
-                 n_neighbors=1,
-                 n_jobs=1,
-                 verbose=False,
-                 sinkhorn=False,
-                 sinkhorn_reg=0.1):
+
+    def __init__(
+        self,
+        W_embed,
+        n_neighbors=1,
+        n_jobs=1,
+        verbose=False,
+        sinkhorn=False,
+        sinkhorn_reg=0.1,
+    ):
         """
         Initialization of the class.
         Arguments
@@ -33,10 +36,12 @@ class WassersteinMatcher(KNeighborsClassifier):
         self.sinkhorn_reg = sinkhorn_reg
         self.W_embed = W_embed
         self.verbose = verbose
-        super(WassersteinMatcher, self).__init__(n_neighbors=n_neighbors,
-                                                 n_jobs=n_jobs,
-                                                 metric='precomputed',
-                                                 algorithm='brute')
+        super(WassersteinMatcher, self).__init__(
+            n_neighbors=n_neighbors,
+            n_jobs=n_jobs,
+            metric="precomputed",
+            algorithm="brute",
+        )
 
     def _wmd(self, i, row, X_train):
         union_idx = np.union1d(X_train[i].indices, row.indices)
@@ -51,7 +56,7 @@ class WassersteinMatcher(KNeighborsClassifier):
                 W_dist,
                 self.sinkhorn_reg,
                 numItermax=50,
-                method='sinkhorn_stabilized',
+                method="sinkhorn_stabilized",
             )[0]
         else:
             return ot.emd2(bow_i, bow_j, W_dist)
@@ -66,27 +71,27 @@ class WassersteinMatcher(KNeighborsClassifier):
         if X_train is None:
             X_train = self._fit_X
-        pool = Pool(nodes=self.n_jobs
-                    )  # Parallelization of the calculation of the distances
+        pool = Pool(
+            nodes=self.n_jobs
+        )  # Parallelization of the calculation of the distances
         dist = pool.map(self._wmd_row, X_test)
         return np.array(dist)
 
     def fit(self, X, y):  # X_train_idf
-        X = check_array(X, accept_sparse='csr',
-                        copy=True)  # check if array is sparse
-        X = normalize(X, norm='l1', copy=False)
+        X = check_array(X, accept_sparse="csr", copy=True)  # check if array is sparse
+        X = normalize(X, norm="l1", copy=False)
         return super(WassersteinMatcher, self).fit(X, y)
 
     def predict(self, X):
-        X = check_array(X, accept_sparse='csr', copy=True)
-        X = normalize(X, norm='l1', copy=False)
+        X = check_array(X, accept_sparse="csr", copy=True)
+        X = normalize(X, norm="l1", copy=False)
         dist = self._pairwise_wmd(X)
         dist = dist * 1000  # for lapjv, small floating point numbers are evil
         return super(WassersteinMatcher, self).predict(dist)
 
     def kneighbors(self, X, n_neighbors=1):
-        X = check_array(X, accept_sparse='csr', copy=True)
-        X = normalize(X, norm='l1', copy=False)
+        X = check_array(X, accept_sparse="csr", copy=True)
+        X = normalize(X, norm="l1", copy=False)
         dist = self._pairwise_wmd(X)
         dist = dist * 1000  # for lapjv, small floating point numbers are evil
         return lapjv(dist)
@@ -102,19 +107,24 @@ class WassersteinMatcher(KNeighborsClassifier):
         percentage = p_at_one / n_neighbors * 100
         return p_at_one, percentage
 
+
 class WassersteinRetriever(KNeighborsClassifier):
     """
-    Implements a nearest neighbors classifier for input distributions using the Wasserstein distance as metric.
-    Source and target distributions are l_1 normalized before computing the Wasserstein distance.
-    Wasserstein is parametrized by the distances between the individual points of the distributions.
+    Implements a nearest neighbors classifier for input distributions using
+    the Wasserstein distance as metric. Source and target distributions
+    are l_1 normalized before computing the Wasserstein distance. Wasserstein is
+    parametrized by the distances between the individual points of the distributions.
     """
-    def __init__(self,
-                 W_embed,
-                 n_neighbors=1,
-                 n_jobs=1,
-                 verbose=False,
-                 sinkhorn=False,
-                 sinkhorn_reg=0.1):
+
+    def __init__(
+        self,
+        W_embed,
+        n_neighbors=1,
+        n_jobs=1,
+        verbose=False,
+        sinkhorn=False,
+        sinkhorn_reg=0.1,
+    ):
         """
         Initialization of the class.
         Arguments
@@ -126,10 +136,12 @@ class WassersteinRetriever(KNeighborsClassifier):
         self.sinkhorn_reg = sinkhorn_reg
         self.W_embed = W_embed
         self.verbose = verbose
-        super(WassersteinRetriever, self).__init__(n_neighbors=n_neighbors,
-                                                   n_jobs=n_jobs,
-                                                   metric='precomputed',
-                                                   algorithm='brute')
+        super(WassersteinRetriever, self).__init__(
+            n_neighbors=n_neighbors,
+            n_jobs=n_jobs,
+            metric="precomputed",
+            algorithm="brute",
+        )
 
     def _wmd(self, i, row, X_train):
         union_idx = np.union1d(X_train[i].indices, row.indices)
@@ -144,7 +156,7 @@ class WassersteinRetriever(KNeighborsClassifier):
                 W_dist,
                 self.sinkhorn_reg,
                 numItermax=50,
-                method='sinkhorn_stabilized',
+                method="sinkhorn_stabilized",
             )[0]
         else:
             return ot.emd2(bow_i, bow_j, W_dist)
@@ -164,19 +176,19 @@ class WassersteinRetriever(KNeighborsClassifier):
         return np.array(dist)
 
     def fit(self, X, y):
-        X = check_array(X, accept_sparse='csr', copy=True)
-        X = normalize(X, norm='l1', copy=False)
+        X = check_array(X, accept_sparse="csr", copy=True)
+        X = normalize(X, norm="l1", copy=False)
         return super(WassersteinRetriever, self).fit(X, y)
 
     def predict(self, X):
-        X = check_array(X, accept_sparse='csr', copy=True)
-        X = normalize(X, norm='l1', copy=False)
+        X = check_array(X, accept_sparse="csr", copy=True)
+        X = normalize(X, norm="l1", copy=False)
         dist = self._pairwise_wmd(X)
         return super(WassersteinRetriever, self).predict(dist)
 
     def kneighbors(self, X, n_neighbors=1):
-        X = check_array(X, accept_sparse='csr', copy=True)
-        X = normalize(X, norm='l1', copy=False)
+        X = check_array(X, accept_sparse="csr", copy=True)
+        X = normalize(X, norm="l1", copy=False)
         dist = self._pairwise_wmd(X)
         return super(WassersteinRetriever, self).kneighbors(dist, n_neighbors)
@@ -199,9 +211,9 @@ def load_embeddings(path, dimension=300):
         The first line may or may not include the word count and dimension
     """
     vectors = {}
-    with open(path, mode='r', encoding='utf8') as fp:
-        first_line = fp.readline().rstrip('\n')
-        if first_line.count(' ') == 1:
+    with open(path, mode="r", encoding="utf8") as fp:
+        first_line = fp.readline().rstrip("\n")
+        if first_line.count(" ") == 1:
             # includes the "word_count dimension" information
             (_, dimension) = map(int, first_line.split())
         else:
@@ -209,22 +221,19 @@ def load_embeddings(path, dimension=300):
             fp.seek(0)
         for line in fp:
             elems = line.split()
-            vectors[" ".join(elems[:-dimension])] = " ".join(
-                elems[-dimension:])
+            vectors[" ".join(elems[:-dimension])] = " ".join(elems[-dimension:])
     return vectors
 
 
 def clean_corpus_using_embeddings_vocabulary(
-        embeddings_dictionary,
-        corpus,
-        vectors,
-        language,
+    embeddings_dictionary, corpus, vectors, language
 ):
-    '''
+    """
     Cleans corpus using the dictionary of embeddings.
     Any word without an associated embedding in the dictionary is ignored.
-    Adds '__target-language' and '__source-language' at the end of the words according to their language.
-    '''
+    Adds '__target-language' and '__source-language' at the end
+    of the words according to their language.
+    """
     clean_corpus, clean_vectors, keys = [], {}, []
     words_we_want = set(embeddings_dictionary)
     tokenize = MosesTokenizer(language)
@@ -233,19 +242,18 @@ def clean_corpus_using_embeddings_vocabulary(
         words = tokenize(doc)
         for word in words:
             if word in words_we_want:
-                clean_doc.append(word + '__%s' % language)
-                clean_vectors[word + '__%s' % language] = np.array(
-                    vectors[word].split()).astype(np.float)
+                clean_doc.append(word + "__%s" % language)
+                clean_vectors[word + "__%s" % language] = np.array(
+                    vectors[word].split()
+                ).astype(np.float)
         if len(clean_doc) > 3 and len(clean_doc) < 25:
             keys.append(key)
-        clean_corpus.append(' '.join(clean_doc))
+        clean_corpus.append(" ".join(clean_doc))
     tokenize.close()
     return np.array(clean_corpus), clean_vectors, keys
 
 
-def mrr_precision_at_k(golden, preds, k_list=[
-        1,
-]):
+def mrr_precision_at_k(golden, preds, k_list=[1]):
     """
     Calculates Mean Reciprocal Error and Hits@1 == Precision@1
     """
-- 
cgit v1.2.3-70-g09d2
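
Appendix (placed after the signature delimiter, so it is likewise outside the
patch content): a usage sketch of the WassersteinRetriever API reformatted
above, with toy inputs. The two-line corpora, the random stand-in embeddings,
and the tf-idf construction are assumptions made for illustration; the real
scripts build X_train_idf and X_test_idf from the cleaned, language-tagged
definition corpora, and the sketch assumes the 2019-era scikit-learn, POT,
pathos, and lapjv versions the repository was developed against.

    import numpy as np
    from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
    from sklearn.preprocessing import normalize

    from Wasserstein_Distance import WassersteinRetriever

    # Two tiny "parallel" corpora: row i on the source side should retrieve
    # row i on the target side. Tokens carry the __language suffix that
    # clean_corpus_using_embeddings_vocabulary appends.
    src = ["cat__en feline__en", "dog__en canine__en"]
    tgt = ["kedi__tr cat__en", "kopek__tr dog__en"]

    # Shared bag-of-words space with one embedding row per vocabulary entry;
    # random vectors stand in for the real cross-lingual embeddings (W_common).
    vec = CountVectorizer().fit(src + tgt)
    W_common = normalize(np.random.rand(len(vec.get_feature_names()), 300))

    X_src = TfidfTransformer().fit_transform(vec.transform(src))
    X_tgt = TfidfTransformer().fit_transform(vec.transform(tgt))

    # sinkhorn=False computes exact EMD via ot.emd2; sinkhorn=True would use
    # the entropy-regularized sinkhorn_stabilized solver instead.
    clf = WassersteinRetriever(W_embed=W_common, n_neighbors=1, n_jobs=1)
    clf.fit(X_src, np.ones(X_src.shape[0]))
    p_at_one, percentage = clf.align(X_tgt, n_neighbors=X_tgt.shape[0])
    print(p_at_one, percentage)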