From 2936635892e17031c37facfd2115e8cfd6633222 Mon Sep 17 00:00:00 2001
From: Yigit Sever
Date: Sun, 22 Sep 2019 01:33:24 +0300
Subject: Introduce linter, stylize

---
 Wasserstein_Distance.py | 109 ++++++++++++++++++++++++++++++++----------------
 1 file changed, 72 insertions(+), 37 deletions(-)

(limited to 'Wasserstein_Distance.py')

diff --git a/Wasserstein_Distance.py b/Wasserstein_Distance.py
index 08439d2..161c13c 100644
--- a/Wasserstein_Distance.py
+++ b/Wasserstein_Distance.py
@@ -1,15 +1,14 @@
-import ot
-from sklearn.preprocessing import normalize
-from lapjv import lapjv
-from sklearn.neighbors import KNeighborsClassifier
+import numpy as np
 from sklearn.metrics import euclidean_distances
-from sklearn.externals.joblib import Parallel, delayed
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.preprocessing import normalize
 from sklearn.utils import check_array
-from sklearn.metrics.scorer import check_scoring
-from pathos.multiprocessing import ProcessingPool as Pool
-from sklearn.metrics import euclidean_distances
-import numpy as np
+
+import ot
+from lapjv import lapjv
 from mosestokenizer import MosesTokenizer
+from pathos.multiprocessing import ProcessingPool as Pool
+
 
 class Wasserstein_Matcher(KNeighborsClassifier):
     """
@@ -17,7 +16,13 @@ class Wasserstein_Matcher(KNeighborsClassifier):
     Source and target distributions are l_1 normalized before computing the Wasserstein distance.
     Wasserstein is parametrized by the distances between the individual points of the distributions.
     """
-    def __init__(self, W_embed, n_neighbors=1, n_jobs=1, verbose=False, sinkhorn= False, sinkhorn_reg=0.1):
+    def __init__(self,
+                 W_embed,
+                 n_neighbors=1,
+                 n_jobs=1,
+                 verbose=False,
+                 sinkhorn=False,
+                 sinkhorn_reg=0.1):
         """
         Initialization of the class.
         Arguments
@@ -29,7 +34,10 @@ class Wasserstein_Matcher(KNeighborsClassifier):
         self.sinkhorn_reg = sinkhorn_reg
         self.W_embed = W_embed
         self.verbose = verbose
-        super(Wasserstein_Matcher, self).__init__(n_neighbors=n_neighbors, n_jobs=n_jobs, metric='precomputed', algorithm='brute')
+        super(Wasserstein_Matcher, self).__init__(n_neighbors=n_neighbors,
+                                                  n_jobs=n_jobs,
+                                                  metric='precomputed',
+                                                  algorithm='brute')
 
     def _wmd(self, i, row, X_train):
         union_idx = np.union1d(X_train[i].indices, row.indices)
@@ -38,9 +46,16 @@ class Wasserstein_Matcher(KNeighborsClassifier):
         bow_i = X_train[i, union_idx].A.ravel()
         bow_j = row[:, union_idx].A.ravel()
         if self.sinkhorn:
-            return  ot.sinkhorn2(bow_i, bow_j, W_dist, self.sinkhorn_reg, numItermax=50, method='sinkhorn_stabilized',)[0]
+            return ot.sinkhorn2(
+                bow_i,
+                bow_j,
+                W_dist,
+                self.sinkhorn_reg,
+                numItermax=50,
+                method='sinkhorn_stabilized',
+            )[0]
         else:
-            return  ot.emd2(bow_i, bow_j, W_dist)
+            return ot.emd2(bow_i, bow_j, W_dist)
 
     def _wmd_row(self, row):
         X_train = self._fit_X
@@ -52,28 +67,31 @@ class Wasserstein_Matcher(KNeighborsClassifier):
 
         if X_train is None:
             X_train = self._fit_X
-        pool = Pool(nodes=self.n_jobs) # Parallelization of the calculation of the distances
-        dist  = pool.map(self._wmd_row, X_test)
+        # Parallelization of the calculation of the distances
+        pool = Pool(nodes=self.n_jobs)
+        dist = pool.map(self._wmd_row, X_test)
         return np.array(dist)
 
-    def fit(self, X, y): # X_train_idf
-        X = check_array(X, accept_sparse='csr', copy=True) # check if array is sparse
+    def fit(self, X, y):  # X_train_idf
+        # validate the input; CSR sparse matrices are accepted and copied
+        X = check_array(X, accept_sparse='csr', copy=True)
         X = normalize(X, norm='l1', copy=False)
-        return super(Wasserstein_Matcher, self).fit(X, y) # X_train_idf, np_ones(document collection size)
+        # X_train_idf, np_ones(document collection size)
+        return super(Wasserstein_Matcher, self).fit(X, y)
 
     def predict(self, X):
         X = check_array(X, accept_sparse='csr', copy=True)
         X = normalize(X, norm='l1', copy=False)
         dist = self._pairwise_wmd(X)
-        dist = dist * 1000 # for lapjv, small floating point numbers are evil
+        dist = dist * 1000  # for lapjv, small floating point numbers are evil
         return super(Wasserstein_Matcher, self).predict(dist)
 
-    def kneighbors(self, X, n_neighbors=1): # X : X_train_idf
+    def kneighbors(self, X, n_neighbors=1):  # X : X_train_idf
         X = check_array(X, accept_sparse='csr', copy=True)
         X = normalize(X, norm='l1', copy=False)
         dist = self._pairwise_wmd(X)
-        dist = dist * 1000 # for lapjv, small floating point numbers are evil
-        return lapjv(dist) # and here is the matching part
+        dist = dist * 1000  # for lapjv, small floating point numbers are evil
+        return lapjv(dist)  # and here is the matching part
 
 
 class Wasserstein_Retriever(KNeighborsClassifier):
@@ -82,7 +100,13 @@ class Wasserstein_Retriever(KNeighborsClassifier):
     Source and target distributions are l_1 normalized before computing the Wasserstein distance.
     Wasserstein is parametrized by the distances between the individual points of the distributions.
     """
-    def __init__(self, W_embed, n_neighbors=1, n_jobs=1, verbose=False, sinkhorn= False, sinkhorn_reg=0.1):
+    def __init__(self,
+                 W_embed,
+                 n_neighbors=1,
+                 n_jobs=1,
+                 verbose=False,
+                 sinkhorn=False,
+                 sinkhorn_reg=0.1):
         """
         Initialization of the class.
         Arguments
@@ -94,7 +118,10 @@ class Wasserstein_Retriever(KNeighborsClassifier):
         self.sinkhorn_reg = sinkhorn_reg
         self.W_embed = W_embed
         self.verbose = verbose
-        super(Wasserstein_Retriever, self).__init__(n_neighbors=n_neighbors, n_jobs=n_jobs, metric='precomputed', algorithm='brute')
+        super(Wasserstein_Retriever, self).__init__(n_neighbors=n_neighbors,
+                                                    n_jobs=n_jobs,
+                                                    metric='precomputed',
+                                                    algorithm='brute')
 
     def _wmd(self, i, row, X_train):
         union_idx = np.union1d(X_train[i].indices, row.indices)
@@ -103,9 +130,16 @@ class Wasserstein_Retriever(KNeighborsClassifier):
         bow_i = X_train[i, union_idx].A.ravel()
         bow_j = row[:, union_idx].A.ravel()
         if self.sinkhorn:
-            return  ot.sinkhorn2(bow_i, bow_j, W_dist, self.sinkhorn_reg, numItermax=50, method='sinkhorn_stabilized',)[0]
+            return ot.sinkhorn2(
+                bow_i,
+                bow_j,
+                W_dist,
+                self.sinkhorn_reg,
+                numItermax=50,
+                method='sinkhorn_stabilized',
+            )[0]
         else:
-            return  ot.emd2(bow_i, bow_j, W_dist)
+            return ot.emd2(bow_i, bow_j, W_dist)
 
     def _wmd_row(self, row):
         X_train = self._fit_X
@@ -117,8 +151,8 @@ class Wasserstein_Retriever(KNeighborsClassifier):
 
         if X_train is None:
             X_train = self._fit_X
-        pool = Pool(nodes=self.n_jobs) # Parallelization of the calculation of the distances
-        dist  = pool.map(self._wmd_row, X_test)
+        pool = Pool(nodes=self.n_jobs)
+        dist = pool.map(self._wmd_row, X_test)
         return np.array(dist)
 
     def fit(self, X, y):
@@ -144,8 +178,8 @@ class Wasserstein_Retriever(KNeighborsClassifier):
         precision at one and percentage values
 
         """
-        dist, preds = self.kneighbors(X, n_neighbors)
-        mrr, p_at_one = mrr_precision_at_k(list(range(len(preds))), preds)
+        _, preds = self.kneighbors(X, n_neighbors)
+        _, p_at_one = mrr_precision_at_k(list(range(len(preds))), preds)
         percentage = p_at_one * 100
         return (p_at_one, percentage)
 
@@ -168,7 +202,8 @@ def load_embeddings(path, dimension=300):
             fp.seek(0)
         for line in fp:
             elems = line.split()
-            vectors[" ".join(elems[:-dimension])] = " ".join(elems[-dimension:])
+            vectors[" ".join(elems[:-dimension])] = " ".join(
+                elems[-dimension:])
     return vectors
 
 
@@ -177,7 +212,7 @@ def clean_corpus_using_embeddings_vocabulary(
         corpus,
         vectors,
         language,
-        ):
+):
     '''
     Cleans corpus using the dictionary of embeddings.
     Any word without an associated embedding in the dictionary is ignored.
@@ -192,7 +227,8 @@ def clean_corpus_using_embeddings_vocabulary(
         for word in words:
             if word in words_we_want:
                 clean_doc.append(word + '__%s' % language)
-                clean_vectors[word + '__%s' % language] = np.array(vectors[word].split()).astype(np.float)
+                clean_vectors[word + '__%s' % language] = np.array(
+                    vectors[word].split()).astype(float)
         if len(clean_doc) > 3 and len(clean_doc) < 25:
             keys.append(key)
         clean_corpus.append(' '.join(clean_doc))
@@ -208,10 +244,9 @@ def mrr_precision_at_k(golden, preds, k_list=[1,]):
     precision_at = np.zeros(len(k_list))
     for key, elem in enumerate(golden):
         if elem in preds[key]:
-            location = np.where(preds[key]==elem)[0][0]
-            my_score += 1/(1+ location)
+            location = np.where(preds[key] == elem)[0][0]
+            my_score += 1 / (1 + location)
         for k_index, k_value in enumerate(k_list):
             if location < k_value:
                 precision_at[k_index] += 1
-    return my_score/len(golden), (precision_at/len(golden))[0]
-
+    return my_score / len(golden), (precision_at / len(golden))[0]
-- 
cgit v1.2.3-70-g09d2