From 40d725329a1fb3699dd3b2c870fc8213db9d75eb Mon Sep 17 00:00:00 2001
From: Yigit Sever
Date: Sun, 22 Sep 2019 02:02:35 +0300
Subject: Unified WMD/SNK matching & retrieval

---
 WMD_matching.py         | 17 ++++++---------
 WMD_retrieval.py        | 15 ++++++-------
 Wasserstein_Distance.py | 57 ++++++++++++++++++++++++++++---------------------
 3 files changed, 46 insertions(+), 43 deletions(-)

diff --git a/WMD_matching.py b/WMD_matching.py
index ea496b8..2755d15 100644
--- a/WMD_matching.py
+++ b/WMD_matching.py
@@ -6,7 +6,7 @@ import numpy as np
 from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
 from sklearn.preprocessing import normalize
 
-from Wasserstein_Distance import (Wasserstein_Matcher,
+from Wasserstein_Distance import (WassersteinMatcher,
                                   clean_corpus_using_embeddings_vocabulary,
                                   load_embeddings)
 
@@ -103,16 +103,13 @@ def main(args):
         if not batch:
             print(f'{metric}: {source_lang} - {target_lang}')
 
-        clf = Wasserstein_Matcher(W_embed=W_common,
-                                  n_neighbors=5,
-                                  n_jobs=14,
-                                  sinkhorn=(metric == 'snk'))
+        clf = WassersteinMatcher(W_embed=W_common,
+                                 n_neighbors=5,
+                                 n_jobs=14,
+                                 sinkhorn=(metric == 'snk'))
         clf.fit(X_train_idf[:instances], np.ones(instances))
-        row_ind, col_ind, _ = clf.kneighbors(X_test_idf[:instances],
-                                             n_neighbors=instances)
-        result = zip(row_ind, col_ind)
-        p_at_one = len([x for x, y in result if x == y])
-        percentage = p_at_one / instances * 100
+        p_at_one, percentage = clf.align(X_test_idf[:instances],
+                                         n_neighbors=instances)
 
         if not batch:
             print(f'P @ 1: {p_at_one}\ninstances: {instances}\n{percentage}%')
diff --git a/WMD_retrieval.py b/WMD_retrieval.py
index 3328023..02f35be 100644
--- a/WMD_retrieval.py
+++ b/WMD_retrieval.py
@@ -6,7 +6,7 @@ import numpy as np
 from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
 from sklearn.preprocessing import normalize
 
-from Wasserstein_Distance import (Wasserstein_Retriever,
+from Wasserstein_Distance import (WassersteinRetriever,
                                   clean_corpus_using_embeddings_vocabulary,
                                   load_embeddings)
 
@@ -101,16 +101,13 @@ def main(args):
     for metric in runfor:
         if not batch:
-            print(f'{metric} - tfidf: {source_lang} - {target_lang}')
+            print(f'{metric}: {source_lang} - {target_lang}')
 
-        clf = Wasserstein_Retriever(W_embed=W_common,
-                                    n_neighbors=5,
-                                    n_jobs=14,
-                                    sinkhorn=(metric == 'snk'))
+        clf = WassersteinRetriever(W_embed=W_common,
+                                   n_neighbors=5,
+                                   n_jobs=14,
+                                   sinkhorn=(metric == 'snk'))
         clf.fit(X_train_idf[:instances], np.ones(instances))
-        # dist, preds = clf.kneighbors(X_test_idf[:instances], n_neighbors=instances)
-        # mrr, p_at_one = mrr_precision_at_k(list(range(len(preds))), preds)
-        # percentage = p_at_one * 100
         p_at_one, percentage = clf.align(X_test_idf[:instances],
                                          n_neighbors=instances)
 
         if not batch:
diff --git a/Wasserstein_Distance.py b/Wasserstein_Distance.py
index 161c13c..78bf9cf 100644
--- a/Wasserstein_Distance.py
+++ b/Wasserstein_Distance.py
@@ -1,16 +1,15 @@
 import numpy as np
-from sklearn.metrics import euclidean_distances
-from sklearn.neighbors import KNeighborsClassifier
-from sklearn.preprocessing import normalize
-from sklearn.utils import check_array
-
 import ot
 from lapjv import lapjv
 from mosestokenizer import MosesTokenizer
 from pathos.multiprocessing import ProcessingPool as Pool
+from sklearn.metrics import euclidean_distances
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.preprocessing import normalize
+from sklearn.utils import check_array
 
-class Wasserstein_Matcher(KNeighborsClassifier):
+class WassersteinMatcher(KNeighborsClassifier):
     """
     Implements a nearest neighbors classifier for input distributions using
     the Wasserstein distance as metric. Source and target distributions
     are l_1 normalized before computing the Wasserstein distance.
@@ -34,10 +33,10 @@ class Wasserstein_Matcher(KNeighborsClassifier):
         self.sinkhorn_reg = sinkhorn_reg
         self.W_embed = W_embed
         self.verbose = verbose
-        super(Wasserstein_Matcher, self).__init__(n_neighbors=n_neighbors,
-                                                  n_jobs=n_jobs,
-                                                  metric='precomputed',
-                                                  algorithm='brute')
+        super(WassersteinMatcher, self).__init__(n_neighbors=n_neighbors,
+                                                 n_jobs=n_jobs,
+                                                 metric='precomputed',
+                                                 algorithm='brute')
 
     def _wmd(self, i, row, X_train):
         union_idx = np.union1d(X_train[i].indices, row.indices)
@@ -76,25 +75,34 @@ class Wasserstein_Matcher(KNeighborsClassifier):
         X = check_array(X, accept_sparse='csr', copy=True)  # check if array is sparse
         X = normalize(X, norm='l1', copy=False)
-        return super(Wasserstein_Matcher, self).fit(
-            X, y)  # X_train_idf, np_ones(document collection size)
+        return super(WassersteinMatcher, self).fit(X, y)
 
     def predict(self, X):
         X = check_array(X, accept_sparse='csr', copy=True)
         X = normalize(X, norm='l1', copy=False)
         dist = self._pairwise_wmd(X)
         dist = dist * 1000  # for lapjv, small floating point numbers are evil
-        return super(Wasserstein_Matcher, self).predict(dist)
+        return super(WassersteinMatcher, self).predict(dist)
 
-    def kneighbors(self, X, n_neighbors=1):  # X : X_train_idf
+    def kneighbors(self, X, n_neighbors=1):
         X = check_array(X, accept_sparse='csr', copy=True)
         X = normalize(X, norm='l1', copy=False)
         dist = self._pairwise_wmd(X)
         dist = dist * 1000  # for lapjv, small floating point numbers are evil
-        return lapjv(dist)  # and here is the matching part
+        return lapjv(dist)
 
+    def align(self, X, n_neighbors=1):
+        """ Wrapper function over kneighbors to return
+            precision at one and percentage values
 
-class Wasserstein_Retriever(KNeighborsClassifier):
-    """
+        """
+        row_ind, col_ind, _ = self.kneighbors(X, n_neighbors)
+        result = zip(row_ind, col_ind)
+        p_at_one = len([x for x, y in result if x == y])
+        percentage = p_at_one / n_neighbors * 100
+        return p_at_one, percentage
+
+
+class WassersteinRetriever(KNeighborsClassifier):
+    """
     Implements a nearest neighbors classifier for input distributions using
     the Wasserstein distance as metric. Source and target distributions
     are l_1 normalized before computing the Wasserstein distance.
@@ -118,7 +126,7 @@ class Wasserstein_Retriever(KNeighborsClassifier):
         self.sinkhorn_reg = sinkhorn_reg
         self.W_embed = W_embed
         self.verbose = verbose
-        super(Wasserstein_Retriever, self).__init__(n_neighbors=n_neighbors,
+        super(WassersteinRetriever, self).__init__(n_neighbors=n_neighbors,
                                                     n_jobs=n_jobs,
                                                     metric='precomputed',
                                                     algorithm='brute')
@@ -158,23 +166,22 @@ def fit(self, X, y):
         X = check_array(X, accept_sparse='csr', copy=True)
         X = normalize(X, norm='l1', copy=False)
-        return super(Wasserstein_Retriever, self).fit(X, y)
+        return super(WassersteinRetriever, self).fit(X, y)
 
     def predict(self, X):
         X = check_array(X, accept_sparse='csr', copy=True)
         X = normalize(X, norm='l1', copy=False)
         dist = self._pairwise_wmd(X)
-        return super(Wasserstein_Retriever, self).predict(dist)
+        return super(WassersteinRetriever, self).predict(dist)
 
     def kneighbors(self, X, n_neighbors=1):
         X = check_array(X, accept_sparse='csr', copy=True)
         X = normalize(X, norm='l1', copy=False)
         dist = self._pairwise_wmd(X)
-        return super(Wasserstein_Retriever, self).kneighbors(dist, n_neighbors)
+        return super(WassersteinRetriever, self).kneighbors(dist, n_neighbors)
 
     def align(self, X, n_neighbors=1):
-        """
-        Wrapper function over kneighbors to return
+        """ Wrapper function over kneighbors to return
             precision at one and percentage values
 
         """
         row_ind, col_ind, _ = self.kneighbors(X, n_neighbors)
@@ -196,7 +203,7 @@ def load_embeddings(path, dimension=300):
         first_line = fp.readline().rstrip('\n')
         if first_line.count(' ') == 1:
             # includes the "word_count dimension" information
-            (word_count, dimension) = map(int, first_line.split())
+            (_, dimension) = map(int, first_line.split())
         else:
             # assume the file only contains vectors
             fp.seek(0)
@@ -236,7 +243,9 @@ def clean_corpus_using_embeddings_vocabulary(
     return np.array(clean_corpus), clean_vectors, keys
 
 
-def mrr_precision_at_k(golden, preds, k_list=[1,]):
+def mrr_precision_at_k(golden, preds, k_list=[
+        1,
+]):
     """ Calculates Mean Reciprocal Error and Hits@1 == Precision@1
 
     """
-- 
cgit v1.2.3-70-g09d2
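
Below is a minimal usage sketch of the unified interface this patch settles on. The sketch is not part of the commit; W_common, X_train_idf, X_test_idf and instances are assumed to be prepared exactly as in WMD_matching.py above.

    import numpy as np
    from Wasserstein_Distance import WassersteinMatcher

    # sinkhorn=False computes exact Word Mover's Distance; sinkhorn=True
    # switches to the entropy-regularized variant (the 'snk' metric above).
    clf = WassersteinMatcher(W_embed=W_common,
                             n_neighbors=5,
                             n_jobs=14,
                             sinkhorn=False)
    clf.fit(X_train_idf[:instances], np.ones(instances))

    # align() now hides the kneighbors()/lapjv bookkeeping that
    # WMD_matching.py previously did inline.
    p_at_one, percentage = clf.align(X_test_idf[:instances],
                                     n_neighbors=instances)
    print(f'P @ 1: {p_at_one}\ninstances: {instances}\n{percentage}%')

WassersteinRetriever is driven by the same calls, which is the point of the change: after this commit the two scripts differ only in the class they instantiate.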
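For the matcher, align() reduces bilingual matching to a linear assignment problem: kneighbors() hands the scaled pairwise WMD matrix to lapjv (a Jonker-Volgenant solver), and a pair counts toward precision at one when the assignment lands on the diagonal, i.e. on the gold, index-aligned document pair. A toy sketch of that bookkeeping, with a made-up 3x3 cost matrix standing in for the _pairwise_wmd() output:

    import numpy as np
    from lapjv import lapjv

    # Hypothetical costs: entry (i, j) plays the role of the WMD between
    # source document i and target document j. The diagonal is cheapest,
    # so the optimal assignment is the identity permutation.
    dist = np.array([[0.1, 0.9, 0.8],
                     [0.7, 0.2, 0.9],
                     [0.8, 0.9, 0.3]])
    dist = dist * 1000  # for lapjv, small floating point numbers are evil

    row_ind, col_ind, _ = lapjv(dist)
    p_at_one = len([x for x, y in zip(row_ind, col_ind) if x == y])  # 3
    percentage = p_at_one / 3 * 100  # 100.0; 3 plays align()'s n_neighbors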