1 files changed, 72 insertions, 37 deletions
diff --git a/Wasserstein_Distance.py b/Wasserstein_Distance.py
index 08439d2..161c13c 100644
--- a/Wasserstein_Distance.py
+++ b/Wasserstein_Distance.py
@@ -1,15 +1,14 @@
-import ot
+import numpy as np
-from sklearn.preprocessing import normalize
-from lapjv import lapjv
-from sklearn.neighbors import KNeighborsClassifier
 from sklearn.metrics import euclidean_distances
-from sklearn.externals.joblib import Parallel, delayed
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.preprocessing import normalize
 from sklearn.utils import check_array
-from sklearn.metrics.scorer import check_scoring
-from pathos.multiprocessing import ProcessingPool as Pool
+import ot
-from sklearn.metrics import euclidean_distances
+from lapjv import lapjv
-import numpy as np
 from mosestokenizer import MosesTokenizer
+from pathos.multiprocessing import ProcessingPool as Pool
 class Wasserstein_Matcher(KNeighborsClassifier):
    """
@@ -17,7 +16,13 @@ class Wasserstein_Matcher(KNeighborsClassifier):
    Source and target distributions are l_1 normalized before computing the Wasserstein distance.
    Wasserstein is parametrized by the distances between the individual points of the distributions.
    """
-    def __init__(self, W_embed, n_neighbors=1, n_jobs=1, verbose=False, sinkhorn= False, sinkhorn_reg=0.1):
+    def __init__(self,
+                 W_embed,
+                 n_neighbors=1,
+                 n_jobs=1,
+                 verbose=False,
+                 sinkhorn=False,
+                 sinkhorn_reg=0.1):
        """
        Initialization of the class.
        Arguments
@@ -29,7 +34,10 @@ class Wasserstein_Matcher(KNeighborsClassifier):
        self.sinkhorn_reg = sinkhorn_reg
        self.W_embed = W_embed
        self.verbose = verbose
-        super(Wasserstein_Matcher, self).__init__(n_neighbors=n_neighbors, n_jobs=n_jobs, metric='precomputed', algorithm='brute')
+        super(Wasserstein_Matcher, self).__init__(n_neighbors=n_neighbors,
+                                                  n_jobs=n_jobs,
+                                                  metric='precomputed',
+                                                  algorithm='brute')
    def _wmd(self, i, row, X_train):
        union_idx = np.union1d(X_train[i].indices, row.indices)
@@ -38,9 +46,16 @@ class Wasserstein_Matcher(KNeighborsClassifier):
        bow_i = X_train[i, union_idx].A.ravel()
        bow_j = row[:, union_idx].A.ravel()
        if self.sinkhorn:
-            return  ot.sinkhorn2(bow_i, bow_j, W_dist, self.sinkhorn_reg, numItermax=50, method='sinkhorn_stabilized',)[0]
+            return ot.sinkhorn2(
+                bow_i,
+                bow_j,
+                W_dist,
+                self.sinkhorn_reg,
+                numItermax=50,
+                method='sinkhorn_stabilized',
+            )[0]
        else:
-            return  ot.emd2(bow_i, bow_j, W_dist)
+            return ot.emd2(bow_i, bow_j, W_dist)
    def _wmd_row(self, row):
        X_train = self._fit_X
@@ -52,28 +67,31 @@ class Wasserstein_Matcher(KNeighborsClassifier):
        if X_train is None:
            X_train = self._fit_X
-        pool = Pool(nodes=self.n_jobs) # Parallelization of the calculation of the distances
+        pool = Pool(nodes=self.n_jobs
-        dist  = pool.map(self._wmd_row, X_test)
+                    )  # Parallelization of the calculation of the distances
+        dist = pool.map(self._wmd_row, X_test)
        return np.array(dist)
-    def fit(self, X, y): # X_train_idf
+    def fit(self, X, y):  # X_train_idf
-        X = check_array(X, accept_sparse='csr', copy=True) # check if array is sparse
+        X = check_array(X, accept_sparse='csr',
+                        copy=True)  # check if array is sparse
        X = normalize(X, norm='l1', copy=False)
-        return super(Wasserstein_Matcher, self).fit(X, y) # X_train_idf, np_ones(document collection size)
+        return super(Wasserstein_Matcher, self).fit(
+            X, y)  # X_train_idf, np_ones(document collection size)
    def predict(self, X):
        X = check_array(X, accept_sparse='csr', copy=True)
        X = normalize(X, norm='l1', copy=False)
        dist = self._pairwise_wmd(X)
-        dist = dist * 1000 # for lapjv, small floating point numbers are evil
+        dist = dist * 1000  # for lapjv, small floating point numbers are evil
        return super(Wasserstein_Matcher, self).predict(dist)
-    def kneighbors(self, X, n_neighbors=1): # X : X_train_idf
+    def kneighbors(self, X, n_neighbors=1):  # X : X_train_idf
        X = check_array(X, accept_sparse='csr', copy=True)
        X = normalize(X, norm='l1', copy=False)
        dist = self._pairwise_wmd(X)
-        dist = dist * 1000 # for lapjv, small floating point numbers are evil
+        dist = dist * 1000  # for lapjv, small floating point numbers are evil
-        return lapjv(dist) # and here is the matching part
+        return lapjv(dist)  # and here is the matching part
 class Wasserstein_Retriever(KNeighborsClassifier):
@@ -82,7 +100,13 @@ class Wasserstein_Retriever(KNeighborsClassifier):
    Source and target distributions are l_1 normalized before computing the Wasserstein distance.
    Wasserstein is parametrized by the distances between the individual points of the distributions.
    """
-    def __init__(self, W_embed, n_neighbors=1, n_jobs=1, verbose=False, sinkhorn= False, sinkhorn_reg=0.1):
+    def __init__(self,
+                 W_embed,
+                 n_neighbors=1,
+                 n_jobs=1,
+                 verbose=False,
+                 sinkhorn=False,
+                 sinkhorn_reg=0.1):
        """
        Initialization of the class.
        Arguments
@@ -94,7 +118,10 @@ class Wasserstein_Retriever(KNeighborsClassifier):
        self.sinkhorn_reg = sinkhorn_reg
        self.W_embed = W_embed
        self.verbose = verbose
-        super(Wasserstein_Retriever, self).__init__(n_neighbors=n_neighbors, n_jobs=n_jobs, metric='precomputed', algorithm='brute')
+        super(Wasserstein_Retriever, self).__init__(n_neighbors=n_neighbors,
+                                                    n_jobs=n_jobs,
+                                                    metric='precomputed',
+                                                    algorithm='brute')
    def _wmd(self, i, row, X_train):
        union_idx = np.union1d(X_train[i].indices, row.indices)
@@ -103,9 +130,16 @@ class Wasserstein_Retriever(KNeighborsClassifier):
        bow_i = X_train[i, union_idx].A.ravel()
        bow_j = row[:, union_idx].A.ravel()
        if self.sinkhorn:
-            return  ot.sinkhorn2(bow_i, bow_j, W_dist, self.sinkhorn_reg, numItermax=50, method='sinkhorn_stabilized',)[0]
+            return ot.sinkhorn2(
+                bow_i,
+                bow_j,
+                W_dist,
+                self.sinkhorn_reg,
+                numItermax=50,
+                method='sinkhorn_stabilized',
+            )[0]
        else:
-            return  ot.emd2(bow_i, bow_j, W_dist)
+            return ot.emd2(bow_i, bow_j, W_dist)
    def _wmd_row(self, row):
        X_train = self._fit_X
@@ -117,8 +151,8 @@ class Wasserstein_Retriever(KNeighborsClassifier):
        if X_train is None:
            X_train = self._fit_X
-        pool = Pool(nodes=self.n_jobs) # Parallelization of the calculation of the distances
+        pool = Pool(nodes=self.n_jobs)
-        dist  = pool.map(self._wmd_row, X_test)
+        dist = pool.map(self._wmd_row, X_test)
        return np.array(dist)
    def fit(self, X, y):
@@ -144,8 +178,8 @@ class Wasserstein_Retriever(KNeighborsClassifier):
        precision at one and percentage values
        """
-        dist, preds = self.kneighbors(X, n_neighbors)
+        _, preds = self.kneighbors(X, n_neighbors)
-        mrr, p_at_one = mrr_precision_at_k(list(range(len(preds))), preds)
+        _, p_at_one = mrr_precision_at_k(list(range(len(preds))), preds)
        percentage = p_at_one * 100
        return (p_at_one, percentage)
@@ -168,7 +202,8 @@ def load_embeddings(path, dimension=300):
            fp.seek(0)
        for line in fp:
            elems = line.split()
-            vectors[" ".join(elems[:-dimension])] = " ".join(elems[-dimension:])
+            vectors[" ".join(elems[:-dimension])] = " ".join(
+                elems[-dimension:])
    return vectors
@@ -177,7 +212,7 @@ def clean_corpus_using_embeddings_vocabulary(
        corpus,
        vectors,
        language,
-        ):
+):
    '''
    Cleans corpus using the dictionary of embeddings.
    Any word without an associated embedding in the dictionary is ignored.
@@ -192,7 +227,8 @@ def clean_corpus_using_embeddings_vocabulary(
        for word in words:
            if word in words_we_want:
                clean_doc.append(word + '__%s' % language)
-                clean_vectors[word + '__%s' % language] = np.array(vectors[word].split()).astype(np.float)
+                clean_vectors[word + '__%s' % language] = np.array(
+                    vectors[word].split()).astype(np.float)
        if len(clean_doc) > 3 and len(clean_doc) < 25:
            keys.append(key)
        clean_corpus.append(' '.join(clean_doc))
@@ -208,10 +244,9 @@ def mrr_precision_at_k(golden, preds, k_list=[1,]):
    precision_at = np.zeros(len(k_list))
    for key, elem in enumerate(golden):
        if elem in preds[key]:
-            location = np.where(preds[key]==elem)[0][0]
+            location = np.where(preds[key] == elem)[0][0]
-            my_score += 1/(1+ location)
+            my_score += 1 / (1 + location)
        for k_index, k_value in enumerate(k_list):
            if location < k_value:
                precision_at[k_index] += 1
-    return my_score/len(golden), (precision_at/len(golden))[0]
+    return my_score / len(golden), (precision_at / len(golden))[0]

diff --git a/Wasserstein_Distance.py b/Wasserstein_Distance.py index 08439d2..161c13c 100644 --- a/Wasserstein_Distance.py +++ b/Wasserstein_Distance.py
@@ -1,15 +1,14 @@
1	import ot	1	import numpy as np
2	from sklearn.preprocessing import normalize
3	from lapjv import lapjv
4	from sklearn.neighbors import KNeighborsClassifier
5	from sklearn.metrics import euclidean_distances	2	from sklearn.metrics import euclidean_distances
6	from sklearn.externals.joblib import Parallel, delayed	3	from sklearn.neighbors import KNeighborsClassifier
		4	from sklearn.preprocessing import normalize
7	from sklearn.utils import check_array	5	from sklearn.utils import check_array
8	from sklearn.metrics.scorer import check_scoring	6
9	from pathos.multiprocessing import ProcessingPool as Pool	7	import ot
10	from sklearn.metrics import euclidean_distances	8	from lapjv import lapjv
11	import numpy as np
12	from mosestokenizer import MosesTokenizer	9	from mosestokenizer import MosesTokenizer
		10	from pathos.multiprocessing import ProcessingPool as Pool
		11
13		12
14	class Wasserstein_Matcher(KNeighborsClassifier):	13	class Wasserstein_Matcher(KNeighborsClassifier):
15	"""	14	"""
@@ -17,7 +16,13 @@ class Wasserstein_Matcher(KNeighborsClassifier):
17	Source and target distributions are l_1 normalized before computing the Wasserstein distance.	16	Source and target distributions are l_1 normalized before computing the Wasserstein distance.
18	Wasserstein is parametrized by the distances between the individual points of the distributions.	17	Wasserstein is parametrized by the distances between the individual points of the distributions.
19	"""	18	"""
20	def __init__(self, W_embed, n_neighbors=1, n_jobs=1, verbose=False, sinkhorn= False, sinkhorn_reg=0.1):	19	def __init__(self,
		20	W_embed,
		21	n_neighbors=1,
		22	n_jobs=1,
		23	verbose=False,
		24	sinkhorn=False,
		25	sinkhorn_reg=0.1):
21	"""	26	"""
22	Initialization of the class.	27	Initialization of the class.
23	Arguments	28	Arguments
@@ -29,7 +34,10 @@ class Wasserstein_Matcher(KNeighborsClassifier):
29	self.sinkhorn_reg = sinkhorn_reg	34	self.sinkhorn_reg = sinkhorn_reg
30	self.W_embed = W_embed	35	self.W_embed = W_embed
31	self.verbose = verbose	36	self.verbose = verbose
32	super(Wasserstein_Matcher, self).__init__(n_neighbors=n_neighbors, n_jobs=n_jobs, metric='precomputed', algorithm='brute')	37	super(Wasserstein_Matcher, self).__init__(n_neighbors=n_neighbors,
		38	n_jobs=n_jobs,
		39	metric='precomputed',
		40	algorithm='brute')
33		41
34	def _wmd(self, i, row, X_train):	42	def _wmd(self, i, row, X_train):
35	union_idx = np.union1d(X_train[i].indices, row.indices)	43	union_idx = np.union1d(X_train[i].indices, row.indices)
@@ -38,9 +46,16 @@ class Wasserstein_Matcher(KNeighborsClassifier):
38	bow_i = X_train[i, union_idx].A.ravel()	46	bow_i = X_train[i, union_idx].A.ravel()
39	bow_j = row[:, union_idx].A.ravel()	47	bow_j = row[:, union_idx].A.ravel()
40	if self.sinkhorn:	48	if self.sinkhorn:
41	return ot.sinkhorn2(bow_i, bow_j, W_dist, self.sinkhorn_reg, numItermax=50, method='sinkhorn_stabilized',)[0]	49	return ot.sinkhorn2(
		50	bow_i,
		51	bow_j,
		52	W_dist,
		53	self.sinkhorn_reg,
		54	numItermax=50,
		55	method='sinkhorn_stabilized',
		56	)[0]
42	else:	57	else:
43	return ot.emd2(bow_i, bow_j, W_dist)	58	return ot.emd2(bow_i, bow_j, W_dist)
44		59
45	def _wmd_row(self, row):	60	def _wmd_row(self, row):
46	X_train = self._fit_X	61	X_train = self._fit_X
@@ -52,28 +67,31 @@ class Wasserstein_Matcher(KNeighborsClassifier):
52		67
53	if X_train is None:	68	if X_train is None:
54	X_train = self._fit_X	69	X_train = self._fit_X
55	pool = Pool(nodes=self.n_jobs) # Parallelization of the calculation of the distances	70	pool = Pool(nodes=self.n_jobs
56	dist = pool.map(self._wmd_row, X_test)	71	) # Parallelization of the calculation of the distances
		72	dist = pool.map(self._wmd_row, X_test)
57	return np.array(dist)	73	return np.array(dist)
58		74
59	def fit(self, X, y): # X_train_idf	75	def fit(self, X, y): # X_train_idf
60	X = check_array(X, accept_sparse='csr', copy=True) # check if array is sparse	76	X = check_array(X, accept_sparse='csr',
		77	copy=True) # check if array is sparse
61	X = normalize(X, norm='l1', copy=False)	78	X = normalize(X, norm='l1', copy=False)
62	return super(Wasserstein_Matcher, self).fit(X, y) # X_train_idf, np_ones(document collection size)	79	return super(Wasserstein_Matcher, self).fit(
		80	X, y) # X_train_idf, np_ones(document collection size)
63		81
64	def predict(self, X):	82	def predict(self, X):
65	X = check_array(X, accept_sparse='csr', copy=True)	83	X = check_array(X, accept_sparse='csr', copy=True)
66	X = normalize(X, norm='l1', copy=False)	84	X = normalize(X, norm='l1', copy=False)
67	dist = self._pairwise_wmd(X)	85	dist = self._pairwise_wmd(X)
68	dist = dist * 1000 # for lapjv, small floating point numbers are evil	86	dist = dist * 1000 # for lapjv, small floating point numbers are evil
69	return super(Wasserstein_Matcher, self).predict(dist)	87	return super(Wasserstein_Matcher, self).predict(dist)
70		88
71	def kneighbors(self, X, n_neighbors=1): # X : X_train_idf	89	def kneighbors(self, X, n_neighbors=1): # X : X_train_idf
72	X = check_array(X, accept_sparse='csr', copy=True)	90	X = check_array(X, accept_sparse='csr', copy=True)
73	X = normalize(X, norm='l1', copy=False)	91	X = normalize(X, norm='l1', copy=False)
74	dist = self._pairwise_wmd(X)	92	dist = self._pairwise_wmd(X)
75	dist = dist * 1000 # for lapjv, small floating point numbers are evil	93	dist = dist * 1000 # for lapjv, small floating point numbers are evil
76	return lapjv(dist) # and here is the matching part	94	return lapjv(dist) # and here is the matching part
77		95
78		96
79	class Wasserstein_Retriever(KNeighborsClassifier):	97	class Wasserstein_Retriever(KNeighborsClassifier):
@@ -82,7 +100,13 @@ class Wasserstein_Retriever(KNeighborsClassifier):
82	Source and target distributions are l_1 normalized before computing the Wasserstein distance.	100	Source and target distributions are l_1 normalized before computing the Wasserstein distance.
83	Wasserstein is parametrized by the distances between the individual points of the distributions.	101	Wasserstein is parametrized by the distances between the individual points of the distributions.
84	"""	102	"""
85	def __init__(self, W_embed, n_neighbors=1, n_jobs=1, verbose=False, sinkhorn= False, sinkhorn_reg=0.1):	103	def __init__(self,
		104	W_embed,
		105	n_neighbors=1,
		106	n_jobs=1,
		107	verbose=False,
		108	sinkhorn=False,
		109	sinkhorn_reg=0.1):
86	"""	110	"""
87	Initialization of the class.	111	Initialization of the class.
88	Arguments	112	Arguments
@@ -94,7 +118,10 @@ class Wasserstein_Retriever(KNeighborsClassifier):
94	self.sinkhorn_reg = sinkhorn_reg	118	self.sinkhorn_reg = sinkhorn_reg
95	self.W_embed = W_embed	119	self.W_embed = W_embed
96	self.verbose = verbose	120	self.verbose = verbose
97	super(Wasserstein_Retriever, self).__init__(n_neighbors=n_neighbors, n_jobs=n_jobs, metric='precomputed', algorithm='brute')	121	super(Wasserstein_Retriever, self).__init__(n_neighbors=n_neighbors,
		122	n_jobs=n_jobs,
		123	metric='precomputed',
		124	algorithm='brute')
98		125
99	def _wmd(self, i, row, X_train):	126	def _wmd(self, i, row, X_train):
100	union_idx = np.union1d(X_train[i].indices, row.indices)	127	union_idx = np.union1d(X_train[i].indices, row.indices)
@@ -103,9 +130,16 @@ class Wasserstein_Retriever(KNeighborsClassifier):
103	bow_i = X_train[i, union_idx].A.ravel()	130	bow_i = X_train[i, union_idx].A.ravel()
104	bow_j = row[:, union_idx].A.ravel()	131	bow_j = row[:, union_idx].A.ravel()
105	if self.sinkhorn:	132	if self.sinkhorn:
106	return ot.sinkhorn2(bow_i, bow_j, W_dist, self.sinkhorn_reg, numItermax=50, method='sinkhorn_stabilized',)[0]	133	return ot.sinkhorn2(
		134	bow_i,
		135	bow_j,
		136	W_dist,
		137	self.sinkhorn_reg,
		138	numItermax=50,
		139	method='sinkhorn_stabilized',
		140	)[0]
107	else:	141	else:
108	return ot.emd2(bow_i, bow_j, W_dist)	142	return ot.emd2(bow_i, bow_j, W_dist)
109		143
110	def _wmd_row(self, row):	144	def _wmd_row(self, row):
111	X_train = self._fit_X	145	X_train = self._fit_X
@@ -117,8 +151,8 @@ class Wasserstein_Retriever(KNeighborsClassifier):
117		151
118	if X_train is None:	152	if X_train is None:
119	X_train = self._fit_X	153	X_train = self._fit_X
120	pool = Pool(nodes=self.n_jobs) # Parallelization of the calculation of the distances	154	pool = Pool(nodes=self.n_jobs)
121	dist = pool.map(self._wmd_row, X_test)	155	dist = pool.map(self._wmd_row, X_test)
122	return np.array(dist)	156	return np.array(dist)
123		157
124	def fit(self, X, y):	158	def fit(self, X, y):
@@ -144,8 +178,8 @@ class Wasserstein_Retriever(KNeighborsClassifier):
144	precision at one and percentage values	178	precision at one and percentage values
145		179
146	"""	180	"""
147	dist, preds = self.kneighbors(X, n_neighbors)	181	_, preds = self.kneighbors(X, n_neighbors)
148	mrr, p_at_one = mrr_precision_at_k(list(range(len(preds))), preds)	182	_, p_at_one = mrr_precision_at_k(list(range(len(preds))), preds)
149	percentage = p_at_one * 100	183	percentage = p_at_one * 100
150	return (p_at_one, percentage)	184	return (p_at_one, percentage)
151		185
@@ -168,7 +202,8 @@ def load_embeddings(path, dimension=300):
168	fp.seek(0)	202	fp.seek(0)
169	for line in fp:	203	for line in fp:
170	elems = line.split()	204	elems = line.split()
171	vectors[" ".join(elems[:-dimension])] = " ".join(elems[-dimension:])	205	vectors[" ".join(elems[:-dimension])] = " ".join(
		206	elems[-dimension:])
172	return vectors	207	return vectors
173		208
174		209
@@ -177,7 +212,7 @@ def clean_corpus_using_embeddings_vocabulary(
177	corpus,	212	corpus,
178	vectors,	213	vectors,
179	language,	214	language,
180	):	215	):
181	'''	216	'''
182	Cleans corpus using the dictionary of embeddings.	217	Cleans corpus using the dictionary of embeddings.
183	Any word without an associated embedding in the dictionary is ignored.	218	Any word without an associated embedding in the dictionary is ignored.
@@ -192,7 +227,8 @@ def clean_corpus_using_embeddings_vocabulary(
192	for word in words:	227	for word in words:
193	if word in words_we_want:	228	if word in words_we_want:
194	clean_doc.append(word + '__%s' % language)	229	clean_doc.append(word + '__%s' % language)
195	clean_vectors[word + '__%s' % language] = np.array(vectors[word].split()).astype(np.float)	230	clean_vectors[word + '__%s' % language] = np.array(
		231	vectors[word].split()).astype(np.float)
196	if len(clean_doc) > 3 and len(clean_doc) < 25:	232	if len(clean_doc) > 3 and len(clean_doc) < 25:
197	keys.append(key)	233	keys.append(key)
198	clean_corpus.append(' '.join(clean_doc))	234	clean_corpus.append(' '.join(clean_doc))
@@ -208,10 +244,9 @@ def mrr_precision_at_k(golden, preds, k_list=[1,]):
208	precision_at = np.zeros(len(k_list))	244	precision_at = np.zeros(len(k_list))
209	for key, elem in enumerate(golden):	245	for key, elem in enumerate(golden):
210	if elem in preds[key]:	246	if elem in preds[key]:
211	location = np.where(preds[key]==elem)[0][0]	247	location = np.where(preds[key] == elem)[0][0]
212	my_score += 1/(1+ location)	248	my_score += 1 / (1 + location)
213	for k_index, k_value in enumerate(k_list):	249	for k_index, k_value in enumerate(k_list):
214	if location < k_value:	250	if location < k_value:
215	precision_at[k_index] += 1	251	precision_at[k_index] += 1
216	return my_score/len(golden), (precision_at/len(golden))[0]	252	return my_score / len(golden), (precision_at / len(golden))[0]
217