about summary refs log tree commit diff stats
path: root/Wasserstein_Distance.py
diff options
context:
space:
mode:
author Yigit Sever 2019-09-22 01:33:24 +0300
committer Yigit Sever 2019-09-22 01:33:24 +0300
commit 2936635892e17031c37facfd2115e8cfd6633222 (patch)
tree e284cfb01c2c4a84a9f94cfd528bbc7a57b5d19f /Wasserstein_Distance.py
parent 3a924c24d167a4411b19d1038c59639f06f2ba6b (diff)
download Evaluating-Dictionary-Alignment-2936635892e17031c37facfd2115e8cfd6633222.tar.gz
Evaluating-Dictionary-Alignment-2936635892e17031c37facfd2115e8cfd6633222.tar.bz2
Evaluating-Dictionary-Alignment-2936635892e17031c37facfd2115e8cfd6633222.zip
Introduce linter, stylize
Diffstat (limited to 'Wasserstein_Distance.py')
-rw-r--r-- Wasserstein_Distance.py 109
1 files changed, 72 insertions, 37 deletions
diff --git a/Wasserstein_Distance.py b/Wasserstein_Distance.py
index 08439d2..161c13c 100644
--- a/Wasserstein_Distance.py
+++ b/Wasserstein_Distance.py
@@ -1,15 +1,14 @@
1import ot 1import numpy as np
2from sklearn.preprocessing import normalize
3from lapjv import lapjv
4from sklearn.neighbors import KNeighborsClassifier
5from sklearn.metrics import euclidean_distances 2from sklearn.metrics import euclidean_distances
6from sklearn.externals.joblib import Parallel, delayed 3from sklearn.neighbors import KNeighborsClassifier
4from sklearn.preprocessing import normalize
7from sklearn.utils import check_array 5from sklearn.utils import check_array
8from sklearn.metrics.scorer import check_scoring 6
9from pathos.multiprocessing import ProcessingPool as Pool 7import ot
10from sklearn.metrics import euclidean_distances 8from lapjv import lapjv
11import numpy as np
12from mosestokenizer import MosesTokenizer 9from mosestokenizer import MosesTokenizer
10from pathos.multiprocessing import ProcessingPool as Pool
11
13 12
14class Wasserstein_Matcher(KNeighborsClassifier): 13class Wasserstein_Matcher(KNeighborsClassifier):
15 """ 14 """
@@ -17,7 +16,13 @@ class Wasserstein_Matcher(KNeighborsClassifier):
17 Source and target distributions are l_1 normalized before computing the Wasserstein distance. 16 Source and target distributions are l_1 normalized before computing the Wasserstein distance.
18 Wasserstein is parametrized by the distances between the individual points of the distributions. 17 Wasserstein is parametrized by the distances between the individual points of the distributions.
19 """ 18 """
20 def __init__(self, W_embed, n_neighbors=1, n_jobs=1, verbose=False, sinkhorn= False, sinkhorn_reg=0.1): 19 def __init__(self,
20 W_embed,
21 n_neighbors=1,
22 n_jobs=1,
23 verbose=False,
24 sinkhorn=False,
25 sinkhorn_reg=0.1):
21 """ 26 """
22 Initialization of the class. 27 Initialization of the class.
23 Arguments 28 Arguments
@@ -29,7 +34,10 @@ class Wasserstein_Matcher(KNeighborsClassifier):
29 self.sinkhorn_reg = sinkhorn_reg 34 self.sinkhorn_reg = sinkhorn_reg
30 self.W_embed = W_embed 35 self.W_embed = W_embed
31 self.verbose = verbose 36 self.verbose = verbose
32 super(Wasserstein_Matcher, self).__init__(n_neighbors=n_neighbors, n_jobs=n_jobs, metric='precomputed', algorithm='brute') 37 super(Wasserstein_Matcher, self).__init__(n_neighbors=n_neighbors,
38 n_jobs=n_jobs,
39 metric='precomputed',
40 algorithm='brute')
33 41
34 def _wmd(self, i, row, X_train): 42 def _wmd(self, i, row, X_train):
35 union_idx = np.union1d(X_train[i].indices, row.indices) 43 union_idx = np.union1d(X_train[i].indices, row.indices)
@@ -38,9 +46,16 @@ class Wasserstein_Matcher(KNeighborsClassifier):
38 bow_i = X_train[i, union_idx].A.ravel() 46 bow_i = X_train[i, union_idx].A.ravel()
39 bow_j = row[:, union_idx].A.ravel() 47 bow_j = row[:, union_idx].A.ravel()
40 if self.sinkhorn: 48 if self.sinkhorn:
41 return ot.sinkhorn2(bow_i, bow_j, W_dist, self.sinkhorn_reg, numItermax=50, method='sinkhorn_stabilized',)[0] 49 return ot.sinkhorn2(
50 bow_i,
51 bow_j,
52 W_dist,
53 self.sinkhorn_reg,
54 numItermax=50,
55 method='sinkhorn_stabilized',
56 )[0]
42 else: 57 else:
43 return ot.emd2(bow_i, bow_j, W_dist) 58 return ot.emd2(bow_i, bow_j, W_dist)
44 59
45 def _wmd_row(self, row): 60 def _wmd_row(self, row):
46 X_train = self._fit_X 61 X_train = self._fit_X
@@ -52,28 +67,31 @@ class Wasserstein_Matcher(KNeighborsClassifier):
52 67
53 if X_train is None: 68 if X_train is None:
54 X_train = self._fit_X 69 X_train = self._fit_X
55 pool = Pool(nodes=self.n_jobs) # Parallelization of the calculation of the distances 70 pool = Pool(nodes=self.n_jobs
56 dist = pool.map(self._wmd_row, X_test) 71 ) # Parallelization of the calculation of the distances
72 dist = pool.map(self._wmd_row, X_test)
57 return np.array(dist) 73 return np.array(dist)
58 74
59 def fit(self, X, y): # X_train_idf 75 def fit(self, X, y): # X_train_idf
60 X = check_array(X, accept_sparse='csr', copy=True) # check if array is sparse 76 X = check_array(X, accept_sparse='csr',
77 copy=True) # check if array is sparse
61 X = normalize(X, norm='l1', copy=False) 78 X = normalize(X, norm='l1', copy=False)
62 return super(Wasserstein_Matcher, self).fit(X, y) # X_train_idf, np_ones(document collection size) 79 return super(Wasserstein_Matcher, self).fit(
80 X, y) # X_train_idf, np_ones(document collection size)
63 81
64 def predict(self, X): 82 def predict(self, X):
65 X = check_array(X, accept_sparse='csr', copy=True) 83 X = check_array(X, accept_sparse='csr', copy=True)
66 X = normalize(X, norm='l1', copy=False) 84 X = normalize(X, norm='l1', copy=False)
67 dist = self._pairwise_wmd(X) 85 dist = self._pairwise_wmd(X)
68 dist = dist * 1000 # for lapjv, small floating point numbers are evil 86 dist = dist * 1000 # for lapjv, small floating point numbers are evil
69 return super(Wasserstein_Matcher, self).predict(dist) 87 return super(Wasserstein_Matcher, self).predict(dist)
70 88
71 def kneighbors(self, X, n_neighbors=1): # X : X_train_idf 89 def kneighbors(self, X, n_neighbors=1): # X : X_train_idf
72 X = check_array(X, accept_sparse='csr', copy=True) 90 X = check_array(X, accept_sparse='csr', copy=True)
73 X = normalize(X, norm='l1', copy=False) 91 X = normalize(X, norm='l1', copy=False)
74 dist = self._pairwise_wmd(X) 92 dist = self._pairwise_wmd(X)
75 dist = dist * 1000 # for lapjv, small floating point numbers are evil 93 dist = dist * 1000 # for lapjv, small floating point numbers are evil
76 return lapjv(dist) # and here is the matching part 94 return lapjv(dist) # and here is the matching part
77 95
78 96
79class Wasserstein_Retriever(KNeighborsClassifier): 97class Wasserstein_Retriever(KNeighborsClassifier):
@@ -82,7 +100,13 @@ class Wasserstein_Retriever(KNeighborsClassifier):
82 Source and target distributions are l_1 normalized before computing the Wasserstein distance. 100 Source and target distributions are l_1 normalized before computing the Wasserstein distance.
83 Wasserstein is parametrized by the distances between the individual points of the distributions. 101 Wasserstein is parametrized by the distances between the individual points of the distributions.
84 """ 102 """
85 def __init__(self, W_embed, n_neighbors=1, n_jobs=1, verbose=False, sinkhorn= False, sinkhorn_reg=0.1): 103 def __init__(self,
104 W_embed,
105 n_neighbors=1,
106 n_jobs=1,
107 verbose=False,
108 sinkhorn=False,
109 sinkhorn_reg=0.1):
86 """ 110 """
87 Initialization of the class. 111 Initialization of the class.
88 Arguments 112 Arguments
@@ -94,7 +118,10 @@ class Wasserstein_Retriever(KNeighborsClassifier):
94 self.sinkhorn_reg = sinkhorn_reg 118 self.sinkhorn_reg = sinkhorn_reg
95 self.W_embed = W_embed 119 self.W_embed = W_embed
96 self.verbose = verbose 120 self.verbose = verbose
97 super(Wasserstein_Retriever, self).__init__(n_neighbors=n_neighbors, n_jobs=n_jobs, metric='precomputed', algorithm='brute') 121 super(Wasserstein_Retriever, self).__init__(n_neighbors=n_neighbors,
122 n_jobs=n_jobs,
123 metric='precomputed',
124 algorithm='brute')
98 125
99 def _wmd(self, i, row, X_train): 126 def _wmd(self, i, row, X_train):
100 union_idx = np.union1d(X_train[i].indices, row.indices) 127 union_idx = np.union1d(X_train[i].indices, row.indices)
@@ -103,9 +130,16 @@ class Wasserstein_Retriever(KNeighborsClassifier):
103 bow_i = X_train[i, union_idx].A.ravel() 130 bow_i = X_train[i, union_idx].A.ravel()
104 bow_j = row[:, union_idx].A.ravel() 131 bow_j = row[:, union_idx].A.ravel()
105 if self.sinkhorn: 132 if self.sinkhorn:
106 return ot.sinkhorn2(bow_i, bow_j, W_dist, self.sinkhorn_reg, numItermax=50, method='sinkhorn_stabilized',)[0] 133 return ot.sinkhorn2(
134 bow_i,
135 bow_j,
136 W_dist,
137 self.sinkhorn_reg,
138 numItermax=50,
139 method='sinkhorn_stabilized',
140 )[0]
107 else: 141 else:
108 return ot.emd2(bow_i, bow_j, W_dist) 142 return ot.emd2(bow_i, bow_j, W_dist)
109 143
110 def _wmd_row(self, row): 144 def _wmd_row(self, row):
111 X_train = self._fit_X 145 X_train = self._fit_X
@@ -117,8 +151,8 @@ class Wasserstein_Retriever(KNeighborsClassifier):
117 151
118 if X_train is None: 152 if X_train is None:
119 X_train = self._fit_X 153 X_train = self._fit_X
120 pool = Pool(nodes=self.n_jobs) # Parallelization of the calculation of the distances 154 pool = Pool(nodes=self.n_jobs)
121 dist = pool.map(self._wmd_row, X_test) 155 dist = pool.map(self._wmd_row, X_test)
122 return np.array(dist) 156 return np.array(dist)
123 157
124 def fit(self, X, y): 158 def fit(self, X, y):
@@ -144,8 +178,8 @@ class Wasserstein_Retriever(KNeighborsClassifier):
144 precision at one and percentage values 178 precision at one and percentage values
145 179
146 """ 180 """
147 dist, preds = self.kneighbors(X, n_neighbors) 181 _, preds = self.kneighbors(X, n_neighbors)
148 mrr, p_at_one = mrr_precision_at_k(list(range(len(preds))), preds) 182 _, p_at_one = mrr_precision_at_k(list(range(len(preds))), preds)
149 percentage = p_at_one * 100 183 percentage = p_at_one * 100
150 return (p_at_one, percentage) 184 return (p_at_one, percentage)
151 185
@@ -168,7 +202,8 @@ def load_embeddings(path, dimension=300):
168 fp.seek(0) 202 fp.seek(0)
169 for line in fp: 203 for line in fp:
170 elems = line.split() 204 elems = line.split()
171 vectors[" ".join(elems[:-dimension])] = " ".join(elems[-dimension:]) 205 vectors[" ".join(elems[:-dimension])] = " ".join(
206 elems[-dimension:])
172 return vectors 207 return vectors
173 208
174 209
@@ -177,7 +212,7 @@ def clean_corpus_using_embeddings_vocabulary(
177 corpus, 212 corpus,
178 vectors, 213 vectors,
179 language, 214 language,
180 ): 215):
181 ''' 216 '''
182 Cleans corpus using the dictionary of embeddings. 217 Cleans corpus using the dictionary of embeddings.
183 Any word without an associated embedding in the dictionary is ignored. 218 Any word without an associated embedding in the dictionary is ignored.
@@ -192,7 +227,8 @@ def clean_corpus_using_embeddings_vocabulary(
192 for word in words: 227 for word in words:
193 if word in words_we_want: 228 if word in words_we_want:
194 clean_doc.append(word + '__%s' % language) 229 clean_doc.append(word + '__%s' % language)
195 clean_vectors[word + '__%s' % language] = np.array(vectors[word].split()).astype(np.float) 230 clean_vectors[word + '__%s' % language] = np.array(
231 vectors[word].split()).astype(np.float)
196 if len(clean_doc) > 3 and len(clean_doc) < 25: 232 if len(clean_doc) > 3 and len(clean_doc) < 25:
197 keys.append(key) 233 keys.append(key)
198 clean_corpus.append(' '.join(clean_doc)) 234 clean_corpus.append(' '.join(clean_doc))
@@ -208,10 +244,9 @@ def mrr_precision_at_k(golden, preds, k_list=[1,]):
208 precision_at = np.zeros(len(k_list)) 244 precision_at = np.zeros(len(k_list))
209 for key, elem in enumerate(golden): 245 for key, elem in enumerate(golden):
210 if elem in preds[key]: 246 if elem in preds[key]:
211 location = np.where(preds[key]==elem)[0][0] 247 location = np.where(preds[key] == elem)[0][0]
212 my_score += 1/(1+ location) 248 my_score += 1 / (1 + location)
213 for k_index, k_value in enumerate(k_list): 249 for k_index, k_value in enumerate(k_list):
214 if location < k_value: 250 if location < k_value:
215 precision_at[k_index] += 1 251 precision_at[k_index] += 1
216 return my_score/len(golden), (precision_at/len(golden))[0] 252 return my_score / len(golden), (precision_at / len(golden))[0]
217