diff options
Diffstat (limited to 'Wasserstein_Distance.py')
-rw-r--r-- | Wasserstein_Distance.py | 109 |
1 files changed, 72 insertions, 37 deletions
diff --git a/Wasserstein_Distance.py b/Wasserstein_Distance.py index 08439d2..161c13c 100644 --- a/Wasserstein_Distance.py +++ b/Wasserstein_Distance.py | |||
@@ -1,15 +1,14 @@ | |||
1 | import ot | 1 | import numpy as np |
2 | from sklearn.preprocessing import normalize | ||
3 | from lapjv import lapjv | ||
4 | from sklearn.neighbors import KNeighborsClassifier | ||
5 | from sklearn.metrics import euclidean_distances | 2 | from sklearn.metrics import euclidean_distances |
6 | from sklearn.externals.joblib import Parallel, delayed | 3 | from sklearn.neighbors import KNeighborsClassifier |
4 | from sklearn.preprocessing import normalize | ||
7 | from sklearn.utils import check_array | 5 | from sklearn.utils import check_array |
8 | from sklearn.metrics.scorer import check_scoring | 6 | |
9 | from pathos.multiprocessing import ProcessingPool as Pool | 7 | import ot |
10 | from sklearn.metrics import euclidean_distances | 8 | from lapjv import lapjv |
11 | import numpy as np | ||
12 | from mosestokenizer import MosesTokenizer | 9 | from mosestokenizer import MosesTokenizer |
10 | from pathos.multiprocessing import ProcessingPool as Pool | ||
11 | |||
13 | 12 | ||
14 | class Wasserstein_Matcher(KNeighborsClassifier): | 13 | class Wasserstein_Matcher(KNeighborsClassifier): |
15 | """ | 14 | """ |
@@ -17,7 +16,13 @@ class Wasserstein_Matcher(KNeighborsClassifier): | |||
17 | Source and target distributions are l_1 normalized before computing the Wasserstein distance. | 16 | Source and target distributions are l_1 normalized before computing the Wasserstein distance. |
18 | Wasserstein is parametrized by the distances between the individual points of the distributions. | 17 | Wasserstein is parametrized by the distances between the individual points of the distributions. |
19 | """ | 18 | """ |
20 | def __init__(self, W_embed, n_neighbors=1, n_jobs=1, verbose=False, sinkhorn= False, sinkhorn_reg=0.1): | 19 | def __init__(self, |
20 | W_embed, | ||
21 | n_neighbors=1, | ||
22 | n_jobs=1, | ||
23 | verbose=False, | ||
24 | sinkhorn=False, | ||
25 | sinkhorn_reg=0.1): | ||
21 | """ | 26 | """ |
22 | Initialization of the class. | 27 | Initialization of the class. |
23 | Arguments | 28 | Arguments |
@@ -29,7 +34,10 @@ class Wasserstein_Matcher(KNeighborsClassifier): | |||
29 | self.sinkhorn_reg = sinkhorn_reg | 34 | self.sinkhorn_reg = sinkhorn_reg |
30 | self.W_embed = W_embed | 35 | self.W_embed = W_embed |
31 | self.verbose = verbose | 36 | self.verbose = verbose |
32 | super(Wasserstein_Matcher, self).__init__(n_neighbors=n_neighbors, n_jobs=n_jobs, metric='precomputed', algorithm='brute') | 37 | super(Wasserstein_Matcher, self).__init__(n_neighbors=n_neighbors, |
38 | n_jobs=n_jobs, | ||
39 | metric='precomputed', | ||
40 | algorithm='brute') | ||
33 | 41 | ||
34 | def _wmd(self, i, row, X_train): | 42 | def _wmd(self, i, row, X_train): |
35 | union_idx = np.union1d(X_train[i].indices, row.indices) | 43 | union_idx = np.union1d(X_train[i].indices, row.indices) |
@@ -38,9 +46,16 @@ class Wasserstein_Matcher(KNeighborsClassifier): | |||
38 | bow_i = X_train[i, union_idx].A.ravel() | 46 | bow_i = X_train[i, union_idx].A.ravel() |
39 | bow_j = row[:, union_idx].A.ravel() | 47 | bow_j = row[:, union_idx].A.ravel() |
40 | if self.sinkhorn: | 48 | if self.sinkhorn: |
41 | return ot.sinkhorn2(bow_i, bow_j, W_dist, self.sinkhorn_reg, numItermax=50, method='sinkhorn_stabilized',)[0] | 49 | return ot.sinkhorn2( |
50 | bow_i, | ||
51 | bow_j, | ||
52 | W_dist, | ||
53 | self.sinkhorn_reg, | ||
54 | numItermax=50, | ||
55 | method='sinkhorn_stabilized', | ||
56 | )[0] | ||
42 | else: | 57 | else: |
43 | return ot.emd2(bow_i, bow_j, W_dist) | 58 | return ot.emd2(bow_i, bow_j, W_dist) |
44 | 59 | ||
45 | def _wmd_row(self, row): | 60 | def _wmd_row(self, row): |
46 | X_train = self._fit_X | 61 | X_train = self._fit_X |
@@ -52,28 +67,31 @@ class Wasserstein_Matcher(KNeighborsClassifier): | |||
52 | 67 | ||
53 | if X_train is None: | 68 | if X_train is None: |
54 | X_train = self._fit_X | 69 | X_train = self._fit_X |
55 | pool = Pool(nodes=self.n_jobs) # Parallelization of the calculation of the distances | 70 | pool = Pool(nodes=self.n_jobs |
56 | dist = pool.map(self._wmd_row, X_test) | 71 | ) # Parallelization of the calculation of the distances |
72 | dist = pool.map(self._wmd_row, X_test) | ||
57 | return np.array(dist) | 73 | return np.array(dist) |
58 | 74 | ||
59 | def fit(self, X, y): # X_train_idf | 75 | def fit(self, X, y): # X_train_idf |
60 | X = check_array(X, accept_sparse='csr', copy=True) # check if array is sparse | 76 | X = check_array(X, accept_sparse='csr', |
77 | copy=True) # check if array is sparse | ||
61 | X = normalize(X, norm='l1', copy=False) | 78 | X = normalize(X, norm='l1', copy=False) |
62 | return super(Wasserstein_Matcher, self).fit(X, y) # X_train_idf, np_ones(document collection size) | 79 | return super(Wasserstein_Matcher, self).fit( |
80 | X, y) # X_train_idf, np_ones(document collection size) | ||
63 | 81 | ||
64 | def predict(self, X): | 82 | def predict(self, X): |
65 | X = check_array(X, accept_sparse='csr', copy=True) | 83 | X = check_array(X, accept_sparse='csr', copy=True) |
66 | X = normalize(X, norm='l1', copy=False) | 84 | X = normalize(X, norm='l1', copy=False) |
67 | dist = self._pairwise_wmd(X) | 85 | dist = self._pairwise_wmd(X) |
68 | dist = dist * 1000 # for lapjv, small floating point numbers are evil | 86 | dist = dist * 1000 # for lapjv, small floating point numbers are evil |
69 | return super(Wasserstein_Matcher, self).predict(dist) | 87 | return super(Wasserstein_Matcher, self).predict(dist) |
70 | 88 | ||
71 | def kneighbors(self, X, n_neighbors=1): # X : X_train_idf | 89 | def kneighbors(self, X, n_neighbors=1): # X : X_train_idf |
72 | X = check_array(X, accept_sparse='csr', copy=True) | 90 | X = check_array(X, accept_sparse='csr', copy=True) |
73 | X = normalize(X, norm='l1', copy=False) | 91 | X = normalize(X, norm='l1', copy=False) |
74 | dist = self._pairwise_wmd(X) | 92 | dist = self._pairwise_wmd(X) |
75 | dist = dist * 1000 # for lapjv, small floating point numbers are evil | 93 | dist = dist * 1000 # for lapjv, small floating point numbers are evil |
76 | return lapjv(dist) # and here is the matching part | 94 | return lapjv(dist) # and here is the matching part |
77 | 95 | ||
78 | 96 | ||
79 | class Wasserstein_Retriever(KNeighborsClassifier): | 97 | class Wasserstein_Retriever(KNeighborsClassifier): |
@@ -82,7 +100,13 @@ class Wasserstein_Retriever(KNeighborsClassifier): | |||
82 | Source and target distributions are l_1 normalized before computing the Wasserstein distance. | 100 | Source and target distributions are l_1 normalized before computing the Wasserstein distance. |
83 | Wasserstein is parametrized by the distances between the individual points of the distributions. | 101 | Wasserstein is parametrized by the distances between the individual points of the distributions. |
84 | """ | 102 | """ |
85 | def __init__(self, W_embed, n_neighbors=1, n_jobs=1, verbose=False, sinkhorn= False, sinkhorn_reg=0.1): | 103 | def __init__(self, |
104 | W_embed, | ||
105 | n_neighbors=1, | ||
106 | n_jobs=1, | ||
107 | verbose=False, | ||
108 | sinkhorn=False, | ||
109 | sinkhorn_reg=0.1): | ||
86 | """ | 110 | """ |
87 | Initialization of the class. | 111 | Initialization of the class. |
88 | Arguments | 112 | Arguments |
@@ -94,7 +118,10 @@ class Wasserstein_Retriever(KNeighborsClassifier): | |||
94 | self.sinkhorn_reg = sinkhorn_reg | 118 | self.sinkhorn_reg = sinkhorn_reg |
95 | self.W_embed = W_embed | 119 | self.W_embed = W_embed |
96 | self.verbose = verbose | 120 | self.verbose = verbose |
97 | super(Wasserstein_Retriever, self).__init__(n_neighbors=n_neighbors, n_jobs=n_jobs, metric='precomputed', algorithm='brute') | 121 | super(Wasserstein_Retriever, self).__init__(n_neighbors=n_neighbors, |
122 | n_jobs=n_jobs, | ||
123 | metric='precomputed', | ||
124 | algorithm='brute') | ||
98 | 125 | ||
99 | def _wmd(self, i, row, X_train): | 126 | def _wmd(self, i, row, X_train): |
100 | union_idx = np.union1d(X_train[i].indices, row.indices) | 127 | union_idx = np.union1d(X_train[i].indices, row.indices) |
@@ -103,9 +130,16 @@ class Wasserstein_Retriever(KNeighborsClassifier): | |||
103 | bow_i = X_train[i, union_idx].A.ravel() | 130 | bow_i = X_train[i, union_idx].A.ravel() |
104 | bow_j = row[:, union_idx].A.ravel() | 131 | bow_j = row[:, union_idx].A.ravel() |
105 | if self.sinkhorn: | 132 | if self.sinkhorn: |
106 | return ot.sinkhorn2(bow_i, bow_j, W_dist, self.sinkhorn_reg, numItermax=50, method='sinkhorn_stabilized',)[0] | 133 | return ot.sinkhorn2( |
134 | bow_i, | ||
135 | bow_j, | ||
136 | W_dist, | ||
137 | self.sinkhorn_reg, | ||
138 | numItermax=50, | ||
139 | method='sinkhorn_stabilized', | ||
140 | )[0] | ||
107 | else: | 141 | else: |
108 | return ot.emd2(bow_i, bow_j, W_dist) | 142 | return ot.emd2(bow_i, bow_j, W_dist) |
109 | 143 | ||
110 | def _wmd_row(self, row): | 144 | def _wmd_row(self, row): |
111 | X_train = self._fit_X | 145 | X_train = self._fit_X |
@@ -117,8 +151,8 @@ class Wasserstein_Retriever(KNeighborsClassifier): | |||
117 | 151 | ||
118 | if X_train is None: | 152 | if X_train is None: |
119 | X_train = self._fit_X | 153 | X_train = self._fit_X |
120 | pool = Pool(nodes=self.n_jobs) # Parallelization of the calculation of the distances | 154 | pool = Pool(nodes=self.n_jobs) |
121 | dist = pool.map(self._wmd_row, X_test) | 155 | dist = pool.map(self._wmd_row, X_test) |
122 | return np.array(dist) | 156 | return np.array(dist) |
123 | 157 | ||
124 | def fit(self, X, y): | 158 | def fit(self, X, y): |
@@ -144,8 +178,8 @@ class Wasserstein_Retriever(KNeighborsClassifier): | |||
144 | precision at one and percentage values | 178 | precision at one and percentage values |
145 | 179 | ||
146 | """ | 180 | """ |
147 | dist, preds = self.kneighbors(X, n_neighbors) | 181 | _, preds = self.kneighbors(X, n_neighbors) |
148 | mrr, p_at_one = mrr_precision_at_k(list(range(len(preds))), preds) | 182 | _, p_at_one = mrr_precision_at_k(list(range(len(preds))), preds) |
149 | percentage = p_at_one * 100 | 183 | percentage = p_at_one * 100 |
150 | return (p_at_one, percentage) | 184 | return (p_at_one, percentage) |
151 | 185 | ||
@@ -168,7 +202,8 @@ def load_embeddings(path, dimension=300): | |||
168 | fp.seek(0) | 202 | fp.seek(0) |
169 | for line in fp: | 203 | for line in fp: |
170 | elems = line.split() | 204 | elems = line.split() |
171 | vectors[" ".join(elems[:-dimension])] = " ".join(elems[-dimension:]) | 205 | vectors[" ".join(elems[:-dimension])] = " ".join( |
206 | elems[-dimension:]) | ||
172 | return vectors | 207 | return vectors |
173 | 208 | ||
174 | 209 | ||
@@ -177,7 +212,7 @@ def clean_corpus_using_embeddings_vocabulary( | |||
177 | corpus, | 212 | corpus, |
178 | vectors, | 213 | vectors, |
179 | language, | 214 | language, |
180 | ): | 215 | ): |
181 | ''' | 216 | ''' |
182 | Cleans corpus using the dictionary of embeddings. | 217 | Cleans corpus using the dictionary of embeddings. |
183 | Any word without an associated embedding in the dictionary is ignored. | 218 | Any word without an associated embedding in the dictionary is ignored. |
@@ -192,7 +227,8 @@ def clean_corpus_using_embeddings_vocabulary( | |||
192 | for word in words: | 227 | for word in words: |
193 | if word in words_we_want: | 228 | if word in words_we_want: |
194 | clean_doc.append(word + '__%s' % language) | 229 | clean_doc.append(word + '__%s' % language) |
195 | clean_vectors[word + '__%s' % language] = np.array(vectors[word].split()).astype(np.float) | 230 | clean_vectors[word + '__%s' % language] = np.array( |
231 | vectors[word].split()).astype(np.float) | ||
196 | if len(clean_doc) > 3 and len(clean_doc) < 25: | 232 | if len(clean_doc) > 3 and len(clean_doc) < 25: |
197 | keys.append(key) | 233 | keys.append(key) |
198 | clean_corpus.append(' '.join(clean_doc)) | 234 | clean_corpus.append(' '.join(clean_doc)) |
@@ -208,10 +244,9 @@ def mrr_precision_at_k(golden, preds, k_list=[1,]): | |||
208 | precision_at = np.zeros(len(k_list)) | 244 | precision_at = np.zeros(len(k_list)) |
209 | for key, elem in enumerate(golden): | 245 | for key, elem in enumerate(golden): |
210 | if elem in preds[key]: | 246 | if elem in preds[key]: |
211 | location = np.where(preds[key]==elem)[0][0] | 247 | location = np.where(preds[key] == elem)[0][0] |
212 | my_score += 1/(1+ location) | 248 | my_score += 1 / (1 + location) |
213 | for k_index, k_value in enumerate(k_list): | 249 | for k_index, k_value in enumerate(k_list): |
214 | if location < k_value: | 250 | if location < k_value: |
215 | precision_at[k_index] += 1 | 251 | precision_at[k_index] += 1 |
216 | return my_score/len(golden), (precision_at/len(golden))[0] | 252 | return my_score / len(golden), (precision_at / len(golden))[0] |
217 | |||