author    Yigit Sever 2019-09-22 02:02:35 +0300
committer Yigit Sever 2019-09-22 02:02:35 +0300
commit    40d725329a1fb3699dd3b2c870fc8213db9d75eb (patch)
tree      826c039fe8b95b87c78490d6809d20e3bb61322f
parent    2936635892e17031c37facfd2115e8cfd6633222 (diff)
Unified WMD/SNK matching & retrieval
-rw-r--r--  WMD_matching.py         | 17
-rw-r--r--  WMD_retrieval.py        | 15
-rw-r--r--  Wasserstein_Distance.py | 57
3 files changed, 46 insertions(+), 43 deletions(-)
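Both driver scripts now share one evaluation path: construct the estimator with the Sinkhorn flag, fit it on the source-language documents, and call align() for precision-at-one. A minimal sketch of that shared pattern follows; W_common, X_train_idf and X_test_idf are illustrative stand-ins for the embedding matrix and TF-IDF matrices the real scripts build from the corpora.

# Sketch of the unified call pattern, on stand-in data; the real scripts
# derive W_common and the TF-IDF matrices from word embeddings and
# sklearn vectorizers.
import numpy as np
from scipy.sparse import random as sparse_random
from Wasserstein_Distance import WassersteinMatcher

vocab, dim, instances = 500, 300, 100
W_common = np.random.rand(vocab, dim)  # stand-in embedding matrix
X_train_idf = sparse_random(instances, vocab, density=0.05, format='csr')
X_test_idf = sparse_random(instances, vocab, density=0.05, format='csr')

for metric in ('wmd', 'snk'):
    clf = WassersteinMatcher(W_embed=W_common,  # WassersteinRetriever works the same way
                             n_neighbors=5,
                             n_jobs=14,
                             sinkhorn=(metric == 'snk'))
    clf.fit(X_train_idf[:instances], np.ones(instances))
    p_at_one, percentage = clf.align(X_test_idf[:instances],
                                     n_neighbors=instances)
    print(f'P @ 1: {p_at_one}\ninstances: {instances}\n{percentage}%')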
diff --git a/WMD_matching.py b/WMD_matching.py
index ea496b8..2755d15 100644
--- a/WMD_matching.py
+++ b/WMD_matching.py
@@ -6,7 +6,7 @@ import numpy as np
 from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
 from sklearn.preprocessing import normalize
 
-from Wasserstein_Distance import (Wasserstein_Matcher,
+from Wasserstein_Distance import (WassersteinMatcher,
                                   clean_corpus_using_embeddings_vocabulary,
                                   load_embeddings)
 
@@ -103,16 +103,13 @@ def main(args):
         if not batch:
             print(f'{metric}: {source_lang} - {target_lang}')
 
-        clf = Wasserstein_Matcher(W_embed=W_common,
+        clf = WassersteinMatcher(W_embed=W_common,
                                  n_neighbors=5,
                                  n_jobs=14,
                                  sinkhorn=(metric == 'snk'))
         clf.fit(X_train_idf[:instances], np.ones(instances))
-        row_ind, col_ind, _ = clf.kneighbors(X_test_idf[:instances],
-                                             n_neighbors=instances)
-        result = zip(row_ind, col_ind)
-        p_at_one = len([x for x, y in result if x == y])
-        percentage = p_at_one / instances * 100
+        p_at_one, percentage = clf.align(X_test_idf[:instances],
+                                         n_neighbors=instances)
 
         if not batch:
             print(f'P @ 1: {p_at_one}\ninstances: {instances}\n{percentage}%')
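The five lines deleted above survive as WassersteinMatcher.align() (see the Wasserstein_Distance.py hunks below). The logic is a linear assignment: solve the square pairwise-WMD matrix with lapjv, then count the positions where the two assignment arrays agree as precision-at-one hits. A self-contained sketch on a toy cost matrix:

import numpy as np
from lapjv import lapjv  # Jonker-Volgenant linear assignment solver

# Toy stand-in for the pairwise WMD matrix; the matcher scales its real
# distances by 1000 because lapjv misbehaves on tiny floats.
rng = np.random.default_rng(0)
dist = rng.random((5, 5)) * 1000

row_ind, col_ind, _ = lapjv(dist)  # assignments row->col and col->row
result = zip(row_ind, col_ind)
p_at_one = len([x for x, y in result if x == y])  # gold pairs sit on the diagonal
percentage = p_at_one / len(dist) * 100
print(p_at_one, percentage)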
diff --git a/WMD_retrieval.py b/WMD_retrieval.py
index 3328023..02f35be 100644
--- a/WMD_retrieval.py
+++ b/WMD_retrieval.py
@@ -6,7 +6,7 @@ import numpy as np
 from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
 from sklearn.preprocessing import normalize
 
-from Wasserstein_Distance import (Wasserstein_Retriever,
+from Wasserstein_Distance import (WassersteinRetriever,
                                   clean_corpus_using_embeddings_vocabulary,
                                   load_embeddings)
 
@@ -101,16 +101,13 @@ def main(args):
 
     for metric in runfor:
         if not batch:
-            print(f'{metric} - tfidf: {source_lang} - {target_lang}')
+            print(f'{metric}: {source_lang} - {target_lang}')
 
-        clf = Wasserstein_Retriever(W_embed=W_common,
+        clf = WassersteinRetriever(W_embed=W_common,
                                    n_neighbors=5,
                                    n_jobs=14,
                                    sinkhorn=(metric == 'snk'))
         clf.fit(X_train_idf[:instances], np.ones(instances))
-        # dist, preds = clf.kneighbors(X_test_idf[:instances], n_neighbors=instances)
-        # mrr, p_at_one = mrr_precision_at_k(list(range(len(preds))), preds)
-        # percentage = p_at_one * 100
         p_at_one, percentage = clf.align(X_test_idf[:instances],
                                          n_neighbors=instances)
 
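The deleted comments document the retrieval-side metric: query i's gold target is i (the golden argument was list(range(len(preds)))), so precision@1 is the share of queries whose nearest neighbour by WMD is their own translation. A hedged sketch of that computation over sklearn-style kneighbors output; precision_at_one and the toy preds array are illustrative, not repository code:

import numpy as np

def precision_at_one(preds):
    """preds[i] lists target indices for query i, nearest first."""
    hits = sum(1 for i, ranking in enumerate(preds) if ranking[0] == i)
    return hits / len(preds) * 100

# Toy ranking: queries 0 and 2 retrieve their own translation first.
preds = np.array([[0, 1, 2], [2, 1, 0], [2, 0, 1]])
print(precision_at_one(preds))  # 66.66...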
diff --git a/Wasserstein_Distance.py b/Wasserstein_Distance.py
index 161c13c..78bf9cf 100644
--- a/Wasserstein_Distance.py
+++ b/Wasserstein_Distance.py
@@ -1,16 +1,15 @@
 import numpy as np
-from sklearn.metrics import euclidean_distances
-from sklearn.neighbors import KNeighborsClassifier
-from sklearn.preprocessing import normalize
-from sklearn.utils import check_array
-
 import ot
 from lapjv import lapjv
 from mosestokenizer import MosesTokenizer
 from pathos.multiprocessing import ProcessingPool as Pool
+from sklearn.metrics import euclidean_distances
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.preprocessing import normalize
+from sklearn.utils import check_array
 
 
-class Wasserstein_Matcher(KNeighborsClassifier):
+class WassersteinMatcher(KNeighborsClassifier):
     """
     Implements a nearest neighbors classifier for input distributions using the Wasserstein distance as metric.
     Source and target distributions are l_1 normalized before computing the Wasserstein distance.
@@ -34,10 +33,10 @@ class Wasserstein_Matcher(KNeighborsClassifier):
         self.sinkhorn_reg = sinkhorn_reg
         self.W_embed = W_embed
         self.verbose = verbose
-        super(Wasserstein_Matcher, self).__init__(n_neighbors=n_neighbors,
+        super(WassersteinMatcher, self).__init__(n_neighbors=n_neighbors,
                                                   n_jobs=n_jobs,
                                                   metric='precomputed',
                                                   algorithm='brute')
 
     def _wmd(self, i, row, X_train):
         union_idx = np.union1d(X_train[i].indices, row.indices)
@@ -76,25 +75,34 @@ class Wasserstein_Matcher(KNeighborsClassifier):
         X = check_array(X, accept_sparse='csr',
                         copy=True)  # check if array is sparse
         X = normalize(X, norm='l1', copy=False)
-        return super(Wasserstein_Matcher, self).fit(
-            X, y)  # X_train_idf, np_ones(document collection size)
+        return super(WassersteinMatcher, self).fit(X, y)
 
     def predict(self, X):
         X = check_array(X, accept_sparse='csr', copy=True)
         X = normalize(X, norm='l1', copy=False)
         dist = self._pairwise_wmd(X)
         dist = dist * 1000  # for lapjv, small floating point numbers are evil
-        return super(Wasserstein_Matcher, self).predict(dist)
+        return super(WassersteinMatcher, self).predict(dist)
 
-    def kneighbors(self, X, n_neighbors=1):  # X : X_train_idf
+    def kneighbors(self, X, n_neighbors=1):
         X = check_array(X, accept_sparse='csr', copy=True)
         X = normalize(X, norm='l1', copy=False)
         dist = self._pairwise_wmd(X)
         dist = dist * 1000  # for lapjv, small floating point numbers are evil
-        return lapjv(dist)  # and here is the matching part
+        return lapjv(dist)
 
+    def align(self, X, n_neighbors=1):
+        """ Wrapper function over kneighbors to return
+        precision at one and percentage values
 
-class Wasserstein_Retriever(KNeighborsClassifier):
+        """
+        row_ind, col_ind, _ = self.kneighbors(X, n_neighbors)
+        result = zip(row_ind, col_ind)
+        p_at_one = len([x for x, y in result if x == y])
+        percentage = p_at_one / n_neighbors * 100
+        return p_at_one, percentage
+
+class WassersteinRetriever(KNeighborsClassifier):
     """
     Implements a nearest neighbors classifier for input distributions using the Wasserstein distance as metric.
     Source and target distributions are l_1 normalized before computing the Wasserstein distance.
@@ -118,7 +126,7 @@ class Wasserstein_Retriever(KNeighborsClassifier):
         self.sinkhorn_reg = sinkhorn_reg
         self.W_embed = W_embed
         self.verbose = verbose
-        super(Wasserstein_Retriever, self).__init__(n_neighbors=n_neighbors,
+        super(WassersteinRetriever, self).__init__(n_neighbors=n_neighbors,
                                                     n_jobs=n_jobs,
                                                     metric='precomputed',
                                                     algorithm='brute')
@@ -158,23 +166,22 @@ class Wasserstein_Retriever(KNeighborsClassifier):
     def fit(self, X, y):
         X = check_array(X, accept_sparse='csr', copy=True)
         X = normalize(X, norm='l1', copy=False)
-        return super(Wasserstein_Retriever, self).fit(X, y)
+        return super(WassersteinRetriever, self).fit(X, y)
 
     def predict(self, X):
         X = check_array(X, accept_sparse='csr', copy=True)
         X = normalize(X, norm='l1', copy=False)
         dist = self._pairwise_wmd(X)
-        return super(Wasserstein_Retriever, self).predict(dist)
+        return super(WassersteinRetriever, self).predict(dist)
 
     def kneighbors(self, X, n_neighbors=1):
         X = check_array(X, accept_sparse='csr', copy=True)
         X = normalize(X, norm='l1', copy=False)
         dist = self._pairwise_wmd(X)
-        return super(Wasserstein_Retriever, self).kneighbors(dist, n_neighbors)
+        return super(WassersteinRetriever, self).kneighbors(dist, n_neighbors)
 
     def align(self, X, n_neighbors=1):
-        """
-        Wrapper function over kneighbors to return
+        """ Wrapper function over kneighbors to return
         precision at one and percentage values
 
         """
@@ -196,7 +203,7 @@ def load_embeddings(path, dimension=300):
         first_line = fp.readline().rstrip('\n')
         if first_line.count(' ') == 1:
             # includes the "word_count dimension" information
-            (word_count, dimension) = map(int, first_line.split())
+            (_, dimension) = map(int, first_line.split())
         else:
             # assume the file only contains vectors
             fp.seek(0)
@@ -236,7 +243,9 @@
     return np.array(clean_corpus), clean_vectors, keys
 
 
-def mrr_precision_at_k(golden, preds, k_list=[1,]):
+def mrr_precision_at_k(golden, preds, k_list=[
+    1,
+]):
     """
     Calculates Mean Reciprocal Error and Hits@1 == Precision@1
     """
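For context, the per-pair cost that both classes precompute in _pairwise_wmd/_wmd: a ground-cost matrix of euclidean distances between word embeddings, handed to POT for the optimal transport cost, exact or entropy-regularized depending on the sinkhorn flag. A minimal sketch under those assumptions; the wmd helper, the toy data, and the 0.1 regularizer are illustrative, not taken from the module:

import numpy as np
import ot  # POT: Python Optimal Transport
from sklearn.metrics import euclidean_distances

def wmd(bow_a, bow_b, W_embed, sinkhorn=False, sinkhorn_reg=0.1):
    """Word Mover's Distance between two l1-normalized bag-of-words rows."""
    W_dist = euclidean_distances(W_embed)  # ground metric over the vocabulary
    if sinkhorn:
        # entropy-regularized approximation (the 'snk' branch): faster, biased
        return ot.sinkhorn2(bow_a, bow_b, W_dist, sinkhorn_reg)
    return ot.emd2(bow_a, bow_b, W_dist)  # exact transport cost

# Toy example: 4-word vocabulary, 3-dim embeddings, two documents.
rng = np.random.default_rng(0)
W_embed = rng.random((4, 3))
bow_a = np.array([0.5, 0.5, 0.0, 0.0])  # document A's word histogram
bow_b = np.array([0.0, 0.0, 0.5, 0.5])  # document B's word histogram
print(wmd(bow_a, bow_b, W_embed), wmd(bow_a, bow_b, W_embed, sinkhorn=True))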