about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Yigit Sever, 2019-09-22 02:02:35 +0300
committer: Yigit Sever, 2019-09-22 02:02:35 +0300
commit: 40d725329a1fb3699dd3b2c870fc8213db9d75eb (patch)
tree: 826c039fe8b95b87c78490d6809d20e3bb61322f
parent: 2936635892e17031c37facfd2115e8cfd6633222 (diff)
download: Evaluating-Dictionary-Alignment-40d725329a1fb3699dd3b2c870fc8213db9d75eb.tar.gz
Evaluating-Dictionary-Alignment-40d725329a1fb3699dd3b2c870fc8213db9d75eb.tar.bz2
Evaluating-Dictionary-Alignment-40d725329a1fb3699dd3b2c870fc8213db9d75eb.zip
Unified WMD/SNK matching & retrieval
-rw-r--r-- WMD_matching.py 17
-rw-r--r-- WMD_retrieval.py 15
-rw-r--r-- Wasserstein_Distance.py 57
3 files changed, 46 insertions, 43 deletions
diff --git a/WMD_matching.py b/WMD_matching.py
index ea496b8..2755d15 100644
--- a/WMD_matching.py
+++ b/WMD_matching.py
@@ -6,7 +6,7 @@ import numpy as np
6from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer 6from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
7from sklearn.preprocessing import normalize 7from sklearn.preprocessing import normalize
8 8
9from Wasserstein_Distance import (Wasserstein_Matcher, 9from Wasserstein_Distance import (WassersteinMatcher,
10 clean_corpus_using_embeddings_vocabulary, 10 clean_corpus_using_embeddings_vocabulary,
11 load_embeddings) 11 load_embeddings)
12 12
@@ -103,16 +103,13 @@ def main(args):
103 if not batch: 103 if not batch:
104 print(f'{metric}: {source_lang} - {target_lang}') 104 print(f'{metric}: {source_lang} - {target_lang}')
105 105
106 clf = Wasserstein_Matcher(W_embed=W_common, 106 clf = WassersteinMatcher(W_embed=W_common,
107 n_neighbors=5, 107 n_neighbors=5,
108 n_jobs=14, 108 n_jobs=14,
109 sinkhorn=(metric == 'snk')) 109 sinkhorn=(metric == 'snk'))
110 clf.fit(X_train_idf[:instances], np.ones(instances)) 110 clf.fit(X_train_idf[:instances], np.ones(instances))
111 row_ind, col_ind, _ = clf.kneighbors(X_test_idf[:instances], 111 p_at_one, percentage = clf.align(X_test_idf[:instances],
112 n_neighbors=instances) 112 n_neighbors=instances)
113 result = zip(row_ind, col_ind)
114 p_at_one = len([x for x, y in result if x == y])
115 percentage = p_at_one / instances * 100
116 113
117 if not batch: 114 if not batch:
118 print(f'P @ 1: {p_at_one}\ninstances: {instances}\n{percentage}%') 115 print(f'P @ 1: {p_at_one}\ninstances: {instances}\n{percentage}%')
diff --git a/WMD_retrieval.py b/WMD_retrieval.py
index 3328023..02f35be 100644
--- a/WMD_retrieval.py
+++ b/WMD_retrieval.py
@@ -6,7 +6,7 @@ import numpy as np
6from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer 6from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
7from sklearn.preprocessing import normalize 7from sklearn.preprocessing import normalize
8 8
9from Wasserstein_Distance import (Wasserstein_Retriever, 9from Wasserstein_Distance import (WassersteinRetriever,
10 clean_corpus_using_embeddings_vocabulary, 10 clean_corpus_using_embeddings_vocabulary,
11 load_embeddings) 11 load_embeddings)
12 12
@@ -101,16 +101,13 @@ def main(args):
101 101
102 for metric in runfor: 102 for metric in runfor:
103 if not batch: 103 if not batch:
104 print(f'{metric} - tfidf: {source_lang} - {target_lang}') 104 print(f'{metric}: {source_lang} - {target_lang}')
105 105
106 clf = Wasserstein_Retriever(W_embed=W_common, 106 clf = WassersteinRetriever(W_embed=W_common,
107 n_neighbors=5, 107 n_neighbors=5,
108 n_jobs=14, 108 n_jobs=14,
109 sinkhorn=(metric == 'snk')) 109 sinkhorn=(metric == 'snk'))
110 clf.fit(X_train_idf[:instances], np.ones(instances)) 110 clf.fit(X_train_idf[:instances], np.ones(instances))
111 # dist, preds = clf.kneighbors(X_test_idf[:instances], n_neighbors=instances)
112 # mrr, p_at_one = mrr_precision_at_k(list(range(len(preds))), preds)
113 # percentage = p_at_one * 100
114 p_at_one, percentage = clf.align(X_test_idf[:instances], 111 p_at_one, percentage = clf.align(X_test_idf[:instances],
115 n_neighbors=instances) 112 n_neighbors=instances)
116 113
diff --git a/Wasserstein_Distance.py b/Wasserstein_Distance.py
index 161c13c..78bf9cf 100644
--- a/Wasserstein_Distance.py
+++ b/Wasserstein_Distance.py
@@ -1,16 +1,15 @@
1import numpy as np 1import numpy as np
2from sklearn.metrics import euclidean_distances
3from sklearn.neighbors import KNeighborsClassifier
4from sklearn.preprocessing import normalize
5from sklearn.utils import check_array
6
7import ot 2import ot
8from lapjv import lapjv 3from lapjv import lapjv
9from mosestokenizer import MosesTokenizer 4from mosestokenizer import MosesTokenizer
10from pathos.multiprocessing import ProcessingPool as Pool 5from pathos.multiprocessing import ProcessingPool as Pool
6from sklearn.metrics import euclidean_distances
7from sklearn.neighbors import KNeighborsClassifier
8from sklearn.preprocessing import normalize
9from sklearn.utils import check_array
11 10
12 11
13class Wasserstein_Matcher(KNeighborsClassifier): 12class WassersteinMatcher(KNeighborsClassifier):
14 """ 13 """
15 Implements a nearest neighbors classifier for input distributions using the Wasserstein distance as metric. 14 Implements a nearest neighbors classifier for input distributions using the Wasserstein distance as metric.
16 Source and target distributions are l_1 normalized before computing the Wasserstein distance. 15 Source and target distributions are l_1 normalized before computing the Wasserstein distance.
@@ -34,10 +33,10 @@ class Wasserstein_Matcher(KNeighborsClassifier):
34 self.sinkhorn_reg = sinkhorn_reg 33 self.sinkhorn_reg = sinkhorn_reg
35 self.W_embed = W_embed 34 self.W_embed = W_embed
36 self.verbose = verbose 35 self.verbose = verbose
37 super(Wasserstein_Matcher, self).__init__(n_neighbors=n_neighbors, 36 super(WassersteinMatcher, self).__init__(n_neighbors=n_neighbors,
38 n_jobs=n_jobs, 37 n_jobs=n_jobs,
39 metric='precomputed', 38 metric='precomputed',
40 algorithm='brute') 39 algorithm='brute')
41 40
42 def _wmd(self, i, row, X_train): 41 def _wmd(self, i, row, X_train):
43 union_idx = np.union1d(X_train[i].indices, row.indices) 42 union_idx = np.union1d(X_train[i].indices, row.indices)
@@ -76,25 +75,34 @@ class Wasserstein_Matcher(KNeighborsClassifier):
76 X = check_array(X, accept_sparse='csr', 75 X = check_array(X, accept_sparse='csr',
77 copy=True) # check if array is sparse 76 copy=True) # check if array is sparse
78 X = normalize(X, norm='l1', copy=False) 77 X = normalize(X, norm='l1', copy=False)
79 return super(Wasserstein_Matcher, self).fit( 78 return super(WassersteinMatcher, self).fit(X, y)
80 X, y) # X_train_idf, np_ones(document collection size)
81 79
82 def predict(self, X): 80 def predict(self, X):
83 X = check_array(X, accept_sparse='csr', copy=True) 81 X = check_array(X, accept_sparse='csr', copy=True)
84 X = normalize(X, norm='l1', copy=False) 82 X = normalize(X, norm='l1', copy=False)
85 dist = self._pairwise_wmd(X) 83 dist = self._pairwise_wmd(X)
86 dist = dist * 1000 # for lapjv, small floating point numbers are evil 84 dist = dist * 1000 # for lapjv, small floating point numbers are evil
87 return super(Wasserstein_Matcher, self).predict(dist) 85 return super(WassersteinMatcher, self).predict(dist)
88 86
89 def kneighbors(self, X, n_neighbors=1): # X : X_train_idf 87 def kneighbors(self, X, n_neighbors=1):
90 X = check_array(X, accept_sparse='csr', copy=True) 88 X = check_array(X, accept_sparse='csr', copy=True)
91 X = normalize(X, norm='l1', copy=False) 89 X = normalize(X, norm='l1', copy=False)
92 dist = self._pairwise_wmd(X) 90 dist = self._pairwise_wmd(X)
93 dist = dist * 1000 # for lapjv, small floating point numbers are evil 91 dist = dist * 1000 # for lapjv, small floating point numbers are evil
94 return lapjv(dist) # and here is the matching part 92 return lapjv(dist)
95 93
94 def align(self, X, n_neighbors=1):
95 """ Wrapper function over kneighbors to return
96 precision at one and percentage values
96 97
97class Wasserstein_Retriever(KNeighborsClassifier): 98 """
99 row_ind, col_ind, _ = self.kneighbors(X, n_neighbors)
100 result = zip(row_ind, col_ind)
101 p_at_one = len([x for x, y in result if x == y])
102 percentage = p_at_one / n_neighbors * 100
103 return p_at_one, percentage
104
105class WassersteinRetriever(KNeighborsClassifier):
98 """ 106 """
99 Implements a nearest neighbors classifier for input distributions using the Wasserstein distance as metric. 107 Implements a nearest neighbors classifier for input distributions using the Wasserstein distance as metric.
100 Source and target distributions are l_1 normalized before computing the Wasserstein distance. 108 Source and target distributions are l_1 normalized before computing the Wasserstein distance.
@@ -118,7 +126,7 @@ class Wasserstein_Retriever(KNeighborsClassifier):
118 self.sinkhorn_reg = sinkhorn_reg 126 self.sinkhorn_reg = sinkhorn_reg
119 self.W_embed = W_embed 127 self.W_embed = W_embed
120 self.verbose = verbose 128 self.verbose = verbose
121 super(Wasserstein_Retriever, self).__init__(n_neighbors=n_neighbors, 129 super(WassersteinRetriever, self).__init__(n_neighbors=n_neighbors,
122 n_jobs=n_jobs, 130 n_jobs=n_jobs,
123 metric='precomputed', 131 metric='precomputed',
124 algorithm='brute') 132 algorithm='brute')
@@ -158,23 +166,22 @@ class Wasserstein_Retriever(KNeighborsClassifier):
158 def fit(self, X, y): 166 def fit(self, X, y):
159 X = check_array(X, accept_sparse='csr', copy=True) 167 X = check_array(X, accept_sparse='csr', copy=True)
160 X = normalize(X, norm='l1', copy=False) 168 X = normalize(X, norm='l1', copy=False)
161 return super(Wasserstein_Retriever, self).fit(X, y) 169 return super(WassersteinRetriever, self).fit(X, y)
162 170
163 def predict(self, X): 171 def predict(self, X):
164 X = check_array(X, accept_sparse='csr', copy=True) 172 X = check_array(X, accept_sparse='csr', copy=True)
165 X = normalize(X, norm='l1', copy=False) 173 X = normalize(X, norm='l1', copy=False)
166 dist = self._pairwise_wmd(X) 174 dist = self._pairwise_wmd(X)
167 return super(Wasserstein_Retriever, self).predict(dist) 175 return super(WassersteinRetriever, self).predict(dist)
168 176
169 def kneighbors(self, X, n_neighbors=1): 177 def kneighbors(self, X, n_neighbors=1):
170 X = check_array(X, accept_sparse='csr', copy=True) 178 X = check_array(X, accept_sparse='csr', copy=True)
171 X = normalize(X, norm='l1', copy=False) 179 X = normalize(X, norm='l1', copy=False)
172 dist = self._pairwise_wmd(X) 180 dist = self._pairwise_wmd(X)
173 return super(Wasserstein_Retriever, self).kneighbors(dist, n_neighbors) 181 return super(WassersteinRetriever, self).kneighbors(dist, n_neighbors)
174 182
175 def align(self, X, n_neighbors=1): 183 def align(self, X, n_neighbors=1):
176 """ 184 """ Wrapper function over kneighbors to return
177 Wrapper function over kneighbors to return
178 precision at one and percentage values 185 precision at one and percentage values
179 186
180 """ 187 """
@@ -196,7 +203,7 @@ def load_embeddings(path, dimension=300):
196 first_line = fp.readline().rstrip('\n') 203 first_line = fp.readline().rstrip('\n')
197 if first_line.count(' ') == 1: 204 if first_line.count(' ') == 1:
198 # includes the "word_count dimension" information 205 # includes the "word_count dimension" information
199 (word_count, dimension) = map(int, first_line.split()) 206 (_, dimension) = map(int, first_line.split())
200 else: 207 else:
201 # assume the file only contains vectors 208 # assume the file only contains vectors
202 fp.seek(0) 209 fp.seek(0)
@@ -236,7 +243,9 @@ def clean_corpus_using_embeddings_vocabulary(
236 return np.array(clean_corpus), clean_vectors, keys 243 return np.array(clean_corpus), clean_vectors, keys
237 244
238 245
239def mrr_precision_at_k(golden, preds, k_list=[1,]): 246def mrr_precision_at_k(golden, preds, k_list=[
247 1,
248]):
240 """ 249 """
241 Calculates Mean Reciprocal Error and Hits@1 == Precision@1 250 Calculates Mean Reciprocal Error and Hits@1 == Precision@1
242 """ 251 """