From 40d725329a1fb3699dd3b2c870fc8213db9d75eb Mon Sep 17 00:00:00 2001
From: Yigit Sever
Date: Sun, 22 Sep 2019 02:02:35 +0300
Subject: Unified WMD/SNK matching & retrieval

---
 WMD_matching.py         | 17 ++++++---------
 WMD_retrieval.py        | 15 ++++++-------
 Wasserstein_Distance.py | 57 ++++++++++++++++++++++++++++---------------------
 3 files changed, 46 insertions(+), 43 deletions(-)

diff --git a/WMD_matching.py b/WMD_matching.py
index ea496b8..2755d15 100644
--- a/WMD_matching.py
+++ b/WMD_matching.py
@@ -6,7 +6,7 @@ import numpy as np
 from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
 from sklearn.preprocessing import normalize
 
-from Wasserstein_Distance import (Wasserstein_Matcher,
+from Wasserstein_Distance import (WassersteinMatcher,
                                   clean_corpus_using_embeddings_vocabulary,
                                   load_embeddings)
 
@@ -103,16 +103,13 @@ def main(args):
         if not batch:
             print(f'{metric}: {source_lang} - {target_lang}')
 
-        clf = Wasserstein_Matcher(W_embed=W_common,
-                                  n_neighbors=5,
-                                  n_jobs=14,
-                                  sinkhorn=(metric == 'snk'))
+        clf = WassersteinMatcher(W_embed=W_common,
+                                 n_neighbors=5,
+                                 n_jobs=14,
+                                 sinkhorn=(metric == 'snk'))
         clf.fit(X_train_idf[:instances], np.ones(instances))
-        row_ind, col_ind, _ = clf.kneighbors(X_test_idf[:instances],
-                                             n_neighbors=instances)
-        result = zip(row_ind, col_ind)
-        p_at_one = len([x for x, y in result if x == y])
-        percentage = p_at_one / instances * 100
+        p_at_one, percentage = clf.align(X_test_idf[:instances],
+                                         n_neighbors=instances)
 
         if not batch:
             print(f'P @ 1: {p_at_one}\ninstances: {instances}\n{percentage}%')
diff --git a/WMD_retrieval.py b/WMD_retrieval.py
index 3328023..02f35be 100644
--- a/WMD_retrieval.py
+++ b/WMD_retrieval.py
@@ -6,7 +6,7 @@ import numpy as np
 from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
 from sklearn.preprocessing import normalize
 
-from Wasserstein_Distance import (Wasserstein_Retriever,
+from Wasserstein_Distance import (WassersteinRetriever,
                                   clean_corpus_using_embeddings_vocabulary,
                                   load_embeddings)
 
@@ -101,16 +101,13 @@ def main(args):
     for metric in runfor:
         if not batch:
-            print(f'{metric} - tfidf: {source_lang} - {target_lang}')
+            print(f'{metric}: {source_lang} - {target_lang}')
 
-        clf = Wasserstein_Retriever(W_embed=W_common,
-                                    n_neighbors=5,
-                                    n_jobs=14,
-                                    sinkhorn=(metric == 'snk'))
+        clf = WassersteinRetriever(W_embed=W_common,
+                                   n_neighbors=5,
+                                   n_jobs=14,
+                                   sinkhorn=(metric == 'snk'))
         clf.fit(X_train_idf[:instances], np.ones(instances))
-        # dist, preds = clf.kneighbors(X_test_idf[:instances], n_neighbors=instances)
-        # mrr, p_at_one = mrr_precision_at_k(list(range(len(preds))), preds)
-        # percentage = p_at_one * 100
         p_at_one, percentage = clf.align(X_test_idf[:instances],
                                          n_neighbors=instances)
 
         if not batch:
diff --git a/Wasserstein_Distance.py b/Wasserstein_Distance.py
index 161c13c..78bf9cf 100644
--- a/Wasserstein_Distance.py
+++ b/Wasserstein_Distance.py
@@ -1,16 +1,15 @@
 import numpy as np
-from sklearn.metrics import euclidean_distances
-from sklearn.neighbors import KNeighborsClassifier
-from sklearn.preprocessing import normalize
-from sklearn.utils import check_array
-
 import ot
 from lapjv import lapjv
 from mosestokenizer import MosesTokenizer
 from pathos.multiprocessing import ProcessingPool as Pool
+from sklearn.metrics import euclidean_distances
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.preprocessing import normalize
+from sklearn.utils import check_array
 
-class Wasserstein_Matcher(KNeighborsClassifier):
+class WassersteinMatcher(KNeighborsClassifier):
     """
     Implements a nearest neighbors classifier for input distributions using
     the Wasserstein distance as metric. Source and target distributions
     are l_1 normalized before computing the Wasserstein distance.
@@ -34,10 +33,10 @@ class Wasserstein_Matcher(KNeighborsClassifier):
         self.sinkhorn_reg = sinkhorn_reg
         self.W_embed = W_embed
         self.verbose = verbose
-        super(Wasserstein_Matcher, self).__init__(n_neighbors=n_neighbors,
-                                                  n_jobs=n_jobs,
-                                                  metric='precomputed',
-                                                  algorithm='brute')
+        super(WassersteinMatcher, self).__init__(n_neighbors=n_neighbors,
+                                                 n_jobs=n_jobs,
+                                                 metric='precomputed',
+                                                 algorithm='brute')
 
     def _wmd(self, i, row, X_train):
         union_idx = np.union1d(X_train[i].indices, row.indices)
@@ -76,25 +75,34 @@ class Wasserstein_Matcher(KNeighborsClassifier):
         X = check_array(X, accept_sparse='csr', copy=True)  # check if array is sparse
         X = normalize(X, norm='l1', copy=False)
-        return super(Wasserstein_Matcher, self).fit(
-            X, y)  # X_train_idf, np_ones(document collection size)
+        return super(WassersteinMatcher, self).fit(X, y)
 
     def predict(self, X):
         X = check_array(X, accept_sparse='csr', copy=True)
         X = normalize(X, norm='l1', copy=False)
         dist = self._pairwise_wmd(X)
         dist = dist * 1000  # for lapjv, small floating point numbers are evil
-        return super(Wasserstein_Matcher, self).predict(dist)
+        return super(WassersteinMatcher, self).predict(dist)
 
-    def kneighbors(self, X, n_neighbors=1):  # X : X_train_idf
+    def kneighbors(self, X, n_neighbors=1):
         X = check_array(X, accept_sparse='csr', copy=True)
         X = normalize(X, norm='l1', copy=False)
         dist = self._pairwise_wmd(X)
         dist = dist * 1000  # for lapjv, small floating point numbers are evil
-        return lapjv(dist)  # and here is the matching part
+        return lapjv(dist)
 
+    def align(self, X, n_neighbors=1):
+        """ Wrapper function over kneighbors to return
+            precision at one and percentage values
 
-class Wasserstein_Retriever(KNeighborsClassifier):
-    """
+        """
+        row_ind, col_ind, _ = self.kneighbors(X, n_neighbors)
+        result = zip(row_ind, col_ind)
+        p_at_one = len([x for x, y in result if x == y])
+        percentage = p_at_one / n_neighbors * 100
+        return p_at_one, percentage
+
+
+class WassersteinRetriever(KNeighborsClassifier):
+    """
     Implements a nearest neighbors classifier for input distributions using
     the Wasserstein distance as metric. Source and target distributions
     are l_1 normalized before computing the Wasserstein distance.
@@ -118,7 +126,7 @@ class Wasserstein_Retriever(KNeighborsClassifier):
         self.sinkhorn_reg = sinkhorn_reg
         self.W_embed = W_embed
         self.verbose = verbose
-        super(Wasserstein_Retriever, self).__init__(n_neighbors=n_neighbors,
+        super(WassersteinRetriever, self).__init__(n_neighbors=n_neighbors,
                                                     n_jobs=n_jobs,
                                                     metric='precomputed',
                                                     algorithm='brute')
@@ -158,23 +166,22 @@ def fit(self, X, y):
         X = check_array(X, accept_sparse='csr', copy=True)
         X = normalize(X, norm='l1', copy=False)
-        return super(Wasserstein_Retriever, self).fit(X, y)
+        return super(WassersteinRetriever, self).fit(X, y)
 
     def predict(self, X):
         X = check_array(X, accept_sparse='csr', copy=True)
         X = normalize(X, norm='l1', copy=False)
         dist = self._pairwise_wmd(X)
-        return super(Wasserstein_Retriever, self).predict(dist)
+        return super(WassersteinRetriever, self).predict(dist)
 
     def kneighbors(self, X, n_neighbors=1):
         X = check_array(X, accept_sparse='csr', copy=True)
         X = normalize(X, norm='l1', copy=False)
         dist = self._pairwise_wmd(X)
-        return super(Wasserstein_Retriever, self).kneighbors(dist, n_neighbors)
+        return super(WassersteinRetriever, self).kneighbors(dist, n_neighbors)
 
     def align(self, X, n_neighbors=1):
-        """
-        Wrapper function over kneighbors to return
+        """ Wrapper function over kneighbors to return
             precision at one and percentage values
 
         """
         row_ind, col_ind, _ = self.kneighbors(X, n_neighbors)
@@ -196,7 +203,7 @@ def load_embeddings(path, dimension=300):
         first_line = fp.readline().rstrip('\n')
         if first_line.count(' ') == 1:
             # includes the "word_count dimension" information
-            (word_count, dimension) = map(int, first_line.split())
+            (_, dimension) = map(int, first_line.split())
         else:
             # assume the file only contains vectors
             fp.seek(0)
@@ -236,7 +243,9 @@ def clean_corpus_using_embeddings_vocabulary(
     return np.array(clean_corpus), clean_vectors, keys
 
 
-def mrr_precision_at_k(golden, preds, k_list=[1,]):
+def mrr_precision_at_k(golden, preds, k_list=[
+        1,
+]):
     """ Calculates Mean Reciprocal Error and Hits@1 == Precision@1
 
     """
-- 
cgit v1.2.3-70-g09d2
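
Below is a minimal usage sketch of the unified interface this patch settles on. The sketch is not part of the commit; W_common, X_train_idf, X_test_idf and instances are assumed to be prepared exactly as in WMD_matching.py above.

    import numpy as np
    from Wasserstein_Distance import WassersteinMatcher

    # sinkhorn=False computes exact Word Mover's Distance; sinkhorn=True
    # switches to the entropy-regularized variant (the 'snk' metric above).
    clf = WassersteinMatcher(W_embed=W_common,
                             n_neighbors=5,
                             n_jobs=14,
                             sinkhorn=False)
    clf.fit(X_train_idf[:instances], np.ones(instances))

    # align() now hides the kneighbors()/lapjv bookkeeping that
    # WMD_matching.py previously did inline.
    p_at_one, percentage = clf.align(X_test_idf[:instances],
                                     n_neighbors=instances)
    print(f'P @ 1: {p_at_one}\ninstances: {instances}\n{percentage}%')

WassersteinRetriever is driven by the same calls, which is the point of the change: after this commit the two scripts differ only in the class they instantiate.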
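For the matcher, align() reduces bilingual matching to a linear assignment problem: kneighbors() hands the scaled pairwise WMD matrix to lapjv (a Jonker-Volgenant solver), and a pair counts toward precision at one when the assignment lands on the diagonal, i.e. on the gold, index-aligned document pair. A toy sketch of that bookkeeping, with a made-up 3x3 cost matrix standing in for the _pairwise_wmd() output:

    import numpy as np
    from lapjv import lapjv

    # Hypothetical costs: entry (i, j) plays the role of the WMD between
    # source document i and target document j. The diagonal is cheapest,
    # so the optimal assignment is the identity permutation.
    dist = np.array([[0.1, 0.9, 0.8],
                     [0.7, 0.2, 0.9],
                     [0.8, 0.9, 0.3]])
    dist = dist * 1000  # for lapjv, small floating point numbers are evil

    row_ind, col_ind, _ = lapjv(dist)
    p_at_one = len([x for x, y in zip(row_ind, col_ind) if x == y])  # 3
    percentage = p_at_one / 3 * 100  # 100.0; 3 plays align()'s n_neighbors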