aboutsummaryrefslogtreecommitdiffstats
path: root/Wass_Retriever.py
diff options
context:
space:
mode:
authorYigit Sever2019-09-19 00:22:25 +0300
committerYigit Sever2019-09-19 00:22:25 +0300
commit1890976ed1eee59eda92ceabdcb1c966d6707269 (patch)
treef7bb7de36d158ee970aaf4f0f5a6b682ac359825 /Wass_Retriever.py
parent68b6c55d0e3217362d6e17ea8458dfa7e5242e17 (diff)
downloadEvaluating-Dictionary-Alignment-1890976ed1eee59eda92ceabdcb1c966d6707269.tar.gz
Evaluating-Dictionary-Alignment-1890976ed1eee59eda92ceabdcb1c966d6707269.tar.bz2
Evaluating-Dictionary-Alignment-1890976ed1eee59eda92ceabdcb1c966d6707269.zip
Add experiment scripts
Diffstat (limited to 'Wass_Retriever.py')
-rw-r--r--Wass_Retriever.py73
1 files changed, 73 insertions, 0 deletions
diff --git a/Wass_Retriever.py b/Wass_Retriever.py
new file mode 100644
index 0000000..036cf93
--- /dev/null
+++ b/Wass_Retriever.py
@@ -0,0 +1,73 @@
1import ot
2from sklearn.preprocessing import normalize
3from sklearn.neighbors import KNeighborsClassifier
4from sklearn.metrics import euclidean_distances
5from sklearn.externals.joblib import Parallel, delayed
6from sklearn.utils import check_array
7from sklearn.metrics.scorer import check_scoring
8from pathos.multiprocessing import ProcessingPool as Pool
9from sklearn.metrics import euclidean_distances
10import numpy as np
11
class Wasserstein_Retriever(KNeighborsClassifier):
    """
    Implements a nearest neighbors classifier for input distributions using the Wasserstein distance as metric.
    Source and target distributions are l_1 normalized before computing the Wasserstein distance.
    Wasserstein is parametrized by the distances between the individual points of the distributions.

    The underlying KNeighborsClassifier is configured with metric='precomputed' and
    algorithm='brute'; this class computes the (n_test, n_train) Wasserstein distance
    matrix itself and hands it to the base class.
    """

    def __init__(self, W_embed, n_neighbors=1, n_jobs=1, verbose=False, sinkhorn=False, sinkhorn_reg=0.1):
        """
        Initialization of the class.
        Arguments
        ---------
        W_embed: embeddings of the words, np.array. It is indexed with the column
            indices of the input rows, so row k must be the embedding of column k.
        n_neighbors: forwarded to KNeighborsClassifier
        n_jobs: forwarded to KNeighborsClassifier and also used as the number of
            nodes of the process pool that computes the distances
        verbose: True/False (stored on the instance; not read by this class)
        sinkhorn: if True use ot.sinkhorn2, otherwise the exact ot.emd2
        sinkhorn_reg: regularization value passed to ot.sinkhorn2
        """
        self.sinkhorn = sinkhorn
        self.sinkhorn_reg = sinkhorn_reg
        self.W_embed = W_embed
        self.verbose = verbose
        super(Wasserstein_Retriever, self).__init__(n_neighbors=n_neighbors, n_jobs=n_jobs, metric='precomputed', algorithm='brute')

    @staticmethod
    def _l1_rows(X):
        """
        Validate X (a copy is taken, CSR is accepted) and l_1 normalize its rows.
        NOTE(review): the distance code below reads `.indices`, so X is presumably
        always a scipy CSR matrix - confirm against callers.
        """
        X = check_array(X, accept_sparse='csr', copy=True)
        return normalize(X, norm='l1', copy=False)

    def _wmd(self, i, row, X_train):
        """
        Wasserstein distance between training row `i` of `X_train` and the single
        sparse query `row`.
        """
        # Restrict the problem to the columns that are non-zero in either row.
        union_idx = np.union1d(X_train[i].indices, row.indices)
        W_minimal = self.W_embed[union_idx]
        # Ground cost: pairwise euclidean distances between the selected embeddings.
        W_dist = euclidean_distances(W_minimal)
        # `.toarray()` is the supported spelling of the deprecated `.A` attribute.
        bow_i = X_train[i, union_idx].toarray().ravel()
        bow_j = row[:, union_idx].toarray().ravel()
        if self.sinkhorn:
            return ot.sinkhorn2(bow_i, bow_j, W_dist, self.sinkhorn_reg, numItermax=50, method='sinkhorn_stabilized')[0]
        return ot.emd2(bow_i, bow_j, W_dist)

    def _wmd_row(self, row, X_train=None):
        """
        Distances from one query `row` to every row of `X_train`, as a list.
        `X_train` defaults to the data stored by fit().
        """
        if X_train is None:
            X_train = self._fit_X
        return [self._wmd(i, row, X_train) for i in range(X_train.shape[0])]

    def _pairwise_wmd(self, X_test, X_train=None):
        """
        Distance matrix between the rows of `X_test` and the rows of `X_train`
        (the fitted data when `X_train` is None), one pool task per test row.
        """
        pool = Pool(nodes=self.n_jobs)  # Parallelization of the calculation of the distances
        if X_train is None:
            dist = pool.map(self._wmd_row, X_test)
        else:
            # An explicit X_train used to be silently ignored; pass it along with
            # every row so the workers compare against the requested matrix.
            dist = pool.map(self._wmd_row, X_test, [X_train] * X_test.shape[0])
        return np.array(dist)

    def fit(self, X, y):
        """
        l_1 normalize the rows of X and store them, with labels y, through the base fit().
        """
        return super(Wasserstein_Retriever, self).fit(self._l1_rows(X), y)

    def predict(self, X):
        """
        Predict labels for the rows of X from their Wasserstein nearest neighbors.
        """
        dist = self._pairwise_wmd(self._l1_rows(X))
        # The base predict() finds neighbors via self.kneighbors(), which this class
        # overrides to take raw bag-of-words rows. Feeding it the distance matrix
        # would run the Wasserstein computation on distances and fail, so shadow
        # the override with the base implementation for the duration of the call.
        self.kneighbors = super(Wasserstein_Retriever, self).kneighbors
        try:
            return super(Wasserstein_Retriever, self).predict(dist)
        finally:
            del self.kneighbors

    def kneighbors(self, X, n_neighbors=1):
        """
        Return the base class kneighbors() result for the `n_neighbors` fitted rows
        closest to each row of X under the Wasserstein distance.
        """
        dist = self._pairwise_wmd(self._l1_rows(X))
        return super(Wasserstein_Retriever, self).kneighbors(dist, n_neighbors)
73