diff options
Diffstat (limited to 'Wass_Retriever.py')
-rw-r--r-- | Wass_Retriever.py | 73 |
1 files changed, 0 insertions, 73 deletions
diff --git a/Wass_Retriever.py b/Wass_Retriever.py deleted file mode 100644 index 036cf93..0000000 --- a/Wass_Retriever.py +++ /dev/null | |||
@@ -1,73 +0,0 @@ | |||
1 | import ot | ||
2 | from sklearn.preprocessing import normalize | ||
3 | from sklearn.neighbors import KNeighborsClassifier | ||
4 | from sklearn.metrics import euclidean_distances | ||
5 | from sklearn.externals.joblib import Parallel, delayed | ||
6 | from sklearn.utils import check_array | ||
7 | from sklearn.metrics.scorer import check_scoring | ||
8 | from pathos.multiprocessing import ProcessingPool as Pool | ||
9 | from sklearn.metrics import euclidean_distances | ||
10 | import numpy as np | ||
11 | |||
class Wasserstein_Retriever(KNeighborsClassifier):
    """
    Nearest-neighbours classifier for input distributions that uses the
    Wasserstein distance as metric.

    Source and target distributions are l_1 normalized before computing the
    Wasserstein distance. The ground cost of the transport problem is the
    Euclidean distance between the rows of ``W_embed`` selected by the
    non-zero columns of the two distributions being compared.

    The parent estimator is configured with ``metric='precomputed'`` and
    ``algorithm='brute'``: this class computes the query-to-train distance
    matrix itself and hands that matrix to the parent's neighbour search.
    """

    def __init__(self, W_embed, n_neighbors=1, n_jobs=1, verbose=False, sinkhorn=False, sinkhorn_reg=0.1):
        """
        Initialization of the class.

        Arguments
        ---------
        W_embed: embeddings of the words, np.array; it is indexed by the
            column indices of the input matrices
        n_neighbors: forwarded to KNeighborsClassifier
        n_jobs: forwarded to KNeighborsClassifier and also used as the
            number of worker processes for the distance computation
        verbose: True/False (stored, not read anywhere in this class)
        sinkhorn: if True use ot.sinkhorn2, otherwise the exact ot.emd2
        sinkhorn_reg: regularization value passed to ot.sinkhorn2
        """
        self.sinkhorn = sinkhorn
        self.sinkhorn_reg = sinkhorn_reg
        self.W_embed = W_embed
        self.verbose = verbose
        super(Wasserstein_Retriever, self).__init__(n_neighbors=n_neighbors, n_jobs=n_jobs, metric='precomputed', algorithm='brute')

    @staticmethod
    def _normalize_bow(X):
        """
        Validate X and return an l_1 row-normalized copy.

        The input is copied by check_array, so the caller's matrix is never
        modified by the in-place normalization that follows.
        NOTE: dense input passes this validation, but _wmd reads ``.indices``
        from the rows, so a sparse matrix is required in practice.
        """
        X = check_array(X, accept_sparse='csr', copy=True)
        return normalize(X, norm='l1', copy=False)

    def _wmd(self, i, row, X_train):
        """
        Wasserstein distance between training row ``i`` and one query row.

        Arguments
        ---------
        i: index of the row of X_train to compare against
        row: a single sparse query row (must expose ``.indices``)
        X_train: sparse matrix holding the training distributions

        Returns the transport cost computed by ot.sinkhorn2 (when
        ``self.sinkhorn`` is set) or by ot.emd2.
        """
        # Restrict the problem to the columns that are non-zero in either
        # distribution, so the cost matrix stays small.
        union_idx = np.union1d(X_train[i].indices, row.indices)
        W_minimal = self.W_embed[union_idx]
        W_dist = euclidean_distances(W_minimal)
        # .toarray() is the explicit spelling of the sparse-matrix ``.A`` shortcut.
        bow_i = X_train[i, union_idx].toarray().ravel()
        bow_j = row[:, union_idx].toarray().ravel()
        if self.sinkhorn:
            return ot.sinkhorn2(bow_i, bow_j, W_dist, self.sinkhorn_reg, numItermax=50, method='sinkhorn_stabilized')[0]
        return ot.emd2(bow_i, bow_j, W_dist)

    def _wmd_row(self, row, X_train=None):
        """
        Distances from one query row to every row of X_train.

        X_train defaults to the matrix stored by fit (``self._fit_X``).
        Returns a list with one distance per training row.
        """
        if X_train is None:
            X_train = self._fit_X
        n_samples_train = X_train.shape[0]
        return [self._wmd(i, row, X_train) for i in range(n_samples_train)]

    def _pairwise_wmd(self, X_test, X_train=None):
        """
        Distance matrix between the rows of X_test and the rows of X_train.

        Arguments
        ---------
        X_test: sparse matrix of query distributions, iterated row by row
        X_train: sparse matrix of reference distributions; when None the
            matrix stored by fit is used

        Returns an np.array built from one list of distances per query row.
        """
        if X_train is None:
            row_distances = self._wmd_row
        else:
            # Bind the explicit training matrix so that it is actually used;
            # previously this argument was accepted and then ignored.
            import functools
            row_distances = functools.partial(self._wmd_row, X_train=X_train)
        # Parallelization of the calculation of the distances.
        # NOTE(review): the pool is never closed here; pathos appears to cache
        # pools between calls - confirm before adding an explicit teardown.
        pool = Pool(nodes=self.n_jobs)
        dist = pool.map(row_distances, X_test)
        return np.array(dist)

    def fit(self, X, y):
        """
        Store the l_1 normalized training distributions X with labels y.

        Returns whatever the parent ``fit`` returns.
        """
        return super(Wasserstein_Retriever, self).fit(self._normalize_bow(X), y)

    def predict(self, X):
        """
        Predict a label for every row of X from its nearest training rows.
        """
        dist = self._pairwise_wmd(self._normalize_bow(X))
        # The parent predict() looks neighbours up through self.kneighbors(),
        # which on this class would treat the distance matrix as raw input
        # and try to compute Wasserstein distances on it a second time.
        # Voting is therefore run on a plain KNeighborsClassifier that shares
        # this estimator's fitted state (shallow copy of the attributes).
        plain_knn = KNeighborsClassifier.__new__(KNeighborsClassifier)
        plain_knn.__dict__.update(self.__dict__)
        return plain_knn.predict(dist)

    def kneighbors(self, X, n_neighbors=1, return_distance=True):
        """
        Find the nearest training rows of every row of X.

        Arguments
        ---------
        X: sparse matrix of query distributions (raw, not yet normalized)
        n_neighbors: number of neighbours to return; note that the default
            is 1 and not the value given to the constructor
        return_distance: forwarded to the parent ``kneighbors``

        Returns whatever the parent ``kneighbors`` returns for the
        precomputed distance matrix.
        """
        dist = self._pairwise_wmd(self._normalize_bow(X))
        return super(Wasserstein_Retriever, self).kneighbors(dist, n_neighbors=n_neighbors, return_distance=return_distance)
73 | |||