diff options
Diffstat (limited to 'Wass_Matcher.py')
-rw-r--r-- | Wass_Matcher.py | 76 |
1 files changed, 76 insertions, 0 deletions
diff --git a/Wass_Matcher.py b/Wass_Matcher.py new file mode 100644 index 0000000..44b29eb --- /dev/null +++ b/Wass_Matcher.py | |||
@@ -0,0 +1,76 @@ | |||
1 | import ot | ||
2 | from sklearn.preprocessing import normalize | ||
3 | from lapjv import lapjv | ||
4 | from sklearn.neighbors import KNeighborsClassifier | ||
5 | from sklearn.metrics import euclidean_distances | ||
6 | from sklearn.externals.joblib import Parallel, delayed | ||
7 | from sklearn.utils import check_array | ||
8 | from sklearn.metrics.scorer import check_scoring | ||
9 | from pathos.multiprocessing import ProcessingPool as Pool | ||
10 | from sklearn.metrics import euclidean_distances | ||
11 | import numpy as np | ||
12 | |||
13 | class Wasserstein_Matcher(KNeighborsClassifier): | ||
14 | """ | ||
15 | Implements a nearest neighbors classifier for input distributions using the Wasserstein distance as metric. | ||
16 | Source and target distributions are l_1 normalized before computing the Wasserstein distance. | ||
17 | Wasserstein is parametrized by the distances between the individual points of the distributions. | ||
18 | """ | ||
19 | def __init__(self, W_embed, n_neighbors=1, n_jobs=1, verbose=False, sinkhorn= False, sinkhorn_reg=0.1): | ||
20 | """ | ||
21 | Initialization of the class. | ||
22 | Arguments | ||
23 | --------- | ||
24 | W_embed: embeddings of the words, np.array | ||
25 | verbose: True/False | ||
26 | """ | ||
27 | self.sinkhorn = sinkhorn | ||
28 | self.sinkhorn_reg = sinkhorn_reg | ||
29 | self.W_embed = W_embed | ||
30 | self.verbose = verbose | ||
31 | super(Wasserstein_Matcher, self).__init__(n_neighbors=n_neighbors, n_jobs=n_jobs, metric='precomputed', algorithm='brute') | ||
32 | |||
33 | def _wmd(self, i, row, X_train): | ||
34 | union_idx = np.union1d(X_train[i].indices, row.indices) | ||
35 | W_minimal = self.W_embed[union_idx] | ||
36 | W_dist = euclidean_distances(W_minimal) | ||
37 | bow_i = X_train[i, union_idx].A.ravel() | ||
38 | bow_j = row[:, union_idx].A.ravel() | ||
39 | if self.sinkhorn: | ||
40 | return ot.sinkhorn2(bow_i, bow_j, W_dist, self.sinkhorn_reg, numItermax=50, method='sinkhorn_stabilized',)[0] | ||
41 | else: | ||
42 | return ot.emd2(bow_i, bow_j, W_dist) | ||
43 | |||
44 | def _wmd_row(self, row): | ||
45 | X_train = self._fit_X | ||
46 | n_samples_train = X_train.shape[0] | ||
47 | return [self._wmd(i, row, X_train) for i in range(n_samples_train)] | ||
48 | |||
49 | def _pairwise_wmd(self, X_test, X_train=None): | ||
50 | n_samples_test = X_test.shape[0] | ||
51 | |||
52 | if X_train is None: | ||
53 | X_train = self._fit_X | ||
54 | pool = Pool(nodes=self.n_jobs) # Parallelization of the calculation of the distances | ||
55 | dist = pool.map(self._wmd_row, X_test) | ||
56 | return np.array(dist) | ||
57 | |||
58 | def fit(self, X, y): # X_train_idf | ||
59 | X = check_array(X, accept_sparse='csr', copy=True) # check if array is sparse | ||
60 | X = normalize(X, norm='l1', copy=False) | ||
61 | return super(Wasserstein_Matcher, self).fit(X, y) # X_train_idf, np_ones(document collection size) | ||
62 | |||
63 | def predict(self, X): | ||
64 | X = check_array(X, accept_sparse='csr', copy=True) | ||
65 | X = normalize(X, norm='l1', copy=False) | ||
66 | dist = self._pairwise_wmd(X) | ||
67 | dist = dist * 1000 # for lapjv, small floating point numbers are evil | ||
68 | return super(Wasserstein_Matcher, self).predict(dist) | ||
69 | |||
70 | def kneighbors(self, X, n_neighbors=1): # X : X_train_idf | ||
71 | X = check_array(X, accept_sparse='csr', copy=True) | ||
72 | X = normalize(X, norm='l1', copy=False) | ||
73 | dist = self._pairwise_wmd(X) | ||
74 | dist = dist * 1000 # for lapjv, small floating point numbers are evil | ||
75 | return lapjv(dist) # and here is the matching part | ||
76 | |||