diff options
Diffstat (limited to 'Wasserstein_Distance.py')
-rw-r--r-- | Wasserstein_Distance.py | 140 |
1 files changed, 140 insertions, 0 deletions
diff --git a/Wasserstein_Distance.py b/Wasserstein_Distance.py new file mode 100644 index 0000000..d2a6408 --- /dev/null +++ b/Wasserstein_Distance.py | |||
@@ -0,0 +1,140 @@ | |||
1 | import ot | ||
2 | from sklearn.preprocessing import normalize | ||
3 | from lapjv import lapjv | ||
4 | from sklearn.neighbors import KNeighborsClassifier | ||
5 | from sklearn.metrics import euclidean_distances | ||
6 | from sklearn.externals.joblib import Parallel, delayed | ||
7 | from sklearn.utils import check_array | ||
8 | from sklearn.metrics.scorer import check_scoring | ||
9 | from pathos.multiprocessing import ProcessingPool as Pool | ||
10 | from sklearn.metrics import euclidean_distances | ||
11 | import numpy as np | ||
12 | |||
13 | class Wasserstein_Matcher(KNeighborsClassifier): | ||
14 | """ | ||
15 | Implements a nearest neighbors classifier for input distributions using the Wasserstein distance as metric. | ||
16 | Source and target distributions are l_1 normalized before computing the Wasserstein distance. | ||
17 | Wasserstein is parametrized by the distances between the individual points of the distributions. | ||
18 | """ | ||
19 | def __init__(self, W_embed, n_neighbors=1, n_jobs=1, verbose=False, sinkhorn= False, sinkhorn_reg=0.1): | ||
20 | """ | ||
21 | Initialization of the class. | ||
22 | Arguments | ||
23 | --------- | ||
24 | W_embed: embeddings of the words, np.array | ||
25 | verbose: True/False | ||
26 | """ | ||
27 | self.sinkhorn = sinkhorn | ||
28 | self.sinkhorn_reg = sinkhorn_reg | ||
29 | self.W_embed = W_embed | ||
30 | self.verbose = verbose | ||
31 | super(Wasserstein_Matcher, self).__init__(n_neighbors=n_neighbors, n_jobs=n_jobs, metric='precomputed', algorithm='brute') | ||
32 | |||
33 | def _wmd(self, i, row, X_train): | ||
34 | union_idx = np.union1d(X_train[i].indices, row.indices) | ||
35 | W_minimal = self.W_embed[union_idx] | ||
36 | W_dist = euclidean_distances(W_minimal) | ||
37 | bow_i = X_train[i, union_idx].A.ravel() | ||
38 | bow_j = row[:, union_idx].A.ravel() | ||
39 | if self.sinkhorn: | ||
40 | return ot.sinkhorn2(bow_i, bow_j, W_dist, self.sinkhorn_reg, numItermax=50, method='sinkhorn_stabilized',)[0] | ||
41 | else: | ||
42 | return ot.emd2(bow_i, bow_j, W_dist) | ||
43 | |||
44 | def _wmd_row(self, row): | ||
45 | X_train = self._fit_X | ||
46 | n_samples_train = X_train.shape[0] | ||
47 | return [self._wmd(i, row, X_train) for i in range(n_samples_train)] | ||
48 | |||
49 | def _pairwise_wmd(self, X_test, X_train=None): | ||
50 | n_samples_test = X_test.shape[0] | ||
51 | |||
52 | if X_train is None: | ||
53 | X_train = self._fit_X | ||
54 | pool = Pool(nodes=self.n_jobs) # Parallelization of the calculation of the distances | ||
55 | dist = pool.map(self._wmd_row, X_test) | ||
56 | return np.array(dist) | ||
57 | |||
58 | def fit(self, X, y): # X_train_idf | ||
59 | X = check_array(X, accept_sparse='csr', copy=True) # check if array is sparse | ||
60 | X = normalize(X, norm='l1', copy=False) | ||
61 | return super(Wasserstein_Matcher, self).fit(X, y) # X_train_idf, np_ones(document collection size) | ||
62 | |||
63 | def predict(self, X): | ||
64 | X = check_array(X, accept_sparse='csr', copy=True) | ||
65 | X = normalize(X, norm='l1', copy=False) | ||
66 | dist = self._pairwise_wmd(X) | ||
67 | dist = dist * 1000 # for lapjv, small floating point numbers are evil | ||
68 | return super(Wasserstein_Matcher, self).predict(dist) | ||
69 | |||
70 | def kneighbors(self, X, n_neighbors=1): # X : X_train_idf | ||
71 | X = check_array(X, accept_sparse='csr', copy=True) | ||
72 | X = normalize(X, norm='l1', copy=False) | ||
73 | dist = self._pairwise_wmd(X) | ||
74 | dist = dist * 1000 # for lapjv, small floating point numbers are evil | ||
75 | return lapjv(dist) # and here is the matching part | ||
76 | |||
77 | |||
78 | class Wasserstein_Retriever(KNeighborsClassifier): | ||
79 | """ | ||
80 | Implements a nearest neighbors classifier for input distributions using the Wasserstein distance as metric. | ||
81 | Source and target distributions are l_1 normalized before computing the Wasserstein distance. | ||
82 | Wasserstein is parametrized by the distances between the individual points of the distributions. | ||
83 | """ | ||
84 | def __init__(self, W_embed, n_neighbors=1, n_jobs=1, verbose=False, sinkhorn= False, sinkhorn_reg=0.1): | ||
85 | """ | ||
86 | Initialization of the class. | ||
87 | Arguments | ||
88 | --------- | ||
89 | W_embed: embeddings of the words, np.array | ||
90 | verbose: True/False | ||
91 | """ | ||
92 | self.sinkhorn = sinkhorn | ||
93 | self.sinkhorn_reg = sinkhorn_reg | ||
94 | self.W_embed = W_embed | ||
95 | self.verbose = verbose | ||
96 | super(Wasserstein_Retriever, self).__init__(n_neighbors=n_neighbors, n_jobs=n_jobs, metric='precomputed', algorithm='brute') | ||
97 | |||
98 | def _wmd(self, i, row, X_train): | ||
99 | union_idx = np.union1d(X_train[i].indices, row.indices) | ||
100 | W_minimal = self.W_embed[union_idx] | ||
101 | W_dist = euclidean_distances(W_minimal) | ||
102 | bow_i = X_train[i, union_idx].A.ravel() | ||
103 | bow_j = row[:, union_idx].A.ravel() | ||
104 | if self.sinkhorn: | ||
105 | return ot.sinkhorn2(bow_i, bow_j, W_dist, self.sinkhorn_reg, numItermax=50, method='sinkhorn_stabilized',)[0] | ||
106 | else: | ||
107 | return ot.emd2(bow_i, bow_j, W_dist) | ||
108 | |||
109 | def _wmd_row(self, row): | ||
110 | X_train = self._fit_X | ||
111 | n_samples_train = X_train.shape[0] | ||
112 | return [self._wmd(i, row, X_train) for i in range(n_samples_train)] | ||
113 | |||
114 | def _pairwise_wmd(self, X_test, X_train=None): | ||
115 | n_samples_test = X_test.shape[0] | ||
116 | |||
117 | if X_train is None: | ||
118 | X_train = self._fit_X | ||
119 | pool = Pool(nodes=self.n_jobs) # Parallelization of the calculation of the distances | ||
120 | dist = pool.map(self._wmd_row, X_test) | ||
121 | return np.array(dist) | ||
122 | |||
123 | def fit(self, X, y): | ||
124 | X = check_array(X, accept_sparse='csr', copy=True) | ||
125 | X = normalize(X, norm='l1', copy=False) | ||
126 | return super(Wasserstein_Retriever, self).fit(X, y) | ||
127 | |||
128 | def predict(self, X): | ||
129 | X = check_array(X, accept_sparse='csr', copy=True) | ||
130 | X = normalize(X, norm='l1', copy=False) | ||
131 | dist = self._pairwise_wmd(X) | ||
132 | return super(Wasserstein_Retriever, self).predict(dist) | ||
133 | |||
134 | def kneighbors(self, X, n_neighbors=1): | ||
135 | X = check_array(X, accept_sparse='csr', copy=True) | ||
136 | X = normalize(X, norm='l1', copy=False) | ||
137 | dist = self._pairwise_wmd(X) | ||
138 | return super(Wasserstein_Retriever, self).kneighbors(dist, n_neighbors) | ||
139 | |||
140 | |||