diff options
-rw-r--r-- | WMD_matching.py | 2 | ||||
-rw-r--r-- | Wass_Retriever.py | 73 | ||||
-rw-r--r-- | Wasserstein_Distance.py (renamed from Wass_Matcher.py) | 64 |
3 files changed, 65 insertions, 74 deletions
diff --git a/WMD_matching.py b/WMD_matching.py index 7fdf2f3..38dbff4 100644 --- a/WMD_matching.py +++ b/WMD_matching.py | |||
@@ -5,7 +5,7 @@ import nltk | |||
5 | import random | 5 | import random |
6 | from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer | 6 | from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer |
7 | from sklearn.preprocessing import normalize | 7 | from sklearn.preprocessing import normalize |
8 | from Wass_Matcher import Wasserstein_Matcher | 8 | from Wasserstein_Distance import Wasserstein_Matcher |
9 | 9 | ||
10 | def load_embeddings(path, dimension=300): | 10 | def load_embeddings(path, dimension=300): |
11 | """ | 11 | """ |
diff --git a/Wass_Retriever.py b/Wass_Retriever.py deleted file mode 100644 index 036cf93..0000000 --- a/Wass_Retriever.py +++ /dev/null | |||
@@ -1,73 +0,0 @@ | |||
1 | import ot | ||
2 | from sklearn.preprocessing import normalize | ||
3 | from sklearn.neighbors import KNeighborsClassifier | ||
4 | from sklearn.metrics import euclidean_distances | ||
5 | from sklearn.externals.joblib import Parallel, delayed | ||
6 | from sklearn.utils import check_array | ||
7 | from sklearn.metrics.scorer import check_scoring | ||
8 | from pathos.multiprocessing import ProcessingPool as Pool | ||
9 | from sklearn.metrics import euclidean_distances | ||
10 | import numpy as np | ||
11 | |||
class Wasserstein_Retriever(KNeighborsClassifier):
    """
    Nearest-neighbour classifier over bag-of-words rows, using the
    Wasserstein (earth mover's) distance between documents as the metric.

    Training and query rows are l1-normalised so that each one can be used as
    a discrete distribution over the vocabulary. The ground cost between two
    words is the Euclidean distance between their rows of ``W_embed``.
    Distances are computed by this class and handed to the parent estimator,
    which is configured with ``metric='precomputed'``.

    NOTE(review): rows are accessed through ``.indices``, so inputs are
    expected to be scipy CSR matrices; dense inputs are not converted.
    NOTE(review): ``fit`` stores the bag-of-words matrix itself, not a square
    distance matrix. Some scikit-learn releases reject a non-square matrix at
    fit time when ``metric='precomputed'`` -- confirm against the pinned
    scikit-learn version.
    """

    def __init__(self, W_embed, n_neighbors=1, n_jobs=1, verbose=False, sinkhorn=False, sinkhorn_reg=0.1):
        """
        Initialization of the class.
        Arguments
        ---------
        W_embed: embeddings of the words, np.array; indexed by vocabulary column
        n_neighbors: number of neighbours used by default in predict/kneighbors
        n_jobs: number of worker processes used to compute distances
        verbose: True/False (stored; not read anywhere in this class)
        sinkhorn: if True use ot.sinkhorn2 instead of the exact ot.emd2
        sinkhorn_reg: regularisation strength passed to ot.sinkhorn2
        """
        self.sinkhorn = sinkhorn
        self.sinkhorn_reg = sinkhorn_reg
        self.W_embed = W_embed
        self.verbose = verbose
        super(Wasserstein_Retriever, self).__init__(n_neighbors=n_neighbors, n_jobs=n_jobs, metric='precomputed', algorithm='brute')

    @staticmethod
    def _l1_rows(X):
        """Validate X (copying it) and l1-normalise every row."""
        X = check_array(X, accept_sparse='csr', copy=True)
        return normalize(X, norm='l1', copy=False)

    def _wmd(self, i, row, X_train):
        """Return the Wasserstein distance between X_train[i] and the single-row matrix `row`."""
        # Restrict the problem to the words present in either document so the
        # ground-cost matrix stays small.
        union_idx = np.union1d(X_train[i].indices, row.indices)
        W_dist = euclidean_distances(self.W_embed[union_idx])
        # toarray() is the explicit spelling of the deprecated `.A` accessor.
        bow_i = X_train[i, union_idx].toarray().ravel()
        bow_j = row[:, union_idx].toarray().ravel()
        if self.sinkhorn:
            cost = ot.sinkhorn2(bow_i, bow_j, W_dist, self.sinkhorn_reg, numItermax=50, method='sinkhorn_stabilized')
            # Accept either a length-1 array or a plain scalar from POT.
            return np.asarray(cost).ravel()[0]
        return ot.emd2(bow_i, bow_j, W_dist)

    def _wmd_row(self, row, X_train=None):
        """Return the distances from `row` to every row of X_train (default: the fitted data)."""
        if X_train is None:
            X_train = self._fit_X
        return [self._wmd(i, row, X_train) for i in range(X_train.shape[0])]

    def _pairwise_wmd(self, X_test, X_train=None):
        """
        Return an array whose entry [j][i] is the distance between
        X_test[j] and X_train[i]. X_train defaults to the fitted data.
        """
        rows = [X_test[j] for j in range(X_test.shape[0])]
        if self.n_jobs == 1:
            # A single job needs no worker processes (and no pickling of self).
            dist = [self._wmd_row(row, X_train) for row in rows]
        else:
            # Parallelization of the calculation of the distances.
            # NOTE(review): the pool is not closed here -- confirm how pathos
            # manages the lifetime of its pools before adding a close().
            pool = Pool(nodes=self.n_jobs)
            if X_train is None:
                dist = pool.map(self._wmd_row, rows)
            else:
                dist = pool.map(self._wmd_row, rows, [X_train] * len(rows))
        return np.array(dist)

    def fit(self, X, y):
        """Store the l1-normalised training rows X and their labels y."""
        return super(Wasserstein_Retriever, self).fit(self._l1_rows(X), y)

    def predict(self, X):
        """
        Predict labels for the rows of X.

        The parent predict() looks neighbours up through self.kneighbors(),
        which is overridden below and computes the Wasserstein distances
        itself. The raw rows are therefore passed on; passing a distance
        matrix here would make kneighbors() treat it as bag-of-words input.
        """
        return super(Wasserstein_Retriever, self).predict(X)

    def kneighbors(self, X, n_neighbors=None, return_distance=True):
        """
        Find the nearest training rows of each row of X.

        n_neighbors defaults to the value given to the constructor (itself 1
        by default), so the classifier's own setting is honoured when the
        parent predict() calls this method without arguments.
        return_distance is forwarded to the parent implementation.
        """
        if n_neighbors is None:
            n_neighbors = self.n_neighbors
        dist = self._pairwise_wmd(self._l1_rows(X))
        return super(Wasserstein_Retriever, self).kneighbors(dist, n_neighbors, return_distance)
73 | |||
diff --git a/Wass_Matcher.py b/Wasserstein_Distance.py index 44b29eb..d2a6408 100644 --- a/Wass_Matcher.py +++ b/Wasserstein_Distance.py | |||
@@ -74,3 +74,67 @@ class Wasserstein_Matcher(KNeighborsClassifier): | |||
74 | dist = dist * 1000 # for lapjv, small floating point numbers are evil | 74 | dist = dist * 1000 # for lapjv, small floating point numbers are evil |
75 | return lapjv(dist) # and here is the matching part | 75 | return lapjv(dist) # and here is the matching part |
76 | 76 | ||
77 | |||
class Wasserstein_Retriever(KNeighborsClassifier):
    """
    Nearest-neighbour classifier over bag-of-words rows, using the
    Wasserstein (earth mover's) distance between documents as the metric.

    Training and query rows are l1-normalised so that each one can be used as
    a discrete distribution over the vocabulary. The ground cost between two
    words is the Euclidean distance between their rows of ``W_embed``.
    Distances are computed by this class and handed to the parent estimator,
    which is configured with ``metric='precomputed'``.

    NOTE(review): rows are accessed through ``.indices``, so inputs are
    expected to be scipy CSR matrices; dense inputs are not converted.
    NOTE(review): ``fit`` stores the bag-of-words matrix itself, not a square
    distance matrix. Some scikit-learn releases reject a non-square matrix at
    fit time when ``metric='precomputed'`` -- confirm against the pinned
    scikit-learn version.
    """

    def __init__(self, W_embed, n_neighbors=1, n_jobs=1, verbose=False, sinkhorn=False, sinkhorn_reg=0.1):
        """
        Initialization of the class.
        Arguments
        ---------
        W_embed: embeddings of the words, np.array; indexed by vocabulary column
        n_neighbors: number of neighbours used by default in predict/kneighbors
        n_jobs: number of worker processes used to compute distances
        verbose: True/False (stored; not read anywhere in this class)
        sinkhorn: if True use ot.sinkhorn2 instead of the exact ot.emd2
        sinkhorn_reg: regularisation strength passed to ot.sinkhorn2
        """
        self.sinkhorn = sinkhorn
        self.sinkhorn_reg = sinkhorn_reg
        self.W_embed = W_embed
        self.verbose = verbose
        super(Wasserstein_Retriever, self).__init__(n_neighbors=n_neighbors, n_jobs=n_jobs, metric='precomputed', algorithm='brute')

    @staticmethod
    def _l1_rows(X):
        """Validate X (copying it) and l1-normalise every row."""
        X = check_array(X, accept_sparse='csr', copy=True)
        return normalize(X, norm='l1', copy=False)

    def _wmd(self, i, row, X_train):
        """Return the Wasserstein distance between X_train[i] and the single-row matrix `row`."""
        # Restrict the problem to the words present in either document so the
        # ground-cost matrix stays small.
        union_idx = np.union1d(X_train[i].indices, row.indices)
        W_dist = euclidean_distances(self.W_embed[union_idx])
        # toarray() is the explicit spelling of the deprecated `.A` accessor.
        bow_i = X_train[i, union_idx].toarray().ravel()
        bow_j = row[:, union_idx].toarray().ravel()
        if self.sinkhorn:
            cost = ot.sinkhorn2(bow_i, bow_j, W_dist, self.sinkhorn_reg, numItermax=50, method='sinkhorn_stabilized')
            # Accept either a length-1 array or a plain scalar from POT.
            return np.asarray(cost).ravel()[0]
        return ot.emd2(bow_i, bow_j, W_dist)

    def _wmd_row(self, row, X_train=None):
        """Return the distances from `row` to every row of X_train (default: the fitted data)."""
        if X_train is None:
            X_train = self._fit_X
        return [self._wmd(i, row, X_train) for i in range(X_train.shape[0])]

    def _pairwise_wmd(self, X_test, X_train=None):
        """
        Return an array whose entry [j][i] is the distance between
        X_test[j] and X_train[i]. X_train defaults to the fitted data.
        """
        rows = [X_test[j] for j in range(X_test.shape[0])]
        if self.n_jobs == 1:
            # A single job needs no worker processes (and no pickling of self).
            dist = [self._wmd_row(row, X_train) for row in rows]
        else:
            # Parallelization of the calculation of the distances.
            # NOTE(review): the pool is not closed here -- confirm how pathos
            # manages the lifetime of its pools before adding a close().
            pool = Pool(nodes=self.n_jobs)
            if X_train is None:
                dist = pool.map(self._wmd_row, rows)
            else:
                dist = pool.map(self._wmd_row, rows, [X_train] * len(rows))
        return np.array(dist)

    def fit(self, X, y):
        """Store the l1-normalised training rows X and their labels y."""
        return super(Wasserstein_Retriever, self).fit(self._l1_rows(X), y)

    def predict(self, X):
        """
        Predict labels for the rows of X.

        The parent predict() looks neighbours up through self.kneighbors(),
        which is overridden below and computes the Wasserstein distances
        itself. The raw rows are therefore passed on; passing a distance
        matrix here would make kneighbors() treat it as bag-of-words input.
        """
        return super(Wasserstein_Retriever, self).predict(X)

    def kneighbors(self, X, n_neighbors=None, return_distance=True):
        """
        Find the nearest training rows of each row of X.

        n_neighbors defaults to the value given to the constructor (itself 1
        by default), so the classifier's own setting is honoured when the
        parent predict() calls this method without arguments.
        return_distance is forwarded to the parent implementation.
        """
        if n_neighbors is None:
            n_neighbors = self.n_neighbors
        dist = self._pairwise_wmd(self._l1_rows(X))
        return super(Wasserstein_Retriever, self).kneighbors(dist, n_neighbors, return_distance)
139 | |||
140 | |||