aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r-- WMD_matching.py 2
-rw-r--r-- Wass_Retriever.py 73
-rw-r--r-- Wasserstein_Distance.py (renamed from Wass_Matcher.py) 64
3 files changed, 65 insertions, 74 deletions
diff --git a/WMD_matching.py b/WMD_matching.py
index 7fdf2f3..38dbff4 100644
--- a/WMD_matching.py
+++ b/WMD_matching.py
@@ -5,7 +5,7 @@ import nltk
5import random 5import random
6from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer 6from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
7from sklearn.preprocessing import normalize 7from sklearn.preprocessing import normalize
8from Wass_Matcher import Wasserstein_Matcher 8from Wasserstein_Distance import Wasserstein_Matcher
9 9
10def load_embeddings(path, dimension=300): 10def load_embeddings(path, dimension=300):
11 """ 11 """
diff --git a/Wass_Retriever.py b/Wass_Retriever.py
deleted file mode 100644
index 036cf93..0000000
--- a/Wass_Retriever.py
+++ /dev/null
@@ -1,73 +0,0 @@
1import ot
2from sklearn.preprocessing import normalize
3from sklearn.neighbors import KNeighborsClassifier
4from sklearn.metrics import euclidean_distances
5from sklearn.externals.joblib import Parallel, delayed
6from sklearn.utils import check_array
7from sklearn.metrics.scorer import check_scoring
8from pathos.multiprocessing import ProcessingPool as Pool
9from sklearn.metrics import euclidean_distances
10import numpy as np
11
class Wasserstein_Retriever(KNeighborsClassifier):
    """
    Implements a nearest neighbors classifier for input distributions using the Wasserstein distance as metric.
    Source and target distributions are l_1 normalized before computing the Wasserstein distance.
    Wasserstein is parametrized by the distances between the individual points of the distributions.

    Inputs are expected to be sparse bag-of-words matrices: the distance code
    reads the ``.indices`` attribute of each row and densifies slices with ``.A``.
    """

    def __init__(self, W_embed, n_neighbors=1, n_jobs=1, verbose=False, sinkhorn=False, sinkhorn_reg=0.1):
        """
        Initialization of the class.
        Arguments
        ---------
        W_embed: embeddings of the words, np.array; rows are selected with the
            column indices of the bag-of-words matrices
        n_neighbors: number of neighbors, forwarded to KNeighborsClassifier
        n_jobs: forwarded to KNeighborsClassifier and also used as the number
            of nodes of the process pool that computes the distances
        verbose: True/False (stored; not read anywhere in this class)
        sinkhorn: if True use ot.sinkhorn2, otherwise the exact ot.emd2
        sinkhorn_reg: regularization value handed to ot.sinkhorn2
        """
        self.sinkhorn = sinkhorn
        self.sinkhorn_reg = sinkhorn_reg
        self.W_embed = W_embed
        self.verbose = verbose
        super(Wasserstein_Retriever, self).__init__(n_neighbors=n_neighbors, n_jobs=n_jobs, metric='precomputed', algorithm='brute')

    def _wmd(self, i, row, X_train):
        """
        Wasserstein distance between training document ``i`` and the query ``row``.

        Only the words present in at least one of the two documents are used,
        which keeps the ground-distance matrix small.
        """
        # Sorted union of the non-zero column indices of both documents.
        union_idx = np.union1d(X_train[i].indices, row.indices)
        W_minimal = self.W_embed[union_idx]
        # Pairwise Euclidean distances between the selected word embeddings.
        W_dist = euclidean_distances(W_minimal)
        bow_i = X_train[i, union_idx].A.ravel()
        bow_j = row[:, union_idx].A.ravel()
        if self.sinkhorn:
            # NOTE(review): [0] takes the first entry of what sinkhorn2 returns;
            # confirm against the installed POT version.
            return ot.sinkhorn2(bow_i, bow_j, W_dist, self.sinkhorn_reg, numItermax=50, method='sinkhorn_stabilized')[0]
        return ot.emd2(bow_i, bow_j, W_dist)

    def _wmd_row(self, row, X_train=None):
        """
        Distances between one query ``row`` and every document of ``X_train``.

        ``X_train`` defaults to the matrix stored by ``fit``.
        """
        if X_train is None:
            X_train = self._fit_X
        return [self._wmd(i, row, X_train) for i in range(X_train.shape[0])]

    def _pairwise_wmd(self, X_test, X_train=None):
        """
        Distance matrix with one row per document of ``X_test`` and one column
        per document of ``X_train`` (the fitted data when ``X_train`` is None).

        Previously an explicitly passed ``X_train`` was ignored; it is now
        forwarded to the row workers.
        """
        pool = Pool(nodes=self.n_jobs)  # Parallelization of the calculation of the distances
        if X_train is None:
            dist = pool.map(self._wmd_row, X_test)
        else:
            # One reference to X_train per query row, as pool.map zips its sequences.
            dist = pool.map(self._wmd_row, X_test, [X_train] * X_test.shape[0])
        return np.array(dist)

    def fit(self, X, y):
        """
        Store the l_1 normalized training documents ``X`` with labels ``y``.
        """
        X = check_array(X, accept_sparse='csr', copy=True)
        X = normalize(X, norm='l1', copy=False)
        return super(Wasserstein_Retriever, self).fit(X, y)

    def predict(self, X):
        """
        Predict the labels of the documents in ``X``.

        The distance computation is left to ``kneighbors``. The previous
        version computed the distance matrix here and passed it to the parent
        ``predict``; the parent then calls ``self.kneighbors`` on it, i.e. the
        override below, which treated the distance matrix as documents again.
        NOTE(review): relies on KNeighborsClassifier.predict dispatching to
        self.kneighbors - verify when changing the scikit-learn version.
        """
        return super(Wasserstein_Retriever, self).predict(X)

    def kneighbors(self, X, n_neighbors=None, return_distance=True):
        """
        Find the nearest training documents of each document in ``X``.

        Arguments
        ---------
        X: sparse bag-of-words matrix of query documents
        n_neighbors: number of neighbors; None lets the parent class fall back
            to the value given at construction (the old default of 1 ignored it)
        return_distance: forwarded to KNeighborsClassifier.kneighbors
        """
        X = check_array(X, accept_sparse='csr', copy=True)
        X = normalize(X, norm='l1', copy=False)
        dist = self._pairwise_wmd(X)
        return super(Wasserstein_Retriever, self).kneighbors(dist, n_neighbors, return_distance=return_distance)

73
diff --git a/Wass_Matcher.py b/Wasserstein_Distance.py
index 44b29eb..d2a6408 100644
--- a/Wass_Matcher.py
+++ b/Wasserstein_Distance.py
@@ -74,3 +74,67 @@ class Wasserstein_Matcher(KNeighborsClassifier):
74 dist = dist * 1000 # for lapjv, small floating point numbers are evil 74 dist = dist * 1000 # for lapjv, small floating point numbers are evil
75 return lapjv(dist) # and here is the matching part 75 return lapjv(dist) # and here is the matching part
76 76
77
class Wasserstein_Retriever(KNeighborsClassifier):
    """
    Implements a nearest neighbors classifier for input distributions using the Wasserstein distance as metric.
    Source and target distributions are l_1 normalized before computing the Wasserstein distance.
    Wasserstein is parametrized by the distances between the individual points of the distributions.

    Inputs are expected to be sparse bag-of-words matrices: the distance code
    reads the ``.indices`` attribute of each row and densifies slices with ``.A``.
    """

    def __init__(self, W_embed, n_neighbors=1, n_jobs=1, verbose=False, sinkhorn=False, sinkhorn_reg=0.1):
        """
        Initialization of the class.
        Arguments
        ---------
        W_embed: embeddings of the words, np.array; rows are selected with the
            column indices of the bag-of-words matrices
        n_neighbors: number of neighbors, forwarded to KNeighborsClassifier
        n_jobs: forwarded to KNeighborsClassifier and also used as the number
            of nodes of the process pool that computes the distances
        verbose: True/False (stored; not read anywhere in this class)
        sinkhorn: if True use ot.sinkhorn2, otherwise the exact ot.emd2
        sinkhorn_reg: regularization value handed to ot.sinkhorn2
        """
        self.sinkhorn = sinkhorn
        self.sinkhorn_reg = sinkhorn_reg
        self.W_embed = W_embed
        self.verbose = verbose
        super(Wasserstein_Retriever, self).__init__(n_neighbors=n_neighbors, n_jobs=n_jobs, metric='precomputed', algorithm='brute')

    def _wmd(self, i, row, X_train):
        """
        Wasserstein distance between training document ``i`` and the query ``row``.

        Only the words present in at least one of the two documents are used,
        which keeps the ground-distance matrix small.
        """
        # Sorted union of the non-zero column indices of both documents.
        union_idx = np.union1d(X_train[i].indices, row.indices)
        W_minimal = self.W_embed[union_idx]
        # Pairwise Euclidean distances between the selected word embeddings.
        W_dist = euclidean_distances(W_minimal)
        bow_i = X_train[i, union_idx].A.ravel()
        bow_j = row[:, union_idx].A.ravel()
        if self.sinkhorn:
            # NOTE(review): [0] takes the first entry of what sinkhorn2 returns;
            # confirm against the installed POT version.
            return ot.sinkhorn2(bow_i, bow_j, W_dist, self.sinkhorn_reg, numItermax=50, method='sinkhorn_stabilized')[0]
        return ot.emd2(bow_i, bow_j, W_dist)

    def _wmd_row(self, row, X_train=None):
        """
        Distances between one query ``row`` and every document of ``X_train``.

        ``X_train`` defaults to the matrix stored by ``fit``.
        """
        if X_train is None:
            X_train = self._fit_X
        return [self._wmd(i, row, X_train) for i in range(X_train.shape[0])]

    def _pairwise_wmd(self, X_test, X_train=None):
        """
        Distance matrix with one row per document of ``X_test`` and one column
        per document of ``X_train`` (the fitted data when ``X_train`` is None).

        Previously an explicitly passed ``X_train`` was ignored; it is now
        forwarded to the row workers.
        """
        pool = Pool(nodes=self.n_jobs)  # Parallelization of the calculation of the distances
        if X_train is None:
            dist = pool.map(self._wmd_row, X_test)
        else:
            # One reference to X_train per query row, as pool.map zips its sequences.
            dist = pool.map(self._wmd_row, X_test, [X_train] * X_test.shape[0])
        return np.array(dist)

    def fit(self, X, y):
        """
        Store the l_1 normalized training documents ``X`` with labels ``y``.
        """
        X = check_array(X, accept_sparse='csr', copy=True)
        X = normalize(X, norm='l1', copy=False)
        return super(Wasserstein_Retriever, self).fit(X, y)

    def predict(self, X):
        """
        Predict the labels of the documents in ``X``.

        The distance computation is left to ``kneighbors``. The previous
        version computed the distance matrix here and passed it to the parent
        ``predict``; the parent then calls ``self.kneighbors`` on it, i.e. the
        override below, which treated the distance matrix as documents again.
        NOTE(review): relies on KNeighborsClassifier.predict dispatching to
        self.kneighbors - verify when changing the scikit-learn version.
        """
        return super(Wasserstein_Retriever, self).predict(X)

    def kneighbors(self, X, n_neighbors=None, return_distance=True):
        """
        Find the nearest training documents of each document in ``X``.

        Arguments
        ---------
        X: sparse bag-of-words matrix of query documents
        n_neighbors: number of neighbors; None lets the parent class fall back
            to the value given at construction (the old default of 1 ignored it)
        return_distance: forwarded to KNeighborsClassifier.kneighbors
        """
        X = check_array(X, accept_sparse='csr', copy=True)
        X = normalize(X, norm='l1', copy=False)
        dist = self._pairwise_wmd(X)
        return super(Wasserstein_Retriever, self).kneighbors(dist, n_neighbors, return_distance=return_distance)


140