From 49c6f58e51e12af691f7a1322137c64f46043b15 Mon Sep 17 00:00:00 2001
From: Yigit Sever
Date: Tue, 24 Sep 2019 21:26:34 +0300
Subject: Use black linter for WMD

---
 WMD_matching.py         |  97 ++++++++++++++++------------------
 WMD_retrieval.py        |  97 ++++++++++++++++------------------
 Wasserstein_Distance.py | 136 +++++++++++++++++++++++++-----------------------
 3 files changed, 163 insertions(+), 167 deletions(-)
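Reviewer note (git-am drops everything between the "---" marker above and the
first "diff --git" line, so this note is not part of the commit): the hunks
below are mechanical `black` output. Single quotes become double quotes, long
call sites are exploded to one argument per line with trailing commas, and
blank lines are normalized; there are no runtime behaviour changes. A minimal
sketch of reproducing the reformat, assuming a 2019-era black release with its
default 88-column line length:

    # Reformat the three touched modules in place; check=True raises
    # CalledProcessError if black exits non-zero (e.g. on a parse error).
    import subprocess

    subprocess.run(
        ["black", "WMD_matching.py", "WMD_retrieval.py", "Wasserstein_Distance.py"],
        check=True,
    )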
diff --git a/WMD_matching.py b/WMD_matching.py
index 2755d15..69ea10e 100644
--- a/WMD_matching.py
+++ b/WMD_matching.py
@@ -13,7 +13,7 @@ from Wasserstein_Distance import (WassersteinMatcher,
 
 
 def main(args):
-    np.seterr(divide='ignore')  # POT has issues with divide by zero errors
+    np.seterr(divide="ignore")  # POT has issues with divide by zero errors
 
     source_lang = args.source_lang
     target_lang = args.target_lang
@@ -29,32 +29,24 @@ def main(args):
     mode = args.mode
 
     runfor = list()
-    if mode == 'all':
-        runfor.extend(['wmd', 'snk'])
+    if mode == "all":
+        runfor.extend(["wmd", "snk"])
     else:
         runfor.append(mode)
 
     defs_source = [
-        line.rstrip('\n')
-        for line in open(source_defs_filename, encoding='utf8')
+        line.rstrip("\n") for line in open(source_defs_filename, encoding="utf8")
     ]
     defs_target = [
-        line.rstrip('\n')
-        for line in open(target_defs_filename, encoding='utf8')
+        line.rstrip("\n") for line in open(target_defs_filename, encoding="utf8")
     ]
 
     clean_src_corpus, clean_src_vectors, src_keys = clean_corpus_using_embeddings_vocabulary(
-        set(vectors_source.keys()),
-        defs_source,
-        vectors_source,
-        source_lang,
+        set(vectors_source.keys()), defs_source, vectors_source, source_lang
     )
 
     clean_target_corpus, clean_target_vectors, target_keys = clean_corpus_using_embeddings_vocabulary(
-        set(vectors_target.keys()),
-        defs_target,
-        vectors_target,
-        target_lang,
+        set(vectors_target.keys()), defs_target, vectors_target, target_lang
     )
 
     take = args.instances
@@ -70,14 +62,15 @@ def main(args):
 
     if not batch:
         print(
-            f'{source_lang} - {target_lang} : document sizes: {len(clean_src_corpus)}, {len(clean_target_corpus)}'
+            f"{source_lang} - {target_lang} : document sizes: {len(clean_src_corpus)}, {len(clean_target_corpus)}"
         )
 
     del vectors_source, vectors_target, defs_source, defs_target
 
     vec = CountVectorizer().fit(clean_src_corpus + clean_target_corpus)
     common = [
-        word for word in vec.get_feature_names()
+        word
+        for word in vec.get_feature_names()
         if word in clean_src_vectors or word in clean_target_vectors
     ]
     W_common = []
@@ -88,9 +81,7 @@ def main(args):
             W_common.append(np.array(clean_target_vectors[w]))
 
     if not batch:
-        print(
-            f'{source_lang} - {target_lang}: the vocabulary size is {len(W_common)}'
-        )
+        print(f"{source_lang} - {target_lang}: the vocabulary size is {len(W_common)}")
 
     W_common = np.array(W_common)
     W_common = normalize(W_common)
@@ -101,24 +92,25 @@ def main(args):
 
     for metric in runfor:
         if not batch:
-            print(f'{metric}: {source_lang} - {target_lang}')
+            print(f"{metric}: {source_lang} - {target_lang}")
 
-        clf = WassersteinMatcher(W_embed=W_common,
-                                 n_neighbors=5,
-                                 n_jobs=14,
-                                 sinkhorn=(metric == 'snk'))
+        clf = WassersteinMatcher(
+            W_embed=W_common, n_neighbors=5, n_jobs=14, sinkhorn=(metric == "snk")
+        )
         clf.fit(X_train_idf[:instances], np.ones(instances))
-        p_at_one, percentage = clf.align(X_test_idf[:instances],
-                                         n_neighbors=instances)
+        p_at_one, percentage = clf.align(X_test_idf[:instances], n_neighbors=instances)
 
         if not batch:
-            print(f'P @ 1: {p_at_one}\ninstances: {instances}\n{percentage}%')
+            print(f"P @ 1: {p_at_one}\ninstances: {instances}\n{percentage}%")
         else:
             fields = [
-                f'{source_lang}', f'{target_lang}', f'{instances}',
-                f'{p_at_one}', f'{percentage}'
+                f"{source_lang}",
+                f"{target_lang}",
+                f"{instances}",
+                f"{p_at_one}",
+                f"{percentage}",
             ]
-            with open(f'{metric}_matching_results.csv', 'a') as f:
+            with open(f"{metric}_matching_results.csv", "a") as f:
                 writer = csv.writer(f)
                 writer.writerow(fields)
@@ -126,30 +118,33 @@ def main(args):
 
 if __name__ == "__main__":
 
     parser = argparse.ArgumentParser(
-        description='matching using wmd and wasserstein distance')
-    parser.add_argument('source_lang', help='source language short name')
-    parser.add_argument('target_lang', help='target language short name')
-    parser.add_argument('source_vector', help='path of the source vector')
-    parser.add_argument('target_vector', help='path of the target vector')
-    parser.add_argument('source_defs', help='path of the source definitions')
-    parser.add_argument('target_defs', help='path of the target definitions')
+        description="matching using wmd and wasserstein distance"
+    )
+    parser.add_argument("source_lang", help="source language short name")
+    parser.add_argument("target_lang", help="target language short name")
+    parser.add_argument("source_vector", help="path of the source vector")
+    parser.add_argument("target_vector", help="path of the target vector")
+    parser.add_argument("source_defs", help="path of the source definitions")
+    parser.add_argument("target_defs", help="path of the target definitions")
     parser.add_argument(
-        '-b',
-        '--batch',
-        action='store_true',
-        help=
-        'running in batch (store results in csv) or running a single instance (output the results)'
+        "-b",
+        "--batch",
+        action="store_true",
+        help="running in batch (store results in csv) or running a single instance (output the results)",
     )
-    parser.add_argument('mode',
-                        choices=['all', 'wmd', 'snk'],
-                        default='all',
-                        help='which methods to run')
     parser.add_argument(
-        '-n',
-        '--instances',
-        help='number of instances in each language to retrieve',
+        "mode",
+        choices=["all", "wmd", "snk"],
+        default="all",
+        help="which methods to run",
+    )
+    parser.add_argument(
+        "-n",
+        "--instances",
+        help="number of instances in each language to retrieve",
         default=1000,
-        type=int)
+        type=int,
+    )
 
     args = parser.parse_args()
diff --git a/WMD_retrieval.py b/WMD_retrieval.py
index 02f35be..cb72079 100644
--- a/WMD_retrieval.py
+++ b/WMD_retrieval.py
@@ -13,7 +13,7 @@ from Wasserstein_Distance import (WassersteinRetriever,
 
 
 def main(args):
-    np.seterr(divide='ignore')  # POT has issues with divide by zero errors
+    np.seterr(divide="ignore")  # POT has issues with divide by zero errors
 
     source_lang = args.source_lang
     target_lang = args.target_lang
@@ -29,32 +29,24 @@ def main(args):
     mode = args.mode
 
     runfor = list()
-    if mode == 'all':
-        runfor.extend(['wmd', 'snk'])
+    if mode == "all":
+        runfor.extend(["wmd", "snk"])
     else:
         runfor.append(mode)
 
     defs_source = [
-        line.rstrip('\n')
-        for line in open(source_defs_filename, encoding='utf8')
+        line.rstrip("\n") for line in open(source_defs_filename, encoding="utf8")
     ]
     defs_target = [
-        line.rstrip('\n')
-        for line in open(target_defs_filename, encoding='utf8')
+        line.rstrip("\n") for line in open(target_defs_filename, encoding="utf8")
     ]
 
     clean_src_corpus, clean_src_vectors, src_keys = clean_corpus_using_embeddings_vocabulary(
-        set(vectors_source.keys()),
-        defs_source,
-        vectors_source,
-        source_lang,
+        set(vectors_source.keys()), defs_source, vectors_source, source_lang
     )
 
     clean_target_corpus, clean_target_vectors, target_keys = clean_corpus_using_embeddings_vocabulary(
-        set(vectors_target.keys()),
-        defs_target,
-        vectors_target,
-        target_lang,
+        set(vectors_target.keys()), defs_target, vectors_target, target_lang
     )
 
     take = args.instances
@@ -70,14 +62,15 @@ def main(args):
 
     if not batch:
         print(
-            f'{source_lang} - {target_lang} : document sizes: {len(clean_src_corpus)}, {len(clean_target_corpus)}'
+            f"{source_lang} - {target_lang} : document sizes: {len(clean_src_corpus)}, {len(clean_target_corpus)}"
        )
 
     del vectors_source, vectors_target, defs_source, defs_target
 
     vec = CountVectorizer().fit(clean_src_corpus + clean_target_corpus)
     common = [
-        word for word in vec.get_feature_names()
+        word
+        for word in vec.get_feature_names()
         if word in clean_src_vectors or word in clean_target_vectors
     ]
     W_common = []
@@ -88,9 +81,7 @@ def main(args):
             W_common.append(np.array(clean_target_vectors[w]))
 
     if not batch:
-        print(
-            f'{source_lang} - {target_lang}: the vocabulary size is {len(W_common)}'
-        )
+        print(f"{source_lang} - {target_lang}: the vocabulary size is {len(W_common)}")
 
     W_common = np.array(W_common)
     W_common = normalize(W_common)
@@ -101,55 +92,57 @@ def main(args):
 
     for metric in runfor:
         if not batch:
-            print(f'{metric}: {source_lang} - {target_lang}')
+            print(f"{metric}: {source_lang} - {target_lang}")
 
-        clf = WassersteinRetriever(W_embed=W_common,
-                                   n_neighbors=5,
-                                   n_jobs=14,
-                                   sinkhorn=(metric == 'snk'))
+        clf = WassersteinRetriever(
+            W_embed=W_common, n_neighbors=5, n_jobs=14, sinkhorn=(metric == "snk")
+        )
         clf.fit(X_train_idf[:instances], np.ones(instances))
-        p_at_one, percentage = clf.align(X_test_idf[:instances],
-                                         n_neighbors=instances)
+        p_at_one, percentage = clf.align(X_test_idf[:instances], n_neighbors=instances)
 
         if not batch:
-            print(f'P @ 1: {p_at_one}\ninstances: {instances}\n{percentage}%')
+            print(f"P @ 1: {p_at_one}\ninstances: {instances}\n{percentage}%")
         else:
             fields = [
-                f'{source_lang}', f'{target_lang}', f'{instances}',
-                f'{p_at_one}', f'{percentage}'
+                f"{source_lang}",
+                f"{target_lang}",
+                f"{instances}",
+                f"{p_at_one}",
+                f"{percentage}",
             ]
-            with open(f'{metric}_retrieval_result.csv', 'a') as f:
+            with open(f"{metric}_retrieval_result.csv", "a") as f:
                 writer = csv.writer(f)
                 writer.writerow(fields)
 
 
 if __name__ == "__main__":
 
-    parser = argparse.ArgumentParser(
-        description='run retrieval using wmd or snk')
-    parser.add_argument('source_lang', help='source language short name')
-    parser.add_argument('target_lang', help='target language short name')
-    parser.add_argument('source_vector', help='path of the source vector')
-    parser.add_argument('target_vector', help='path of the target vector')
-    parser.add_argument('source_defs', help='path of the source definitions')
-    parser.add_argument('target_defs', help='path of the target definitions')
+    parser = argparse.ArgumentParser(description="run retrieval using wmd or snk")
+    parser.add_argument("source_lang", help="source language short name")
+    parser.add_argument("target_lang", help="target language short name")
+    parser.add_argument("source_vector", help="path of the source vector")
+    parser.add_argument("target_vector", help="path of the target vector")
+    parser.add_argument("source_defs", help="path of the source definitions")
+    parser.add_argument("target_defs", help="path of the target definitions")
     parser.add_argument(
-        '-b',
-        '--batch',
-        action='store_true',
-        help=
-        'running in batch (store results in csv) or running a single instance (output the results)'
+        "-b",
+        "--batch",
+        action="store_true",
+        help="running in batch (store results in csv) or running a single instance (output the results)",
    )
-    parser.add_argument('mode',
-                        choices=['all', 'wmd', 'snk'],
-                        default='all',
-                        help='which methods to run')
     parser.add_argument(
-        '-n',
-        '--instances',
-        help='number of instances in each language to retrieve',
+        "mode",
+        choices=["all", "wmd", "snk"],
+        default="all",
+        help="which methods to run",
+    )
+    parser.add_argument(
+        "-n",
+        "--instances",
+        help="number of instances in each language to retrieve",
         default=1000,
-        type=int)
+        type=int,
+    )
 
     args = parser.parse_args()
diff --git a/Wasserstein_Distance.py b/Wasserstein_Distance.py
index 78bf9cf..60991b9 100644
--- a/Wasserstein_Distance.py
+++ b/Wasserstein_Distance.py
@@ -11,17 +11,20 @@ from sklearn.utils import check_array
 
 class WassersteinMatcher(KNeighborsClassifier):
     """
-    Implements a nearest neighbors classifier for input distributions using the Wasserstein distance as metric.
-    Source and target distributions are l_1 normalized before computing the Wasserstein distance.
-    Wasserstein is parametrized by the distances between the individual points of the distributions.
+    Source and target distributions are l_1 normalized before computing the Wasserstein
+    distance. Wasserstein is parametrized by the distances between the individual
+    points of the distributions.
     """
-    def __init__(self,
-                 W_embed,
-                 n_neighbors=1,
-                 n_jobs=1,
-                 verbose=False,
-                 sinkhorn=False,
-                 sinkhorn_reg=0.1):
+
+    def __init__(
+        self,
+        W_embed,
+        n_neighbors=1,
+        n_jobs=1,
+        verbose=False,
+        sinkhorn=False,
+        sinkhorn_reg=0.1,
+    ):
         """
         Initialization of the class.
         Arguments
@@ -33,10 +36,12 @@ class WassersteinMatcher(KNeighborsClassifier):
         self.sinkhorn_reg = sinkhorn_reg
         self.W_embed = W_embed
         self.verbose = verbose
-        super(WassersteinMatcher, self).__init__(n_neighbors=n_neighbors,
-                                                 n_jobs=n_jobs,
-                                                 metric='precomputed',
-                                                 algorithm='brute')
+        super(WassersteinMatcher, self).__init__(
+            n_neighbors=n_neighbors,
+            n_jobs=n_jobs,
+            metric="precomputed",
+            algorithm="brute",
+        )
 
     def _wmd(self, i, row, X_train):
         union_idx = np.union1d(X_train[i].indices, row.indices)
@@ -51,7 +56,7 @@ class WassersteinMatcher(KNeighborsClassifier):
                 W_dist,
                 self.sinkhorn_reg,
                 numItermax=50,
-                method='sinkhorn_stabilized',
+                method="sinkhorn_stabilized",
             )[0]
         else:
             return ot.emd2(bow_i, bow_j, W_dist)
@@ -66,27 +71,27 @@ class WassersteinMatcher(KNeighborsClassifier):
         if X_train is None:
             X_train = self._fit_X
-        pool = Pool(nodes=self.n_jobs
-                    )  # Parallelization of the calculation of the distances
+        pool = Pool(
+            nodes=self.n_jobs
+        )  # Parallelization of the calculation of the distances
         dist = pool.map(self._wmd_row, X_test)
         return np.array(dist)
 
     def fit(self, X, y):  # X_train_idf
-        X = check_array(X, accept_sparse='csr',
-                        copy=True)  # check if array is sparse
-        X = normalize(X, norm='l1', copy=False)
+        X = check_array(X, accept_sparse="csr", copy=True)  # check if array is sparse
+        X = normalize(X, norm="l1", copy=False)
         return super(WassersteinMatcher, self).fit(X, y)
 
     def predict(self, X):
-        X = check_array(X, accept_sparse='csr', copy=True)
-        X = normalize(X, norm='l1', copy=False)
+        X = check_array(X, accept_sparse="csr", copy=True)
+        X = normalize(X, norm="l1", copy=False)
         dist = self._pairwise_wmd(X)
         dist = dist * 1000  # for lapjv, small floating point numbers are evil
         return super(WassersteinMatcher, self).predict(dist)
 
     def kneighbors(self, X, n_neighbors=1):
-        X = check_array(X, accept_sparse='csr', copy=True)
-        X = normalize(X, norm='l1', copy=False)
+        X = check_array(X, accept_sparse="csr", copy=True)
+        X = normalize(X, norm="l1", copy=False)
         dist = self._pairwise_wmd(X)
         dist = dist * 1000  # for lapjv, small floating point numbers are evil
         return lapjv(dist)
@@ -102,19 +107,24 @@ class WassersteinMatcher(KNeighborsClassifier):
         percentage = p_at_one / n_neighbors * 100
         return p_at_one, percentage
 
+
 class WassersteinRetriever(KNeighborsClassifier):
     """
-    Implements a nearest neighbors classifier for input distributions using the Wasserstein distance as metric.
-    Source and target distributions are l_1 normalized before computing the Wasserstein distance.
-    Wasserstein is parametrized by the distances between the individual points of the distributions.
+    Implements a nearest neighbors classifier for input distributions using
+    the Wasserstein distance as metric. Source and target distributions
+    are l_1 normalized before computing the Wasserstein distance. Wasserstein is
+    parametrized by the distances between the individual points of the distributions.
     """
-    def __init__(self,
-                 W_embed,
-                 n_neighbors=1,
-                 n_jobs=1,
-                 verbose=False,
-                 sinkhorn=False,
-                 sinkhorn_reg=0.1):
+
+    def __init__(
+        self,
+        W_embed,
+        n_neighbors=1,
+        n_jobs=1,
+        verbose=False,
+        sinkhorn=False,
+        sinkhorn_reg=0.1,
+    ):
         """
         Initialization of the class.
         Arguments
@@ -126,10 +136,12 @@ class WassersteinRetriever(KNeighborsClassifier):
         self.sinkhorn_reg = sinkhorn_reg
         self.W_embed = W_embed
         self.verbose = verbose
-        super(WassersteinRetriever, self).__init__(n_neighbors=n_neighbors,
-                                                   n_jobs=n_jobs,
-                                                   metric='precomputed',
-                                                   algorithm='brute')
+        super(WassersteinRetriever, self).__init__(
+            n_neighbors=n_neighbors,
+            n_jobs=n_jobs,
+            metric="precomputed",
+            algorithm="brute",
+        )
 
     def _wmd(self, i, row, X_train):
         union_idx = np.union1d(X_train[i].indices, row.indices)
@@ -144,7 +156,7 @@ class WassersteinRetriever(KNeighborsClassifier):
                 W_dist,
                 self.sinkhorn_reg,
                 numItermax=50,
-                method='sinkhorn_stabilized',
+                method="sinkhorn_stabilized",
             )[0]
         else:
             return ot.emd2(bow_i, bow_j, W_dist)
@@ -164,19 +176,19 @@ class WassersteinRetriever(KNeighborsClassifier):
         return np.array(dist)
 
     def fit(self, X, y):
-        X = check_array(X, accept_sparse='csr', copy=True)
-        X = normalize(X, norm='l1', copy=False)
+        X = check_array(X, accept_sparse="csr", copy=True)
+        X = normalize(X, norm="l1", copy=False)
         return super(WassersteinRetriever, self).fit(X, y)
 
     def predict(self, X):
-        X = check_array(X, accept_sparse='csr', copy=True)
-        X = normalize(X, norm='l1', copy=False)
+        X = check_array(X, accept_sparse="csr", copy=True)
+        X = normalize(X, norm="l1", copy=False)
         dist = self._pairwise_wmd(X)
         return super(WassersteinRetriever, self).predict(dist)
 
     def kneighbors(self, X, n_neighbors=1):
-        X = check_array(X, accept_sparse='csr', copy=True)
-        X = normalize(X, norm='l1', copy=False)
+        X = check_array(X, accept_sparse="csr", copy=True)
+        X = normalize(X, norm="l1", copy=False)
         dist = self._pairwise_wmd(X)
         return super(WassersteinRetriever, self).kneighbors(dist, n_neighbors)
@@ -199,9 +211,9 @@ def load_embeddings(path, dimension=300):
         The first line may or may not include the word count and dimension
     """
     vectors = {}
-    with open(path, mode='r', encoding='utf8') as fp:
-        first_line = fp.readline().rstrip('\n')
-        if first_line.count(' ') == 1:
+    with open(path, mode="r", encoding="utf8") as fp:
+        first_line = fp.readline().rstrip("\n")
+        if first_line.count(" ") == 1:
             # includes the "word_count dimension" information
             (_, dimension) = map(int, first_line.split())
         else:
@@ -209,22 +221,19 @@ def load_embeddings(path, dimension=300):
             fp.seek(0)
         for line in fp:
             elems = line.split()
-            vectors[" ".join(elems[:-dimension])] = " ".join(
-                elems[-dimension:])
+            vectors[" ".join(elems[:-dimension])] = " ".join(elems[-dimension:])
     return vectors
 
 
 def clean_corpus_using_embeddings_vocabulary(
-        embeddings_dictionary,
-        corpus,
-        vectors,
-        language,
+    embeddings_dictionary, corpus, vectors, language
 ):
-    '''
+    """
     Cleans corpus using the dictionary of embeddings.
     Any word without an associated embedding in the dictionary is ignored.
-    Adds '__target-language' and '__source-language' at the end of the words according to their language.
-    '''
+    Adds '__target-language' and '__source-language' at the end
+    of the words according to their language.
+    """
     clean_corpus, clean_vectors, keys = [], {}, []
     words_we_want = set(embeddings_dictionary)
     tokenize = MosesTokenizer(language)
@@ -233,19 +242,18 @@ def clean_corpus_using_embeddings_vocabulary(
         words = tokenize(doc)
         for word in words:
             if word in words_we_want:
-                clean_doc.append(word + '__%s' % language)
-                clean_vectors[word + '__%s' % language] = np.array(
-                    vectors[word].split()).astype(np.float)
+                clean_doc.append(word + "__%s" % language)
+                clean_vectors[word + "__%s" % language] = np.array(
+                    vectors[word].split()
+                ).astype(np.float)
         if len(clean_doc) > 3 and len(clean_doc) < 25:
             keys.append(key)
-        clean_corpus.append(' '.join(clean_doc))
+        clean_corpus.append(" ".join(clean_doc))
     tokenize.close()
     return np.array(clean_corpus), clean_vectors, keys
 
 
-def mrr_precision_at_k(golden, preds, k_list=[
-        1,
-]):
+def mrr_precision_at_k(golden, preds, k_list=[1]):
     """
     Calculates Mean Reciprocal Error and Hits@1 == Precision@1
     """
-- 
cgit v1.2.3-70-g09d2
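
Appendix (placed after the signature delimiter, so it is likewise outside the
patch content): a usage sketch of the WassersteinRetriever API reformatted
above, with toy inputs. The two-line corpora, the random stand-in embeddings,
and the tf-idf construction are assumptions made for illustration; the real
scripts build X_train_idf and X_test_idf from the cleaned, language-tagged
definition corpora, and the sketch assumes the 2019-era scikit-learn, POT,
pathos, and lapjv versions the repository was developed against.

    import numpy as np
    from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
    from sklearn.preprocessing import normalize

    from Wasserstein_Distance import WassersteinRetriever

    # Two tiny "parallel" corpora: row i on the source side should retrieve
    # row i on the target side. Tokens carry the __language suffix that
    # clean_corpus_using_embeddings_vocabulary appends.
    src = ["cat__en feline__en", "dog__en canine__en"]
    tgt = ["kedi__tr cat__en", "kopek__tr dog__en"]

    # Shared bag-of-words space with one embedding row per vocabulary entry;
    # random vectors stand in for the real cross-lingual embeddings (W_common).
    vec = CountVectorizer().fit(src + tgt)
    W_common = normalize(np.random.rand(len(vec.get_feature_names()), 300))

    X_src = TfidfTransformer().fit_transform(vec.transform(src))
    X_tgt = TfidfTransformer().fit_transform(vec.transform(tgt))

    # sinkhorn=False computes exact EMD via ot.emd2; sinkhorn=True would use
    # the entropy-regularized sinkhorn_stabilized solver instead.
    clf = WassersteinRetriever(W_embed=W_common, n_neighbors=1, n_jobs=1)
    clf.fit(X_src, np.ones(X_src.shape[0]))
    p_at_one, percentage = clf.align(X_tgt, n_neighbors=X_tgt.shape[0])
    print(p_at_one, percentage)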