From 1558c5e3ea8e34286ad587a8473f5121d5e8d289 Mon Sep 17 00:00:00 2001 From: Yigit Sever Date: Sat, 21 Sep 2019 16:52:54 +0300 Subject: Cleaned up WMD_retrieval --- WMD_retrieval.py | 334 +++++++++++++++++-------------------------------------- 1 file changed, 100 insertions(+), 234 deletions(-) (limited to 'WMD_retrieval.py') diff --git a/WMD_retrieval.py b/WMD_retrieval.py index f99eaa1..b49ba7d 100644 --- a/WMD_retrieval.py +++ b/WMD_retrieval.py @@ -1,259 +1,125 @@ -########################### -# Wasserstein Retrieval # -########################### import argparse - -parser = argparse.ArgumentParser(description='run retrieval using wmd and wasserstein distances') -parser.add_argument('source_lang', help='source language short name') -parser.add_argument('target_lang', help='target language short name') -parser.add_argument('source_vector', help='path of the source vector') -parser.add_argument('target_vector', help='path of the target vector') -parser.add_argument('source_defs', help='path of the source definitions') -parser.add_argument('target_defs', help='path of the target definitions') -parser.add_argument('-n', '--instances', help='number of instances in each language to retrieve', default=2000, type=int) - -args = parser.parse_args() - -source_lang = args.source_lang -target_lang = args.target_lang - -def load_embeddings(path, dimension=300): - """ - Loads the embeddings from a word2vec formatted file. - word2vec format is one line per word and it's associated embedding - (dimension x floating numbers) separated by spaces - The first line may or may not include the word count and dimension - """ - vectors = {} - with open(path, mode='r', encoding='utf8') as fp: - first_line = fp.readline().rstrip('\n') - if first_line.count(' ') == 1: - # includes the "word_count dimension" information - (word_count, dimension) = map(int, first_line.split()) - else: # assume the file only contains vectors - fp.seek(0) - for line in fp: - elems = line.split() - vectors[" ".join(elems[:-dimension])] = " ".join(elems[-dimension:]) - return vectors - -####################################################################### -# Vectors Load Here # -####################################################################### - -source_vectors_filename = args.source_vector -target_vectors_filename = args.target_vector -vectors_source = load_embeddings(source_vectors_filename) -vectors_target = load_embeddings(target_vectors_filename) - -####################################################################### -# Corpora Load Here # -####################################################################### -source_defs_filename = args.source_defs -target_defs_filename = args.target_defs -defs_source = [line.rstrip('\n') for line in open(source_defs_filename, encoding='utf8')] -defs_target = [line.rstrip('\n') for line in open(target_defs_filename, encoding='utf8')] - import numpy as np -from mosestokenizer import * - -def clean_corpus_using_embeddings_vocabulary( - embeddings_dictionary, - corpus, - vectors, - language, - ): - ''' - Cleans corpus using the dictionary of embeddings. - Any word without an associated embedding in the dictionary is ignored. - Adds '__target-language' and '__source-language' at the end of the words according to their language. - ''' - clean_corpus, clean_vectors, keys = [], {}, [] - words_we_want = set(embeddings_dictionary) - tokenize = MosesTokenizer(language) - for key, doc in enumerate(corpus): - clean_doc = [] - words = tokenize(doc) - for word in words: - if word in words_we_want: - clean_doc.append(word + '__%s' % language) - clean_vectors[word + '__%s' % language] = np.array(vectors[word].split()).astype(np.float) - if len(clean_doc) > 3 and len(clean_doc) < 25: - keys.append(key) - clean_corpus.append(' '.join(clean_doc)) - tokenize.close() - return np.array(clean_corpus), clean_vectors, keys - -clean_src_corpus, clean_src_vectors, src_keys = clean_corpus_using_embeddings_vocabulary( - set(vectors_source.keys()), - defs_source, - vectors_source, - source_lang, - ) - -clean_target_corpus, clean_target_vectors, target_keys = clean_corpus_using_embeddings_vocabulary( - set(vectors_target.keys()), - defs_target, - vectors_target, - target_lang, - ) - -# Here is the part Wasserstein prunes two corporas to 500 articles each -# Our dataset does not have that luxury (turns out it's not a luxury but a necessity) - import random -take = args.instances +from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer +from sklearn.preprocessing import normalize +from Wasserstein_Distance import Wasserstein_Retriever +from Wasserstein_Distance import load_embeddings, clean_corpus_using_embeddings_vocabulary, mrr_precision_at_k +import csv +import sys -common_keys = set(src_keys).intersection(set(target_keys)) -take = min(len(common_keys), take) # you can't sample more than length -experiment_keys = random.sample(common_keys, take) +def main(args): -instances = len(experiment_keys) + np.seterr(divide='ignore') # POT has issues with divide by zero errors + source_lang = args.source_lang + target_lang = args.target_lang -clean_src_corpus = list(clean_src_corpus[experiment_keys]) -clean_target_corpus = list(clean_target_corpus[experiment_keys]) + source_vectors_filename = args.source_vector + target_vectors_filename = args.target_vector + vectors_source = load_embeddings(source_vectors_filename) + vectors_target = load_embeddings(target_vectors_filename) -print(f'{source_lang} - {target_lang} : document sizes: {len(clean_src_corpus)}, {len(clean_target_corpus)}') + source_defs_filename = args.source_defs + target_defs_filename = args.target_defs -del vectors_source, vectors_target, defs_source, defs_target + batch = args.batch + mode = args.mode + runfor = list() -from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer - -vec = CountVectorizer().fit(clean_src_corpus + clean_target_corpus) -common = [word for word in vec.get_feature_names() if word in clean_src_vectors or word in clean_target_vectors] -W_common = [] -for w in common: - if w in clean_src_vectors: - W_common.append(np.array(clean_src_vectors[w])) + if (mode == 'all'): + runfor.extend(['wmd','snk']) else: - W_common.append(np.array(clean_target_vectors[w])) + runfor.append(mode) -print(f'{source_lang} - {target_lang}: the vocabulary size is {len(W_common)}') + defs_source = [line.rstrip('\n') for line in open(source_defs_filename, encoding='utf8')] + defs_target = [line.rstrip('\n') for line in open(target_defs_filename, encoding='utf8')] -from sklearn.preprocessing import normalize -W_common = np.array(W_common) -W_common = normalize(W_common) -vect = TfidfVectorizer(vocabulary=common, dtype=np.double, norm=None) -vect.fit(clean_src_corpus + clean_target_corpus) -X_train_idf = vect.transform(clean_src_corpus) -X_test_idf = vect.transform(clean_target_corpus) + clean_src_corpus, clean_src_vectors, src_keys = clean_corpus_using_embeddings_vocabulary( + set(vectors_source.keys()), + defs_source, + vectors_source, + source_lang, + ) -vect_tf = CountVectorizer(vocabulary=common, dtype=np.double) -vect_tf.fit(clean_src_corpus + clean_target_corpus) -X_train_tf = vect_tf.transform(clean_src_corpus) -X_test_tf = vect_tf.transform(clean_target_corpus) + clean_target_corpus, clean_target_vectors, target_keys = clean_corpus_using_embeddings_vocabulary( + set(vectors_target.keys()), + defs_target, + vectors_target, + target_lang, + ) -import ot -from sklearn.neighbors import KNeighborsClassifier -from sklearn.metrics import euclidean_distances -from sklearn.externals.joblib import Parallel, delayed -from sklearn.utils import check_array -from sklearn.metrics.scorer import check_scoring -from pathos.multiprocessing import ProcessingPool as Pool -from sklearn.metrics import euclidean_distances + take = args.instances -class WassersteinDistances(KNeighborsClassifier): - """ - Implements a nearest neighbors classifier for input distributions using the Wasserstein distance as metric. - Source and target distributions are l_1 normalized before computing the Wasserstein distance. - Wasserstein is parametrized by the distances between the individual points of the distributions. - In this work, we propose to use cross-lingual embeddings for calculating these distances. + common_keys = set(src_keys).intersection(set(target_keys)) + take = min(len(common_keys), take) # you can't sample more than length + experiment_keys = random.sample(common_keys, take) - """ - def __init__(self, W_embed, n_neighbors=1, n_jobs=1, verbose=False, sinkhorn= False, sinkhorn_reg=0.1): - """ - Initialization of the class. - Arguments - --------- - W_embed: embeddings of the words, np.array - verbose: True/False - """ - self.sinkhorn = sinkhorn - self.sinkhorn_reg = sinkhorn_reg - self.W_embed = W_embed - self.verbose = verbose - super(WassersteinDistances, self).__init__(n_neighbors=n_neighbors, n_jobs=n_jobs, metric='precomputed', algorithm='brute') + instances = len(experiment_keys) - def _wmd(self, i, row, X_train): - union_idx = np.union1d(X_train[i].indices, row.indices) - W_minimal = self.W_embed[union_idx] - W_dist = euclidean_distances(W_minimal) - bow_i = X_train[i, union_idx].A.ravel() - bow_j = row[:, union_idx].A.ravel() - if self.sinkhorn: - return ot.sinkhorn2(bow_i, bow_j, W_dist, self.sinkhorn_reg, numItermax=50, method='sinkhorn_stabilized',)[0] - else: - return ot.emd2(bow_i, bow_j, W_dist) + clean_src_corpus = list(clean_src_corpus[experiment_keys]) + clean_target_corpus = list(clean_target_corpus[experiment_keys]) - def _wmd_row(self, row): - X_train = self._fit_X - n_samples_train = X_train.shape[0] - return [self._wmd(i, row, X_train) for i in range(n_samples_train)] + if (not batch): + print(f'{source_lang} - {target_lang} : document sizes: {len(clean_src_corpus)}, {len(clean_target_corpus)}') - def _pairwise_wmd(self, X_test, X_train=None): - n_samples_test = X_test.shape[0] + del vectors_source, vectors_target, defs_source, defs_target - if X_train is None: - X_train = self._fit_X - pool = Pool(nodes=self.n_jobs) # Parallelization of the calculation of the distances - dist = pool.map(self._wmd_row, X_test) - return np.array(dist) - - def fit(self, X, y): - X = check_array(X, accept_sparse='csr', copy=True) - X = normalize(X, norm='l1', copy=False) - return super(WassersteinDistances, self).fit(X, y) - - def predict(self, X): - X = check_array(X, accept_sparse='csr', copy=True) - X = normalize(X, norm='l1', copy=False) - dist = self._pairwise_wmd(X) - return super(WassersteinDistances, self).predict(dist) - - def kneighbors(self, X, n_neighbors=1): - X = check_array(X, accept_sparse='csr', copy=True) - X = normalize(X, norm='l1', copy=False) - dist = self._pairwise_wmd(X) - return super(WassersteinDistances, self).kneighbors(dist, n_neighbors) + vec = CountVectorizer().fit(clean_src_corpus + clean_target_corpus) + common = [word for word in vec.get_feature_names() if word in clean_src_vectors or word in clean_target_vectors] + W_common = [] + for w in common: + if w in clean_src_vectors: + W_common.append(np.array(clean_src_vectors[w])) + else: + W_common.append(np.array(clean_target_vectors[w])) + + if (not batch): + print(f'{source_lang} - {target_lang}: the vocabulary size is {len(W_common)}') + + W_common = np.array(W_common) + W_common = normalize(W_common) + vect = TfidfVectorizer(vocabulary=common, dtype=np.double, norm=None) + vect.fit(clean_src_corpus + clean_target_corpus) + X_train_idf = vect.transform(clean_src_corpus) + X_test_idf = vect.transform(clean_target_corpus) + + vect_tf = CountVectorizer(vocabulary=common, dtype=np.double) + vect_tf.fit(clean_src_corpus + clean_target_corpus) + X_train_tf = vect_tf.transform(clean_src_corpus) + X_test_tf = vect_tf.transform(clean_target_corpus) + + for metric in runfor: + if (not batch): + print(f'{metric} - tfidf: {source_lang} - {target_lang}') + + clf = WassersteinDistances(W_embed=W_common, n_neighbors=5, n_jobs=14, sinkhorn=(metric == 'snk')) + clf.fit(X_train_idf[:instances], np.ones(instances)) + dist, preds = clf.kneighbors(X_test_idf[:instances], n_neighbors=instances) + mrr, p_at_1 = mrr_precision_at_k(list(range(len(preds))), preds) + percentage = p_at_1 * 100 + + if (not batch): + print(f'MRR: {mrr} | Precision @ 1: {p_at_1}') + else: + fields = [f'{source_lang}', f'{target_lang}', f'{instances}', f'{mrr}', f'{p_at_1}', f'{percentage}'] + with open(f'{metric}_retrieval_result.csv', 'a') as f: + writer = csv.writer(f) + writer.writerow(fields) -def mrr_precision_at_k(golden, preds, k_list=[1,]): - """ - Calculates Mean Reciprocal Error and Hits@1 == Precision@1 - """ - my_score = 0 - precision_at = np.zeros(len(k_list)) - for key, elem in enumerate(golden): - if elem in preds[key]: - location = np.where(preds[key]==elem)[0][0] - my_score += 1/(1+ location) - for k_index, k_value in enumerate(k_list): - if location < k_value: - precision_at[k_index] += 1 - return my_score/len(golden), (precision_at/len(golden))[0] -print(f'WMD - tfidf: {source_lang} - {target_lang}') -clf = WassersteinDistances(W_embed=W_common, n_neighbors=5, n_jobs=14) -clf.fit(X_train_idf[:instances], np.ones(instances)) -dist, preds = clf.kneighbors(X_test_idf[:instances], n_neighbors=instances) -mrr, p_at_1 = mrr_precision_at_k(list(range(len(preds))), preds) -print(f'MRR: {mrr} | Precision @ 1: {p_at_1}') +if __name__ == "__main__": -import csv -percentage = p_at_1 * 100 -fields = [f'{source_lang}', f'{target_lang}', f'{instances}', f'{mrr}', f'{p_at_1}', f'{percentage}'] -with open('/home/syigit/multilang_results/wmd_retrieval_result.csv', 'a') as f: - writer = csv.writer(f) - writer.writerow(fields) + parser = argparse.ArgumentParser(description='run retrieval using wmd or snk') + parser.add_argument('source_lang', help='source language short name') + parser.add_argument('target_lang', help='target language short name') + parser.add_argument('source_vector', help='path of the source vector') + parser.add_argument('target_vector', help='path of the target vector') + parser.add_argument('source_defs', help='path of the source definitions') + parser.add_argument('target_defs', help='path of the target definitions') + parser.add_argument('-b', '--batch', action='store_true', help='running in batch (store results in csv) or running a single instance (output the results)') + parser.add_argument('mode', choices=['all', 'wmd', 'snk'], default='all', help='which methods to run') + parser.add_argument('-n', '--instances', help='number of instances in each language to retrieve', default=2000, type=int) -print(f'Sinkhorn - tfidf: {source_lang} - {target_lang}') -clf = WassersteinDistances(W_embed=W_common, n_neighbors=5, n_jobs=14, sinkhorn=True) -clf.fit(X_train_idf[:instances], np.ones(instances)) -dist, preds = clf.kneighbors(X_test_idf[:instances], n_neighbors=instances) -mrr, p_at_1 = mrr_precision_at_k(list(range(len(preds))), preds) -print(f'MRR: {mrr} | Precision @ 1: {p_at_1}') + args = parser.parse_args() -percentage = p_at_1 * 100 -fields = [f'{source_lang}', f'{target_lang}', f'{instances}', f'{mrr}', f'{p_at_1}', f'{percentage}'] -with open('/home/syigit/multilang_results/sinkhorn_retrieval_result.csv', 'a') as f: - writer = csv.writer(f) - writer.writerow(fields) + main(args) -- cgit v1.2.3-70-g09d2