From 778f3936edf3104660d23a88fe23da46c42709a4 Mon Sep 17 00:00:00 2001
From: Yigit Sever
Date: Sat, 21 Sep 2019 15:58:19 +0300
Subject: Move functions to centralize

---
 Wasserstein_Distance.py | 65 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 65 insertions(+)

(limited to 'Wasserstein_Distance.py')

diff --git a/Wasserstein_Distance.py b/Wasserstein_Distance.py
index d2a6408..d8d08b8 100644
--- a/Wasserstein_Distance.py
+++ b/Wasserstein_Distance.py
@@ -138,3 +138,68 @@ class Wasserstein_Retriever(KNeighborsClassifier):
         return super(Wasserstein_Retriever, self).kneighbors(dist, n_neighbors)
 
 
+def load_embeddings(path, dimension=300):
+    """
+    Loads the embeddings from a word2vec formatted file.
+    word2vec format is one line per word and its associated embedding
+    (dimension x floating numbers) separated by whitespace.
+    The first line may be a "word_count dimension" header; if so, its dimension overrides
+    the argument. Returns a dict mapping each word to its vector as one space-joined string.
+    """
+    vectors = {}
+    with open(path, mode='r', encoding='utf8') as fp:
+        header = fp.readline().split()
+        if len(header) == 2 and all(field.isdigit() for field in header):
+            dimension = int(header[1])  # header[0] is the word count, which is not needed
+        else:
+            fp.seek(0)  # no header: the first line is already a vector, so re-read it
+        for line in fp:
+            elems = line.split()
+            if len(elems) > dimension:  # skip blank/short lines instead of storing a '' key
+                vectors[" ".join(elems[:-dimension])] = " ".join(elems[-dimension:])
+    return vectors
+
+
+def clean_corpus_using_embeddings_vocabulary(
+        embeddings_dictionary, corpus, vectors, language):
+    '''
+    Cleans corpus using the dictionary of embeddings.
+    Any word without an associated embedding in the dictionary is ignored.
+    Every kept word is suffixed with '__<language>' (the value of the language argument).
+    Returns (documents as a numpy array, {suffixed word: float vector}, indices of documents with 4-24 kept words).
+    '''
+    clean_corpus, clean_vectors, keys = [], {}, []
+    words_we_want = set(embeddings_dictionary)
+    tokenize = MosesTokenizer(language)
+    try:  # make sure the tokenizer is closed even if a document fails part-way through
+        for key, doc in enumerate(corpus):
+            clean_doc = []
+            for word in tokenize(doc):
+                if word in words_we_want:
+                    tagged = '%s__%s' % (word, language)
+                    clean_doc.append(tagged)
+                    if tagged not in clean_vectors:  # parse once; float replaces np.float (removed in NumPy 1.24)
+                        clean_vectors[tagged] = np.array(vectors[word].split()).astype(float)
+            if 3 < len(clean_doc) < 25:
+                keys.append(key)
+            clean_corpus.append(' '.join(clean_doc))
+    finally:
+        tokenize.close()
+    return np.array(clean_corpus), clean_vectors, keys
+
+
+def mrr_precision_at_k(golden, preds, k_list=(1,)):
+    """
+    Calculates Mean Reciprocal Rank and Precision@k (Hits@k) over ranked predictions.
+    golden[i] is the expected item for query i; preds[i] holds that query's ranked candidates.
+    Returns (MRR, precision at k_list[0]); a query whose golden item is absent adds nothing.
+    """
+    reciprocal_sum = 0
+    precision_at = np.zeros(len(k_list))
+    for key, elem in enumerate(golden):
+        hits = np.where(np.asarray(preds[key]) == elem)[0]
+        if hits.size:  # a miss must not reuse the rank found for an earlier query
+            reciprocal_sum += 1 / (1 + hits[0])
+            precision_at += hits[0] < np.asarray(k_list)
+    return reciprocal_sum / len(golden), (precision_at / len(golden))[0]
+
-- 
cgit v1.2.3-70-g09d2