Diffstat (limited to 'Helpers.py')
-rw-r--r--  Helpers.py  157
1 file changed, 157 insertions, 0 deletions
diff --git a/Helpers.py b/Helpers.py
new file mode 100644
index 0000000..7c615ab
--- /dev/null
+++ b/Helpers.py
@@ -0,0 +1,157 @@
import itertools

import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split as split_data


class Data(object):
    def __init__(
        self,
        source_lang,
        target_lang,
        data_file,
        max_len=None,
        instances=1000,
        vocab_limit=None,
        sentence_cols=None,
        score_col=None,
    ):
        self.source_lang = source_lang
        self.target_lang = target_lang
        self.data_file = data_file
        self.max_len = max_len
        self.instances = instances
        self.vocab_size = 1
        self.vocab_limit = vocab_limit

        if sentence_cols is None:
            self.sequence_cols = [
                f"{source_lang} definition",
                f"{target_lang} definition",
            ]
        else:
            self.sequence_cols = sentence_cols

        if score_col is None:
            self.score_col = "is same"
        else:
            self.score_col = score_col

        self.x_train = list()
        self.y_train = list()
        self.x_val = list()
        self.y_val = list()
        self.vocab = {"PAD"}
        self.word_to_id = {"PAD": 0}
        self.id_to_word = {0: "PAD"}
        self.word_to_count = dict()
        self.run()

    def text_to_word_list(self, text):
        """Pre-process text and convert it to a list of words."""
        text = str(text)
        text = text.split()
        return text
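
    # Illustration (hypothetical input): the split above is a bare whitespace
    # tokenizer with no lowercasing or punctuation stripping, so
    # text_to_word_list("Dog; a domestic animal") yields
    # ["Dog;", "a", "domestic", "animal"].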

    def load_data(self):
        # Load the data set
        data_df = pd.read_csv(self.data_file, sep="\t")

        # Iterate over the required sequence columns of the provided dataset
        for index, row in data_df.iterrows():
            # Iterate through the text of both sequences of the row
            for sequence in self.sequence_cols:
                s2n = []  # Sequence with words replaced by their indices
                for word in self.text_to_word_list(row[sequence]):
                    if word not in self.vocab:
                        self.vocab.add(word)
                        self.word_to_id[word] = self.vocab_size
                        self.word_to_count[word] = 1
                        s2n.append(self.vocab_size)
                        self.id_to_word[self.vocab_size] = word
                        self.vocab_size += 1
                    else:
                        self.word_to_count[word] += 1
                        s2n.append(self.word_to_id[word])

                # Replace |sequence as words| with |sequence as numbers| representation
                data_df.at[index, sequence] = s2n
        return data_df
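
    # Illustration (hypothetical cell): after load_data, a cell containing
    # "perro animal" is replaced by [1, 2], with
    # word_to_id == {"PAD": 0, "perro": 1, "animal": 2} and
    # id_to_word == {0: "PAD", 1: "perro", 2: "animal"}.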

    def pad_sequences(self):
        # max_len == 0 means "pad to the longest sequence in the data";
        # None leaves the length choice to Keras' pad_sequences.
        if self.max_len == 0:
            self.max_len = max(
                max(len(seq) for seq in self.x_train[0]),
                max(len(seq) for seq in self.x_train[1]),
                max(len(seq) for seq in self.x_val[0]),
                max(len(seq) for seq in self.x_val[1]),
            )

        # Zero padding
        for dataset, side in itertools.product([self.x_train, self.x_val], [0, 1]):
            if self.max_len:
                dataset[side] = pad_sequences(dataset[side], maxlen=self.max_len)
            else:
                dataset[side] = pad_sequences(dataset[side])
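
        # Illustration: Keras' pad_sequences pre-pads with the PAD id 0 by
        # default, so with max_len=4, [[1, 2], [3]] becomes
        # [[0, 0, 1, 2], [0, 0, 0, 3]].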

    def run(self):
        # Load data and build the vocabulary.
        data_df = self.load_data()

        X = data_df[self.sequence_cols]
        Y = data_df[self.score_col]

        self.x_train, self.x_val, self.y_train, self.y_val = split_data(
            X, Y, test_size=self.instances, shuffle=False
        )

        # Split the DataFrames into one list of sequences per column
        self.x_train = [self.x_train[column] for column in self.sequence_cols]
        self.x_val = [self.x_val[column] for column in self.sequence_cols]

        # Convert labels to their numpy representations
        self.y_train = self.y_train.values
        self.y_val = self.y_val.values

        # Pad sequences.
        self.pad_sequences()
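
    # Illustration: after run(), x_train and x_val are each a list of two
    # padded 2-D arrays (one per sequence column) and y_train / y_val are
    # 1-D label arrays, ready to feed a two-input Keras model.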


class Get_Embedding(object):
    def __init__(self, source_lang, target_lang, source_emb, target_emb, word_index):
        self.embedding_size = 300  # Default dimensionality
        self.embedding_matrix = self.create_embed_matrix(
            source_lang, target_lang, source_emb, target_emb, word_index
        )

    def create_embed_matrix(
        self, source_lang, target_lang, source_emb, target_emb, word_index
    ):
        source_vecs = KeyedVectors.load_word2vec_format(source_emb)
        target_vecs = KeyedVectors.load_word2vec_format(target_emb)

        # Prepare the embedding matrix; row 0 stays all-zero for PAD.
        embedding_matrix = np.zeros((len(word_index) + 1, self.embedding_size))

        # Every word has either __<source_lang> or __<target_lang> appended
        for key, i in word_index.items():
            if "__" not in key:
                print("Skipping {}".format(key))
                continue

            # Split on the last "__" so words that themselves contain "__" survive
            word, lang = key.rsplit("__", 1)

            if lang == source_lang:
                if word in source_vecs.vocab:
                    embedding_matrix[i] = source_vecs.word_vec(word)
            else:
                if word in target_vecs.vocab:
                    embedding_matrix[i] = target_vecs.word_vec(word)

        # Free the loaded vectors; only the matrix is kept.
        del source_vecs
        del target_vecs
        return embedding_matrix
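

# A minimal usage sketch (assumptions: the file paths, language codes, and
# TSV layout below are hypothetical; adjust them to your data). Note that
# Get_Embedding expects word_index keys of the form "word__<lang>", so the
# corpus tokens must already carry the language suffix (e.g. "perro__es").
if __name__ == "__main__":
    data = Data(
        source_lang="en",
        target_lang="es",
        data_file="definitions.tsv",  # hypothetical TSV with definition/score columns
        max_len=0,  # 0 = pad every sequence to the longest one seen
        instances=1000,  # number of rows held out for validation
    )
    embedding = Get_Embedding(
        source_lang="en",
        target_lang="es",
        source_emb="wiki.en.vec",  # hypothetical word2vec-format embedding files
        target_emb="wiki.es.vec",
        word_index=data.word_to_id,
    )
    print(embedding.embedding_matrix.shape)  # (len(word_index) + 1, 300)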