Diffstat (limited to 'Helpers.py')
-rw-r--r--  Helpers.py  157
1 file changed, 157 insertions(+), 0 deletions(-)
diff --git a/Helpers.py b/Helpers.py
new file mode 100644
index 0000000..7c615ab
--- /dev/null
+++ b/Helpers.py
@@ -0,0 +1,157 @@
import itertools

import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split as split_data


class Data(object):
    def __init__(
        self,
        source_lang,
        target_lang,
        data_file,
        max_len=None,
        instances=1000,
        vocab_limit=None,
        sentence_cols=None,
        score_col=None,
    ):
        self.source_lang = source_lang
        self.target_lang = target_lang
        self.data_file = data_file
        self.max_len = max_len
        self.instances = instances
        self.vocab_size = 1  # index 0 is reserved for the PAD token
        self.vocab_limit = vocab_limit

        if sentence_cols is None:
            self.sequence_cols = [
                f"{source_lang} definition",
                f"{target_lang} definition",
            ]
        else:
            self.sequence_cols = sentence_cols

        if score_col is None:
            self.score_col = "is same"
        else:
            self.score_col = score_col

        self.x_train = list()
        self.y_train = list()
        self.x_val = list()
        self.y_val = list()
        self.vocab = {"PAD"}  # note: set("PAD") would yield {'P', 'A', 'D'}
        self.word_to_id = {"PAD": 0}
        self.id_to_word = {0: "PAD"}
        self.word_to_count = dict()
        self.run()

    def text_to_word_list(self, text):
        """Preprocess text and convert it to a list of words."""
        text = str(text)
        text = text.split()
        return text

    def load_data(self):
        # Load the data set
        data_df = pd.read_csv(self.data_file, sep="\t")

        # Iterate over the required sequence columns of the provided data set
        for index, row in data_df.iterrows():
            # Iterate through the text of both sentence columns of the row
            for sequence in self.sequence_cols:
                s2n = []  # Sequence with words replaced by their indices
                for word in self.text_to_word_list(row[sequence]):
                    if word not in self.vocab:
                        self.vocab.add(word)
                        self.word_to_id[word] = self.vocab_size
                        self.word_to_count[word] = 1
                        s2n.append(self.vocab_size)
                        self.id_to_word[self.vocab_size] = word
                        self.vocab_size += 1
                    else:
                        self.word_to_count[word] += 1
                        s2n.append(self.word_to_id[word])

                # Replace the word sequence with its index representation
                data_df.at[index, sequence] = s2n
        return data_df

    def pad_sequences(self):
        # Without an explicit max_len, pad to the longest sequence seen
        # across both sides of the training and validation sets, so that
        # all four input lists share a consistent shape.
        if not self.max_len:
            self.max_len = max(
                max(len(seq) for seq in self.x_train[0]),
                max(len(seq) for seq in self.x_train[1]),
                max(len(seq) for seq in self.x_val[0]),
                max(len(seq) for seq in self.x_val[1]),
            )

        # Zero padding (keras pad_sequences prepends zeros by default)
        for dataset, side in itertools.product([self.x_train, self.x_val], [0, 1]):
            dataset[side] = pad_sequences(dataset[side], maxlen=self.max_len)

    def run(self):
        # Loading data and building vocabulary.
        data_df = self.load_data()

        X = data_df[self.sequence_cols]
        Y = data_df[self.score_col]

        self.x_train, self.x_val, self.y_train, self.y_val = split_data(
            X, Y, test_size=self.instances, shuffle=False
        )

        # Split to lists
        self.x_train = [self.x_train[column] for column in self.sequence_cols]
        self.x_val = [self.x_val[column] for column in self.sequence_cols]

        # Convert labels to their numpy representations
        self.y_train = self.y_train.values
        self.y_val = self.y_val.values

        # Padding sequences.
        self.pad_sequences()


class Get_Embedding(object):
    def __init__(self, source_lang, target_lang, source_emb, target_emb, word_index):
        self.embedding_size = 300  # Default dimensionality
        self.embedding_matrix = self.create_embed_matrix(
            source_lang, target_lang, source_emb, target_emb, word_index
        )

    def create_embed_matrix(
        self, source_lang, target_lang, source_emb, target_emb, word_index
    ):
        # Note: .vocab and .word_vec() are the gensim < 4.0 API; gensim 4.x
        # replaces them with .key_to_index and .get_vector().
        source_vecs = KeyedVectors.load_word2vec_format(source_emb)
        target_vecs = KeyedVectors.load_word2vec_format(target_emb)

        # Prepare the embedding matrix; row 0 stays all-zero for PAD.
        embedding_matrix = np.zeros((len(word_index) + 1, self.embedding_size))

        # Each key is expected to carry a "__<lang>" suffix marking its language
        for key, i in word_index.items():
            if "__" not in key:
                print("Skipping {}".format(key))
                continue

            word, lang = key.split("__")

            if lang == source_lang:
                if word in source_vecs.vocab:
                    embedding_matrix[i] = source_vecs.word_vec(word)
            else:
                if word in target_vecs.vocab:
                    embedding_matrix[i] = target_vecs.word_vec(word)

        # Free the loaded vectors once the matrix is built
        del source_vecs
        del target_vecs
        return embedding_matrix
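
For orientation, below is a minimal usage sketch (not part of the committed file) showing how the two helpers might be wired together. All file names and language labels are illustrative assumptions. One seam to be aware of: Data stores bare words in word_to_id, while Get_Embedding.create_embed_matrix expects keys of the form <word>__<lang>, so the suffixing step below is purely hypothetical.

# Hypothetical usage sketch; paths, languages, and the suffixing step
# are assumptions for illustration only.
data = Data(
    source_lang="en",
    target_lang="fr",
    data_file="definitions.tsv",  # assumed tab-separated input file
    max_len=50,
)

# Data stores bare words; Get_Embedding expects "<word>__<lang>" keys.
# A real pipeline would need per-word language information -- here every
# word is naively tagged as source-language just to show the shapes.
word_index = {f"{w}__en": i for w, i in data.word_to_id.items()}

emb = Get_Embedding("en", "fr", "wiki.en.vec", "wiki.fr.vec", word_index)
print(emb.embedding_matrix.shape)  # (len(word_index) + 1, 300)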
