import itertools

import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split as split_data


class Data(object):
    """Loads a tab-separated file of sentence pairs, builds a shared
    vocabulary, and produces padded, index-encoded train/validation splits."""

    def __init__(
        self,
        source_lang,
        target_lang,
        data_file,
        max_len=None,
        instances=1000,
        vocab_limit=None,
        sentence_cols=None,
        score_col=None,
    ):
        self.source_lang = source_lang
        self.target_lang = target_lang
        self.data_file = data_file
        self.max_len = max_len
        self.instances = instances
        self.vocab_size = 1  # Index 0 is reserved for the PAD token.
        self.vocab_limit = vocab_limit  # Note: not currently enforced below.

        if sentence_cols is None:
            self.sequence_cols = [
                f"{source_lang} definition",
                f"{target_lang} definition",
            ]
        else:
            self.sequence_cols = sentence_cols

        if score_col is None:
            self.score_col = "is same"
        else:
            self.score_col = score_col

        self.x_train = list()
        self.y_train = list()
        self.x_val = list()
        self.y_val = list()

        # set("PAD") would build the character set {"P", "A", "D"};
        # we want a set containing the single token "PAD".
        self.vocab = {"PAD"}
        self.word_to_id = {"PAD": 0}
        self.id_to_word = {0: "PAD"}
        self.word_to_count = dict()

        self.run()

    def text_to_word_list(self, text):
        """Preprocess a text and convert it to a list of words."""
        text = str(text)
        return text.split()

    def load_data(self):
        # Load the data set.
        data_df = pd.read_csv(self.data_file, sep="\t")

        # Iterate over the required sequence columns of the dataset.
        for index, row in data_df.iterrows():
            # Walk through the text of both sentence columns of the row.
            for sequence in self.sequence_cols:
                s2n = []  # Sequence with words replaced by their indices.
                for word in self.text_to_word_list(row[sequence]):
                    if word not in self.vocab:
                        self.vocab.add(word)
                        self.word_to_id[word] = self.vocab_size
                        self.word_to_count[word] = 1
                        s2n.append(self.vocab_size)
                        self.id_to_word[self.vocab_size] = word
                        self.vocab_size += 1
                    else:
                        self.word_to_count[word] += 1
                        s2n.append(self.word_to_id[word])
                # Replace the |sequence as words| with its
                # |sequence as numbers| representation.
                data_df.at[index, sequence] = s2n

        return data_df
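
    # Illustration (hypothetical tokens): if a cell holds the text
    # "rose__english flower__english" and neither token has been seen yet,
    # load_data() assigns them the next free indices, so the cell becomes
    # [1, 2] and word_to_id grows to {"PAD": 0, "rose__english": 1,
    # "flower__english": 2}.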

    def pad_sequences(self):
        # max_len == 0 means "pad everything to the longest sequence found
        # in any split"; max_len is None means "pad each side to its own
        # longest sequence".
        if self.max_len == 0:
            self.max_len = max(
                max(len(seq) for seq in self.x_train[0]),
                max(len(seq) for seq in self.x_train[1]),
                max(len(seq) for seq in self.x_val[0]),
                max(len(seq) for seq in self.x_val[1]),
            )

        # Zero-pad both sides of each split.
        for dataset, side in itertools.product([self.x_train, self.x_val], [0, 1]):
            if self.max_len:
                dataset[side] = pad_sequences(dataset[side], maxlen=self.max_len)
            else:
                dataset[side] = pad_sequences(dataset[side])

    def run(self):
        # Load the data and build the vocabulary.
        data_df = self.load_data()

        X = data_df[self.sequence_cols]
        Y = data_df[self.score_col]

        # Hold out `instances` rows for validation, preserving row order.
        self.x_train, self.x_val, self.y_train, self.y_val = split_data(
            X, Y, test_size=self.instances, shuffle=False
        )

        # Split the DataFrames into one list entry per sentence column.
        self.x_train = [self.x_train[column] for column in self.sequence_cols]
        self.x_val = [self.x_val[column] for column in self.sequence_cols]

        # Convert labels to their NumPy representations.
        self.y_train = self.y_train.values
        self.y_val = self.y_val.values

        # Pad the sequences.
        self.pad_sequences()
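
# What Data exposes after run(): x_train and x_val are each a list of two
# NumPy int arrays (one per sentence column), padded to max_len when it is
# set; y_train and y_val are 1-D label arrays; word_to_id is the vocabulary
# mapping that Get_Embedding below consumes.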


class Get_Embedding(object):
    """Builds a single embedding matrix from pretrained word vectors for
    the source and the target language."""

    def __init__(self, source_lang, target_lang, source_emb, target_emb, word_index):
        self.embedding_size = 300  # Default dimensionality of the vectors.
        self.embedding_matrix = self.create_embed_matrix(
            source_lang, target_lang, source_emb, target_emb, word_index
        )

    def create_embed_matrix(
        self, source_lang, target_lang, source_emb, target_emb, word_index
    ):
        source_vecs = KeyedVectors.load_word2vec_format(source_emb)
        target_vecs = KeyedVectors.load_word2vec_format(target_emb)

        # Prepare the embedding matrix; row 0 stays all-zero for PAD.
        embedding_matrix = np.zeros((len(word_index) + 1, self.embedding_size))

        # Every word has either __<source_lang> or __<target_lang> appended.
        for key, i in word_index.items():
            if "__" not in key:
                print("Skipping {}".format(key))
                continue
            # Split on the last "__" so words that themselves contain
            # underscores still parse correctly.
            word, lang = key.rsplit("__", 1)
            if lang == source_lang:
                if word in source_vecs.vocab:
                    embedding_matrix[i] = source_vecs.word_vec(word)
            else:
                if word in target_vecs.vocab:
                    embedding_matrix[i] = target_vecs.word_vec(word)

        # Free the loaded vectors; only the matrix is kept.
        del source_vecs
        del target_vecs

        return embedding_matrix
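

# A minimal end-to-end sketch (hypothetical file names and languages; the
# .vec files are word2vec-format embeddings for each language, and the
# tokens in the TSV are assumed to carry "__<lang>" suffixes as expected
# by Get_Embedding above):
if __name__ == "__main__":
    data = Data(
        source_lang="english",
        target_lang="hindi",
        data_file="definitions.tsv",
        max_len=0,
    )
    embedding = Get_Embedding(
        source_lang="english",
        target_lang="hindi",
        source_emb="english.vec",
        target_emb="hindi.vec",
        word_index=data.word_to_id,
    )
    print("Embedding matrix shape:", embedding.embedding_matrix.shape)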