-rw-r--r--  Helpers.py            157
-rw-r--r--  learn_and_predict.py  202
2 files changed, 359 insertions, 0 deletions
diff --git a/Helpers.py b/Helpers.py
new file mode 100644
index 0000000..7c615ab
--- /dev/null
+++ b/Helpers.py
@@ -0,0 +1,157 @@
import itertools

import numpy as np
from sklearn.model_selection import train_test_split as split_data

import pandas as pd
from gensim.models import KeyedVectors
from keras.preprocessing.sequence import pad_sequences


class Data(object):
    def __init__(
        self,
        source_lang,
        target_lang,
        data_file,
        max_len=None,
        instances=1000,
        vocab_limit=None,
        sentence_cols=None,
        score_col=None,
    ):
        self.source_lang = source_lang
        self.target_lang = target_lang
        self.data_file = data_file
        self.max_len = max_len
        self.instances = instances
        self.vocab_size = 1
        self.vocab_limit = vocab_limit

        if sentence_cols is None:
            self.sequence_cols = [
                f"{source_lang} definition",
                f"{target_lang} definition",
            ]
        else:
            self.sequence_cols = sentence_cols

        if score_col is None:
            self.score_col = "is same"
        else:
            self.score_col = score_col

        self.x_train = list()
        self.y_train = list()
        self.x_val = list()
        self.y_val = list()
        self.vocab = {"PAD"}  # set("PAD") would yield the characters {'P', 'A', 'D'}
        self.word_to_id = {"PAD": 0}
        self.id_to_word = {0: "PAD"}
        self.word_to_count = dict()
        self.run()

    def text_to_word_list(self, text):
        """Pre-process a text and convert it into a list of words."""
        text = str(text)
        text = text.split()
        return text

    def load_data(self):
        # Load the data set
        data_df = pd.read_csv(self.data_file, sep="\t")

        # Iterate over the required sequence columns of the provided dataset
        for index, row in data_df.iterrows():
            # Iterate through the text of both definitions of the row
            for sequence in self.sequence_cols:
                s2n = []  # Sequence with words replaced by their indices
                for word in self.text_to_word_list(row[sequence]):
                    if word not in self.vocab:
                        self.vocab.add(word)
                        self.word_to_id[word] = self.vocab_size
                        self.word_to_count[word] = 1
                        s2n.append(self.vocab_size)
                        self.id_to_word[self.vocab_size] = word
                        self.vocab_size += 1
                    else:
                        self.word_to_count[word] += 1
                        s2n.append(self.word_to_id[word])

                # Replace |sequence as words| with |sequence as numbers| representation
                data_df.at[index, sequence] = s2n
        return data_df

    def pad_sequences(self):
        # max_len defaults to None; derive it from the data when it is not given.
        if not self.max_len:
            self.max_len = max(
                max(len(seq) for seq in self.x_train[0]),
                max(len(seq) for seq in self.x_train[1]),
                max(len(seq) for seq in self.x_val[0]),
                max(len(seq) for seq in self.x_val[1]),
            )

        # Zero padding
        for dataset, side in itertools.product([self.x_train, self.x_val], [0, 1]):
            dataset[side] = pad_sequences(dataset[side], maxlen=self.max_len)

    def run(self):
        # Loading data and building vocabulary.
        data_df = self.load_data()

        X = data_df[self.sequence_cols]
        Y = data_df[self.score_col]

        self.x_train, self.x_val, self.y_train, self.y_val = split_data(
            X, Y, test_size=self.instances, shuffle=False
        )

        # Split into one list per sequence column
        self.x_train = [self.x_train[column] for column in self.sequence_cols]
        self.x_val = [self.x_val[column] for column in self.sequence_cols]

        # Convert labels to their numpy representations
        self.y_train = self.y_train.values
        self.y_val = self.y_val.values

        # Padding Sequences.
        self.pad_sequences()


class Get_Embedding(object):
    def __init__(self, source_lang, target_lang, source_emb, target_emb, word_index):
        self.embedding_size = 300  # Default dimensionality
        self.embedding_matrix = self.create_embed_matrix(
            source_lang, target_lang, source_emb, target_emb, word_index
        )

    def create_embed_matrix(
        self, source_lang, target_lang, source_emb, target_emb, word_index
    ):
        source_vecs = KeyedVectors.load_word2vec_format(source_emb)
        target_vecs = KeyedVectors.load_word2vec_format(target_emb)

        # Prepare Embedding Matrix.
        embedding_matrix = np.zeros((len(word_index) + 1, self.embedding_size))

        # Every key is expected to carry a "__<language>" suffix (e.g. "cat__english")
        for key, i in word_index.items():
            if "__" not in key:
                print("Skipping {}".format(key))
                continue

            word, lang = key.split("__")

            if lang == source_lang:
                if word in source_vecs.vocab:
                    embedding_matrix[i] = source_vecs.word_vec(word)
            else:
                if word in target_vecs.vocab:
                    embedding_matrix[i] = target_vecs.word_vec(word)

        del source_vecs
        del target_vecs
        return embedding_matrix
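
A minimal usage sketch of the two helpers above (the file names are placeholders; the TSV is assumed to use the default "<lang> definition" columns, an "is same" label, and tokens suffixed with "__<language>" as Get_Embedding expects):

    from Helpers import Data, Get_Embedding

    # Builds the vocabulary, splits off 1000 validation pairs, and zero-pads both sides.
    data = Data("english", "italian", "definitions.tsv", max_len=20)

    # Row i of the matrix holds the 300-dimensional vector for word index i;
    # words missing from both embedding files stay all-zero.
    embedding = Get_Embedding(
        "english", "italian", "english.vec", "italian.vec", data.word_to_id
    )
    print(embedding.embedding_matrix.shape)  # (data.vocab_size + 1, 300)
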
diff --git a/learn_and_predict.py b/learn_and_predict.py
new file mode 100644
index 0000000..36c56f2
--- /dev/null
+++ b/learn_and_predict.py
@@ -0,0 +1,202 @@
import argparse
import csv

import numpy as np

import keras
import keras.backend as K
from Helpers import Data, Get_Embedding
from keras.layers import LSTM, Embedding, Input, Lambda, concatenate
from keras.models import Model


def get_learning_rate(epoch=None, model=None):
    return np.round(float(K.get_value(model.optimizer.lr)), 5)


def make_cosine_func(hidden_size=50):
    def exponent_neg_cosine_similarity(x):
        """Cosine similarity between the two LSTM outputs packed into the concatenated tensor."""
        leftNorm = K.l2_normalize(x[:, :hidden_size], axis=-1)
        rightNorm = K.l2_normalize(x[:, hidden_size:], axis=-1)
        return K.sum(K.prod([leftNorm, rightNorm], axis=0), axis=1, keepdims=True)

    return exponent_neg_cosine_similarity


def main(args):

    source_lang = args.source_lang
    target_lang = args.target_lang
    hidden_size = args.hidden_size
    max_len = args.max_len
    num_iters = args.num_iters
    data_file = args.data_file
    learning_rate = args.learning_rate
    batch = args.batch

    data = Data(source_lang, target_lang, data_file, max_len)

    x_train = data.x_train
    y_train = data.y_train
    x_predict = data.x_val
    y_predict = data.y_val
    vocab_size = data.vocab_size
    max_len = data.max_len

    # Sanity check: every validation label should be 1, i.e. each source
    # definition is paired with its matching target definition.
    # https://stackoverflow.com/a/10741692/3005749
    x = data.y_val
    y = np.bincount(x.astype(np.int32))
    ii = np.nonzero(y)[0]
    assert ii == 1
    assert y[ii] == 1000  # hardcoded for now

    if not batch:
        print(f"Source Lang: {source_lang}")
        print(f"Target Lang: {target_lang}")
        print(f"Using {len(x_train[0])} pairs to learn")
        print(f"Predicting {len(y_predict)} pairs")
        print(f"Vocabulary size: {vocab_size}")
        print(f"Maximum sequence length: {max_len}")

    source_emb_file = args.source_emb_file
    target_emb_file = args.target_emb_file

    embedding = Get_Embedding(
        source_lang, target_lang, source_emb_file, target_emb_file, data.word_to_id
    )
    embedding_size = embedding.embedding_matrix.shape[1]

    seq_1 = Input(shape=(max_len,), dtype="int32", name="sequence1")
    seq_2 = Input(shape=(max_len,), dtype="int32", name="sequence2")

    # Frozen embedding layer initialised with the pre-trained cross-lingual vectors.
    embed_layer = Embedding(
        output_dim=embedding_size,
        input_dim=vocab_size + 1,
        input_length=max_len,
        trainable=False,
    )
    embed_layer.build((None,))
    embed_layer.set_weights([embedding.embedding_matrix])

    input_1 = embed_layer(seq_1)
    input_2 = embed_layer(seq_2)

    # A single LSTM is shared between both input sequences.
    l1 = LSTM(units=hidden_size)

    l1_out = l1(input_1)
    l2_out = l1(input_2)

    concats = concatenate([l1_out, l2_out], axis=-1)

    out_func = make_cosine_func(hidden_size)

    main_output = Lambda(out_func, output_shape=(1,))(concats)

    model = Model(inputs=[seq_1, seq_2], outputs=[main_output])

    opt = keras.optimizers.Adadelta(lr=learning_rate, clipnorm=1.25)

    model.compile(optimizer=opt, loss="mean_squared_error", metrics=["accuracy"])
    model.summary()

    adjuster = keras.callbacks.ReduceLROnPlateau(
        monitor="val_acc", patience=5, verbose=1, factor=0.5, min_lr=0.0001
    )

    history = model.fit(
        x_train,
        y_train,
        validation_data=(x_predict, y_predict),
        epochs=num_iters,
        batch_size=32,
        verbose=1,
        callbacks=[adjuster],
    )

    # Rank all validation targets for every source definition and count how
    # often the correct target (the one at the same index) is first / in the top ten.
    target_sents = x_predict[1]
    precision_at_one = 0
    precision_at_ten = 0
    for index, sent in enumerate(x_predict[0]):
        source_sents = np.array([sent] * 1000)
        to_predict = [source_sents, target_sents]
        preds = model.predict(to_predict)
        ind = np.argpartition(preds.ravel(), -10)[-10:]
        if index in ind:
            precision_at_ten += 1
        if np.argmax(preds.ravel()) == index:
            precision_at_one += 1

    training_samples = len(x_train[0])
    validation_samples = len(y_predict)
    fields = [
        source_lang,
        target_lang,
        training_samples,
        validation_samples,
        precision_at_one,
        precision_at_ten,
    ]

    if not batch:
        print(f"P@1: {precision_at_one/1000}, {precision_at_one} defs")
    else:
        with open("supervised.csv", "a") as f:
            writer = csv.writer(f)
            writer.writerow(fields)


if __name__ == "__main__":

    parser = argparse.ArgumentParser()

    parser.add_argument(
        "-sl", "--source_lang", type=str, help="Source language.", default="english"
    )
    parser.add_argument(
        "-tl", "--target_lang", type=str, help="Target language.", default="italian"
    )
    parser.add_argument("-df", "--data_file", type=str, help="Path to dataset.")
    parser.add_argument(
        "-es",
        "--source_emb_file",
        type=str,
        help="Path to source language embedding file.",
    )
    parser.add_argument(
        "-et", "--target_emb_file", type=str, help="Path to target language embedding file."
    )
    parser.add_argument(
        "-l",
        "--max_len",
        type=int,
        help="Maximum number of words in a sentence.",
        default=20,
    )
    parser.add_argument(
        "-z",
        "--hidden_size",
        type=int,
        help="Number of units in the LSTM layer.",
        default=50,
    )
    parser.add_argument(
        "-b",
        "--batch",
        action="store_true",
        help="running in batch (store results to csv) or "
        + "running in a single instance (output the results)",
    )
    parser.add_argument(
        "-n", "--num_iters", type=int, help="Number of iterations/epochs.", default=7
    )
    parser.add_argument(
        "-lr",
        "--learning_rate",
        type=float,
        help="Learning rate for optimizer.",
        default=1.0,
    )

    args = parser.parse_args()
    main(args)
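
For a single run that prints its results, the script can be invoked along these lines (the data and embedding paths are placeholders):

    python learn_and_predict.py -sl english -tl italian \
        -df data/definitions.tsv \
        -es embeddings/english.vec -et embeddings/italian.vec \
        -l 20 -z 50 -n 7 -lr 1.0

Adding -b appends the language pair, sample counts, and precision figures to supervised.csv instead of printing them.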