Diffstat (limited to 'learn_and_predict.py')
 learn_and_predict.py | 202 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 202 insertions(+), 0 deletions(-)
diff --git a/learn_and_predict.py b/learn_and_predict.py
new file mode 100644
index 0000000..36c56f2
--- /dev/null
+++ b/learn_and_predict.py
@@ -0,0 +1,202 @@
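"""Train a Siamese LSTM that scores bilingual sentence pairs by cosine similarity.

Both input sentences go through a shared, frozen Embedding layer (initialised
from pretrained embedding files) and a shared LSTM; the cosine similarity of the
two encodings is trained against the pair labels with mean squared error, and
the validation pairs are scored with precision@1 and precision@10.

Example invocation (all file paths are placeholders):

    python learn_and_predict.py -sl english -tl italian -df data.csv -es source.vec -et target.vec
"""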
import argparse
import csv

import numpy as np

import keras
import keras.backend as K
from Helpers import Data, Get_Embedding
from keras.layers import LSTM, Embedding, Input, Lambda, concatenate
from keras.models import Model


def get_learning_rate(epoch=None, model=None):
    # Return the optimizer's current learning rate, rounded to five decimal places.
    return np.round(float(K.get_value(model.optimizer.lr)), 5)


def make_cosine_func(hidden_size=50):
    def exponent_neg_cosine_similarity(x):
        """Similarity estimate for the two LSTMs' outputs.

        `x` is the concatenation of both sentence encodings (left half: first
        sequence, right half: second sequence). Despite the name, this computes
        a plain cosine similarity; no exponentiation is applied.
        """
        leftNorm = K.l2_normalize(x[:, :hidden_size], axis=-1)
        rightNorm = K.l2_normalize(x[:, hidden_size:], axis=-1)
        return K.sum(K.prod([leftNorm, rightNorm], axis=0), axis=1, keepdims=True)

    return exponent_neg_cosine_similarity


def main(args):

    source_lang = args.source_lang
    target_lang = args.target_lang
    hidden_size = args.hidden_size
    max_len = args.max_len
    num_iters = args.num_iters
    data_file = args.data_file
    learning_rate = args.learning_rate
    batch = args.batch

    data = Data(source_lang, target_lang, data_file, max_len)

    x_train = data.x_train
    y_train = data.y_train
    x_predict = data.x_val
    y_predict = data.y_val
    vocab_size = data.vocab_size
    max_len = data.max_len

    # Sanity check on the validation labels (bincount recipe:
    # https://stackoverflow.com/a/10741692/3005749): every label must be 1 and
    # there must be exactly 1000 validation pairs (hardcoded for now).
    x = data.y_val
    y = np.bincount(x.astype(np.int32))
    ii = np.nonzero(y)[0]
    assert ii == 1
    assert y[ii] == 1000

    if not batch:
        print(f"Source Lang: {source_lang}")
        print(f"Target Lang: {target_lang}")
        print(f"Using {len(x_train[0])} pairs to learn")
        print(f"Predicting {len(y_predict)} pairs")
        print(f"Vocabulary size: {vocab_size}")
        print(f"Maximum sequence length: {max_len}")

    source_emb_file = args.source_emb_file
    target_emb_file = args.target_emb_file

    embedding = Get_Embedding(
        source_lang, target_lang, source_emb_file, target_emb_file, data.word_to_id
    )
    embedding_size = embedding.embedding_matrix.shape[1]

    seq_1 = Input(shape=(max_len,), dtype="int32", name="sequence1")
    seq_2 = Input(shape=(max_len,), dtype="int32", name="sequence2")

    # Shared embedding layer, initialised from the pretrained embedding matrix
    # and kept frozen during training.
    embed_layer = Embedding(
        output_dim=embedding_size,
        input_dim=vocab_size + 1,
        input_length=max_len,
        trainable=False,
    )
    embed_layer.build((None,))
    embed_layer.set_weights([embedding.embedding_matrix])

    input_1 = embed_layer(seq_1)
    input_2 = embed_layer(seq_2)

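    # Both sentences are encoded by the same LSTM instance, so the two branches
    # share weights (a Siamese setup); the encodings are then concatenated and
    # scored by the cosine-similarity Lambda built from make_cosine_func.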
    l1 = LSTM(units=hidden_size)

    l1_out = l1(input_1)
    l2_out = l1(input_2)

    concats = concatenate([l1_out, l2_out], axis=-1)

    out_func = make_cosine_func(hidden_size)

    main_output = Lambda(out_func, output_shape=(1,))(concats)

    model = Model(inputs=[seq_1, seq_2], outputs=[main_output])

    opt = keras.optimizers.Adadelta(lr=learning_rate, clipnorm=1.25)

    model.compile(optimizer=opt, loss="mean_squared_error", metrics=["accuracy"])
    model.summary()

    adjuster = keras.callbacks.ReduceLROnPlateau(
        monitor="val_acc", patience=5, verbose=1, factor=0.5, min_lr=0.0001
    )

    history = model.fit(
        x_train,
        y_train,
        validation_data=(x_predict, y_predict),
        epochs=num_iters,
        batch_size=32,
        verbose=1,
        callbacks=[adjuster],
    )

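    # Evaluate retrieval over the validation set: every source sentence is
    # scored against all 1000 target sentences (the assert above pins that
    # count). precision_at_one counts pairs whose true target (same index)
    # ranks first; precision_at_ten counts pairs whose true target lands in
    # the top ten.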
    target_sents = x_predict[1]
    precision_at_one = 0
    precision_at_ten = 0
    for index, sent in enumerate(x_predict[0]):
        source_sents = np.array([sent] * 1000)
        to_predict = [source_sents, target_sents]
        preds = model.predict(to_predict)
        ind = np.argpartition(preds.ravel(), -10)[-10:]
        if index in ind:
            precision_at_ten += 1
        if np.argmax(preds.ravel()) == index:
            precision_at_one += 1

    training_samples = len(x_train[0])
    validation_samples = len(y_predict)
    fields = [
        source_lang,
        target_lang,
        training_samples,
        validation_samples,
        precision_at_one,
        precision_at_ten,
    ]

    if not batch:
        print(f"P@1: {precision_at_one/1000}, {precision_at_one} defs")
    else:
        with open("supervised.csv", "a") as f:
            writer = csv.writer(f)
            writer.writerow(fields)


if __name__ == "__main__":

    parser = argparse.ArgumentParser()

    parser.add_argument(
        "-sl", "--source_lang", type=str, help="Source language.", default="english"
    )
    parser.add_argument(
        "-tl", "--target_lang", type=str, help="Target language.", default="italian"
    )
    parser.add_argument("-df", "--data_file", type=str, help="Path to dataset.")
    parser.add_argument(
        "-es",
        "--source_emb_file",
        type=str,
        help="Path to the source-language embedding file.",
    )
    parser.add_argument(
        "-et",
        "--target_emb_file",
        type=str,
        help="Path to the target-language embedding file.",
    )
    parser.add_argument(
        "-l",
        "--max_len",
        type=int,
        help="Maximum number of words in a sentence.",
        default=20,
    )
    parser.add_argument(
        "-z",
        "--hidden_size",
        type=int,
        help="Number of units in the LSTM layer.",
        default=50,
    )
    parser.add_argument(
        "-b",
        "--batch",
        action="store_true",
        help="Run in batch mode (append results to supervised.csv) "
        "instead of printing them to stdout.",
    )
    parser.add_argument(
        "-n", "--num_iters", type=int, help="Number of iterations/epochs.", default=7
    )
    parser.add_argument(
        "-lr",
        "--learning_rate",
        type=float,
        help="Learning rate for optimizer.",
        default=1.0,
    )

    args = parser.parse_args()
    main(args)