author     Yigit Sever    2019-09-25 21:08:18 +0300
committer  Yigit Sever    2019-09-25 21:08:18 +0300
commit     4d117258017fb1518d7ce7242e5a9c5a780d70d7 (patch)
tree       9410d052a4a5870b1270a82a861d5d4f5e355d48 /learn_and_predict.py
parent     442a1895fe567502ec5fec20a62083ea090f38cc (diff)
Include supervised code
Due to how we handle data, supervised approaches are hardcoded
to work on 1000 instances for now
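For context, the 1000-instance limit corresponds to the size of the validation split: the script asserts that every validation label is 1 and that there are exactly 1000 such pairs, and the ranking loop then scores each source definition against all 1000 target definitions. A minimal sketch of that check as a standalone helper (hypothetical, not part of this commit; only numpy is assumed):

import numpy as np

def check_validation_pool(y_val, expected=1000):
    """All validation pairs must be positive and the candidate pool size fixed."""
    counts = np.bincount(y_val.astype(np.int32))
    # Only label 1 may appear in the validation split.
    assert np.nonzero(counts)[0].tolist() == [1], "expected only label-1 pairs"
    # The pool size is hard-coded for now (see the ranking loop in the diff).
    assert counts[1] == expected, "candidate pool size mismatch"
    return int(counts[1])
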
Diffstat (limited to 'learn_and_predict.py')
-rw-r--r--    learn_and_predict.py    202
1 file changed, 202 insertions, 0 deletions
diff --git a/learn_and_predict.py b/learn_and_predict.py
new file mode 100644
index 0000000..36c56f2
--- /dev/null
+++ b/learn_and_predict.py
@@ -0,0 +1,202 @@
import argparse
import csv

import numpy as np

import keras
import keras.backend as K
from Helpers import Data, Get_Embedding
from keras.layers import LSTM, Embedding, Input, Lambda, concatenate
from keras.models import Model

12 | |||
13 | def get_learning_rate(epoch=None, model=None): | ||
14 | return np.round(float(K.get_value(model.optimizer.lr)), 5) | ||
15 | |||
16 | |||
17 | def make_cosine_func(hidden_size=50): | ||
18 | def exponent_neg_cosine_similarity(x): | ||
19 | """ Helper function for the similarity estimate of the LSTMs outputs """ | ||
20 | leftNorm = K.l2_normalize(x[:, :hidden_size], axis=-1) | ||
21 | rightNorm = K.l2_normalize(x[:, hidden_size:], axis=-1) | ||
22 | return K.sum(K.prod([leftNorm, rightNorm], axis=0), axis=1, keepdims=True) | ||
23 | |||
24 | return exponent_neg_cosine_similarity | ||
25 | |||
26 | |||
def main(args):

    source_lang = args.source_lang
    target_lang = args.target_lang
    hidden_size = args.hidden_size
    max_len = args.max_len
    num_iters = args.num_iters
    data_file = args.data_file
    learning_rate = args.learning_rate
    batch = args.batch

    data = Data(source_lang, target_lang, data_file, max_len)

    x_train = data.x_train
    y_train = data.y_train
    x_predict = data.x_val
    y_predict = data.y_val
    vocab_size = data.vocab_size
    max_len = data.max_len

    # Sanity check: the validation split must consist of exactly 1000 aligned
    # (label == 1) pairs, since the ranking loop below assumes a candidate
    # pool of that size.  https://stackoverflow.com/a/10741692/3005749
    x = data.y_val
    y = np.bincount(x.astype(np.int32))
    ii = np.nonzero(y)[0]
    assert ii == 1
    assert y[ii] == 1000  # hardcoded for now

    if not batch:
        print(f"Source Lang: {source_lang}")
        print(f"Target Lang: {target_lang}")
        print(f"Using {len(x_train[0])} pairs to learn")
        print(f"Predicting {len(y_predict)} pairs")
        print(f"Vocabulary size: {vocab_size}")
        print(f"Maximum sequence length: {max_len}")

    source_emb_file = args.source_emb_file
    target_emb_file = args.target_emb_file

    embedding = Get_Embedding(
        source_lang, target_lang, source_emb_file, target_emb_file, data.word_to_id
    )
    embedding_size = embedding.embedding_matrix.shape[1]

    seq_1 = Input(shape=(max_len,), dtype="int32", name="sequence1")
    seq_2 = Input(shape=(max_len,), dtype="int32", name="sequence2")

    # Frozen embedding layer initialised with the pretrained word vectors.
    embed_layer = Embedding(
        output_dim=embedding_size,
        input_dim=vocab_size + 1,
        input_length=max_len,
        trainable=False,
    )
    embed_layer.build((None,))
    embed_layer.set_weights([embedding.embedding_matrix])

    input_1 = embed_layer(seq_1)
    input_2 = embed_layer(seq_2)

    # A single LSTM is shared between both inputs (siamese setup).
    l1 = LSTM(units=hidden_size)

    l1_out = l1(input_1)
    l2_out = l1(input_2)

    concats = concatenate([l1_out, l2_out], axis=-1)

    # Similarity of the two encodings, computed by the cosine helper above.
    out_func = make_cosine_func(hidden_size)

    main_output = Lambda(out_func, output_shape=(1,))(concats)

    model = Model(inputs=[seq_1, seq_2], outputs=[main_output])

    opt = keras.optimizers.Adadelta(lr=learning_rate, clipnorm=1.25)

    model.compile(optimizer=opt, loss="mean_squared_error", metrics=["accuracy"])
    model.summary()

    adjuster = keras.callbacks.ReduceLROnPlateau(
        monitor="val_acc", patience=5, verbose=1, factor=0.5, min_lr=0.0001
    )

    history = model.fit(
        x_train,
        y_train,
        validation_data=(x_predict, y_predict),
        epochs=num_iters,
        batch_size=32,
        verbose=1,
        callbacks=[adjuster],
    )
116 | |||
117 | target_sents = x_predict[1] | ||
118 | precision_at_one = 0 | ||
119 | precision_at_ten = 0 | ||
120 | for index, sent in enumerate(x_predict[0]): | ||
121 | source_sents = np.array([sent] * 1000) | ||
122 | to_predict = [source_sents, target_sents] | ||
123 | preds = model.predict(to_predict) | ||
124 | ind = np.argpartition(preds.ravel(), -10)[-10:] | ||
125 | if index in ind: | ||
126 | precision_at_ten += 1 | ||
127 | if np.argmax(preds.ravel()) == index: | ||
128 | precision_at_one += 1 | ||
129 | |||
    training_samples = len(x_train[0])
    validation_samples = len(y_predict)
    fields = [
        source_lang,
        target_lang,
        training_samples,
        validation_samples,
        precision_at_one,
        precision_at_ten,
    ]

    if not batch:
        print(f"P@1: {precision_at_one/1000}, {precision_at_one} defs")
    else:
        with open("supervised.csv", "a") as f:
            writer = csv.writer(f)
            writer.writerow(fields)

148 | |||
149 | if __name__ == "__main__": | ||
150 | |||
151 | parser = argparse.ArgumentParser() | ||
152 | |||
153 | parser.add_argument( | ||
154 | "-sl", "--source_lang", type=str, help="Source language.", default="english" | ||
155 | ) | ||
156 | parser.add_argument( | ||
157 | "-tl", "--target_lang", type=str, help="Target language.", default="italian" | ||
158 | ) | ||
159 | parser.add_argument("-df", "--data_file", type=str, help="Path to dataset.") | ||
160 | parser.add_argument( | ||
161 | "-es", | ||
162 | "--source_emb_file", | ||
163 | type=str, | ||
164 | help="Path to Source (English) Embedding File.", | ||
165 | ) | ||
166 | parser.add_argument( | ||
167 | "-et", "--target_emb_file", type=str, help="Path to Target Embedding File." | ||
168 | ) | ||
169 | parser.add_argument( | ||
170 | "-l", | ||
171 | "--max_len", | ||
172 | type=int, | ||
173 | help="Maximum number of words in a sentence.", | ||
174 | default=20, | ||
175 | ) | ||
176 | parser.add_argument( | ||
177 | "-z", | ||
178 | "--hidden_size", | ||
179 | type=int, | ||
180 | help="Number of Units in LSTM layer.", | ||
181 | default=50, | ||
182 | ) | ||
183 | parser.add_argument( | ||
184 | "-b", | ||
185 | "--batch", | ||
186 | action="store_true", | ||
187 | help="running in batch (store results to csv) or" | ||
188 | + "running in a single instance (output the results)", | ||
189 | ) | ||
190 | parser.add_argument( | ||
191 | "-n", "--num_iters", type=int, help="Number of iterations/epochs.", default=7 | ||
192 | ) | ||
193 | parser.add_argument( | ||
194 | "-lr", | ||
195 | "--learning_rate", | ||
196 | type=float, | ||
197 | help="Learning rate for optimizer.", | ||
198 | default=1.0, | ||
199 | ) | ||
200 | |||
201 | args = parser.parse_args() | ||
202 | main(args) | ||
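
For reference, a hypothetical way to drive main() directly from Python, mirroring the CLI defaults above (the dataset and embedding paths are placeholders, and Helpers.Data plus the embedding files are assumed to be available):

import argparse
from learn_and_predict import main

args = argparse.Namespace(
    source_lang="english",
    target_lang="italian",
    data_file="data/en_to_it.csv",        # placeholder path
    source_emb_file="embeddings/en.vec",  # placeholder path
    target_emb_file="embeddings/it.vec",  # placeholder path
    max_len=20,
    hidden_size=50,
    num_iters=7,
    learning_rate=1.0,
    batch=False,  # print results instead of appending to supervised.csv
)
main(args)

The same run from the shell would pass the corresponding -df, -es and -et flags to learn_and_predict.py.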