-rw-r--r--  learn_and_predict.py  78
1 file changed, 52 insertions(+), 26 deletions(-)
diff --git a/learn_and_predict.py b/learn_and_predict.py
index 4c094d7..907d4be 100644
--- a/learn_and_predict.py
+++ b/learn_and_predict.py
@@ -4,10 +4,11 @@ import csv
 import keras
 import keras.backend as K
 import numpy as np
+from Helpers import Data, Get_Embedding
 from keras.layers import LSTM, Embedding, Input, Lambda, concatenate
 from keras.models import Model
-
-from Helpers import Data, Get_Embedding
+from lapjv import lapjv
+from sklearn.model_selection import train_test_split
 
 
 def get_learning_rate(epoch=None, model=None):
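Both new imports are third-party dependencies: lapjv is the src-d package implementing the Jonker-Volgenant linear-assignment solver used further down, and train_test_split comes from scikit-learn. Assuming a standard Python setup, both install with pip install lapjv scikit-learn.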
@@ -39,8 +40,8 @@ def main(args):
 
     x_train = data.x_train
     y_train = data.y_train
-    x_predict = data.x_val
-    y_predict = data.y_val
+    x_test = data.x_val
+    y_test = data.y_val
     vocab_size = data.vocab_size
     max_len = data.max_len
 
@@ -49,13 +50,23 @@ def main(args):
         y = np.bincount(x.astype(np.int32))
         ii = np.nonzero(y)[0]
         assert ii == 1
-        assert y[ii] == 1000  # hardcoded for now
+        assert y[ii] == 1000  # hardcoded, sorry
+
+    # separating the train-test set and the set used for our task
+    (
+        source_train,
+        source_validate,
+        target_train,
+        target_validate,
+        Y_train,
+        Y_validate,
+    ) = train_test_split(x_train[0], x_train[1], y_train)
 
     if not batch:
         print(f"Source Lang: {source_lang}")
         print(f"Target Lang: {target_lang}")
         print(f"Using {len(x_train[0])} pairs to learn")
-        print(f"Predicting {len(y_predict)} pairs")
+        print(f"Predicting {len(Y_validate)} pairs")
         print(f"Vocabulary size: {vocab_size}")
         print(f"Maximum sequence length: 16,304")
 
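Called with neither test_size nor random_state, the train_test_split call above uses scikit-learn's defaults: a 75/25 split, reshuffled on every run. A minimal self-contained sketch of the same call with stand-in arrays and a pinned seed (the array contents and the seed are illustrative, not from the patch):

import numpy as np
from sklearn.model_selection import train_test_split

# Stand-ins for x_train[0], x_train[1] and y_train: 1000 aligned pairs.
src = np.arange(1000)
tgt = np.arange(1000)
labels = np.ones(1000)

# Three parallel arrays in, six arrays out, in the same order as the patch.
(
    source_train,
    source_validate,
    target_train,
    target_validate,
    Y_train,
    Y_validate,
) = train_test_split(src, tgt, labels, test_size=0.25, random_state=42)

assert len(source_validate) == 250  # the default split holds out 25%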
@@ -82,7 +93,7 @@ def main(args):
     input_1 = embed_layer(seq_1)
     input_2 = embed_layer(seq_2)
 
-    l1 = LSTM(units=hidden_size)
+    l1 = LSTM(units=300, use_bias=True, dropout=0.05, recurrent_dropout=0.1)
 
     l1_out = l1(input_1)
     l2_out = l1(input_2)
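The single LSTM instance l1 is applied to both embedded inputs, so the two branches share one set of weights, a siamese encoder. A short sketch of the distinction, with vocabulary size and sequence length assumed for illustration:

from keras.layers import Embedding, Input, LSTM

# Shapes here are illustrative only, not taken from the patch.
seq_1 = Input(shape=(40,))
seq_2 = Input(shape=(40,))
embed_layer = Embedding(input_dim=20000, output_dim=300)

l1 = LSTM(units=300, use_bias=True, dropout=0.05, recurrent_dropout=0.1)

# One layer object called twice: both branches reuse the same weights,
# so source and target sentences pass through a single shared encoder.
# Constructing LSTM(...) a second time would instead train two
# independent encoders.
l1_out = l1(embed_layer(seq_1))
l2_out = l1(embed_layer(seq_2))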
@@ -95,7 +106,7 @@ def main(args):
 
     model = Model(inputs=[seq_1, seq_2], outputs=[main_output])
 
-    opt = keras.optimizers.Adadelta(lr=learning_rate, clipnorm=1.25)
+    opt = keras.optimizers.Adadelta(lr=learning_rate, clipnorm=1.2)
 
     model.compile(optimizer=opt, loss="mean_squared_error", metrics=["accuracy"])
     model.summary()
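clipnorm=1.2 rescales any gradient tensor whose L2 norm exceeds 1.2 before Adadelta applies its update. Roughly, per gradient tensor, Keras does the following (a NumPy sketch of the rule, not the actual Keras code):

import numpy as np

def clip_by_norm(grad, clipnorm=1.2):
    # Keras clips each gradient tensor separately: small gradients pass
    # through, large ones are shrunk onto the L2 ball of radius clipnorm.
    norm = np.sqrt(np.sum(np.square(grad)))
    if norm <= clipnorm:
        return grad
    return grad * (clipnorm / norm)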
@@ -104,45 +115,60 @@ def main(args):
         monitor="val_accuracy", patience=5, verbose=1, factor=0.5, min_lr=0.0001
     )
 
-    history = model.fit(
-        x_train,
-        y_train,
-        validation_data=(x_predict, y_predict),
+    model.fit(
+        x=[source_train, target_train],
+        y=Y_train,
+        validation_data=([source_validate, target_validate], Y_validate),
         epochs=num_iters,
         batch_size=32,
-        verbose=1,
+        verbose=0,
         callbacks=[adjuster],
     )
 
-    target_sents = x_predict[1]
+    cost_matrix = None
+
+    target_sents = x_test[1]
     precision_at_one = 0
-    precision_at_ten = 0
-    for index, sent in enumerate(x_predict[0]):
+    for index, sent in enumerate(x_test[0]):
         source_sents = np.array([sent] * 1000)
         to_predict = [source_sents, target_sents]
         preds = model.predict(to_predict)
-        ind = np.argpartition(preds.ravel(), -10)[-10:]
-        if index in ind:
-            precision_at_ten += 1
+
+        if index == 0:
+            cost_matrix = np.c_[preds]
+        else:
+            cost_matrix = np.c_[cost_matrix, preds]
+
         if np.argmax(preds.ravel()) == index:
             precision_at_one += 1
 
-    training_samples = len(x_train[0])
-    validation_samples = len(y_predict)
+    cost_matrix = cost_matrix * -1000
+
+    row_ind, col_ind, a = lapjv(cost_matrix, verbose=False)
+
+    result = zip(row_ind, col_ind)
+    hit_one = len([x for x, y in result if x == y])
+
+    matching_pa1 = hit_one / 1000 * 100
+
+    training_samples = len(Y_train)
+    validation_samples = len(Y_validate)
+    test_samples = len(y_test)
     fields = [
         source_lang,
         target_lang,
         training_samples,
         validation_samples,
+        test_samples,
         precision_at_one,
-        precision_at_ten,
+        matching_pa1,
     ]
 
     if not batch:
         print(f"Supervised Retrieval {source_lang} - {target_lang}")
         print(f"P@1: {precision_at_one/1000}")
     else:
-        with open("supervised.csv", "a") as f:
+        with open("supervised_matching.csv", "a") as f:
             writer = csv.writer(f)
             writer.writerow(fields)
 
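Each loop iteration scores one source sentence against all 1,000 targets, and np.c_ stacks those score vectors as columns of a 1000x1000 similarity matrix. Multiplying by -1000 negates it into a cost matrix (and scales the scores up), so lapjv can solve the linear assignment problem: the one-to-one source-to-target matching with minimum total cost. Pair i is the gold alignment, so matching P@1 is the fraction of indices the solver assigns to themselves. A self-contained sketch, with random scores standing in for the model.predict outputs and assuming src-d's lapjv, where row_ind[i] is the column matched to row i:

import numpy as np
from lapjv import lapjv

n = 1000
sims = np.random.rand(n, n)  # stand-in for the stacked model.predict scores

cost_matrix = sims * -1000   # higher similarity -> lower assignment cost
row_ind, col_ind, _ = lapjv(cost_matrix, verbose=False)

# The gold alignment is the diagonal, so a correct match is an index
# the solver assigns to itself.
hit_one = int(np.sum(row_ind == np.arange(n)))
matching_pa1 = hit_one / n * 100
print(f"matching P@1: {matching_pa1:.1f}%")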
@@ -186,7 +212,7 @@ if __name__ == "__main__":
         "--hidden_size",
         type=int,
         help="Number of units in LSTM layer.",
-        default=50,
+        default=300,
     )
     parser.add_argument(
         "-b",
@@ -196,14 +222,14 @@ if __name__ == "__main__":
         + "running in a single instance (output the results)",
     )
     parser.add_argument(
-        "-n", "--num_iters", type=int, help="Number of iterations/epochs.", default=7
+        "-n", "--num_iters", type=int, help="Number of iterations/epochs.", default=30
     )
     parser.add_argument(
         "-lr",
         "--learning_rate",
         type=float,
         help="Learning rate for optimizer.",
-        default=1.0,
+        default=0.5,
     )
 
     args = parser.parse_args()
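With the new defaults, a run equivalent to them can be spelled out explicitly; only the flags visible in this patch appear below, and the script's other arguments are outside this diff:

python learn_and_predict.py --hidden_size 300 --num_iters 30 -lr 0.5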