| author | Yigit Sever | 2020-07-03 00:19:30 +0300 |
|---|---|---|
| committer | Yigit Sever | 2020-07-03 00:19:30 +0300 |
| commit | 28cc011c31b0dde05eddb5ea2d170e8c0c7fa78a | (patch) |
| tree | 386c47c4fc3ee768dbdb50ee071689128d6617d1 | |
| parent | ddd2c5349617ac01afef4758a2418ba512bd9ab0 | (diff) |
Clean supervised matching
| -rw-r--r-- | learn_and_predict.py | 78 |
1 file changed, 52 insertions(+), 26 deletions(-)
```diff
diff --git a/learn_and_predict.py b/learn_and_predict.py
index 4c094d7..907d4be 100644
--- a/learn_and_predict.py
+++ b/learn_and_predict.py
@@ -4,10 +4,11 @@ import csv
 import keras
 import keras.backend as K
 import numpy as np
+from Helpers import Data, Get_Embedding
 from keras.layers import LSTM, Embedding, Input, Lambda, concatenate
 from keras.models import Model
-
-from Helpers import Data, Get_Embedding
+from lapjv import lapjv
+from sklearn.model_selection import train_test_split
 
 
 def get_learning_rate(epoch=None, model=None):
@@ -39,8 +40,8 @@ def main(args):
 
     x_train = data.x_train
     y_train = data.y_train
-    x_predict = data.x_val
-    y_predict = data.y_val
+    x_test = data.x_val
+    y_test = data.y_val
     vocab_size = data.vocab_size
     max_len = data.max_len
 
@@ -49,13 +50,23 @@ def main(args):
         y = np.bincount(x.astype(np.int32))
         ii = np.nonzero(y)[0]
         assert ii == 1
-        assert y[ii] == 1000  # hardcoded for now
+        assert y[ii] == 1000  # hardcoded, sorry
+
+    # separating the train-test set and the set used for our task
+    (
+        source_train,
+        source_validate,
+        target_train,
+        target_validate,
+        Y_train,
+        Y_validate,
+    ) = train_test_split(x_train[0], x_train[1], y_train)
 
     if not batch:
         print(f"Source Lang: {source_lang}")
         print(f"Target Lang: {target_lang}")
         print(f"Using {len(x_train[0])} pairs to learn")
-        print(f"Predicting {len(y_predict)} pairs")
+        print(f"Predicting {len(Y_validate)} pairs")
         print(f"Vocabulary size: {vocab_size}")
         print(f"Maximum sequence length: 18,064")
 
```
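For readers unfamiliar with the multi-array form of `train_test_split`: passing three parallel arrays shuffles them with a single permutation and returns six pieces, interleaved as (train, test) for each input, which is exactly the unpacking order in the hunk above. A minimal sketch (the data is made up; the 75/25 ratio is scikit-learn's default, since the commit passes no `test_size`):

```python
import numpy as np
from sklearn.model_selection import train_test_split

# Hypothetical stand-ins for the padded source/target sequences and labels.
source = np.arange(8)
target = np.arange(8) + 100
labels = np.ones(8)

# One shared shuffle keeps the three arrays aligned pair-wise;
# the default test_size=0.25 puts 6 rows in train and 2 in validation.
src_tr, src_val, tgt_tr, tgt_val, y_tr, y_val = train_test_split(
    source, target, labels
)
assert len(src_tr) == len(tgt_tr) == len(y_tr) == 6
assert (tgt_tr == src_tr + 100).all()  # pairs stayed aligned
```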
```diff
@@ -82,7 +93,7 @@ def main(args):
     input_1 = embed_layer(seq_1)
     input_2 = embed_layer(seq_2)
 
-    l1 = LSTM(units=hidden_size)
+    l1 = LSTM(units=300, use_bias=True, dropout=0.05, recurrent_dropout=0.1)
 
     l1_out = l1(input_1)
     l2_out = l1(input_2)
```
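Two things happen in this one-line change: `units=hidden_size` becomes a hardcoded `units=300` (so the `--hidden_size` flag, whose default is bumped to 300 further down, no longer reaches this layer), and dropout is added. The surrounding context lines also show the siamese structure: a single `LSTM` instance is called on both embedded inputs, so the two branches share one set of weights. A minimal sketch of that weight sharing in the same Keras functional style (the input and vocabulary sizes below are illustrative, not taken from the repo):

```python
from keras.layers import LSTM, Embedding, Input
from keras.models import Model

seq_1 = Input(shape=(60,), dtype="int32")  # illustrative max_len
seq_2 = Input(shape=(60,), dtype="int32")

embed = Embedding(input_dim=10000, output_dim=300)  # illustrative vocab size

# One layer instance applied twice: both calls reuse the same kernels,
# so the network learns a single encoder for both languages.
shared_lstm = LSTM(units=300, use_bias=True, dropout=0.05, recurrent_dropout=0.1)
out_1 = shared_lstm(embed(seq_1))
out_2 = shared_lstm(embed(seq_2))

model = Model(inputs=[seq_1, seq_2], outputs=[out_1, out_2])
model.summary()  # the shared LSTM appears once in the layer list
```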
```diff
@@ -95,7 +106,7 @@ def main(args):
 
     model = Model(inputs=[seq_1, seq_2], outputs=[main_output])
 
-    opt = keras.optimizers.Adadelta(lr=learning_rate, clipnorm=1.25)
+    opt = keras.optimizers.Adadelta(lr=learning_rate, clipnorm=1.2)
 
     model.compile(optimizer=opt, loss="mean_squared_error", metrics=["accuracy"])
     model.summary()
@@ -104,45 +115,60 @@ def main(args):
         monitor="val_accuracy", patience=5, verbose=1, factor=0.5, min_lr=0.0001
     )
 
-    history = model.fit(
-        x_train,
-        y_train,
-        validation_data=(x_predict, y_predict),
+    model.fit(
+        x=[source_train, target_train],
+        y=Y_train,
+        validation_data=([source_validate, target_validate], Y_validate),
         epochs=num_iters,
         batch_size=32,
-        verbose=1,
+        verbose=0,
         callbacks=[adjuster],
     )
 
-    target_sents = x_predict[1]
+    cost_matrix = None
+
+    target_sents = x_test[1]
     precision_at_one = 0
-    precision_at_ten = 0
-    for index, sent in enumerate(x_predict[0]):
+    for index, sent in enumerate(x_test[0]):
         source_sents = np.array([sent] * 1000)
         to_predict = [source_sents, target_sents]
         preds = model.predict(to_predict)
-        ind = np.argpartition(preds.ravel(), -10)[-10:]
-        if index in ind:
-            precision_at_ten += 1
+
+        if index == 0:
+            cost_matrix = np.c_[preds]
+        else:
+            cost_matrix = np.c_[cost_matrix, preds]
+
         if np.argmax(preds.ravel()) == index:
             precision_at_one += 1
 
-    training_samples = len(x_train[0])
-    validation_samples = len(y_predict)
+    cost_matrix = cost_matrix * -1000
+
+    row_ind, col_ind, a = lapjv(cost_matrix, verbose=False)
+
+    result = zip(row_ind, col_ind)
+    hit_one = len([x for x, y in result if x == y])
+
+    matching_pa1 = hit_one / 1000 * 100
+
+    training_samples = len(Y_train)
+    validation_samples = len(Y_validate)
+    test_samples = len(y_test)
     fields = [
         source_lang,
         target_lang,
         training_samples,
         validation_samples,
+        test_samples,
         precision_at_one,
-        precision_at_ten,
+        matching_pa1,
     ]
 
     if not batch:
         print(f"Supervised Retrieval {source_lang} - {target_lang}")
         print(f"P@1: {precision_at_one/1000}")
     else:
-        with open("supervised.csv", "a") as f:
+        with open("supervised_matching.csv", "a") as f:
             writer = csv.writer(f)
             writer.writerow(fields)
 
```
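The rewritten evaluation does two things in one pass over the 1000-entry test dictionary. Inside the loop, each source word's 1000 prediction scores are appended as a column of `cost_matrix` via `np.c_`, while retrieval P@1 is still counted with `argmax`. After the loop, the matrix is negated and scaled (since `lapjv` minimizes cost while the model outputs similarities) and handed to the Jonker-Volgenant solver, which forces a one-to-one matching between sources and targets; `matching_pa1` is the percentage of sources assigned to their true translation. A toy sketch of the retrieval-versus-matching distinction, shrunk from 1000x1000 to 4x4 and assuming the PyPI `lapjv` package, whose three-value return and `verbose` flag match the call above:

```python
import numpy as np
from lapjv import lapjv  # Jonker-Volgenant linear assignment solver

# Made-up similarity scores: rows are targets, columns are sources,
# and the true translation pairs sit on the diagonal.
sims = np.array([
    [0.90, 0.80, 0.10, 0.00],
    [0.70, 0.66, 0.20, 0.10],  # source 1 scores target 0 above target 1
    [0.10, 0.20, 0.80, 0.30],
    [0.00, 0.10, 0.20, 0.90],
])

# Retrieval P@1: every source column independently picks its argmax row,
# so sources 0 and 1 both claim target 0 and one of them must be wrong.
retrieval_hits = int((sims.argmax(axis=0) == np.arange(4)).sum())  # 3

# Matching P@1: negate so the cost minimizer maximizes similarity, then
# solve for a one-to-one assignment; source 0 keeps target 0, which frees
# target 1 for source 1 and recovers the full diagonal.
row_ind, col_ind, _ = lapjv(sims * -1000)
matching_hits = sum(int(x == y) for x, y in zip(row_ind, col_ind))  # 4
print(retrieval_hits / 4, matching_hits / 4)
```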
```diff
@@ -186,7 +212,7 @@ if __name__ == "__main__":
         "--hidden_size",
         type=int,
         help="Number of units in LSTM layer.",
-        default=50,
+        default=300,
     )
     parser.add_argument(
         "-b",
@@ -196,14 +222,14 @@ if __name__ == "__main__":
         + "running in a single instance (output the results)",
     )
     parser.add_argument(
-        "-n", "--num_iters", type=int, help="Number of iterations/epochs.", default=7
+        "-n", "--num_iters", type=int, help="Number of iterations/epochs.", default=30
     )
     parser.add_argument(
         "-lr",
         "--learning_rate",
         type=float,
         help="Learning rate for optimizer.",
-        default=1.0,
+        default=0.5,
     )
 
     args = parser.parse_args()
```
