From 28cc011c31b0dde05eddb5ea2d170e8c0c7fa78a Mon Sep 17 00:00:00 2001
From: Yigit Sever
Date: Fri, 3 Jul 2020 00:19:30 +0300
Subject: Clean supervised matching

---
 learn_and_predict.py | 78 ++++++++++++++++++++++++++++++++++------------------
 1 file changed, 52 insertions(+), 26 deletions(-)

diff --git a/learn_and_predict.py b/learn_and_predict.py
index 4c094d7..907d4be 100644
--- a/learn_and_predict.py
+++ b/learn_and_predict.py
@@ -4,10 +4,11 @@ import csv
 import keras
 import keras.backend as K
 import numpy as np
+from Helpers import Data, Get_Embedding
 from keras.layers import LSTM, Embedding, Input, Lambda, concatenate
 from keras.models import Model
-
-from Helpers import Data, Get_Embedding
+from lapjv import lapjv
+from sklearn.model_selection import train_test_split
 
 
 def get_learning_rate(epoch=None, model=None):
@@ -39,8 +40,8 @@ def main(args):
     x_train = data.x_train
     y_train = data.y_train
-    x_predict = data.x_val
-    y_predict = data.y_val
+    x_test = data.x_val
+    y_test = data.y_val
 
     vocab_size = data.vocab_size
     max_len = data.max_len
 
@@ -49,13 +50,23 @@
         y = np.bincount(x.astype(np.int32))
         ii = np.nonzero(y)[0]
         assert ii == 1
-        assert y[ii] == 1000  # hardcoded for now
+        assert y[ii] == 1000  # hardcoded: every test set holds exactly 1000 pairs
+
+    # hold out validation pairs from the training set; x_test is reserved for matching
+    (
+        source_train,
+        source_validate,
+        target_train,
+        target_validate,
+        Y_train,
+        Y_validate,
+    ) = train_test_split(x_train[0], x_train[1], y_train)
 
     if not batch:
         print(f"Source Lang: {source_lang}")
         print(f"Target Lang: {target_lang}")
         print(f"Using {len(x_train[0])} pairs to learn")
-        print(f"Predicting {len(y_predict)} pairs")
+        print(f"Validating on {len(Y_validate)} pairs")
         print(f"Vocabulary size: {vocab_size}")
         print(f"Maximum sequence length: 9,574")
 
@@ -82,7 +93,7 @@
     input_1 = embed_layer(seq_1)
     input_2 = embed_layer(seq_2)
 
-    l1 = LSTM(units=hidden_size)
+    l1 = LSTM(units=hidden_size, use_bias=True, dropout=0.05, recurrent_dropout=0.1)
     l1_out = l1(input_1)
     l2_out = l1(input_2)
 
@@ -95,7 +106,7 @@
     model = Model(inputs=[seq_1, seq_2], outputs=[main_output])
 
-    opt = keras.optimizers.Adadelta(lr=learning_rate, clipnorm=1.25)
+    opt = keras.optimizers.Adadelta(lr=learning_rate, clipnorm=1.2)
     model.compile(optimizer=opt, loss="mean_squared_error", metrics=["accuracy"])
     model.summary()
 
@@ -104,45 +115,60 @@
         monitor="val_accuracy", patience=5, verbose=1, factor=0.5, min_lr=0.0001
     )
 
-    history = model.fit(
-        x_train,
-        y_train,
-        validation_data=(x_predict, y_predict),
+    model.fit(
+        x=[source_train, target_train],
+        y=Y_train,
+        validation_data=([source_validate, target_validate], Y_validate),
         epochs=num_iters,
         batch_size=32,
-        verbose=1,
+        verbose=0,
         callbacks=[adjuster],
     )
 
-    target_sents = x_predict[1]
+    cost_matrix = None
+
+    target_sents = x_test[1]
     precision_at_one = 0
-    precision_at_ten = 0
 
-    for index, sent in enumerate(x_predict[0]):
+    for index, sent in enumerate(x_test[0]):
         source_sents = np.array([sent] * 1000)
         to_predict = [source_sents, target_sents]
         preds = model.predict(to_predict)
-        ind = np.argpartition(preds.ravel(), -10)[-10:]
-        if index in ind:
-            precision_at_ten += 1
+
+        if index == 0:
+            cost_matrix = np.c_[preds]  # first column of the similarity matrix
+        else:
+            cost_matrix = np.c_[cost_matrix, preds]  # one column per source sentence
+
         if np.argmax(preds.ravel()) == index:
             precision_at_one += 1
 
-    training_samples = len(x_train[0])
-    validation_samples = len(y_predict)
+    cost_matrix = cost_matrix * -1000  # lapjv minimizes cost, so flip the sign of the similarities
+
+    row_ind, col_ind, _ = lapjv(cost_matrix, verbose=False)
+
+    result = zip(range(1000), row_ind)
+
+    hit_one = len([x for x, y in result if x == y])
+
+    matching_pa1 = hit_one / 1000 * 100  # matching precision@1, as a percentage
+
+    training_samples = len(Y_train)
+    validation_samples = len(Y_validate)
+    test_samples = len(y_test)
 
     fields = [
         source_lang,
         target_lang,
         training_samples,
         validation_samples,
+        test_samples,
         precision_at_one,
-        precision_at_ten,
+        matching_pa1,
     ]
 
     if not batch:
         print(f"Supervised Retrieval {source_lang} - {target_lang}")
         print(f"P@1: {precision_at_one/1000}")
     else:
-        with open("supervised.csv", "a") as f:
+        with open("supervised_matching.csv", "a") as f:
             writer = csv.writer(f)
             writer.writerow(fields)
@@ -186,7 +212,7 @@ if __name__ == "__main__":
         "--hidden_size",
         type=int,
         help="Number of units in LSTM layer.",
-        default=50,
+        default=300,
     )
     parser.add_argument(
         "-b",
         "--batch",
         action="store_true",
         help="running in batch mode, don't print results but append to csv"
         + "running in a single instance (output the results)",
     )
     parser.add_argument(
-        "-n", "--num_iters", type=int, help="Number of iterations/epochs.", default=7
+        "-n", "--num_iters", type=int, help="Number of iterations/epochs.", default=30
     )
     parser.add_argument(
         "-lr",
         "--learning_rate",
         type=float,
         help="Learning rate for optimizer.",
-        default=1.0,
+        default=0.5,
     )
     args = parser.parse_args()
-- 
cgit v1.2.3-70-g09d2
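
For reference, the evaluation this patch adds has two parts: the loop scores every source sentence against all 1000 target sentences and counts retrieval precision@1 (is the gold target ranked first?), and the lapjv call then solves a linear assignment problem over the full similarity matrix, enforcing a one-to-one matching before counting hits. The sketch below is a rough standalone equivalent of that second part, not part of the patch. It assumes the trained `model` and the `x_test` split from learn_and_predict.py; the helper name `matching_precision` is made up for illustration, and scipy's `linear_sum_assignment` stands in for `lapjv` because it solves the same assignment problem and is more widely packaged.

import numpy as np
from scipy.optimize import linear_sum_assignment


def matching_precision(model, x_test, n=1000):
    """Percentage of test pairs recovered by a one-to-one matching."""
    source_sents, target_sents = x_test[0][:n], x_test[1][:n]
    sim = np.empty((n, n))
    for i, sent in enumerate(source_sents):
        # score source sentence i against every target sentence;
        # column i of `sim` mirrors one np.c_ step in the patch above
        repeated = np.array([sent] * n)
        sim[:, i] = model.predict([repeated, target_sents]).ravel()
    # the solver minimizes total cost, so negate the similarities
    row_ind, col_ind = linear_sum_assignment(-sim)
    # gold pairs sit on the diagonal: source i belongs with target i
    return float(np.sum(row_ind == col_ind)) / n * 100

To mirror the patch exactly, replace the solver line with `row_ind, col_ind, _ = lapjv(-1000 * sim)` and count hits with `np.sum(row_ind == np.arange(n))`; scaling the costs by a positive constant does not change the optimal assignment, so both versions should report the same matching precision (up to ties between equally good assignments).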