| author | Yigit Sever | 2019-09-25 21:08:18 +0300 |
|---|---|---|
| committer | Yigit Sever | 2019-09-25 21:08:18 +0300 |
| commit | 4d117258017fb1518d7ce7242e5a9c5a780d70d7 (patch) | |
| tree | 9410d052a4a5870b1270a82a861d5d4f5e355d48 /learn_and_predict.py | |
| parent | 442a1895fe567502ec5fec20a62083ea090f38cc (diff) | |
| download | Evaluating-Dictionary-Alignment-4d117258017fb1518d7ce7242e5a9c5a780d70d7.tar.gz, .tar.bz2, .zip | |
Include supervised code
Due to how we handle data, supervised approaches are hardcoded
to work on 1000 instances for now
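The 1000-instance assumption is enforced in the new script with an `np.bincount` sanity check over the validation labels. As a minimal, self-contained sketch of that check (numpy only; `y_val` here is a stand-in for the labels produced by the data loader):

```python
import numpy as np

# Stand-in for data.y_val: the validation set is expected to be
# exactly 1000 positive (label 1) pairs.
y_val = np.ones(1000)

counts = np.bincount(y_val.astype(np.int32))
labels = np.nonzero(counts)[0]
assert labels == 1             # the only label present is 1
assert counts[labels] == 1000  # and there are exactly 1000 of them
```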
Diffstat (limited to 'learn_and_predict.py')
| -rw-r--r-- | learn_and_predict.py | 202 |
1 file changed, 202 insertions, 0 deletions
```
diff --git a/learn_and_predict.py b/learn_and_predict.py
new file mode 100644
index 0000000..36c56f2
--- /dev/null
+++ b/learn_and_predict.py
@@ -0,0 +1,202 @@
```
```python
import argparse
import csv

import numpy as np

import keras
import keras.backend as K
from Helpers import Data, Get_Embedding
from keras.layers import LSTM, Embedding, Input, Lambda, concatenate
from keras.models import Model


def get_learning_rate(epoch=None, model=None):
    return np.round(float(K.get_value(model.optimizer.lr)), 5)


def make_cosine_func(hidden_size=50):
    def exponent_neg_cosine_similarity(x):
        """Helper function for the similarity estimate of the LSTMs outputs."""
        # x is the concatenation of the two LSTM outputs; L2-normalize
        # each half and return their dot product (cosine similarity).
        leftNorm = K.l2_normalize(x[:, :hidden_size], axis=-1)
        rightNorm = K.l2_normalize(x[:, hidden_size:], axis=-1)
        return K.sum(K.prod([leftNorm, rightNorm], axis=0), axis=1, keepdims=True)

    return exponent_neg_cosine_similarity


def main(args):
    source_lang = args.source_lang
    target_lang = args.target_lang
    hidden_size = args.hidden_size
    max_len = args.max_len
    num_iters = args.num_iters
    data_file = args.data_file
    learning_rate = args.learning_rate
    batch = args.batch

    data = Data(source_lang, target_lang, data_file, max_len)

    x_train = data.x_train
    y_train = data.y_train
    x_predict = data.x_val
    y_predict = data.y_val
    vocab_size = data.vocab_size
    max_len = data.max_len

    # Sanity check: the validation set must consist of exactly 1000
    # positive (label 1) pairs -- the evaluation below depends on it.
    # https://stackoverflow.com/a/10741692/3005749
    x = data.y_val
    y = np.bincount(x.astype(np.int32))
    ii = np.nonzero(y)[0]
    assert ii == 1
    assert y[ii] == 1000  # hardcoded for now

    if not batch:
        print(f"Source Lang: {source_lang}")
        print(f"Target Lang: {target_lang}")
        print(f"Using {len(x_train[0])} pairs to learn")
        print(f"Predicting {len(y_predict)} pairs")
        print(f"Vocabulary size: {vocab_size}")
        print(f"Maximum sequence length: {max_len}")

    source_emb_file = args.source_emb_file
    target_emb_file = args.target_emb_file

    embedding = Get_Embedding(
        source_lang, target_lang, source_emb_file, target_emb_file, data.word_to_id
    )
    embedding_size = embedding.embedding_matrix.shape[1]

    seq_1 = Input(shape=(max_len,), dtype="int32", name="sequence1")
    seq_2 = Input(shape=(max_len,), dtype="int32", name="sequence2")

    # Frozen embedding layer, shared by both inputs.
    embed_layer = Embedding(
        output_dim=embedding_size,
        input_dim=vocab_size + 1,
        input_length=max_len,
        trainable=False,
    )
    embed_layer.build((None,))
    embed_layer.set_weights([embedding.embedding_matrix])

    input_1 = embed_layer(seq_1)
    input_2 = embed_layer(seq_2)

    # A single LSTM applied to both sequences (siamese, shared weights).
    l1 = LSTM(units=hidden_size)

    l1_out = l1(input_1)
    l2_out = l1(input_2)

    concats = concatenate([l1_out, l2_out], axis=-1)

    out_func = make_cosine_func(hidden_size)

    main_output = Lambda(out_func, output_shape=(1,))(concats)

    model = Model(inputs=[seq_1, seq_2], outputs=[main_output])

    opt = keras.optimizers.Adadelta(lr=learning_rate, clipnorm=1.25)

    model.compile(optimizer=opt, loss="mean_squared_error", metrics=["accuracy"])
    model.summary()

    adjuster = keras.callbacks.ReduceLROnPlateau(
        monitor="val_acc", patience=5, verbose=1, factor=0.5, min_lr=0.0001
    )

    model.fit(
        x_train,
        y_train,
        validation_data=(x_predict, y_predict),
        epochs=num_iters,
        batch_size=32,
        verbose=1,
        callbacks=[adjuster],
    )

    # Rank all 1000 target definitions against each source definition and
    # count how often the aligned one lands at rank 1 (P@1) or in the
    # top 10 (P@10).
    target_sents = x_predict[1]
    precision_at_one = 0
    precision_at_ten = 0
    for index, sent in enumerate(x_predict[0]):
        source_sents = np.array([sent] * 1000)
        to_predict = [source_sents, target_sents]
        preds = model.predict(to_predict)
        ind = np.argpartition(preds.ravel(), -10)[-10:]
        if index in ind:
            precision_at_ten += 1
        if np.argmax(preds.ravel()) == index:
            precision_at_one += 1

    training_samples = len(x_train[0])
    validation_samples = len(y_predict)
    fields = [
        source_lang,
        target_lang,
        training_samples,
        validation_samples,
        precision_at_one,
        precision_at_ten,
    ]

    if not batch:
        print(f"P@1: {precision_at_one / 1000}, {precision_at_one} defs")
    else:
        with open("supervised.csv", "a") as f:
            writer = csv.writer(f)
            writer.writerow(fields)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "-sl", "--source_lang", type=str, help="Source language.", default="english"
    )
    parser.add_argument(
        "-tl", "--target_lang", type=str, help="Target language.", default="italian"
    )
    parser.add_argument("-df", "--data_file", type=str, help="Path to dataset.")
    parser.add_argument(
        "-es",
        "--source_emb_file",
        type=str,
        help="Path to source (English) embedding file.",
    )
    parser.add_argument(
        "-et", "--target_emb_file", type=str, help="Path to target embedding file."
    )
    parser.add_argument(
        "-l",
        "--max_len",
        type=int,
        help="Maximum number of words in a sentence.",
        default=20,
    )
    parser.add_argument(
        "-z",
        "--hidden_size",
        type=int,
        help="Number of units in the LSTM layer.",
        default=50,
    )
    parser.add_argument(
        "-b",
        "--batch",
        action="store_true",
        help="Running in batch (store results to csv) or "
        "running in a single instance (output the results).",
    )
    parser.add_argument(
        "-n", "--num_iters", type=int, help="Number of iterations/epochs.", default=7
    )
    parser.add_argument(
        "-lr",
        "--learning_rate",
        type=float,
        help="Learning rate for optimizer.",
        default=1.0,
    )

    args = parser.parse_args()
    main(args)
```
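Despite its name, `exponent_neg_cosine_similarity` computes a plain cosine similarity: the concatenated LSTM outputs are split in half, each half is L2-normalized, and the halves are multiplied elementwise and summed. A minimal numpy check of that arithmetic (no Keras needed; the batch size and `hidden_size=50` are arbitrary stand-ins here):

```python
import numpy as np

hidden_size = 50
rng = np.random.default_rng(0)
left = rng.normal(size=(4, hidden_size))    # "left" LSTM outputs
right = rng.normal(size=(4, hidden_size))   # "right" LSTM outputs
x = np.concatenate([left, right], axis=-1)  # what the Lambda receives

# Same computation as the Lambda, in numpy.
left_norm = x[:, :hidden_size] / np.linalg.norm(x[:, :hidden_size], axis=-1, keepdims=True)
right_norm = x[:, hidden_size:] / np.linalg.norm(x[:, hidden_size:], axis=-1, keepdims=True)
sims = np.sum(left_norm * right_norm, axis=1, keepdims=True)

# Agrees with the textbook cosine similarity, row by row.
expected = np.array(
    [[np.dot(l, r) / (np.linalg.norm(l) * np.linalg.norm(r))] for l, r in zip(left, right)]
)
assert np.allclose(sims, expected)
```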

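In the evaluation loop, `np.argpartition(preds.ravel(), -10)[-10:]` returns the indices of the ten highest scores in no particular order, which is all P@10 needs; P@1 falls out of `np.argmax`. A small self-contained sketch of that bookkeeping, with random scores standing in for `model.predict` (the nudge on the aligned index is only there so the toy run produces some hits):

```python
import numpy as np

rng = np.random.default_rng(1)
n = 1000  # one aligned target per source definition

p_at_1 = p_at_10 = 0
for index in range(n):
    preds = rng.random(n)  # stand-in for model.predict scores
    preds[index] += 0.5    # bias the aligned pair upward

    top10 = np.argpartition(preds, -10)[-10:]  # indices of the 10 best scores
    if index in top10:
        p_at_10 += 1
    if np.argmax(preds) == index:
        p_at_1 += 1

print(f"P@1: {p_at_1 / n}, P@10: {p_at_10 / n}")
```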