author    Yigit Sever    2020-07-03 00:19:30 +0300
committer Yigit Sever    2020-07-03 00:19:30 +0300
commit    28cc011c31b0dde05eddb5ea2d170e8c0c7fa78a (patch)
tree      386c47c4fc3ee768dbdb50ee071689128d6617d1
parent    ddd2c5349617ac01afef4758a2418ba512bd9ab0 (diff)
Clean supervised matching
-rw-r--r--  learn_and_predict.py  78
1 file changed, 52 insertions(+), 26 deletions(-)
diff --git a/learn_and_predict.py b/learn_and_predict.py
index 4c094d7..907d4be 100644
--- a/learn_and_predict.py
+++ b/learn_and_predict.py
@@ -4,10 +4,11 @@ import csv
 import keras
 import keras.backend as K
 import numpy as np
+from Helpers import Data, Get_Embedding
 from keras.layers import LSTM, Embedding, Input, Lambda, concatenate
 from keras.models import Model
-
-from Helpers import Data, Get_Embedding
+from lapjv import lapjv
+from sklearn.model_selection import train_test_split


 def get_learning_rate(epoch=None, model=None):
@@ -39,8 +40,8 @@ def main(args):

     x_train = data.x_train
     y_train = data.y_train
-    x_predict = data.x_val
-    y_predict = data.y_val
+    x_test = data.x_val
+    y_test = data.y_val
     vocab_size = data.vocab_size
     max_len = data.max_len

@@ -49,13 +50,23 @@ def main(args):
         y = np.bincount(x.astype(np.int32))
         ii = np.nonzero(y)[0]
         assert ii == 1
-        assert y[ii] == 1000  # hardcoded for now
+        assert y[ii] == 1000  # hardcoded, sorry
+
+    # separating the train-test set and the set used for our task
+    (
+        source_train,
+        source_validate,
+        target_train,
+        target_validate,
+        Y_train,
+        Y_validate,
+    ) = train_test_split(x_train[0], x_train[1], y_train)

     if not batch:
         print(f"Source Lang: {source_lang}")
         print(f"Target Lang: {target_lang}")
         print(f"Using {len(x_train[0])} pairs to learn")
-        print(f"Predicting {len(y_predict)} pairs")
+        print(f"Predicting {len(Y_validate)} pairs")
         print(f"Vocabulary size: {vocab_size}")
         print(f"Maximum sequence length: 15,194")

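Note on the hunk above (not part of the patch): scikit-learn's
train_test_split shuffles every array it is given with the same
permutation and splits them in parallel; with no test_size argument it
holds out 25% for validation. A minimal sketch, with toy arrays standing
in for the padded source/target sentences and labels:

    # Toy stand-ins for x_train[0], x_train[1] and y_train from the patch.
    import numpy as np
    from sklearn.model_selection import train_test_split

    source = np.arange(8).reshape(8, 1)
    target = np.arange(8).reshape(8, 1) + 100
    labels = np.ones(8)

    # One shuffle, three parallel splits; the default test_size=0.25
    # leaves 6 training rows and 2 validation rows here.
    (src_tr, src_val,
     tgt_tr, tgt_val,
     y_tr, y_val) = train_test_split(source, target, labels)

    assert len(src_tr) == len(tgt_tr) == len(y_tr) == 6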
@@ -82,7 +93,7 @@ def main(args):
     input_1 = embed_layer(seq_1)
     input_2 = embed_layer(seq_2)

-    l1 = LSTM(units=hidden_size)
+    l1 = LSTM(units=300, use_bias=True, dropout=0.05, recurrent_dropout=0.1)

     l1_out = l1(input_1)
     l2_out = l1(input_2)
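Note on the hunk above (not part of the patch): the same LSTM object is
called on both embedded inputs, so the two branches share one set of
weights (a Siamese encoder); this change only widens it to 300 units and
adds dropout. A minimal sketch of the weight-sharing pattern, with
hypothetical input shapes:

    # Hypothetical shapes; the real model feeds embedded word sequences.
    from keras.layers import LSTM, Input

    seq_a = Input(shape=(10, 64))
    seq_b = Input(shape=(10, 64))

    # One layer object means one weight set, reused by both branches.
    shared = LSTM(units=300, use_bias=True, dropout=0.05, recurrent_dropout=0.1)
    out_a = shared(seq_a)
    out_b = shared(seq_b)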
@@ -95,7 +106,7 @@ def main(args):

     model = Model(inputs=[seq_1, seq_2], outputs=[main_output])

-    opt = keras.optimizers.Adadelta(lr=learning_rate, clipnorm=1.25)
+    opt = keras.optimizers.Adadelta(lr=learning_rate, clipnorm=1.2)

     model.compile(optimizer=opt, loss="mean_squared_error", metrics=["accuracy"])
     model.summary()
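Note on the hunk above (not part of the patch): clipnorm rescales any
gradient whose L2 norm exceeds the threshold, here tightened from 1.25
to 1.2. A minimal NumPy sketch of that rescaling rule:

    import numpy as np

    def clip_by_norm(grad, clipnorm=1.2):
        # Leave small gradients alone; shrink large ones onto the ball
        # of radius clipnorm, preserving direction.
        norm = np.linalg.norm(grad)
        return grad if norm <= clipnorm else grad * (clipnorm / norm)

    g = np.array([3.0, 4.0])   # norm 5.0 > 1.2, so it gets rescaled
    print(clip_by_norm(g))     # [0.72 0.96], norm exactly 1.2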
@@ -104,45 +115,60 @@ def main(args):
         monitor="val_accuracy", patience=5, verbose=1, factor=0.5, min_lr=0.0001
     )

-    history = model.fit(
-        x_train,
-        y_train,
-        validation_data=(x_predict, y_predict),
+    model.fit(
+        x=[source_train, target_train],
+        y=Y_train,
+        validation_data=([source_validate, target_validate], Y_validate),
         epochs=num_iters,
         batch_size=32,
-        verbose=1,
+        verbose=0,
         callbacks=[adjuster],
     )

-    target_sents = x_predict[1]
+    cost_matrix = None
+
+    target_sents = x_test[1]
     precision_at_one = 0
-    precision_at_ten = 0
-    for index, sent in enumerate(x_predict[0]):
+    for index, sent in enumerate(x_test[0]):
         source_sents = np.array([sent] * 1000)
         to_predict = [source_sents, target_sents]
         preds = model.predict(to_predict)
-        ind = np.argpartition(preds.ravel(), -10)[-10:]
-        if index in ind:
-            precision_at_ten += 1
+
+        if index == 0:
+            cost_matrix = np.c_[preds]
+        else:
+            cost_matrix = np.c_[cost_matrix, preds]
+
         if np.argmax(preds.ravel()) == index:
             precision_at_one += 1

-    training_samples = len(x_train[0])
-    validation_samples = len(y_predict)
+    cost_matrix = cost_matrix * -1000
+
+    row_ind, col_ind, a = lapjv(cost_matrix, verbose=False)
+
+    result = zip(row_ind, col_ind)
+    hit_one = len([x for x, y in result if x == y])
+
+    matching_pa1 = hit_one / 1000 * 100
+
+    training_samples = len(Y_train)
+    validation_samples = len(Y_validate)
+    test_samples = len(y_test)
     fields = [
         source_lang,
         target_lang,
         training_samples,
         validation_samples,
+        test_samples,
         precision_at_one,
-        precision_at_ten,
+        matching_pa1,
     ]

     if not batch:
         print(f"Supervised Retrieval {source_lang} - {target_lang}")
         print(f"P@1: {precision_at_one/1000}")
     else:
-        with open("supervised.csv", "a") as f:
+        with open("supervised_matching.csv", "a") as f:
             writer = csv.writer(f)
             writer.writerow(fields)

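Note on the hunk above (not part of the patch): the loop column-stacks
the 1000 prediction vectors into a 1000x1000 score matrix, negates and
scales it into costs, and hands it to the Jonker-Volgenant solver from
the lapjv package; since the ground truth pairs source i with target i,
the matching precision is the share of rows assigned to their own
column. A minimal sketch on a toy 4x4 matrix, assuming lapjv's first
return value maps each row to its assigned column:

    import numpy as np
    from lapjv import lapjv

    rng = np.random.default_rng(0)
    scores = np.eye(4) + rng.random((4, 4)) * 0.1   # diagonal = true pairs

    # lapjv minimizes cost, so higher-is-better scores are negated; the
    # patch's extra *1000 scaling does not change the assignment.
    cost_matrix = scores * -1000

    row_ind, col_ind, _ = lapjv(cost_matrix)

    # Count rows matched to their own column (the identity pairing).
    hit_one = int(np.sum(row_ind == np.arange(4)))
    matching_pa1 = hit_one / 4 * 100
    print(matching_pa1)   # 100.0 for this easy toy matrix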
@@ -186,7 +212,7 @@ if __name__ == "__main__":
186 "--hidden_size", 212 "--hidden_size",
187 type=int, 213 type=int,
188 help="Number of units in LSTM layer.", 214 help="Number of units in LSTM layer.",
189 default=50, 215 default=300,
190 ) 216 )
191 parser.add_argument( 217 parser.add_argument(
192 "-b", 218 "-b",
@@ -196,14 +222,14 @@ if __name__ == "__main__":
196 + "running in a single instance (output the results)", 222 + "running in a single instance (output the results)",
197 ) 223 )
198 parser.add_argument( 224 parser.add_argument(
199 "-n", "--num_iters", type=int, help="Number of iterations/epochs.", default=7 225 "-n", "--num_iters", type=int, help="Number of iterations/epochs.", default=30
200 ) 226 )
201 parser.add_argument( 227 parser.add_argument(
202 "-lr", 228 "-lr",
203 "--learning_rate", 229 "--learning_rate",
204 type=float, 230 type=float,
205 help="Learning rate for optimizer.", 231 help="Learning rate for optimizer.",
206 default=1.0, 232 default=0.5,
207 ) 233 )
208 234
209 args = parser.parse_args() 235 args = parser.parse_args()