author      Yigit Sever    2019-09-25 21:08:18 +0300
committer   Yigit Sever    2019-09-25 21:08:18 +0300
commit      4d117258017fb1518d7ce7242e5a9c5a780d70d7 (patch)
tree        9410d052a4a5870b1270a82a861d5d4f5e355d48
parent      442a1895fe567502ec5fec20a62083ea090f38cc (diff)
download    Evaluating-Dictionary-Alignment-4d117258017fb1518d7ce7242e5a9c5a780d70d7.tar.gz
            Evaluating-Dictionary-Alignment-4d117258017fb1518d7ce7242e5a9c5a780d70d7.tar.bz2
            Evaluating-Dictionary-Alignment-4d117258017fb1518d7ce7242e5a9c5a780d70d7.zip
Include supervised code
Due to how we handle data, supervised approaches are hardcoded
to work on 1000 instances for now
-rw-r--r--   Helpers.py             157
-rw-r--r--   learn_and_predict.py   202
2 files changed, 359 insertions(+), 0 deletions(-)
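Both new files assume a tab-separated dataset of definition pairs: Helpers.py below reads the columns "<source_lang> definition" and "<target_lang> definition" plus an "is same" label, every token is expected to carry a __<language> suffix, and the last 1000 rows are split off unshuffled as the validation set, which is why the supervised runs are pinned to 1000 instances for now. A sketch of what such a file could look like (column names taken from the code below, row values invented purely for illustration):

import pandas as pd

# Toy rows for illustration only; a real file needs at least 1000 pairs,
# because Data() reserves that many rows for validation.
rows = [
    {"english definition": "cat__english feline__english",
     "italian definition": "gatto__italian felino__italian",
     "is same": 1},
    {"english definition": "dog__english canine__english",
     "italian definition": "gatto__italian felino__italian",
     "is same": 0},
]
pd.DataFrame(rows).to_csv("definitions.tsv", sep="\t", index=False)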
diff --git a/Helpers.py b/Helpers.py
new file mode 100644
index 0000000..7c615ab
--- /dev/null
+++ b/Helpers.py
@@ -0,0 +1,157 @@
import itertools

import numpy as np
from sklearn.model_selection import train_test_split as split_data

import pandas as pd
from gensim.models import KeyedVectors
from keras.preprocessing.sequence import pad_sequences


class Data(object):
    def __init__(
        self,
        source_lang,
        target_lang,
        data_file,
        max_len=None,
        instances=1000,
        vocab_limit=None,
        sentence_cols=None,
        score_col=None,
    ):
        self.source_lang = source_lang
        self.target_lang = target_lang
        self.data_file = data_file
        self.max_len = max_len
        self.instances = instances
        self.vocab_size = 1
        self.vocab_limit = vocab_limit

        if sentence_cols is None:
            self.sequence_cols = [
                f"{source_lang} definition",
                f"{target_lang} definition",
            ]
        else:
            self.sequence_cols = sentence_cols

        if score_col is None:
            self.score_col = "is same"
        else:
            self.score_col = score_col

        self.x_train = list()
        self.y_train = list()
        self.x_val = list()
        self.y_val = list()
        self.vocab = {"PAD"}
        self.word_to_id = {"PAD": 0}
        self.id_to_word = {0: "PAD"}
        self.word_to_count = dict()
        self.run()

    def text_to_word_list(self, text):
        """ Pre process and convert texts to a list of words """
        text = str(text)
        text = text.split()
        return text

    def load_data(self):
        # Load data set
        data_df = pd.read_csv(self.data_file, sep="\t")

        # Iterate over required sequences of provided dataset
        for index, row in data_df.iterrows():
            # Iterate through the text of both definitions of the row
            for sequence in self.sequence_cols:
                s2n = []  # Sequence with words replaced by their indices
                for word in self.text_to_word_list(row[sequence]):
                    if word not in self.vocab:
                        self.vocab.add(word)
                        self.word_to_id[word] = self.vocab_size
                        self.word_to_count[word] = 1
                        s2n.append(self.vocab_size)
                        self.id_to_word[self.vocab_size] = word
                        self.vocab_size += 1
                    else:
                        self.word_to_count[word] += 1
                        s2n.append(self.word_to_id[word])

                # Replace |sequence as word| with |sequence as number| representation
                data_df.at[index, sequence] = s2n
        return data_df

    def pad_sequences(self):
        if self.max_len == 0:
            self.max_len = max(
                max(len(seq) for seq in self.x_train[0]),
                max(len(seq) for seq in self.x_train[1]),
                max(len(seq) for seq in self.x_val[0]),
                max(len(seq) for seq in self.x_val[1]),
            )

        # Zero padding
        for dataset, side in itertools.product([self.x_train, self.x_val], [0, 1]):
            if self.max_len:
                dataset[side] = pad_sequences(dataset[side], maxlen=self.max_len)
            else:
                dataset[side] = pad_sequences(dataset[side])

    def run(self):
        # Loading data and building vocabulary.
        data_df = self.load_data()

        X = data_df[self.sequence_cols]
        Y = data_df[self.score_col]

        self.x_train, self.x_val, self.y_train, self.y_val = split_data(
            X, Y, test_size=self.instances, shuffle=False
        )

        # Split to lists
        self.x_train = [self.x_train[column] for column in self.sequence_cols]
        self.x_val = [self.x_val[column] for column in self.sequence_cols]

        # Convert labels to their numpy representations
        self.y_train = self.y_train.values
        self.y_val = self.y_val.values

        # Padding Sequences.
        self.pad_sequences()


class Get_Embedding(object):
    def __init__(self, source_lang, target_lang, source_emb, target_emb, word_index):
        self.embedding_size = 300  # Default dimensionality
        self.embedding_matrix = self.create_embed_matrix(
            source_lang, target_lang, source_emb, target_emb, word_index
        )

    def create_embed_matrix(
        self, source_lang, target_lang, source_emb, target_emb, word_index
    ):
        source_vecs = KeyedVectors.load_word2vec_format(source_emb)
        target_vecs = KeyedVectors.load_word2vec_format(target_emb)

        # Prepare Embedding Matrix.
        embedding_matrix = np.zeros((len(word_index) + 1, self.embedding_size))

        # word has either __source or __target appended
        for key, i in word_index.items():
            if "__" not in key:
                print("Skipping {}".format(key))
                continue

            word, lang = key.split("__")

            if lang == source_lang:
                if word in source_vecs.vocab:
                    embedding_matrix[i] = source_vecs.word_vec(word)
            else:
                if word in target_vecs.vocab:
                    embedding_matrix[i] = target_vecs.word_vec(word)

        del source_vecs
        del target_vecs
        return embedding_matrix
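The two classes above are meant to be used together, as learn_and_predict.py below does: Data builds the shared vocabulary and the zero-padded index sequences, and Get_Embedding maps that vocabulary onto pretrained word vectors (keys without a __<language> suffix are skipped). A minimal sketch of that flow, with placeholder file paths that are not part of this commit, could look like:

from Helpers import Data, Get_Embedding

# Placeholder paths for illustration; the real files are passed on the command line.
data = Data("english", "italian", "definitions.tsv", max_len=20)
embedding = Get_Embedding(
    "english", "italian", "english.vec", "italian.vec", data.word_to_id
)
print(data.vocab_size, embedding.embedding_matrix.shape)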
diff --git a/learn_and_predict.py b/learn_and_predict.py
new file mode 100644
index 0000000..36c56f2
--- /dev/null
+++ b/learn_and_predict.py
@@ -0,0 +1,202 @@
import argparse
import csv

import numpy as np

import keras
import keras.backend as K
from Helpers import Data, Get_Embedding
from keras.layers import LSTM, Embedding, Input, Lambda, concatenate
from keras.models import Model


def get_learning_rate(epoch=None, model=None):
    return np.round(float(K.get_value(model.optimizer.lr)), 5)


def make_cosine_func(hidden_size=50):
    def exponent_neg_cosine_similarity(x):
        """ Helper function for the similarity estimate of the LSTMs outputs """
        leftNorm = K.l2_normalize(x[:, :hidden_size], axis=-1)
        rightNorm = K.l2_normalize(x[:, hidden_size:], axis=-1)
        return K.sum(K.prod([leftNorm, rightNorm], axis=0), axis=1, keepdims=True)

    return exponent_neg_cosine_similarity


def main(args):

    source_lang = args.source_lang
    target_lang = args.target_lang
    hidden_size = args.hidden_size
    max_len = args.max_len
    num_iters = args.num_iters
    data_file = args.data_file
    learning_rate = args.learning_rate
    batch = args.batch

    data = Data(source_lang, target_lang, data_file, max_len)

    x_train = data.x_train
    y_train = data.y_train
    x_predict = data.x_val
    y_predict = data.y_val
    vocab_size = data.vocab_size
    max_len = data.max_len

    # https://stackoverflow.com/a/10741692/3005749
    x = data.y_val
    y = np.bincount(x.astype(np.int32))
    ii = np.nonzero(y)[0]
    assert ii == 1
    assert y[ii] == 1000  # hardcoded for now

    if not batch:
        print(f"Source Lang: {source_lang}")
        print(f"Target Lang: {target_lang}")
        print(f"Using {len(x_train[0])} pairs to learn")
        print(f"Predicting {len(y_predict)} pairs")
        print(f"Vocabulary size: {vocab_size}")
        print(f"Maximum sequence length: {max_len}")

    source_emb_file = args.source_emb_file
    target_emb_file = args.target_emb_file

    embedding = Get_Embedding(
        source_lang, target_lang, source_emb_file, target_emb_file, data.word_to_id
    )
    embedding_size = embedding.embedding_matrix.shape[1]

    seq_1 = Input(shape=(max_len,), dtype="int32", name="sequence1")
    seq_2 = Input(shape=(max_len,), dtype="int32", name="sequence2")

    embed_layer = Embedding(
        output_dim=embedding_size,
        input_dim=vocab_size + 1,
        input_length=max_len,
        trainable=False,
    )
    embed_layer.build((None,))
    embed_layer.set_weights([embedding.embedding_matrix])

    input_1 = embed_layer(seq_1)
    input_2 = embed_layer(seq_2)

    l1 = LSTM(units=hidden_size)

    l1_out = l1(input_1)
    l2_out = l1(input_2)

    concats = concatenate([l1_out, l2_out], axis=-1)

    out_func = make_cosine_func(hidden_size)

    main_output = Lambda(out_func, output_shape=(1,))(concats)

    model = Model(inputs=[seq_1, seq_2], outputs=[main_output])

    opt = keras.optimizers.Adadelta(lr=learning_rate, clipnorm=1.25)

    model.compile(optimizer=opt, loss="mean_squared_error", metrics=["accuracy"])
    model.summary()

    adjuster = keras.callbacks.ReduceLROnPlateau(
        monitor="val_acc", patience=5, verbose=1, factor=0.5, min_lr=0.0001
    )

    history = model.fit(
        x_train,
        y_train,
        validation_data=(x_predict, y_predict),
        epochs=num_iters,
        batch_size=32,
        verbose=1,
        callbacks=[adjuster],
    )

    target_sents = x_predict[1]
    precision_at_one = 0
    precision_at_ten = 0
    for index, sent in enumerate(x_predict[0]):
        source_sents = np.array([sent] * 1000)
        to_predict = [source_sents, target_sents]
        preds = model.predict(to_predict)
        ind = np.argpartition(preds.ravel(), -10)[-10:]
        if index in ind:
            precision_at_ten += 1
        if np.argmax(preds.ravel()) == index:
            precision_at_one += 1

    training_samples = len(x_train[0])
    validation_samples = len(y_predict)
    fields = [
        source_lang,
        target_lang,
        training_samples,
        validation_samples,
        precision_at_one,
        precision_at_ten,
    ]

    if not batch:
        print(f"P@1: {precision_at_one/1000}, {precision_at_one} defs")
    else:
        with open("supervised.csv", "a") as f:
            writer = csv.writer(f)
            writer.writerow(fields)


if __name__ == "__main__":

    parser = argparse.ArgumentParser()

    parser.add_argument(
        "-sl", "--source_lang", type=str, help="Source language.", default="english"
    )
    parser.add_argument(
        "-tl", "--target_lang", type=str, help="Target language.", default="italian"
    )
    parser.add_argument("-df", "--data_file", type=str, help="Path to dataset.")
    parser.add_argument(
        "-es",
        "--source_emb_file",
        type=str,
        help="Path to Source (English) Embedding File.",
    )
    parser.add_argument(
        "-et", "--target_emb_file", type=str, help="Path to Target Embedding File."
    )
    parser.add_argument(
        "-l",
        "--max_len",
        type=int,
        help="Maximum number of words in a sentence.",
        default=20,
    )
    parser.add_argument(
        "-z",
        "--hidden_size",
        type=int,
        help="Number of Units in LSTM layer.",
        default=50,
    )
    parser.add_argument(
        "-b",
        "--batch",
        action="store_true",
        help="running in batch (store results to csv) or "
        + "running in a single instance (output the results)"
    )
    parser.add_argument(
        "-n", "--num_iters", type=int, help="Number of iterations/epochs.", default=7
    )
    parser.add_argument(
        "-lr",
        "--learning_rate",
        type=float,
        help="Learning rate for optimizer.",
        default=1.0,
    )

    args = parser.parse_args()
    main(args)
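For reference, a single-instance run of the new script (the file names are placeholders; all flags and defaults come from the argparse setup above) could look like:

python learn_and_predict.py -sl english -tl italian \
    -df definitions.tsv -es english.vec -et italian.vec \
    -l 20 -z 50 -n 7 -lr 1.0

Passing -b instead appends one row per run to supervised.csv, containing the source language, target language, training and validation sample counts, and the raw P@1 / P@10 counts, rather than printing them.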