| author | Yigit Sever | 2019-09-25 21:08:18 +0300 |
|---|---|---|
| committer | Yigit Sever | 2019-09-25 21:08:18 +0300 |
| commit | 4d117258017fb1518d7ce7242e5a9c5a780d70d7 (patch) | |
| tree | 9410d052a4a5870b1270a82a861d5d4f5e355d48 | |
| parent | 442a1895fe567502ec5fec20a62083ea090f38cc (diff) | |
Include supervised code
Due to how we handle data, supervised approaches are hardcoded
to work on 1000 instances for now
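
For context, the 1000-instance limit comes from two places in the code below: `Helpers.Data` takes the last `instances` rows as the validation set (the split is not shuffled), and the evaluation loop in `learn_and_predict.py` ranks every source definition against exactly 1000 target definitions. A minimal standalone sketch of that assumption, with a hypothetical dataset path and the default column names:

```python
import pandas as pd
from sklearn.model_selection import train_test_split

# Hypothetical dataset path; column names follow the defaults in Helpers.Data.
df = pd.read_csv("en_to_it.tsv", sep="\t")
X = df[["english definition", "italian definition"]]
y = df["is same"]

# shuffle=False keeps row order, so the *last* 1000 rows become the validation set.
x_train, x_val, y_train, y_val = train_test_split(X, y, test_size=1000, shuffle=False)

# The evaluation script assumes exactly 1000 aligned (label == 1) validation pairs.
assert len(y_val) == 1000
assert (y_val == 1).all()
```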
| -rw-r--r-- | Helpers.py | 157 |
| -rw-r--r-- | learn_and_predict.py | 202 |
2 files changed, 359 insertions, 0 deletions
diff --git a/Helpers.py b/Helpers.py
new file mode 100644
index 0000000..7c615ab
--- /dev/null
+++ b/Helpers.py
@@ -0,0 +1,157 @@
import itertools

import numpy as np
from sklearn.model_selection import train_test_split as split_data

import pandas as pd
from gensim.models import KeyedVectors
from keras.preprocessing.sequence import pad_sequences


class Data(object):
    def __init__(
        self,
        source_lang,
        target_lang,
        data_file,
        max_len=None,
        instances=1000,
        vocab_limit=None,
        sentence_cols=None,
        score_col=None,
    ):
        self.source_lang = source_lang
        self.target_lang = target_lang
        self.data_file = data_file
        self.max_len = max_len
        self.instances = instances
        self.vocab_size = 1
        self.vocab_limit = vocab_limit

        if sentence_cols is None:
            self.sequence_cols = [
                f"{source_lang} definition",
                f"{target_lang} definition",
            ]
        else:
            self.sequence_cols = sentence_cols

        if score_col is None:
            self.score_col = "is same"
        else:
            self.score_col = score_col

        self.x_train = list()
        self.y_train = list()
        self.x_val = list()
        self.y_val = list()
        self.vocab = {"PAD"}  # "PAD" is the padding token, mapped to index 0
        self.word_to_id = {"PAD": 0}
        self.id_to_word = {0: "PAD"}
        self.word_to_count = dict()
        self.run()

    def text_to_word_list(self, text):
        """Preprocess the text and split it into a list of words (whitespace tokenization)."""
        text = str(text)
        text = text.split()
        return text

    def load_data(self):
        # Load data set
        data_df = pd.read_csv(self.data_file, sep="\t")

        # Iterate over the required sequence columns of the dataset
        for index, row in data_df.iterrows():
            # Iterate through the text of both definitions in the row
            for sequence in self.sequence_cols:
                s2n = []  # Sequence with words replaced by their indices
                for word in self.text_to_word_list(row[sequence]):
                    if word not in self.vocab:
                        self.vocab.add(word)
                        self.word_to_id[word] = self.vocab_size
                        self.word_to_count[word] = 1
                        s2n.append(self.vocab_size)
                        self.id_to_word[self.vocab_size] = word
                        self.vocab_size += 1
                    else:
                        self.word_to_count[word] += 1
                        s2n.append(self.word_to_id[word])

                # Replace the word sequence with its index representation
                data_df.at[index, sequence] = s2n
        return data_df

    def pad_sequences(self):
        if self.max_len == 0:  # 0 means "infer from the longest sequence"
            self.max_len = max(
                max(len(seq) for seq in self.x_train[0]),
                max(len(seq) for seq in self.x_train[1]),
                max(len(seq) for seq in self.x_val[0]),
                max(len(seq) for seq in self.x_val[1]),
            )

        # Zero padding via keras.preprocessing.sequence.pad_sequences
        for dataset, side in itertools.product([self.x_train, self.x_val], [0, 1]):
            if self.max_len:
                dataset[side] = pad_sequences(dataset[side], maxlen=self.max_len)
            else:
                dataset[side] = pad_sequences(dataset[side])

    def run(self):
        # Loading data and building vocabulary.
        data_df = self.load_data()

        X = data_df[self.sequence_cols]
        Y = data_df[self.score_col]

        self.x_train, self.x_val, self.y_train, self.y_val = split_data(
            X, Y, test_size=self.instances, shuffle=False
        )  # no shuffle: the last self.instances rows become the validation set

        # Split to lists
        self.x_train = [self.x_train[column] for column in self.sequence_cols]
        self.x_val = [self.x_val[column] for column in self.sequence_cols]

        # Convert labels to their numpy representations
        self.y_train = self.y_train.values
        self.y_val = self.y_val.values

        # Padding Sequences.
        self.pad_sequences()


class Get_Embedding(object):
    def __init__(self, source_lang, target_lang, source_emb, target_emb, word_index):
        self.embedding_size = 300  # Default dimensionality
        self.embedding_matrix = self.create_embed_matrix(
            source_lang, target_lang, source_emb, target_emb, word_index
        )

    def create_embed_matrix(
        self, source_lang, target_lang, source_emb, target_emb, word_index
    ):
        source_vecs = KeyedVectors.load_word2vec_format(source_emb)
        target_vecs = KeyedVectors.load_word2vec_format(target_emb)

        # Prepare Embedding Matrix.
        embedding_matrix = np.zeros((len(word_index) + 1, self.embedding_size))

        # Each key has its language name appended, e.g. "cat__english"
        for key, i in word_index.items():
            if "__" not in key:
                print("Skipping {}".format(key))
                continue

            word, lang = key.split("__")

            if lang == source_lang:
                if word in source_vecs.vocab:
                    embedding_matrix[i] = source_vecs.word_vec(word)
            else:
                if word in target_vecs.vocab:
                    embedding_matrix[i] = target_vecs.word_vec(word)

        del source_vecs
        del target_vecs
        return embedding_matrix
diff --git a/learn_and_predict.py b/learn_and_predict.py
new file mode 100644
index 0000000..36c56f2
--- /dev/null
+++ b/learn_and_predict.py
@@ -0,0 +1,202 @@
import argparse
import csv

import numpy as np

import keras
import keras.backend as K
from Helpers import Data, Get_Embedding
from keras.layers import LSTM, Embedding, Input, Lambda, concatenate
from keras.models import Model


def get_learning_rate(epoch=None, model=None):
    return np.round(float(K.get_value(model.optimizer.lr)), 5)


def make_cosine_func(hidden_size=50):
    def exponent_neg_cosine_similarity(x):
        """Cosine similarity between the two LSTM outputs (the two halves of x)."""
        leftNorm = K.l2_normalize(x[:, :hidden_size], axis=-1)
        rightNorm = K.l2_normalize(x[:, hidden_size:], axis=-1)
        return K.sum(K.prod([leftNorm, rightNorm], axis=0), axis=1, keepdims=True)

    return exponent_neg_cosine_similarity


def main(args):

    source_lang = args.source_lang
    target_lang = args.target_lang
    hidden_size = args.hidden_size
    max_len = args.max_len
    num_iters = args.num_iters
    data_file = args.data_file
    learning_rate = args.learning_rate
    batch = args.batch

    data = Data(source_lang, target_lang, data_file, max_len)

    x_train = data.x_train
    y_train = data.y_train
    x_predict = data.x_val
    y_predict = data.y_val
    vocab_size = data.vocab_size
    max_len = data.max_len

    # Sanity check (https://stackoverflow.com/a/10741692/3005749): the validation
    # split must consist of exactly 1000 positive ("is same" == 1) pairs.
    label_counts = np.bincount(data.y_val.astype(np.int32))
    nonzero_labels = np.nonzero(label_counts)[0]
    assert nonzero_labels.tolist() == [1]
    assert label_counts[1] == 1000  # hardcoded for now

    if not batch:
        print(f"Source Lang: {source_lang}")
        print(f"Target Lang: {target_lang}")
        print(f"Using {len(x_train[0])} pairs to learn")
        print(f"Predicting {len(y_predict)} pairs")
        print(f"Vocabulary size: {vocab_size}")
        print(f"Maximum sequence length: {max_len}")

    source_emb_file = args.source_emb_file
    target_emb_file = args.target_emb_file

    embedding = Get_Embedding(
        source_lang, target_lang, source_emb_file, target_emb_file, data.word_to_id
    )
    embedding_size = embedding.embedding_matrix.shape[1]

    seq_1 = Input(shape=(max_len,), dtype="int32", name="sequence1")
    seq_2 = Input(shape=(max_len,), dtype="int32", name="sequence2")

    embed_layer = Embedding(
        output_dim=embedding_size,
        input_dim=vocab_size + 1,
        input_length=max_len,
        trainable=False,
    )
    embed_layer.build((None,))
    embed_layer.set_weights([embedding.embedding_matrix])

    input_1 = embed_layer(seq_1)
    input_2 = embed_layer(seq_2)

    l1 = LSTM(units=hidden_size)  # single LSTM shared by both inputs (siamese setup)

    l1_out = l1(input_1)
    l2_out = l1(input_2)

    concats = concatenate([l1_out, l2_out], axis=-1)

    out_func = make_cosine_func(hidden_size)

    main_output = Lambda(out_func, output_shape=(1,))(concats)

    model = Model(inputs=[seq_1, seq_2], outputs=[main_output])

    opt = keras.optimizers.Adadelta(lr=learning_rate, clipnorm=1.25)

    model.compile(optimizer=opt, loss="mean_squared_error", metrics=["accuracy"])
    model.summary()

    adjuster = keras.callbacks.ReduceLROnPlateau(
        monitor="val_acc", patience=5, verbose=1, factor=0.5, min_lr=0.0001
    )

    history = model.fit(
        x_train,
        y_train,
        validation_data=(x_predict, y_predict),
        epochs=num_iters,
        batch_size=32,  # fixed training batch size; --batch only selects the output mode
        verbose=1,
        callbacks=[adjuster],
    )

    target_sents = x_predict[1]  # every source definition is ranked against all 1000 targets
    precision_at_one = 0
    precision_at_ten = 0
    for index, sent in enumerate(x_predict[0]):
        source_sents = np.array([sent] * 1000)
        to_predict = [source_sents, target_sents]
        preds = model.predict(to_predict)
        ind = np.argpartition(preds.ravel(), -10)[-10:]  # indices of the 10 highest similarities
        if index in ind:
            precision_at_ten += 1
        if np.argmax(preds.ravel()) == index:
            precision_at_one += 1

    training_samples = len(x_train[0])
    validation_samples = len(y_predict)
    fields = [
        source_lang,
        target_lang,
        training_samples,
        validation_samples,
        precision_at_one,
        precision_at_ten,
    ]

    if not batch:
        print(f"P@1: {precision_at_one/1000} ({precision_at_one} defs), P@10: {precision_at_ten/1000}")
    else:
        with open("supervised.csv", "a") as f:
            writer = csv.writer(f)
            writer.writerow(fields)


if __name__ == "__main__":

    parser = argparse.ArgumentParser()

    parser.add_argument(
        "-sl", "--source_lang", type=str, help="Source language.", default="english"
    )
    parser.add_argument(
        "-tl", "--target_lang", type=str, help="Target language.", default="italian"
    )
    parser.add_argument("-df", "--data_file", type=str, help="Path to dataset.")
    parser.add_argument(
        "-es",
        "--source_emb_file",
        type=str,
        help="Path to Source (English) Embedding File.",
    )
    parser.add_argument(
        "-et", "--target_emb_file", type=str, help="Path to Target Embedding File."
    )
    parser.add_argument(
        "-l",
        "--max_len",
        type=int,
        help="Maximum number of words in a sentence.",
        default=20,
    )
    parser.add_argument(
        "-z",
        "--hidden_size",
        type=int,
        help="Number of Units in LSTM layer.",
        default=50,
    )
    parser.add_argument(
        "-b",
        "--batch",
        action="store_true",
        help="running in batch (store results to csv) or "
        + "running in a single instance (output the results)",
    )
    parser.add_argument(
        "-n", "--num_iters", type=int, help="Number of iterations/epochs.", default=7
    )
    parser.add_argument(
        "-lr",
        "--learning_rate",
        type=float,
        help="Learning rate for optimizer.",
        default=1.0,
    )

    args = parser.parse_args()
    main(args)
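
A hypothetical invocation, with placeholder file names and the flags defined above: `python learn_and_predict.py -sl english -tl italian -df en_to_it.tsv -es wiki.en.vec -et wiki.it.vec -n 7 -b`. With `-b`, the P@1/P@10 counts are appended to `supervised.csv` instead of being printed.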
