From 4d117258017fb1518d7ce7242e5a9c5a780d70d7 Mon Sep 17 00:00:00 2001
From: Yigit Sever
Date: Wed, 25 Sep 2019 21:08:18 +0300
Subject: Include supervised code

Due to how we handle data, supervised approaches are hardcoded to work
on 1000 instances for now
---
 Helpers.py           | 157 ++++++++++++++++++++++++++++++++++++++++++++
 learn_and_predict.py | 202 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 359 insertions(+)
 create mode 100644 Helpers.py
 create mode 100644 learn_and_predict.py

diff --git a/Helpers.py b/Helpers.py
new file mode 100644
index 0000000..7c615ab
--- /dev/null
+++ b/Helpers.py
@@ -0,0 +1,157 @@
+import itertools
+
+import numpy as np
+from sklearn.model_selection import train_test_split as split_data
+
+import pandas as pd
+from gensim.models import KeyedVectors
+from keras.preprocessing.sequence import pad_sequences
+
+
+class Data(object):
+    def __init__(
+        self,
+        source_lang,
+        target_lang,
+        data_file,
+        max_len=None,
+        instances=1000,
+        vocab_limit=None,
+        sentence_cols=None,
+        score_col=None,
+    ):
+        self.source_lang = source_lang
+        self.target_lang = target_lang
+        self.data_file = data_file
+        self.max_len = max_len
+        self.instances = instances
+        self.vocab_size = 1
+        self.vocab_limit = vocab_limit
+
+        if sentence_cols is None:
+            self.sequence_cols = [
+                f"{source_lang} definition",
+                f"{target_lang} definition",
+            ]
+        else:
+            self.sequence_cols = sentence_cols
+
+        if score_col is None:
+            self.score_col = "is same"
+        else:
+            self.score_col = score_col
+
+        self.x_train = list()
+        self.y_train = list()
+        self.x_val = list()
+        self.y_val = list()
+        self.vocab = {"PAD"}
+        self.word_to_id = {"PAD": 0}
+        self.id_to_word = {0: "PAD"}
+        self.word_to_count = dict()
+        self.run()
+
+    def text_to_word_list(self, text):
+        """Preprocess and convert texts to a list of words"""
+        text = str(text)
+        text = text.split()
+        return text
+
+    def load_data(self):
+        # Load data set
+        data_df = pd.read_csv(self.data_file, sep="\t")
+
+        # Iterate over required sequences of provided dataset
+        for index, row in data_df.iterrows():
+            # Iterate through the text of both definitions of the row
+            for sequence in self.sequence_cols:
+                s2n = []  # Sequences with words replaced by indices
+                for word in self.text_to_word_list(row[sequence]):
+                    if word not in self.vocab:
+                        self.vocab.add(word)
+                        self.word_to_id[word] = self.vocab_size
+                        self.word_to_count[word] = 1
+                        s2n.append(self.vocab_size)
+                        self.id_to_word[self.vocab_size] = word
+                        self.vocab_size += 1
+                    else:
+                        self.word_to_count[word] += 1
+                        s2n.append(self.word_to_id[word])
+
+                # Replace |sequence as words| with |sequence as numbers| representation
+                data_df.at[index, sequence] = s2n
+        return data_df
+
+    def pad_sequences(self):
+        if self.max_len == 0:
+            self.max_len = max(
+                max(len(seq) for seq in self.x_train[0]),
+                max(len(seq) for seq in self.x_train[1]),
+                max(len(seq) for seq in self.x_val[0]),
+                max(len(seq) for seq in self.x_val[1]),
+            )
+
+        # Zero padding
+        for dataset, side in itertools.product([self.x_train, self.x_val], [0, 1]):
+            if self.max_len:
+                dataset[side] = pad_sequences(dataset[side], maxlen=self.max_len)
+            else:
+                dataset[side] = pad_sequences(dataset[side])
+
+    def run(self):
+        # Loading data and building vocabulary.
+        data_df = self.load_data()
+
+        X = data_df[self.sequence_cols]
+        Y = data_df[self.score_col]
+
+        self.x_train, self.x_val, self.y_train, self.y_val = split_data(
+            X, Y, test_size=self.instances, shuffle=False
+        )
+
+        # Split to lists
+        self.x_train = [self.x_train[column] for column in self.sequence_cols]
+        self.x_val = [self.x_val[column] for column in self.sequence_cols]
+
+        # Convert labels to their numpy representations
+        self.y_train = self.y_train.values
+        self.y_val = self.y_val.values
+
+        # Padding Sequences.
+        self.pad_sequences()
+
+
+class Get_Embedding(object):
+    def __init__(self, source_lang, target_lang, source_emb, target_emb, word_index):
+        self.embedding_size = 300  # Default dimensionality
+        self.embedding_matrix = self.create_embed_matrix(
+            source_lang, target_lang, source_emb, target_emb, word_index
+        )
+
+    def create_embed_matrix(
+        self, source_lang, target_lang, source_emb, target_emb, word_index
+    ):
+        source_vecs = KeyedVectors.load_word2vec_format(source_emb)
+        target_vecs = KeyedVectors.load_word2vec_format(target_emb)
+
+        # Prepare Embedding Matrix.
+        embedding_matrix = np.zeros((len(word_index) + 1, self.embedding_size))
+
+        # every word has "__<language>" appended, e.g. cane__italian
+        for key, i in word_index.items():
+            if "__" not in key:
+                print("Skipping {}".format(key))
+                continue
+
+            word, lang = key.split("__")
+
+            if lang == source_lang:
+                if word in source_vecs.vocab:
+                    embedding_matrix[i] = source_vecs.word_vec(word)
+            else:
+                if word in target_vecs.vocab:
+                    embedding_matrix[i] = target_vecs.word_vec(word)
+
+        del source_vecs
+        del target_vecs
+        return embedding_matrix
diff --git a/learn_and_predict.py b/learn_and_predict.py
new file mode 100644
index 0000000..36c56f2
--- /dev/null
+++ b/learn_and_predict.py
@@ -0,0 +1,202 @@
+import argparse
+import csv
+
+import numpy as np
+
+import keras
+import keras.backend as K
+from Helpers import Data, Get_Embedding
+from keras.layers import LSTM, Embedding, Input, Lambda, concatenate
+from keras.models import Model
+
+
+def get_learning_rate(epoch=None, model=None):
+    return np.round(float(K.get_value(model.optimizer.lr)), 5)
+
+
+def make_cosine_func(hidden_size=50):
+    def exponent_neg_cosine_similarity(x):
+        """Helper function for the similarity estimate of the LSTMs' outputs"""
+        leftNorm = K.l2_normalize(x[:, :hidden_size], axis=-1)
+        rightNorm = K.l2_normalize(x[:, hidden_size:], axis=-1)
+        return K.sum(K.prod([leftNorm, rightNorm], axis=0), axis=1, keepdims=True)
+
+    return exponent_neg_cosine_similarity
+
+
+def main(args):
+
+    source_lang = args.source_lang
+    target_lang = args.target_lang
+    hidden_size = args.hidden_size
+    max_len = args.max_len
+    num_iters = args.num_iters
+    data_file = args.data_file
+    learning_rate = args.learning_rate
+    batch = args.batch
+
+    data = Data(source_lang, target_lang, data_file, max_len)
+
+    x_train = data.x_train
+    y_train = data.y_train
+    x_predict = data.x_val
+    y_predict = data.y_val
+    vocab_size = data.vocab_size
+    max_len = data.max_len
+
+    # https://stackoverflow.com/a/10741692/3005749
+    x = data.y_val
+    y = np.bincount(x.astype(np.int32))
+    ii = np.nonzero(y)[0]
+    assert ii == 1
+    assert y[ii] == 1000  # hardcoded for now
+
+    if not batch:
+        print(f"Source Lang: {source_lang}")
+        print(f"Target Lang: {target_lang}")
+        print(f"Using {len(x_train[0])} pairs to learn")
+        print(f"Predicting {len(y_predict)} pairs")
+        print(f"Vocabulary size: {vocab_size}")
+        print(f"Maximum sequence length: {max_len}")
+
+    source_emb_file = args.source_emb_file
+    target_emb_file = args.target_emb_file
+
+    embedding = Get_Embedding(
+        source_lang, target_lang, source_emb_file, target_emb_file, data.word_to_id
+    )
+    embedding_size = embedding.embedding_matrix.shape[1]
+
+    seq_1 = Input(shape=(max_len,), dtype="int32", name="sequence1")
+    seq_2 = Input(shape=(max_len,), dtype="int32", name="sequence2")
+
+    embed_layer = Embedding(
+        output_dim=embedding_size,
+        input_dim=vocab_size + 1,
+        input_length=max_len,
+        trainable=False,
+    )
+    embed_layer.build((None,))
+    embed_layer.set_weights([embedding.embedding_matrix])
+
+    input_1 = embed_layer(seq_1)
+    input_2 = embed_layer(seq_2)
+
+    l1 = LSTM(units=hidden_size)
+
+    l1_out = l1(input_1)
+    l2_out = l1(input_2)
+
+    concats = concatenate([l1_out, l2_out], axis=-1)
+
+    out_func = make_cosine_func(hidden_size)
+
+    main_output = Lambda(out_func, output_shape=(1,))(concats)
+
+    model = Model(inputs=[seq_1, seq_2], outputs=[main_output])
+
+    opt = keras.optimizers.Adadelta(lr=learning_rate, clipnorm=1.25)
+
+    model.compile(optimizer=opt, loss="mean_squared_error", metrics=["accuracy"])
+    model.summary()
+
+    adjuster = keras.callbacks.ReduceLROnPlateau(
+        monitor="val_acc", patience=5, verbose=1, factor=0.5, min_lr=0.0001
+    )
+
+    history = model.fit(
+        x_train,
+        y_train,
+        validation_data=(x_predict, y_predict),
+        epochs=num_iters,
+        batch_size=32,
+        verbose=1,
+        callbacks=[adjuster],
+    )
+
+    target_sents = x_predict[1]
+    precision_at_one = 0
+    precision_at_ten = 0
+    for index, sent in enumerate(x_predict[0]):
+        source_sents = np.array([sent] * 1000)
+        to_predict = [source_sents, target_sents]
+        preds = model.predict(to_predict)
+        ind = np.argpartition(preds.ravel(), -10)[-10:]
+        if index in ind:
+            precision_at_ten += 1
+        if np.argmax(preds.ravel()) == index:
+            precision_at_one += 1
+
+    training_samples = len(x_train[0])
+    validation_samples = len(y_predict)
+    fields = [
+        source_lang,
+        target_lang,
+        training_samples,
+        validation_samples,
+        precision_at_one,
+        precision_at_ten,
+    ]
+
+    if not batch:
+        print(f"P@1: {precision_at_one/1000}, {precision_at_one} defs")
+    else:
+        with open("supervised.csv", "a") as f:
+            writer = csv.writer(f)
+            writer.writerow(fields)
+
+
+if __name__ == "__main__":
+
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "-sl", "--source_lang", type=str, help="Source language.", default="english"
+    )
+    parser.add_argument(
+        "-tl", "--target_lang", type=str, help="Target language.", default="italian"
+    )
+    parser.add_argument("-df", "--data_file", type=str, help="Path to dataset.")
+    parser.add_argument(
+        "-es",
+        "--source_emb_file",
+        type=str,
+        help="Path to Source (English) Embedding File.",
+    )
+    parser.add_argument(
+        "-et", "--target_emb_file", type=str, help="Path to Target Embedding File."
+    )
+    parser.add_argument(
+        "-l",
+        "--max_len",
+        type=int,
+        help="Maximum number of words in a sentence.",
+        default=20,
+    )
+    parser.add_argument(
+        "-z",
+        "--hidden_size",
+        type=int,
+        help="Number of Units in LSTM layer.",
+        default=50,
+    )
+    parser.add_argument(
+        "-b",
+        "--batch",
+        action="store_true",
+        help="running in batch (store results to csv) or "
+        + "running in a single instance (output the results)",
+    )
+    parser.add_argument(
+        "-n", "--num_iters", type=int, help="Number of iterations/epochs.", default=7
+    )
+    parser.add_argument(
+        "-lr",
+        "--learning_rate",
+        type=float,
+        help="Learning rate for optimizer.",
+        default=1.0,
+    )
+
+    args = parser.parse_args()
+    main(args)
-- 
cgit v1.2.3-70-g09d2
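
For reference, a minimal usage sketch of the new helpers, assuming a hypothetical toy file: the file name, tokens, and label values below are made up, while the column layout (one "<language> definition" column per language plus an "is same" label) and the "__<language>" token suffix convention come straight from Helpers.py.

    import pandas as pd

    from Helpers import Data

    # Build a tiny tab-separated dataset in the layout Data expects; every
    # token already carries its language suffix so Get_Embedding can route it
    # to the matching embedding space.
    rows = {
        "english definition": [
            "dog__english domestic__english animal__english",
            "cat__english small__english feline__english",
        ],
        "italian definition": [
            "cane__italian animale__italian domestico__italian",
            "gatto__italian piccolo__italian felino__italian",
        ],
        "is same": [1, 1],
    }
    pd.DataFrame(rows).to_csv("toy.tsv", sep="\t", index=False)

    # instances is the size of the validation split (1000 in the hardcoded
    # runs of this commit; 1 here so two toy rows are enough).
    data = Data("english", "italian", "toy.tsv", max_len=20, instances=1)
    print(data.vocab_size)        # 13: twelve distinct tokens plus PAD
    print(data.x_train[0].shape)  # (1, 20): one zero-padded source-side sequence

With a real dataset and word2vec-format embedding files (the three paths below are placeholders), the full model is trained and evaluated through learn_and_predict.py:

    python learn_and_predict.py -sl english -tl italian -df dataset.tsv \
        -es english.vec -et italian.vec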