Diffstat (limited to 'Helpers.py')
-rw-r--r--  Helpers.py  157
1 file changed, 157 insertions(+), 0 deletions(-)
diff --git a/Helpers.py b/Helpers.py
new file mode 100644
index 0000000..7c615ab
--- /dev/null
+++ b/Helpers.py
@@ -0,0 +1,157 @@
import itertools

import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split as split_data


class Data(object):
    def __init__(
        self,
        source_lang,
        target_lang,
        data_file,
        max_len=None,
        instances=1000,
        vocab_limit=None,
        sentence_cols=None,
        score_col=None,
    ):
        self.source_lang = source_lang
        self.target_lang = target_lang
        self.data_file = data_file
        self.max_len = max_len
        self.instances = instances
        self.vocab_size = 1  # index 0 is reserved for the PAD token
        self.vocab_limit = vocab_limit

        if sentence_cols is None:
            self.sequence_cols = [
                f"{source_lang} definition",
                f"{target_lang} definition",
            ]
        else:
            self.sequence_cols = sentence_cols

        if score_col is None:
            self.score_col = "is same"
        else:
            self.score_col = score_col

        self.x_train = list()
        self.y_train = list()
        self.x_val = list()
        self.y_val = list()
        self.vocab = {"PAD"}  # note: set("PAD") would yield {'P', 'A', 'D'}
        self.word_to_id = {"PAD": 0}
        self.id_to_word = {0: "PAD"}
        self.word_to_count = dict()
        self.run()

    def text_to_word_list(self, text):
        """Preprocess text and convert it to a list of words."""
        text = str(text)
        text = text.split()
        return text

    def load_data(self):
        # Load the data set
        data_df = pd.read_csv(self.data_file, sep="\t")

        # Iterate over the required sequence columns of the provided data set
        for index, row in data_df.iterrows():
            # Iterate through the text of both sentence columns of the row
            for sequence in self.sequence_cols:
                s2n = []  # Sequence with words replaced by their indices
                for word in self.text_to_word_list(row[sequence]):
                    if word not in self.vocab:
                        self.vocab.add(word)
                        self.word_to_id[word] = self.vocab_size
                        self.word_to_count[word] = 1
                        s2n.append(self.vocab_size)
                        self.id_to_word[self.vocab_size] = word
                        self.vocab_size += 1
                    else:
                        self.word_to_count[word] += 1
                        s2n.append(self.word_to_id[word])

                # Replace the word sequence with its index representation
                data_df.at[index, sequence] = s2n
        return data_df

    def pad_sequences(self):
        # Without an explicit max_len, pad to the longest sequence seen
        # across both sides of the training and validation sets, so that
        # all four input lists share a consistent shape.
        if not self.max_len:
            self.max_len = max(
                max(len(seq) for seq in self.x_train[0]),
                max(len(seq) for seq in self.x_train[1]),
                max(len(seq) for seq in self.x_val[0]),
                max(len(seq) for seq in self.x_val[1]),
            )

        # Zero padding (keras pad_sequences prepends zeros by default)
        for dataset, side in itertools.product([self.x_train, self.x_val], [0, 1]):
            dataset[side] = pad_sequences(dataset[side], maxlen=self.max_len)

    def run(self):
        # Loading data and building vocabulary.
        data_df = self.load_data()

        X = data_df[self.sequence_cols]
        Y = data_df[self.score_col]

        self.x_train, self.x_val, self.y_train, self.y_val = split_data(
            X, Y, test_size=self.instances, shuffle=False
        )

        # Split to lists
        self.x_train = [self.x_train[column] for column in self.sequence_cols]
        self.x_val = [self.x_val[column] for column in self.sequence_cols]

        # Convert labels to their numpy representations
        self.y_train = self.y_train.values
        self.y_val = self.y_val.values

        # Padding sequences.
        self.pad_sequences()


class Get_Embedding(object):
    def __init__(self, source_lang, target_lang, source_emb, target_emb, word_index):
        self.embedding_size = 300  # Default dimensionality
        self.embedding_matrix = self.create_embed_matrix(
            source_lang, target_lang, source_emb, target_emb, word_index
        )

    def create_embed_matrix(
        self, source_lang, target_lang, source_emb, target_emb, word_index
    ):
        # Note: .vocab and .word_vec() are the gensim < 4.0 API; gensim 4.x
        # replaces them with .key_to_index and .get_vector().
        source_vecs = KeyedVectors.load_word2vec_format(source_emb)
        target_vecs = KeyedVectors.load_word2vec_format(target_emb)

        # Prepare the embedding matrix; row 0 stays all-zero for PAD.
        embedding_matrix = np.zeros((len(word_index) + 1, self.embedding_size))

        # Each key is expected to carry a "__<lang>" suffix marking its language
        for key, i in word_index.items():
            if "__" not in key:
                print("Skipping {}".format(key))
                continue

            word, lang = key.split("__")

            if lang == source_lang:
                if word in source_vecs.vocab:
                    embedding_matrix[i] = source_vecs.word_vec(word)
            else:
                if word in target_vecs.vocab:
                    embedding_matrix[i] = target_vecs.word_vec(word)

        # Free the loaded vectors once the matrix is built
        del source_vecs
        del target_vecs
        return embedding_matrix
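
For orientation, below is a minimal usage sketch (not part of the committed file) showing how the two helpers might be wired together. All file names and language labels are illustrative assumptions. One seam to be aware of: Data stores bare words in word_to_id, while Get_Embedding.create_embed_matrix expects keys of the form <word>__<lang>, so the suffixing step below is purely hypothetical.

# Hypothetical usage sketch; paths, languages, and the suffixing step
# are assumptions for illustration only.
data = Data(
    source_lang="en",
    target_lang="fr",
    data_file="definitions.tsv",  # assumed tab-separated input file
    max_len=50,
)

# Data stores bare words; Get_Embedding expects "<word>__<lang>" keys.
# A real pipeline would need per-word language information -- here every
# word is naively tagged as source-language just to show the shapes.
word_index = {f"{w}__en": i for w, i in data.word_to_id.items()}

emb = Get_Embedding("en", "fr", "wiki.en.vec", "wiki.fr.vec", word_index)
print(emb.embedding_matrix.shape)  # (len(word_index) + 1, 300)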
