-rw-r--r--   scripts/tsv_creator.py   160
1 file changed, 93 insertions, 67 deletions
diff --git a/scripts/tsv_creator.py b/scripts/tsv_creator.py
index 7d587c5..f7df95c 100644
--- a/scripts/tsv_creator.py
+++ b/scripts/tsv_creator.py
@@ -1,62 +1,41 @@
 import argparse
+import csv
+import math
+import random
+import unicodedata
+from re import sub
 
-parser = argparse.ArgumentParser(description='Create .tsv file from two wordnet definitions')
-parser.add_argument('source_lang', help='source language short name')
-parser.add_argument('target_lang', help='target language short name')
-parser.add_argument('source_defs', help='path of the source definitions')
-parser.add_argument('target_defs', help='path of the target definitions')
-parser.add_argument('-n', '--set_aside', help='set aside to validate on', type=int)
+from mosestokenizer import MosesTokenizer
 
-args = parser.parse_args()
 
-source_lang = args.source_lang
-target_lang = args.target_lang
+def load_def_from_file(def_filename):
+    """
+    def_filename: full path of the definition file
+    returns: list of dictionary definitions
 
-from DataHelper.Loader import load_def_from_file as load_def
+    """
+    return [
+        unicodedata.normalize("NFC", line.rstrip("\n"))
+        for line in open(def_filename, encoding="utf8")
+    ]
 
-source_defs_filename = args.source_defs
-target_defs_filename = args.target_defs
-defs_source = load_def(source_defs_filename)
-defs_target = load_def(target_defs_filename)
-
-import numpy as np
-from re import sub
-from mosestokenizer import *
 
 def clean_corpus_suffix(corpus, language):
-    '''
+    """
     Adds '__target-language' and '__source-language' at the end of the words
-    '''
+    """
     clean_corpus = []
     tokenize = MosesTokenizer(language)
     for definition in corpus:
-        definition = sub(r"'", '', definition)
-        definition = sub(r"[^\w]", ' ', definition)
+        definition = sub(r"'", "", definition)
+        definition = sub(r"[^\w]", " ", definition)
         clean_doc = []
         words = tokenize(definition)
         for word in words:
-            clean_doc.append(word + '__%s' % language)
-        clean_corpus.append(' '.join(clean_doc))
+            clean_doc.append(word + "__%s" % language)
+        clean_corpus.append(" ".join(clean_doc))
     return clean_corpus
 
-clean_source_corpus = clean_corpus_suffix(defs_source, source_lang)
-clean_target_corpus = clean_corpus_suffix(defs_target, target_lang)
-
-assert len(clean_source_corpus) == len(clean_target_corpus)
-
-set_aside = args.set_aside
-
-source_predict = clean_source_corpus[-set_aside:]
-target_predict = clean_target_corpus[-set_aside:]
-labels_predict = [1] * set_aside # placeholder, won't be used, we can use 1 because they're correct
-
-clean_source_corpus = clean_source_corpus[:-set_aside]
-clean_target_corpus = clean_target_corpus[:-set_aside]
-
-size = len(clean_source_corpus)
-
-import math
-import random
 
 def create_pos_neg_samples(length):
     indices = list(range(length))
| @@ -67,35 +46,82 @@ def create_pos_neg_samples(length): | |||
| 67 | 46 | ||
| 68 | for (index, point) in zip(neg_indices, neg_points): | 47 | for (index, point) in zip(neg_indices, neg_points): |
| 69 | indices[point] = index | 48 | indices[point] = index |
| 70 | |||
| 71 | labels = [1] * length | 49 | labels = [1] * length |
| 72 | |||
| 73 | for i in neg_points: | 50 | for i in neg_points: |
| 74 | labels[i] = 0 | 51 | labels[i] = 0 |
| 75 | 52 | ||
| 76 | return indices, labels | 53 | return indices, labels |
| 77 | 54 | ||
| 78 | while True: | ||
| 79 | indices, labels = create_pos_neg_samples(size) | ||
| 80 | shuffled_target = [clean_target_corpus[index] for index in indices] | ||
| 81 | check = [clean for clean, shuf in zip(clean_target_corpus, shuffled_target) if clean == shuf] | ||
| 82 | halfsize = math.ceil(size/2) | ||
| 83 | try: | ||
| 84 | assert len(check) == halfsize | ||
| 85 | except AssertionError: | ||
| 86 | print(f'rolling again: {len(check)} vs {halfsize}') | ||
| 87 | else: | ||
| 88 | break | ||
| 89 | |||
| 90 | assert len(clean_source_corpus) == len(shuffled_target) == size | ||
| 91 | assert len(labels) == len(clean_source_corpus) == len(shuffled_target) | ||
| 92 | |||
| 93 | import csv | ||
| 94 | 55 | ||
| 95 | with open(f'/home/syigit/tsv_data/{source_lang}_{target_lang}_1000_data.tsv', 'w', encoding='utf8', newline='') as tsv_file: | 56 | def main(args): |
| 96 | tsv_writer = csv.writer(tsv_file, delimiter='\t', lineterminator='\n') | 57 | |
| 97 | tsv_writer.writerow([f'{source_lang} definition', f'{target_lang} definition', 'is same']) | 58 | source_lang = args.source_lang |
| 98 | for row in zip(clean_source_corpus, shuffled_target, labels): | 59 | target_lang = args.target_lang |
| 99 | tsv_writer.writerow(row) | 60 | |
| 100 | for row in zip(source_predict, target_predict, labels_predict): | 61 | source_defs_filename = args.source_defs |
| 101 | tsv_writer.writerow(row) | 62 | target_defs_filename = args.target_defs |
| 63 | defs_source = load_def(source_defs_filename) | ||
| 64 | defs_target = load_def(target_defs_filename) | ||
| 65 | |||
| 66 | clean_source_corpus = clean_corpus_suffix(defs_source, source_lang) | ||
| 67 | clean_target_corpus = clean_corpus_suffix(defs_target, target_lang) | ||
| 68 | |||
| 69 | assert len(clean_source_corpus) == len(clean_target_corpus) | ||
| 70 | |||
| 71 | set_aside = args.set_aside | ||
| 72 | |||
| 73 | source_predict = clean_source_corpus[-set_aside:] | ||
| 74 | target_predict = clean_target_corpus[-set_aside:] | ||
| 75 | labels_predict = [ | ||
| 76 | 1 | ||
| 77 | ] * set_aside # placeholder, won't be used, we can use 1 because they're correct | ||
| 78 | |||
| 79 | clean_source_corpus = clean_source_corpus[:-set_aside] | ||
| 80 | clean_target_corpus = clean_target_corpus[:-set_aside] | ||
| 81 | |||
| 82 | size = len(clean_source_corpus) | ||
| 83 | |||
| 84 | while True: | ||
| 85 | indices, labels = create_pos_neg_samples(size) | ||
| 86 | shuffled_target = [clean_target_corpus[index] for index in indices] | ||
| 87 | check = [ | ||
| 88 | clean | ||
| 89 | for clean, shuf in zip(clean_target_corpus, shuffled_target) | ||
| 90 | if clean == shuf | ||
| 91 | ] | ||
| 92 | halfsize = math.ceil(size / 2) | ||
| 93 | try: | ||
| 94 | assert len(check) == halfsize | ||
| 95 | except AssertionError: | ||
| 96 | print(f"rolling again: {len(check)} vs {halfsize}") | ||
| 97 | else: | ||
| 98 | break | ||
| 99 | |||
| 100 | assert len(clean_source_corpus) == len(shuffled_target) == size | ||
| 101 | assert len(labels) == len(clean_source_corpus) == len(shuffled_target) | ||
| 102 | |||
| 103 | with open( | ||
| 104 | f"{source_lang}_to_{target_lang}.tsv", "w", encoding="utf8", newline="" | ||
| 105 | ) as tsv_file: | ||
| 106 | tsv_writer = csv.writer(tsv_file, delimiter="\t", lineterminator="\n") | ||
| 107 | tsv_writer.writerow( | ||
| 108 | [f"{source_lang} definition", f"{target_lang} definition", "is same"] | ||
| 109 | ) | ||
| 110 | for row in zip(clean_source_corpus, shuffled_target, labels): | ||
| 111 | tsv_writer.writerow(row) | ||
| 112 | for row in zip(source_predict, target_predict, labels_predict): | ||
| 113 | tsv_writer.writerow(row) | ||
| 114 | |||
| 115 | |||
| 116 | if __name__ == "__main__": | ||
| 117 | |||
| 118 | parser = argparse.ArgumentParser( | ||
| 119 | description="Create .tsv file from two wordnet definitions" | ||
| 120 | ) | ||
| 121 | parser.add_argument("source_lang", help="source language short name") | ||
| 122 | parser.add_argument("target_lang", help="target language short name") | ||
| 123 | parser.add_argument("source_defs", help="path of the source definitions") | ||
| 124 | parser.add_argument("target_defs", help="path of the target definitions") | ||
| 125 | parser.add_argument("-n", "--set_aside", help="set aside to validate on", type=int) | ||
| 126 | args = parser.parse_args() | ||
| 127 | main(args) | ||
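
Note: the unchanged lines between the two hunks hide how `neg_points` and `neg_indices` are built inside `create_pos_neg_samples`. Below is a minimal, self-contained sketch of the sampling scheme that is consistent with the visible code and with the `len(check) == math.ceil(size / 2)` assertion in `main()`; it assumes both lists are drawn with `random.sample`, and `create_pos_neg_samples_sketch` is an illustrative stand-in, not the committed function.

    import math
    import random


    def create_pos_neg_samples_sketch(length):
        # Start from the identity mapping: row i keeps its own target.
        indices = list(range(length))
        # Corrupt floor(length / 2) rows so that ceil(length / 2) stay aligned.
        n_neg = length // 2
        neg_points = random.sample(range(length), n_neg)  # rows to corrupt
        neg_indices = random.sample(neg_points, n_neg)    # permuted wrong targets

        # Reroute each corrupted row to a (usually) different target...
        for index, point in zip(neg_indices, neg_points):
            indices[point] = index
        # ...and label corrupted rows 0, aligned rows 1.
        labels = [1] * length
        for i in neg_points:
            labels[i] = 0
        return indices, labels


    indices, labels = create_pos_neg_samples_sketch(10)
    print(indices, labels)

Because `neg_indices` is a permutation of `neg_points`, a corrupted row can land back on its own target (a fixed point), and duplicate definitions can also collide; that is why the `while True` loop in `main()` rerolls until exactly `math.ceil(size / 2)` rows still match their original target.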
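For reference, a hedged usage sketch of the refactored script; the language codes and definition-file paths are placeholders, and the output filename follows the f-string in `main()`.

    import csv

    # Hypothetical invocation (codes and paths are assumptions):
    #   python scripts/tsv_creator.py en it defs.en defs.it -n 1000
    #
    # Read back the resulting file, named "<source>_to_<target>.tsv":
    with open("en_to_it.tsv", encoding="utf8", newline="") as tsv_file:
        reader = csv.reader(tsv_file, delimiter="\t")
        next(reader)  # header: ['en definition', 'it definition', 'is same']
        rows = list(reader)

    # The last `set_aside` rows are the held-out pairs, all labelled 1.
    negatives = sum(1 for _, _, is_same in rows if is_same == "0")
    print(f"{len(rows)} rows, {negatives} shuffled negatives")

One caveat: `-n`/`--set_aside` is effectively required, since `main()` slices with `set_aside` unconditionally; omitting the flag leaves it as `None` and the `[-set_aside:]` slice raises a `TypeError`.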
