diff options
-rw-r--r-- | scripts/tsv_creator.py | 160 |
1 files changed, 93 insertions, 67 deletions
diff --git a/scripts/tsv_creator.py b/scripts/tsv_creator.py index 7d587c5..f7df95c 100644 --- a/scripts/tsv_creator.py +++ b/scripts/tsv_creator.py | |||
@@ -1,62 +1,41 @@ | |||
1 | import argparse | 1 | import argparse |
2 | import csv | ||
3 | import math | ||
4 | import random | ||
5 | import unicodedata | ||
6 | from re import sub | ||
2 | 7 | ||
3 | parser = argparse.ArgumentParser(description='Create .tsv file from two wordnet definitions') | 8 | from mosestokenizer import MosesTokenizer |
4 | parser.add_argument('source_lang', help='source language short name') | ||
5 | parser.add_argument('target_lang', help='target language short name') | ||
6 | parser.add_argument('source_defs', help='path of the source definitions') | ||
7 | parser.add_argument('target_defs', help='path of the target definitions') | ||
8 | parser.add_argument('-n', '--set_aside', help='set aside to validate on', type=int) | ||
9 | 9 | ||
10 | args = parser.parse_args() | ||
11 | 10 | ||
def load_def_from_file(def_filename):
    """Load dictionary definitions from a text file, one definition per line.

    The trailing newline of every line is removed and the text is normalized
    to Unicode NFC (composed form). Empty lines are kept as empty strings so
    that line positions stay aligned with the file.

    def_filename: full path of the definition file (read as UTF-8)
    returns: list of definition strings, in file order
    """
    # Use a context manager so the handle is closed as soon as the lines are
    # read instead of lingering until garbage collection.
    with open(def_filename, encoding="utf8") as def_file:
        return [
            unicodedata.normalize("NFC", line.rstrip("\n")) for line in def_file
        ]
16 | 21 | ||
17 | source_defs_filename = args.source_defs | ||
18 | target_defs_filename = args.target_defs | ||
19 | defs_source = load_def(source_defs_filename) | ||
20 | defs_target = load_def(target_defs_filename) | ||
21 | |||
22 | import numpy as np | ||
23 | from re import sub | ||
24 | from mosestokenizer import * | ||
25 | 22 | ||
def clean_corpus_suffix(corpus, language):
    """Strip punctuation from each definition and tag every token with the language.

    For each definition, apostrophes are deleted, every other non-word
    character is replaced by a space, the result is tokenized with a
    MosesTokenizer built for ``language``, and ``__<language>`` is appended to
    each token (for example ``dog`` becomes ``dog__en`` when ``language`` is
    ``"en"``).

    corpus: iterable of definition strings
    language: language short name; passed to MosesTokenizer and used as suffix
    returns: list with one space-joined string of suffixed tokens per definition
    """
    suffix = "__%s" % language
    clean_corpus = []
    # The tokenizer is used as a context manager so it is closed when the
    # corpus has been processed instead of being left open.
    # NOTE(review): relies on MosesTokenizer supporting the `with` protocol as
    # shown in the mosestokenizer README -- confirm for the installed version.
    with MosesTokenizer(language) as tokenize:
        for definition in corpus:
            # Drop apostrophes first so contractions collapse into one token
            # rather than being split by the space substitution below.
            definition = sub(r"'", "", definition)
            definition = sub(r"[^\w]", " ", definition)
            clean_corpus.append(
                " ".join(word + suffix for word in tokenize(definition))
            )
    return clean_corpus
41 | 38 | ||
42 | clean_source_corpus = clean_corpus_suffix(defs_source, source_lang) | ||
43 | clean_target_corpus = clean_corpus_suffix(defs_target, target_lang) | ||
44 | |||
45 | assert len(clean_source_corpus) == len(clean_target_corpus) | ||
46 | |||
47 | set_aside = args.set_aside | ||
48 | |||
49 | source_predict = clean_source_corpus[-set_aside:] | ||
50 | target_predict = clean_target_corpus[-set_aside:] | ||
51 | labels_predict = [1] * set_aside # placeholder, won't be used, we can use 1 because they're correct | ||
52 | |||
53 | clean_source_corpus = clean_source_corpus[:-set_aside] | ||
54 | clean_target_corpus = clean_target_corpus[:-set_aside] | ||
55 | |||
56 | size = len(clean_source_corpus) | ||
57 | |||
58 | import math | ||
59 | import random | ||
60 | 39 | ||
61 | def create_pos_neg_samples(length): | 40 | def create_pos_neg_samples(length): |
62 | indices = list(range(length)) | 41 | indices = list(range(length)) |
@@ -67,35 +46,82 @@ def create_pos_neg_samples(length): | |||
67 | 46 | ||
68 | for (index, point) in zip(neg_indices, neg_points): | 47 | for (index, point) in zip(neg_indices, neg_points): |
69 | indices[point] = index | 48 | indices[point] = index |
70 | |||
71 | labels = [1] * length | 49 | labels = [1] * length |
72 | |||
73 | for i in neg_points: | 50 | for i in neg_points: |
74 | labels[i] = 0 | 51 | labels[i] = 0 |
75 | 52 | ||
76 | return indices, labels | 53 | return indices, labels |
77 | 54 | ||
78 | while True: | ||
79 | indices, labels = create_pos_neg_samples(size) | ||
80 | shuffled_target = [clean_target_corpus[index] for index in indices] | ||
81 | check = [clean for clean, shuf in zip(clean_target_corpus, shuffled_target) if clean == shuf] | ||
82 | halfsize = math.ceil(size/2) | ||
83 | try: | ||
84 | assert len(check) == halfsize | ||
85 | except AssertionError: | ||
86 | print(f'rolling again: {len(check)} vs {halfsize}') | ||
87 | else: | ||
88 | break | ||
89 | |||
90 | assert len(clean_source_corpus) == len(shuffled_target) == size | ||
91 | assert len(labels) == len(clean_source_corpus) == len(shuffled_target) | ||
92 | |||
93 | import csv | ||
94 | 55 | ||
def main(args):
    """Build the paired-definition TSV described by the parsed arguments.

    Loads the source and target definition files, cleans and language-tags
    both corpora, holds out the last ``set_aside`` pairs, re-pairs the
    remaining target definitions via ``create_pos_neg_samples`` and writes
    everything to ``<source_lang>_to_<target_lang>.tsv`` (a relative path, so
    it lands in the current working directory).

    args: namespace with ``source_lang``, ``target_lang``, ``source_defs``,
        ``target_defs`` and ``set_aside`` attributes
    raises: ValueError if ``set_aside`` is negative
    """
    source_lang = args.source_lang
    target_lang = args.target_lang

    # Call the loader by the name it is defined under in this module; the
    # former `load_def` alias came from an import that is no longer present.
    defs_source = load_def_from_file(args.source_defs)
    defs_target = load_def_from_file(args.target_defs)

    clean_source_corpus = clean_corpus_suffix(defs_source, source_lang)
    clean_target_corpus = clean_corpus_suffix(defs_target, target_lang)

    assert len(clean_source_corpus) == len(clean_target_corpus)

    # A missing value means "hold nothing out".
    set_aside = args.set_aside or 0
    if set_aside < 0:
        raise ValueError(f"set_aside must be non-negative, got {set_aside}")

    # Split at an explicit index instead of slicing with -set_aside: with a
    # value of 0, `[-0:]` selects the whole list and `[:-0]` selects nothing.
    split_at = max(len(clean_source_corpus) - set_aside, 0)
    source_predict = clean_source_corpus[split_at:]
    target_predict = clean_target_corpus[split_at:]
    # placeholder, won't be used, we can use 1 because they're correct
    labels_predict = [1] * len(source_predict)

    clean_source_corpus = clean_source_corpus[:split_at]
    clean_target_corpus = clean_target_corpus[:split_at]

    size = len(clean_source_corpus)
    halfsize = math.ceil(size / 2)

    # Re-draw the pairing until exactly ceil(size / 2) target definitions are
    # textually identical to the one originally at the same position.
    while True:
        indices, labels = create_pos_neg_samples(size)
        shuffled_target = [clean_target_corpus[index] for index in indices]
        unchanged = sum(
            1
            for clean, shuf in zip(clean_target_corpus, shuffled_target)
            if clean == shuf
        )
        if unchanged == halfsize:
            break
        print(f"rolling again: {unchanged} vs {halfsize}")

    assert len(clean_source_corpus) == len(shuffled_target) == size
    assert len(labels) == len(clean_source_corpus) == len(shuffled_target)

    with open(
        f"{source_lang}_to_{target_lang}.tsv", "w", encoding="utf8", newline=""
    ) as tsv_file:
        tsv_writer = csv.writer(tsv_file, delimiter="\t", lineterminator="\n")
        tsv_writer.writerow(
            [f"{source_lang} definition", f"{target_lang} definition", "is same"]
        )
        # Re-paired rows first, then the held-out rows with placeholder labels.
        tsv_writer.writerows(zip(clean_source_corpus, shuffled_target, labels))
        tsv_writer.writerows(zip(source_predict, target_predict, labels_predict))
114 | |||
115 | |||
if __name__ == "__main__":
    # Command-line entry point: parse the four positional arguments and the
    # hold-out size, then hand the namespace to main().
    parser = argparse.ArgumentParser(
        description="Create .tsv file from two wordnet definitions"
    )
    parser.add_argument("source_lang", help="source language short name")
    parser.add_argument("target_lang", help="target language short name")
    parser.add_argument("source_defs", help="path of the source definitions")
    parser.add_argument("target_defs", help="path of the target definitions")
    # Required so the hold-out size is always an explicit integer rather than
    # None; omitting it now yields a usage error instead of a traceback.
    parser.add_argument(
        "-n",
        "--set_aside",
        help="set aside to validate on",
        type=int,
        required=True,
    )
    args = parser.parse_args()
    main(args)