-rw-r--r--  scripts/tsv_creator.py  160
1 file changed, 93 insertions(+), 67 deletions(-)
diff --git a/scripts/tsv_creator.py b/scripts/tsv_creator.py
index 7d587c5..f7df95c 100644
--- a/scripts/tsv_creator.py
+++ b/scripts/tsv_creator.py
@@ -1,62 +1,41 @@
 import argparse
+import csv
+import math
+import random
+import unicodedata
+from re import sub
 
-parser = argparse.ArgumentParser(description='Create .tsv file from two wordnet definitions')
-parser.add_argument('source_lang', help='source language short name')
-parser.add_argument('target_lang', help='target language short name')
-parser.add_argument('source_defs', help='path of the source definitions')
-parser.add_argument('target_defs', help='path of the target definitions')
-parser.add_argument('-n', '--set_aside', help='set aside to validate on', type=int)
+from mosestokenizer import MosesTokenizer
 
-args = parser.parse_args()
 
-source_lang = args.source_lang
-target_lang = args.target_lang
+def load_def_from_file(def_filename):
+    """
+    def_filename: full path of the definition file
+    returns: list of dictionary definitions
 
-from DataHelper.Loader import load_def_from_file as load_def
+    """
+    return [
+        unicodedata.normalize("NFC", line.rstrip("\n"))
+        for line in open(def_filename, encoding="utf8")
+    ]
 
-source_defs_filename = args.source_defs
-target_defs_filename = args.target_defs
-defs_source = load_def(source_defs_filename)
-defs_target = load_def(target_defs_filename)
-
-import numpy as np
-from re import sub
-from mosestokenizer import *
 
 def clean_corpus_suffix(corpus, language):
-    '''
+    """
     Adds '__target-language' and '__source-language' at the end of the words
-    '''
+    """
     clean_corpus = []
     tokenize = MosesTokenizer(language)
     for definition in corpus:
-        definition = sub(r"'", '', definition)
-        definition = sub(r"[^\w]", ' ', definition)
+        definition = sub(r"'", "", definition)
+        definition = sub(r"[^\w]", " ", definition)
         clean_doc = []
         words = tokenize(definition)
         for word in words:
-            clean_doc.append(word + '__%s' % language)
-        clean_corpus.append(' '.join(clean_doc))
+            clean_doc.append(word + "__%s" % language)
+        clean_corpus.append(" ".join(clean_doc))
     return clean_corpus
 
 
 def create_pos_neg_samples(length):
     indices = list(range(length))
@@ -67,35 +46,82 @@ def create_pos_neg_samples(length):
 
     for (index, point) in zip(neg_indices, neg_points):
         indices[point] = index
-
     labels = [1] * length
-
     for i in neg_points:
         labels[i] = 0
 
     return indices, labels
 
-while True:
-    indices, labels = create_pos_neg_samples(size)
-    shuffled_target = [clean_target_corpus[index] for index in indices]
-    check = [clean for clean, shuf in zip(clean_target_corpus, shuffled_target) if clean == shuf]
-    halfsize = math.ceil(size/2)
-    try:
-        assert len(check) == halfsize
-    except AssertionError:
-        print(f'rolling again: {len(check)} vs {halfsize}')
-    else:
-        break
-
-assert len(clean_source_corpus) == len(shuffled_target) == size
-assert len(labels) == len(clean_source_corpus) == len(shuffled_target)
-
-import csv
 
-with open(f'/home/syigit/tsv_data/{source_lang}_{target_lang}_1000_data.tsv', 'w', encoding='utf8', newline='') as tsv_file:
-    tsv_writer = csv.writer(tsv_file, delimiter='\t', lineterminator='\n')
-    tsv_writer.writerow([f'{source_lang} definition', f'{target_lang} definition', 'is same'])
-    for row in zip(clean_source_corpus, shuffled_target, labels):
-        tsv_writer.writerow(row)
-    for row in zip(source_predict, target_predict, labels_predict):
-        tsv_writer.writerow(row)
+def main(args):
+
+    source_lang = args.source_lang
+    target_lang = args.target_lang
+
+    source_defs_filename = args.source_defs
+    target_defs_filename = args.target_defs
+    defs_source = load_def_from_file(source_defs_filename)
+    defs_target = load_def_from_file(target_defs_filename)
+
+    clean_source_corpus = clean_corpus_suffix(defs_source, source_lang)
+    clean_target_corpus = clean_corpus_suffix(defs_target, target_lang)
+
+    assert len(clean_source_corpus) == len(clean_target_corpus)
+
+    set_aside = args.set_aside
+
+    source_predict = clean_source_corpus[-set_aside:]
+    target_predict = clean_target_corpus[-set_aside:]
+    labels_predict = [
+        1
+    ] * set_aside  # placeholder, won't be used, we can use 1 because they're correct
+
+    clean_source_corpus = clean_source_corpus[:-set_aside]
+    clean_target_corpus = clean_target_corpus[:-set_aside]
+
+    size = len(clean_source_corpus)
+
+    while True:
+        indices, labels = create_pos_neg_samples(size)
+        shuffled_target = [clean_target_corpus[index] for index in indices]
+        check = [
+            clean
+            for clean, shuf in zip(clean_target_corpus, shuffled_target)
+            if clean == shuf
+        ]
+        halfsize = math.ceil(size / 2)
+        try:
+            assert len(check) == halfsize
+        except AssertionError:
+            print(f"rolling again: {len(check)} vs {halfsize}")
+        else:
+            break
+
+    assert len(clean_source_corpus) == len(shuffled_target) == size
+    assert len(labels) == len(clean_source_corpus) == len(shuffled_target)
+
+    with open(
+        f"{source_lang}_to_{target_lang}.tsv", "w", encoding="utf8", newline=""
+    ) as tsv_file:
+        tsv_writer = csv.writer(tsv_file, delimiter="\t", lineterminator="\n")
+        tsv_writer.writerow(
+            [f"{source_lang} definition", f"{target_lang} definition", "is same"]
+        )
+        for row in zip(clean_source_corpus, shuffled_target, labels):
+            tsv_writer.writerow(row)
+        for row in zip(source_predict, target_predict, labels_predict):
+            tsv_writer.writerow(row)
+
+
+if __name__ == "__main__":
+
+    parser = argparse.ArgumentParser(
+        description="Create .tsv file from two wordnet definitions"
+    )
+    parser.add_argument("source_lang", help="source language short name")
+    parser.add_argument("target_lang", help="target language short name")
+    parser.add_argument("source_defs", help="path of the source definitions")
+    parser.add_argument("target_defs", help="path of the target definitions")
+    parser.add_argument("-n", "--set_aside", help="set aside to validate on", type=int)
+    args = parser.parse_args()
+    main(args)
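
Usage sketch (an illustration under stated assumptions, not part of the commit): with mosestokenizer installed and hypothetical definition files en_defs.txt / it_defs.txt holding one definition per line, the refactored script would be invoked as

    python scripts/tsv_creator.py en it en_defs.txt it_defs.txt -n 1000

and would write en_to_it.tsv in the working directory: a header row, the training pairs labeled 1 (aligned) or 0 (shuffled negative), then the 1000 held-out pairs appended with placeholder label 1. Note that -n is effectively required: set_aside is used in slice expressions unconditionally, so omitting it leaves it as None and unary minus on None raises a TypeError. A minimal Python check of the output, under the same assumptions:

    import csv

    with open("en_to_it.tsv", encoding="utf8", newline="") as f:
        reader = csv.reader(f, delimiter="\t")
        header = next(reader)  # ['en definition', 'it definition', 'is same']
        for source_def, target_def, label in reader:
            assert label in ("0", "1")  # 0 = shuffled negative, 1 = aligned pair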