aboutsummaryrefslogtreecommitdiffstats
path: root/scripts
diff options
context:
space:
mode:
authorYigit Sever2019-09-25 21:16:29 +0300
committerYigit Sever2019-09-25 21:16:29 +0300
commite7340a15520523402099ef13a462c034fe58f51a (patch)
treec7b31fe768006c051ed35bb4f89e12f05e2084ec /scripts
parent4d117258017fb1518d7ce7242e5a9c5a780d70d7 (diff)
downloadEvaluating-Dictionary-Alignment-e7340a15520523402099ef13a462c034fe58f51a.tar.gz
Evaluating-Dictionary-Alignment-e7340a15520523402099ef13a462c034fe58f51a.tar.bz2
Evaluating-Dictionary-Alignment-e7340a15520523402099ef13a462c034fe58f51a.zip
Include .tsv creator
Diffstat (limited to 'scripts')
-rw-r--r--scripts/tsv_creator.py101
1 files changed, 101 insertions, 0 deletions
diff --git a/scripts/tsv_creator.py b/scripts/tsv_creator.py
new file mode 100644
index 0000000..7d587c5
--- /dev/null
+++ b/scripts/tsv_creator.py
@@ -0,0 +1,101 @@
"""Create a .tsv dataset of aligned (source, target) wordnet definitions.

Pairs two definition files line-by-line, tags tokens by language, scrambles
half the pairs as negatives, and writes the result as a tab-separated file.
"""
import argparse

parser = argparse.ArgumentParser(description='Create .tsv file from two wordnet definitions')
parser.add_argument('source_lang', help='source language short name')
parser.add_argument('target_lang', help='target language short name')
parser.add_argument('source_defs', help='path of the source definitions')
parser.add_argument('target_defs', help='path of the target definitions')
# NOTE(review): optional flag -- args.set_aside is None when omitted, which
# the slicing further down does not handle; confirm -n is always passed.
parser.add_argument('-n', '--set_aside', help='set aside to validate on', type=int)

args = parser.parse_args()

source_lang = args.source_lang
target_lang = args.target_lang

# NOTE(review): imports are scattered mid-script instead of at the top
# (PEP 8); kept in place to avoid changing module-level execution order.
from DataHelper.Loader import load_def_from_file as load_def

source_defs_filename = args.source_defs
target_defs_filename = args.target_defs
# One definition per line is presumed (source and target are later paired
# index-by-index) -- TODO confirm against DataHelper.Loader.
defs_source = load_def(source_defs_filename)
defs_target = load_def(target_defs_filename)

import numpy as np
from re import sub
from mosestokenizer import *
def clean_corpus_suffix(corpus, language):
    '''
    Tokenize definitions and tag every token with its language.

    Strips apostrophes, maps all remaining non-word characters to spaces,
    tokenizes with Moses, and appends '__<language>' to each token so the
    source and target vocabularies stay disjoint when mixed later
    (e.g. 'dog' -> 'dog__en').

    :param corpus: iterable of definition strings
    :param language: short language code understood by MosesTokenizer
    :return: list of cleaned, suffixed definitions (same order and length)
    '''
    clean_corpus = []
    # MosesTokenizer drives an external tokenizer process; the original
    # never closed it (leaked subprocess). Using it as a context manager
    # shuts the process down when tokenization is done.
    with MosesTokenizer(language) as tokenize:
        for definition in corpus:
            definition = sub(r"'", '', definition)        # drop apostrophes entirely
            definition = sub(r"[^\w]", ' ', definition)   # any other punctuation -> space
            words = tokenize(definition)
            clean_corpus.append(' '.join(word + '__%s' % language for word in words))
    return clean_corpus
41
42clean_source_corpus = clean_corpus_suffix(defs_source, source_lang)
43clean_target_corpus = clean_corpus_suffix(defs_target, target_lang)
44
45assert len(clean_source_corpus) == len(clean_target_corpus)
46
47set_aside = args.set_aside
48
49source_predict = clean_source_corpus[-set_aside:]
50target_predict = clean_target_corpus[-set_aside:]
51labels_predict = [1] * set_aside # placeholder, won't be used, we can use 1 because they're correct
52
53clean_source_corpus = clean_source_corpus[:-set_aside]
54clean_target_corpus = clean_target_corpus[:-set_aside]
55
56size = len(clean_source_corpus)
57
58import math
59import random
60
def create_pos_neg_samples(length):
    '''
    Build a half-negative alignment over `length` positions.

    Returns a pair (indices, labels): indices[i] names the target row to
    pair with source row i. ceil(length / 2) randomly chosen positions are
    remapped to a shuffled permutation of themselves and labelled 0
    (negative pairs); every other position keeps the identity mapping and
    is labelled 1. A remapped position can still land on itself by chance;
    the caller re-rolls until the desired number of pairs differ.
    '''
    half = math.ceil(length / 2)
    mapping = list(range(length))

    # Pick which positions become negatives, then scramble that same set
    # of positions to serve as their replacement targets.
    negatives = random.sample(mapping, half)
    permuted = negatives[:]
    random.shuffle(permuted)

    for position, new_index in zip(negatives, permuted):
        mapping[position] = new_index

    negative_set = set(negatives)
    labels = [0 if i in negative_set else 1 for i in range(length)]

    return mapping, labels
77
# Re-roll the negative sampling until exactly `halfsize` target rows still
# match their original position. Untouched positions always match, so this
# rejects draws where remapped positions self-mapped by accident.
# NOTE(review): for odd `size`, size - ceil(size/2) untouched positions
# exist, so the assert then demands exactly ONE self-mapped negative --
# looks unintended; confirm whether odd-sized corpora are expected here.
while True:
    indices, labels = create_pos_neg_samples(size)
    shuffled_target = [clean_target_corpus[index] for index in indices]
    check = [clean for clean, shuf in zip(clean_target_corpus, shuffled_target) if clean == shuf]
    halfsize = math.ceil(size/2)
    try:
        assert len(check) == halfsize
    except AssertionError:
        print(f'rolling again: {len(check)} vs {halfsize}')
    else:
        break

# Sanity: sources, shuffled targets and labels are all aligned 1:1.
assert len(clean_source_corpus) == len(shuffled_target) == size
assert len(labels) == len(clean_source_corpus) == len(shuffled_target)

import csv

# NOTE(review): output path and the '1000' in the filename are hard-coded
# to one user's home directory -- presumably should be a CLI argument.
with open(f'/home/syigit/tsv_data/{source_lang}_{target_lang}_1000_data.tsv', 'w', encoding='utf8', newline='') as tsv_file:
    tsv_writer = csv.writer(tsv_file, delimiter='\t', lineterminator='\n')
    tsv_writer.writerow([f'{source_lang} definition', f'{target_lang} definition', 'is same'])
    # Training rows first (mixed positives/negatives), then the held-out
    # prediction rows (all labelled 1).
    for row in zip(clean_source_corpus, shuffled_target, labels):
        tsv_writer.writerow(row)
    for row in zip(source_predict, target_predict, labels_predict):
        tsv_writer.writerow(row)