import argparse
import csv
import math
import random
from re import sub

from mosestokenizer import MosesTokenizer

from DataHelper.Loader import load_def_from_file as load_def

parser = argparse.ArgumentParser(description='Create a .tsv file from two wordnet definition files')
parser.add_argument('source_lang', help='source language short name')
parser.add_argument('target_lang', help='target language short name')
parser.add_argument('source_defs', help='path of the source definitions')
parser.add_argument('target_defs', help='path of the target definitions')
parser.add_argument('-n', '--set_aside', help='number of definition pairs to set aside for validation',
                    type=int, required=True)
args = parser.parse_args()

source_lang = args.source_lang
target_lang = args.target_lang
source_defs_filename = args.source_defs
target_defs_filename = args.target_defs
defs_source = load_def(source_defs_filename)
defs_target = load_def(target_defs_filename)


def clean_corpus_suffix(corpus, language):
    '''
    Tokenizes every definition and appends '__<language>' to each token, so
    source- and target-language words remain distinguishable once the two
    corpora are mixed.
    '''
    clean_corpus = []
    tokenize = MosesTokenizer(language)
    for definition in corpus:
        definition = sub(r"'", '', definition)       # drop apostrophes
        definition = sub(r"[^\w]", ' ', definition)   # replace remaining punctuation with spaces
        clean_doc = []
        words = tokenize(definition)
        for word in words:
            clean_doc.append(word + '__%s' % language)
        clean_corpus.append(' '.join(clean_doc))
    return clean_corpus

clean_source_corpus = clean_corpus_suffix(defs_source, source_lang)
clean_target_corpus = clean_corpus_suffix(defs_target, target_lang)
assert len(clean_source_corpus) == len(clean_target_corpus)
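
# Hold out the last `set_aside` aligned pairs; they are written to the end of
# the .tsv unshuffled so they can serve as a validation/prediction set.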
set_aside = args.set_aside
source_predict = clean_source_corpus[-set_aside:]
target_predict = clean_target_corpus[-set_aside:]
labels_predict = [1] * set_aside  # placeholder labels; every held-out pair is correctly aligned, so 1 is valid
clean_source_corpus = clean_source_corpus[:-set_aside]
clean_target_corpus = clean_target_corpus[:-set_aside]
size = len(clean_source_corpus)
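
# For the training portion, half of the source definitions keep their true
# target definition (label 1) and the other half are paired with a wrong,
# randomly chosen target definition (label 0).
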
def create_pos_neg_samples(length):
    '''
    Picks half of the positions as negatives and permutes their target indices
    among themselves; the remaining positions keep their original index.
    Returns the new target indices and the labels (1 = aligned, 0 = shuffled).
    '''
    indices = list(range(length))
    halfsize = math.ceil(length / 2)
    neg_points = random.sample(indices, halfsize)
    neg_indices = list(neg_points)
    random.shuffle(neg_indices)
    for index, point in zip(neg_indices, neg_points):
        indices[point] = index
    labels = [1] * length
    for i in neg_points:
        labels[i] = 0
    return indices, labels
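
# The permutation can leave a "negative" position pointing at its own target;
# re-sample until exactly halfsize pairs still match, so the intended
# positive/negative split is achieved.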
while True:
    indices, labels = create_pos_neg_samples(size)
    shuffled_target = [clean_target_corpus[index] for index in indices]
    check = [clean for clean, shuf in zip(clean_target_corpus, shuffled_target) if clean == shuf]
    halfsize = math.ceil(size / 2)
    try:
        assert len(check) == halfsize
    except AssertionError:
        print(f'rolling again: {len(check)} vs {halfsize}')
    else:
        break

assert len(clean_source_corpus) == len(shuffled_target) == size
assert len(labels) == len(clean_source_corpus) == len(shuffled_target)
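
# Write the header, the shuffled training rows (mixed positives and negatives),
# then the held-out rows (all positives) into a tab-separated file.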
with open(f'/home/syigit/tsv_data/{source_lang}_{target_lang}_1000_data.tsv', 'w', encoding='utf8', newline='') as tsv_file:
    tsv_writer = csv.writer(tsv_file, delimiter='\t', lineterminator='\n')
    tsv_writer.writerow([f'{source_lang} definition', f'{target_lang} definition', 'is same'])
    for row in zip(clean_source_corpus, shuffled_target, labels):
        tsv_writer.writerow(row)
    for row in zip(source_predict, target_predict, labels_predict):
        tsv_writer.writerow(row)
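
# Example invocation (file and script names are illustrative only; the definition
# files must be in whatever format DataHelper.Loader.load_def_from_file expects):
#   python create_tsv.py en it en_definitions.txt it_definitions.txt -n 1000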