1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
|
#!/usr/bin/env perl
#
#
#Copyright © 2019 Yiğit Sever <yigit.sever@tedu.edu.tr>
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
# OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
# Takes a source language code and a target language code.
# Optionally takes a cutoff: the kept pairs are split into a train set and
# a test set of cutoff/2 pairs each.
# Optionally takes a different dictionary directory name.
#
# USAGE:
# $ perl train_dic_creator.pl <source_lang> <target_lang> (cutoff) (dictionary_dir)
use strict;
use warnings;
use List::Util qw(shuffle);
# Build shuffled train/test word-pair lists from a bilingual dictionary file.
#
# Arguments (read from @ARGV):
#   source_lang, target_lang  language codes (required)
#   cutoff                    optional; maximum number of pairs to keep
#                             (default 20000), split evenly into train/test
#   dictionary_dir            optional; directory that holds the .dic file and
#                             receives the output files (default ../dictionaries)
#
# The documented usage lists the optional arguments as (cutoff) (dictionary_dir),
# but they have also been read as (dictionary_dir) (cutoff). Both orders are
# accepted: the first all-digit argument is the cutoff, the other one is the
# dictionary directory.
#
# Output: <dictionary_dir>/<source>_<target>.train and .test, one
# "source target" pair per line, without a trailing newline.
my ($source_lang, $target_lang, @optional_args) = @ARGV;
my $usage = "usage: ./train_dic_creator.pl <source_lang> <target_lang> (cutoff) (dictionary_dir)\n";
if (not defined $source_lang or not defined $target_lang) {
    die $usage;
}

my ($cutoff, $dict_dir);
for my $arg (@optional_args) {
    next if not defined $arg or $arg eq '';    # empty means "use the default"
    if (not defined $cutoff and $arg =~ /\A\d+\z/) {
        $cutoff = $arg;
    } elsif (not defined $dict_dir) {
        $dict_dir = $arg;
    } else {
        die $usage;
    }
}
$cutoff   = 20000             if not defined $cutoff;
$dict_dir = '../dictionaries' if not defined $dict_dir;

# The dictionary may be stored under either language order; remember whether
# the columns have to be swapped to produce "source target" pairs.
my $flipped = 0;
my $file_name;
if (-e "$dict_dir/${target_lang}-${source_lang}.dic") {
    warn "Dictionary is formatted as $target_lang $source_lang, still creating $source_lang $target_lang";
    $file_name = "${target_lang}-${source_lang}.dic";
    $flipped = 1;
} elsif (-e "$dict_dir/${source_lang}-${target_lang}.dic") {
    $file_name = "${source_lang}-${target_lang}.dic";
} else {
    die "No dictionary found for $source_lang and $target_lang in $dict_dir\n";
}
my $file_path = "$dict_dir/$file_name";

# Read the dictionary once, keeping only well-formed single-token entries.
# Blank lines and lines with multi-token phrases fail the pattern and are
# skipped, so the input file never has to be rewritten.
open my $dict_fh, '<', $file_path or die "Cannot open $file_path: $!\n";
my @entries;
while (my $line = <$dict_fh>) {
    chomp $line;
    if ($line =~ m/^(\d+)\s+[0-9.]+\s+(\S+)\s+(\S+)\s+[0-9.]+\s+[0-9.]+$/) {
        # Leading integer is the sort key (presumably a count or score --
        # confirm against the dictionary format).
        push @entries, [$1, $line, $2, $3];
    }
}
close $dict_fh or die "Cannot close $file_path: $!\n";

# Better translations swim to the top: highest leading number first, ties
# broken by the whole line in descending order (mirrors `sort -rn` without
# passing the path through a shell).
my @ranked = sort { $b->[0] <=> $a->[0] or $b->[1] cmp $a->[1] } @entries;

my @result;
foreach my $entry (@ranked) {
    last if @result >= $cutoff;
    my (undef, undef, $first, $second) = @{$entry};
    # When the file name and the requested direction mismatch, swap columns.
    push @result, $flipped ? "$second $first" : "$first $second";
}

# Shuffle, then take the first half for training and the last half for
# testing. With an odd number of pairs the middle one is left out, so the two
# sets never overlap.
@result = shuffle @result;
my $size = int(scalar(@result) / 2);
my @head = @result[0 .. $size - 1];
my @tail = @result[scalar(@result) - $size .. $#result];

my $train_file_name = "$dict_dir/${source_lang}_${target_lang}.train";
my $test_file_name  = "$dict_dir/${source_lang}_${target_lang}.test";
open my $train_fh, '>', $train_file_name or die "Cannot write $train_file_name: $!\n";
open my $test_fh,  '>', $test_file_name  or die "Cannot write $test_file_name: $!\n";
print {$train_fh} join("\n", @head);
print {$test_fh} join("\n", @tail);
# Buffered write errors only surface at close, so check it.
close $train_fh or die "Cannot close $train_file_name: $!\n";
close $test_fh  or die "Cannot close $test_file_name: $!\n";
|