diff options
-rwxr-xr-x | train_dic_creator.pl | 118 |
1 files changed, 118 insertions, 0 deletions
diff --git a/train_dic_creator.pl b/train_dic_creator.pl new file mode 100755 index 0000000..a8de6ea --- /dev/null +++ b/train_dic_creator.pl | |||
@@ -0,0 +1,118 @@ | |||
1 | #!/usr/bin/env perl | ||
2 | # | ||
3 | # | ||
4 | #Copyright © 2019 Yiğit Sever <[email protected]> | ||
5 | |||
6 | # Permission is hereby granted, free of charge, to any person obtaining | ||
7 | # a copy of this software and associated documentation files (the "Software"), | ||
8 | # to deal in the Software without restriction, including without limitation | ||
9 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, | ||
10 | # and/or sell copies of the Software, and to permit persons to whom the | ||
11 | # Software is furnished to do so, subject to the following conditions: | ||
12 | # | ||
13 | # The above copyright notice and this permission notice shall be included | ||
14 | # in all copies or substantial portions of the Software. | ||
15 | # | ||
16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
17 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES | ||
18 | # OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. | ||
19 | # IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, | ||
20 | # DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, | ||
21 | # TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE | ||
22 | # OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | ||
23 | |||
24 | |||
25 | # Takes a source language code and a target language code. | ||
26 | # Optionally takes a cutoff: the best <cutoff> pairs are selected, then split into cutoff/2 pairs for train and cutoff/2 pairs for test. | ||
27 | # Optionally takes a different dictionary directory name. | ||
28 | # | ||
29 | # USAGE: | ||
30 | # $ perl train_dic_creator.pl <source_lang> <target_lang> (cutoff) (dictionary_dir) | ||
31 | |||
32 | use strict; | ||
33 | use warnings; | ||
34 | use List::Util qw(shuffle); | ||
35 | |||
# ---------------------------------------------------------------------------
# Build shuffled train/test word-pair files from a bilingual dictionary.
#
# Arguments (positional, taken from @ARGV):
#   source_lang     language code for the first column of the output
#   target_lang     language code for the second column of the output
#   cutoff          optional; maximum number of single-token pairs to keep
#                   (default 20000). Half go to train, half to test.
#   dictionary_dir  optional; directory holding "<a>-<b>.dic"
#                   (default ./dictionaries/)
#
# Side effects:
#   * blank lines are removed from the dictionary file in place
#   * "<source>_<target>.train" and "<source>_<target>.test" are written to
#     the current directory, one "source target" pair per line
# ---------------------------------------------------------------------------

my ($source_lang, $target_lang, $cutoff, $dict_dir) = @ARGV;

if (not defined $source_lang or not defined $target_lang) {
    die "usage: ./train_dic_creator.pl <source_lang> <target_lang> (cutoff) (dictionary_dir)";
}

# Optional arguments fall back to their defaults when missing or empty.
if (not defined $cutoff or $cutoff eq '') {
    $cutoff = 20000;
}
if ($cutoff !~ /\A[0-9]+\z/) {
    die "cutoff must be a non-negative integer, got '$cutoff'";
}

if (not defined $dict_dir or $dict_dir eq '') {
    $dict_dir = './dictionaries/';
}

# Locate the dictionary. The file may be named in either language order; the
# reversed name is checked first, and its columns are swapped on output.
# Paths are always joined with '/', so a directory given without a trailing
# slash works the same as one with it.
my $reversed_path = "$dict_dir/$target_lang-$source_lang.dic";
my $forward_path  = "$dict_dir/$source_lang-$target_lang.dic";

my $flipped = 0;
my $file_path;

if (-e $reversed_path) {
    warn "Dictionary is formatted as $target_lang $source_lang, still creating $source_lang $target_lang";
    $file_path = $reversed_path;
    $flipped   = 1;
}
elsif (-e $forward_path) {
    $file_path = $forward_path;
}
else {
    die "no dictionary found: tried $reversed_path and $forward_path";
}

# Strip empty lines from the dictionary in place. Perl keeps the untouched
# original next to it under $backup_suffix until this script finishes.
my $backup_suffix = '.bak';
my $backup_path   = $file_path . $backup_suffix;
{
    local @ARGV = ($file_path);
    local $^I   = $backup_suffix;

    while (my $raw = <>) {
        print $raw if $raw !~ /^$/;
    }
}

# Reverse numeric sort so the better translations swim to the top. The
# list form of open bypasses the shell, so odd characters in the path are
# passed to sort verbatim.
open my $sort_fh, '-|', 'sort', '-rn', '--', $file_path
    or die "cannot run sort on $file_path: $!";
my @lines = <$sort_fh>;
close $sort_fh
    or die "sort failed on $file_path (exit status $?)";

# A usable line has exactly six whitespace-separated fields; fields three and
# four are the word pair. Anything else (e.g. multi-token entries) is skipped.
my $pair_re = qr/^\d+\s+[0-9.]+\s+(\S+)\s+(\S+)\s+[0-9.]+\s+[0-9.]+$/;

my @result;

foreach my $line (@lines) {
    last if @result >= $cutoff;

    chomp $line;
    next unless $line =~ $pair_re;
    my ($left, $right) = ($1, $2);

    # When the file name order mismatches the requested order, swap columns.
    push @result, $flipped ? "$right $left" : "$left $right";
}

# Shuffle, then take equally sized, non-overlapping halves from both ends.
# With an odd number of pairs the middle one is dropped.
my @shuffled = shuffle @result;
my $size     = int(@shuffled / 2);

my @head = @shuffled[0 .. $size - 1];
my @tail = @shuffled[@shuffled - $size .. $#shuffled];

my $train_file_name = $source_lang . '_' . $target_lang . '.train';
my $test_file_name  = $source_lang . '_' . $target_lang . '.test';

write_pairs($train_file_name, @head);
write_pairs($test_file_name,  @tail);

# The in-place edit succeeded, so the backup copy is no longer needed.
if (-e $backup_path) {
    unlink $backup_path
        or warn "could not remove backup $backup_path: $!";
}

# Write the given pairs to $path, newline-separated (no trailing newline),
# dying on any open, write or close failure.
sub write_pairs {
    my ($path, @pairs) = @_;

    open my $out_fh, '>', $path
        or die "cannot open $path for writing: $!";
    print {$out_fh} join("\n", @pairs)
        or die "cannot write to $path: $!";
    close $out_fh
        or die "cannot close $path: $!";

    return;
}