aboutsummaryrefslogtreecommitdiffstats
path: root/scripts/train_dic_creator.pl
diff options
context:
space:
mode:
Diffstat (limited to 'scripts/train_dic_creator.pl')
-rwxr-xr-xscripts/train_dic_creator.pl118
1 files changed, 118 insertions, 0 deletions
diff --git a/scripts/train_dic_creator.pl b/scripts/train_dic_creator.pl
new file mode 100755
index 0000000..448fecf
--- /dev/null
+++ b/scripts/train_dic_creator.pl
@@ -0,0 +1,118 @@
1#!/usr/bin/env perl
2#
3#
4#Copyright © 2019 Yiğit Sever <yigit.sever@tedu.edu.tr>
5#
6# Permission is hereby granted, free of charge, to any person obtaining
7# a copy of this software and associated documentation files (the "Software"),
8# to deal in the Software without restriction, including without limitation
9# the rights to use, copy, modify, merge, publish, distribute, sublicense,
10# and/or sell copies of the Software, and to permit persons to whom the
11# Software is furnished to do so, subject to the following conditions:
12#
13# The above copyright notice and this permission notice shall be included
14# in all copies or substantial portions of the Software.
15#
16# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
18# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
19# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
20# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
21# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
22# OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
24
25# Get source language code and target language code
26# optionally give cutoff, cutoff/2 pairs will be prepared for train/test
27# optionally give a different dictionary directory name
28#
29# USAGE:
30# $ perl train_dic_creator.pl <source_lang> <target_lang> (dictionary_dir) (cutoff)
31
32use strict;
33use warnings;
34use List::Util qw(shuffle);
35
# Positional command-line arguments. The two language codes are mandatory.
# NOTE: the optional dictionary directory comes BEFORE the optional cutoff.
my ($source_lang, $target_lang, $dict_dir, $cutoff) = @ARGV;

if (not defined $source_lang or not defined $target_lang) {
    die "usage: ./train_dic_creator.pl <source_lang> <target_lang> (dictionary_dir) (cutoff)\n";
}

# Fall back to the defaults when an optional argument is missing or empty.
if (not defined $cutoff or $cutoff eq '') {
    $cutoff = 20000;
}

if (not defined $dict_dir or $dict_dir eq '') {
    $dict_dir = './dictionaries/';
}

# The cutoff is used as a pair count and in numeric comparisons, so reject
# anything that is not a plain non-negative integer up front.
if ($cutoff !~ /\A\d+\z/) {
    die "cutoff must be a non-negative integer, got '$cutoff'\n";
}

# Paths are also built from $dict_dir by plain string concatenation, so make
# sure the directory always ends with a path separator.
$dict_dir .= '/' if $dict_dir !~ m{/\z};
49
# Locate the dictionary file for this language pair inside $dict_dir.
# The file may be named in either direction; $flipped records that the file
# on disk is "<target>-<source>.dic", i.e. the reverse of what was requested.
my $flipped = 0;
my $file_name;

if (-e "$dict_dir/$target_lang-$source_lang.dic") {
    warn "Dictionary is formatted as $target_lang $source_lang, still creating $source_lang $target_lang";
    $file_name = "$target_lang-$source_lang.dic";
    $flipped = 1;
}
elsif (-e "$dict_dir/$source_lang-$target_lang.dic") {
    $file_name = "$source_lang-$target_lang.dic";
}
else {
    # Without this, $file_name stays undefined and the rest of the script
    # would operate on a bogus path.
    die "No dictionary found for $source_lang/$target_lang in '$dict_dir' "
      . "(looked for $source_lang-$target_lang.dic and $target_lang-$source_lang.dic)\n";
}

# Join with an explicit separator so a directory given without a trailing
# slash still yields a valid path (a doubled slash is harmless).
my $file_path = "$dict_dir/$file_name";
62
# Rewrite the dictionary in place without its empty lines. With $^I set,
# <> reads the file named in @ARGV and print writes back into that same
# file, while the untouched original is kept as "$file_path.bak".
# Both locals deliberately stay at file scope: the backup suffix in $^I is
# read again at the end of the script to delete the backup file.
local @ARGV = $file_path;
local $^I = '.bak';

while (<>) {    # remove empty lines
    print if ! /^$/;
}

# Sort numerically in descending order with the external sort(1) so that
# "better translations swim to top". The list form of the pipe open bypasses
# the shell, so spaces or metacharacters in the path cannot break or hijack
# the command; "--" keeps a path starting with "-" from being read as an option.
open my $sort_fh, '-|', 'sort', '-rn', '--', $file_path
    or die "Cannot run sort on '$file_path': $!\n";
my @lines = <$sort_fh>;
close $sort_fh
    or die "sort failed on '$file_path' (exit status " . ($? >> 8) . ")\n";
71
# Collected "<source> <target>" pairs, in the order the sorted lines yield them.
my @result;

# A usable line has exactly six whitespace-separated columns: an integer, a
# number, two single-token words (captured), and two more numbers. Lines in
# which a word spans several tokens do not match and are skipped.
my $entry_re = qr/^\d+\s+[0-9.]+\s+(\S+)\s+(\S+)\s+[0-9.]+\s+[0-9.]+$/;

ENTRY:
for my $entry (@lines) {
    chomp $entry;

    # Match once; an unsuccessful match yields an empty list and skips the line.
    my ($source, $target) = $entry =~ $entry_re
        or next ENTRY;

    # When the file name and the given parameters mismatch, swap the words so
    # the requested source language always comes first.
    push @result, $flipped ? "$target $source" : "$source $target";

    # Stop as soon as enough pairs have been gathered.
    last ENTRY if @result >= $cutoff;
}
95
# Never ask for more pairs than were actually collected.
if ($cutoff > scalar @result) {
    $cutoff = scalar @result;
}

@result = shuffle @result;

# Half of the pairs go to each output file; int() keeps the slice bounds
# integral when the cutoff is odd (the middle pair is then left out).
my $size = int($cutoff / 2);

my @head = @result[0 .. $size - 1];
my @tail = @result[-$size .. -1];

my $train_file_name = $source_lang . '_' . $target_lang . '.train';
my $test_file_name  = $source_lang . '_' . $target_lang . '.test';

# Explicit separator so a directory without a trailing slash still works.
_write_lines("$dict_dir/$train_file_name", @head);
_write_lines("$dict_dir/$test_file_name",  @tail);

# Remove the backup left behind by the in-place edit. Guard against an empty
# suffix: without it the path would be the dictionary file itself.
if (defined $^I and length $^I) {
    my $backup_path = $file_path . $^I;
    if (-e $backup_path) {
        unlink $backup_path
            or warn "Could not remove backup '$backup_path': $!\n";
    }
}

# Write the given lines to $path, joined by newlines (no trailing newline),
# dying with the OS error if the file cannot be opened, written or closed.
sub _write_lines {
    my ($path, @pairs) = @_;

    open my $out_fh, '>', $path
        or die "Cannot open '$path' for writing: $!\n";
    print {$out_fh} join("\n", @pairs)
        or die "Cannot write to '$path': $!\n";
    # Buffered write errors only surface at close, so check it too.
    close $out_fh
        or die "Cannot close '$path': $!\n";

    return;
}