From ff425f488d12d4ecf41a81c65784d32224404aec Mon Sep 17 00:00:00 2001 From: Yigit Sever Date: Mon, 16 Sep 2019 22:06:29 +0300 Subject: Add dictionary creating script --- train_dic_creator.pl | 118 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 118 insertions(+) create mode 100755 train_dic_creator.pl (limited to 'train_dic_creator.pl') diff --git a/train_dic_creator.pl b/train_dic_creator.pl new file mode 100755 index 0000000..a8de6ea --- /dev/null +++ b/train_dic_creator.pl @@ -0,0 +1,118 @@ +#!/usr/bin/env perl +# +# +#Copyright © 2019 Yiğit Sever + +# Permission is hereby granted, free of charge, to any person obtaining +# a copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE +# OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +# Get source language code and target language code +# optionally give cutoff, cutoff/2 pairs will be prepared for train/test +# optionally give a different dictionary directory name +# +# USAGE: +# $ perl train_dic_creator.pl (cutoff) (dictionary_dir) + +use strict; +use warnings; +use List::Util qw(shuffle); + +my ($source_lang, $target_lang, $cutoff, $dict_dir) = @ARGV; + +if (not defined $source_lang or not defined $target_lang) { + die "usage: ./train_dic_creator.pl (cutoff)"; +} + +if (not defined $cutoff && $cutoff ne '') { + $cutoff = 20000; +} + +if (not defined $dict_dir && $dict_dir ne '') { + $dict_dir = './dictionaries/'; +} + +my $flipped = 0; +my $file_name; + +if (-e "$dict_dir/$target_lang-$source_lang.dic") { + warn "Dictionary is formatted as $target_lang $source_lang, still creating $source_lang $target_lang"; + $file_name = "$target_lang-$source_lang.dic"; + $flipped = 1; +} elsif (-e "$dict_dir/$source_lang-$target_lang.dic") { + $file_name = "$source_lang-$target_lang.dic"; +} + +my $file_path = $dict_dir . $file_name; + +local @ARGV = $file_path; +local $^I = '.bak'; + +while (<>) { # remove empty lines + print if ! /^$/; +} + +my @lines = `sort -rn $file_path`; # better translations swim to top + +my @result; +my $c = 0; + +foreach my $line (@lines) { + chomp($line); + if ($line !~ m/^\d+\s+[0-9.]+\s+(\S+)\s+(\S+)\s+[0-9.]+\s+[0-9.]+$/) { + # line has multiple tokens + next; + } else { + my ($source, $target) = $line =~ m/^\d+\s+[0-9.]+\s+(\S+)\s+(\S+)\s+[0-9.]+\s+[0-9.]+$/; + + if ($flipped) { # The file name and given parameters mismatch, correcting + push @result, "$target $source"; + } else { + push @result, "$source $target"; + } + $c++; + + if ($c >= $cutoff) { + last; + } + } +} + +my $test = scalar @result; + +if ($cutoff > scalar @result) { + $cutoff = scalar @result; +} + +@result = shuffle @result; + +my $size = $cutoff / 2; + +my @head = @result[0..$size - 1]; +my @tail = @result[-$size..-1]; + +my $train_file_name = $source_lang . '_' . $target_lang . '.train'; +my $test_file_name = $source_lang . '_' . $target_lang . '.test'; + +open my $train_fh, '>', $train_file_name; +open my $test_fh, '>', $test_file_name; + +print $train_fh join("\n", @head); +print $test_fh join("\n", @tail); + +unlink "$file_path$^I"; -- cgit v1.2.3-70-g09d2