From 2ba704a6fced5722a1f1e3da464185e3724a57d2 Mon Sep 17 00:00:00 2001 From: Yigit Sever Date: Tue, 17 Sep 2019 21:36:20 +0300 Subject: Hide away the scripts --- train_dic_creator.pl | 118 --------------------------------------------------- 1 file changed, 118 deletions(-) delete mode 100755 train_dic_creator.pl (limited to 'train_dic_creator.pl') diff --git a/train_dic_creator.pl b/train_dic_creator.pl deleted file mode 100755 index a228044..0000000 --- a/train_dic_creator.pl +++ /dev/null @@ -1,118 +0,0 @@ -#!/usr/bin/env perl -# -# -#Copyright © 2019 Yiğit Sever -# -# Permission is hereby granted, free of charge, to any person obtaining -# a copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. -# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, -# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, -# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE -# OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -# Get source language code and target language code -# optionally give cutoff, cutoff/2 pairs will be prepared for train/test -# optionally give a different dictionary directory name -# -# USAGE: -# $ perl train_dic_creator.pl (cutoff) (dictionary_dir) - -use strict; -use warnings; -use List::Util qw(shuffle); - -my ($source_lang, $target_lang, $cutoff, $dict_dir) = @ARGV; - -if (not defined $source_lang or not defined $target_lang) { - die "usage: ./train_dic_creator.pl (cutoff)"; -} - -if (not defined $cutoff && $cutoff ne '') { - $cutoff = 20000; -} - -if (not defined $dict_dir && $dict_dir ne '') { - $dict_dir = './dictionaries/'; -} - -my $flipped = 0; -my $file_name; - -if (-e "$dict_dir/$target_lang-$source_lang.dic") { - warn "Dictionary is formatted as $target_lang $source_lang, still creating $source_lang $target_lang"; - $file_name = "$target_lang-$source_lang.dic"; - $flipped = 1; -} elsif (-e "$dict_dir/$source_lang-$target_lang.dic") { - $file_name = "$source_lang-$target_lang.dic"; -} - -my $file_path = $dict_dir . $file_name; - -local @ARGV = $file_path; -local $^I = '.bak'; - -while (<>) { # remove empty lines - print if ! /^$/; -} - -my @lines = `sort -rn $file_path`; # better translations swim to top - -my @result; -my $c = 0; - -foreach my $line (@lines) { - chomp($line); - if ($line !~ m/^\d+\s+[0-9.]+\s+(\S+)\s+(\S+)\s+[0-9.]+\s+[0-9.]+$/) { - # line has multiple tokens - next; - } else { - my ($source, $target) = $line =~ m/^\d+\s+[0-9.]+\s+(\S+)\s+(\S+)\s+[0-9.]+\s+[0-9.]+$/; - - if ($flipped) { # The file name and given parameters mismatch, correcting - push @result, "$target $source"; - } else { - push @result, "$source $target"; - } - $c++; - - if ($c >= $cutoff) { - last; - } - } -} - -my $test = scalar @result; - -if ($cutoff > scalar @result) { - $cutoff = scalar @result; -} - -@result = shuffle @result; - -my $size = $cutoff / 2; - -my @head = @result[0..$size - 1]; -my @tail = @result[-$size..-1]; - -my $train_file_name = $source_lang . '_' . $target_lang . '.train'; -my $test_file_name = $source_lang . '_' . $target_lang . '.test'; - -open my $train_fh, '>', $dict_dir . $train_file_name; -open my $test_fh, '>', $dict_dir . $test_file_name; - -print $train_fh join("\n", @head); -print $test_fh join("\n", @tail); - -unlink "$file_path$^I"; -- cgit v1.2.3-70-g09d2