diff options
Diffstat (limited to 'scripts')
| -rw-r--r-- | scripts/prep_lookup.py | 61 | ||||
| -rwxr-xr-x | scripts/tab_creator.pl | 76 | ||||
| -rwxr-xr-x | scripts/train_dic_creator.pl | 118 | 
3 files changed, 255 insertions, 0 deletions
| diff --git a/scripts/prep_lookup.py b/scripts/prep_lookup.py new file mode 100644 index 0000000..7fdfeec --- /dev/null +++ b/scripts/prep_lookup.py | |||
| @@ -0,0 +1,61 @@ | |||
| 1 | import argparse | ||
| 2 | from pathlib import Path | ||
| 3 | import collections | ||
| 4 | import os | ||
| 5 | |||
def en_and_other(other, dirname):
    """Pair English WordNet definitions with those of another language.

    Reads ``<dirname>/<other>.tab``, where every non-blank line has the form
    ``<pos> <offset> <definition>`` separated by single spaces, and looks up
    the English definition of the same synset through NLTK's WordNet corpus.

    Args:
        other: language code of the non-English side; also the stem of the
            ``.tab`` file that is read.
        dirname: directory containing the ``.tab`` files.

    Returns:
        A ``defaultdict`` keyed by ``(pos, offset)`` whose values are dicts
        mapping ``'en'`` and ``other`` to the respective definition text.

    Raises:
        ValueError: if a non-blank line has fewer than three space-separated
            fields or its offset is not an integer.
    """
    # Imported here rather than at module level, so the import only runs
    # when this function is actually called.
    from nltk.corpus import wordnet as wn

    other_file = os.path.join(dirname, other + "." + 'tab')
    lookup = collections.defaultdict(dict)

    # Definitions contain non-ASCII text; decode explicitly instead of
    # depending on the locale's default encoding.
    with open(other_file, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # A blank line carries no entry and would break the unpack.
                continue
            pos, offset, rest = line.split(' ', 2)
            offset = int(offset)
            # part of speech + offset is unique, so keys are combination of both
            key = (pos, offset)
            lookup[key]['en'] = wn.synset_from_pos_and_offset(pos, offset).definition()
            lookup[key][other] = rest.rstrip()
    return lookup
| 20 | |||
def both_lookup(source, target, dirname):
    """Collect the definitions of two non-English languages per synset.

    Reads ``<dirname>/<source>.tab`` and ``<dirname>/<target>.tab``, where
    every non-blank line has the form ``<pos> <offset> <definition>``
    separated by single spaces.

    Args:
        source: language code of the first ``.tab`` file.
        target: language code of the second ``.tab`` file.
        dirname: directory containing the ``.tab`` files.

    Returns:
        A ``defaultdict`` keyed by ``(pos, offset)`` whose values are dicts
        mapping each language code to its definition. A synset present in
        only one file has only that language's entry.

    Raises:
        ValueError: if a non-blank line has fewer than three space-separated
            fields or its offset is not an integer.
    """
    lookup = collections.defaultdict(dict)

    for lang_code in (source, target):
        tab_file = os.path.join(dirname, lang_code + "." + 'tab')
        # Definitions contain non-ASCII text; decode explicitly instead of
        # depending on the locale's default encoding.
        with open(tab_file, 'r', encoding='utf-8') as f:
            for line in f:
                if not line.strip():
                    # A blank line carries no entry and would break the unpack.
                    continue
                pos, offset, rest = line.split(' ', 2)
                # part of speech + offset is unique, so keys are combination of both
                lookup[(pos, int(offset))][lang_code] = rest.rstrip()
    return lookup
| 34 | |||
def main(args):
    """Write aligned definition files for a language pair.

    Builds the synset lookup for the two languages and writes
    ``<source>_to_<target>.def`` and ``<target>_to_<source>.def`` into the
    current working directory. Only synsets that have a definition in both
    languages are written, one definition per line, so line *i* of one file
    corresponds to line *i* of the other.

    Args:
        args: parsed command-line namespace providing ``tab_directory``,
            ``source_lang`` and ``target_lang``.
    """
    dirname = args.tab_directory
    source_lang = args.source_lang
    target_lang = args.target_lang

    # English definitions are taken from NLTK's WordNet, not from a .tab file.
    if source_lang == 'en':
        lookup = en_and_other(target_lang, dirname)
    elif target_lang == 'en':
        lookup = en_and_other(source_lang, dirname)
    else:
        lookup = both_lookup(source_lang, target_lang, dirname)

    source_out = f'{source_lang}_to_{target_lang}.def'
    target_out = f'{target_lang}_to_{source_lang}.def'

    # Write UTF-8 explicitly: the definitions are non-ASCII and the locale
    # default encoding may not be able to represent them.
    with open(source_out, 'w', encoding='utf-8') as sf, \
            open(target_out, 'w', encoding='utf-8') as tf:
        for overlap in lookup.values():
            if source_lang in overlap and target_lang in overlap:
                print(overlap[source_lang], file=sf)
                print(overlap[target_lang], file=tf)
| 53 | |||
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Create a pair of .def files for 2 given languages')
    parser.add_argument('--tab_directory', help='directory of the .tab files', default='wordnets/tab_files')
    # Both language codes are used to build file names; require them so a
    # missing one is reported as a usage error instead of a TypeError raised
    # while concatenating None into a path.
    parser.add_argument('-s', '--source_lang', required=True, help='source language 2 letter code')
    parser.add_argument('-t', '--target_lang', required=True, help='target language 2 letter code')
    args = parser.parse_args()

    main(args)
| diff --git a/scripts/tab_creator.pl b/scripts/tab_creator.pl new file mode 100755 index 0000000..6efce46 --- /dev/null +++ b/scripts/tab_creator.pl | |||
| @@ -0,0 +1,76 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | # | ||
| 3 | # | ||
| 4 | # Copyright © 2019 Yiğit Sever <yigit.sever@tedu.edu.tr> | ||
| 5 | # | ||
| 6 | # Permission is hereby granted, free of charge, to any person obtaining | ||
| 7 | # a copy of this software and associated documentation files (the "Software"), | ||
| 8 | # to deal in the Software without restriction, including without limitation | ||
| 9 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, | ||
| 10 | # and/or sell copies of the Software, and to permit persons to whom the | ||
| 11 | # Software is furnished to do so, subject to the following conditions: | ||
| 12 | # | ||
| 13 | # The above copyright notice and this permission notice shall be included | ||
| 14 | # in all copies or substantial portions of the Software. | ||
| 15 | # | ||
| 16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
| 17 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES | ||
| 18 | # OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. | ||
| 19 | # IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, | ||
| 20 | # DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, | ||
| 21 | # TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE | ||
| 22 | # OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | ||
| 23 | |||
use strict;
use warnings;
use File::Basename;

# Maps the three-letter code found in a "wn-data-XXX.tab" file name to the
# two-letter code used to name the generated "<code>.tab" file.
my %language_codes = (
    als => "sq",
    bul => "bg",
    ell => "el",
    ita => "it",
    ron => "ro",
    slv => "sl",
);

# Arguments: the input tab file (required) and the output directory (optional).
my ($tab_file, $tab_dir) = @ARGV;

if (not defined $tab_file) {
    die "usage: ./tab_creator.pl <tab_file> [tab_dir]";
}

if (not -e $tab_file) {
    die "'$tab_file' does not exist";
}

# Fall back to the default output directory when none (or an empty one) is given.
if (not defined $tab_dir or $tab_dir eq '') {
    $tab_dir = './wordnets/tab_files';
}

my $filename = basename($tab_file);

# The language is derived from the file name. Validate it before any file is
# created so a bad input cannot produce an output named after an undefined code.
my $lang_code;
if ($filename =~ m/wn-data-(\w{3})\.tab/) {
    $lang_code = $1;
} else {
    die "'$filename' does not look like wn-data-<lang>.tab";
}

my $short_lang_code = $language_codes{$lang_code};
if (not defined $short_lang_code) {
    die "No two letter code known for language '$lang_code'";
}

open (my $fh, '<', $tab_file) or die "Could not open '$tab_file' $!";

my $outfilename = $tab_dir . '/' . $short_lang_code . '.tab';
open (my $out_fh, '>', $outfilename) or die "Could not open '$outfilename', $!";

# Keep only the definition rows, i.e. lines of the form
#   <offset>-<pos> <lang>:def <index> <definition>
# and rewrite each one as "<pos> <offset> <definition>".
while (my $row = <$fh>) {
    chomp $row;
    # \d+ so that a definition index of 10 or more is not silently dropped.
    if ($row =~ m/^(\d+)-(\w)\s+${lang_code}:def\s*\d+\s+(.*)$/) {
        my ($offset, $pos, $def) = ($1, $2, $3);
        print $out_fh "$pos $offset $def\n";
    }
}

close $fh;
# Buffered write errors (e.g. a full disk) only surface when the handle is closed.
close $out_fh or die "Could not write '$outfilename', $!";
| diff --git a/scripts/train_dic_creator.pl b/scripts/train_dic_creator.pl new file mode 100755 index 0000000..448fecf --- /dev/null +++ b/scripts/train_dic_creator.pl | |||
| @@ -0,0 +1,118 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | # | ||
| 3 | # | ||
| 4 | #Copyright © 2019 Yiğit Sever <yigit.sever@tedu.edu.tr> | ||
| 5 | # | ||
| 6 | # Permission is hereby granted, free of charge, to any person obtaining | ||
| 7 | # a copy of this software and associated documentation files (the "Software"), | ||
| 8 | # to deal in the Software without restriction, including without limitation | ||
| 9 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, | ||
| 10 | # and/or sell copies of the Software, and to permit persons to whom the | ||
| 11 | # Software is furnished to do so, subject to the following conditions: | ||
| 12 | # | ||
| 13 | # The above copyright notice and this permission notice shall be included | ||
| 14 | # in all copies or substantial portions of the Software. | ||
| 15 | # | ||
| 16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
| 17 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES | ||
| 18 | # OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. | ||
| 19 | # IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, | ||
| 20 | # DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, | ||
| 21 | # TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE | ||
| 22 | # OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | ||
| 23 | |||
| 24 | |||
# Get source language code and target language code
# optionally give a different dictionary directory name
# optionally give cutoff, cutoff/2 pairs will be prepared for train/test
#
# The dictionary file is looked up as <target>-<source>.dic first and then as
# <source>-<target>.dic inside the dictionary directory. Empty lines are
# removed from that file in place before it is read.
#
# USAGE:
# $ perl train_dic_creator.pl <source_lang> <target_lang> (dictionary_dir) (cutoff)

use strict;
use warnings;
use List::Util qw(shuffle);

# NOTE: the dictionary directory is the third argument and the cutoff the fourth.
my ($source_lang, $target_lang, $dict_dir, $cutoff) = @ARGV;

if (not defined $source_lang or not defined $target_lang) {
    die "usage: ./train_dic_creator.pl <source_lang> <target_lang> (dictionary_dir) (cutoff)";
}

if (not defined $cutoff or $cutoff eq '') {
    $cutoff = 20000;
}

if ($cutoff !~ m/^\d+$/ or $cutoff < 1) {
    die "cutoff must be a positive integer, got '$cutoff'";
}

if (not defined $dict_dir or $dict_dir eq '') {
    $dict_dir = './dictionaries';
}

# Drop trailing slashes so every path below can be built as "$dict_dir/<name>",
# whether or not the caller supplied one.
$dict_dir =~ s{/+\z}{};

my $flipped = 0;
my $file_name;

if (-e "$dict_dir/$target_lang-$source_lang.dic") {
    warn "Dictionary is formatted as $target_lang $source_lang, still creating $source_lang $target_lang";
    $file_name = "$target_lang-$source_lang.dic";
    $flipped = 1;
} elsif (-e "$dict_dir/$source_lang-$target_lang.dic") {
    $file_name = "$source_lang-$target_lang.dic";
} else {
    die "No dictionary for $source_lang and $target_lang found in '$dict_dir'";
}

my $file_path = "$dict_dir/$file_name";
my $backup_suffix = '.bak';

# Rewrite the dictionary in place without its empty lines. Perl keeps the
# original as "$file_path$backup_suffix" until it is unlinked at the end.
{
    local @ARGV = ($file_path);
    local $^I = $backup_suffix;

    while (<>) { # remove empty lines
        print if ! /^$/;
    }
}

# better translations swim to top
# The list form of open runs sort(1) without a shell, so a path containing
# spaces or shell metacharacters is passed through verbatim.
open(my $sort_fh, '-|', 'sort', '-rn', '--', $file_path)
    or die "Could not run sort on '$file_path', $!";
my @lines = <$sort_fh>;
close $sort_fh or die "sort failed on '$file_path'";

my @result;

foreach my $line (@lines) {
    chomp($line);

    # Skip lines that do not have exactly one source and one target token.
    my ($source, $target) =
        $line =~ m/^\d+\s+[0-9.]+\s+(\S+)\s+(\S+)\s+[0-9.]+\s+[0-9.]+$/
        or next;

    if ($flipped) { # The file name and given parameters mismatch, correcting
        push @result, "$target $source";
    } else {
        push @result, "$source $target";
    }

    last if @result >= $cutoff;
}

@result = shuffle @result;

# Half of the kept pairs go to train and half to test. With an odd count the
# middle pair of the shuffled list is left out of both.
my $size = int(@result / 2);

my @head = @result[0 .. $size - 1];
my @tail = @result[@result - $size .. $#result];

my $train_path = "$dict_dir/$source_lang" . '_' . $target_lang . '.train';
my $test_path  = "$dict_dir/$source_lang" . '_' . $target_lang . '.test';

open(my $train_fh, '>', $train_path) or die "Could not open '$train_path', $!";
open(my $test_fh, '>', $test_path) or die "Could not open '$test_path', $!";

print {$train_fh} join("\n", @head);
print {$test_fh} join("\n", @tail);

# Buffered write errors only surface when the handle is closed.
close $train_fh or die "Could not write '$train_path', $!";
close $test_fh or die "Could not write '$test_path', $!";

unlink "$file_path$backup_suffix";
