diff options
Diffstat (limited to 'scripts')
-rw-r--r-- | scripts/prep_lookup.py | 61 | ||||
-rwxr-xr-x | scripts/tab_creator.pl | 76 | ||||
-rwxr-xr-x | scripts/train_dic_creator.pl | 118 |
3 files changed, 255 insertions, 0 deletions
diff --git a/scripts/prep_lookup.py b/scripts/prep_lookup.py new file mode 100644 index 0000000..7fdfeec --- /dev/null +++ b/scripts/prep_lookup.py | |||
@@ -0,0 +1,61 @@ | |||
1 | import argparse | ||
2 | from pathlib import Path | ||
3 | import collections | ||
4 | import os | ||
5 | |||
def en_and_other(other, dirname):
    """Pair English WordNet definitions with the definitions of *other*.

    Reads ``<dirname>/<other>.tab``, where every line has the form
    ``<pos> <offset> <definition>``, and looks up the English definition
    of the same synset through NLTK's WordNet corpus reader.

    Args:
        other: language code; also the stem of the ``.tab`` file to read.
        dirname: directory that contains the ``.tab`` files.

    Returns:
        A ``defaultdict`` keyed by ``(pos, offset)`` whose values are dicts
        holding the English definition under ``'en'`` and the other
        language's definition under ``other``.

    Raises:
        ValueError: if a non-blank line has fewer than three fields or a
            non-integer offset.
    """
    # Imported lazily so that nltk is only needed when English is involved.
    from nltk.corpus import wordnet as wn

    other_file = os.path.join(dirname, other + "." + 'tab')
    lookup = collections.defaultdict(dict)

    # Definitions are non-ASCII for most languages; read them as UTF-8
    # explicitly instead of relying on the platform's default encoding.
    # NOTE(review): assumes the .tab files are UTF-8 -- confirm.
    with open(other_file, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # A blank line carries no synset and would break unpacking.
                continue
            pos, offset, rest = line.split(' ', 2)
            offset = int(offset)
            # part of speech + offset is unique, so keys are combination of both
            key = (pos, offset)
            lookup[key]['en'] = wn.synset_from_pos_and_offset(pos, offset).definition()
            lookup[key][other] = rest.rstrip()
    return lookup
20 | |||
def both_lookup(source, target, dirname):
    """Collect the definitions of two non-English languages per synset.

    Reads ``<dirname>/<source>.tab`` and ``<dirname>/<target>.tab``, where
    every line has the form ``<pos> <offset> <definition>``.

    Args:
        source: language code; also the stem of the first ``.tab`` file.
        target: language code; also the stem of the second ``.tab`` file.
        dirname: directory that contains the ``.tab`` files.

    Returns:
        A ``defaultdict`` keyed by ``(pos, offset)`` whose values are dicts
        mapping a language code to that language's definition. A synset
        present in only one file has only that language's entry.

    Raises:
        ValueError: if a non-blank line has fewer than three fields or a
            non-integer offset.
    """
    lookup = collections.defaultdict(dict)

    for lang_code in (source, target):
        tab_file = os.path.join(dirname, lang_code + "." + 'tab')
        # Definitions are non-ASCII for most languages; read them as UTF-8
        # explicitly instead of relying on the platform's default encoding.
        # NOTE(review): assumes the .tab files are UTF-8 -- confirm.
        with open(tab_file, 'r', encoding='utf-8') as f:
            for line in f:
                if not line.strip():
                    # A blank line carries no synset and would break unpacking.
                    continue
                pos, offset, rest = line.split(' ', 2)
                # part of speech + offset is unique, so keys are combination of both
                lookup[(pos, int(offset))][lang_code] = rest.rstrip()
    return lookup
34 | |||
def main(args):
    """Write a pair of line-aligned ``.def`` files for two languages.

    Builds the synset lookup for the requested language pair, then writes
    ``<source>_to_<target>.def`` and ``<target>_to_<source>.def`` into the
    current working directory. Line *n* of both files holds the definition
    of the same synset; synsets lacking a definition in either language are
    skipped.

    Args:
        args: parsed command-line namespace providing ``tab_directory``,
            ``source_lang`` and ``target_lang``.
    """
    dirname = args.tab_directory
    source_lang = args.source_lang
    target_lang = args.target_lang

    # English definitions come from NLTK's WordNet rather than a .tab file.
    if source_lang == 'en':
        lookup = en_and_other(target_lang, dirname)
    elif target_lang == 'en':
        lookup = en_and_other(source_lang, dirname)
    else:
        lookup = both_lookup(source_lang, target_lang, dirname)

    source_path = f'{source_lang}_to_{target_lang}.def'
    target_path = f'{target_lang}_to_{source_lang}.def'

    # Write UTF-8 explicitly: the definitions are non-ASCII and the
    # platform default encoding may not be able to represent them.
    with open(source_path, 'w', encoding='utf-8') as sf, \
            open(target_path, 'w', encoding='utf-8') as tf:
        for overlap in lookup.values():
            if source_lang in overlap and target_lang in overlap:
                print(overlap[source_lang], file=sf)
                print(overlap[target_lang], file=tf)
53 | |||
if __name__ == "__main__":
    # Command-line entry point: parse the options and build the .def pair.
    parser = argparse.ArgumentParser(description='Create a pair of .def files for 2 given languages')
    parser.add_argument('--tab_directory', help='directory of the .tab files', default='wordnets/tab_files')
    # Both language codes are mandatory; without them the file names cannot
    # be built, so let argparse report the omission instead of failing later
    # with a TypeError.
    parser.add_argument('-s', '--source_lang', help='source language 2 letter code', required=True)
    parser.add_argument('-t', '--target_lang', help='target language 2 letter code', required=True)
    args = parser.parse_args()

    main(args)
diff --git a/scripts/tab_creator.pl b/scripts/tab_creator.pl new file mode 100755 index 0000000..6efce46 --- /dev/null +++ b/scripts/tab_creator.pl | |||
@@ -0,0 +1,76 @@ | |||
1 | #!/usr/bin/env perl | ||
2 | # | ||
3 | # | ||
4 | # Copyright © 2019 Yiğit Sever <[email protected]> | ||
5 | # | ||
6 | # Permission is hereby granted, free of charge, to any person obtaining | ||
7 | # a copy of this software and associated documentation files (the "Software"), | ||
8 | # to deal in the Software without restriction, including without limitation | ||
9 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, | ||
10 | # and/or sell copies of the Software, and to permit persons to whom the | ||
11 | # Software is furnished to do so, subject to the following conditions: | ||
12 | # | ||
13 | # The above copyright notice and this permission notice shall be included | ||
14 | # in all copies or substantial portions of the Software. | ||
15 | # | ||
16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
17 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES | ||
18 | # OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. | ||
19 | # IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, | ||
20 | # DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, | ||
21 | # TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE | ||
22 | # OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | ||
23 | |||
24 | use strict; | ||
25 | use warnings; | ||
26 | use File::Basename; | ||
27 | |||
# Maps the three-letter language code found in the input file name
# (wn-data-<code>.tab) to the two-letter code used for the output file name.
my %language_codes = (
    als => "sq",
    bul => "bg",
    ell => "el",
    ita => "it",
    ron => "ro",
    slv => "sl",
);

# Arguments: the wordnet tab file to read and, optionally, the directory
# that receives the generated <two-letter-code>.tab file.
my ($tab_file, $tab_dir) = @ARGV;

if (not defined $tab_file) {
    die "usage: ./tab_creator.pl <tab_file> (tab_dir)";
}

if (not -e $tab_file) {
    die "'$tab_file' does not exist";
}

# Fall back to the default output directory when none (or an empty one)
# was given.
if (not defined $tab_dir or $tab_dir eq '') {
    $tab_dir = './wordnets/tab_files';
}

# The language is taken from the file name, so refuse to continue when it
# cannot be determined instead of writing to a nameless '.tab' file.
my $filename = basename($tab_file);

my $lang_code;
if ($filename =~ m/wn-data-(\w{3})\.tab/) {
    $lang_code = $1;
} else {
    die "'$filename' does not look like 'wn-data-<lang>.tab'";
}

my $short_lang_code = $language_codes{$lang_code};
if (not defined $short_lang_code) {
    die "No two letter code known for language '$lang_code'";
}

open (my $fh, '<', $tab_file) or die "Could not open '$tab_file' $!";

my $outfilename = $tab_dir . '/' . $short_lang_code . '.tab';
open (my $out_fh, '>', $outfilename) or die "Could not open '$outfilename', $!";

# Keep only the definition rows, i.e. rows shaped like
#   <offset>-<pos> <lang>:def <digit> <definition>
# and rewrite them as "<pos> <offset> <definition>".
# NOTE(review): the pattern accepts a single digit after ':def', so rows
# whose index has two or more digits are skipped -- confirm this is intended.
while (my $row = <$fh>) {
    chomp $row;
    if ($row =~ m/^(\d+)-(\w)\s+$lang_code:def\s*\d\s+(.*)$/) {
        my ($offset, $pos, $def) = ($1, $2, $3);
        print $out_fh "$pos $offset $def\n";
    }
}

close ($fh);
# A failed close on the output means buffered data was not written.
close ($out_fh) or die "Could not close '$outfilename', $!";
diff --git a/scripts/train_dic_creator.pl b/scripts/train_dic_creator.pl new file mode 100755 index 0000000..448fecf --- /dev/null +++ b/scripts/train_dic_creator.pl | |||
@@ -0,0 +1,118 @@ | |||
1 | #!/usr/bin/env perl | ||
2 | # | ||
3 | # | ||
4 | #Copyright © 2019 Yiğit Sever <[email protected]> | ||
5 | # | ||
6 | # Permission is hereby granted, free of charge, to any person obtaining | ||
7 | # a copy of this software and associated documentation files (the "Software"), | ||
8 | # to deal in the Software without restriction, including without limitation | ||
9 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, | ||
10 | # and/or sell copies of the Software, and to permit persons to whom the | ||
11 | # Software is furnished to do so, subject to the following conditions: | ||
12 | # | ||
13 | # The above copyright notice and this permission notice shall be included | ||
14 | # in all copies or substantial portions of the Software. | ||
15 | # | ||
16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
17 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES | ||
18 | # OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. | ||
19 | # IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, | ||
20 | # DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, | ||
21 | # TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE | ||
22 | # OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | ||
23 | |||
24 | |||
25 | # Get source language code and target language code | ||
26 | # optionally give cutoff, cutoff/2 pairs will be prepared for train/test | ||
27 | # optionally give a different dictionary directory name | ||
28 | # | ||
29 | # USAGE: | ||
30 | # $ perl train_dic_creator.pl <source_lang> <target_lang> (cutoff) (dictionary_dir) | ||
31 | |||
32 | use strict; | ||
33 | use warnings; | ||
34 | use List::Util qw(shuffle); | ||
35 | |||
my ($source_lang, $target_lang, $dict_dir, $cutoff) = @ARGV;

if (not defined $source_lang or not defined $target_lang) {
    die "usage: ./train_dic_creator.pl <source_lang> <target_lang> (cutoff) (dictionary_dir)";
}

# The documented argument order is (cutoff) (dictionary_dir), while this
# script has always read them as (dictionary_dir) (cutoff). Accept both:
# a purely numeric third argument that is not an existing directory is
# taken to be the cutoff.
if (defined $dict_dir and $dict_dir =~ m/^\d+$/ and not -d $dict_dir) {
    ($cutoff, $dict_dir) = ($dict_dir, $cutoff);
}

if (not defined $cutoff or $cutoff eq '') {
    $cutoff = 20000;
}

if ($cutoff !~ m/^\d+$/) {
    die "cutoff must be a non-negative integer, got '$cutoff'";
}

if (not defined $dict_dir or $dict_dir eq '') {
    $dict_dir = './dictionaries';
}

# Drop trailing slashes (but keep a bare "/") so every path below can be
# built as "$dict_dir/<name>" whether or not the caller supplied one.
$dict_dir =~ s{(?<=[^/])/+$}{};

# The dictionary may be stored under either language order; remember when
# it is the reverse of what was asked for so the pairs can be swapped.
my $flipped = 0;
my $file_name;

if (-e "$dict_dir/$target_lang-$source_lang.dic") {
    warn "Dictionary is formatted as $target_lang $source_lang, still creating $source_lang $target_lang";
    $file_name = "$target_lang-$source_lang.dic";
    $flipped = 1;
} elsif (-e "$dict_dir/$source_lang-$target_lang.dic") {
    $file_name = "$source_lang-$target_lang.dic";
} else {
    die "No dictionary for $source_lang and $target_lang found in '$dict_dir'";
}

my $file_path = "$dict_dir/$file_name";
my $backup_suffix = '.bak';

{
    # Remove empty lines from the dictionary file in place; Perl keeps the
    # previous content in "$file_path$backup_suffix" while doing so.
    local @ARGV = ($file_path);
    local $^I = $backup_suffix;

    while (<>) {
        print if ! /^$/;
    }
}

# better translations swim to top
# List-form pipe open: the path is handed to sort as a single argument and
# never passes through a shell.
open (my $sort_fh, '-|', 'sort', '-rn', '--', $file_path)
    or die "Could not run sort on '$file_path': $!";
my @lines = <$sort_fh>;
close ($sort_fh)
    or die "sort failed on '$file_path': " . ($! || 'exit status ' . ($? >> 8));

my @result;

foreach my $line (@lines) {
    last if scalar @result >= $cutoff;

    chomp($line);
    # Only lines with exactly one source token and one target token between
    # the numeric columns are used; anything else is skipped.
    next unless $line =~ m/^\d+\s+[0-9.]+\s+(\S+)\s+(\S+)\s+[0-9.]+\s+[0-9.]+$/;
    my ($source, $target) = ($1, $2);

    if ($flipped) { # The file name and given parameters mismatch, correcting
        push @result, "$target $source";
    } else {
        push @result, "$source $target";
    }
}

# Fewer usable pairs than requested: shrink the cutoff to what is there.
if ($cutoff > scalar @result) {
    $cutoff = scalar @result;
}

@result = shuffle @result;

# Half of the pairs go to the train file, the other half to the test file;
# with an odd cutoff the middle pair is left out of both.
my $size = int($cutoff / 2);

my @head = @result[0..$size - 1];
my @tail = @result[-$size..-1];

my $train_path = "$dict_dir/$source_lang" . '_' . $target_lang . '.train';
my $test_path = "$dict_dir/$source_lang" . '_' . $target_lang . '.test';

open (my $train_fh, '>', $train_path) or die "Could not open '$train_path', $!";
open (my $test_fh, '>', $test_path) or die "Could not open '$test_path', $!";

print $train_fh join("\n", @head);
print $test_fh join("\n", @tail);

close ($train_fh) or die "Could not close '$train_path', $!";
close ($test_fh) or die "Could not close '$test_path', $!";

# The in-place edit above left a backup copy behind; it is no longer needed.
unlink "$file_path$backup_suffix";