aboutsummaryrefslogtreecommitdiffstats
path: root/scripts
diff options
context:
space:
mode:
Diffstat (limited to 'scripts')
-rw-r--r--scripts/prep_lookup.py61
-rwxr-xr-xscripts/tab_creator.pl76
-rwxr-xr-xscripts/train_dic_creator.pl118
3 files changed, 255 insertions, 0 deletions
diff --git a/scripts/prep_lookup.py b/scripts/prep_lookup.py
new file mode 100644
index 0000000..7fdfeec
--- /dev/null
+++ b/scripts/prep_lookup.py
@@ -0,0 +1,61 @@
1import argparse
2from pathlib import Path
3import collections
4import os
5
def en_and_other(other, dirname):
    """Pair English WordNet definitions with those of another language.

    Reads ``<dirname>/<other>.tab``, whose lines are expected to look like
    ``<pos> <offset> <definition>``, and fetches the English definition of
    the same synset from NLTK's WordNet corpus.

    Args:
        other: Language code of the non-English wordnet; also the stem of
            the ``.tab`` file that is read.
        dirname: Directory containing the ``.tab`` files.

    Returns:
        A ``defaultdict`` keyed by ``(pos, offset)``. Each value is a dict
        holding the English definition under ``'en'`` and the other
        language's definition under ``other``.
    """
    # Imported inside the function, so NLTK is only required when this
    # function is actually called.
    from nltk.corpus import wordnet as wn

    other_file = os.path.join(dirname, f'{other}.tab')
    lookup = collections.defaultdict(dict)

    # Definitions contain non-ASCII text; read them as UTF-8 instead of
    # whatever the locale default happens to be (assumes the .tab files are
    # UTF-8 encoded).
    with open(other_file, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # A blank line cannot be unpacked into three fields.
                continue
            pos, offset, rest = line.split(' ', 2)
            # part of speech + offset is unique, so keys are a combination
            # of both
            key = (pos, int(offset))
            en_def = wn.synset_from_pos_and_offset(pos, key[1]).definition()
            lookup[key]['en'] = en_def
            lookup[key][other] = rest.rstrip()
    return lookup
20
def both_lookup(source, target, dirname):
    """Collect the definitions of two non-English wordnets per synset.

    Reads ``<dirname>/<source>.tab`` and ``<dirname>/<target>.tab``, whose
    lines are expected to look like ``<pos> <offset> <definition>``.

    Args:
        source: Language code of the first wordnet (stem of its ``.tab``).
        target: Language code of the second wordnet (stem of its ``.tab``).
        dirname: Directory containing the ``.tab`` files.

    Returns:
        A ``defaultdict`` keyed by ``(pos, offset)``. Each value is a dict
        mapping a language code to that language's definition; a code is
        only present when its file has a line for the synset.
    """
    lookup = collections.defaultdict(dict)

    for lang_code in (source, target):
        tab_file = os.path.join(dirname, f'{lang_code}.tab')
        # Definitions contain non-ASCII text; read them as UTF-8 instead of
        # whatever the locale default happens to be (assumes the .tab files
        # are UTF-8 encoded).
        with open(tab_file, 'r', encoding='utf-8') as f:
            for line in f:
                if not line.strip():
                    # A blank line cannot be unpacked into three fields.
                    continue
                pos, offset, rest = line.split(' ', 2)
                # part of speech + offset is unique, so keys are a
                # combination of both
                lookup[(pos, int(offset))][lang_code] = rest.rstrip()
    return lookup
34
def main(args):
    """Write a pair of line-aligned definition files for two languages.

    Builds a lookup of definitions per synset and writes
    ``<source>_to_<target>.def`` and ``<target>_to_<source>.def`` (relative
    paths). Only synsets that have a definition in both languages are
    written, one definition per line, so line *i* of both files belongs to
    the same synset.

    Args:
        args: Parsed command line arguments providing ``tab_directory``,
            ``source_lang`` and ``target_lang``.
    """
    dirname = args.tab_directory
    source_lang = args.source_lang
    target_lang = args.target_lang

    # English definitions are not read from a .tab file, so 'en' on either
    # side goes through en_and_other with the remaining language.
    if source_lang == 'en':
        lookup = en_and_other(target_lang, dirname)
    elif target_lang == 'en':
        lookup = en_and_other(source_lang, dirname)
    else:
        lookup = both_lookup(source_lang, target_lang, dirname)

    source_path = f'{source_lang}_to_{target_lang}.def'
    target_path = f'{target_lang}_to_{source_lang}.def'

    # Definitions contain non-ASCII text; write UTF-8 explicitly rather than
    # depending on the locale's default encoding.
    with open(source_path, 'w', encoding='utf-8') as sf, \
            open(target_path, 'w', encoding='utf-8') as tf:
        for overlap in lookup.values():
            if source_lang in overlap and target_lang in overlap:
                print(overlap[source_lang], file=sf)
                print(overlap[target_lang], file=tf)
53
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Create a pair of .def files for 2 given languages')
    parser.add_argument('--tab_directory', help='directory of the .tab files', default='wordnets/tab_files')
    # Both language codes are used to build file names; mark them required so
    # a missing one is reported as a usage error instead of failing later on
    # a None value.
    parser.add_argument('-s', '--source_lang', required=True, help='source language 2 letter code')
    parser.add_argument('-t', '--target_lang', required=True, help='target language 2 letter code')
    args = parser.parse_args()

    main(args)
diff --git a/scripts/tab_creator.pl b/scripts/tab_creator.pl
new file mode 100755
index 0000000..6efce46
--- /dev/null
+++ b/scripts/tab_creator.pl
@@ -0,0 +1,76 @@
1#!/usr/bin/env perl
2#
3#
4# Copyright © 2019 Yiğit Sever <yigit.sever@tedu.edu.tr>
5#
6# Permission is hereby granted, free of charge, to any person obtaining
7# a copy of this software and associated documentation files (the "Software"),
8# to deal in the Software without restriction, including without limitation
9# the rights to use, copy, modify, merge, publish, distribute, sublicense,
10# and/or sell copies of the Software, and to permit persons to whom the
11# Software is furnished to do so, subject to the following conditions:
12#
13# The above copyright notice and this permission notice shall be included
14# in all copies or substantial portions of the Software.
15#
16# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
18# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
19# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
20# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
21# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
22# OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
24use strict;
25use warnings;
26use File::Basename;
27
# Three-letter language codes (as captured from the input file name) mapped to
# the two-letter codes used to name the generated .tab file.
my %language_codes = (
    als => "sq",
    bul => "bg",
    ell => "el",
    ita => "it",
    ron => "ro",
    slv => "sl",
);

# First argument: wordnet data file to read (required).
# Second argument: directory the output is written to (optional).
my ($tab_file, $tab_dir) = @ARGV;

if (not defined $tab_file) {
    die "usage: ./tab_creator.pl <tab_file> (tab_dir)";
}

if (not -e $tab_file) {
    die "'$tab_file' does not exist";
}

# Fall back to the default directory when none, or an empty one, is given.
if (not defined $tab_dir or $tab_dir eq '') {
    $tab_dir = './wordnets/tab_files';
}

open (my $fh, '<', $tab_file) or die "Could not open '$tab_file' $!";

my $filename = basename($tab_file);

# The language is taken from the file name; stop early when it cannot be
# determined instead of building an output path from an undefined value.
my $lang_code;
if ($filename =~ m/wn-data-(\w{3})\.tab/) {
    $lang_code = $1;
} else {
    die "'$filename' is not named like wn-data-<lang>.tab";
}

my $short_lang_code = $language_codes{$lang_code};
if (not defined $short_lang_code) {
    die "No two letter code is known for language '$lang_code'";
}

my $outfilename = $tab_dir . '/' . $short_lang_code . '.tab';
open (my $out_fh, '>', $outfilename) or die "Could not open '$outfilename', $!";

while (my $row = <$fh>) {
    chomp $row;
    # Only definition rows are kept. They have the form
    # "<offset>-<pos> <lang>:def <single digit> <definition>" and are
    # written out as "<pos> <offset> <definition>".
    if ($row =~ m/^(\d+)-(\w)\s+${lang_code}:def\s*\d\s+(.*)$/) {
        my ($offset, $pos, $def) = ($1, $2, $3);
        print $out_fh "$pos $offset $def\n";
    }
}

close $fh;
# Closing flushes buffered output; report a failed write instead of ignoring it.
close $out_fh or die "Could not write '$outfilename', $!";
diff --git a/scripts/train_dic_creator.pl b/scripts/train_dic_creator.pl
new file mode 100755
index 0000000..448fecf
--- /dev/null
+++ b/scripts/train_dic_creator.pl
@@ -0,0 +1,118 @@
1#!/usr/bin/env perl
2#
3#
4#Copyright © 2019 Yiğit Sever <yigit.sever@tedu.edu.tr>
5#
6# Permission is hereby granted, free of charge, to any person obtaining
7# a copy of this software and associated documentation files (the "Software"),
8# to deal in the Software without restriction, including without limitation
9# the rights to use, copy, modify, merge, publish, distribute, sublicense,
10# and/or sell copies of the Software, and to permit persons to whom the
11# Software is furnished to do so, subject to the following conditions:
12#
13# The above copyright notice and this permission notice shall be included
14# in all copies or substantial portions of the Software.
15#
16# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
18# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
19# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
20# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
21# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
22# OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
24
25# Get source language code and target language code
26# optionally give cutoff, cutoff/2 pairs will be prepared for train/test
27# optionally give a different dictionary directory name
28#
29# USAGE:
30# $ perl train_dic_creator.pl <source_lang> <target_lang> (dictionary_dir) (cutoff)
31
32use strict;
33use warnings;
34use List::Util qw(shuffle);
35
# Positional arguments. Note that the dictionary directory is read BEFORE the
# cutoff; both are optional.
my ($source_lang, $target_lang, $dict_dir, $cutoff) = @ARGV;

if (not defined $source_lang or not defined $target_lang) {
    die "usage: ./train_dic_creator.pl <source_lang> <target_lang> (dictionary_dir) (cutoff)";
}

# Defaults apply when an optional argument is missing or empty.
if (not defined $cutoff or $cutoff eq '') {
    $cutoff = 20000;
}

if (not defined $dict_dir or $dict_dir eq '') {
    $dict_dir = './dictionaries/';
}

# The dictionary may be stored under either language order; $flipped records
# that the columns have to be swapped to get "<source> <target>" pairs.
my $flipped = 0;
my $file_name;

if (-e "$dict_dir/$target_lang-$source_lang.dic") {
    warn "Dictionary is formatted as $target_lang $source_lang, still creating $source_lang $target_lang";
    $file_name = "$target_lang-$source_lang.dic";
    $flipped = 1;
} elsif (-e "$dict_dir/$source_lang-$target_lang.dic") {
    $file_name = "$source_lang-$target_lang.dic";
} else {
    # Without this, an undefined file name would be used for every later step.
    die "No dictionary for $source_lang and $target_lang found in '$dict_dir'";
}

# Always join with an explicit separator so a directory given without a
# trailing slash resolves to the same file the -e checks above looked at.
my $file_path = "$dict_dir/$file_name";

my $backup_suffix = '.bak';

{
    # Rewrite the dictionary in place without its empty lines. Perl keeps the
    # original as "<file>.bak" while doing so; it is deleted at the very end.
    local @ARGV = ($file_path);
    local $^I = $backup_suffix;

    while (<>) { # remove empty lines
        print if ! /^$/;
    }
}

# Let sort(1) order the lines by their leading number, highest first.
# The list form of open bypasses the shell, so the path is passed verbatim.
open(my $sort_fh, '-|', 'sort', '-rn', '--', $file_path)
    or die "Could not run sort on '$file_path', $!";
my @lines = <$sort_fh>; # better translations swim to top
close $sort_fh or warn "sort did not finish cleanly for '$file_path'";

my @result;

foreach my $line (@lines) {
    chomp($line);

    # A usable line is: integer, number, source token, target token, number,
    # number. Lines where either side has multiple tokens do not match.
    next unless $line =~ m/^\d+\s+[0-9.]+\s+(\S+)\s+(\S+)\s+[0-9.]+\s+[0-9.]+$/;
    my ($source, $target) = ($1, $2);

    if ($flipped) { # The file name and given parameters mismatch, correcting
        push @result, "$target $source";
    } else {
        push @result, "$source $target";
    }

    last if scalar @result >= $cutoff;
}

# Fewer usable pairs than requested: split what is available.
if ($cutoff > scalar @result) {
    $cutoff = scalar @result;
}

@result = shuffle @result;

# Train and test each receive the same whole number of pairs, taken from
# opposite ends of the shuffled list so they do not overlap.
my $size = int($cutoff / 2);

my @head = @result[0 .. $size - 1];
my @tail = @result[-$size .. -1];

my $train_path = "$dict_dir/${source_lang}_${target_lang}.train";
my $test_path = "$dict_dir/${source_lang}_${target_lang}.test";

open my $train_fh, '>', $train_path or die "Could not open '$train_path', $!";
open my $test_fh, '>', $test_path or die "Could not open '$test_path', $!";

print $train_fh join("\n", @head);
print $test_fh join("\n", @tail);

close $train_fh or die "Could not write '$train_path', $!";
close $test_fh or die "Could not write '$test_path', $!";

# Remove the backup left behind by the in-place edit above.
unlink "$file_path$backup_suffix";