3 files changed, 255 insertions, 0 deletions
diff --git a/scripts/prep_lookup.py b/scripts/prep_lookup.py
new file mode 100644
index 0000000..7fdfeec
--- /dev/null
+++ b/scripts/prep_lookup.py
@@ -0,0 +1,61 @@
+import argparse
+from pathlib import Path
+import collections
+import os
+def en_and_other(other, dirname):
+    """Pair English WordNet definitions with those of another language.
+
+    Reads ``<dirname>/<other>.tab``, whose lines have the form
+    ``<pos> <offset> <definition>``, and fetches the English definition of
+    the same synset through NLTK's WordNet corpus reader.
+
+    Args:
+        other: language code of the non-English wordnet; used as the file
+            stem and as a key of the inner dictionaries.
+        dirname: directory that holds the ``.tab`` files.
+
+    Returns:
+        A ``defaultdict`` mapping ``(pos, offset)`` to a dict holding the
+        definitions under the keys ``'en'`` and ``other``.
+    """
+    # Imported inside the function, so NLTK is only required when this
+    # function is actually called.
+    from nltk.corpus import wordnet as wn
+
+    other_file = os.path.join(dirname, other + '.tab')
+    lookup = collections.defaultdict(dict)
+    # Definitions may contain non-ASCII text, so do not rely on the locale's
+    # default encoding. NOTE(review): assumes the .tab files are UTF-8.
+    with open(other_file, 'r', encoding='utf-8') as f:
+        for line in f:
+            if not line.strip():
+                # A blank line cannot be unpacked into three fields.
+                continue
+            pos, offset, rest = line.split(' ', 2)
+            offset = int(offset)
+            # part of speech + offset is unique, so keys are combination of both
+            key = (pos, offset)
+            lookup[key]['en'] = wn.synset_from_pos_and_offset(pos, offset).definition()
+            lookup[key][other] = rest.rstrip()
+    return lookup
+def both_lookup(source, target, dirname):
+    """Collect the definitions of two non-English wordnets per synset.
+
+    Reads ``<dirname>/<source>.tab`` and ``<dirname>/<target>.tab``, whose
+    lines have the form ``<pos> <offset> <definition>``.
+
+    Args:
+        source: language code of the first wordnet (file stem and dict key).
+        target: language code of the second wordnet (file stem and dict key).
+        dirname: directory that holds the ``.tab`` files.
+
+    Returns:
+        A ``defaultdict`` mapping ``(pos, offset)`` to a dict that holds the
+        definition under ``source`` and/or ``target``, depending on which
+        file(s) contain that synset.
+    """
+    lookup = collections.defaultdict(dict)
+    for lang_code in (source, target):
+        tab_file = os.path.join(dirname, lang_code + '.tab')
+        # Definitions may contain non-ASCII text, so do not rely on the
+        # locale's default encoding. NOTE(review): assumes UTF-8 .tab files.
+        with open(tab_file, 'r', encoding='utf-8') as f:
+            for line in f:
+                if not line.strip():
+                    # A blank line cannot be unpacked into three fields.
+                    continue
+                pos, offset, rest = line.split(' ', 2)
+                # part of speech + offset is unique, so keys are combination of both
+                lookup[(pos, int(offset))][lang_code] = rest.rstrip()
+    return lookup
+def main(args):
+    """Write a pair of line-aligned definition files for two languages.
+
+    Builds the synset lookup for the requested pair and, for every synset
+    that has a definition in both languages, writes the source definition to
+    ``<source>_to_<target>.def`` and the target definition to
+    ``<target>_to_<source>.def`` in the current working directory, so that
+    line *i* of both files describes the same synset.
+
+    Args:
+        args: parsed command line options providing ``tab_directory``,
+            ``source_lang`` and ``target_lang``.
+    """
+    dirname = args.tab_directory
+    source_lang = args.source_lang
+    target_lang = args.target_lang
+
+    # English definitions are looked up separately instead of being read
+    # from a .tab file, hence the dedicated helper.
+    if source_lang == 'en':
+        lookup = en_and_other(target_lang, dirname)
+    elif target_lang == 'en':
+        lookup = en_and_other(source_lang, dirname)
+    else:
+        lookup = both_lookup(source_lang, target_lang, dirname)
+
+    source_path = f'{source_lang}_to_{target_lang}.def'
+    target_path = f'{target_lang}_to_{source_lang}.def'
+    # Write UTF-8 explicitly: the definitions may contain non-ASCII text and
+    # the locale's default encoding may not be able to represent it.
+    with open(source_path, 'w', encoding='utf-8') as sf, \
+            open(target_path, 'w', encoding='utf-8') as tf:
+        for overlap in lookup.values():
+            # Keep only synsets that are defined in both languages.
+            if source_lang in overlap and target_lang in overlap:
+                print(overlap[source_lang], file=sf)
+                print(overlap[target_lang], file=tf)
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='Create a pair of .def files for 2 given languages')
+    parser.add_argument('--tab_directory', help='directory of the .tab files', default='wordnets/tab_files')
+    # Both language codes are mandatory: if one is omitted it is None, and
+    # building the .tab file name from it fails later with a TypeError.
+    parser.add_argument('-s', '--source_lang', required=True, help='source language 2 letter code')
+    parser.add_argument('-t', '--target_lang', required=True, help='target language 2 letter code')
+    args = parser.parse_args()
+    main(args)
diff --git a/scripts/tab_creator.pl b/scripts/tab_creator.pl
new file mode 100755
index 0000000..6efce46
--- /dev/null
+++ b/scripts/tab_creator.pl
@@ -0,0 +1,76 @@
+#!/usr/bin/env perl
+#
+#
+# Copyright © 2019 Yiğit Sever <yigit.sever@tedu.edu.tr>
+#
+# Permission is hereby granted, free of charge, to any person obtaining
+# a copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
+# OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+use strict;
+use warnings;
+use File::Basename;
+# Three-letter language codes, as they appear in the input file name,
+# mapped to the two-letter codes used to name the output file.
+my %language_codes = (
+    als => "sq",
+    bul => "bg",
+    ell => "el",
+    ita => "it",
+    ron => "ro",
+    slv => "sl",
+);
+
+my ($tab_file, $tab_dir) = @ARGV;
+
+# Only the input file is mandatory; the output directory has a default.
+if (not defined $tab_file) {
+    die "usage: ./tab_creator.pl <tab_file> (tab_dir)";
+}
+if (not -e $tab_file) {
+    die "'$tab_file' does not exist";
+}
+
+# Fall back to the default output directory when none (or an empty one) is given.
+if (not defined $tab_dir or $tab_dir eq '') {
+    $tab_dir = './wordnets/tab_files';
+}
+
+open(my $fh, '<', $tab_file) or die "Could not open '$tab_file' $!";
+
+# The language is taken from the file name, e.g. wn-data-ita.tab -> ita.
+my $filename = basename($tab_file);
+my ($lang_code) = $filename =~ m/wn-data-(\w{3})\.tab/
+    or die "'$filename' is not named like wn-data-<lang>.tab";
+
+# Refuse unknown languages instead of writing to '<tab_dir>/.tab'.
+my $short_lang_code = $language_codes{$lang_code};
+defined $short_lang_code
+    or die "No two letter code is known for language '$lang_code'";
+
+my $outfilename = $tab_dir . '/' . $short_lang_code . '.tab';
+open(my $out_fh, '>', $outfilename) or die "Could not open '$outfilename', $!";
+
+# A definition row is "<offset>-<pos> <lang>:def <digit> <definition>",
+# separated by whitespace; every other row is ignored.
+my $def_row = qr/^(\d+)-(\w)\s+\Q$lang_code\E:def\s*\d\s+(.*)$/;
+
+while (my $row = <$fh>) {
+    chomp $row;
+    # List assignment in boolean context is true only when the row matched.
+    if (my ($offset, $pos, $def) = $row =~ $def_row) {
+        print $out_fh "$pos $offset $def\n";
+    }
+}
+
+close $fh;
+# Buffered write errors only surface on close, so check it.
+close $out_fh or die "Could not write '$outfilename', $!";
diff --git a/scripts/train_dic_creator.pl b/scripts/train_dic_creator.pl
new file mode 100755
index 0000000..448fecf
--- /dev/null
+++ b/scripts/train_dic_creator.pl
@@ -0,0 +1,118 @@
+#!/usr/bin/env perl
+#
+#
+#Copyright © 2019 Yiğit Sever <yigit.sever@tedu.edu.tr>
+#
+# Permission is hereby granted, free of charge, to any person obtaining
+# a copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
+# OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+# Get source language code and target language code
+# optionally give cutoff, cutoff/2 pairs will be prepared for train/test
+# optionally give a different dictionary directory name
+#
+# USAGE:
+# $ perl train_dic_creator.pl <source_lang> <target_lang> (dictionary_dir) (cutoff)
+use strict;
+use warnings;
+use List::Util qw(shuffle);
+# Positional arguments; note that the dictionary directory comes before the cutoff.
+my ($source_lang, $target_lang, $dict_dir, $cutoff) = @ARGV;
+
+if (not defined $source_lang or not defined $target_lang) {
+    die "usage: ./train_dic_creator.pl <source_lang> <target_lang> (dictionary_dir) (cutoff)";
+}
+
+# Defaults for the optional arguments (also applied when they are passed as '').
+if (not defined $cutoff or $cutoff eq '') {
+    $cutoff = 20000;
+}
+if (not defined $dict_dir or $dict_dir eq '') {
+    $dict_dir = './dictionaries/';
+}
+
+# The dictionary may be stored in either direction; remember whether the
+# columns have to be swapped to obtain "<source> <target>" pairs.
+my $flipped = 0;
+my $file_name;
+if (-e "$dict_dir/$target_lang-$source_lang.dic") {
+    warn "Dictionary is formatted as $target_lang $source_lang, still creating $source_lang $target_lang";
+    $file_name = "$target_lang-$source_lang.dic";
+    $flipped = 1;
+} elsif (-e "$dict_dir/$source_lang-$target_lang.dic") {
+    $file_name = "$source_lang-$target_lang.dic";
+} else {
+    die "No dictionary for $source_lang and $target_lang found in '$dict_dir'";
+}
+
+# Always join with an explicit separator so that a directory given without
+# a trailing slash works too (a doubled slash is harmless).
+my $file_path = "$dict_dir/$file_name";
+my $backup_suffix = '.bak';
+
+{
+    # Edit the dictionary in place (keeping a backup) to drop empty lines.
+    local @ARGV = ($file_path);
+    local $^I = $backup_suffix;
+    while (<>) {
+        print if ! /^$/;
+    }
+}
+
+# better translations swim to top
+# The list form of open runs sort without a shell, so unusual characters in
+# the path cannot be misinterpreted.
+open(my $sort_fh, '-|', 'sort', '-rn', '--', $file_path)
+    or die "Could not run sort on '$file_path': $!";
+my @lines = <$sort_fh>;
+close $sort_fh or die "sort failed on '$file_path'";
+
+# Count, score, source token, target token and two more scores; lines where
+# either side consists of multiple tokens do not match and are skipped.
+my $pair_line = qr/^\d+\s+[0-9.]+\s+(\S+)\s+(\S+)\s+[0-9.]+\s+[0-9.]+$/;
+
+my @result;
+foreach my $line (@lines) {
+    chomp($line);
+    my ($source, $target) = $line =~ $pair_line
+        or next;
+    # The file name and given parameters may mismatch, correct the order.
+    push @result, $flipped ? "$target $source" : "$source $target";
+    last if @result >= $cutoff;
+}
+
+# Fewer usable pairs than requested: split what is available.
+if ($cutoff > @result) {
+    $cutoff = @result;
+}
+
+@result = shuffle @result;
+
+# First half goes to train, last half to test; with an odd number of pairs
+# the middle one is left out.
+my $size = int($cutoff / 2);
+my @head = @result[0 .. $size - 1];
+my @tail = @result[-$size .. -1];
+
+my $train_path = "$dict_dir/${source_lang}_$target_lang.train";
+my $test_path = "$dict_dir/${source_lang}_$target_lang.test";
+
+open(my $train_fh, '>', $train_path) or die "Could not open '$train_path', $!";
+open(my $test_fh, '>', $test_path) or die "Could not open '$test_path', $!";
+
+print {$train_fh} join("\n", @head);
+print {$test_fh} join("\n", @tail);
+
+close $train_fh or die "Could not write '$train_path', $!";
+close $test_fh or die "Could not write '$test_path', $!";
+
+# Remove the backup left behind by the in-place edit.
+unlink "$file_path$backup_suffix";

diff --git a/scripts/prep_lookup.py b/scripts/prep_lookup.py new file mode 100644 index 0000000..7fdfeec --- /dev/null +++ b/scripts/prep_lookup.py
@@ -0,0 +1,61 @@
	1	import argparse
	2	from pathlib import Path
	3	import collections
	4	import os
	5
	6	def en_and_other(other, dirname):
	7	from nltk.corpus import wordnet as wn
	8	other_file = os.path.join(dirname, other + "." + 'tab')
	9	lookup = collections.defaultdict(dict)
	10
	11	with open(other_file, 'r') as f:
	12	for line in f:
	13	(pos, offset, rest) = line.split(' ', 2)
	14	offset = int(offset)
	15	# part of speech + offset is unique, so keys are combination of both
	16	en_def = wn.synset_from_pos_and_offset(pos, offset).definition()
	17	lookup[(pos, offset)]['en'] = en_def
	18	lookup[(pos,offset)][other] = rest.rstrip()
	19	return lookup
	20
	21	def both_lookup(source, target, dirname):
	22	from_file = os.path.join(dirname, source + "." + 'tab')
	23	to_file = os.path.join(dirname, target + "." + 'tab')
	24	lookup = collections.defaultdict(dict)
	25
	26	for tab_file, lang_code in zip((from_file, to_file), (source, target)):
	27	with open(tab_file, 'r') as f:
	28	for line in f:
	29	(pos, offset, rest) = line.split(' ', 2)
	30	offset = int(offset)
	31	# part of speech + offset is unique, so keys are combination of both
	32	lookup[(pos,offset)][lang_code] = rest.rstrip()
	33	return lookup
	34
	35	def main(args):
	36
	37	dirname = args.tab_directory
	38	source_lang = args.source_lang
	39	target_lang = args.target_lang
	40
	41	if (source_lang == 'en'):
	42	lookup = en_and_other(target_lang, dirname)
	43	elif (target_lang == 'en'):
	44	lookup = en_and_other(source_lang, dirname)
	45	else:
	46	lookup = both_lookup(source_lang, target_lang, dirname)
	47
	48	with open(f'{source_lang}_to_{target_lang}.def', 'w') as sf, open(f'{target_lang}_to_{source_lang}.def', 'w') as tf:
	49	for (pos, offset), overlap in lookup.items():
	50	if source_lang in overlap and target_lang in overlap:
	51	print(overlap[source_lang], file=sf)
	52	print(overlap[target_lang], file=tf)
	53
	54	if __name__ == "__main__":
	55	parser = argparse.ArgumentParser(description='Create a pair of .def files for 2 given languages')
	56	parser.add_argument('--tab_directory', help='directory of the .tab files', default='wordnets/tab_files')
	57	parser.add_argument('-s', '--source_lang', help='source language 2 letter code')
	58	parser.add_argument('-t', '--target_lang', help='target language 2 letter code')
	59	args = parser.parse_args()
	60
	61	main(args)


diff --git a/scripts/tab_creator.pl b/scripts/tab_creator.pl new file mode 100755 index 0000000..6efce46 --- /dev/null +++ b/scripts/tab_creator.pl
@@ -0,0 +1,76 @@
	1	#!/usr/bin/env perl
	2	#
	3	#
	4	# Copyright © 2019 Yiğit Sever <yigit.sever@tedu.edu.tr>
	5	#
	6	# Permission is hereby granted, free of charge, to any person obtaining
	7	# a copy of this software and associated documentation files (the "Software"),
	8	# to deal in the Software without restriction, including without limitation
	9	# the rights to use, copy, modify, merge, publish, distribute, sublicense,
	10	# and/or sell copies of the Software, and to permit persons to whom the
	11	# Software is furnished to do so, subject to the following conditions:
	12	#
	13	# The above copyright notice and this permission notice shall be included
	14	# in all copies or substantial portions of the Software.
	15	#
	16	# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
	17	# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
	18	# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
	19	# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
	20	# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
	21	# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
	22	# OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
	23
	24	use strict;
	25	use warnings;
	26	use File::Basename;
	27
	28	my %language_codes = (
	29	als => "sq",
	30	bul => "bg",
	31	ell => "el",
	32	ita => "it",
	33	ron => "ro",
	34	slv => "sl",
	35	);
	36
	37	my ($tab_file, $tab_dir) = @ARGV;
	38
	39	if (not defined $tab_file or not defined $tab_file) {
	40	die "usage: ./tab_creator.pl <tab_file>";
	41	}
	42
	43	if (not -e $tab_file) {
	44	die "'$tab_file' does not exist";
	45	}
	46
	47	if (not defined $tab_dir && $tab_dir ne '') {
	48	$tab_dir = './wordnets/tab_files';
	49	}
	50
	51	open (my $fh, '<', $tab_file) or die "Could not open '$tab_file' $!";
	52
	53	my $filename = basename($tab_file);
	54
	55	my $lang_code;
	56	if ($filename =~ m/wn-data-(\w{3})\.tab/) {
	57	$lang_code = $1;
	58	}
	59
	60
	61	my $short_lang_code = $language_codes{$lang_code};
	62
	63	my $outfilename = $tab_dir . '/' . $short_lang_code . '.tab';
	64	open (my $out_fh, '>', $outfilename) or die "Could not open '$outfilename', $!";
	65
	66	while (my $row = <$fh>) {
	67	chomp $row;
	68	if ($row =~ m/$lang_code:def/) {
	69	if ($row =~ m/^(\d+)-(\w)\s+$lang_code:def\s*\d\s+(.*)$/) {
	70	my $offset = $1;
	71	my $pos = $2;
	72	my $def = $3;
	73	print $out_fh "$pos $offset $def\n";
	74	}
	75	}
	76	}


diff --git a/scripts/train_dic_creator.pl b/scripts/train_dic_creator.pl new file mode 100755 index 0000000..448fecf --- /dev/null +++ b/scripts/train_dic_creator.pl
@@ -0,0 +1,118 @@
	1	#!/usr/bin/env perl
	2	#
	3	#
	4	#Copyright © 2019 Yiğit Sever <yigit.sever@tedu.edu.tr>
	5	#
	6	# Permission is hereby granted, free of charge, to any person obtaining
	7	# a copy of this software and associated documentation files (the "Software"),
	8	# to deal in the Software without restriction, including without limitation
	9	# the rights to use, copy, modify, merge, publish, distribute, sublicense,
	10	# and/or sell copies of the Software, and to permit persons to whom the
	11	# Software is furnished to do so, subject to the following conditions:
	12	#
	13	# The above copyright notice and this permission notice shall be included
	14	# in all copies or substantial portions of the Software.
	15	#
	16	# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
	17	# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
	18	# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
	19	# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
	20	# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
	21	# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
	22	# OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
	23
	24
	25	# Get source language code and target language code
	26	# optionally give cutoff, cutoff/2 pairs will be prepared for train/test
	27	# optionally give a different dictionary directory name
	28	#
	29	# USAGE:
	30	# $ perl train_dic_creator.pl <source_lang> <target_lang> (dictionary_dir) (cutoff)
	31
	32	use strict;
	33	use warnings;
	34	use List::Util qw(shuffle);
	35
	36	my ($source_lang, $target_lang, $dict_dir, $cutoff) = @ARGV;
	37
	38	if (not defined $source_lang or not defined $target_lang) {
	39	die "usage: ./train_dic_creator.pl <source_lang> <target_lang> (cutoff)";
	40	}
	41
	42	if (not defined $cutoff && $cutoff ne '') {
	43	$cutoff = 20000;
	44	}
	45
	46	if (not defined $dict_dir && $dict_dir ne '') {
	47	$dict_dir = './dictionaries/';
	48	}
	49
	50	my $flipped = 0;
	51	my $file_name;
	52
	53	if (-e "$dict_dir/$target_lang-$source_lang.dic") {
	54	warn "Dictionary is formatted as $target_lang $source_lang, still creating $source_lang $target_lang";
	55	$file_name = "$target_lang-$source_lang.dic";
	56	$flipped = 1;
	57	} elsif (-e "$dict_dir/$source_lang-$target_lang.dic") {
	58	$file_name = "$source_lang-$target_lang.dic";
	59	}
	60
	61	my $file_path = $dict_dir . $file_name;
	62
	63	local @ARGV = $file_path;
	64	local $^I = '.bak';
	65
	66	while (<>) { # remove empty lines
	67	print if ! /^$/;
	68	}
	69
	70	my @lines = `sort -rn $file_path`; # better translations swim to top
	71
	72	my @result;
	73	my $c = 0;
	74
	75	foreach my $line (@lines) {
	76	chomp($line);
	77	if ($line !~ m/^\d+\s+[0-9.]+\s+(\S+)\s+(\S+)\s+[0-9.]+\s+[0-9.]+$/) {
	78	# line has multiple tokens
	79	next;
	80	} else {
	81	my ($source, $target) = $line =~ m/^\d+\s+[0-9.]+\s+(\S+)\s+(\S+)\s+[0-9.]+\s+[0-9.]+$/;
	82
	83	if ($flipped) { # The file name and given parameters mismatch, correcting
	84	push @result, "$target $source";
	85	} else {
	86	push @result, "$source $target";
	87	}
	88	$c++;
	89
	90	if ($c >= $cutoff) {
	91	last;
	92	}
	93	}
	94	}
	95
	96	my $test = scalar @result;
	97
	98	if ($cutoff > scalar @result) {
	99	$cutoff = scalar @result;
	100	}
	101
	102	@result = shuffle @result;
	103
	104	my $size = $cutoff / 2;
	105
	106	my @head = @result[0..$size - 1];
	107	my @tail = @result[-$size..-1];
	108
	109	my $train_file_name = $source_lang . '_' . $target_lang . '.train';
	110	my $test_file_name = $source_lang . '_' . $target_lang . '.test';
	111
	112	open my $train_fh, '>', $dict_dir . $train_file_name;
	113	open my $test_fh, '>', $dict_dir . $test_file_name;
	114
	115	print $train_fh join("\n", @head);
	116	print $test_fh join("\n", @tail);
	117
	118	unlink "$file_path$^I";