diff options
| -rwxr-xr-x | get_data.sh | 13 | ||||
| -rwxr-xr-x | tab_creator.pl | 78 | ||||
| -rwxr-xr-x | train_dic_creator.pl | 2 |
3 files changed, 90 insertions, 3 deletions
diff --git a/get_data.sh b/get_data.sh index d6487bf..934c5db 100755 --- a/get_data.sh +++ b/get_data.sh | |||
| @@ -38,11 +38,20 @@ wget -nc -q http://compling.hss.ntu.edu.sg/omw/wns/slv.zip -P "${WNET}" | |||
| 38 | echo "Unzipping wordnet data" | 38 | echo "Unzipping wordnet data" |
| 39 | 39 | ||
| 40 | for lang in als bul ell ita ron slv; do | 40 | for lang in als bul ell ita ron slv; do |
| 41 | unzip -o -f -q "${WNET}/${lang}.zip" -d "${WNET}" | 41 | unzip -o -q "${WNET}/${lang}.zip" -d "${WNET}" |
| 42 | # rm -f "${WNET}/${lang}.zip" | 42 | rm -f "${WNET}/${lang}.zip" |
| 43 | done | 43 | done |
| 44 | 44 | ||
| 45 | rm -rf "${WNET}/ita/" # comes alongside iwn, not useful for us | 45 | rm -rf "${WNET}/ita/" # comes alongside iwn, not useful for us |
# The archive's own ita/ directory was removed above; the iwn directory
# takes its place under the ita name.
mv "${WNET}/iwn" "${WNET}/ita"

# Directory that receives the per-language definition files written by
# tab_creator.pl.
TAB_DIR="${WNET}/tab_files"
mkdir -p "${TAB_DIR}"

for filename in "${WNET}"/*/wn-data*.tab; do
    # When the glob matches nothing the literal pattern is left in
    # $filename; skip it instead of handing a bogus path to the script.
    [ -e "$filename" ] || continue
    echo ">>>$filename"
    # Pass TAB_DIR explicitly so the output location does not depend on
    # the script's built-in default, which is relative to the current
    # working directory.
    "${ROOT}/tab_creator.pl" "$filename" "${TAB_DIR}"
done
| 46 | 55 | ||
| 47 | echo "Downloading dictionaries" | 56 | echo "Downloading dictionaries" |
| 48 | 57 | ||
diff --git a/tab_creator.pl b/tab_creator.pl new file mode 100755 index 0000000..f9acf1c --- /dev/null +++ b/tab_creator.pl | |||
| @@ -0,0 +1,78 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | # | ||
| 3 | # | ||
| 4 | # Copyright © 2019 Yiğit Sever <yigit.sever@tedu.edu.tr> | ||
| 5 | # | ||
| 6 | # Permission is hereby granted, free of charge, to any person obtaining | ||
| 7 | # a copy of this software and associated documentation files (the "Software"), | ||
| 8 | # to deal in the Software without restriction, including without limitation | ||
| 9 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, | ||
| 10 | # and/or sell copies of the Software, and to permit persons to whom the | ||
| 11 | # Software is furnished to do so, subject to the following conditions: | ||
| 12 | # | ||
| 13 | # The above copyright notice and this permission notice shall be included | ||
| 14 | # in all copies or substantial portions of the Software. | ||
| 15 | # | ||
| 16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
| 17 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES | ||
| 18 | # OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. | ||
| 19 | # IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, | ||
| 20 | # DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, | ||
| 21 | # TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE | ||
| 22 | # OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | ||
| 23 | |||
| 24 | use strict; | ||
| 25 | use warnings; | ||
| 26 | use File::Basename; | ||
| 27 | |||
# Maps the three-letter language code found in the input file name
# (wn-data-XXX.tab) to the two-letter code used to name the output file.
my %language_codes = (
    als => "sq",
    bul => "bg",
    ell => "el",
    ita => "it",
    ron => "ro",
    slv => "sl",
);

# Usage: ./tab_creator.pl <tab_file> [tab_dir]
#   tab_file - path to a wn-data-XXX.tab file
#   tab_dir  - directory the output file is written to; optional,
#              defaults to ./wordnets/tab_files
my ($tab_file, $tab_dir) = @ARGV;

# Validate the argument before it is interpolated into any message.
if (not defined $tab_file or $tab_file eq '') {
    die "usage: ./tab_creator.pl <tab_file> [tab_dir]\n";
}

print "working on $tab_file\n";

if (not -e $tab_file) {
    die "'$tab_file' does not exist";
}

# Fall back to the default output directory when none (or an empty one)
# was given.
if (not defined $tab_dir or $tab_dir eq '') {
    $tab_dir = './wordnets/tab_files';
}

my $filename = basename($tab_file);

# The language code is taken from the file name, not from the contents.
my $lang_code;
if ($filename =~ m/wn-data-(\w{3})\.tab/) {
    $lang_code = $1;
}
else {
    die "'$filename' does not look like wn-data-XXX.tab";
}

# Refuse unknown languages instead of silently writing to '<tab_dir>/.tab'.
my $short_lang_code = $language_codes{$lang_code};
if (not defined $short_lang_code) {
    die "no short language code known for '$lang_code'";
}

open (my $fh, '<', $tab_file) or die "Could not open '$tab_file' $!";

my $outfilename = $tab_dir . '/' . $short_lang_code . '.tab';
open (my $out_fh, '>', $outfilename) or die "Could not open '$outfilename', $!";

# Rows of interest look like
#   <offset>-<pos> <lang>:def <number> <definition text>
# and are rewritten as "<pos> <offset> <definition text>". Every other row
# is skipped.
while (my $row = <$fh>) {
    chomp $row;
    # \d+ (rather than a single \d) so rows whose number after ':def' has
    # more than one digit are not dropped.
    if ($row =~ m/^(\d+)-(\w)\s+\Q$lang_code\E:def\s*\d+\s+(.*)$/) {
        my ($offset, $pos, $def) = ($1, $2, $3);
        print {$out_fh} "$pos $offset $def\n";
    }
}

close $fh or die "Could not close '$tab_file', $!";
# Buffered write errors only surface at close, so check it.
close $out_fh or die "Could not close '$outfilename', $!";
diff --git a/train_dic_creator.pl b/train_dic_creator.pl index 1921a85..a228044 100755 --- a/train_dic_creator.pl +++ b/train_dic_creator.pl | |||
| @@ -2,7 +2,7 @@ | |||
| 2 | # | 2 | # |
| 3 | # | 3 | # |
| 4 | #Copyright © 2019 Yiğit Sever <yigit.sever@tedu.edu.tr> | 4 | #Copyright © 2019 Yiğit Sever <yigit.sever@tedu.edu.tr> |
| 5 | 5 | # | |
| 6 | # Permission is hereby granted, free of charge, to any person obtaining | 6 | # Permission is hereby granted, free of charge, to any person obtaining |
| 7 | # a copy of this software and associated documentation files (the "Software"), | 7 | # a copy of this software and associated documentation files (the "Software"), |
| 8 | # to deal in the Software without restriction, including without limitation | 8 | # to deal in the Software without restriction, including without limitation |
