aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorYigit Sever2019-09-17 20:56:36 +0300
committerYigit Sever2019-09-17 20:56:36 +0300
commit2c763b88c5c376c8dc39f1bef058449a2865d1c6 (patch)
treea5b437324e6509e838fb53e2c5ffeeb967f0f703
parent4add85258bacedab23db82631c887d3b07c40f80 (diff)
downloadEvaluating-Dictionary-Alignment-2c763b88c5c376c8dc39f1bef058449a2865d1c6.tar.gz
Evaluating-Dictionary-Alignment-2c763b88c5c376c8dc39f1bef058449a2865d1c6.tar.bz2
Evaluating-Dictionary-Alignment-2c763b88c5c376c8dc39f1bef058449a2865d1c6.zip
Creating .def files from scratch
-rwxr-xr-xget_data.sh13
-rw-r--r--prep_lookup.py61
-rwxr-xr-xtab_creator.pl2
3 files changed, 73 insertions, 3 deletions
diff --git a/get_data.sh b/get_data.sh
index 934c5db..f943722 100755
--- a/get_data.sh
+++ b/get_data.sh
@@ -45,14 +45,25 @@ done
45rm -rf "${WNET}/ita/" # comes alongside iwn, not useful for us 45rm -rf "${WNET}/ita/" # comes alongside iwn, not useful for us
46mv "${WNET}/iwn" "${WNET}/ita" 46mv "${WNET}/iwn" "${WNET}/ita"
47 47
48
49echo "Creating .def files"
50
48TAB_DIR="${WNET}/tab_files" 51TAB_DIR="${WNET}/tab_files"
49mkdir -p "${TAB_DIR}" 52mkdir -p "${TAB_DIR}"
50 53
51for filename in ${WNET}/*/wn-data*.tab; do 54for filename in ${WNET}/*/wn-data*.tab; do
52 echo ">>>$filename"
53 ${ROOT}/tab_creator.pl $filename 55 ${ROOT}/tab_creator.pl $filename
54done 56done
55 57
58for PAIR in en,bg en,el en,it, en,ro, en,sl en,sq, bg,el bg,it bg,ro el,it el,ro el,sq it,ro ro,sl ro,sq; do
59 IFS=',' read -r source_lang target_lang <<< "${PAIR}"
60 python ${ROOT}/prep_lookup.py -s "${source_lang}" -t "${target_lang}"
61done
62
63READY="${WNET}/ready"
64mkdir -p "${READY}"
65mv ${ROOT}/*.def "${READY}"
66
56echo "Downloading dictionaries" 67echo "Downloading dictionaries"
57 68
58DICT="${ROOT}/dictionaries" 69DICT="${ROOT}/dictionaries"
diff --git a/prep_lookup.py b/prep_lookup.py
new file mode 100644
index 0000000..7fdfeec
--- /dev/null
+++ b/prep_lookup.py
@@ -0,0 +1,61 @@
1import argparse
2from pathlib import Path
3import collections
4import os
5
def en_and_other(other, dirname):
    """Pair English WordNet definitions with the definitions of one other language.

    Reads ``<dirname>/<other>.tab``. Every non-blank line is split on single
    spaces into a part of speech, an integer synset offset and the remaining
    text, which is stored as the definition in ``other``. The English
    definition for the same synset is looked up through NLTK's WordNet.

    Args:
        other: language code used both as the ``.tab`` file stem and as the
            key under which that language's definition is stored.
        dirname: directory that contains the ``.tab`` files.

    Returns:
        A ``defaultdict`` mapping ``(pos, offset)`` to a dict with the keys
        ``'en'`` and ``other``.

    Raises:
        ValueError: if a non-blank line has fewer than three space-separated
            fields or its offset is not an integer.
    """
    # Imported inside the function, so nltk is only loaded when this
    # function is actually called.
    from nltk.corpus import wordnet as wn

    other_file = os.path.join(dirname, other + ".tab")
    lookup = collections.defaultdict(dict)

    # Decode explicitly instead of relying on the locale default; the
    # definitions may contain non-ASCII text.
    # NOTE(review): assumes the .tab files are UTF-8 -- confirm against the
    # script that generates them.
    with open(other_file, 'r', encoding='utf-8') as f:
        for line in f:
            # A blank (or trailing empty) line cannot be unpacked into three
            # fields; skip it instead of crashing.
            if not line.strip():
                continue
            pos, offset, rest = line.split(' ', 2)
            offset = int(offset)
            # part of speech + offset is unique, so keys are a combination of both
            key = (pos, offset)
            lookup[key]['en'] = wn.synset_from_pos_and_offset(pos, offset).definition()
            lookup[key][other] = rest.rstrip()
    return lookup
20
def both_lookup(source, target, dirname):
    """Collect the definitions of two non-English languages keyed by synset.

    Reads ``<dirname>/<source>.tab`` and then ``<dirname>/<target>.tab``.
    Every non-blank line is split on single spaces into a part of speech, an
    integer synset offset and the remaining text, which is stored as the
    definition for that file's language.

    Args:
        source: language code of the first ``.tab`` file.
        target: language code of the second ``.tab`` file.
        dirname: directory that contains the ``.tab`` files.

    Returns:
        A ``defaultdict`` mapping ``(pos, offset)`` to a dict that holds the
        definition under ``source`` and/or ``target``, depending on which
        files contained that synset.

    Raises:
        ValueError: if a non-blank line has fewer than three space-separated
            fields or its offset is not an integer.
    """
    lookup = collections.defaultdict(dict)

    for lang_code in (source, target):
        tab_file = os.path.join(dirname, lang_code + ".tab")
        # Decode explicitly instead of relying on the locale default; the
        # definitions may contain non-ASCII text.
        # NOTE(review): assumes the .tab files are UTF-8 -- confirm against
        # the script that generates them.
        with open(tab_file, 'r', encoding='utf-8') as f:
            for line in f:
                # A blank (or trailing empty) line cannot be unpacked into
                # three fields; skip it instead of crashing.
                if not line.strip():
                    continue
                pos, offset, rest = line.split(' ', 2)
                # part of speech + offset is unique, so keys are a combination of both
                lookup[(pos, int(offset))][lang_code] = rest.rstrip()
    return lookup
34
def main(args):
    """Write a pair of line-aligned ``.def`` files for two languages.

    Builds a synset lookup with ``en_and_other`` when either language is
    ``'en'`` and with ``both_lookup`` otherwise. For every synset that has a
    definition in both languages, one line is appended to
    ``<source>_to_<target>.def`` and the matching line to
    ``<target>_to_<source>.def``, so line *i* of both files describes the
    same synset. Both files are created relative to the current working
    directory.

    Args:
        args: parsed command line arguments providing ``tab_directory``,
            ``source_lang`` and ``target_lang``.
    """
    dirname = args.tab_directory
    source_lang = args.source_lang
    target_lang = args.target_lang

    if source_lang == 'en':
        lookup = en_and_other(target_lang, dirname)
    elif target_lang == 'en':
        lookup = en_and_other(source_lang, dirname)
    else:
        lookup = both_lookup(source_lang, target_lang, dirname)

    source_path = f'{source_lang}_to_{target_lang}.def'
    target_path = f'{target_lang}_to_{source_lang}.def'

    # Encode explicitly: the definitions may contain non-ASCII text and the
    # locale default encoding is not guaranteed to be able to represent it.
    with open(source_path, 'w', encoding='utf-8') as sf, \
            open(target_path, 'w', encoding='utf-8') as tf:
        # Only the per-synset dicts are needed here, not the (pos, offset) keys.
        for overlap in lookup.values():
            # Keep only synsets that are defined in both languages so the two
            # output files stay aligned line by line.
            if source_lang in overlap and target_lang in overlap:
                print(overlap[source_lang], file=sf)
                print(overlap[target_lang], file=tf)
53
if __name__ == "__main__":
    # Command line entry point: parse the .tab directory and the two language
    # codes, then hand them to main().
    parser = argparse.ArgumentParser(description='Create a pair of .def files for 2 given languages')
    parser.add_argument('--tab_directory', help='directory of the .tab files', default='wordnets/tab_files')
    # Both language codes are mandatory: without them main() would fail later
    # with a TypeError while building the .tab file names, so let argparse
    # report a proper usage error instead.
    parser.add_argument('-s', '--source_lang', help='source language 2 letter code', required=True)
    parser.add_argument('-t', '--target_lang', help='target language 2 letter code', required=True)
    args = parser.parse_args()

    main(args)
diff --git a/tab_creator.pl b/tab_creator.pl
index f9acf1c..6efce46 100755
--- a/tab_creator.pl
+++ b/tab_creator.pl
@@ -36,8 +36,6 @@ my %language_codes = (
36 36
37my ($tab_file, $tab_dir) = @ARGV; 37my ($tab_file, $tab_dir) = @ARGV;
38 38
39print "working on $tab_file\n";
40
41if (not defined $tab_file or not defined $tab_file) { 39if (not defined $tab_file or not defined $tab_file) {
42 die "usage: ./tab_creator.pl <tab_file>"; 40 die "usage: ./tab_creator.pl <tab_file>";
43} 41}