aboutsummaryrefslogtreecommitdiffstats
path: root/scripts/prep_lookup.py
diff options
context:
space:
mode:
author Yigit Sever 2019-09-17 21:36:20 +0300
committer Yigit Sever 2019-09-17 21:36:20 +0300
commit 576377215513e097d805fc1ed33b8613bdd2e43f (patch)
tree 0f3aac612b4867ea35f330d97f2e38e3c23b6cba /scripts/prep_lookup.py
parent 2c763b88c5c376c8dc39f1bef058449a2865d1c6 (diff)
downloadEvaluating-Dictionary-Alignment-576377215513e097d805fc1ed33b8613bdd2e43f.tar.gz
Evaluating-Dictionary-Alignment-576377215513e097d805fc1ed33b8613bdd2e43f.tar.bz2
Evaluating-Dictionary-Alignment-576377215513e097d805fc1ed33b8613bdd2e43f.zip
Hide away the scripts
Diffstat (limited to 'scripts/prep_lookup.py')
-rw-r--r-- scripts/prep_lookup.py 61
1 files changed, 61 insertions, 0 deletions
diff --git a/scripts/prep_lookup.py b/scripts/prep_lookup.py
new file mode 100644
index 0000000..7fdfeec
--- /dev/null
+++ b/scripts/prep_lookup.py
@@ -0,0 +1,61 @@
1import argparse
2from pathlib import Path
3import collections
4import os
5
def en_and_other(other, dirname):
    """Pair the English WordNet definitions with the definitions of `other`.

    Reads ``<dirname>/<other>.tab``. Every non-blank line is split on its
    first two spaces into a part of speech, a synset offset and the
    definition text (which may itself contain spaces). The English
    definition for the same synset is fetched from NLTK's WordNet corpus.

    Args:
        other: language code; names the ``.tab`` file to read and the key
            under which that language's definitions are stored.
        dirname: directory that contains the ``.tab`` files.

    Returns:
        A ``collections.defaultdict`` mapping ``(pos, offset)`` to a dict
        holding the English definition under ``'en'`` and the other
        language's definition under ``other``.

    Raises:
        ValueError: if a non-blank line has fewer than three fields or its
            offset field is not an integer.
    """
    # Function-level import: NLTK is only needed when this function is called.
    from nltk.corpus import wordnet as wn

    other_file = os.path.join(dirname, other + "." + 'tab')
    lookup = collections.defaultdict(dict)

    # Definitions contain non-ASCII text, so do not rely on the locale's
    # default encoding (assumes the .tab files are UTF-8 -- TODO confirm).
    with open(other_file, 'r', encoding='utf-8') as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing empty line) instead of
            # failing on the tuple unpacking below.
            if not line.strip():
                continue
            pos, offset, rest = line.split(' ', 2)
            offset = int(offset)
            # part of speech + offset is unique, so keys are combination of both
            en_def = wn.synset_from_pos_and_offset(pos, offset).definition()
            entry = lookup[(pos, offset)]
            entry['en'] = en_def
            entry[other] = rest.rstrip()
    return lookup
20
def both_lookup(source, target, dirname):
    """Collect the definitions of two non-English languages, keyed by synset.

    Reads ``<dirname>/<source>.tab`` and ``<dirname>/<target>.tab``. Every
    non-blank line is split on its first two spaces into a part of speech,
    a synset offset and the definition text (which may itself contain
    spaces).

    Args:
        source: language code of the first ``.tab`` file.
        target: language code of the second ``.tab`` file.
        dirname: directory that contains the ``.tab`` files.

    Returns:
        A ``collections.defaultdict`` mapping ``(pos, offset)`` to a dict
        with one entry per language code that defines that synset; a synset
        present in only one file gets only that language's key.

    Raises:
        ValueError: if a non-blank line has fewer than three fields or its
            offset field is not an integer.
    """
    lookup = collections.defaultdict(dict)

    for lang_code in (source, target):
        tab_file = os.path.join(dirname, lang_code + "." + 'tab')
        # Definitions contain non-ASCII text, so do not rely on the locale's
        # default encoding (assumes the .tab files are UTF-8 -- TODO confirm).
        with open(tab_file, 'r', encoding='utf-8') as f:
            for line in f:
                # Tolerate blank lines (e.g. a trailing empty line) instead
                # of failing on the tuple unpacking below.
                if not line.strip():
                    continue
                pos, offset, rest = line.split(' ', 2)
                # part of speech + offset is unique, so keys are combination of both
                lookup[(pos, int(offset))][lang_code] = rest.rstrip()
    return lookup
34
def main(args):
    """Write a pair of line-aligned ``.def`` files for two languages.

    Builds the synset lookup for the requested pair (using
    ``en_and_other`` when either side is ``'en'`` and ``both_lookup``
    otherwise). For every synset that has a definition in both languages,
    it writes one line to ``<source>_to_<target>.def`` and the matching
    line to ``<target>_to_<source>.def`` in the current working directory.

    Args:
        args: parsed command-line namespace providing ``tab_directory``,
            ``source_lang`` and ``target_lang``.
    """
    dirname = args.tab_directory
    source_lang = args.source_lang
    target_lang = args.target_lang

    if source_lang == 'en':
        lookup = en_and_other(target_lang, dirname)
    elif target_lang == 'en':
        lookup = en_and_other(source_lang, dirname)
    else:
        lookup = both_lookup(source_lang, target_lang, dirname)

    source_path = f'{source_lang}_to_{target_lang}.def'
    target_path = f'{target_lang}_to_{source_lang}.def'

    # Definitions contain non-ASCII text; write UTF-8 explicitly so the
    # output does not depend on (or fail under) the locale's default encoding.
    with open(source_path, 'w', encoding='utf-8') as sf, \
            open(target_path, 'w', encoding='utf-8') as tf:
        # Only the per-synset dicts are needed; the (pos, offset) keys are not.
        for overlap in lookup.values():
            # Keep only synsets defined in both languages so that line N of
            # one file corresponds to line N of the other.
            if source_lang in overlap and target_lang in overlap:
                print(overlap[source_lang], file=sf)
                print(overlap[target_lang], file=tf)
53
if __name__ == "__main__":
    # Command-line entry point: parse the options and hand them to main().
    parser = argparse.ArgumentParser(description='Create a pair of .def files for 2 given languages')
    parser.add_argument('--tab_directory', help='directory of the .tab files', default='wordnets/tab_files')
    # Both language codes are mandatory: file names are built from them, so
    # an omitted one would otherwise only surface later as a TypeError on None.
    parser.add_argument('-s', '--source_lang', help='source language 2 letter code', required=True)
    parser.add_argument('-t', '--target_lang', help='target language 2 letter code', required=True)
    args = parser.parse_args()

    main(args)