diff options
author | Yigit Sever | 2019-09-17 21:36:20 +0300 |
---|---|---|
committer | Yigit Sever | 2019-09-17 21:42:03 +0300 |
commit | 2ba704a6fced5722a1f1e3da464185e3724a57d2 (patch) | |
tree | 6e240fb60e7e97cad5252a18a3454897347a1586 /scripts/prep_lookup.py | |
parent | 2c763b88c5c376c8dc39f1bef058449a2865d1c6 (diff) | |
download | Evaluating-Dictionary-Alignment-2ba704a6fced5722a1f1e3da464185e3724a57d2.tar.gz Evaluating-Dictionary-Alignment-2ba704a6fced5722a1f1e3da464185e3724a57d2.tar.bz2 Evaluating-Dictionary-Alignment-2ba704a6fced5722a1f1e3da464185e3724a57d2.zip |
Hide away the scripts
Diffstat (limited to 'scripts/prep_lookup.py')
-rw-r--r-- | scripts/prep_lookup.py | 61 |
1 files changed, 61 insertions, 0 deletions
diff --git a/scripts/prep_lookup.py b/scripts/prep_lookup.py new file mode 100644 index 0000000..7fdfeec --- /dev/null +++ b/scripts/prep_lookup.py | |||
@@ -0,0 +1,61 @@ | |||
1 | import argparse | ||
2 | from pathlib import Path | ||
3 | import collections | ||
4 | import os | ||
5 | |||
def en_and_other(other, dirname):
    """Pair English WordNet definitions with definitions of another language.

    Reads ``<dirname>/<other>.tab``. Every non-blank line is expected to look
    like ``<pos> <offset> <definition>`` with single-space separators. For
    each line the English definition of the synset identified by
    ``(pos, offset)`` is fetched through NLTK's WordNet corpus reader.

    Args:
        other: Language code of the non-English side. It is used both as the
            stem of the ``.tab`` file name and as the key in the inner dicts.
        dirname: Directory that contains the ``.tab`` files.

    Returns:
        A ``collections.defaultdict`` mapping ``(pos, offset)`` tuples to
        ``{'en': <English definition>, other: <definition from the file>}``.

    Raises:
        ValueError: If a non-blank line has fewer than three fields or its
            offset is not an integer.
    """
    # Kept function-local so NLTK is only required when this function runs.
    from nltk.corpus import wordnet as wn

    other_file = os.path.join(dirname, other + ".tab")
    lookup = collections.defaultdict(dict)

    # Decode explicitly instead of relying on the locale's default encoding.
    # NOTE(review): assumes the .tab files are UTF-8 -- confirm.
    with open(other_file, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank lines carry no record and would break the unpacking below.
            if not line.strip():
                continue
            pos, offset, rest = line.split(' ', 2)
            offset = int(offset)
            # part of speech + offset is unique, so keys are combination of both
            key = (pos, offset)
            lookup[key]['en'] = wn.synset_from_pos_and_offset(pos, offset).definition()
            lookup[key][other] = rest.rstrip()
    return lookup
20 | |||
def both_lookup(source, target, dirname):
    """Collect the definitions of two non-English languages per synset.

    Reads ``<dirname>/<source>.tab`` and ``<dirname>/<target>.tab``. Every
    non-blank line is expected to look like ``<pos> <offset> <definition>``
    with single-space separators.

    Args:
        source: Source language code; stem of the first ``.tab`` file and the
            key its definitions are stored under.
        target: Target language code; stem of the second ``.tab`` file and
            the key its definitions are stored under.
        dirname: Directory that contains the ``.tab`` files.

    Returns:
        A ``collections.defaultdict`` mapping ``(pos, offset)`` tuples to a
        dict holding the definition for each language that has one, keyed by
        language code. Entries present in only one file have a single key.

    Raises:
        ValueError: If a non-blank line has fewer than three fields or its
            offset is not an integer.
    """
    lookup = collections.defaultdict(dict)

    for lang_code in (source, target):
        tab_file = os.path.join(dirname, lang_code + ".tab")
        # Decode explicitly instead of relying on the locale's default
        # encoding. NOTE(review): assumes the .tab files are UTF-8 -- confirm.
        with open(tab_file, 'r', encoding='utf-8') as f:
            for line in f:
                # Blank lines carry no record and would break the unpacking.
                if not line.strip():
                    continue
                pos, offset, rest = line.split(' ', 2)
                # part of speech + offset is unique, so keys are combination of both
                lookup[(pos, int(offset))][lang_code] = rest.rstrip()
    return lookup
34 | |||
def main(args):
    """Write a pair of line-aligned ``.def`` files for two languages.

    Builds a lookup of definitions keyed by synset and writes, for every
    synset that has a definition in both languages, the source definition to
    ``<source>_to_<target>.def`` and the target definition to
    ``<target>_to_<source>.def``. Both files are created in the current
    working directory, and line *i* of one corresponds to line *i* of the
    other.

    Args:
        args: Parsed command-line namespace providing ``tab_directory``,
            ``source_lang`` and ``target_lang``.
    """
    dirname = args.tab_directory
    source_lang = args.source_lang
    target_lang = args.target_lang

    # English definitions are taken from WordNet instead of a .tab file.
    if source_lang == 'en':
        lookup = en_and_other(target_lang, dirname)
    elif target_lang == 'en':
        lookup = en_and_other(source_lang, dirname)
    else:
        lookup = both_lookup(source_lang, target_lang, dirname)

    source_path = f'{source_lang}_to_{target_lang}.def'
    target_path = f'{target_lang}_to_{source_lang}.def'
    # Write UTF-8 explicitly so non-ASCII definitions do not depend on the
    # locale's default encoding.
    with open(source_path, 'w', encoding='utf-8') as sf, \
            open(target_path, 'w', encoding='utf-8') as tf:
        for overlap in lookup.values():
            # Keep only synsets defined in both languages so the files align.
            if source_lang in overlap and target_lang in overlap:
                print(overlap[source_lang], file=sf)
                print(overlap[target_lang], file=tf)
53 | |||
if __name__ == "__main__":
    # Command-line entry point: parse the options and generate the .def files.
    parser = argparse.ArgumentParser(description='Create a pair of .def files for 2 given languages')
    parser.add_argument('--tab_directory', help='directory of the .tab files', default='wordnets/tab_files')
    # Both language codes are required: they are concatenated into the .tab
    # file names, so a missing one would otherwise surface as a TypeError on
    # None instead of a usage message.
    parser.add_argument('-s', '--source_lang', required=True, help='source language 2 letter code')
    parser.add_argument('-t', '--target_lang', required=True, help='target language 2 letter code')
    args = parser.parse_args()

    main(args)