aboutsummaryrefslogtreecommitdiffstats
path: root/prep_lookup.py
blob: 7fdfeeceff9ae4743ee43754727d95a257bcae00 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import argparse
from pathlib import Path
import collections
import os

def en_and_other(other, dirname):
    """Pair English WordNet definitions with the entries of another language.

    Reads ``<dirname>/<other>.tab``. Every non-blank line is split on its
    first two single spaces into ``<pos> <offset> <text>``; the English
    definition of the same synset is then fetched from NLTK's WordNet corpus
    via ``synset_from_pos_and_offset``.

    Args:
        other: Language code. Used as the stem of the ``.tab`` file and as
            the key under which that file's text is stored.
        dirname: Directory containing the ``.tab`` files.

    Returns:
        A ``collections.defaultdict(dict)`` keyed by ``(pos, offset)``
        (``offset`` as ``int``). Each value maps ``'en'`` to the WordNet
        definition and ``other`` to the right-stripped text from the file.

    Raises:
        ValueError: If a non-blank line does not split into three fields or
            its offset is not an integer.
    """
    # Local import: NLTK is only loaded when this function is actually called.
    from nltk.corpus import wordnet as wn

    other_file = os.path.join(dirname, other + "." + 'tab')
    lookup = collections.defaultdict(dict)

    # Explicit UTF-8 (assumed encoding of the .tab files) so that non-ASCII
    # text is decoded the same way regardless of the platform's locale.
    with open(other_file, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            try:
                pos, offset, rest = line.split(' ', 2)
                offset = int(offset)
            except ValueError as err:
                raise ValueError(
                    f"{other_file}:{line_no}: expected "
                    f"'<pos> <offset> <text>', got {line!r}"
                ) from err
            # part of speech + offset is unique, so keys are combination of both
            key = (pos, offset)
            lookup[key]['en'] = wn.synset_from_pos_and_offset(pos, offset).definition()
            lookup[key][other] = rest.rstrip()
    return lookup

def both_lookup(source, target, dirname):
    """Merge the ``.tab`` files of two languages into one lookup table.

    Reads ``<dirname>/<source>.tab`` and then ``<dirname>/<target>.tab``.
    Every non-blank line is split on its first two single spaces into
    ``<pos> <offset> <text>``.

    Args:
        source: Language code of the first file; also the key under which
            its text is stored.
        target: Language code of the second file; also the key under which
            its text is stored.
        dirname: Directory containing the ``.tab`` files.

    Returns:
        A ``collections.defaultdict(dict)`` keyed by ``(pos, offset)``
        (``offset`` as ``int``). Each value maps a language code to the
        right-stripped text found for that key in that language's file; a key
        present in only one file has only that language's entry.

    Raises:
        ValueError: If a non-blank line does not split into three fields or
            its offset is not an integer.
    """
    lookup = collections.defaultdict(dict)

    for lang_code in (source, target):
        tab_file = os.path.join(dirname, lang_code + "." + 'tab')
        # Explicit UTF-8 (assumed encoding of the .tab files) so that
        # non-ASCII text is decoded the same way on every platform.
        with open(tab_file, 'r', encoding='utf-8') as f:
            for line_no, line in enumerate(f, start=1):
                if not line.strip():
                    # Tolerate blank lines (e.g. a trailing newline at EOF).
                    continue
                try:
                    pos, offset, rest = line.split(' ', 2)
                    # part of speech + offset is unique, so keys are combination of both
                    key = (pos, int(offset))
                except ValueError as err:
                    raise ValueError(
                        f"{tab_file}:{line_no}: expected "
                        f"'<pos> <offset> <text>', got {line!r}"
                    ) from err
                lookup[key][lang_code] = rest.rstrip()
    return lookup

def main(args):
    """Write a pair of line-aligned ``.def`` files for two languages.

    Builds a lookup keyed by ``(pos, offset)`` for the two languages, then,
    for every key that has an entry in both languages, writes the source
    text to ``<source>_to_<target>.def`` and the target text to
    ``<target>_to_<source>.def`` (one entry per line, same order in both
    files). Both files are created relative to the current working
    directory.

    Args:
        args: Object with the attributes ``tab_directory``, ``source_lang``
            and ``target_lang`` (e.g. the ``argparse`` namespace built by the
            command-line entry point).
    """
    dirname = args.tab_directory
    source_lang = args.source_lang
    target_lang = args.target_lang

    # When one side is English, its text comes from en_and_other (which uses
    # NLTK's WordNet); otherwise both sides are read from their .tab files.
    if source_lang == 'en':
        lookup = en_and_other(target_lang, dirname)
    elif target_lang == 'en':
        lookup = en_and_other(source_lang, dirname)
    else:
        lookup = both_lookup(source_lang, target_lang, dirname)

    source_path = f'{source_lang}_to_{target_lang}.def'
    target_path = f'{target_lang}_to_{source_lang}.def'

    # Explicit UTF-8 so non-ASCII text is written identically on every
    # platform instead of depending on the locale's default encoding.
    with open(source_path, 'w', encoding='utf-8') as sf, \
            open(target_path, 'w', encoding='utf-8') as tf:
        for overlap in lookup.values():
            # Only keys present in both languages are emitted, which keeps
            # the two output files aligned line by line.
            if source_lang in overlap and target_lang in overlap:
                print(overlap[source_lang], file=sf)
                print(overlap[target_lang], file=tf)

if __name__ == "__main__":
    # Command-line entry point: parse the options and hand them to main().
    parser = argparse.ArgumentParser(description='Create a pair of .def files for 2 given languages')
    parser.add_argument('--tab_directory', help='directory of the .tab files', default='wordnets/tab_files')
    # Both language codes are mandatory: left unset they would be None, and
    # building the "<lang>.tab" path would then fail with a TypeError.
    parser.add_argument('-s', '--source_lang', required=True, help='source language 2 letter code')
    parser.add_argument('-t', '--target_lang', required=True, help='target language 2 letter code')
    args = parser.parse_args()

    main(args)