From 2c763b88c5c376c8dc39f1bef058449a2865d1c6 Mon Sep 17 00:00:00 2001
From: Yigit Sever
Date: Tue, 17 Sep 2019 20:56:36 +0300
Subject: Creating .def files from scratch

---
 get_data.sh    |  13 ++++++-
 prep_lookup.py | 112 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 tab_creator.pl |   2 -
 3 files changed, 124 insertions(+), 3 deletions(-)
 create mode 100644 prep_lookup.py

diff --git a/get_data.sh b/get_data.sh
index 934c5db..f943722 100755
--- a/get_data.sh
+++ b/get_data.sh
@@ -45,14 +45,25 @@ done
 
 rm -rf "${WNET}/ita/" # comes alongside iwn, not useful for us
 mv "${WNET}/iwn" "${WNET}/ita"
+
+echo "Creating .def files"
+
 TAB_DIR="${WNET}/tab_files"
 mkdir -p "${TAB_DIR}"
 
 for filename in ${WNET}/*/wn-data*.tab; do
-    echo ">>>$filename"
     ${ROOT}/tab_creator.pl $filename
 done
 
+for PAIR in en,bg en,el en,it, en,ro, en,sl en,sq, bg,el bg,it bg,ro el,it el,ro el,sq it,ro ro,sl ro,sq; do
+    IFS=',' read -r source_lang target_lang <<< "${PAIR}"
+    python ${ROOT}/prep_lookup.py -s "${source_lang}" -t "${target_lang}"
+done
+
+READY="${WNET}/ready"
+mkdir -p "${READY}"
+mv ${ROOT}/*.def "${READY}"
+
 echo "Downloading dictionaries"
 
 DICT="${ROOT}/dictionaries"
diff --git a/prep_lookup.py b/prep_lookup.py
new file mode 100644
index 0000000..7fdfeec
--- /dev/null
+++ b/prep_lookup.py
@@ -0,0 +1,112 @@
+"""Create a pair of line-aligned .def files for two languages.
+
+Reads LANG.tab files whose lines look like "POS OFFSET DEFINITION" and writes
+the definitions that both languages have, in the same order, to
+SOURCE_to_TARGET.def and TARGET_to_SOURCE.def.
+"""
+import argparse
+from pathlib import Path
+import collections
+import os
+
+
+def _read_tab(tab_file, lang_code, lookup):
+    """Store every definition of ``tab_file`` in ``lookup`` under ``lang_code``.
+
+    ``lookup`` maps ``(pos, offset)`` to a ``{lang_code: definition}`` dict; it is
+    updated in place and returned.  Blank lines are skipped.
+
+    Raises:
+        ValueError: if a line is not of the form "POS OFFSET DEFINITION".
+    """
+    # Definitions may contain non-ASCII text (e.g. bg, el), so do not depend on
+    # the locale's default encoding.  Assumes the .tab files are UTF-8 -- TODO confirm.
+    with open(tab_file, 'r', encoding='utf-8') as f:
+        for line_no, line in enumerate(f, start=1):
+            if not line.strip():
+                continue
+            try:
+                (pos, offset, rest) = line.split(' ', 2)
+                offset = int(offset)
+            except ValueError as err:
+                raise ValueError(
+                    f'{tab_file}:{line_no}: expected "POS OFFSET DEFINITION"'
+                ) from err
+            # part of speech + offset is unique, so keys are combination of both
+            lookup[(pos, offset)][lang_code] = rest.rstrip()
+    return lookup
+
+
+def en_and_other(other, dirname):
+    """Pair the definitions in ``other``'s .tab file with English definitions.
+
+    The English definition is not read from a .tab file; it is fetched from the
+    NLTK WordNet corpus using the same part of speech and offset.
+
+    Returns:
+        dict mapping ``(pos, offset)`` to ``{other: definition, 'en': definition}``.
+    """
+    # Imported lazily so pairs without English do not need nltk.
+    from nltk.corpus import wordnet as wn
+    other_file = os.path.join(dirname, other + "." + 'tab')
+    lookup = _read_tab(other_file, other, collections.defaultdict(dict))
+
+    for (pos, offset), definitions in lookup.items():
+        definitions['en'] = wn.synset_from_pos_and_offset(pos, offset).definition()
+    return lookup
+
+
+def both_lookup(source, target, dirname):
+    """Collect the definitions of two languages from their .tab files.
+
+    Returns:
+        dict mapping ``(pos, offset)`` to ``{lang_code: definition}``; an entry
+        holds both languages only when both files contain that key.
+    """
+    lookup = collections.defaultdict(dict)
+    for lang_code in (source, target):
+        tab_file = os.path.join(dirname, lang_code + "." + 'tab')
+        _read_tab(tab_file, lang_code, lookup)
+    return lookup
+
+
+def main(args):
+    """Write the two .def files for ``args.source_lang`` / ``args.target_lang``.
+
+    Line N of both output files comes from the same (pos, offset) key.  The
+    files are created in the current working directory.
+    """
+    dirname = args.tab_directory
+    source_lang = args.source_lang
+    target_lang = args.target_lang
+
+    if source_lang == 'en':
+        lookup = en_and_other(target_lang, dirname)
+    elif target_lang == 'en':
+        lookup = en_and_other(source_lang, dirname)
+    else:
+        lookup = both_lookup(source_lang, target_lang, dirname)
+
+    source_def = f'{source_lang}_to_{target_lang}.def'
+    target_def = f'{target_lang}_to_{source_lang}.def'
+    with open(source_def, 'w', encoding='utf-8') as sf, \
+            open(target_def, 'w', encoding='utf-8') as tf:
+        for overlap in lookup.values():
+            # only keys present in both languages are written, keeping the files aligned
+            if source_lang in overlap and target_lang in overlap:
+                print(overlap[source_lang], file=sf)
+                print(overlap[target_lang], file=tf)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='Create a pair of .def files for 2 given languages')
+    parser.add_argument('--tab_directory', help='directory of the .tab files', default='wordnets/tab_files')
+    parser.add_argument('-s', '--source_lang', required=True, help='source language 2 letter code')
+    parser.add_argument('-t', '--target_lang', required=True, help='target language 2 letter code')
+    args = parser.parse_args()
+
+    if args.source_lang == args.target_lang:
+        # both outputs would be the same file, opened twice for writing
+        parser.error('source and target language must differ')
+
+    main(args)
diff --git a/tab_creator.pl b/tab_creator.pl
index f9acf1c..6efce46 100755
--- a/tab_creator.pl
+++ b/tab_creator.pl
@@ -36,8 +36,6 @@ my %language_codes = (
 
 my ($tab_file, $tab_dir) = @ARGV;
 
-print "working on $tab_file\n";
-
 if (not defined $tab_file or not defined $tab_file) {
     die "usage: ./tab_creator.pl ";
 }
-- 
cgit v1.2.3-70-g09d2