diff options
| author | Yigit Sever | 2019-09-17 20:56:36 +0300 |
|---|---|---|
| committer | Yigit Sever | 2019-09-17 20:56:36 +0300 |
| commit | 2c763b88c5c376c8dc39f1bef058449a2865d1c6 (patch) | |
| tree | a5b437324e6509e838fb53e2c5ffeeb967f0f703 | |
| parent | 4add85258bacedab23db82631c887d3b07c40f80 (diff) | |
| download | Evaluating-Dictionary-Alignment-2c763b88c5c376c8dc39f1bef058449a2865d1c6.tar.gz Evaluating-Dictionary-Alignment-2c763b88c5c376c8dc39f1bef058449a2865d1c6.tar.bz2 Evaluating-Dictionary-Alignment-2c763b88c5c376c8dc39f1bef058449a2865d1c6.zip | |
Creating .def files from scratch
| -rwxr-xr-x | get_data.sh | 13 | ||||
| -rw-r--r-- | prep_lookup.py | 61 | ||||
| -rwxr-xr-x | tab_creator.pl | 2 |
3 files changed, 73 insertions, 3 deletions
diff --git a/get_data.sh b/get_data.sh index 934c5db..f943722 100755 --- a/get_data.sh +++ b/get_data.sh | |||
| @@ -45,14 +45,25 @@ done | |||
| 45 | rm -rf "${WNET}/ita/" # comes alongside iwn, not useful for us | 45 | rm -rf "${WNET}/ita/" # comes alongside iwn, not useful for us |
| 46 | mv "${WNET}/iwn" "${WNET}/ita" | 46 | mv "${WNET}/iwn" "${WNET}/ita" |
| 47 | 47 | ||
| 48 | |||
| 49 | echo "Creating .def files" | ||
| 50 | |||
| 48 | TAB_DIR="${WNET}/tab_files" | 51 | TAB_DIR="${WNET}/tab_files" |
| 49 | mkdir -p "${TAB_DIR}" | 52 | mkdir -p "${TAB_DIR}" |
| 50 | 53 | ||
| 51 | for filename in ${WNET}/*/wn-data*.tab; do | 54 | for filename in ${WNET}/*/wn-data*.tab; do |
| 52 | echo ">>>$filename" | ||
| 53 | ${ROOT}/tab_creator.pl $filename | 55 | ${ROOT}/tab_creator.pl $filename |
| 54 | done | 56 | done |
| 55 | 57 | ||
| 58 | for PAIR in en,bg en,el en,it, en,ro, en,sl en,sq, bg,el bg,it bg,ro el,it el,ro el,sq it,ro ro,sl ro,sq; do | ||
| 59 | IFS=',' read -r source_lang target_lang <<< "${PAIR}" | ||
| 60 | python ${ROOT}/prep_lookup.py -s "${source_lang}" -t "${target_lang}" | ||
| 61 | done | ||
| 62 | |||
| 63 | READY="${WNET}/ready" | ||
| 64 | mkdir -p "${READY}" | ||
| 65 | mv ${ROOT}/*.def "${READY}" | ||
| 66 | |||
| 56 | echo "Downloading dictionaries" | 67 | echo "Downloading dictionaries" |
| 57 | 68 | ||
| 58 | DICT="${ROOT}/dictionaries" | 69 | DICT="${ROOT}/dictionaries" |
diff --git a/prep_lookup.py b/prep_lookup.py new file mode 100644 index 0000000..7fdfeec --- /dev/null +++ b/prep_lookup.py | |||
| @@ -0,0 +1,61 @@ | |||
| 1 | import argparse | ||
| 2 | from pathlib import Path | ||
| 3 | import collections | ||
| 4 | import os | ||
| 5 | |||
| 6 | def en_and_other(other, dirname): | ||
| 7 | from nltk.corpus import wordnet as wn | ||
| 8 | other_file = os.path.join(dirname, other + "." + 'tab') | ||
| 9 | lookup = collections.defaultdict(dict) | ||
| 10 | |||
| 11 | with open(other_file, 'r') as f: | ||
| 12 | for line in f: | ||
| 13 | (pos, offset, rest) = line.split(' ', 2) | ||
| 14 | offset = int(offset) | ||
| 15 | # part of speech + offset is unique, so keys are combination of both | ||
| 16 | en_def = wn.synset_from_pos_and_offset(pos, offset).definition() | ||
| 17 | lookup[(pos, offset)]['en'] = en_def | ||
| 18 | lookup[(pos,offset)][other] = rest.rstrip() | ||
| 19 | return lookup | ||
| 20 | |||
| 21 | def both_lookup(source, target, dirname): | ||
| 22 | from_file = os.path.join(dirname, source + "." + 'tab') | ||
| 23 | to_file = os.path.join(dirname, target + "." + 'tab') | ||
| 24 | lookup = collections.defaultdict(dict) | ||
| 25 | |||
| 26 | for tab_file, lang_code in zip((from_file, to_file), (source, target)): | ||
| 27 | with open(tab_file, 'r') as f: | ||
| 28 | for line in f: | ||
| 29 | (pos, offset, rest) = line.split(' ', 2) | ||
| 30 | offset = int(offset) | ||
| 31 | # part of speech + offset is unique, so keys are combination of both | ||
| 32 | lookup[(pos,offset)][lang_code] = rest.rstrip() | ||
| 33 | return lookup | ||
| 34 | |||
| 35 | def main(args): | ||
| 36 | |||
| 37 | dirname = args.tab_directory | ||
| 38 | source_lang = args.source_lang | ||
| 39 | target_lang = args.target_lang | ||
| 40 | |||
| 41 | if (source_lang == 'en'): | ||
| 42 | lookup = en_and_other(target_lang, dirname) | ||
| 43 | elif (target_lang == 'en'): | ||
| 44 | lookup = en_and_other(source_lang, dirname) | ||
| 45 | else: | ||
| 46 | lookup = both_lookup(source_lang, target_lang, dirname) | ||
| 47 | |||
| 48 | with open(f'{source_lang}_to_{target_lang}.def', 'w') as sf, open(f'{target_lang}_to_{source_lang}.def', 'w') as tf: | ||
| 49 | for (pos, offset), overlap in lookup.items(): | ||
| 50 | if source_lang in overlap and target_lang in overlap: | ||
| 51 | print(overlap[source_lang], file=sf) | ||
| 52 | print(overlap[target_lang], file=tf) | ||
| 53 | |||
| 54 | if __name__ == "__main__": | ||
| 55 | parser = argparse.ArgumentParser(description='Create a pair of .def files for 2 given languages') | ||
| 56 | parser.add_argument('--tab_directory', help='directory of the .tab files', default='wordnets/tab_files') | ||
| 57 | parser.add_argument('-s', '--source_lang', help='source language 2 letter code') | ||
| 58 | parser.add_argument('-t', '--target_lang', help='target language 2 letter code') | ||
| 59 | args = parser.parse_args() | ||
| 60 | |||
| 61 | main(args) | ||
diff --git a/tab_creator.pl b/tab_creator.pl index f9acf1c..6efce46 100755 --- a/tab_creator.pl +++ b/tab_creator.pl | |||
| @@ -36,8 +36,6 @@ my %language_codes = ( | |||
| 36 | 36 | ||
| 37 | my ($tab_file, $tab_dir) = @ARGV; | 37 | my ($tab_file, $tab_dir) = @ARGV; |
| 38 | 38 | ||
| 39 | print "working on $tab_file\n"; | ||
| 40 | |||
| 41 | if (not defined $tab_file or not defined $tab_file) { | 39 | if (not defined $tab_file or not defined $tab_file) { |
| 42 | die "usage: ./tab_creator.pl <tab_file>"; | 40 | die "usage: ./tab_creator.pl <tab_file>"; |
| 43 | } | 41 | } |
