#! /usr/bin/python
# -*- coding: utf-8 -*-
"""Django management command that loads a plWordnet XML dump into the database.

The dump is streamed through a SAX handler which builds unsaved ``Synset`` and
``LexicalUnit`` instances; these are then bulk-inserted, and finally the
synset-to-synset links collected from ``synsetrelations`` elements are stored
through ``Synset.hypernyms``.
"""

import zipfile
from django.core.management.base import BaseCommand

import sys, os, codecs
from xml.sax import saxutils, handler, make_parser
from collections import defaultdict

from meanings.models import Synset, LexicalUnit

from shellvalier.settings import BASE_DIR

# Number of objects passed to a single bulk_create() call.
BULK = 250

# Maps the part-of-speech names found in the XML to the codes stored on
# LexicalUnit.pos.
POS_MAP = {
    'czasownik': 'verb',
    'rzeczownik': 'noun',
    'przymiotnik': 'adj',
    'przysłówek': 'adv',
}

# Placeholder used in the XML for a missing definition/description; it is
# stored as an empty string instead.
NO_DATA = 'brak danych'


def _clean(text):
    """Return *text*, or '' when it is the "no data" placeholder."""
    return '' if text == NO_DATA else text


class PlWNHandler(handler.ContentHandler):
    """SAX content handler collecting synsets, lexical units and hypernymy.

    After parsing, the results are available in:

    * ``_synsets_to_base`` -- list of unsaved ``Synset`` objects,
    * ``_lexical_units_to_base`` -- list of unsaved ``LexicalUnit`` objects,
    * ``_hypernymy_to_base`` -- dict mapping the ``child`` synset id of every
      ``synsetrelations`` element with relation '11' to a list of its
      ``parent`` ids.
    """

    def __init__(self, out = sys.stdout):
        super().__init__()
        self._out = out
        # luid -> (base, sense, pos, desc) for every accepted <lexical-unit>.
        self._lexical_units = {}
        self._reflexive_lexical_units = {}
        self._mutual_lexical_units = {}
        # synset id -> Synset instance, used to attach units to their synset.
        self._synsets = {}
        self._root_synsets_ids = set()
        self._root_synsets = []
        # Id of the <synset> currently open, or -1 when outside any synset.
        self._defined_synset = -1
        # True while inside a <unit-id> element.
        self._unit = False
        # Text accumulated for the current <unit-id>.
        self._content = ''
        self._other_synset_relations = set()
        self._lexical_relations = set()
        self._synsets_to_base = []
        self._lexical_units_to_base = []
        self._hypernymy_to_base = defaultdict(list)
        self._synonymy_to_base = []

    def startElement(self, name, attrs):
        """Record data carried by the attributes of an opening tag."""
        if name == 'lexical-unit':
            # Units whose part of speech mentions 'pwn' are skipped
            # (presumably Princeton WordNet entries -- TODO confirm).
            if 'pwn' not in attrs['pos']:
                luid = int(attrs['id'])
                self._lexical_units[luid] = (
                    attrs['name'],
                    int(attrs['variant']),
                    attrs['pos'],
                    attrs['desc'],
                )
        elif name == 'synset':
            sid = int(attrs['id'])
            self._defined_synset = sid
            synset = Synset(id=sid, definition=_clean(attrs['definition']))
            self._synsets_to_base.append(synset)
            self._synsets[sid] = synset
        elif name == 'unit-id':
            self._unit = True
        elif name == 'synsetrelations' and attrs['relation'] == '11':
            # Relation '11' is the one stored as hypernymy by this importer.
            child_id = int(attrs['child'])
            self._hypernymy_to_base[child_id].append(int(attrs['parent']))

    def endElement(self, name):
        """Finish a synset, or turn a completed <unit-id> into a LexicalUnit."""
        if name == 'synset':
            self._defined_synset = -1
        elif name == 'unit-id':
            # Raises ValueError if no numeric text was collected, e.g. for a
            # <unit-id> outside of any <synset>.
            luid = int(self._content)
            # Ids of skipped (or never declared) lexical units are ignored.
            if luid in self._lexical_units:
                synset = self._synsets[self._defined_synset]
                lubase, lusense, pos, desc = self._lexical_units[luid]
                unit = LexicalUnit(
                    luid=luid,
                    base=lubase,
                    sense=lusense,
                    pos=POS_MAP[pos],
                    synset=synset,
                    gloss='',
                    definition=_clean(desc),
                    text_rep=f'{lubase}-{lusense}',
                )
                self._lexical_units_to_base.append(unit)
            # Reset the per-element state whether or not a unit was created.
            self._unit = False
            self._content = ''

    def characters(self, content):
        """Accumulate the text of a <unit-id> located inside a <synset>."""
        # SAX may deliver the text in several chunks, hence the concatenation.
        if self._unit and self._defined_synset >= 0 and content.strip():
            self._content += content

    def endDocument(self):
        """Nothing to finalise; storing is done by import_plWordnet()."""
        pass

#==========================================================#

class Command(BaseCommand):
    """Management command entry point; takes no arguments."""

    args = 'none'
    help = ''

    def handle(self, **options):
        """Run the plWordnet import."""
        import_plWordnet()


def _store_in_batches(model, objects, report_every):
    """Insert *objects* with ``model.objects.bulk_create`` BULK items at a time.

    Prints a running item count after every *report_every* batches.
    """
    for batch_no, start in enumerate(range(0, len(objects), BULK), start=1):
        model.objects.bulk_create(objects[start:start + BULK])
        if batch_no % report_every == 0:
            print(str(batch_no * BULK) + "...")


def import_plWordnet():
    """Parse the zipped plWordnet XML and store its contents in the database.

    Steps: parse ``data/plwordnet/plwordnet_2_1.xml.zip`` (relative to
    BASE_DIR), bulk-insert the synsets, bulk-insert the lexical units, then
    attach the collected parent synsets to each child via ``Synset.hypernyms``.
    Synsets referenced by a relation but absent from the database are reported
    on stdout and skipped.
    """
    wordnet_dir = os.path.join(BASE_DIR, 'data', 'plwordnet')
    zipped_xml_file = os.path.join(wordnet_dir, 'plwordnet_2_1.xml.zip')

    wordnet_handler = PlWNHandler()
    parser = make_parser()
    parser.setContentHandler(wordnet_handler)

    print("Parsing Wordnet...")
    with zipfile.ZipFile(zipped_xml_file, 'r') as zip_file:
        # Open the archive member in a ``with`` block so the stream is closed
        # even if parsing fails.
        with zip_file.open("data/plwordnet/plwordnet_2_1.xml") as xml_stream:
            parser.parse(xml_stream)
    print("...DONE")
    print()

    print("Storing synsets...")
    _store_in_batches(Synset, wordnet_handler._synsets_to_base, 50)
    print("...DONE")
    print()

    print("Storing lexical units...")
    _store_in_batches(LexicalUnit, wordnet_handler._lexical_units_to_base, 20)
    print("...DONE")
    print()

    print("Storing hypernyms...")
    hypernyms = wordnet_handler._hypernymy_to_base
    print(len(hypernyms))
    for i, (child_id, parent_ids) in enumerate(hypernyms.items(), start=1):
        try:
            child = Synset.objects.get(id=child_id)
        except Synset.DoesNotExist:
            print(f'************ Missing Synset {child_id}')
            continue
        # Only the ids are needed to create the relation rows.
        parents = list(Synset.objects.filter(id__in=parent_ids).only("id"))
        missing_parent_ids = set(parent_ids) - {p.id for p in parents}
        if missing_parent_ids:
            print(f'************ Missing parent Synsets for {child_id}: {missing_parent_ids}')
        if i % 2000 == 0:
            print(i, child, parents)
        child.hypernyms.add(*parents)
    print("...DONE")