import os
from xml.sax import handler, make_parser

from django.core.management.base import BaseCommand

from importer.Phrase import phrase_from_tree
from importer.Position import Position
from importer.WalentyXML import XMLNode
from shellvalier.settings import BASE_DIR
from phrase_expansions.models import (
    ExpansionOpinion,
    PhraseExpansionType,
    PhraseExpansion,
    ExpansionPosition,
    ExpansionPhrase,
    ExpansionPhraseDescription,
)
from entries.phrase_descriptions.descriptions import phrase_description2


class TEIStructureError(Exception):
    """Raised when the parsed XML tree does not have the expected structure."""


class Command(BaseCommand):
    """Management command that runs :func:`import_expansions`."""

    args = 'none'
    help = ''

    def handle(self, **options):
        import_expansions()


# Maps the opinion values found in the XML file to ExpansionOpinion keys.
OPINION_MAP = {
    'archaiczna': 'dat',
    'pewna': 'cer',
    'potoczna': 'col',
    'wątpliwa': 'unc',
}


def import_expansions():
    """Parse the phrase type expansions XML file and store its content.

    The file name is read from the ``PHRASE_TYPES_EXPAND_FILE_NAME``
    environment variable (with a built-in default) and looked up under
    ``BASE_DIR/data/walenty``.  The function creates the ExpansionOpinion
    rows, then one PhraseExpansionType per (phrase type, subtype) pair with
    its PhraseExpansion, ExpansionPosition, ExpansionPhrase and
    ExpansionPhraseDescription rows.  Priorities are 1-based and follow the
    order in which items were collected by the handler.  Finally the total
    number of imported expansions is printed.
    """
    xml_file_name = os.getenv('PHRASE_TYPES_EXPAND_FILE_NAME', default='phrase_types_expand_20210913.xml')
    xml_file = os.path.join(BASE_DIR, 'data', 'walenty', xml_file_name)
    xml_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), xml_file)

    # Keep our own reference to the handler so the result can be read
    # without going back through the parser.
    tei_handler = ExpansionsTeiHandler()
    parser = make_parser()
    parser.setContentHandler(tei_handler)
    parser.parse(xml_path)
    expansions = tei_handler._expansions

    # NOTE: existing rows are not removed here; the clean-up below was
    # disabled in the original code and is kept for reference only.
    # for cls in (ExpansionOpinion, PhraseExpansion, ExpansionPosition, ExpansionPhrase, ExpansionPhraseDescription):
    #     cls.objects.all().delete()

    opinions = [(50, 'col'), (40, 'dat'), (20, 'unc'), (10, 'cer')]
    for pri, short in opinions:
        ExpansionOpinion(key=short, priority=pri).save()

    for (phrase_type, phrase_subtype), exps in expansions.items():
        # objects.create() already saves the new row.
        expansion_type = PhraseExpansionType.objects.create(
            phrase_type=phrase_type,
            phrase_subtype=phrase_subtype,
        )
        print(expansion_type)
        for i, (positions, opinion_name) in enumerate(exps):
            opinion = ExpansionOpinion.objects.get(key=OPINION_MAP[opinion_name])
            expansion = PhraseExpansion.objects.create(
                expansion_type=expansion_type,
                opinion=opinion,
                priority=(i + 1),
            )
            expansion_type.max_positions = max(expansion_type.max_positions, len(positions))
            for j, pos in enumerate(positions):
                position = ExpansionPosition(expansion=expansion, priority=(j + 1))
                position.save()
                for text_rep, desc_pl, desc_en in pos:
                    phrase = ExpansionPhrase(position=position, text_rep=text_rep)
                    phrase.save()
                    ExpansionPhraseDescription(phrase=phrase, lang='pl', description=desc_pl).save()
                    ExpansionPhraseDescription(phrase=phrase, lang='en', description=desc_en).save()
        # Persist the max_positions value computed over all expansions.
        expansion_type.save()

    print(sum(map(len, expansions.values())))


class ExpansionsTeiHandler(handler.ContentHandler):
    """SAX handler collecting phrase type expansions from ``entry`` elements.

    Every ``entry`` element is turned into an XMLNode tree; when the element
    is closed, the tree is converted by :meth:`get_expansions` and the result
    is stored in ``self._expansions`` under the key
    ``(phrase_type, subtype)``.
    """

    def __init__(self):
        handler.ContentHandler.__init__(self)
        self._subtree = None        # root node of the entry being built
        self._current = None        # node currently being filled
        self._constructing = False  # set once the first 'entry' was seen
        self._content = ""          # character data collected so far
        self._expansions = dict()

    def startElement(self, name, attrs):
        """Open a new XMLNode for *name* once inside an ``entry`` element."""
        if name == 'entry':
            self._constructing = True
            self._content = ""
        if self._constructing:
            node = XMLNode(name, attrs, self._current)
            if self._current is not None:
                self._current.addChild(node)
            else:
                self._subtree = node
            self._current = node

    def endElement(self, name):
        """Close the current node; process the tree when an entry ends.

        Raises:
            TEIStructureError: if an ``entry`` element closes while the
                current node still has a parent.
        """
        if self._current is not None:
            self._current.setContent(self._content.strip())
            self._current = self._current._parent
            if name == 'entry':
                if self._current is not None:
                    raise TEIStructureError('entry element closed while a parent node is still open')
                typ = self._subtree._children[0]._attrs['type']
                self.get_expansions(self._subtree, typ)
        self._content = ''

    def characters(self, content):
        """Accumulate character data for the node being built."""
        self._content += content

    def get_expansions(self, tree, phrase_type):
        """Convert one entry tree into expansions and store them.

        Args:
            tree: root XMLNode of the entry.
            phrase_type: value of the ``type`` attribute of the entry's
                first child.

        The stored value is a list of ``(positions, opinion)`` pairs, where
        ``positions`` is a list of positions and each position is a list of
        ``(text_rep, desc_pl, desc_en)`` tuples.

        Raises:
            TEIStructureError: for an unknown phrase type, a duplicated
                (phrase type, subtype) pair, an unknown expansion feature
                name or an expansion without any position.
        """
        dummy_position = Position(None, None, None, None, None)
        subtype = self._get_subtype(tree, phrase_type)
        key = (phrase_type, subtype)
        if key in self._expansions:
            raise TEIStructureError('duplicate expansions for {}'.format(key))

        expansions = []
        for exp in tree._children[1]._children[0]._children:
            opinion = exp._children[0]._children[0]._attrs['value']
            expansion_positions = [
                [self._describe_phrase(phrase, dummy_position)
                 for phrase in position._children[0]._children]
                for position in self._get_position_nodes(exp)
            ]
            if not expansion_positions:
                raise TEIStructureError('expansion without positions for {}'.format(key))
            expansions.append((expansion_positions, opinion))
        self._expansions[key] = expansions

    @staticmethod
    def _get_subtype(tree, phrase_type):
        """Return the subtype of the expanded phrase type (None if it has none)."""
        head = tree._children[0]._children[0]._children[0]
        if phrase_type == 'advp':
            return head._attrs['value']
        if phrase_type == 'xp':
            return head._children[0]._children[0]._attrs['value']
        if phrase_type == 'comprepnp':
            return head._content
        if phrase_type in ('distrp', 'possp'):
            return None
        raise TEIStructureError('unexpected phrase type: {}'.format(phrase_type))

    @staticmethod
    def _get_position_nodes(exp):
        """Return the nodes holding the phrases of each position of *exp*."""
        container = exp._children[1]
        kind = container._attrs['name']
        if kind == 'phrases':
            # A bare list of phrases is treated as a single position.
            return [container]
        if kind == 'positions':
            return [pos._children[0] for pos in container._children[0]._children]
        raise TEIStructureError('unexpected expansion feature: {}'.format(kind))

    @staticmethod
    def _describe_phrase(phrase, dummy_position):
        """Return ``(text_rep, desc_pl, desc_en)`` for a single phrase node."""
        typ = phrase._attrs['type']
        if typ == 'adverb':
            adverb = phrase._children[0]._children[0]._attrs['value']
            return (
                adverb,
                'przysłówek <i>{}</i>'.format(adverb),
                '<i>{}</i> adverb'.format(adverb),
            )
        if typ == 'advp':
            # xp realised by advp(cat)
            advpcat = phrase._children[-1]._children[0]._attrs['value']
            return ('advp({})'.format(advpcat), '???', '???')
        if typ == 'comprepnp':
            prep = phrase._children[1]._children[0]._content
            return (
                'comprepnp({})'.format(prep),
                'fraza rzeczownikowo-przyimkowa z przyimkiem złożonym <i>{}</i>'.format(prep),
                'nominal-prepositional phrase with <i>{}</i> complex preposition'.format(prep),
            )
        # Every other phrase type goes through the generic importer helpers.
        phr = phrase_from_tree(phrase)
        desc_pl = phrase_description2(phr, dummy_position, None, 'pl')
        desc_en = phrase_description2(phr, dummy_position, None, 'en')
        return (str(phr), desc_pl, desc_en)