An error occurred while loading the file. Please try again.
-
dcz authored
PHRASE_TYPES_EXPAND_FILE_NAME env
54e1d9d6
import_expansions.py 7.11 KiB
import os
from xml.sax import handler, make_parser
from django.core.management.base import BaseCommand
from importer.Phrase import phrase_from_tree
from importer.Position import Position
from importer.WalentyXML import XMLNode
from shellvalier.settings import BASE_DIR
from phrase_expansions.models import ExpansionOpinion, PhraseExpansionType, PhraseExpansion, ExpansionPosition, ExpansionPhrase, ExpansionPhraseDescription
from entries.phrase_descriptions.descriptions import phrase_description2
class Command(BaseCommand):
    """Management command that loads phrase type expansions into the database.

    All of the work is delegated to ``import_expansions()``.
    """

    args = 'none'
    # Shown by ``manage.py help <command>``; previously empty.
    help = 'Import phrase type expansions from the Walenty XML file into the database.'

    def handle(self, *args, **options):
        """Run the import.

        Accepts (and ignores) positional arguments as well as options so the
        method matches the ``handle(*args, **options)`` signature that
        ``BaseCommand`` calls it with.
        """
        import_expansions()
# Translates the (Polish) opinion labels found in the XML file into the short
# keys under which ExpansionOpinion rows are looked up.
OPINION_MAP = {
    'archaiczna': 'dat',  # archaic
    'pewna': 'cer',       # certain
    'potoczna': 'col',    # colloquial
    'wątpliwa': 'unc',    # doubtful
}
def import_expansions():
    """Parse the phrase type expansions XML file and store it in the database.

    The file name is read from the ``PHRASE_TYPES_EXPAND_FILE_NAME``
    environment variable (default ``phrase_types_expand_20210913.xml``) and is
    looked up in ``<BASE_DIR>/data/walenty``.

    For every ``(phrase_type, phrase_subtype)`` key collected by
    ``ExpansionsTeiHandler`` one ``PhraseExpansionType`` is created, with one
    ``PhraseExpansion`` per expansion, one ``ExpansionPosition`` per position
    and, for each phrase, an ``ExpansionPhrase`` with a Polish and an English
    ``ExpansionPhraseDescription``. Priorities are 1-based and follow the
    order in which the items were parsed.

    All database writes happen in a single transaction, so a failure part-way
    through leaves the database unchanged instead of half-imported.

    NOTE: existing rows are not removed first; the import only adds rows.
    """
    # Imported here so the function carries its own dependency.
    from django.db import transaction

    xml_file_name = os.getenv('PHRASE_TYPES_EXPAND_FILE_NAME', default='phrase_types_expand_20210913.xml')
    xml_file = os.path.join(BASE_DIR, 'data', 'walenty', xml_file_name)
    # os.path.join drops the leading components when a later one is absolute,
    # so this only has an effect if BASE_DIR is a relative path.
    xml_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), xml_file)

    # Keep a direct reference to the handler instead of asking the parser for
    # it again after parsing.
    tei_handler = ExpansionsTeiHandler()
    parser = make_parser()
    parser.setContentHandler(tei_handler)
    parser.parse(xml_path)
    expansions = tei_handler._expansions

    opinions = [(50, 'col'), (40, 'dat'), (20, 'unc'), (10, 'cer')]

    with transaction.atomic():
        for pri, short in opinions:
            ExpansionOpinion(key=short, priority=pri).save()

        for (phrase_type, phrase_subtype), exps in expansions.items():
            # objects.create() already saves the row; no extra save() needed.
            expansion_type = PhraseExpansionType.objects.create(
                phrase_type=phrase_type,
                phrase_subtype=phrase_subtype,
            )
            print(expansion_type)
            for priority, (positions, opinion_name) in enumerate(exps, start=1):
                opinion = ExpansionOpinion.objects.get(key=OPINION_MAP[opinion_name])
                expansion = PhraseExpansion.objects.create(
                    expansion_type=expansion_type,
                    opinion=opinion,
                    priority=priority,
                )
                expansion_type.max_positions = max(expansion_type.max_positions, len(positions))
                for pos_priority, pos in enumerate(positions, start=1):
                    position = ExpansionPosition(expansion=expansion, priority=pos_priority)
                    position.save()
                    for text_rep, desc_pl, desc_en in pos:
                        phrase = ExpansionPhrase(position=position, text_rep=text_rep)
                        phrase.save()
                        ExpansionPhraseDescription(phrase=phrase, lang='pl', description=desc_pl).save()
                        ExpansionPhraseDescription(phrase=phrase, lang='en', description=desc_en).save()
            # Persist the max_positions value accumulated over all expansions.
            expansion_type.save()

    # Total number of expansions across all phrase types.
    print(sum(map(len, expansions.values())))
class ExpansionsTeiHandler(handler.ContentHandler):
    """SAX content handler collecting phrase type expansions from the XML file.

    Each ``entry`` element is rebuilt as a tree of ``XMLNode`` objects. When
    the element closes, the tree is converted by :meth:`get_expansions` and
    stored in ``self._expansions``: a dict mapping ``(phrase_type, subtype)``
    to a list of ``(positions, opinion)`` pairs, where ``positions`` is a list
    of positions and every position is a list of
    ``(text_rep, description_pl, description_en)`` tuples.
    """

    def __init__(self):
        super().__init__()
        # Root node of the entry being built (or of the last one built).
        self._subtree = None
        # Node of the innermost open element; None when no node is open.
        self._current = None
        # Switched on by the first <entry> start tag and never switched off.
        self._constructing = False
        # Character data accumulated since the last reset.
        self._content = ""
        self._expansions = dict()

    def startElement(self, name, attrs):
        """Open a new node for *name* once node construction has started."""
        if name == 'entry':
            self._constructing = True
            self._content = ""
        if self._constructing:
            node = XMLNode(name, attrs, self._current)
            if self._current is not None:
                self._current.addChild(node)
            else:
                # No open parent: this node becomes the root of a new tree.
                self._subtree = node
            self._current = node

    def endElement(self, name):
        """Close the current node; convert the finished tree on ``entry``."""
        if self._current is not None:
            self._current.setContent(self._content.strip())
            self._current = self._current._parent
            if name == 'entry':
                if self._current is not None:
                    # NOTE(review): TEIStructureError is neither defined nor
                    # imported in the visible part of this module - confirm
                    # where it is supposed to come from.
                    raise TEIStructureError()
                typ = self._subtree._children[0]._attrs['type']
                self.get_expansions(self._subtree, typ)
        self._content = ''

    def characters(self, content):
        """Accumulate character data for the element being read."""
        self._content += content

    def get_expansions(self, tree, phrase_type):
        """Convert one ``entry`` tree and record it in ``self._expansions``.

        :param tree: root node of the entry; its first child describes the
            expanded phrase type, its second child holds the expansions.
        :param phrase_type: type of the expanded phrase (``advp``, ``xp``,
            ``comprepnp``, ``distrp`` or ``possp``).
        :raises ValueError: for an unsupported *phrase_type* or an expansion
            whose second child is neither ``phrases`` nor ``positions``.
        :raises AssertionError: if the ``(phrase_type, subtype)`` key was
            already recorded or an expansion has no positions.
        """
        subtype = self._subtype(tree, phrase_type)
        key = (phrase_type, subtype)
        assert key not in self._expansions, 'duplicate expansion entry: {}'.format(key)
        # Placeholder position handed to phrase_description2 for every phrase.
        dummy_position = Position(None, None, None, None, None)
        expansions = []
        for exp in tree._children[1]._children[0]._children:
            opinion = exp._children[0]._children[0]._attrs['value']
            expansion_positions = [
                [
                    self._describe_phrase(phrase, dummy_position)
                    for phrase in position._children[0]._children
                ]
                for position in self._position_nodes(exp)
            ]
            assert expansion_positions, 'expansion without positions in {}'.format(key)
            expansions.append((expansion_positions, opinion))
        self._expansions[key] = expansions

    @staticmethod
    def _subtype(tree, phrase_type):
        """Return the subtype of the expanded phrase (None if it has none)."""
        if phrase_type == 'advp':
            return tree._children[0]._children[0]._children[0]._attrs['value']
        if phrase_type == 'xp':
            return tree._children[0]._children[0]._children[0]._children[0]._children[0]._attrs['value']
        if phrase_type == 'comprepnp':
            return tree._children[0]._children[0]._children[0]._content
        if phrase_type in ('distrp', 'possp'):
            return None
        # Previously this fell through and failed later with an unbound local.
        raise ValueError('unsupported phrase type: {!r}'.format(phrase_type))

    @staticmethod
    def _position_nodes(exp):
        """Return the nodes holding the phrases of each position of *exp*."""
        container = exp._children[1]
        kind = container._attrs['name']
        if kind == 'phrases':
            # A bare phrase list is treated as a single position.
            return [container]
        if kind == 'positions':
            return [pos._children[0] for pos in container._children[0]._children]
        # Previously an unexpected name silently reused the positions of the
        # preceding expansion (or failed with an unbound local on the first).
        raise ValueError('unexpected expansion content: {!r}'.format(kind))

    @staticmethod
    def _describe_phrase(phrase, dummy_position):
        """Return ``(text_rep, description_pl, description_en)`` for *phrase*."""
        typ = phrase._attrs['type']
        if typ == 'adverb':
            adverb = phrase._children[0]._children[0]._attrs['value']
            return (adverb, 'przysłówek <i>{}</i>'.format(adverb), '<i>{}</i> adverb'.format(adverb))
        if typ == 'advp':
            # xp realised by advp(cat)
            advpcat = phrase._children[-1]._children[0]._attrs['value']
            return ('advp({})'.format(advpcat), '???', '???')
        if typ == 'comprepnp':
            prep = phrase._children[1]._children[0]._content
            return (
                'comprepnp({})'.format(prep),
                'fraza rzeczownikowo-przyimkowa z przyimkiem złożonym <i>{}</i>'.format(prep),
                'nominal-prepositional phrase with <i>{}</i> complex preposition'.format(prep),
            )
        # Any other phrase type goes through the generic phrase machinery.
        # (A special case for one particular lex(adjp(...)) phrase used to sit
        # here behind an ``if False:`` guard; the dead branch was removed.)
        phr = phrase_from_tree(phrase)
        desc_pl = phrase_description2(phr, dummy_position, None, 'pl')
        desc_en = phrase_description2(phr, dummy_position, None, 'en')
        return (str(phr), desc_pl, desc_en)