#! /usr/bin/python
# -*- coding: utf-8 -*-

from xml.sax import handler

from importer.PreprocessEntry import Entry
from meanings.models import LexicalUnit


class TEIStructureError(Exception):
    """Raised when an ``entry`` element closes while the node stack is not empty.

    Defined here because the handler raises this name and nothing else in
    this module imports or defines it.
    """


class XMLNode:
    """Lightweight tree node assembled from SAX start/end events."""

    def __init__(self, name, attrs, parent):
        """Create a node.

        Args:
            name: Element name as reported by the SAX parser.
            attrs: Attributes object of the element; only ``keys()`` and
                ``values()`` are used by this class.
            parent: Enclosing ``XMLNode``, or ``None`` for a subtree root.
        """
        self._name = name
        self._attrs = attrs
        self._children = []
        self._parent = parent
        self._content = ""

    def addChild(self, child):
        """Append ``child`` to this node's list of children."""
        self._children.append(child)

    def setContent(self, content):
        """Store ``content`` as this node's text content."""
        self._content = content

    def __str__(self):
        """Return ``name[attribute pairs](child;child;...)``."""
        # zip() is a lazy iterator on Python 3; materialise it so the actual
        # (key, value) pairs are rendered instead of "<zip object at 0x...>".
        att = list(zip(self._attrs.keys(), self._attrs.values()))
        children = ';'.join(str(child) for child in self._children)
        return self._name + '[' + str(att) + '](' + children + ')'


class WalentyPreprocessTeiHandler(handler.ContentHandler):
    """SAX handler that collects meanings and frames from ``entry`` elements.

    Each ``entry`` element is turned into an ``XMLNode`` subtree and passed to
    ``Entry``. At the end of the document the collected meanings are compared
    with the stored ``LexicalUnit`` rows and saved or updated.
    """

    def __init__(self):
        handler.ContentHandler.__init__(self)
        self._subtree = None        # root node of the entry being built
        self._current = None        # innermost open node, None outside an entry
        self._constructing = False  # set once the first 'entry' has been seen
        self._content = ""          # character data accumulated so far
        self.entry_meanings = {}    # (base, pos) -> list of meaning ids
        self.meanings = {}          # meaning id -> ((base, pos), meaning)
        self.frames = {}            # frame id -> frame

    def startElement(self, name, attrs):
        """Open a new node for ``name`` once tree construction has started."""
        if name == 'entry':
            self._constructing = True
            self._content = ""
        if self._constructing:
            node = XMLNode(name, attrs, self._current)
            if self._current is not None:
                self._current.addChild(node)
            else:
                # No open node: this element becomes the root of a new subtree.
                self._subtree = node
            self._current = node

    def endElement(self, name):
        """Close the current node; process the subtree when an entry ends.

        Raises:
            TEIStructureError: if an ``entry`` closes while an ancestor node
                is still open.
        """
        if self._current is not None:
            self._current.setContent(self._content.strip())
            self._current = self._current._parent
            if name == 'entry':
                if self._current is not None:
                    raise TEIStructureError()
                entry = Entry(self._subtree)
                if entry._semantics is not None:
                    self.extend(entry._base, entry._pos, entry._meanings,
                                entry._semantics._frames)
                self._content = ''
        else:
            # Elements closed while no node is open.
            if name in ('title', 'publisher', 'licence'):
                # The 'licence' case previously assigned to ``self.content``,
                # a typo that left the accumulated text untouched.
                self._content = ''
            elif name == 'p':
                self._content += '\n% '

    def characters(self, content):
        """Accumulate character data reported by the parser."""
        self._content += content

    def endDocument(self):
        """Synchronise the collected meanings with ``LexicalUnit`` rows.

        For every collected meaning, an existing unit matching base, sense and
        part of speech gets its gloss updated when it differs; otherwise the
        meaning is saved via ``meaning.save(pos)``.
        """
        print("Storing new lexical units")
        for (_base, pos), meaning in self.meanings.values():
            lus = LexicalUnit.objects.filter(base=meaning._name,
                                             sense=meaning._variant, pos=pos)
            if lus:
                lu = lus[0]
                assert(lu.luid == meaning._luid)
                assert(lu.synset.id == meaning._sid)
                if lu.gloss != meaning._gloss:
                    print(' updating gloss for :', lu, ' --- ', repr(lu.gloss), '->', repr(meaning._gloss))
                    lu.gloss = meaning._gloss
                    # Persist the change; the former ``changed`` flag was never
                    # set to True, so the new gloss was silently dropped.
                    lu.save()
            else:
                print(' new lu: {}-{}-{}-{}'.format(meaning._id, meaning._name, meaning._variant, pos))
                meaning.save(pos)
        print("Stored")

    def extend(self, base, pos, meanings, frames):
        """Record the meanings and frames of one entry.

        Args:
            base: Base form of the entry.
            pos: Part of speech of the entry.
            meanings: Object whose ``_meanings`` maps meaning ids to meanings.
            frames: Iterable of frames; those with ``_base`` of ``None`` are
                skipped.
        """
        self.entry_meanings[(base, pos)] = list(meanings._meanings)
        for meaning_id in meanings._meanings:
            self.meanings[meaning_id] = ((base, pos), meanings._meanings[meaning_id])
        for frame in frames:
            if frame._base is not None:
                self.frames[frame._id] = frame