Skip to content
Snippets Groups Projects
Select Git revision
  • 16988605fe3645fc8caef5facc9ae7afbc0acb3f
  • main default protected
  • ud_training_script
  • fix_seed
  • merged-with-ner
  • multiword_fix_transformer
  • transformer_encoder
  • combo3
  • save_deprel_matrix_to_npz
  • master protected
  • combo-lambo
  • lambo-sent-attributes
  • adding_lambo
  • develop
  • update_allenlp2
  • develop_tmp
  • tokens_truncation
  • LR_test
  • eud_iwpt
  • iob
  • eud_iwpt_shared_task_bert_finetuning
  • 3.3.1
  • list
  • 3.2.1
  • 3.0.3
  • 3.0.1
  • 3.0.0
  • v1.0.6
  • v1.0.5
  • v1.0.4
  • v1.0.3
  • v1.0.2
  • v1.0.1
  • v1.0.0
34 results

main.py

Blame
  • WalentyPreprocessXML.py 3.80 KiB
    #! /usr/bin/python
    # -*- coding: utf-8 -*-
    
    from xml.sax import handler
    from importer.PreprocessEntry import Entry
    from meanings.models import LexicalUnit
    
    
    class XMLNode:
        """Minimal tree node used to mirror an XML element in memory.

        Each node stores the element name, its attributes, the list of child
        nodes, a reference to its parent node (``None`` for a root) and the
        text content assigned through :meth:`setContent`.
        """

        def __init__(self, name, attrs, parent):
            """Create a childless node with empty text content.

            name   -- element name
            attrs  -- mapping-like object of attributes; ``__str__`` relies on
                      it providing ``items()``
            parent -- enclosing XMLNode, or ``None`` for the root of a subtree
            """
            self._name = name
            self._attrs = attrs
            self._children = []
            self._parent = parent
            self._content = ""

        def addChild(self, child):
            """Append *child* to this node's list of children."""
            self._children.append(child)

        def setContent(self, content):
            """Replace the node's text content with *content*."""
            self._content = content

        def __str__(self):
            """Return ``name[attribute pairs](child;child;...)`` for debugging."""
            # Materialise the pairs as a list: str() of a lazy zip/items view
            # would print the object's repr rather than the attributes.
            attr_pairs = list(self._attrs.items())
            rendered_children = ';'.join(str(child) for child in self._children)
            return self._name + '[' + str(attr_pairs) + '](' + rendered_children + ')'
    
    
    class WalentyPreprocessTeiHandler(handler.ContentHandler):
        """SAX content handler that gathers meanings and frames from entries.

        From the first ``entry`` element onwards, every opened element is
        mirrored into an XMLNode tree.  When an ``entry`` element closes, the
        tree is passed to ``Entry``; if the resulting entry has semantics, its
        meanings and frames are recorded through :meth:`extend`.  At the end of
        the document the recorded meanings are synchronised with
        ``LexicalUnit`` in :meth:`endDocument`.
        """

        def __init__(self):
            """Initialise empty parsing state and result dictionaries."""
            handler.ContentHandler.__init__(self)
            self._subtree = None        # root XMLNode of the most recent subtree
            self._current = None        # innermost open XMLNode, None outside a subtree
            self._constructing = False  # set on the first 'entry'; not cleared by this class
            self._content = ""          # text accumulated by characters()
            self.entry_meanings = {}    # (base, pos) -> list of meaning ids
            self.meanings = {}          # meaning id -> ((base, pos), meaning)
            self.frames = {}            # frame id -> frame

        def startElement(self, name, attrs):
            """Open a new XMLNode for the element once entry parsing has begun."""
            if name == 'entry':
                self._constructing = True
                self._content = ""
            if not self._constructing:
                return
            node = XMLNode(name, attrs, self._current)
            if self._current is None:
                # No open node: this element becomes the root of a new subtree.
                self._subtree = node
            else:
                self._current.addChild(node)
            self._current = node

        def endElement(self, name):
            """Close the current node, or handle a header element outside a subtree.

            Raises TEIStructureError when an ``entry`` element closes while
            another node is still open above it.
            """
            if self._current is None:
                # Outside any subtree (e.g. the document header).
                if name in ('title', 'publisher', 'licence'):
                    self._content = ''
                elif name == 'p':
                    self._content += '\n% '
                return

            self._current.setContent(self._content.strip())
            self._current = self._current._parent
            if name == 'entry':
                if self._current is not None:
                    # NOTE(review): TEIStructureError is not imported in the
                    # visible part of this file -- confirm it is in scope.
                    raise TEIStructureError()
                entry = Entry(self._subtree)
                if entry._semantics is not None:
                    self.extend(entry._base, entry._pos, entry._meanings, entry._semantics._frames)
            self._content = ''

        def characters(self, content):
            """Append character data to the text buffer."""
            self._content += content

        def endDocument(self):
            """Store new lexical units and update glosses of existing ones.

            For every recorded meaning, ``LexicalUnit`` is queried by base,
            sense and part of speech.  If nothing matches, the meaning is saved
            as a new unit.  If a unit matches and its ``luid`` agrees with the
            meaning, its gloss is refreshed and saved when it differs.  A
            ``luid`` mismatch is reported and the unit is left untouched.
            """
            print("Storing new lexical units")
            for (_, pos), meaning in self.meanings.values():
                lus = LexicalUnit.objects.filter(base=meaning._name, sense=meaning._variant, pos=pos)
                if not lus:
                    print('    new lu: {}-{}-{}-{}'.format(meaning._id, meaning._name, meaning._variant, pos))
                    meaning.save(pos)
                    continue

                lu = lus[0]
                if lu.luid != meaning._luid:
                    # Report the inconsistency and skip; the stored unit is
                    # not modified in this case.
                    print('    lu.luid != meaning._luid :', lu.luid, ' != ', meaning._luid, ', name: ', meaning._name, ', variant: ', meaning._variant)
                    continue

                assert(lu.synset.id == meaning._sid)
                if lu.gloss != meaning._gloss:
                    print('    updating gloss for :', lu, ' --- ', repr(lu.gloss), '->', repr(meaning._gloss))
                    lu.gloss = meaning._gloss
                    # Persist the change; without saving, the new gloss
                    # would be lost.
                    lu.save()
            print("Stored")

        def extend(self, base, pos, meanings, frames):
            """Record the meanings and frames of one entry.

            base, pos -- identify the entry; used together as a dictionary key
            meanings  -- object whose ``_meanings`` maps meaning ids to meanings
            frames    -- iterable of frames; those with ``_base`` set to None
                         are skipped
            """
            entry_key = (base, pos)
            self.entry_meanings[entry_key] = list(meanings._meanings)
            for meaning_id in meanings._meanings:
                self.meanings[meaning_id] = (entry_key, meanings._meanings[meaning_id])
            for frame in frames:
                if frame._base is not None:
                    self.frames[frame._id] = frame