Skip to content
Snippets Groups Projects
Select Git revision
  • 49ed26bea2e00a61c0bd9d64d08466f265e612f6
  • master default protected
  • vertical_relations
  • lu_without_semantic_frames
  • hierarchy
  • additional-unification-filters
  • v0.1.1
  • v0.1.0
  • v0.0.9
  • v0.0.8
  • v0.0.7
  • v0.0.6
  • v0.0.5
  • v0.0.4
  • v0.0.3
  • v0.0.2
  • v0.0.1
17 results

WalentyXML.py

Blame
  • user avatar
    dcz authored
    187515c2
    History
    WalentyXML.py 4.40 KiB
    #! /usr/bin/python
    # -*- coding: utf-8 -*-
    
    import traceback
    
    from xml.sax import handler
    
    import connections.models
    from importer.Entry import Entry
    
    # Paths of the two text files that WalentyTeiHandler.__init__ opens for
    # writing; the resulting file objects are passed on to importer.Entry.
    examples_out_file = 'examples_ambig.txt'
    misconnected_examples_out_file = 'examples_to_reattach.txt'
    
    
    class XMLNode:
        """Minimal tree node describing one XML element.

        A node keeps the element name, the attribute mapping it was created
        with, a link to its parent node, an ordered list of child nodes and
        the element's text content (an empty string until ``setContent`` is
        called).
        """

        def __init__(self, name, attrs, parent):
            """Create a childless node with empty text content."""
            self._name, self._attrs, self._parent = name, attrs, parent
            self._content = ""
            self._children = []

        def addChild(self, child):
            """Append *child* as the last child of this node."""
            self._children.append(child)

        def setContent(self, content):
            """Replace the node's text content with *content*."""
            self._content = content

        def __str__(self):
            """Render the node as ``name[attribute pairs](child;child;...)``."""
            attr_names = self._attrs.keys()
            attr_values = self._attrs.values()
            attr_pairs = list(zip(attr_names, attr_values))
            rendered_children = ';'.join(map(str, self._children))
            return ''.join(
                (self._name, '[', str(attr_pairs), '](', rendered_children, ')')
            )
            
        
    class TEIStructureError(Exception):
        """Raised when an ``entry`` element closes while an enclosing node is still open."""


    class WalentyTeiHandler(handler.ContentHandler):
        """SAX content handler that processes ``entry`` elements one by one.

        From an ``entry`` start tag onwards every element is mirrored into an
        ``XMLNode`` tree.  When the ``entry`` closes, the tree is wrapped in an
        ``importer.Entry``; the entry is stored if no database row with its id
        exists yet and its status is listed in ``_STORABLE_STATUSES``.
        Failures while handling a single entry are collected in ``_errors``
        and printed by ``endDocument``.
        """

        # Statuses that allow an entry missing from the database to be stored.
        _STORABLE_STATUSES = (
            '(F) sprawdzone',
            '(S) w obróbce',
            '(S) sprawdzone',
            'sprawdzone',
            '(S) gotowe',
        )

        def __init__(self, entry_meanings, meanings, frames):
            """Set up parser state and open the two output files for writing.

            ``entry_meanings``, ``meanings`` and ``frames`` are kept as given
            and passed to every ``Entry`` that gets built; ``meanings`` is also
            passed to ``Entry.store``.
            """
            handler.ContentHandler.__init__(self)
            self._subtree = None        # root node of the entry being built
            self._current = None        # innermost node that is still open
            self._constructing = False  # True once the first 'entry' has started
            self._content = ""          # text buffered since the last reset
            self._entry_meanings = entry_meanings
            self._meanings = meanings
            self._frames = frames
            self._stored_positions = {}
            self._examples_in = None # @TODO: read disambiguated file
            # Explicit UTF-8 so that writing non-ASCII text does not depend on
            # the locale's default encoding.
            self._examples_out = open(examples_out_file, "w", encoding="utf-8")
            self._misconnected_out = open(misconnected_examples_out_file, "w", encoding="utf-8")
            self._errors = []           # one formatted message per failed entry
            self._counter = 0           # number of 'entry' elements closed so far

        def startElement(self, name, attrs):
            """Open a new tree node for *name* once entry construction is active.

            An ``entry`` start tag switches construction on and clears the text
            buffer.  The flag is never switched off again, so elements that
            follow the first entry are turned into nodes as well.
            """
            if name == 'entry':
                self._constructing = True
                self._content = ""
            if not self._constructing:
                return
            # The parser may reuse the attrs object after this call returns,
            # so the node keeps its own copy.
            node = XMLNode(name, attrs.copy(), self._current)
            if self._current is None:
                self._subtree = node
            else:
                self._current.addChild(node)
            self._current = node

        def endElement(self, name):
            """Close the innermost open node, or note a paragraph end outside a tree.

            Raises:
                TEIStructureError: an ``entry`` closed while a parent node is
                    still open.
                RuntimeError: too many entries failed (see ``_finish_entry``).
            """
            node = self._current
            if node is None:
                # No node is open: only paragraph ends leave a marker in the
                # buffered text, every other element is ignored.
                if name == 'p':
                    self._content += '\n% '
                return
            # The node receives the text buffered since the last reset.
            node.setContent(self._content.strip())
            self._current = node._parent
            if name == 'entry':
                self._finish_entry()
            self._content = ''

        def _finish_entry(self):
            """Build an ``Entry`` from the finished subtree and store it if eligible."""
            self._counter += 1
            if self._current is not None:
                raise TEIStructureError('entry element closed while a parent node is still open')
            # Text of the first grandchild of the entry node; used to label errors.
            base = self._subtree._children[0]._children[0]._content
            try:
                entry = Entry(self._subtree, self._entry_meanings, self._meanings, self._frames,
                              self._examples_in, self._examples_out, self._misconnected_out)
                if connections.models.Entry.objects.filter(id=int(entry._id)).exists():
                    print("Entry exists in database: {}, status: {}".format(entry._base, entry._status))
                else:
                    print("Entry not exists in database: {}, status: {}".format(entry._base, entry._status))
                    if entry._status in self._STORABLE_STATUSES:
                        entry.store(self._meanings, self._stored_positions)
                    else:
                        print("Odrzucono niegotowe: {}, status: {}".format(entry._base, entry._status))
            except Exception as e:
                traceback.print_exc()
                self._errors.append('{}: {} ({})'.format(base, type(e).__name__, str(e)))
                # Give up once errors reach 10% of the entries, but only after
                # at least 100 entries have been processed.
                if self._counter >= 100 and len(self._errors) * 10 >= self._counter:
                    self.endDocument()
                    raise RuntimeError('too many errors encountered, abandoning ship!') from e

        def characters(self, content):
            """Append a chunk of character data to the text buffer."""
            self._content += content

        def endDocument(self):
            """Close both output files and print every collected error message."""
            for stream in (self._examples_out, self._misconnected_out):
                stream.close()
            print('encountered errors:')
            for error in self._errors:
                print(error)