Skip to content
Snippets Groups Projects
Select Git revision
  • fbe7e76e463dca650f64445c4deaac4ad7e913b3
  • master default protected
  • deanonimzer
  • v2 protected
  • v1 protected
  • develop protected
6 results

annotations.py

Blame
  • import_tei.py 12.14 KiB
    #! /usr/bin/python
    # -*- coding: utf-8 -*-
    
    import logging
    import os
    from xml.sax import handler, make_parser
    
    from django.core.management.base import BaseCommand
    
    from common.models import ImportInProgress
    from connections.models import POS, Status
    from examples.models import ExampleOpinion, ExampleSource
    from importer.WalentyPreprocessXML import WalentyPreprocessTeiHandler
    from importer.WalentyXML import WalentyTeiHandler
    from semantics.models import FrameOpinion, ArgumentRole, SemanticRole, RoleAttribute, \
        SelectionalPreferenceRelation, RoleType
    from shellvalier.environment import get_environment
    from shellvalier.settings import BASE_DIR
    from syntax.management.commands.add_predefined_preferences import create_predefined_preferences
    from syntax.management.commands.import_relations import import_relations
    from syntax.models import SchemaOpinion, Aspect, InherentSie, Negativity, Predicativity, SyntacticFunction, \
        Control, PredicativeControl, Position
    from syntax.models_phrase import (
        Case, PhraseAspect, AdverbialCategory, PhraseNegativity, PhraseInherentSie,
        Number, Gender, Degree,
        LemmaOperator, LemmaCooccur,
        ModificationType,
    )
    
    
    class Command(BaseCommand):
        """Management command whose only action is to call import_tei()."""

        help = ''
        args = 'none'

        def handle(self, **options):
            """Run the TEI import; the received options are not used."""
            import_tei()
    
    
    def import_tei():
        """Import the Walenty TEI dictionary.

        The file name comes from the WALENTY_FILE_NAME environment variable
        (falling back to 'walenty_20210913_smaller.xml') and is looked up under
        BASE_DIR/data/walenty.  The constant tables are filled first, then the
        XML file is parsed twice: once with WalentyPreprocessTeiHandler, and once
        with WalentyTeiHandler, which receives the entry_meanings, meanings and
        frames collected by the first handler.  Afterwards every
        ImportInProgress object is deleted.
        """
        logging.basicConfig(level=logging.DEBUG, filename='import.log')

        dictionary_name = os.getenv('WALENTY_FILE_NAME', default='walenty_20210913_smaller.xml')
        print("Loading walenty dict from: {}".format(dictionary_name))

        # Other dictionary files that were used here at some point:
        # 'walenty_20210913_smallest.xml' and 'walenty_20210913.xml'.
        data_file = os.path.join(BASE_DIR, 'data', 'walenty', dictionary_name)
        module_dir = os.path.dirname(os.path.abspath(__file__))
        # os.path.join drops module_dir whenever data_file is an absolute path.
        xml_path = os.path.join(module_dir, data_file)

        import_constants()

        sax_parser = make_parser()
        sax_parser.setFeature(handler.feature_external_ges, False)

        # First pass over the file.
        preprocess_handler = WalentyPreprocessTeiHandler()
        sax_parser.setContentHandler(preprocess_handler)
        sax_parser.parse(xml_path)

        # Second pass, fed with what the first pass collected.
        tei_handler = WalentyTeiHandler(
            preprocess_handler.entry_meanings,
            preprocess_handler.meanings,
            preprocess_handler.frames,
        )
        sax_parser.setContentHandler(tei_handler)
        sax_parser.parse(xml_path)

        ImportInProgress.objects.all().delete()
    
    
    def import_constants():
        """Run every constant-table import step, one after another, in a fixed order."""
        # import_predefined_preferences is intentionally absent: it is disabled
        # (commented out) in this module.
        steps = (
            import_poses,
            import_statuses,
            import_schema_opinions,
            import_frame_opinions,
            import_aspects,
            import_inherent_sies,
            import_negativities,
            import_predicativities,
            import_syntactic_functions,
            import_control_tags,
            import_semantic_roles,
            import_preference_relations,
            import_examples_sources,
            import_examples_opinions,
            import_phrase_attributes,
            import_lemma_operators,
            import_modification_types,
            import_semantic_role_types,
            # The two steps imported from syntax.management.commands run last.
            create_predefined_preferences,
            import_relations,
        )
        for step in steps:
            step()
    
    
    def import_poses():
        """Create and save one POS object per part-of-speech tag."""
        for tag in (u'unk', u'adj', u'noun', u'adv', u'verb'):
            POS(tag=tag).save()
    
    
    def import_statuses():
        """Create and save the Status objects, each with a key and a priority."""
        status_rows = (
            (10, u'do obróbki'),
            (20, u'w obróbce'),
            (25, u'do usunięcia'),
            (30, u'gotowe'),
            (35, u'zalążkowe'),
            (40, u'sprawdzone'),
            (50, u'(F) w obróbce'),
            (60, u'(F) gotowe'),
            (70, u'(F) sprawdzone'),
            (80, u'(S) w obróbce'),
            (90, u'(S) gotowe'),
            (100, u'(S) sprawdzone'),
        )
        for priority, key in status_rows:
            Status(key=key, priority=priority).save()
    
    
    def import_schema_opinions():
        """Create and save the SchemaOpinion objects (key plus priority)."""
        opinion_rows = (
            (60, u'vul'),
            (50, u'col'),
            (40, u'dat'),
            (30, u'bad'),
            (20, u'unc'),
            (10, u'cer'),
        )
        for priority, key in opinion_rows:
            SchemaOpinion(key=key, priority=priority).save()
    
    
    def import_frame_opinions():
        """Create and save the FrameOpinion objects (key plus priority)."""
        opinion_rows = (
            (70, u'met'),
            (60, u'vul'),
            (50, u'col'),
            (40, u'dat'),
            (30, u'bad'),
            (20, u'unc'),
            (10, u'cer'),
            (80, u'dom'),
            (90, u'rar'),
            (100, u'unk'),
        )
        for priority, key in opinion_rows:
            FrameOpinion(key=key, priority=priority).save()
    
    
    def import_aspects():
        """Create and save the Aspect objects, including the '_' and empty-name entries."""
        aspect_rows = ((10, u'imperf'), (20, u'perf'), (32, u'_'), (42, u''))
        for priority, aspect_name in aspect_rows:
            Aspect(name=aspect_name, priority=priority).save()
    
    
    def import_inherent_sies():
        """Create and save the two InherentSie objects, named 'false' and 'true'."""
        for priority, value_name in ((10, u'false'), (20, u'true')):
            InherentSie(name=value_name, priority=priority).save()
    
    
    def import_negativities():
        """Create and save the Negativity objects, including the '_' and empty-name entries."""
        negativity_rows = ((20, u'aff'), (10, u'neg'), (31, u'_'), (41, u''))
        for priority, negativity_name in negativity_rows:
            Negativity(name=negativity_name, priority=priority).save()
    
    
    def import_predicativities():
        """Create and save the two Predicativity objects, named 'false' and 'true'."""
        for priority, value_name in ((20, u'false'), (10, u'true')):
            Predicativity(name=value_name, priority=priority).save()
    
    
    def import_syntactic_functions():
        """Create and save the SyntacticFunction objects: 'subj', 'head' and 'obj'."""
        function_rows = ((0, u'subj'), (20, u'head'), (10, u'obj'))
        for priority, function_name in function_rows:
            SyntacticFunction(name=function_name, priority=priority).save()
    
    
    def import_control_tags():
        """Create and save the Control objects, then the PredicativeControl objects."""
        control_rows = (
            (10, u'controller'),
            (20, u'controllee'),
            (30, u'controller2'),
            (40, u'controllee2'),
        )
        for priority, tag_name in control_rows:
            Control(name=tag_name, priority=priority).save()

        predicative_rows = ((10, u'pred_controller'), (20, 'pred_controllee'))
        for priority, tag_name in predicative_rows:
            PredicativeControl(name=tag_name, priority=priority).save()
    
    
    def import_semantic_roles():
        """Create semantic roles, role attributes and their ArgumentRole pairings.

        First every SemanticRole (role name, color string, priority) and every
        RoleAttribute (attribute name, gradient, priority) is saved.  Then, for
        each SemanticRole found in the database, one ArgumentRole without an
        attribute is saved, followed by one ArgumentRole per RoleAttribute found
        in the database.
        """
        role_rows = (
            (10, u'Initiator', u'91,106,217'),
            (20, u'Stimulus', u'62,173,226'),
            (30, u'Condition', u'127,199,195'),
            (40, u'Factor', u'82,150,87'),
            (50, u'Experiencer', u'149,195,86'),
            (60, u'Theme', u'90,179,69'),
            (70, u'Recipient', u'203,77,141'),
            (80, u'Result', u'231,155,159'),
            (90, u'Instrument', u'199,221,60'),
            (100, u'Manner', u'191,48,44'),
            (110, u'Purpose', u'171,85,186'),
            (120, u'Attribute', u'220,53,47'),
            (130, u'Location', u'187,129,45'),
            (140, u'Path', u'224,121,44'),
            (150, u'Time', u'242,236,54'),
            (160, u'Duration', u'233,192,6'),
            (170, u'Measure', u'238,72,154'),
            # NOTE(review): if these are RGB components, 256 is above the usual
            # 0-255 range -- confirm how this value is consumed.
            (180, u'Lemma', u'256,256,256'),
        )
        # Attribute priorities are chosen so that, once role and attribute
        # priorities are summed,
        # Role_Source < Role_Foreground < Role_Background < Role_Goal
        # and a bare Role can be placed anywhere within that hierarchy.
        attribute_rows = (
            (1, u'Source', u'left'),
            (3, u'Foreground', u'top'),
            (5, u'Background', u'bottom'),
            (7, u'Goal', u'right'),
        )

        for priority, role_name, color in role_rows:
            SemanticRole(role=role_name, color=color, priority=priority).save()
        for priority, attribute_name, gradient in attribute_rows:
            RoleAttribute(attribute=attribute_name, gradient=gradient, priority=priority).save()

        for semantic_role in SemanticRole.objects.all():
            # The attribute-less pairing comes first, then one per attribute.
            for role_attribute in [None] + list(RoleAttribute.objects.all()):
                ArgumentRole(role=semantic_role, attribute=role_attribute).save()
    
    
    def import_semantic_role_types():
        """Create and save one RoleType object per role type name.

        The numeric value paired with each name is not passed to RoleType.
        """
        # NOTE(review): u'alernative' looks like a misspelling of 'alternative';
        # left untouched because other code may look the value up as written.
        type_rows = ((10, u'role'), (20, u'alernative'), (30, u'modifier'))
        for _unused_priority, type_name in type_rows:
            RoleType(type=type_name).save()
    
    
    # def import_predefined_preferences():
    #     predefs = [u'ALL', u'LUDZIE', u'ISTOTY', u'PODMIOTY', u'KOMUNIKAT',
    #     u'KONCEPCJA', u'WYTWÓR', u'JADŁO', u'CZAS', u'OBIEKTY', u'CECHA',
    #     u'CZYNNOŚĆ', u'KIEDY', u'CZEMU', u'ILOŚĆ', u'POŁOŻENIE', u'DOBRA', u'MIEJSCE', u'SYTUACJA', u'OTOCZENIE']
    #     for name in predefs:
    #         predef = PredefinedSelectionalPreference(key=name)
    #         predef.save()
    
    def import_preference_relations():
        """Create and save the SelectionalPreferenceRelation objects (plwn_id plus key)."""
        relation_rows = (
            (14, u'meronimia'),
            (15, u'holonimia'),
            (20, u'meronimia (typu część)'),
            (21, u'meronimia (typu porcja)'),
            (22, u'meronimia (typu miejsce)'),
            (23, u'meronimia (typu element)'),
            (24, u'meronimia (typu materiał)'),
            (25, u'holonimia (typu część)'),
            (26, u'holonimia (typu porcja)'),
            (27, u'holonimia (typu miejsce)'),
            (28, u'holonimia (typu element)'),
            (29, u'holonimia (typu materiał)'),
            (51, u'nosiciel stanu/cechy'),
            (52, u'stan/cecha'),
            (61, u'synonimia międzyparadygmatyczna'),
            (64, u'meronimia (typu element taksonomiczny)'),
            (65, u'holonimia (typu element taksonomiczny)'),
            (108, u'fuzzynimia synsetów'),
            (-1, u'RELAT'),
        )
        for plwn_id, key in relation_rows:
            SelectionalPreferenceRelation(plwn_id=plwn_id, key=key).save()
    
    
    def import_examples_sources():
        """Create and save the ExampleSource objects (key plus priority)."""
        source_rows = (
            (0, u'NKJP0.5M'),
            (1, u'NKJP1.2M'),
            (2, u'NKJP30M'),
            (3, u'NKJP250M'),
            (4, u'NKJP300M'),
            (5, u'NKJP500M'),
            (6, u'NKJP1800M'),
            (7, u'linguistic_literature'),
            (8, u'other_literature'),
            (9, u'own'),
        )
        for priority, key in source_rows:
            ExampleSource(key=key, priority=priority).save()
    
    
    def import_examples_opinions():
        """Create and save the three ExampleOpinion objects (key plus priority)."""
        opinion_rows = ((0, 'zły'), (1, 'wątpliwy'), (2, 'dobry'))
        for priority, key in opinion_rows:
            ExampleOpinion(key=key, priority=priority).save()
    
    
    def import_phrase_attributes():
        """Fill the phrase attribute tables and create the dummy Position.

        Runs the individual phrase-attribute import functions, then saves an
        empty Position and verifies that it was given id 1.

        Raises:
            AssertionError: if the dummy Position's id is not 1.
        """
        import_cases()
        import_phrase_aspects()
        import_phrase_negativities()
        import_phrase_inherent_sies()
        import_adverbial_categories()
        import_numbers()
        import_genders()
        import_degrees()
        # TODO this is quite terrible... create a dummy position for storing phrases inside a lex
        # the store() method for phrases requires a position
        dummy_position = Position()
        dummy_position.save()
        # Checked explicitly instead of with an `assert` statement, because
        # assert statements are removed when Python runs with -O and this
        # invariant would then go unverified.
        if dummy_position.id != 1:
            raise AssertionError(
                'dummy Position was expected to get id 1, got {!r}'.format(dummy_position.id))
    
    
    def import_cases():
        """Create and save the Case objects (name plus priority)."""
        case_rows = (
            (0, u'str'),
            (1, u'nom'),
            (2, u'gen'),
            (3, u'dat'),
            (4, u'acc'),
            (5, u'inst'),
            (6, u'loc'),
            (10, u'pred'),
            (11, u'part'),
            (12, u'postp'),
            (13, u'agr'),
        )
        for priority, case_name in case_rows:
            Case(name=case_name, priority=priority).save()
    
    
    def import_phrase_aspects():
        """Create and save the PhraseAspect objects: 'imperf', 'perf' and '_'."""
        for priority, aspect_name in ((10, u'imperf'), (20, u'perf'), (30, u'_')):
            PhraseAspect(name=aspect_name, priority=priority).save()
    
    
    def import_phrase_negativities():
        """Create and save the PhraseNegativity objects: 'aff', 'neg' and '_'."""
        for priority, negativity_name in ((10, u'aff'), (20, u'neg'), (30, u'_')):
            PhraseNegativity(name=negativity_name, priority=priority).save()
    
    
    def import_phrase_inherent_sies():
        """Create and save the PhraseInherentSie objects: 'się' and the empty name."""
        for priority, sie_name in ((10, u'się'), (20, u'')):
            PhraseInherentSie(name=sie_name, priority=priority).save()
    
    
    def import_adverbial_categories():
        """Create and save the AdverbialCategory objects (name plus priority)."""
        category_rows = (
            (1, u'locat'),
            (2, u'abl'),
            (3, u'adl'),
            (4, u'perl'),
            (5, u'temp'),
            (6, u'dur'),
            (7, 'mod'),
            (8, 'caus'),
            (9, 'dest'),
            (10, 'instr'),
            (11, 'pron'),
            (12, 'misc'),
        )
        for priority, category_name in category_rows:
            AdverbialCategory(name=category_name, priority=priority).save()
    
    
    def import_numbers():
        """Create and save the Number objects: 'sg', 'pl', 'agr' and '_'."""
        number_rows = ((1, u'sg'), (2, u'pl'), (10, u'agr'), (20, u'_'))
        for priority, number_name in number_rows:
            Number(name=number_name, priority=priority).save()
    
    
    def import_genders():
        """Create and save the Gender objects (name plus priority)."""
        gender_rows = (
            (1, u'm1'),
            (2, u'm2'),
            (3, u'm3'),
            (4, u'f'),
            (5, u'n'),
            (10, u'agr'),
        )
        for priority, gender_name in gender_rows:
            Gender(name=gender_name, priority=priority).save()
    
    
    def import_degrees():
        """Create and save the Degree objects: 'pos', 'com', 'sup' and '_'."""
        degree_rows = ((1, u'pos'), (2, u'com'), (3, u'sup'), (20, u'_'))
        for priority, degree_name in degree_rows:
            Degree(name=degree_name, priority=priority).save()
    
    
    def import_lemma_operators():
        """Create and save the LemmaOperator objects, then the LemmaCooccur objects."""
        for priority, operator_name in ((1, u'xor'), (2, u'or')):
            LemmaOperator(name=operator_name, priority=priority).save()

        for priority, cooccur_name in ((1, u'concat'), (2, u'coord')):
            LemmaCooccur(name=cooccur_name, priority=priority).save()
    
    
    def import_modification_types():
        """Create and save the ModificationType objects (name plus priority)."""
        modification_rows = (
            (1, u'ratr'),
            (2, u'ratr1'),
            (3, u'atr'),
            (4, u'atr1'),
            (5, u'natr'),
        )
        for priority, type_name in modification_rows:
            ModificationType(name=type_name, priority=priority).save()