# ===== File: common/management/commands/create_tei.py (new file) =====
#-*- coding:utf-8 -*-

import datetime
import os

from django.core.management.base import BaseCommand
from optparse import make_option

from unifier.models import UnifiedFrame
from django.contrib.auth.models import User
from users.models import Assignment

# NOTE(review): this patch adds valunifier_tei.py under
# common/management/commands/, not directly under common/ -- confirm that the
# import path below matches where the module finally lives.
from common.valunifier_tei import createteixml

# Directory the generated XML files are written to.
BASEPATH = '.'


class Command(BaseCommand):
    """Management command exporting unified frames as TEI XML files.

    Without options a single ``valunifier_<YYYYMMDD>.xml`` file containing
    every ``UnifiedFrame`` is written.  With ``-i/--individual`` one
    ``<username>_<YYYYMMDD>.xml`` file is written per user that has at least
    one ``UnifiedFrame`` assignment.
    """

    args = ''
    help = 'Export ValUnifier in TEI format'

    def add_arguments(self, parser):
        """Register the ``-i/--individual`` switch."""
        parser.add_argument('-i', '--individual', action='store_true', help='Gen individual files.')

    def handle(self, *args, **options):
        """Run the export; see the class docstring for the two modes."""
        now = datetime.datetime.now().strftime('%Y%m%d')

        if options['individual']:
            self._export_per_user(now)
        else:
            self._export_full(now)

    def _export_full(self, now):
        """Write all unified frames into one file under BASEPATH."""
        # Progress goes through self.stdout (not print) so that callers of
        # call_command() can capture or redirect it.
        self.stdout.write("Full dictionary")
        outfile = 'valunifier_' + now + '.xml'
        createteixml(os.path.join(BASEPATH, outfile), UnifiedFrame.objects.all())

    def _export_per_user(self, now):
        """Write one file per user holding the frames assigned to that user."""
        for user in User.objects.all():
            self.stdout.write("Part for " + user.username)
            # Assignments are generic (content type + object id); keep only
            # those that point at a UnifiedFrame and resolve them to objects.
            frames = [
                assignment.subject_ct.get_object_for_this_type(id=assignment.subject_id)
                for assignment in Assignment.objects.filter(user=user)
                if assignment.subject_ct.model_class() == UnifiedFrame
            ]
            # Users without any unified frame get no file at all.
            if frames:
                outfile = user.username + '_' + now + '.xml'
                createteixml(os.path.join(BASEPATH, outfile), frames)


# ===== File: common/management/commands/valunifier_tei.py (new file) =====
#-*- coding:utf-8 -*-

import datetime

from django.db.models import Count, Min, Max
from lxml import etree
from xml.sax.saxutils import escape

from unifier.models import UnifiedFrame2SlowalFrameMapping, \
                           UnifiedFrameArgumentSlowalFrameMapping
from connections.models import ArgumentConnection

from collections import defaultdict

# Namespace of the reserved ``xml:`` prefix (used for xml:id / xml:lang).
XML_NAMESPACE = 'http://www.w3.org/XML/1998/namespace'


def createteixml(outpath, unified_frames):
    """Serialize *unified_frames* as a TEI document into the file *outpath*.

    The document is built in memory (root, header, content) and then written
    as pretty-printed UTF-8 with an XML declaration and a TEI DOCTYPE.
    """
    root = write_root()
    write_header(root)
    write_content(root, unified_frames)
    # Binary mode: lxml emits bytes already encoded as UTF-8.
    with open(outpath, 'wb') as output_file:
        root.getroottree().write(output_file,
                                 encoding='UTF-8',
                                 pretty_print=True,
                                 xml_declaration=True,
                                 doctype=u'<!DOCTYPE TEI SYSTEM "tei_all.dtd">')


def write_root():
    """Create and return the ``TEI`` root element (xml:lang="pl")."""
    root = etree.Element('TEI')
    root.attrib[etree.QName(XML_NAMESPACE, 'lang')] = u'pl'
    # The TEI namespace is written as a plain ``xmlns`` attribute; the
    # elements created below are therefore built without a namespace prefix.
    root.attrib['xmlns'] = u'http://www.tei-c.org/ns/1.0'
    return root


def write_header(root, extensions_file=False):
    """Append the ``teiHeader`` (title, publisher, date, licence, source).

    *extensions_file* is accepted but not used by the current implementation.
    """
    tei_header = etree.SubElement(root, 'teiHeader')
    file_desc = etree.SubElement(tei_header, 'fileDesc')

    title_stmt = etree.SubElement(file_desc, 'titleStmt')
    title = etree.SubElement(title_stmt, 'title')
    title.text = u'ValUnifier (?)'

    publication_stmt = etree.SubElement(file_desc, 'publicationStmt')
    publisher = etree.SubElement(publication_stmt, 'publisher')
    publisher.text = u'Institute of Computer Science, Polish Academy of Sciences (IPI PAN)'

    # Publication date is the day the export is generated.
    date = etree.SubElement(publication_stmt, 'date')
    date.attrib['when'] = datetime.datetime.now().strftime('%Y-%m-%d')

    write_license_elem(publication_stmt)

    source_desc = etree.SubElement(file_desc, 'sourceDesc')
    p = etree.SubElement(source_desc, 'p')
    p.text = u'?'
def _add_feature(parent, name):
    """Append an ``<f name="...">`` element to *parent* and return it."""
    feature = etree.SubElement(parent, 'f')
    feature.attrib['name'] = name
    return feature


def _add_set(parent):
    """Append a ``<vColl org="set">`` element to *parent* and return it."""
    collection = etree.SubElement(parent, 'vColl')
    collection.attrib['org'] = 'set'
    return collection


def _add_valued(parent, tag, value):
    """Append ``<tag value="...">`` to *parent* and return it."""
    element = etree.SubElement(parent, tag)
    element.attrib['value'] = value
    return element


def _add_string(parent, text):
    """Append a ``<string>`` element holding *text* to *parent*."""
    element = etree.SubElement(parent, 'string')
    element.text = text
    return element


def write_license_elem(parent):
    """Append the ``availability``/``licence`` element with its paragraphs."""
    availability = etree.SubElement(parent, 'availability')
    licence = etree.SubElement(availability, 'licence')
    licence.attrib['target'] = u'http://creativecommons.org/licenses/by-sa/4.0/'

    paragraphs = (
        u'(C) Copyright 2012–2018 by the Institute of Computer Science, Polish Academy of Sciences (IPI PAN)',
        u'This work is distributed under a CC BY-SA license: http://creativecommons.org/licenses/by-sa/4.0/',
        '?',
    )
    for paragraph in paragraphs:
        etree.SubElement(licence, 'p').text = paragraph


def write_content(root, unified_frames):
    """Append ``text/body`` with the schemata div followed by the frames div."""
    text = etree.SubElement(root, 'text')
    body = etree.SubElement(text, 'body')
    # The schemata div comes first in the document, but it can only be filled
    # after the frames were written, because writing the frames is what
    # collects the (subentry, schema) pairs they refer to.
    schemata = etree.SubElement(body, 'div')
    frames = etree.SubElement(body, 'div')
    used_schemata = set()
    write_unified_frames(frames, unified_frames, used_schemata)
    write_used_schemata(schemata, used_schemata)


#=================== DIV -- SEMANTIC FRAMES ===================#

def write_unified_frames(parent, unified_frames, used_schemata):
    """Write the div head and one entry per unified frame.

    *used_schemata* is filled with the (subentry, schema) pairs referenced
    by the written frames.
    """
    frames_head = etree.SubElement(parent, 'head')
    frames_head.text = 'Semantic Frames'

    for unified_frame in unified_frames:
        write_unified_frame_entry(parent, unified_frame, used_schemata)


def write_unified_frame_entry(parent, unified_frame, used_schemata):
    """Write one ``entry``: definition, status, frame and its realizations."""
    entry = etree.SubElement(parent, 'entry')
    entry.attrib[etree.QName(XML_NAMESPACE, 'id')] = u'unif_%d-ent' % unified_frame.id

    write_unified_frame_definition(entry, unified_frame)
    write_status_info(entry, unified_frame)
    write_unified_frame(entry, unified_frame)
    write_unified_frames_realizations(entry, unified_frame, used_schemata)


def write_unified_frame_definition(entry, unified_frame):
    """Write the frame title as the entry's ``def`` element."""
    definition = etree.SubElement(entry, 'def')
    definition.text = unified_frame.title


def write_status_info(parent, unified_frame):
    """Write the ``general_info`` feature structure holding the status."""
    general_fs = etree.SubElement(parent, 'fs')
    general_fs.attrib['type'] = 'general_info'
    _add_string(_add_feature(general_fs, 'status'), unified_frame.status)


#=================== FS TYPE = "FRAME" ===================#

def write_unified_frame(parent, unified_frame):
    """Write the ``frame`` feature structure with the frame's arguments."""
    frame_fs = etree.SubElement(parent, 'fs')
    frame_fs.attrib[etree.QName(XML_NAMESPACE, 'id')] = u'unif_%d-frm' % unified_frame.id
    frame_fs.attrib['type'] = 'frame'

    write_frame_arguments(frame_fs, unified_frame)


def write_frame_arguments(parent, unified_frame):
    """Write the ``arguments`` set, in ``sorted_arguments()`` order."""
    arguments = _add_set(_add_feature(parent, 'arguments'))
    for arg in unified_frame.sorted_arguments():
        write_frame_argument(arguments, unified_frame, arg)


def write_frame_argument(parent, frame, arg):
    """Write one ``argument`` feature structure (roles + sel. preferences)."""
    arg_base_id = u'unif_%d' % frame.id

    argument_fs = etree.SubElement(parent, 'fs')
    argument_fs.attrib[etree.QName(XML_NAMESPACE, 'id')] = arg_base_id + u'.%d-arg' % arg.id
    argument_fs.attrib['type'] = 'argument'

    write_roles(argument_fs, arg)
    write_selectional_preferences(argument_fs, arg, arg_base_id)


def write_roles(parent, arg):
    """Write the role type, the role and the role attributes of *arg*.

    When the argument has no accepted role, the proposed roles are written
    instead, joined with ``/``; in that case no attributes are written.
    """
    if arg.role_type is not None:
        _add_valued(_add_feature(parent, 'role_type'), 'symbol', arg.role_type.type)

    role_symbol = etree.SubElement(_add_feature(parent, 'role'), 'symbol')
    role = arg.role
    if role is None:
        role_symbol.attrib['value'] = '/'.join(
            proposed.role.role for proposed in arg.proposed_roles.all())
        # Attributes live on the role object; without a role there is
        # nothing to read them from.
        return
    role_symbol.attrib['value'] = role.role.role

    # Emit the collection when EITHER attribute is present.  Requiring both
    # silently dropped roles that carry only one of them, even though each
    # one is checked individually below.
    if role.attribute is None and role.sub_attribute is None:
        return
    # NOTE(review): 'arributes' looks like a typo for 'attributes'; it is
    # left untouched because consumers of the export may match on it.
    attributes = _add_set(_add_feature(parent, 'arributes'))
    if role.attribute is not None:
        write_role_attribute(attributes, role.attribute.attribute)
    if role.sub_attribute is not None:
        write_role_attribute(attributes, role.sub_attribute.sub_attribute)


def write_role_attribute(parent, symbol):
    """Append a ``symbol`` element whose value is *symbol*."""
    _add_valued(parent, 'symbol', symbol)


def write_selectional_preferences(parent, arg, arg_base_id):
    """Write the ``sel_prefs`` feature if *arg* has any preference at all."""
    # exists() only asks whether there is a row; the rows themselves are
    # fetched by the writers below.
    if not (arg.predefined.exists() or
            arg.synsets.exists() or
            arg.relations.exists()):
        return

    sel_prefs_groups_fs = etree.SubElement(_add_feature(parent, 'sel_prefs'), 'fs')
    sel_prefs_groups_fs.attrib['type'] = 'sel_prefs_groups'

    write_synsets_selprefs(sel_prefs_groups_fs, arg)
    write_predefined_selprefs(sel_prefs_groups_fs, arg)
    write_relation_selprefs(sel_prefs_groups_fs, arg, arg_base_id)


def write_synsets_selprefs(parent, arg):
    """Write the ``synsets`` set; nothing is written when there are none."""
    synsets = arg.synsets.all()
    if not synsets:
        return
    collection = _add_set(_add_feature(parent, 'synsets'))
    for synset in synsets:
        write_synset(collection, synset)


def write_synset(parent, synset):
    """Append a ``numeric`` element carrying the synset id."""
    _add_valued(parent, 'numeric', str(synset.id))


def write_predefined_selprefs(parent, arg):
    """Write the ``predefs`` set; nothing is written when there are none."""
    predefs = arg.predefined.all()
    if not predefs:
        return
    collection = _add_set(_add_feature(parent, 'predefs'))
    for predef in predefs:
        write_predef(collection, predef)


def write_predef(parent, predef):
    """Append a ``symbol`` element carrying the predefined preference name."""
    _add_valued(parent, 'symbol', predef.name)


def write_relation_selprefs(parent, arg, arg_base_id):
    """Write the ``relations`` set; nothing is written when there are none."""
    relations = arg.relations.all()
    if not relations:
        return
    collection = _add_set(_add_feature(parent, 'relations'))
    for relation in relations:
        write_relation(collection, relation, arg_base_id)


def write_relation(parent, relation, arg_base_id):
    """Write one ``relation``: its type and a link to the target argument."""
    relation_fs = etree.SubElement(parent, 'fs')
    relation_fs.attrib['type'] = 'relation'

    _add_valued(_add_feature(relation_fs, 'type'), 'symbol', relation.relation.key)

    # The target is referenced by the xml:id that write_frame_argument()
    # builds from the same base id.
    arg_link = etree.SubElement(_add_feature(relation_fs, 'to'), 'fs')
    arg_link.attrib['sameAs'] = '#%s.%d-arg' % (arg_base_id, relation.to.id)
    arg_link.attrib['type'] = 'argument'


#=================== FS TYPE = "FRAME_REALIZATIONS" ===================#

def write_unified_frames_realizations(entry, unified_frame, used_schemata):
    """Write the ``frame_realizations`` structure of a unified frame."""
    realizations_fs = etree.SubElement(entry, 'fs')
    realizations_fs.attrib['type'] = 'frame_realizations'

    realizations = _add_set(_add_feature(realizations_fs, 'realizations'))
    write_lexical_units_realizations(realizations, unified_frame, used_schemata)


def write_lexical_units_realizations(parent, unified_frame, used_schemata):
    """Write realizations of every lexical unit mapped to *unified_frame*.

    Lexical units come from the Slowal frames of the non-removed mappings and
    are written sorted by ``(base, sense)``.
    """
    frames_mappings = UnifiedFrame2SlowalFrameMapping.objects.filter(
        unified_frame=unified_frame, removed=False)

    lexical_units_sortable = {}
    lexical_unit_mapping = {}
    for mapping in frames_mappings:
        for lexical_unit in mapping.slowal_frame.lexical_units.all():
            lexical_unit_mapping[lexical_unit] = mapping
            lexical_units_sortable[(lexical_unit.base, lexical_unit.sense)] = lexical_unit

    for signature in sorted(lexical_units_sortable):
        lexical_unit = lexical_units_sortable[signature]
        write_lexical_unit_realization(parent, unified_frame, lexical_unit,
                                       lexical_unit_mapping[lexical_unit], used_schemata)


def write_lexical_unit_realization(parent, unified_frame, lexical_unit, mapping, used_schemata):
    """Write one lexical unit together with its syntactic realizations."""
    lexical_unit_realizations_fs = etree.SubElement(parent, "fs")
    lexical_unit_realizations_fs.attrib['type'] = "lexical_unit_realizations"

    write_lexical_unit(_add_feature(lexical_unit_realizations_fs, 'lexical_unit'),
                       lexical_unit, mapping)

    realizations = _add_set(_add_feature(lexical_unit_realizations_fs, 'syntactic_realizations'))
    write_alternations(realizations, unified_frame, lexical_unit, mapping, used_schemata)


def write_lexical_unit(parent, lexical_unit, mapping):
    """Write the ``lexical_unit`` structure.

    Always written: name and variant.  Written only when available: the
    lexical unit's ``luid``, its synset id (when positive) and the opinion
    of the mapped Slowal frame.
    """
    meaning_fs = etree.SubElement(parent, 'fs')
    meaning_fs.attrib['type'] = 'lexical_unit'

    _add_string(_add_feature(meaning_fs, 'name'), lexical_unit.base)
    _add_string(_add_feature(meaning_fs, 'variant'), lexical_unit.sense)

    if lexical_unit.luid is not None:
        _add_valued(_add_feature(meaning_fs, 'plwnluid'), 'numeric', str(lexical_unit.luid))

    synset = lexical_unit.synset
    if synset is not None and synset.id > 0:
        _add_valued(_add_feature(meaning_fs, 'plwnsid'), 'numeric', str(synset.id))

    opinion = mapping.slowal_frame.opinion
    if opinion is not None:
        _add_valued(_add_feature(meaning_fs, 'opinion'), 'symbol', opinion.key)


def write_alternations(parent, unified_frame, lexical_unit, mapping, used_schemata):
    """Write one alternation structure per (subentry, schema, alternation).

    Each alternation lists connections between a unified-frame argument and
    the phrases realizing it.  *lexical_unit* is not used here.
    """
    alternations = prepare_alternations(mapping, used_schemata)
    for key in sorted(alternations):
        alternation_fs = etree.SubElement(parent, 'fs')
        # NOTE(review): 'aternation' looks like a typo for 'alternation'; it
        # is left untouched because consumers of the export may match on it.
        alternation_fs.attrib['type'] = 'aternation'
        connections = _add_set(_add_feature(alternation_fs, 'connections'))
        for argument, schema_hooks in alternations[key].items():
            connection_fs = etree.SubElement(connections, 'fs')
            connection_fs.attrib['type'] = 'connection'

            argument_fs = etree.SubElement(_add_feature(connection_fs, 'argument'), 'fs')
            argument_fs.attrib['type'] = 'argument'
            argument_fs.attrib['sameAs'] = u'#unif_%d.%d-arg' % (unified_frame.id, argument.id)

            # TEI requires a name on every <f>; the phrases feature used to
            # be written without one.
            write_phrases_coll(_add_feature(connection_fs, 'phrases'), schema_hooks)


def prepare_alternations(mapping, used_schemata):
    """Group the schema hooks of *mapping* by alternation and argument.

    Returns a nested mapping
    ``{(subentry.id, schema.id, alternation): {unified_argument: [hooks]}}``
    and adds every encountered ``(subentry, schema)`` pair to *used_schemata*.
    """
    argument_mappings = UnifiedFrameArgumentSlowalFrameMapping.objects.filter(
        unified_frame_mapping=mapping)
    alternations = defaultdict(lambda: defaultdict(list))
    for argument_mapping in argument_mappings:
        # 'agrument' is how the attributes are spelled in this code base.
        uargument = argument_mapping.unified_agrument
        sargument = argument_mapping.slowal_agrument

        argument_realization = ArgumentConnection.objects.get(argument=sargument)
        for schema_hook in argument_realization.schema_connections.all():
            subentry = schema_hook.subentry
            schema = schema_hook.schema
            used_schemata.add((subentry, schema))
            key = (subentry.id, schema.id, schema_hook.alternation)
            alternations[key][uargument].append(schema_hook)

    return alternations


def write_phrases_coll(parent, phrases_list):
    """Write a set of ``phrase`` links pointing at the schema phrases."""
    collection = _add_set(parent)
    for phrase in phrases_list:
        phrase_fs = etree.SubElement(collection, 'fs')
        phrase_fs.attrib['type'] = 'phrase'
        # Same id scheme as the xml:id assigned in write_phrase().
        phrase_fs.attrib['sameAs'] = u'#unif_%d.%d.%d.%d-phr' % (
            phrase.subentry.id, phrase.schema.id, phrase.position.id, phrase.phrase_type.id)


#=================== DIV -- SYNTACTIC SCHEMATA ===================#

def write_used_schemata(parent, used_schemata):
    """Write the div head and one entry per collected (subentry, schema)."""
    schemata_head = etree.SubElement(parent, 'head')
    schemata_head.text = 'Syntactic Schemata'

    # A set has no defined order; sort by ids so that repeated exports of the
    # same data produce the same document.
    ordered = sorted(used_schemata, key=lambda pair: (pair[0].id, pair[1].id))
    for subentry, schema in ordered:
        write_schema_entry(parent, subentry, schema)


def write_schema_entry(parent, subentry, schema):
    """Write one schema ``entry`` with its ``schema`` feature structure."""
    entry = etree.SubElement(parent, 'entry')
    entry.attrib[etree.QName(XML_NAMESPACE, 'id')] = u'unif_%d.%d-schent' % (subentry.id, schema.id)

    write_schema_definition(entry, subentry, schema)
    write_schema(entry, subentry, schema)


def write_schema_definition(parent, subentry, schema):
    """Placeholder: no definition is written for schemata yet."""
    pass


def write_schema(parent, subentry, schema):
    """Write the ``schema`` structure.

    Features written: opinion (``'unk'`` when missing), inherent_sie, aspect,
    negativity, predicativity and the positions.  The aspect and negativity
    features are written empty when the subentry has no (non-empty) value.
    """
    schema_fs = etree.SubElement(parent, 'fs')
    schema_fs.attrib[etree.QName(XML_NAMESPACE, 'id')] = u'unif_%d.%d-sch' % (subentry.id, schema.id)
    schema_fs.attrib['type'] = 'schema'

    # TODO: the textual representation of the schema ('text_rep' feature) is
    # not exported -- not present in the database.

    # schema opinion; a missing opinion object is treated like a missing key
    opinion = schema.opinion
    schema_opinion = opinion.key if opinion is not None else None
    if schema_opinion is None:
        schema_opinion = 'unk'
    _add_valued(_add_feature(schema_fs, 'opinion'), 'symbol', schema_opinion)

    # inherent "się"
    _add_valued(_add_feature(schema_fs, 'inherent_sie'), 'binary', subentry.inherent_sie.name)

    # aspect
    aspect_f = _add_feature(schema_fs, 'aspect')
    if subentry.aspect is not None and subentry.aspect.name != '':
        _add_valued(aspect_f, 'symbol', subentry.aspect.name)

    # negativity
    negativity_f = _add_feature(schema_fs, 'negativity')
    if subentry.negativity is not None and subentry.negativity.name != '':
        _add_valued(negativity_f, 'symbol', subentry.negativity.name)

    # predicativity
    _add_valued(_add_feature(schema_fs, 'predicativity'), 'binary', subentry.predicativity.name)

    # positions
    write_positions(schema_fs, subentry, schema)


def write_positions(parent, subentry, schema):
    """Write the ``positions`` set of *schema*."""
    collection = _add_set(_add_feature(parent, 'positions'))
    for position in schema.positions.all():
        write_position(collection, subentry, schema, position)


def write_position(parent, subentry, schema, position):
    """Write one ``position``: function, control and phrases."""
    position_fs = etree.SubElement(parent, 'fs')
    position_fs.attrib['type'] = 'position'
    position_fs.attrib[etree.QName(XML_NAMESPACE, 'id')] = u'unif_%d.%d.%d-psn' % (
        subentry.id, schema.id, position.id)

    write_function(position_fs, position)
    write_control(position_fs, position)
    write_phrases(position_fs, subentry, schema, position)


def write_function(parent, position):
    """Write the ``function`` feature when the position has a function."""
    function = position.function
    if function is not None:
        _add_valued(_add_feature(parent, 'function'), 'symbol', function.name)


def write_control(parent, position):
    """Write the ``control`` set holding control and/or pred_control names.

    Nothing is written when the position has neither of them.
    """
    control = position.control
    pred_control = position.pred_control
    if control is None and pred_control is None:
        return

    collection = _add_set(_add_feature(parent, 'control'))
    if control is not None:
        _add_valued(collection, 'symbol', control.name)
    if pred_control is not None:
        _add_valued(collection, 'symbol', pred_control.name)


def write_phrases(parent, subentry, schema, position):
    """Write the ``phrases`` set of *position*."""
    collection = _add_set(_add_feature(parent, 'phrases'))
    for phrase in position.phrase_types.all():
        write_phrase(collection, subentry, schema, position, phrase)


def write_phrase(parent, subentry, schema, position, phrase):
    """Write one phrase; its ``type`` is the name of the phrase's main type."""
    phrase_fs = etree.SubElement(parent, 'fs')
    phrase_fs.attrib[etree.QName(XML_NAMESPACE, 'id')] = u'unif_%d.%d.%d.%d-phr' % (
        subentry.id, schema.id, position.id, phrase.id)
    phrase_fs.attrib['type'] = phrase.main_type.name
    # TODO: links to phrase expansions are not written -- currently there is
    # no expansions file.
    write_phrase_textrep(phrase_fs, phrase)


def write_phrase_textrep(parent, phrase):
    """Write the ``textual_representation`` feature of *phrase*."""
    _add_string(_add_feature(parent, 'textual_representation'), phrase.text_rep)