From acb98ee6b413017e3fbdd10e0698875b9867bc84 Mon Sep 17 00:00:00 2001 From: Maja Jablonska <majajjablonska@gmail.com> Date: Mon, 27 Nov 2023 22:23:23 +1100 Subject: [PATCH] Add a corrected misc column --- combo/data/api.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/combo/data/api.py b/combo/data/api.py index b36b616..ab00da5 100644 --- a/combo/data/api.py +++ b/combo/data/api.py @@ -86,7 +86,7 @@ def serialize_field(field: Any) -> str: return "{}".format(field) def serialize_token_list(tokenlist: conllu.models.TokenList) -> str: - KEYS_ORDER = ['idx', 'text', 'lemma', 'upostag', 'xpostag', 'entity_type', 'feats', 'head', 'deprel', 'deps', 'misc'] + KEYS_ORDER = ['idx', 'text', 'lemma', 'upostag', 'xpostag', 'feats', 'head', 'deprel', 'deps'] lines = [] if tokenlist.metadata: @@ -99,6 +99,18 @@ def serialize_token_list(tokenlist: conllu.models.TokenList) -> str: for token_data in tokenlist: line = '\t'.join(serialize_field(token_data[k]) for k in KEYS_ORDER) + serialized_misc = serialize_field(token_data['misc']) + serialized_entity_type = serialize_field(token_data['entity_type']) + if serialized_misc == '_' and serialized_entity_type == '_': + serialized_last_column = '_' + elif serialized_misc == '_': + serialized_last_column = serialized_entity_type + elif serialized_entity_type == '_': + serialized_last_column = serialized_misc + else: + serialized_last_column = serialized_entity_type + ' | ' + serialized_misc + + line += '\t' + serialized_last_column lines.append(line) return '\n'.join(lines) + "\n\n" -- GitLab