# wiktor_ner.py
"""Module for parsing WiktorNER files."""

import json
from typing import List, Tuple

from src.annotations import Annotation, MorphosyntacticAnnotation, NerAnnotation
from src.input_parsers.interface import InputParser

class WiktorNERInputParser(InputParser):
    """Parser for WiktorNER files.

    The parser reads three optional top-level keys of the JSON document:
    ``"text"``, ``"tokens"`` and ``"entities"``. For tokens it looks at the
    ``"position"`` pair and at the ``"lexemes"`` list, keeping only lexemes
    whose ``"disamb"`` flag is ``true`` and that carry an ``"mstag"``. For
    entities it looks at the ``"positions"`` pair and the ``"type"``.

    Example of an input this parser reads:
    {
        "filename": "greeting-5b1401",
        "text": "Hello Tom!",
        "tokens": [
            {
                "index": 1,
                "position": [0,5],
                "orth": "Hello",
                "lexemes": [
                    {
                        "lemma": "hello",
                        "mstag": "interj",
                        "disamb": true
                    }
                ]
            },
            {
                "index": 2,
                "position": [6,9],
                "orth": "Tom",
                "lexemes": [
                    {
                        "lemma": "Tom",
                        "mstag": "noun",
                        "disamb": true
                    }
                ]
            },
            {
                "index": 3,
                "position": [9,10],
                "orth": "!",
                "lexemes": [
                    {
                        "lemma": "!",
                        "mstag": "interp",
                        "disamb": true
                    }
                ]
            }
        ],
        "entities": [
            {
                "text": "Tom",
                "type": "nam_prs_human",
                "tokens": [2],
                "positions": [6,9]
            }
        ]
    }
    """

    def parse(self, content: str) -> Tuple[str, List[Tuple[int, int, Annotation]]]:
        """Parse the content of a WiktorNER file into text and annotations.

        Morphosyntactic annotations (one per disambiguated lexeme that has an
        ``"mstag"``) come first in the returned list, followed by the NER
        annotations (one per entity that has both ``"positions"`` and
        ``"type"``). Tokens, lexemes and entities lacking the required keys
        are skipped silently.

        Args:
            content (str): JSON document in the WiktorNER format.

        Returns:
            Tuple[str, List[Tuple[int, int, Annotation]]]: The value of the
            ``"text"`` key (an empty string when the key is absent) and a list
            of ``(start, end, annotation)`` tuples. ``start`` and ``end`` are
            copied verbatim from the input; presumably they are character
            offsets into the text (TODO confirm against the format spec).

        Raises:
            json.JSONDecodeError: If ``content`` is not valid JSON.
            ValueError: If a ``"position"`` / ``"positions"`` value does not
                unpack into exactly two items.

        """
        content_parsed = json.loads(content)

        text = content_parsed["text"] if "text" in content_parsed else ""
        annotations: List[Tuple[int, int, Annotation]] = []

        # Morphosyntactic annotations
        tokens = content_parsed["tokens"] if "tokens" in content_parsed else []
        for token in tokens:
            if "position" not in token:
                continue
            token_start, token_end = token["position"]

            if "lexemes" not in token:
                continue
            for lexeme in token["lexemes"]:
                # Only the interpretation explicitly marked as disambiguated
                # is used; the identity check rejects truthy non-bool values.
                if "disamb" not in lexeme or lexeme["disamb"] is not True:
                    continue
                if "mstag" not in lexeme:
                    continue
                # The lemma is optional and passed as None when missing.
                lemma = lexeme.get("lemma", None)
                annotations.append(
                    (
                        token_start,
                        token_end,
                        MorphosyntacticAnnotation(lexeme["mstag"], lemma),
                    )
                )

        # NER annotations
        entities = content_parsed["entities"] if "entities" in content_parsed else []
        for entity in entities:
            if "positions" not in entity:
                continue
            entity_start, entity_end = entity["positions"]

            if "type" not in entity:
                continue
            annotations.append(
                (entity_start, entity_end, NerAnnotation(entity["type"]))
            )

        return text, annotations