Skip to content
Snippets Groups Projects
wiktor_ner.py 3.24 KiB
Newer Older
Michał Pogoda's avatar
Michał Pogoda committed
"""Module for parsing WiktorNER files."""

from typing import List, Tuple
Bartlomiej's avatar
Bartlomiej committed
from src.annotations import Annotation, MorphosyntacticAnnotation, NerAnnotation
from src.input_parsers.interface import InputParser

class WiktorNERInputParser(InputParser):
    """Parser for WiktorNER files.

    Example WiktorNER file:
    {
        "filename": "greeting-5b1401",
        "text": "Hello Tom!",
        "tokens": [
            {
                "index": 1,
                "position": [0,5],
                "orth": "Hello",
                "lex": [
                    {
                        "lemma": "hello",
                        "mstag": "interj"
                    }
                ]
            },
            {
                "index": 2,
                "position": [6,9],
                "orth": "Tom",
                "lex": [
                    {
                        "lemma": "Tom",
                        "mstag": "noun"
                    }
                ]
            },
            {
                "index": 3,
                "position": [9,10],
                "orth": "!",
                "lex": [
                    {
                        "lemma": "!",
                        "mstag": "interp"
                    }
                ]
            }
        ],
        "entities": [
            {
                "text": "Tom",
                "type": "nam_prs_human",
                "tokens": [2],
                "positions": [6,9]
            }
        ]
    }
    """

    def parse(self, content: str) -> Tuple[str, List[Tuple[int, int, Annotation]]]:
        """Parse a WiktorNER document into text and annotations.

        Annotations are returned as a flat list of ``(start, stop, annotation)``
        tuples: first the morphosyntactic annotations (one per disambiguated
        lexeme that has a part of speech), then the NER annotations (one per
        ``'ner'`` span that has a type).

        Args:
            content (str): The document to parse. NOTE(review): despite the
                ``str`` annotation, this method reads ``content.text``,
                ``content.tokens()``, ``content.get_span_types()`` and
                ``content.spans('ner')``, so it expects an already-loaded
                document object rather than a raw string -- confirm against
                the caller and the ``InputParser`` interface.

        Returns:
            Tuple[str, List[Tuple[int, int, Annotation]]]: Text and annotations.
        """
        # A missing/empty text is normalised to an empty string.
        text = content.text or ""
        annotations = []

        # Morphosyntactic annotations
        for token in content.tokens() or ():
            for lexeme in token.lexemes or ():
                # Only lexemes explicitly marked as the disambiguated reading
                # (disamb is exactly True) and carrying a part of speech count.
                if lexeme.disamb is not True or not lexeme.pos:
                    continue
                # An empty lemma is normalised to None.
                lemma = lexeme.lemma or None
                annotations.append(
                    (
                        token.start,
                        token.stop,
                        MorphosyntacticAnnotation(lexeme.pos, lemma),
                    )
                )

        # NER annotations
        if 'ner' in content.get_span_types():
            for entity in content.spans('ner'):
                # Spans without a type cannot be mapped to a NerAnnotation.
                if not entity.type:
                    continue
                annotations.append(
                    (entity.start, entity.stop, NerAnnotation(entity.type))
                )

        return text, annotations