Skip to content
Snippets Groups Projects
wiktor_ner.py 3.08 KiB
Newer Older
Michał Pogoda's avatar
Michał Pogoda committed
"""Module for parsing WiktorNER files."""

import json
from typing import List, Tuple
Michał Pogoda's avatar
Michał Pogoda committed
from src.annotations import Annotation, MorphosyntacticAnnotation, NerAnnotation
from src.input_parsers.interface import InputParser

class WiktorNERInputParser(InputParser):
    """Parser for WiktorNER files.

    Example WiktorNER file:
    {
        "filename": "greeting-5b1401",
        "text": "Hello Tom!",
        "tokens": [
            {
                "index": 1,
                "position": [0,5],
                "orth": "Hello",
                "lex": [
                    {
                        "lemma": "hello",
                        "mstag": "interj"
                    }
                ]
            },
            {
                "index": 2,
                "position": [6,9],
                "orth": "Tom",
                "lex": [
                    {
                        "lemma": "Tom",
                        "mstag": "noun"
                    }
                ]
            },
            {
                "index": 3,
                "position": [9,10],
                "orth": "!",
                "lex": [
                    {
                        "lemma": "!",
                        "mstag": "interp"
                    }
                ]
            }
        ],
        "entities": [
            {
                "text": "Tom",
                "type": "nam_prs_human",
                "tokens": [2],
                "positions": [6,9]
            }
        ]
    }
    """

    def parse(self, content: str) -> Tuple[str, List[Tuple[int, int, Annotation]]]:
        """Parse a WiktorNER document into its text and annotations.

        One ``MorphosyntacticAnnotation`` is emitted for every lexeme that is
        marked as disambiguated (``disamb is True``) and has a non-empty
        ``pos``, located at the span of the token that owns the lexeme.

        Args:
            content (str): The WiktorNER document to parse.
                NOTE(review): the parameter is annotated as ``str``, but the
                body reads ``content.text`` and calls ``content.tokens()``,
                so an already-loaded document object appears to be expected.
                Confirm against the callers and fix the annotation.

        Returns:
            Tuple[str, List[Tuple[int, int, Annotation]]]: The document text
            (an empty string when the document has none) and a list of
            ``(start, stop, annotation)`` tuples.
        """
        text = content.text or ""
        annotations = []

        # Morphosyntactic annotations
        tokens = content.tokens() if content.tokens else []
        for token in tokens:
            # Compare against None rather than relying on truthiness: 0 is a
            # valid offset, and a truthiness test would drop every token that
            # starts at the very beginning of the text.
            if token.start is None or token.stop is None:
                continue
            if not token.lexemes:
                continue

            for lexeme in token.lexemes:
                # Only the interpretation chosen by disambiguation is kept.
                if lexeme.disamb is not True:
                    continue
                if not lexeme.pos:
                    continue

                # An empty or missing lemma is normalised to None.
                lemma = lexeme.lemma or None
                annotations.append(
                    (
                        token.start,
                        token.stop,
                        MorphosyntacticAnnotation(lexeme.pos, lemma),
                    )
                )

        return text, annotations