"""Module for parsing CCL files."""

from typing import List, Tuple

from lxml import etree

from src.annotations import Annotation, MorphosyntacticAnnotation, NerAnnotation
from src.input_parsers.interface import InputParser


class CCLInputParser(InputParser):
    """Parser for CCL files.

    Example CCL file:
    <?xml version="1.0" encoding="UTF-8"?>
    <!DOCTYPE chunkList SYSTEM "ccl.dtd">
    <chunkList>
    <chunk type="p" id="ch1">
    <sentence id="s1">
    <tok>
        <orth>Tom</orth>
        <lex disamb="1"><base>Tom</base><ctag>subst:sg:nom:m1</ctag></lex>
        <ann chan="person_first_nam" head="1">1</ann>
    </tok>
    <tok>
        <orth>is</orth>
        <lex disamb="1"><base>be</base><ctag>fin:sg:ter:pres</ctag></lex>
        <ann chan="person_first_nam">0</ann>
    </tok>
    <tok>
        <orth>nice</orth>
        <lex disamb="1"><base>nice</base><ctag>adj:sg:nom:f:pos</ctag></lex>
        <ann chan="person_first_nam">0</ann>
    </tok>
    <tok>
        <orth>!</orth>
        <lex disamb="1"><base>!</base><ctag>interp</ctag></lex>
        <ann chan="person_first_nam">0</ann>
    </tok>
    </sentence>
    </chunk>
    </chunkList>
    """

    def parse(self, content: str) -> Tuple[str, List[Tuple[int, int, Annotation]]]:
        """Parse CCL string into plain text and character-span annotations.

        The text is rebuilt by concatenating the ``orth`` of every ``tok`` in
        document order, separated by a single space unless an ``ns`` element
        directly precedes the token. Every annotation is a
        ``(start, end, annotation)`` tuple whose offsets index into that text
        (``end`` is exclusive).

        For each token the first ``lex`` with ``disamb="1"`` yields one
        ``MorphosyntacticAnnotation`` built from its ``ctag``. For each ``ann``
        with a positive number a ``NerAnnotation`` named after its ``chan`` is
        produced: a token flagged ``head="1"`` starts a new annotation, any
        other token extends the annotation with the same channel and number
        that is open in the same sentence (or starts one if none is open).

        Args:
            content (str): Content of ccl file.

        Returns:
            Tuple[str, List[Tuple[int, int, Annotation]]]: Text and a flat list
                of annotations; all NER annotations come first, followed by all
                morphosyntactic annotations.

        Raises:
            lxml.etree.XMLSyntaxError: If ``content`` is not well-formed XML.

        """
        # lxml refuses ``str`` input that carries an XML encoding declaration,
        # so the document is handed over as UTF-8 bytes.
        ccl_tree = etree.fromstring(content.strip().encode("utf-8"))

        # The text is collected in pieces and joined once at the end; the
        # running length is what token offsets are computed from.
        text_parts: List[str] = []
        text_length = 0

        ner_annotations: List[Tuple[int, int, Annotation]] = []
        morphosyntactic_annotations: List[Tuple[int, int, Annotation]] = []

        # Maps (channel, annotation number) -> index in ``ner_annotations`` of
        # the annotation currently open for that pair. Annotation numbers are
        # only meaningful inside one sentence, so the map is reset whenever the
        # parent element of the token changes.
        open_annotations = {}
        current_sentence = None

        # First token is assumed to not have space before it
        last_was_ns = True

        # The XPath union yields only <ns> and <tok> elements, in document order.
        for token in ccl_tree.xpath("//ns | //tok"):
            if token.tag == "ns":
                last_was_ns = True
                continue

            # Tokens are expected to sit directly under their <sentence>, as in
            # the class docstring example.
            sentence = token.getparent()
            if sentence is not current_sentence:
                current_sentence = sentence
                open_annotations.clear()

            if not last_was_ns:
                text_parts.append(" ")
                text_length += 1

            # An empty <orth/> has ``text`` of None; treat it as an empty word.
            word = token.xpath("./orth")[0].text or ""
            start = text_length
            end = start + len(word)

            # Only the first disambiguated interpretation is used.
            for lex in token.xpath("./lex"):
                if lex.get("disamb") == "1":
                    ctag = lex.xpath("./ctag")[0]
                    morphosyntactic_annotations.append(
                        (start, end, MorphosyntacticAnnotation(ctag.text))
                    )
                    break

            for ann in token.xpath("./ann"):
                # 0 means the token does not belong to the channel; positive
                # values number the annotations of that channel in a sentence.
                number = int(ann.text)
                if number <= 0:
                    continue

                channel = ann.attrib["chan"]
                key = (channel, number)

                # A head token always opens a new annotation; other tokens
                # continue the matching open one when there is any.
                if ann.get("head") == "1":
                    index = None
                else:
                    index = open_annotations.get(key)

                if index is None:
                    open_annotations[key] = len(ner_annotations)
                    ner_annotations.append((start, end, NerAnnotation(channel)))
                else:
                    old_start, _, annotation = ner_annotations[index]
                    ner_annotations[index] = (old_start, end, annotation)

            text_parts.append(word)
            text_length = end
            last_was_ns = False

        return "".join(text_parts), ner_annotations + morphosyntactic_annotations