from typing import Dict, List, Tuple
from lxml import etree
from collections import defaultdict
# from src.annotation_types_old import 
from src.input_parsers.interface import InputParser
from src.annotations import Annotation, MorphosyntacticAnnotation, NerAnnotation

class CCLInputParser(InputParser):
    """Input parser that turns a CCL (XML) file into plain text and annotations."""

    def __init__(self) -> None:
        super().__init__()

    def parse(self, path_to_input: str) -> Tuple[str, List[Tuple[int, int, Annotation]]]:
        """Parse a CCL file into text and annotations.

        The text is rebuilt from the ``<orth>`` child of every ``<tok>``
        element, in document order. Consecutive tokens are separated by a
        single space unless an ``<ns>`` element appears between them; no space
        is emitted before the first token.

        Annotation offsets are character offsets into the rebuilt text.

        Args:
            path_to_input (str): Path to file containing CCL.

        Returns:
            Tuple[str, List[Tuple[int, int, Annotation]]]: The rebuilt text and
            a flat list of ``(start, end, annotation)`` tuples. All NER
            annotations come first, followed by all morphosyntactic ones.

        Raises:
            IndexError: If a ``<tok>`` has no ``<orth>`` child, or a
                disambiguated ``<lex>`` has no ``<ctag>`` child.
            KeyError: If a present ``<ann>`` has no ``chan`` attribute.
        """
        # Read raw bytes so lxml decodes the document according to its own XML
        # declaration instead of going through the platform default encoding.
        with open(path_to_input, 'rb') as f:
            ccl = f.read()

        ccl_tree = etree.fromstring(ccl.strip())

        # Text is collected in pieces and joined once at the end; ``offset``
        # always equals the length of the text collected so far.
        text_parts: List[str] = []
        offset = 0

        ner_annotations: List[Tuple[int, int, Annotation]] = []
        morphosyntactic_annotations: List[Tuple[int, int, Annotation]] = []

        # Index into ``ner_annotations`` of the most recent annotation started
        # on each channel, so a continuation token extends the annotation of
        # its own channel rather than whichever annotation was appended last.
        latest_by_channel: Dict[str, int] = {}

        # First token is assumed to not have space before it
        last_was_ns = True

        for token in ccl_tree.xpath("//ns | //tok"):
            if token.tag == 'ns':
                last_was_ns = True
                continue

            # The XPath only yields <ns> and <tok>, so this is a <tok>.
            if not last_was_ns:
                text_parts.append(" ")
                offset += 1
            last_was_ns = False

            # An empty <orth/> has ``text`` None; treat it as an empty word.
            word = token.xpath('./orth')[0].text or ""
            start = offset
            end = start + len(word)
            text_parts.append(word)
            offset = end

            # Only the first disambiguated interpretation is recorded.
            for lex in token.xpath('./lex'):
                if lex.get('disamb') == "1":
                    ctag = lex.xpath('./ctag')[0]
                    morphosyntactic_annotations.append(
                        (start, end, MorphosyntacticAnnotation(ctag.text))
                    )
                    break

            for ann in token.xpath('./ann'):
                # NOTE(review): only the value 1 is treated as "present"; if
                # the input numbers several annotations per channel (2, 3, ...)
                # those are skipped here - confirm against the data.
                if int(ann.text or 0) != 1:
                    continue

                channel = ann.attrib['chan']
                is_head = ann.get('head') == "1"
                latest = latest_by_channel.get(channel)

                if is_head or latest is None:
                    # A head token starts a new annotation. A continuation
                    # with nothing to continue also starts one instead of
                    # failing on an empty list.
                    latest_by_channel[channel] = len(ner_annotations)
                    ner_annotations.append((start, end, NerAnnotation(channel)))
                else:
                    # Stretch this channel's latest annotation to cover the
                    # current token.
                    old_start, _, annotation = ner_annotations[latest]
                    ner_annotations[latest] = (old_start, end, annotation)

        return "".join(text_parts), ner_annotations + morphosyntactic_annotations