# coding: utf8

# Copyright (C) 2017 Michał Kaliński
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

# FIXME Some assert statements should be converted to regular raises (asserts
# should not be used for anything other than checking for errors in the code
# itself).
from xml.etree import ElementTree
import re
import logging

from .nodes import make_synset_node, make_lexical_unit_node
from .. import exceptions as exc
from ..enums import PoS, Domain


# Only the reader generator belongs to the public interface of this module.
__all__ = ('ubylmf_reader',)

# Encoding handed to ElementTree.tostring when XML is dumped into the logs.
ENCODING = 'utf-8'

# Logger named after this module.
_logger = logging.getLogger(__name__)


def ubylmf_reader(ubylmf_file):
    """Read PLwordnet iteratively, element by element.

    The file is parsed incrementally. A lexical unit node is built for every
    <Sense> of each completed <LexicalEntry> and a synset node for each
    completed <Synset>. An element that cannot be converted is logged together
    with its XML dump and skipped, so a single broken element does not stop
    the iteration.

    :param ubylmf_file: the name of UBY-LMF file or an opened file itself.
    :type ubylmf_file: str or file

    :return: a generator over PLwordnet entities.
    :rtype: generator

    :raises ReaderException: if the parser produces no element at all.
    """
    # 'start' events are requested only because the very first one carries
    # the document root. With the default ('end' only) events the first event
    # is the first *closed* element - a leaf - so clearing it would release
    # nothing and that element would never be inspected.
    context = iter(
        ElementTree.iterparse(ubylmf_file, events=('start', 'end'))
    )
    try:
        _, root = next(context)
    except StopIteration:
        raise exc.ReaderException('The xml file is empty')
    # Generate wordnet's elements
    for event, elem in context:
        # An element is complete (children and attributes) only when it ends.
        if event != 'end':
            continue
        entities = []
        if elem.tag == "LexicalEntry":
            for xml_sense in elem.findall("Sense"):
                # The try is per sense on purpose: the remaining senses of
                # the entry are still generated when one of them is broken.
                try:
                    entities.append(_make_lexicalunit(elem, xml_sense))
                except Exception:
                    _logger.exception(
                        '\n%s\nIN ELEMENT\n%s',
                        ElementTree.tostring(xml_sense, ENCODING),
                        ElementTree.tostring(elem, ENCODING)
                    )
        elif elem.tag == "Synset":
            try:
                entities.append(_make_synset(elem))
            except Exception:
                _logger.exception('\n%s', ElementTree.tostring(elem, ENCODING))
        else:
            continue
        # The entities hold only values extracted from the XML (and failures
        # are already logged), so the parsed subtree can be dropped to keep
        # memory usage flat while reading big files.
        elem.clear()
        root.clear()
        for entity in entities:
            yield entity


def _make_lexicalunit(xml_lexicalentry, xml_sense):
    """Return a lexical unit built from an xml element.

    The lemma and the part of speech are read from the <LexicalEntry>; all
    other properties come from the <Sense>. A usage example is kept only if
    it ends with a ``[##source]`` marker: the marker is cut off the example
    text and its name is stored as the source of the example. Non-empty
    examples without the marker are logged and dropped. A <SenseRelation>
    whose target is not a valid identifier is logged and skipped.

    :param xml_lexicalentry: the <LexicalEntry> element that contains the
        sense.
    :type xml_lexicalentry: xml.etree.ElementTree.Element
    :param xml_sense: <Sense> element that belongs to the LexicalEntry.
    :type xml_sense: xml.etree.ElementTree.Element

    :return: a named tuple LexicalUnitNode
    :rtype: LexicalUnitNode

    :raises AssertionError: if the lemma or the part of speech is empty, or
        if ``_extract_definitions`` / ``_get_domain`` reject the sense.
    :raises MalformedIdentifierException: if the id, the synset id or the
        external reference of the sense doesn't end with digits.
    """
    # Get id, synset and variant
    lu_id = _extract_id(xml_sense.get("id"))
    lu_synset = _extract_id(xml_sense.get("synset"))
    lu_variant = int(xml_sense.get("index"))
    # Get lemma. The checks below validate the input file, so they are
    # explicit raises: an ``assert`` statement is removed when Python runs
    # with -O. AssertionError is kept as the exception type for
    # compatibility.
    xml_lemma = xml_lexicalentry.find("Lemma").find("FormRepresentation")
    lu_lemma = xml_lemma.get("writtenForm")
    if not lu_lemma:
        raise AssertionError("Lemma is empty")
    # Get PoS
    lu_pos = xml_lexicalentry.get("partOfSpeech")
    if not lu_pos:
        raise AssertionError("PoS is empty")
    # Get definition - can be empty! At most 2
    lu_definition, lu_usage_notes, lu_external_links = \
        _extract_definitions(xml_sense)
    # Get usage examples
    lu_examples = []
    lu_examples_sources = []
    for xml_example in xml_sense.findall("SenseExample"):
        example = xml_example.find(
            "TextRepresentation"
        ).get("writtenText").strip()
        if not example:
            continue
        # The source of an example is appended to its text as "[##name]"
        # (optionally "[##name:]").
        source_match = re.search(r'\[##([-\w]+):?\]$', example, re.U)
        if source_match is None:
            _logger.warning("Malformed sense example: %s", example)
            continue
        lu_examples.append(example[:source_match.start(0)])
        lu_examples_sources.append(source_match.group(1))
    # Get semantic labels
    lu_domain = _get_domain(xml_sense)
    # Get related
    lu_related = []
    for xml_relation in xml_sense.findall("SenseRelation"):
        try:
            lu_related.append(
                (
                    xml_relation.get("relName"),
                    _extract_id(xml_relation.get("target")),
                )
            )
        except exc.MalformedIdentifierException:
            _logger.exception(
                '\n%s\nIN ELEMENT\n%s\nThis relation is skipped.',
                ElementTree.tostring(xml_relation, ENCODING),
                ElementTree.tostring(xml_sense, ENCODING)
            )
    # Get unit index (_extract_id already returns an int)
    lu_unit_index = _extract_id(
        xml_sense.find("MonolingualExternalRef").get("externalReference")
    )
    return make_lexical_unit_node(
        id=lu_id,
        legacy_id=None,
        lemma=lu_lemma,
        pos=PoS(lu_pos),
        synset=lu_synset,
        variant=lu_variant,
        unit_index=lu_unit_index,
        definition=lu_definition,
        usage_notes=tuple(lu_usage_notes),
        external_links=tuple(lu_external_links),
        examples=tuple(lu_examples),
        examples_sources=tuple(lu_examples_sources),
        # The domain label is in format <pos>.<lang>_<name>; the last one is
        # the only one we care about.
        domain=Domain[lu_domain.rsplit('_', 1)[-1]],
        related=tuple(lu_related),
        # Other properties are not stored in UBY files.
    )


def _extract_definitions(xml_sense):
    """Extract a definition, notes and links of a LU from <Definition> tags.

    When the first child of the first <Definition> is a <TextRepresentation>,
    it holds the definition text and the statements (usage notes, external
    links) are taken from the second <Definition>, if there is one.
    Otherwise the children of the first <Definition> are the statements and
    there is no definition text.

    :param xml_sense: <Sense> element read from an xml file.
    :type xml_sense: xml.etree.ElementTree.Element

    :return: the definition (``None`` if absent), usage notes and external
        links of the LU.
    :rtype: (str or unicode or None, list, list)

    :raises AssertionError: if there is more than 2 <Definition> tags.
    """
    # Get definition - can be empty! At most 2
    xml_definitions = xml_sense.findall("Definition")
    # Explicit raise instead of ``assert``: this validates the input file and
    # must not disappear under -O. The exception type is kept for
    # compatibility.
    if len(xml_definitions) > 2:
        raise AssertionError(
            "Too many definitions ({:})".format(len(xml_definitions))
        )
    lu_definition = None
    lu_usage_notes = []
    lu_external_links = []
    if not xml_definitions:
        return lu_definition, lu_usage_notes, lu_external_links
    statements = list(xml_definitions[0])
    # An empty <Definition/> has no children, so check before indexing.
    if statements and statements[0].tag == "TextRepresentation":
        lu_definition = statements[0].get("writtenText")
        # <Statement> elements live in the second <Definition>, if present
        if len(xml_definitions) == 2:
            statements = list(xml_definitions[1])
        else:
            statements = []
    # Get additional info
    for statement in statements:
        statement_type = statement.get("statementType", "")
        if statement_type == "usageNote":
            lu_usage_notes.append(
                statement.find("TextRepresentation").get("writtenText")
            )
        elif statement_type == "externalReference":
            lu_external_links.append(
                statement.find("TextRepresentation").get("writtenText")
            )
    return lu_definition, lu_usage_notes, lu_external_links


def _get_domain(xml_sense):
    """Extract a domain of a LU from <SemanticLabel> tags.

    :param xml_sense: <Sense> element read from an xml file.
    :type xml_sense: xml.etree.ElementTree.Element

    :return: the domain of the LU (the value of the ``label`` attribute).
    :rtype: str or unicode

    :raises AssertionError: if there is more than 1 <SemanticLabel> tags,
        no tag at all or its type is missing or different from domain.
    """
    xml_semantic_labels = xml_sense.findall("SemanticLabel")
    # Explicit raises instead of ``assert``: these checks validate the input
    # file and must not disappear under -O. The exception type is kept for
    # compatibility.
    if len(xml_semantic_labels) != 1:
        raise AssertionError(
            "{:} SemanticLabel found, should be 1".format(
                len(xml_semantic_labels)
            )
        )
    xml_label = xml_semantic_labels[0]
    # Default to '' so that a missing attribute is reported as a wrong type
    # rather than failing while the error message is being built.
    label_type = xml_label.get("type", "")
    if label_type != "domain":
        # repr() keeps the message safe for non-ASCII values on every Python
        # version.
        raise AssertionError(
            "SemanticLabel has type {!r} instead of domain".format(label_type)
        )
    return xml_label.get("label")


def _make_synset(xml_synset):
    """Build a synset node out of a <Synset> xml element.

    The definition is optional: without a <Definition> child the node gets
    ``None``. A <SynsetRelation> whose target is not a valid identifier is
    logged and left out of the relations.

    :param xml_synset: an xml element of Synset read from a file.
    :type xml_synset: xml.etree.ElementTree.Element

    :return: a named tuple SynsetNode
    :rtype: SynsetNode
    """
    synset_id = _extract_id(xml_synset.get("id"))

    definition = None
    xml_definition = xml_synset.find("Definition")
    if xml_definition is not None:
        xml_text = xml_definition.find("TextRepresentation")
        definition = xml_text.get("writtenText")

    relations = []
    for xml_relation in xml_synset.findall("SynsetRelation"):
        try:
            target_id = _extract_id(xml_relation.get("target"))
        except exc.MalformedIdentifierException:
            _logger.exception(
                '\n%s\nIN ELEMENT\n%s\nThis relation is skipped.',
                ElementTree.tostring(xml_relation, ENCODING),
                ElementTree.tostring(xml_synset, ENCODING)
            )
        else:
            relations.append((xml_relation.get("relName"), target_id))

    # There are no artificial synsets in UBY dumps, so that property is not
    # passed.
    return make_synset_node(
        id=synset_id,
        definition=definition,
        related=tuple(relations),
    )


def _extract_id(full_id):
    """Extract only numeric identifier from the end of a full id.

    :param full_id: a full identifier that has a prefix before the real id.
        ``None`` (what ``Element.get`` returns for a missing attribute) is
        treated as a malformed identifier.
    :type full_id: str|unicode|None

    :return: a real, numeric id.
    :rtype: int

    :raises MalformedIdentifierException: if the original id is missing or
        doesn't end with digits.
    """
    # Callers skip a single broken relation by catching
    # MalformedIdentifierException; a missing attribute must therefore be
    # reported the same way instead of surfacing as a TypeError from ``re``.
    match = None if full_id is None else re.search(r"\d+$", full_id)
    if match is None:
        raise exc.MalformedIdentifierException(full_id)
    return int(match.group(0))


# Module-level alias of the reader generator defined in this file.
# NOTE(review): presumably the conventional name under which the package
# looks up the reader of each reader module - confirm against the code that
# loads readers.
_this_reader_ = ubylmf_reader
