# coding: utf8

# Copyright (C) 2017 Michał Kaliński
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

from __future__ import absolute_import, division


import collections as coll
import itertools as itt
import logging
import operator as op

import six
import plwn_comments as plwnc
import plwn_comments.exceptions as plwnce
import plwn_comments.utils.usage_tags as plwncu

from ..bases import RelationInfoBase
from ..utils.sorting import text_key
from . import nodes as nd


# Public API of this module. The trailing comma makes this a one-element
# tuple, not a plain string.
__all__ = 'WNSchemaProcessor',


# Logger shared by everything in this module.
_LOG = logging.getLogger(__name__)

# Common beginning of the messages logged when a relation instance is
# dropped; the specific reason is appended to it.
# Placeholders: relation ID, parent ID, child ID.
_BASIC_RELINST_ERROR_TMPL = (
    'Relation %s between units / synset %s -> %s dropped: '
)

# Lightweight records kept in the accumulators of the processor.

# Synset record.
_SynData = coll.namedtuple('_SynData', ['definition', 'isart'])

# Lexical unit record.
_LexData = coll.namedtuple(
    '_LexData',
    ['lemma', 'pos', 'variant', 'domain', 'comment', 'verb_aspect'],
)

# Assignment of a lexical unit to a synset.
_UnSData = coll.namedtuple('_UnSData', ['synset_id', 'unit_index'])

# Single relation instance, stored under the ID of its parent.
_RelInstData = coll.namedtuple('_RelInstData', ['child', 'relation'])

# Relation type record.
_RelTypeData = coll.namedtuple(
    '_RelTypeData',
    ['kind', 'name', 'short', 'parent'],
)

# Emotion annotation record of a lexical unit.
_EmoData = coll.namedtuple(
    '_EmoData',
    [
        'mark',
        'names',
        'valuations',
        'example1',
        'example2',
        'status',
        'super',
    ],
)

# Base of the ``_CmtData`` class; holds the fields extracted from a comment.
_CmtDataT = coll.namedtuple(
    '_CmtData',
    ['examples', 'examples_sources', 'definition', 'usage', 'links'],
)


class WNSchemaProcessor(object):
    """Accumulate and clean up records of a plWordNet-schema source.

    Externalizes operations common to reading from any source that follows
    the "standard" plWordNet schema.

    In practice, objects of this class are intended for composition: they are
    fed data from a schema-abiding source through the ``take_*`` methods,
    perform some consistency cleanups, then provide well-formatted nodes that
    can be passed to a storage (see :meth:`finalize`).

    The checks performed by this processor are as such:

    * Synsets that don't have units.
    * Units not assigned to a synset.
    * Units assigned to not-existing synsets.
    * Relations to or from non-existent units / synsets.
    * Relation types that don't have instances or are parents.
    * Relation instances that don't have types (illegal in the schema).
    """

    def __init__(self):
        """Create a processor with empty accumulators."""
        # These dicts should be indexed by IDs of the respective data records
        self._syn_acc = {}
        self._lex_acc = {}
        self._lex_to_syn_acc = {}
        self._reltype_acc = {}
        # Relation instances are indexed like:
        # parent id => list of _RelInstData
        self._synrel_acc = coll.defaultdict(list)
        self._lexrel_acc = coll.defaultdict(list)
        # Emotion records are indexed like: lexical id => list of _EmoData
        self._emo_acc = coll.defaultdict(list)
        # These are aux collections of IDs that will be used for filtering
        self._reltypes_being_parents = set()
        self._relinstance_count = coll.Counter()
        self._syn_to_units_check = coll.defaultdict(list)

        # The following are filled during finalization:
        # Nodes need full relation names, this will provide translation from
        # IDs.
        self._relid2relname = None
        # Some aliases may repeat in the plWN database, but it's not allowed
        # here.
        self._bad_rel_aliases = None
        # All the units / synsets that were rejected for any reason - used
        # when filtering relations.
        self._bad_units = None
        self._bad_synsets = None

    def take_relation_type(self, id_, kind, name, short_name, parent_id):
        """Register a relation type record.

        A record whose ID has already been registered is rejected (the
        rejection is logged by ``_insert_if_uniq``).

        :param id_: ID of the relation type.
        :param kind: kind of the relation type; at finalization the kind of
            the parent is used instead if the type has a parent.
        :param name: name of the relation type.
        :param short_name: shortcut (alias) of the relation type, or ``None``.
        :param parent_id: ID of the parent relation type, or ``None``.
        """
        data = _RelTypeData(kind, name, short_name, parent_id)
        # Only an accepted record may mark another type as being a parent.
        if _insert_if_uniq(self._reltype_acc, id_, data) and \
                parent_id is not None:
            self._reltypes_being_parents.add(parent_id)

    def take_synset(self, id_, definition, is_artificial):
        """Register a synset record.

        A record whose ID has already been registered is rejected.

        :param id_: ID of the synset.
        :param definition: definition of the synset.
        :param is_artificial: whether the synset is an artificial one.
        """
        _insert_if_uniq(
            self._syn_acc,
            id_,
            _SynData(definition, is_artificial),
        )

    def take_lexical_unit(self,
                          id_,
                          lemma,
                          pos,
                          variant,
                          domain,
                          comment,
                          verb_aspect):
        """Register a lexical unit record.

        A record whose ID has already been registered is rejected.

        :param id_: ID of the lexical unit.
        :param lemma: lemma of the unit.
        :param pos: part of speech of the unit.
        :param variant: variant of the unit.
        :param domain: domain of the unit.
        :param comment: raw comment string of the unit, or ``None``; it is
            parsed at finalization.
        :param verb_aspect: verb aspect of the unit.
        """
        _insert_if_uniq(
            self._lex_acc,
            id_,
            _LexData(
                lemma,
                pos,
                variant,
                domain,
                comment,
                verb_aspect,
            ),
        )

    def take_unit_to_synset(self, unit_id, synset_id, unit_index):
        """Register the assignment of a lexical unit to a synset.

        Only the first assignment of a given unit is accepted.

        :param unit_id: ID of the lexical unit.
        :param synset_id: ID of the synset the unit belongs to.
        :param unit_index: index of the unit within the synset.
        """
        data = _UnSData(synset_id, unit_index)
        if _insert_if_uniq(self._lex_to_syn_acc, unit_id, data):
            self._syn_to_units_check[synset_id].append(unit_id)

    def take_synset_relation(self, parent_id, child_id, relation_id):
        """Register an instance of a relation between two synsets.

        :param parent_id: ID of the synset the relation starts at.
        :param child_id: ID of the synset the relation points to.
        :param relation_id: ID of the relation type.
        """
        self.__take_relation(
            self._synrel_acc,
            parent_id,
            child_id,
            relation_id,
        )

    def take_lexical_relation(self, parent_id, child_id, relation_id):
        """Register an instance of a relation between two lexical units.

        :param parent_id: ID of the unit the relation starts at.
        :param child_id: ID of the unit the relation points to.
        :param relation_id: ID of the relation type.
        """
        self.__take_relation(
            self._lexrel_acc,
            parent_id,
            child_id,
            relation_id,
        )

    def take_emotion(self,
                     lexical_id,
                     markedness,
                     names,
                     valuations,
                     example1,
                     example2,
                     unit_status,
                     super_annotation):
        """Register an emotion annotation of a lexical unit.

        Any number of annotations may be registered for one unit; they are
        merged into a single one at finalization.

        :param lexical_id: ID of the annotated lexical unit.
        :param markedness: emotion markedness.
        :param names: iterable of emotion names.
        :param valuations: iterable of emotion valuations.
        :param example1: first example.
        :param example2: second example.
        :param unit_status: emotional status of the unit.
        :param super_annotation: super-annotation marker; annotations are
            processed in descending order of this value.
        """
        self._emo_acc[lexical_id].append(_EmoData(
            markedness,
            names,
            valuations,
            example1,
            example2,
            unit_status,
            super_annotation,
        ))

    def finalize(self):
        """Perform all checks and yield all created nodes.

        To be used after putting in data using the ``take_*`` methods.

        This is a generator: the checks are only run once the iteration
        starts. Nodes are yielded in this order: relation types, lexical
        units, synsets.
        """
        # Reset filtered sets, then fill them
        self._bad_units = set()
        self._filter_bad_units()
        self._bad_synsets = set()
        self._filter_bad_synsets()
        self._bad_rel_aliases = set()
        self._filter_bad_rel_aliases()
        self._filter_bad_rel_instances()

        # Relation types have to go first: generating them fills the
        # ID => name mapping which units and synsets need for their relations.
        for node in itt.chain(self._fin_reltypes(),
                              self._fin_units(),
                              self._fin_syns()):
            yield node

    def _fin_reltypes(self):
        """Yield a node for every usable relation type.

        As a side effect, ``_relid2relname`` is rebuilt with the full names
        of the yielded types.

        Omitted are: types that are parents of other types, types without
        any (valid) instance and types whose parent does not exist.
        """
        self._relid2relname = {}

        for rel_id, rel_data in six.iteritems(self._reltype_acc):
            if rel_id in self._reltypes_being_parents:
                continue

            if self._relinstance_count[rel_id] <= 0:
                _LOG.warning(
                    'Relation %s = %r omitted: no instances',
                    rel_id,
                    rel_data,
                )
                continue

            # Inherit the kind data from the parent reltype, if the parent is
            # not None.
            if rel_data.parent is not None:
                try:
                    par_data = self._reltype_acc[rel_data.parent]
                except KeyError:
                    _LOG.error(
                        'Relation %s has non-existent parent %s',
                        rel_id,
                        rel_data.parent,
                    )
                    continue
                rel_parname = par_data.name
                rel_kind = par_data.kind
            else:
                rel_parname = None
                rel_kind = rel_data.kind

            self._relid2relname[rel_id] = RelationInfoBase.format_name(
                rel_parname,
                rel_data.name,
            )

            # The shortcut becomes an alias only if it is set and unambiguous.
            short = rel_data.short
            if short is not None and short not in self._bad_rel_aliases:
                aliases = (short,)
            else:
                aliases = ()

            yield nd.RelationTypeNode(
                kind=rel_kind,
                name=rel_data.name,
                parent=rel_parname,
                aliases=aliases,
            )

    def _fin_units(self):
        """Yield a node for every lexical unit that passed the checks."""
        for lu_id, lu_data in six.iteritems(self._lex_acc):
            if lu_id in self._bad_units:
                continue

            try:
                uns = self._lex_to_syn_acc[lu_id]
            except KeyError:
                # This shouldn't happen, but possibly can, so just skip the
                # unit. Checked first, so no work is wasted on a unit that
                # is not going to be yielded.
                continue

            final_emo = self._coalesce_emo(lu_id)
            cmt_data = (
                _CmtData.make_empty()
                if lu_data.comment is None
                else _CmtData.extract_from_string(lu_data.comment)
            )
            final_related = self._make_related_for_unit(lu_id)

            yield nd.LexicalUnitNode(
                id=lu_id,
                lemma=lu_data.lemma,
                pos=lu_data.pos,
                variant=lu_data.variant,
                synset=uns.synset_id,
                unit_index=uns.unit_index,
                definition=cmt_data.definition,
                usage_notes=cmt_data.usage,
                external_links=cmt_data.links,
                examples=cmt_data.examples,
                examples_sources=cmt_data.examples_sources,
                domain=lu_data.domain,
                related=final_related,
                verb_aspect=lu_data.verb_aspect,
                is_emotional=final_emo.status,
                emotion_markedness=final_emo.mark,
                emotion_names=final_emo.names,
                emotion_valuations=final_emo.valuations,
                emotion_example_1=final_emo.example1,
                emotion_example_2=final_emo.example2,
            )

    def _fin_syns(self):
        """Yield a node for every synset that passed the checks."""
        for syn_id, syn_data in six.iteritems(self._syn_acc):
            if syn_id in self._bad_synsets:
                continue

            final_related = self._make_related_for_synset(syn_id)

            yield nd.SynsetNode(
                id=syn_id,
                definition=syn_data.definition,
                related=final_related,
                is_artificial=syn_data.isart,
            )

    def _filter_bad_units(self):
        """Mark units without a synset, or with a non-existent one, as bad."""
        for lex_id in self._lex_acc:
            if lex_id not in self._lex_to_syn_acc:
                _LOG.error('Unit %s belongs to no synset', lex_id)
                self._bad_units.add(lex_id)
                continue

            syn_of_lex = self._lex_to_syn_acc[lex_id].synset_id
            if syn_of_lex not in self._syn_acc:
                _LOG.error(
                    'Unit %s belongs to non-existent synset %s',
                    lex_id,
                    syn_of_lex,
                )
                self._bad_units.add(lex_id)

    def _filter_bad_synsets(self):
        """Mark synsets that have no existing unit as bad."""
        for syn_id in self._syn_acc:
            # Do those synsets have units and those units are real?
            syn_units = self._syn_to_units_check.get(syn_id, ())
            any_unit_valid = False

            # This check doesn't necessarily remove the synset, but
            # notification will be given. At least one valid unit for synset
            # must remain.
            for unit_id in syn_units:
                if unit_id in self._lex_acc:
                    any_unit_valid = True
                else:
                    _LOG.error(
                        'Unit %s of synset %s is non-existent',
                        unit_id,
                        syn_id,
                    )

            if not any_unit_valid:
                _LOG.error('Synset %s has no (valid) units', syn_id)
                self._bad_synsets.add(syn_id)

    def _filter_bad_rel_aliases(self):
        """Collect relation shortcuts that are used by more than one type."""
        # If an alias repeats multiple times, remember it to remove both
        # instances later (so don't decide which is the "right" one).
        seen_aliases = set()
        for rel_data in six.itervalues(self._reltype_acc):
            alias = rel_data.short
            if alias is None:
                # No shortcut at all: there is nothing that could clash, so
                # several types without a shortcut are not an error.
                continue

            if alias in seen_aliases:
                _LOG.error(
                    'Relation shortcut %r is not unique; dropping both',
                    alias,
                )
                self._bad_rel_aliases.add(alias)
            else:
                seen_aliases.add(alias)

    def _filter_bad_rel_instances(self):
        """Drop relation instances that refer to invalid records."""
        # Assuming that all bad synsets and units have been filtered, drop all
        # instances of relations that refer to them.
        # It removes instances in-place from related dicts, and decreases
        # counts of instances for relation types.
        self.__recount_rels(self._synrel_acc, self._syn_acc, self._bad_synsets)
        self.__recount_rels(self._lexrel_acc, self._lex_acc, self._bad_units)

    def _make_related_for_unit(self, lex_id):
        """Return ``(relation name, child ID)`` pairs of a lexical unit."""
        return self.__make_related(self._lexrel_acc, lex_id)

    def _make_related_for_synset(self, syn_id):
        """Return ``(relation name, child ID)`` pairs of a synset."""
        return self.__make_related(self._synrel_acc, syn_id)

    def _coalesce_emo(self, lex_id):
        """Merge all emotion annotations of a unit into a single record.

        :param lex_id: ID of the lexical unit.

        :return: an ``_EmoData`` with ``super`` set to ``None``; if the unit
            has no annotations, all the fields are ``None`` or empty tuples.
        """
        # The algorithm is like this:
        # Iterate all annotations in descending order of their
        # super-annotation marker, fill what is possible.
        # Do not overwrite status, markedness or examples (the first value
        # that is not None wins), but sum names and values.
        # When returning the final emo value, don't remember its
        # super annotation - it doesn't matter at this point; set to None.
        # NOTE(review): an earlier version of this comment said that empty
        # data is returned when the super-annotation is marked as
        # not-an-emotion; no such special case is implemented here.
        # TODO Ensure that this algorithm makes sense, there seem to be more
        # sensible ways of handling things. Move on for now.
        final_status = None
        final_mark = None
        final_ex1 = None
        final_ex2 = None
        names_acc = []
        values_acc = []

        for emo_data in sorted(self._emo_acc.get(lex_id, ()),
                               key=op.attrgetter('super'),
                               reverse=True):
            if final_status is None:
                final_status = emo_data.status
            if final_mark is None:
                final_mark = emo_data.mark
            if final_ex1 is None:
                final_ex1 = emo_data.example1
            if final_ex2 is None:
                final_ex2 = emo_data.example2

            names_acc.extend(emo_data.names)
            values_acc.extend(emo_data.valuations)

        return _EmoData(
            mark=final_mark,
            names=_emo_uniq_sorted_tuple(names_acc),
            valuations=_emo_uniq_sorted_tuple(values_acc),
            example1=final_ex1,
            example2=final_ex2,
            status=final_status,
            super=None,
        )

    def __take_relation(self, relinst_acc, parent_id, child_id, relation_id):
        """Store a relation instance and count it for its relation type."""
        relinst_acc[parent_id].append(_RelInstData(child_id, relation_id))
        self._relinstance_count[relation_id] += 1

    def __recount_rels(self, relinst_acc, item_acc, bad_acc):
        """Remove invalid relation instances from an accumulator, in place.

        :param relinst_acc: mapping of parent IDs to lists of relation
            instances; the lists are replaced by filtered ones.
        :param item_acc: mapping of all the known records of the kind the
            relations connect (synsets or units).
        :param bad_acc: set of IDs of the records rejected by the checks.
        """
        # Only values of existing keys are replaced, so the dict can be
        # safely iterated at the same time.
        for parent_id, children in six.iteritems(relinst_acc):
            # Do not filter parents; this will be done at yielding, outside
            fil_children = []
            for relinst in children:
                if relinst.relation not in self._reltype_acc:
                    _LOG.error(
                        _BASIC_RELINST_ERROR_TMPL +
                        'non-existent relation',
                        relinst.relation,
                        parent_id,
                        relinst.child,
                    )
                elif relinst.child not in item_acc or relinst.child in bad_acc:
                    _LOG.error(
                        _BASIC_RELINST_ERROR_TMPL +
                        'the child is non-existent or invalid',
                        relinst.relation,
                        parent_id,
                        relinst.child,
                    )
                    # The relation type may end up with no instances at all,
                    # in which case it's going to be omitted.
                    self._relinstance_count[relinst.relation] -= 1
                else:
                    fil_children.append(relinst)

            relinst_acc[parent_id] = fil_children

    def __make_related(self, relinst_acc, parent_id):
        """Return a tuple of ``(relation name, child ID)`` pairs of a parent.

        Instances of relation types that were not given a name while
        generating relation type nodes (types being parents of other types,
        or having a non-existent parent) are logged and left out, as there is
        no relation type node they could refer to.
        """
        related = []
        for relinst in relinst_acc.get(parent_id, ()):
            try:
                rel_name = self._relid2relname[relinst.relation]
            except KeyError:
                _LOG.error(
                    _BASIC_RELINST_ERROR_TMPL +
                    'the relation type was omitted',
                    relinst.relation,
                    parent_id,
                    relinst.child,
                )
                continue
            related.append((rel_name, relinst.child))

        return tuple(related)


class _CmtData(_CmtDataT):
    """Fields extracted from the comment string of a lexical unit."""

    __slots__ = ()

    # Names of the tags which are not examples; contents of every other tag
    # end up together in the examples.
    _NON_EXAMPLE_TAG_NAMES = frozenset([u'K', u'L', u'D'])

    # :class:`plwn_comments.TagBank` structure that defines all kinds of
    # comment tags which are needed by PLWN API.
    _WN_TAGS = plwnc.TagBank()
    # Usage notes
    _WN_TAGS.define(u'K')
    # External links
    _WN_TAGS.define(u'L', u'{')
    # Definition
    _WN_TAGS.define(u'D')
    # Finally, the example tags
    _WN_TAGS.define_from(
        plwncu.iter_usage_tags(),
        plwncu.DEFAULT_USAGE_TAG_SURROUND,
    )

    @classmethod
    def extract_from_string(cls, cmt_str):
        """Parse a comment string and build an instance from its tags.

        A string that ``plwn_comments`` fails to parse is treated like a
        comment without any tags.

        :param cmt_str: the raw comment string.

        :return: instance where ``examples`` holds the contents of all the
            example tags, ``examples_sources`` the tag name of each of those
            examples, ``definition`` the first content of the ``D`` tag (or
            ``None``), ``usage`` the contents of the ``K`` tag and ``links``
            the contents of the ``L`` tag.
        """
        try:
            parsed = plwnc.Comment.parse(cmt_str, cls._WN_TAGS)
        except plwnce.PLWNCommentsException:
            # For now just fall back to an empty comment, which leaves all
            # the fields unset.
            parsed = plwnc.Comment(cls._WN_TAGS)

        # Two parallel lists: the example texts and the tags they came from.
        example_texts = []
        example_tags = []

        for tag, contents in parsed.items():
            if tag in cls._NON_EXAMPLE_TAG_NAMES:
                continue
            example_texts.extend(contents)
            example_tags.extend([tag] * len(contents))

        all_examples = tuple(example_texts)
        all_sources = tuple(example_tags)
        first_definition = parsed.get_first(u'D', None)
        usage_notes = tuple(parsed[u'K'])
        ext_links = tuple(parsed[u'L'])

        return cls(
            examples=all_examples,
            examples_sources=all_sources,
            definition=first_definition,
            usage=usage_notes,
            links=ext_links,
        )

    @classmethod
    def make_empty(cls):
        """Return an instance with no definition and all sequences empty."""
        nothing = ()
        return cls(
            examples=nothing,
            examples_sources=nothing,
            definition=None,
            usage=nothing,
            links=nothing,
        )


def _insert_if_uniq(data_acc, id_val, data_obj):
    """Insert a record into an accumulator unless its ID is already taken.

    Whether the ID is taken is decided by presence of the key, not by the
    identity of the stored object: inserting the very same object (or a
    shared value like ``None``) a second time is a rejected duplicate too.

    :param data_acc: dict of records, indexed by their IDs.
    :param id_val: ID of the record to insert.
    :param data_obj: the record to insert.

    :return: ``True`` if the record was inserted; ``False`` if the ID was
        already taken, in which case the accumulator is left intact and an
        error is logged.
    """
    if id_val in data_acc:
        _LOG.error(
            'Cannot add record %r with ID %s: already associated with '
            'record %r',
            data_obj,
            id_val,
            data_acc[id_val],
        )
        return False

    data_acc[id_val] = data_obj
    return True


def _emo_enums_sortkey(item):
    """Return the sorting key of an item: ``text_key`` of its ``value``."""
    item_value = item.value
    return text_key(item_value)


def _emo_uniq_sorted_tuple(emo_acc):
    """Return the unique items of ``emo_acc`` as a sorted tuple.

    Items are ordered by ``_emo_enums_sortkey``.
    """
    unique_items = frozenset(emo_acc)
    # Sort the names and valuations for predictable behaviour
    ordered_items = sorted(unique_items, key=_emo_enums_sortkey)
    return tuple(ordered_items)
