diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000000000000000000000000000000000000..f696a4d46332c4de5cab5879241d87a50e46a18c
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1 @@
+include README-pl-beta.txt
diff --git a/PKG-INFO b/PKG-INFO
new file mode 100644
index 0000000000000000000000000000000000000000..0598421f994dc94d19923aae90f5c83fc1bd390a
--- /dev/null
+++ b/PKG-INFO
@@ -0,0 +1,10 @@
+Metadata-Version: 1.0
+Name: PLWN_API
+Version: 0.9
+Summary: Python API to access plWordNet lexicon
+Home-page: UNKNOWN
+Author: Michał Kaliński
+Author-email: michal.kalinski@pwr.edu.pl
+License: UNKNOWN
+Description: UNKNOWN
+Platform: UNKNOWN
diff --git a/PLWN_API.egg-info/PKG-INFO b/PLWN_API.egg-info/PKG-INFO
new file mode 100644
index 0000000000000000000000000000000000000000..c0d5f643e3b319e5a9d96b0858b954b60441156f
--- /dev/null
+++ b/PLWN_API.egg-info/PKG-INFO
@@ -0,0 +1,10 @@
+Metadata-Version: 1.0
+Name: PLWN-API
+Version: 0.9
+Summary: Python API to access plWordNet lexicon
+Home-page: UNKNOWN
+Author: Michał Kaliński
+Author-email: michal.kalinski@pwr.edu.pl
+License: UNKNOWN
+Description: UNKNOWN
+Platform: UNKNOWN
diff --git a/PLWN_API.egg-info/SOURCES.txt b/PLWN_API.egg-info/SOURCES.txt
new file mode 100644
index 0000000000000000000000000000000000000000..9d68e4cea354d10ac0f67a12afc6328ad8417a01
--- /dev/null
+++ b/PLWN_API.egg-info/SOURCES.txt
@@ -0,0 +1,28 @@
+MANIFEST.in
+README-pl-beta.txt
+setup.py
+PLWN_API.egg-info/PKG-INFO
+PLWN_API.egg-info/SOURCES.txt
+PLWN_API.egg-info/dependency_links.txt
+PLWN_API.egg-info/requires.txt
+PLWN_API.egg-info/top_level.txt
+plwn/__init__.py
+plwn/_loading.py
+plwn/bases.py
+plwn/enums.py
+plwn/exceptions.py
+plwn/relation_aliases.tsv
+plwn/relresolver.py
+plwn/readers/__init__.py
+plwn/readers/comments.py
+plwn/readers/nodes.py
+plwn/readers/ubylmf.py
+plwn/readers/wndb.py
+plwn/readers/wnxml.py
+plwn/storages/__init__.py
+plwn/storages/objects.py
+plwn/storages/sqlite.py
+plwn/utils/__init__.py
+plwn/utils/graphmlout.py
+plwn/utils/sorting.py
+plwn/utils/tupwrap.py
\ No newline at end of file
diff --git a/PLWN_API.egg-info/dependency_links.txt b/PLWN_API.egg-info/dependency_links.txt
new file mode 100644
index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc
--- /dev/null
+++ b/PLWN_API.egg-info/dependency_links.txt
@@ -0,0 +1 @@
+
diff --git a/PLWN_API.egg-info/requires.txt b/PLWN_API.egg-info/requires.txt
new file mode 100644
index 0000000000000000000000000000000000000000..0cc144e55cb1e22cea3e852a4b1d07ca5988c7ea
--- /dev/null
+++ b/PLWN_API.egg-info/requires.txt
@@ -0,0 +1,2 @@
+six>=1.10
+enum34>=1.1.2
diff --git a/PLWN_API.egg-info/top_level.txt b/PLWN_API.egg-info/top_level.txt
new file mode 100644
index 0000000000000000000000000000000000000000..c72d30de0531ec401e7d72fd2f63a069d248db85
--- /dev/null
+++ b/PLWN_API.egg-info/top_level.txt
@@ -0,0 +1 @@
+plwn
diff --git a/README-pl-beta.txt b/README-pl-beta.txt
new file mode 100644
index 0000000000000000000000000000000000000000..382f3b25384dfd2149bde6e305992d2551ae295a
--- /dev/null
+++ b/README-pl-beta.txt
@@ -0,0 +1,72 @@
+******************
+**** PlWN API ****
+******************
+
+PlWN API makes it possible to:
+ - search for synsets and lexical units in plWordNet;
+ - access the properties of synsets and lexical units, and their relations;
+ - export the whole of plWordNet, or a part of it, as a graph.
+
+This README is a short, provisional guide to the beta version of PlWN API.
+Both the interface and the functionality may change.
+
+==============
+Initialization
+==============
+
+    >>> import plwn
+    >>> wn = plwn.load('plwn-3.0.db', 'sqlite3')
+
+==============
+Database dumps
+==============
+
+For now, the recommended way of accessing the plWordNet database is through
+dumps to SQLite files, "plwn-X.db" (where X is the plWordNet version). The
+dumps should have been distributed together with the API.
+
+=============
+Functionality
+=============
+
+The functionality is described in the docstrings of the plwn/bases.py
+module:
+
+    $ pydoc plwn.bases
+
+Additionally, plwn/_pos.py contains the list of constant part-of-speech
+values used by the API.
+
+Following the convention adopted by Python 3, most methods that return
+collections of objects return them as generators.
+
+    >>> wn.lexical_units(lemma=u'pies')
+    TupWrapper(<generator object <genexpr> at 0x7f1048583410>)
+
+If the goal of a query is to iterate over the results, nothing more is
+needed.
+
+    >>> for lu in wn.lexical_units(lemma=u'pies'):
+    ...     print(lu.id)
+    5563
+    52245
+    ...
+
+To obtain a list (or any other collection), cast the returned object.
+
+    >>> list(wn.lexical_units(lemma=u'pies'))
+    [<LexicalUnit id=5563 lemma=u'pies' pos=u'noun' variant=1>,
+     <LexicalUnit id=52245 lemma=u'pies' pos=u'noun' variant=2>,
+     ...
+    ]
+
+For convenience in Python's interactive mode, the generators are wrapped in
+"TupWrapper" objects, which cast the generator to a tuple when the object
+is "called".
+
+    >>> wn.lexical_units(lemma=u'pies')()
+    (<LexicalUnit id=5563 lemma=u'pies' pos=u'noun' variant=1>,
+     <LexicalUnit id=52245 lemma=u'pies' pos=u'noun' variant=2>,
+     ...
+    )
+
+When writing programs against the API, however, explicit casting of the
+returned generators is recommended. "Explicit is better than implicit."
diff --git a/plwn/.bases.py.swp b/plwn/.bases.py.swp
new file mode 100644
index 0000000000000000000000000000000000000000..db9c729a546145f142a12fbd6693aaea5bc691d3
Binary files /dev/null and b/plwn/.bases.py.swp differ
diff --git a/plwn/__init__.py b/plwn/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..69262d7dbff725b6934dbcbbedb0ff02daef9cad
--- /dev/null
+++ b/plwn/__init__.py
@@ -0,0 +1,6 @@
+from ._loading import *
+from .enums import PoS
+
+# Attach a do-nothing handler so the package logger is silent by default
+import logging as _logging
+_logging.getLogger('plwn').addHandler(_logging.NullHandler())
diff --git a/plwn/_loading.py b/plwn/_loading.py
new file mode 100644
index 0000000000000000000000000000000000000000..709680eaa8ed015e3c66c466c0ab4ce12701b2f8
--- /dev/null
+++ b/plwn/_loading.py
@@ -0,0 +1,146 @@
+"""Defines user-facing functions that allow simple construction of
+:class:`PLWordnetBase` instances, with selected storages and readers.
+"""
+
+from __future__ import absolute_import, division, print_function
+
+
+from collections import namedtuple
+from importlib import import_module
+import textwrap as tw
+
+import six
+
+
+__all__ = 'read', 'load', 'show_source_formats', 'show_storage_formats'
+
+
+_Info = namedtuple('_Info', ('desc', 'modname'))
+
+_READERS = {
+    'uby-lmf': _Info('Discontinued XML-based format', 'ubylmf'),
+    'database': _Info(
+        'MySQL database of plWordNet. Only works on python 2 and requires '
+        'certain additional libraries. This is meant for internal use only '
+        'and will not work for most users. '
+        'The file should contain one line '
+        'with SQLAlchemy URL to the database.',
+        'wndb',
+    ),
+    'xml': _Info('The official PLWN XML format', 'wnxml'),
+}
+_STORAGES = {
+    'sqlite3': _Info(
+        'File database format, with a compact schema (compared to internal '
+        'PLWN database).',
+        'sqlite',
+    ),
+    'objects': _Info(
+        'Stores data in plain python objects, dumping them in pickle format. '
+        'Quick to construct, but querying and memory efficiency is not '
+        'guaranteed.',
+        'objects',
+    ),
+}

+# Defaults for this version
+_READERS[None] = _READERS['xml']
+_STORAGES[None] = _STORAGES['sqlite3']
+
+
+def _imp_reader(modname):
+    # Pre-import the root package; py3 needs this for the relative import
+    import plwn.readers  # noqa
+    return import_module('.' + modname, 'plwn.readers')._this_reader_
+
+
+def _imp_storage(modname):
+    # Pre-import the root package; py3 needs this for the relative import
+    import plwn.storages  # noqa
+    return import_module('.' + modname, 'plwn.storages')._this_storage_
+
+
+def read(source_file,
+         source_format=None,
+         storage_file=None,
+         storage_format=None):
+    """Read plWordNet data from a file and return the right
+    :class:`PLWordnetBase` subclass instance for the selected parameters.
+
+    Where defaults are mentioned, those values may change with each minor
+    version of PLWN API. If you depend on a particular format in a
+    long-running program, state it explicitly.
+
+    :param str source_file: Path to the file from which the plWordNet data
+        will be read. The required contents of the file depend on the
+        selected ``source_format``.
+
+    :param str source_format: Name of the format of data contained in
+        ``source_file``. If ``None``, the default for the current version
+        will be chosen.
+
+    :param str storage_file: Path to the file where the internal
+        representation of the storage will be dumped. It will be possible
+        to load this file using :func:`load`. If ``None``, the
+        representation will not be dumped.
+
+    :param str storage_format: Name of the format in which PLWN API will
+        store data in memory. The access methods provided should be the
+        same, but their efficiency may differ. If ``None``, the default for
+        the current version will be chosen.
+
+    :rtype: PLWordnetBase
+    """
+
+    stor_cls = _imp_storage(_STORAGES[storage_format].modname)
+    rdr = _imp_reader(_READERS[source_format].modname)
+    return stor_cls.from_reader(rdr(source_file), storage_file)
+
+
+def load(storage_file, storage_format=None):
+    """Read plWordNet data from a cached file with internal PLWN API
+    representation.
+
+    This function is much faster than :func:`read` if such a file is
+    available.
+
+    :param str storage_file: Path to the file from which the cached data
+        will be read.
+
+    :param str storage_format: Name of the format the data is stored in. It
+        must match the actual format and schema version contained in the
+        file.
+
+    :rtype: PLWordnetBase
+    """
+
+    stor_cls = _imp_storage(_STORAGES[storage_format].modname)
+    return stor_cls.from_dump(storage_file)
+
+
+def show_source_formats():
+    """Print names and short descriptions of available source file formats
+    to ``stdout``.
+
+    This function is primarily meant to be informative in interactive shell
+    mode.
+    """
+
+    _show(_READERS)
+
+
+def show_storage_formats():
+    """Print names and short descriptions of available storage formats to
+    ``stdout``.
+
+    This function is primarily meant to be informative in interactive shell
+    mode.
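+
+    A rough sketch of a session (the exact text and ordering of entries
+    may differ; this is an illustration, not captured output):
+
+        >>> import plwn
+        >>> plwn.show_storage_formats()
+        sqlite3
+        -------
+        File database format, with a compact schema (compared to internal
+        PLWN database).
+        ...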
+    """
+
+    _show(_STORAGES)
+
+
+def _show(dict_):
+    for name, info in six.iteritems(dict_):
+        if name is None:
+            continue
+
+        print(name)
+        print('-' * len(name))
+        print(tw.fill(info.desc), end='\n\n')
diff --git a/plwn/bases.py b/plwn/bases.py
new file mode 100644
index 0000000000000000000000000000000000000000..dae5f90cd2bdb745f0d29a89492f8b7cfd66c7bc
--- /dev/null
+++ b/plwn/bases.py
@@ -0,0 +1,994 @@
+"""Base, abstract classes for plWordNet objects, implementing common
+functionality independent of structures holding wordnet data.
+"""
+
+from __future__ import absolute_import, division
+
+import abc
+import collections as coll
+import functools
+import locale
+
+import six
+
+from .utils import graphmlout as go
+from .enums import make_values_tuple
+from .relresolver import get_default_relation_resolver
+
+
+__all__ = 'SynsetBase', 'LexicalUnitBase', 'PLWordNetBase', 'RelationEdge'
+
+
+#: Named tuple type yielded by
+#: :meth:`PLWordNetBase.synset_relation_edges` and
+#: :meth:`PLWordNetBase.lexical_relation_edges`.
+RelationEdge = coll.namedtuple(
+    'RelationEdge',
+    ('source', 'relation', 'target'),
+)
+
+
+class PLWordNetBase(object):
+    """The primary entry point for retrieving data from plWordNet.
+
+    Allows querying the plWordNet for synsets and lexical units.
+    """
+
+    __metaclass__ = abc.ABCMeta
+
+    _STORAGE_NAME = '?'
+
+    @classmethod
+    def from_reader(cls, reader, dump_to=None):
+        """Create a new instance from a source reader, optionally saving it
+        in an internal representation format in another file.
+
+        :param reader: Generator that yields :class:`SynsetNode` and
+            :class:`LexicalUnitNode` from a source representation.
+
+        :param str dump_to: Path to a file where the data read from the
+            source will be dumped in an internal representation. It will be
+            possible to later load it quicker by :meth:`.from_dump`. If
+            ``None``, then no cached file will be created.
+
+        :returns: New instance of PLWN API entry point.
+        :rtype: PLWordNetBase
+        """
+
+        raise NotImplementedError()
+
+    @classmethod
+    def from_dump(cls, dump):
+        """Create a new instance from a dump of cached internal
+        representation.
+
+        The dump file must have been created by the same ``PLWordNetBase``
+        subclass, and preferably by the same version of PLWN API (backwards
+        compatibility of dump formats is not guaranteed).
+
+        :param str dump: Path to a file with cached internal representation.
+
+        :returns: New instance of PLWN API entry point.
+        :rtype: PLWordNetBase
+        """
+
+        raise NotImplementedError()
+
+    def __init__(self):
+        self._rel_resolver = get_default_relation_resolver()
+
+    @abc.abstractmethod
+    def synsets(self, lemma=None, pos=None, variant=None):
+        """Iterate over synsets from plWordNet, filtered by lemma, part of
+        speech and variant.
+
+        If a parameter is omitted, then any value is accepted (so
+        ``synsets()`` iterates over all synsets).
+
+        The lemma, pos and variant are properties of lexical units; this
+        method yields synsets that contain those lexical units.
+
+        :param str lemma: Only synsets containing a lexical unit with this
+            lemma will be yielded.
+        :param pos: Only synsets containing a lexical unit with this part
+            of speech will be yielded.
+        :type pos: Union[PoS, str]
+        :param int variant: Only synsets containing a lexical unit with
+            this variant will be yielded.
+
+        :returns: Iterable of synsets fitting the parameters' criteria.
+        :rtype: Iterable[SynsetBase]
+
+        :raises InvalidPoSException: If a query is made for a PoS that is
+            not one of the valid constants.
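+
+        Example, given a loaded instance ``wn`` (the lemma is
+        illustrative)::
+
+            >>> for syn in wn.synsets(lemma=u'pies', pos='noun'):
+            ...     print(syn)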
+        """
+
+        pass
+
+    @abc.abstractmethod
+    def synset(self, lemma, pos, variant):
+        """Get the synset containing the unit with the lemma, part of
+        speech and variant.
+
+        Unlike :meth:`.synsets`, all parameters of this method are
+        mandatory. It either returns a single synset, or raises an
+        exception if no such synset can be found.
+
+        :param str lemma: The lemma of a lexical unit contained by the
+            requested synset.
+        :param pos: The part of speech of a lexical unit contained by the
+            requested synset.
+        :type pos: Union[PoS, str]
+        :param int variant: The variant of a lexical unit contained by the
+            requested synset.
+
+        :returns: Synset satisfying the criteria specified by the
+            parameters.
+        :rtype: SynsetBase
+
+        :raises SynsetNotFound: If no synset with the given properties
+            could be found.
+        :raises InvalidPoSException: If a query is made for a PoS that is
+            not one of the valid constants.
+        """
+
+        pass
+
+    @abc.abstractmethod
+    def synset_by_id(self, id_):
+        """Get the synset, knowing its internal, numerical ID.
+
+        This method is not intended to be used by itself, but with tools
+        which identify PLWN synsets by their IDs.
+
+        :param int id_: The internal plWordnet identifier of the synset.
+
+        :returns: The synset having the ID.
+        :rtype: SynsetBase
+
+        :raises InvalidSynsetIdentifierException: If there's no synset with
+            the ID in plWordnet.
+        """
+
+        pass
+
+    @abc.abstractmethod
+    def lexical_units(self, lemma=None, pos=None, variant=None):
+        """Iterate over lexical units from plWordNet, filtered by lemma,
+        part of speech and variant.
+
+        If a parameter is omitted, then any value is accepted (so
+        ``lexical_units()`` iterates over all units).
+
+        :param str lemma: Only lexical units with this lemma will be
+            yielded.
+        :param pos: Only lexical units with this part of speech will be
+            yielded.
+        :type pos: Union[PoS, str]
+        :param int variant: Only lexical units with this variant will be
+            yielded.
+
+        :returns: Iterable of lexical units fitting the parameters'
+            criteria.
+        :rtype: Iterable[LexicalUnitBase]
+
+        :raises InvalidPoSException: If a query is made for a PoS that is
+            not one of the valid constants.
+        """
+
+        pass
+
+    @abc.abstractmethod
+    def lexical_unit(self, lemma, pos, variant):
+        """Get the lexical unit with the lemma, part of speech and variant.
+
+        Unlike :meth:`.lexical_units`, all parameters of this method are
+        mandatory. It either returns a single unit, or raises an exception
+        if no such unit can be found.
+
+        :param str lemma: The lemma of the requested lexical unit.
+        :param pos: The part of speech of the requested lexical unit.
+        :type pos: Union[PoS, str]
+        :param int variant: The variant of the requested lexical unit.
+
+        :returns: Lexical unit satisfying the criteria specified by the
+            parameters.
+        :rtype: LexicalUnitBase
+
+        :raises LexicalUnitNotFound: If no unit with the given properties
+            could be found.
+        :raises InvalidPoSException: If a query is made for a PoS that is
+            not one of the valid constants.
+        """
+
+        pass
+
+    @abc.abstractmethod
+    def lexical_unit_by_id(self, id_):
+        """Get the lexical unit, knowing its internal, numerical ID.
+
+        See :meth:`.synset_by_id` for remarks.
+
+        :param int id_: The internal plWordnet identifier of the lexical
+            unit.
+
+        :returns: The lexical unit having the ID.
+        :rtype: LexicalUnitBase
+
+        :raises InvalidLexicalUnitIdentifierException: If there's no
+            lexical unit with the ID in plWordnet.
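+
+        Example, given a loaded instance ``wn`` (the ID is illustrative;
+        IDs differ between plWordNet versions)::
+
+            >>> lu = wn.lexical_unit_by_id(5563)
+            >>> lu.lemma
+            u'pies'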
+ """ + + pass + + @abc.abstractmethod + def synset_relation_edges(self, include=None, exclude=None): + """Iterate over all synset relation instances in plWordnet, yielding + them as tuples. + + Named tuples in format ``(source, relation, target)`` + (:data:`RelationEdge`) are yielded by this method. + + One of the intended uses of this method is to create a graph "live", by + feeding the results directly to a graph-building library. + + **Note:** if both ``include`` and ``exclude`` are passed, the result + will be a logical intersection. In both collections, invalid relation + names are silently ignored. + + :param Iterable[str] include: Names of relations which should be + included in the output. Instances of all other relations will be + ignored. By default all relations are included. + + :param Iterable[str] exclude: Names of relations which should not be + included in the output. By default, no relations are excluded. + + :returns: Generator of tuples representing synset relation edges. + :rtype: Iterable[Tuple[SynsetBase,str,SynsetBase]] + """ + + pass + + @abc.abstractmethod + def lexical_relation_edges(self, include=None, exclude=None): + """Iterate over all lexical relation instances in plWordnet, yielding + them as tuples. + + This method behaves very closely to :meth:`.synset_relation_edges`, but + for lexical relations. + + :rtype: Iterable[Tuple[LexicalUnitBase,str,LexicalUnitBase]] + """ + + pass + + def close(self): + """Perform necessary cleanup operations and close this PLWordNet + instance. + + Often, temporary files are created when reading and parsing plWordNet, + and non-temporary files may be opened. Call this method to properly + close / remove those files. + + It's best to use :func:`contextlib.closing` to ensure that this method + gets eventually called. + + It's legal to call this method several times. It's not legal to call + any other methods after :meth:`.close` has been called. + """ + + pass + + def to_graphml(self, + out_file, + graph_type=go.GRAPH_TYPE_SYNSET, + include_attributes=False, + prefix_ids=False, + included_synset_attributes=None, + excluded_synset_attributes=None, + included_lexical_unit_attributes=None, + excluded_lexical_unit_attributes=None, + included_synset_relations=None, + excluded_synset_relations=None, + included_lexical_unit_relations=None, + excluded_lexical_unit_relations=None, + included_synset_nodes=None, + excluded_synset_nodes=None, + included_lexical_unit_nodes=None, + excluded_lexical_unit_nodes=None): + """Export the wordnet as graph in `GraphML + <http://graphml.graphdrawing.org/>`_ format. + + Normally, nodes of the graph are synsets, and edges are relations + between synsets. It's possible to make the graph made of lexical units + and relations, or both synsets and units. + + IDs of nodes are internal plWordNet IDs (the same as returned by ``id`` + property of synset / lexical_unit). They may be prefixed with + ``synset-`` or ``lexical_unit-`` depending on type of the node and + ``prefix_ids`` parameter value. + + Edges have no IDs. + + Nodes and edges can have certain attributes assigned to them in + GraphML. For edges, there are two attributes: + + * **type:** Either ``relation`` or ``unit_and_synset``, depending on + whether the edge represents a relation or a link between a synset and + a unit that belongs to it. The latter are only present in mixed graph + type. + * **name:** If **type** is ``relation``, then it's the name of the + relation. 
If **type** is ``unit_and_synset``, then it's either + ``has_unit``, for an edge directed from a synset node to a + lexical_unit node, or ``in_synset`` for an edge in the + opposite direction. + + Nodes only have attributes if ``include_attributes`` parameter is + ``True``. The attributes have names and values corresponding to + properties of :class:`SynsetBase` or :class:`LexicalUnitBase` objects. + Composite values (like tuples) are stored as JSON strings (since + GraphML only allows simple types for attributes). Attributes can be + excluded or included using the method's parameters. + + Possible names of synset attributes: + * definition + * relations + + Possible names of lexical unit attributes: + * lemma + * pos + * variant + * definition + * sense_examples + * sense_examples_sources + * external_links + * usage_notes + * domain + * relations + + **NOTE:** If both corresponding ``include_*`` and ``exclude_*`` + parameters are passed, an item will be included only if it appears in + the ``include_*`` set and does not appear in ``exclude_*`` set. + + :param Union[str,BytesIO] out_file: Stream or name of the file to which + the GraphML XML will be written. **NOTE:** Because of a peculiarity in + the XML module used (ElementTree), if a stream is passed here, it + should be opened in binary mode. + + :param str graph_type: Type of the graph. There are three possible + values: + * ``synset``: Nodes are synsets and edges are synset relations. + * ``lexical_unit``: Nodes are lexical units and edges are lexical unit + relations. + * ``mixed``: There are both synset and lexical unit nodes, + distinguished by prefixes in their IDs. Synsets are connected with + synset relations and lexical units are connected with lexical + relations. Synsets and units are connected with ``unit_and_synset`` + type of edge (see description above). + + :param bool include_attributes: If ``True``, then node attributes will + be included in the output XML file. Note, that if + ``included_*_attributes`` or ``excluded_*_attributes`` is passed, then + this parameter is ignored and the designated attributes are included. + + :param bool prefix_ids: If ``True``, then IDs of nodes will be prefixed + with ``synset-`` or ``lexical_unit-``. Note, that if ``graph_type`` is + ``mixed``, nodes are always prefixed and this parameter is ignored. + + :param FrozenSet[str] included_synset_attributes: Set of names of + synset attributes which should be included in GraphML nodes. All other + attributes are excluded. + + :param FrozenSet[str] excluded_synset_attributes: Set of names of + synset attributes which should not be included in GraphML nodes. All + other attributes are included. + + :param FrozenSet[str] included_lexical_unit_attributes: Like + ``included_synset_attributes``, but for lexical unit nodes. + + :param FrozenSet[str] excluded_lexical_unit_attributes: Like + ``excluded_synset_attributes``, but for lexical unit nodes. + + :param FrozenSet[str] included_synset_relations: Set of names of synset + relations which should be included as edges in the graph. All other + relation edges are excluded. + + :param FrozenSet[str] excluded_synset_relations: Set of names of synset + relations which should not be included as edges in the graph. All other + relation edges are included. + + :param FrozenSet[str] included_lexical_unit_relations: Like + ``included_synset_relations``, but for lexical unit relations. 
+ + :param FrozenSet[str] excluded_lexical_unit_relations: Like + ``excluded_synset_relations``, but for lexical unit relations. + + :param FrozenSet[int] included_synset_nodes: Set of IDs of synsets that + should be included as nodes in the graph. All other synsets are + excluded. Any edge that has one of its endpoints not included will also + not be included. Also, if the graph type is mixed, lexical units + belonging to a synset which is not included will also be excluded. + + :param FrozenSet[int] excluded_synset_nodes: Set of IDs of synsets + which should not be included as nodes in the graph. All other synsets + are included. Also see remarks for ``included_synset_nodes``. + + :param FrozenSet[int] included_lexical_unit_nodes: Like + ``included_synset_nodes``, but for lexical units. + + :param FrozenSet[int] excluded_lexical_unit_nodes: Like + ``excluded_synset_nodes``, but for lexical units. + + :raises ValueError: If ``graph_type`` is not one of the allowed values. + """ + + gwn = go.GraphMLWordNet() + gb = go.GraphMLBuilder(self, gwn) + + if graph_type == go.GRAPH_TYPE_SYNSET: + gb.synset_graph( + prefix_ids=prefix_ids, + include_attributes=include_attributes, + included_attributes=included_synset_attributes, + excluded_attributes=excluded_synset_attributes, + included_nodes=included_synset_nodes, + excluded_nodes=excluded_synset_nodes, + included_relations=included_synset_relations, + excluded_relations=excluded_synset_relations, + ) + elif graph_type == go.GRAPH_TYPE_UNIT: + gb.lexical_unit_graph( + prefix_ids=prefix_ids, + include_attributes=include_attributes, + included_attributes=included_lexical_unit_attributes, + excluded_attributes=excluded_lexical_unit_attributes, + included_nodes=included_lexical_unit_nodes, + excluded_nodes=excluded_lexical_unit_nodes, + included_relations=included_lexical_unit_relations, + excluded_relations=excluded_lexical_unit_relations, + ) + elif graph_type == go.GRAPH_TYPE_MIXED: + gb.mixed_graph( + include_attributes=include_attributes, + included_synset_attributes=included_synset_attributes, + excluded_synset_attributes=excluded_synset_attributes, + included_lexical_unit_attributes= + included_lexical_unit_attributes, + excluded_lexical_unit_attributes= + excluded_lexical_unit_attributes, + included_synset_relations=included_synset_relations, + excluded_synset_relations=excluded_synset_relations, + included_lexical_unit_relations= + included_lexical_unit_relations, + excluded_lexical_unit_relations= + excluded_lexical_unit_relations, + included_synset_nodes=included_synset_nodes, + excluded_synset_nodes=excluded_synset_nodes, + included_lexical_unit_nodes=included_lexical_unit_nodes, + excluded_lexical_unit_nodes=excluded_lexical_unit_nodes, + ) + else: + raise ValueError('graph_type={!r}'.format(graph_type)) + + gwn.write(out_file) + + def __repr__(self): + return '<PLWordNet ({}) at {:x}>'.format( + self._STORAGE_NAME, + id(self), + ) + + +@functools.total_ordering +@six.python_2_unicode_compatible +class SynsetBase(object): + """Encapsulates data associated with a plWordNet synset. + + Synset contains lexical units that have the same meaning (ie. synonyms). + Most of plWordNet relations are between meanings, hence the need to group + lexical units into synsets. + + For purposes of ordering, a :class:`SynsetBase` is uniquely identified by + its head: the first of the lexical units it contains. + """ + + __metaclass__ = abc.ABCMeta + + @abc.abstractproperty + def id(self): + """``int`` + + The internal identifier of the synset in plWordnet. 
It is unique among + all synsets. + """ + + pass + + @abc.abstractproperty + def lexical_units(self): + """``Tuple[LexicalUnitBase]`` + + Lexical units contained in the synsets. Ordering of units within the + tuple is arbitrary, but constant. The first unit is the synset's head, + used to represent it. + + At least one lexical unit is always present in every synset. + """ + + pass + + @abc.abstractproperty + def definition(self): + """``str`` + + Textual description of the synset's meaning. + + Will be an empty string if the definition is not present in plWordNet. + """ + + pass + + @abc.abstractproperty + def relations(self): + """``Tuple[str]`` + + Tuple of all outward relations that lead from this synset. + """ + + pass + + @abc.abstractmethod + def related(self, relation_name): + """Iterate over synsets to whom this synset has a certain relation. + + :param str relation_name: The name of the relation to follow. + + :returns: Iterable of related synsets. + :rtype: Iterable[SynsetBase] + + :raises InvalidRelationNameException: If ``relation_name`` is not a + valid name of a synset relation in plWordNet. + """ + + pass + + def to_dict(self, include_related=True, include_units_data=True): + """ + Create a JSON-compatible dictionary with all the public properties of + the synset. + + Enums are converted to their values and all collections are converted + to tuples. + + :param bool include_related: If ``True``, the dictionary will contain a + "related" member, whose value is a dictionary in format:: + + { + "<synset relation name>": ( + (<relation target id>, <relation target string form>), + ... + ), + ... + } + + :param bool include_units_data: If ``True``, then the "units" member of + the dictionary will be a tuple of results of + :meth:`LexicalUnitBase.to_dict`. Otherwise, it will contain only + tuples of ``(<unit id>, <unit string form>)``. + + :returns: Dictionary contain data of the synset. + :rtype: Mapping[str, Any] + """ + + syn_dict = { + u'id': self.id, + u'definition': self.definition, + u'units': tuple( + (lu.to_dict(include_related) for lu in self.lexical_units) + if include_units_data + else ((lu.id, six.text_type(lu)) for lu in self.lexical_units) + ), + u'str': six.text_type(self), + } + + if include_related: + syn_dict[u'related'] = { + relname: tuple( + (target.id, target.short_str()) + for target in self.related(relname) + ) + for relname in self.relations + } + + return syn_dict + + def short_str(self): + """ + Shorter version of ``str`` cast that displays only the first unit. 
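+
+        Illustrative example (the synset and its units are made up)::
+
+            >>> syn.short_str()
+            u'{pies.2(21:zw), [+ 2 unit(s)]}'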
+        """
+
+        sstr = [u'{', six.text_type(self.lexical_units[0])]
+        if len(self.lexical_units) > 1:
+            sstr.append(
+                u', [+ {} unit(s)]'.format(len(self.lexical_units) - 1),
+            )
+        sstr.append(u'}')
+        return ''.join(sstr)
+
+    def __repr__(self):
+        head = self.lexical_units[0]
+        rstr = '<Synset id={!r} lemma={!r} pos={!r} variant={!r}'.format(
+            self.id,
+            head.lemma,
+            head.pos,
+            head.variant,
+        )
+
+        if len(self.lexical_units) > 1:
+            rstr += ' [+ {} unit(s)]'.format(len(self.lexical_units) - 1)
+
+        return rstr + '>'
+
+    def __str__(self):
+        return (
+            u'{' +
+            u', '.join(six.text_type(lu) for lu in self.lexical_units) +
+            u'}'
+        )
+
+    def __hash__(self):
+        # Even if comparing is done by the synset's head, it's probably
+        # better to hash by all lexical units, to boost the hash's
+        # uniqueness
+        return hash(self.lexical_units)
+
+    def __eq__(self, other):
+        if not isinstance(other, SynsetBase):
+            return NotImplemented
+        return self.lexical_units[0] == other.lexical_units[0]
+
+    def __ne__(self, other):
+        return not self == other
+
+    def __lt__(self, other):
+        if not isinstance(other, SynsetBase):
+            return NotImplemented
+        return self.lexical_units[0] < other.lexical_units[0]
+
+
+@six.python_2_unicode_compatible
+class LexicalUnitBase(object):
+    """Encapsulates data associated with a plWordNet lexical unit.
+
+    Lexical units represent terms in the language. Each lexical unit is
+    uniquely identified by its lemma (base written form), part of speech
+    (verb, noun, adjective or adverb) and variant (a number: sometimes the
+    same form can have multiple meanings).
+    """
+
+    __metaclass__ = abc.ABCMeta
+
+    @abc.abstractproperty
+    def id(self):
+        """``int``
+
+        The internal identifier of the lexical unit in plWordnet. It is
+        unique among all units.
+        """
+
+        pass
+
+    @abc.abstractproperty
+    def lemma(self):
+        """``str``
+
+        Lemma of the unit, basic form of the word(s) the unit represents.
+        """
+
+        pass
+
+    @abc.abstractproperty
+    def pos(self):
+        """``PoS``
+
+        Part of speech of the unit. This will be one of the enumeration
+        constants from :class:`PoS`. To get the textual value, use
+        ``pos.value``.
+        """
+
+        pass
+
+    @abc.abstractproperty
+    def variant(self):
+        """``int``
+
+        If the same lemma has different meanings as the same part of
+        speech, this number will be used to tell them apart. The first
+        meaning has the number 1.
+        """
+
+        pass
+
+    @abc.abstractproperty
+    def definition(self):
+        """``str``
+
+        Textual description of the lexical unit's meaning.
+
+        Will be an empty string if the definition is not present in
+        plWordNet.
+        """
+
+        pass
+
+    @abc.abstractproperty
+    def sense_examples(self):
+        """``Tuple[str]``
+
+        Fragments of text that show how the lexical unit is used in the
+        language.
+
+        May be an empty collection, if no examples are present.
+        """
+
+        pass
+
+    @abc.abstractproperty
+    def sense_examples_sources(self):
+        """``Tuple[str]``
+
+        Symbolic representations of sources from which the sense examples
+        were taken.
+
+        This tuple has the same length as ``sense_examples``, and is
+        aligned by index (for example, the source of ``sense_examples[3]``
+        is at ``sense_examples_sources[3]``).
+
+        To get pairs of examples with their sources, use
+        ``zip(sense_examples, sense_examples_sources)``
+        """
+
+        # TODO List of source symbols, link to?
+        pass
+
+    @abc.abstractproperty
+    def external_links(self):
+        """``Tuple[str]``
+
+        URLs to webpages describing the meaning of the lexical unit.
+
+        May be an empty collection, if no links are present.
+ """ + + pass + + @abc.abstractproperty + def usage_notes(self): + """``Tuple[str]`` + + Symbols denoting certain properties of how the lexical unit is used. + + For example, "daw." means that the word is considered dated. + + May be an empty collection. + """ + + pass + + @abc.abstractproperty + def domain(self): + """``Domain`` + + Wordnet domain the lexical unit belongs to. + """ + + pass + + @abc.abstractproperty + def verb_aspect(self): + """``Optional[VerbAspect]`` + + Aspect of a verb. This will be one of the constants from + :class:`VerbAspect`, or ``None``, if the lexical unit is not a verb. + """ + + pass + + @abc.abstractproperty + def emotion_markedness(self): + """``Optional[EmotionMarkedness]`` + + Markedness of emotional connotations of the lexical unit. May be + ``None``, if the unit has no emotional markedness. + + If this property is ``None``, then all other ``emotion_*`` properties + will be ``None`` or empty. + """ + + pass + + @abc.abstractproperty + def emotion_names(self): + """``Tuple[str, ...]`` + + Names of emotions associated with this lexical unit. + """ + + pass + + @abc.abstractproperty + def emotion_valuations(self): + """``Tuple[str, ...]`` + + Valuations of emotions associated with this lexical unit. + """ + + pass + + @abc.abstractproperty + def emotion_example(self): + """``Optional[str]`` + + An example of an emotionally loaded sentence using the lexical unit. + """ + + pass + + @abc.abstractproperty + def emotion_example_secondary(self): + """``Optional[str]`` + + This property is not ``None`` only if ``emotion_markedness`` is + ``amb``. In such case, :attr:`.emotion_example` will be an + example of a positive sentence, and this one will be a negative + sentence. + """ + + pass + + @abc.abstractproperty + def synset(self): + """``SynsetBase`` + + The synset the unit belongs to. + """ + + pass + + @abc.abstractmethod + def related(self, relation_name): + """Iterate over lexical units to whom this unit has a + certain relation. + + :param str relation_name: The name of the relation to follow. + + :returns: Iterable of related units. + :rtype: Iterable[LexicalUnitBase] + + :raises InvalidRelationNameException: If ``relation_name`` is not a + valid name of a lexical relation in plWordNet. + """ + + pass + + @abc.abstractproperty + def relations(self): + """``Tuple[str]`` + + Tuple of all outward relations that lead from this lexical unit. + """ + + pass + + def to_dict(self, include_related=True): + """ + Create a JSON-compatible dictionary with all the public properties of + the lexical unit. + + Enums are converted to their values and all collections are converted + to tuples. + + :param bool include_related: If ``True``, the dictionary will contain a + "related" member, whose value is a dictionary in format:: + + { + "<lexical relation name>": ( + (<relation target id>, <relation target string form>), + ... + ), + ... + } + + :returns: Dictionary contain data of the lexical unit. 
+ :rtype: Mapping[str, Any] + """ + + lu_dict = { + u'id': self.id, + u'lemma': self.lemma, + u'pos': self.pos.value, + u'variant': self.variant, + u'definition': self.definition, + u'sense_examples': tuple(self.sense_examples), + u'sense_examples_sources': tuple(self.sense_examples_sources), + u'external_links': tuple(self.external_links), + u'usage_notes': tuple(self.usage_notes), + u'domain': self.domain.value, + u'synset': self.synset.id, + u'emotion_markedness': None + if self.emotion_markedness is None + else self.emotion_markedness.value, + u'emotion_names': make_values_tuple(self.emotion_names), + u'emotion_valuations': make_values_tuple(self.emotion_valuations), + u'emotion_example': self.emotion_example, + u'emotion_example_secondary': self.emotion_example_secondary, + u'str': six.text_type(self), + } + + if include_related: + lu_dict[u'related'] = { + relname: tuple( + (target.id, six.text_type(target)) + for target in self.related(relname) + ) + for relname in self.relations + } + + return lu_dict + + def __repr__(self): + return '<LexicalUnit id={!r} lemma={!r} pos={!r} variant={!r}>'.format( + self.id, + self.lemma, + self.pos, + self.variant, + ) + + def __str__(self): + return u'{lem}.{var}({domnum}:{domname})'.format( + lem=self.lemma.replace(u' ', u'_'), + var=self.variant, + domnum=self.domain.db_number, + domname=self.domain.name, + ) + + def __hash__(self): + return hash((self.lemma, self.pos, self.variant)) + + def __eq__(self, other): + if not isinstance(other, LexicalUnitBase): + return NotImplemented + + return (locale.strcoll(self.lemma, other.lemma) == 0 and + self.pos == other.pos and + self.variant == other.variant) + + def __ne__(self, other): + return not self == other + + # Total ordering done by hand, to minimize strcoll calls + + def __lt__(self, other): + cmp_ = self.__lt_lempos(other) + return cmp_ if cmp_ is not None else self.variant < other.variant + + def __le__(self, other): + cmp_ = self.__lt_lempos(other) + return cmp_ if cmp_ is not None else self.variant <= other.variant + + def __gt__(self, other): + return not self <= other + + def __ge__(self, other): + return not self < other + + def __lt_lempos(self, other): + # Common code for __lt__ and __le__ methods. + # Compares first two elements. + if not isinstance(other, LexicalUnitBase): + return NotImplemented + + colled = locale.strcoll(self.lemma, other.lemma) + + if colled < 0: + return True + if colled > 0: + return False + + if self.pos is other.pos: + # Defer comparison + return None + + return self.pos.value < other.pos.value diff --git a/plwn/enums.py b/plwn/enums.py new file mode 100644 index 0000000000000000000000000000000000000000..67ad34a9697c7b76eb8078c8c4d672325a85e01e --- /dev/null +++ b/plwn/enums.py @@ -0,0 +1,309 @@ +# coding: utf8 +""" +Enumerated values used in plWordNet +""" + +from __future__ import absolute_import, division + + +import re +from enum import Enum + +import six + + +__all__ = ( + 'PoS', + 'VerbAspect', + 'EmotionMarkedness', + 'EmotionName', + 'EmotionValuation', + 'Domain', + 'make_values_tuple', +) + + +# Helper function for making dictionaries translating enum instances into +# numbers used to denote them in plWN database. 
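+# For example, PoS enumerates its members as verb, noun, adverb,
+# adjective, so PoS.by_db_number(2) is PoS.noun and
+# PoS.noun.db_number == 2 (an illustration of the helpers below, not
+# data read from the database).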
+def _fill_numtrans(enumclass, num2enum, enum2num): + for num, enuminst in enumerate(enumclass, 1): + num2enum[num] = enuminst + enum2num[enuminst] = num + + +def _get_from_numtrans(numtrans, num, optional): + try: + return numtrans[num] + except KeyError: + if optional: + return None + raise + + +# Explicit ordering is needed only in python 2. +_POS_ORDER = 'verb noun adverb adjective' +_POS_NUM2ENUM = {} +_POS_ENUM2NUM = {} + + +class PoS(Enum): + """ + Defines **Part of Speech** values used by plWN. + """ + + if six.PY2: + __order__ = _POS_ORDER + + verb = u'verb' + noun = u'noun' + adverb = u'adverb' + adjective = u'adjective' + + v = verb + n = noun + adv = adverb + adj = adjective + + @staticmethod + def by_db_number(number, optional=False): + return _get_from_numtrans(_POS_NUM2ENUM, number, optional) + + @property + def db_number(self): + return _POS_ENUM2NUM[self] + + +_fill_numtrans(PoS, _POS_NUM2ENUM, _POS_ENUM2NUM) + + +_VA_ORDER = 'perfective imperfective predicative two_aspect' +_VA_NUM2ENUM = {} +_VA_ENUM2NUM = {} + + +class VerbAspect(Enum): + """ + Defines aspect values used by verbs in plWN. + """ + + if six.PY2: + __order__ = _VA_ORDER + + perfective = u'perf' + imperfective = u'imperf' + predicative = u'pred' + two_aspect = u'imperf.perf' + + perf = perfective + imperf = imperfective + pred = predicative + two = two_aspect + + # Additionally, some Polish abbreviations + dk = perfective + ndk = imperfective + + @staticmethod + def by_db_number(number, optional=False): + return _get_from_numtrans(_VA_NUM2ENUM, number, optional) + + @property + def db_number(self): + return _VA_ENUM2NUM[self] + + +_fill_numtrans(VerbAspect, _VA_NUM2ENUM, _VA_ENUM2NUM) + + +class EmotionMarkedness(Enum): + """ + Defines markedness of emotions associated with some lexical units. + """ + + strong_positive = u'+ m' + strong_negative = u'- m' + weak_positive = u'+ s' + weak_negative = u'- s' + ambiguous = u'amb' + + plus_m = strong_positive + minus_m = strong_negative + plus_s = weak_positive + minus_s = weak_negative + amb = ambiguous + + @classmethod + def normalized(cls, strvalue): + """ + Return an instance of this enum with string value normalized with + regards to whitespace. + """ + + strvalue = strvalue.strip() + + # Try the one value value that won't require matching + if strvalue == cls.ambiguous.value: + return cls.ambiguous + + match = re.match(r'([+-])\s*([sm])', strvalue, re.U) + + if not match: + # This can't be a valid string, so let the built-in exception + # raise. + return cls(strvalue) + + return cls(match.group(1) + u' ' + match.group(2)) + + +class EmotionName(Enum): + """ + Possible names of emotions associated with some lexical units. + """ + + joy = u'radość' + trust = u'zaufanie' + anticipation = u'cieszenie się na coś oczekiwanego' + surprise = u'zaskoczenie czymś nieprzewidywanym' + sadness = u'smutek' + anger = u'złość' + fear = u'strach' + disgust = u'wstręt' + + radosc = joy + zaufanie = trust + cieszenie_sie_na = anticipation + zaskoczenie = surprise + smutek = sadness + zlosc = anger + strach = fear + wstret = disgust + + +class EmotionValuation(Enum): + """ + Possible valuations of emotions associated with some lexical units. 
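+
+    The Polish names are aliases of the same members; for example,
+    ``EmotionValuation.dobro is EmotionValuation.good`` holds.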
+ """ + + usefulness = u'użyteczność' + good = u'dobro' + truth = u'prawda' + knowledge = u'wiedza' + beauty = u'piękno' + happiness = u'szczęście' + uselessness = u'nieużyteczność' + harm = u'krzywda' + ignorance = u'niewiedza' + error = u'błąd' + ugliness = u'brzydota' + unhappiness = u'nieszczęście' + + uzytecznosc = usefulness + dobro = good + prawda = truth + wiedza = knowledge + piekno = beauty + szczescie = happiness + nieuzytecznosc = uselessness + krzywda = harm + niewiedza = ignorance + blad = error + brzydota = ugliness + nieszczescie = unhappiness + + +_DOM_ORDER = 'bhp czy wytw cech czc umy por zdarz czuj jedz grp msc cel rz ' \ + 'os zj rsl pos prc il zw ksz st sbst czas zwz hig zmn cumy cpor wal ' \ + 'cjedz dtk cwytw cczuj ruch pst cpos sp cst pog jak rel odcz grad sys ' \ + 'adj adv cdystr caku cper cdel' +_DOM_NUM2ENUM = {} +_DOM_ENUM2NUM = {} + + +class Domain(Enum): + """ + Wordnet domains of lexical units. + """ + + if six.PY2: + __order__ = _DOM_ORDER + + bhp = u'najwyższe w hierarchii' + czy = u'czynności (nazwy)' + wytw = u'wytwory ludzkie (nazwy)' + cech = u'cechy ludzi i zwierząt' + czc = u'części ciała' + umy = u'związane z myśleniem' + por = u'związane z porozumiewaniem się' + zdarz = u'zdarzenia' + czuj = u'uczucia, odczucia i emocje' + jedz = u'jedzenie' + grp = u'grupy ludzi i rzeczy' + msc = u'miejsca i umiejscowienie' + cel = u'cel działania' + rz = u'obiekty naturalne' + os = u'ludzie' + zj = u'zjawiska naturalne' + rsl = u'nazwy roślin' + pos = u'posiadanie i jego zmiana' + prc = u'procesy naturalne' + il = u'ilość, liczebność, jednoski miary' + zw = u'zwierzęta' + ksz = u'kształty' + st = u'sytuacje statyczne (stany)' + sbst = u'substancje' + czas = u'czas i stosunki czasowe' + zwz = u'związek miedzy ludźmi, rzeczami lub ideami' + + hig = u'pielęgnacja ciała' + zmn = u'zmiana wielkości, temeraturym natężenia, itp.' + cumy = u'czasowniki myślenia (szeroko rozumianego)' + cpor = u'czasowniki mówienia, śpiewania itp.' + wal = u'czasowniki rywalizacji fizycznej' + cjedz = u'czasowniki jedzenia' + dtk = u'czasowniki oznacz. kontakt fizyczny ' \ + u'(dotykanie, uderzenie, rycie itp.)' + cwytw = u'czasowniki oznacz. wytwarzanie czegoś' + cczuj = u'czasowniki wyrażające uczucia' + ruch = u'czasowniki ruchu' + pst = u'czasowniki postrzegania (percepcji)' + cpos = u'czasowniki posiadania i zmiany posiadania' + sp = u'czasowniki oznacz. wydarzenie i działania społeczne i polityczne' + cst = u'czasowniki stanowe' + pog = u'czasowniki oznacz. zjawiska pogodowe' + + jak = u'przymiotniki jakościowe' + rel = u'przymiotniki relacyjne (rzeczownikowe)' + odcz = u'przymiotniki odczasownikowe' + grad = u'przymiotniki odprzymiotnikowe (natężenie cechy)' + + sys = u'systematyka, klasyfikacja' + + adj = u'PWN: all adjective clusters' + adv = u'PWN: all adverbs' + + mat = u'przymiotniki materiałowe' + + cdystr = u'czasownki dystrybutywne' + caku = u'czasowniki akumulatywne' + cper = u'czasowniki perduratywne' + cdel = u'czasowniki delimitatywne' + + @staticmethod + def by_db_number(number, optional=False): + return _get_from_numtrans(_DOM_NUM2ENUM, number, optional) + + @property + def db_number(self): + return _DOM_ENUM2NUM[self] + + +_fill_numtrans(Domain, _DOM_NUM2ENUM, _DOM_ENUM2NUM) + + +def make_values_tuple(enum_seq): + """ + Auxiliary function that converts a sequence of enums to a tuple of enum + values. 
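+
+    For example, with the PoS enum defined above (``u`` prefixes appear
+    on Python 2 only)::
+
+        >>> make_values_tuple((PoS.verb, PoS.noun))
+        (u'verb', u'noun')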
+ """ + + return tuple(en.value for en in enum_seq) diff --git a/plwn/exceptions.py b/plwn/exceptions.py new file mode 100644 index 0000000000000000000000000000000000000000..ebc2c9ad9a87cfbdd18dacc531e2dfdc87d8035b --- /dev/null +++ b/plwn/exceptions.py @@ -0,0 +1,121 @@ +"""Custom exceptions raised by PLWN API.""" + +from __future__ import absolute_import, division + + +__all__ = ( + 'PLWNAPIException', + 'NotFound', + 'LexicalUnitNotFound', + 'SynsetNotFound', + 'ReaderException', + 'MalformedIdentifierException', + 'LoadException', + 'DumpVersionException', + 'InvalidSynsetIdentifierException', + 'InvalidLexicalUnitIdentifierException', + 'InvalidRelationNameException', + 'InvalidPoSException', +) + + +class PLWNAPIException(Exception): + """Base for all exceptions in the module.""" + + pass + + +class NotFound(PLWNAPIException): + """Base for exceptions raised when an object is not found.""" + + def __init__(self, lemma, pos, variant, *args): + super(NotFound, self).__init__(*args) + + self.args = ('lemma={!r} pos={!r} variant={!r}'.format( + lemma, + pos, + variant, + ),) + self.args + + +class LexicalUnitNotFound(NotFound): + """Raised when a lexical unit is not found during lookup.""" + + pass + + +class SynsetNotFound(NotFound): + """Raised when a synset is not found during lookup.""" + + pass + + +class ReaderException(PLWNAPIException): + """Raised when there's an error in the format expected by a reader.""" + + pass + + +class MalformedIdentifierException(ReaderException): + """Raised during UBY-LMF parsing, when a malformed identifier is + encountered. + """ + + def __init__(self, id_): + super(MalformedIdentifierException, self).__init__( + "Malformed identifier, expected digits at the end of the original" + " id instead got {!r}" + .format(id_) + ) + + +class LoadException(PLWNAPIException): + """Raised when a storage can't be loaded from file.""" + + pass + + +class DumpVersionException(LoadException): + """Raised when a dumped storage has wrong version (suggesting incompatible + format). + """ + + def __init__(self, version_is, version_required): + super(DumpVersionException, self).__init__(version_is, + version_required) + self.version_is = version_is + self.version_required = version_required + + def __str__(self): + return ( + 'Invalid schema version of dumped storage: {!r} (should be {!r})' + .format(self.version_is, self.version_required) + ) + + +class InvalidSynsetIdentifierException(PLWNAPIException): + """Raised when a query for a nonexistent synset ID is made.""" + + pass + + +class InvalidLexicalUnitIdentifierException(PLWNAPIException): + """Raised when a query for a nonexistent lexical unit ID is made.""" + + pass + + +class InvalidRelationNameException(PLWNAPIException): + """Raised when attempting to select synsets or units related by a relation + that does not exist. + """ + + pass + + +class InvalidPoSException(PLWNAPIException): + """Raised when a query for PoS is made, which is not one of the valid + constants. + """ + + pass diff --git a/plwn/readers/__init__.py b/plwn/readers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/plwn/readers/comments.py b/plwn/readers/comments.py new file mode 100644 index 0000000000000000000000000000000000000000..9aaa329920fec9b7acd5aa9347afffb27538b7ad --- /dev/null +++ b/plwn/readers/comments.py @@ -0,0 +1,84 @@ +"""Parsing strings in wordnet comment format, for readers that need to deal +with them. 
+ +Importing this module introduces dependency on wncomments. +""" + +from __future__ import absolute_import, division + + +from collections import namedtuple +import itertools as itt + +import plwn_comments as plwnc +import plwn_comments.exceptions as plwnce +import plwn_comments.utils.usage_tags as plwncu + + +__all__ = ( + 'WN_TAGS', + 'NON_EXAMPLE_TAG_NAMES', + 'CommentData', + 'parse_comment_string', +) + + +#: :class:`plwn_comments.TagBank` structure that defines all kinds of comment +#: tags which are needed by PLWN API. +WN_TAGS = plwnc.TagBank() +# Usage notes +WN_TAGS.define(u'K') +# External links +WN_TAGS.define(u'L', u'{') +# Definition +WN_TAGS.define(u'D') + +#: The distinction for these tags is useful, since all examples go to one +#: place. +NON_EXAMPLE_TAG_NAMES = frozenset((u'K', u'L', u'D')) + +# And define those example tags +WN_TAGS.define_from( + plwncu.iter_usage_tags(), + plwncu.DEFAULT_USAGE_TAG_SURROUND, +) + +#: Data tuple returned from :func:`parse_comment_string`. +CommentData = namedtuple( + 'CommentData', + ('examples', 'examples_sources', 'definition', 'usage', 'links'), +) + + +def parse_comment_string(cmt_str): + """Parse a comment string and extract all data required by PLWN API packed + in a named tuple. + + :param str cmt_str: String in PLWN comment format. + + :returns: Extracted and ordered items needed by PLWN API. + :rtype: CommentData + """ + + try: + cmt = plwnc.Comment.parse(cmt_str, WN_TAGS) + except plwnce.PLWNCommentsException: + # For now just make an empty comment which will make all fields unset + cmt = plwnc.Comment(WN_TAGS) + + # Get all examples + examples = [] + examples_src = [] + + for tagname, tagcontents in cmt.items(): + if tagname not in NON_EXAMPLE_TAG_NAMES: + examples.extend(tagcontents) + examples_src.extend(itt.repeat(tagname, len(tagcontents))) + + return CommentData( + tuple(examples), + tuple(examples_src), + cmt.get_first(u'D'), + tuple(cmt[u'K']), + tuple(cmt[u'L']), + ) diff --git a/plwn/readers/nodes.py b/plwn/readers/nodes.py new file mode 100644 index 0000000000000000000000000000000000000000..31790ec46a572a5134aa715a67f9ab0ffb8ec96a --- /dev/null +++ b/plwn/readers/nodes.py @@ -0,0 +1,16 @@ +"""Those tuples are returned by readers and absorbed by storages.""" + +from collections import namedtuple + + +__all__ = 'SynsetNode', 'LexicalUnitNode' + + +SynsetNode = namedtuple("SynsetNode", ["id", "definition", "related"]) +LexicalUnitNode = namedtuple( + "LexicalUnitNode", + ["id", "lemma", "pos", "variant", "synset", "unit_index", "definition", + "usage_notes", "external_links", "examples", "examples_sources", + "domain", "related", "verb_aspect", "emotion_markedness", "emotion_names", + "emotion_valuations", "emotion_example_1", "emotion_example_2"] +) diff --git a/plwn/readers/ubylmf.py b/plwn/readers/ubylmf.py new file mode 100644 index 0000000000000000000000000000000000000000..642155285ee8df64f6faecd040adb3b7d1eed466 --- /dev/null +++ b/plwn/readers/ubylmf.py @@ -0,0 +1,258 @@ +# FIXME Some assert statements should be converted to regular raises (asserts +# should not be used for anything other than checking for errors in the code +# itself). +from xml.etree import ElementTree +import re +import logging + +from .nodes import SynsetNode, LexicalUnitNode +from .. import exceptions as exc +from ..enums import PoS, Domain + + +__all__ = 'ubylmf_reader', + +ENCODING = 'utf-8' + +_logger = logging.getLogger(__name__) + + +def ubylmf_reader(ubylmf_file): + """Read PLwordnet iteratively, element by element. 
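+
+    A typical use, consuming nodes lazily (the file name is illustrative):
+
+        >>> for node in ubylmf_reader('plwn-uby.xml'):
+        ...     print(node.id)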
+ + :param ubylmf_file: the name of UMY-LMF file or an opened file itself. + :type ubylmf_file: str or file + + :return: a generator over PLwordnet entities. + :rtype: generator + """ + contex = ElementTree.iterparse(ubylmf_file) # catch only end events + contex = iter(contex) + # Get root elem in order to clear it after reading each elem + try: + _, root = next(contex) + except StopIteration: + raise exc.ReaderException('The xml file is empty') + # Generate wordnet's elements + for _, elem in contex: + entities = [] + # Parse entities + if elem.tag == "LexicalEntry": + for xml_sense in elem.findall("Sense"): + # Don't move it before if - we still want to generate tuples + # even if one sense is broken. + try: + entities.append(_make_lexicalunit(elem, xml_sense)) + except Exception: + _logger.exception( + '\n%s\nIN ELEMENT\n%s', + ElementTree.tostring(xml_sense, ENCODING), + ElementTree.tostring(elem, ENCODING) + ) + elif elem.tag == "Synset": + try: + entities.append(_make_synset(elem)) + except Exception: + _logger.exception('\n%s', ElementTree.tostring(elem, ENCODING)) + # Return entities + if entities: + root.clear() + for entity in entities: + yield entity + + +def _make_lexicalunit(xml_lexicalentry, xml_sense): + """Return a lexical unit built from an xml element. + + :param xml_lexicalentry: an xml element of LexicalUnit read from a file. + :type xml_lexicalentry: xml.etree.ElementTree.Element + :param xml_sense: <Sense> element that belongs to the LexicalUnit + :type xml_sense: xml.etree.ElementTree.Element + + :return: a named tuple LexicalUnitNode + :rtype: LexicalUnitNode + """ + # Get id, synset and variant + lu_id = _extract_id(xml_sense.get("id")) + lu_synset = _extract_id(xml_sense.get("synset")) + lu_variant = int(xml_sense.get("index")) + # Get lemma + xml_lemma = xml_lexicalentry.find("Lemma").find("FormRepresentation") + lu_lemma = xml_lemma.get("writtenForm") + assert lu_lemma, "Lemma is empty" + # Get PoS + lu_pos = xml_lexicalentry.get("partOfSpeech") + assert lu_pos, "PoS is empty" + # Get definition - can be empty! 
At most 2 + lu_definition, lu_usage_notes, lu_external_links = \ + _extract_definitions(xml_sense) + # Get usage examples + lu_examples = [] + lu_examples_sources = [] + for xe in xml_sense.findall("SenseExample"): + example = xe.find("TextRepresentation").get("writtenText").strip() + if example: + exm_src_match = re.search(r'\[##([-\w]+):?\]$', example, re.U) + if exm_src_match is not None: + lu_examples.append(example[:exm_src_match.start(0)]) + lu_examples_sources.append(exm_src_match.group(1)) + else: + _logger.warning("Malformed sense example: %s", example) + # Get semantic labels + lu_domain = _get_domain(xml_sense) + # Get related + lu_related = [] + for xsr in xml_sense.findall("SenseRelation"): + try: + lu_related.append( + (xsr.get("relName"), _extract_id(xsr.get("target"))) + ) + except exc.MalformedIdentifierException: + _logger.exception( + '\n%s\nIN ELEMENT\n%s\nThis relation is skipped.', + ElementTree.tostring(xsr, ENCODING), + ElementTree.tostring(xml_sense, ENCODING) + ) + # Get unit index + lu_unit_index = int(_extract_id( + xml_sense.find("MonolingualExternalRef").get("externalReference")) + ) + return LexicalUnitNode( + id=lu_id, + lemma=lu_lemma, + pos=PoS(lu_pos), + synset=lu_synset, + variant=lu_variant, + unit_index=lu_unit_index, + definition=lu_definition, + usage_notes=tuple(lu_usage_notes), + external_links=tuple(lu_external_links), + examples=tuple(lu_examples), + examples_sources=tuple(lu_examples_sources), + # The domain label is in format <pos>.<lang>_<name>; the last one is + # the only one we care about. + domain=Domain[lu_domain.rsplit('_', 1)[-1]], + related=tuple(lu_related), + # The below properties are never stored in uby files (at present at + # least). + verb_aspect=None, + emotion_markedness=None, + emotion_names=(), + emotion_valuations=(), + emotion_example_1=None, + emotion_example_2=None, + ) + + +def _extract_definitions(xml_sense): + """Extract a definition, notes and links of a LU from <Definition> tags. + + :param xml_sense: <Sense> element read from an xml file. + :type xml_sense: xml.etree.ElementTree.Element + + :return: the definition, usage notes and external links of the LU. + :rtype: (str or unicode, tuple, tuple) + + :raises AssertionError: if there is more than 2 <Definition> tags. + """ + # Get definition - can be empty! At most 2 + xml_definitions = xml_sense.findall("Definition") + lu_definition = "" + lu_usage_notes = [] + lu_external_links = [] + assert len(xml_definitions) <= 2, \ + "Too many definitions ({:})".format(len(xml_definitions)) + # There is at least one <Definition> + if xml_definitions: + children = list(xml_definitions[0]) + # Check whether the first child is the real definition + if children[0].tag == "TextRepresentation": + lu_definition = children[0].get("writtenText") + # <Statement> - the rest of children + children = list(xml_definitions[1]) \ + if len(xml_definitions) == 2 else [] + # Get additional info + for child in children: + if child.get("statementType", "") == "usageNote": + lu_usage_notes.append( + child.find("TextRepresentation").get("writtenText") + ) + if child.get("statementType", "") == "externalReference": + lu_external_links.append( + child.find("TextRepresentation").get("writtenText") + ) + return lu_definition, lu_usage_notes, lu_external_links + + +def _get_domain(xml_sense): + """Extract a domain of a LU from <SemanticLabel> tags. + + :param xml_sense: <Sense> element read from an xml file. + :type xml_sense: xml.etree.ElementTree.Element + + :return: the domain of the LU. 
+ :rtype: str or unicode + + :raises AssertionError: if there is more than 1 <SemanticLabel> tags, + no tag at all or its type is different from domain. + """ + xml_semantic_labels = xml_sense.findall("SemanticLabel") + assert len(xml_semantic_labels) == 1, \ + "{:} SemanticLabel found, should be 1".format(len(xml_semantic_labels)) + assert xml_semantic_labels[0].get("type", "") == "domain", \ + ("SemanticLabel has type {:} instead of domain" + "").format(xml_semantic_labels[0].get("type").encode(ENCODING)) + return xml_semantic_labels[0].get("label") + + +def _make_synset(xml_synset): + """Return a synset built from an xml element. + + :param xml_synset: an xml element of Synset read from a file. + :type xml_synset: xml.etree.Element + + :return: a named tuple SynsetNode + :rtype: SynsetNode + """ + s_id = _extract_id(xml_synset.get("id")) + xml_def = xml_synset.find("Definition") + s_def = xml_def.find("TextRepresentation").get("writtenText") \ + if xml_def is not None else "" + s_related = [] + for xsr in xml_synset.findall("SynsetRelation"): + try: + s_related.append( + (xsr.get("relName"), _extract_id(xsr.get("target"))) + ) + except exc.MalformedIdentifierException: + _logger.exception( + '\n%s\nIN ELEMENT\n%s\nThis relation is skipped.', + ElementTree.tostring(xsr, ENCODING), + ElementTree.tostring(xml_synset, ENCODING) + ) + return SynsetNode( + id=s_id, + definition=s_def, + related=tuple(s_related) + ) + + +def _extract_id(full_id): + """Extract only numerical identifier from the end of a full id. + + :param full_id: a full identifier that has a prefix before the real id. + :type full_id: str|unicode + + :return: a real, numerical id. + :rtype: int + + :raises MalformedIdentifierException: if the original id doesn't end with + digits. + """ + try: + return int(re.findall(r"\d+$", full_id)[0]) + except IndexError: + raise exc.MalformedIdentifierException(full_id) + + +_this_reader_ = ubylmf_reader diff --git a/plwn/readers/wndb.py b/plwn/readers/wndb.py new file mode 100644 index 0000000000000000000000000000000000000000..1677f3ff629de48e09201e39d2df4be517a02834 --- /dev/null +++ b/plwn/readers/wndb.py @@ -0,0 +1,284 @@ +# coding: utf8 +from __future__ import absolute_import, division + +import collections as coll +import contextlib as ctxl +import logging + +import sqlalchemy as sa + +from .nodes import SynsetNode, LexicalUnitNode +from .comments import parse_comment_string +from ..enums import ( + PoS, + VerbAspect, + EmotionMarkedness, + EmotionName, + EmotionValuation, + Domain, +) +from ..utils.sorting import text_key + + +__all__ = 'wndb_reader', + + +_log = logging.getLogger(__name__) + +_EmotionData = coll.namedtuple( + '_EmotionData', + ('mark', 'names', 'valuations', 'example_1', 'example_2'), +) + + +def wndb_reader(wordnet_db_url): + """Generate UBY-LMF format compatible records directly from plWordNet + database. + + sqlalchemy is required for this method to work. + + :param str wordnet_db_url: URL in sqlalchemy format, pointing to a + plWordNet database. + + :return: a generator over PLwordnet entities. 
+ :rtype: generator + """ + + db_eng = sa.create_engine(wordnet_db_url) + db_meta = sa.MetaData(db_eng) + visited_synsets = set() + nonexistent_synsets = set() + + # Define required tables + dbt_synset = sa.Table(u'synset', db_meta, autoload=True) + dbt_synrel = sa.Table(u'synsetrelation', db_meta, autoload=True) + dbt_reltype = sa.Table(u'relationtype', db_meta, autoload=True) + dbt_lexunit = sa.Table(u'lexicalunit', db_meta, autoload=True) + dbt_lexrel = sa.Table(u'lexicalrelation', db_meta, autoload=True) + dbt_uns = sa.Table(u'unitandsynset', db_meta, autoload=True) + dbt_emo = sa.Table(u'emotion', db_meta, autoload=True) + + q = sa.select(( + dbt_lexunit.c.ID, + dbt_lexunit.c.lemma, + dbt_lexunit.c.pos, + dbt_lexunit.c.variant, + dbt_uns.c.SYN_ID, + dbt_uns.c.unitindex, + dbt_lexunit.c.domain, + dbt_lexunit.c.comment, + dbt_lexunit.c.verb_aspect, + )).select_from( + dbt_lexunit.join( + dbt_uns, + dbt_uns.c.LEX_ID == dbt_lexunit.c.ID, + ) + ).where(dbt_lexunit.c.pos.between(1, 4)) + + with ctxl.closing(db_eng.execute(q)) as result: + for lexid, lemma, pos, variant, synid, uidx, domain, comment,\ + verb_aspect in result: + + if synid in nonexistent_synsets: + continue + + # Select all relations children of the unit + q = sa.select( + (dbt_lexrel.c.CHILD_ID, dbt_reltype.c.name) + ).select_from( + dbt_lexrel.join( + dbt_reltype, + dbt_reltype.c.ID == dbt_lexrel.c.REL_ID, + ) + ).where(dbt_lexrel.c.PARENT_ID == lexid) + + with ctxl.closing(db_eng.execute(q)) as lex_rel_result: + # Ensure relations targets exist + lex_related = [] + for lex_child_id, lex_rel_name in lex_rel_result: + q = sa.select(( + sa.exists().select_from( + # This join to ensure the unit belongs to + # some synset. + dbt_lexunit.join( + dbt_uns, + dbt_uns.c.LEX_ID == dbt_lexunit.c.ID, + ) + ).where(sa.and_( + dbt_lexunit.c.ID == lex_child_id, + dbt_lexunit.c.pos.between(1, 4), + )), + )) + + if db_eng.execute(q).scalar(): + lex_related.append((lex_rel_name, lex_child_id)) + + # Now, select the unit's synset, but only once + if synid not in visited_synsets: + visited_synsets.add(synid) + + q = sa.select( + (dbt_synset.c.ID, dbt_synset.c.definition) + ).where(dbt_synset.c.ID == synid) + + synrow = db_eng.execute(q).first() + + if synrow is None: + nonexistent_synsets.add(synid) + continue + + # Select all relation children of the synset + q = sa.select( + (dbt_synrel.c.CHILD_ID, dbt_reltype.c.name) + ).select_from( + dbt_synrel.join( + dbt_reltype, + dbt_reltype.c.ID == dbt_synrel.c.REL_ID, + ) + ).where(dbt_synrel.c.PARENT_ID == synid) + + with ctxl.closing(db_eng.execute(q)) as syn_rel_result: + syn_related = [] + for syn_child_id, syn_rel_name in syn_rel_result: + # Ensure the child exists + q = sa.select(( + sa.exists().select_from( + dbt_synset.join( + dbt_uns, + dbt_uns.c.SYN_ID == dbt_synset.c.ID, + ).join( + dbt_lexunit, + dbt_lexunit.c.ID == dbt_uns.c.LEX_ID, + ) + ).where(sa.and_( + dbt_synset.c.ID == syn_child_id, + dbt_lexunit.c.pos.between(1, 4), + )), + )) + + if db_eng.execute(q).scalar(): + syn_related.append((syn_rel_name, syn_child_id)) + + yield SynsetNode( + synid, + synrow[1] if synrow[1] is not None else u'', + tuple(syn_related), + ) + + # Try getting emotion annotations for the unit + emo_data = _extract_emotion_data(db_eng, dbt_emo, lexid) + + # Now, parse the comment string to get some last pieces of data + cmt_data = parse_comment_string(comment + if comment is not None + else u'') + + yield LexicalUnitNode( + id=lexid, + lemma=lemma, + pos=PoS.by_db_number(pos), + variant=variant, + 
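+                # (The fields below mix DB row values with cmt_data from
+                # parse_comment_string and emo_data from
+                # _extract_emotion_data.)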
synset=synid, + unit_index=uidx, + definition=cmt_data.definition, + usage_notes=cmt_data.usage, + external_links=cmt_data.links, + examples=cmt_data.examples, + examples_sources=cmt_data.examples_sources, + # XXX Since domains are defined as strings, the int is cast + # to unicode. It's possible, in the future to add a + # translation dict to textual representations. + domain=Domain.by_db_number(domain), + related=tuple(lex_related), + verb_aspect=VerbAspect.by_db_number(verb_aspect, True), + emotion_markedness=EmotionMarkedness.normalized(emo_data.mark) + if emo_data.mark is not None else None, + emotion_names=_make_enum_tuple( + EmotionName, + sorted(emo_data.names, key=text_key), + ), + emotion_valuations=_make_enum_tuple( + EmotionValuation, + sorted(emo_data.valuations, key=text_key), + ), + emotion_example_1=emo_data.example_1, + emotion_example_2=emo_data.example_2, + ) + + +def _extract_emotion_data(db_eng, db_t_emo, unit_id): + q_emo = sa.select(( + db_t_emo.c.markedness, # XXX Typo in schema + db_t_emo.c.emotions, + db_t_emo.c.valuations, + db_t_emo.c.example1, + db_t_emo.c.example2, + db_t_emo.c.unitStatus, + )).where(db_t_emo.c.lexicalunit_id == unit_id).order_by( + # "super_anotation" is a boolean 0 or 1, so descending sort will put + # the super annotation first. + db_t_emo.c.super_anotation.desc() # XXX Typo in schema + ) + + mark = None + names = set() + valuations = set() + example_1 = None + example_2 = None + + with ctxl.closing(db_eng.execute(q_emo)) as result: + for row in result: + if not row[db_t_emo.c.unitStatus]: + return _EmotionData( + mark=None, + names=(), + valuations=(), + example_1=None, + example_2=None, + ) + + if mark is None: + mark = row[db_t_emo.c.markedness] + if example_1 is None: + example_1 = row[db_t_emo.c.example1] + if example_2 is None: + example_2 = row[db_t_emo.c.example2] + + row_names = row[db_t_emo.c.emotions] + if row_names is not None: + names.update( + word.strip() + for word in row_names.split(u';') + ) + + row_valuations = row[db_t_emo.c.valuations] + if row_valuations is not None: + valuations.update( + word.strip() + for word in row_valuations.split(u';') + ) + + return _EmotionData( + mark=mark, + names=names, + valuations=valuations, + example_1=example_1, + example_2=example_2, + ) + + +def _make_enum_tuple(enumtype, source): + result = [] + + for item in source: + try: + val = enumtype(item) + except ValueError: + _log.warning('Omitting bad value %r of enum %r', item, enumtype) + else: + result.append(val) + + return tuple(result) + + +_this_reader_ = wndb_reader diff --git a/plwn/readers/wnxml.py b/plwn/readers/wnxml.py new file mode 100644 index 0000000000000000000000000000000000000000..acfbc494906922ca282600197e3ab770bf669160 --- /dev/null +++ b/plwn/readers/wnxml.py @@ -0,0 +1,210 @@ +# coding: utf8 +from __future__ import absolute_import, division + + +from collections import defaultdict +import itertools as itt +import logging +import xml.etree.ElementTree as et + +import six + +from .comments import parse_comment_string +from .nodes import SynsetNode, LexicalUnitNode +from ..enums import PoS, Domain + + +__all__ = 'wnxml_reader', + + +_log = logging.getLogger(__name__) + +_POSES = { + u'rzeczownik': PoS.n, + u'czasownik': PoS.v, + u'przymiotnik': PoS.adj, + u'przysłówek': PoS.adv, +} + + +# Since etree may return either unicode or byte strings, all strings returned +# by its interfaces are wrapped with six.text_type + + +def wnxml_reader(wnxml_file): + """Generate plWordNet records from the official XML file. 
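+
+    A minimal iteration sketch (the file name is illustrative)::
+
+        for node in wnxml_reader('plwn.xml'):
+            ...  # SynsetNode records first, then LexicalUnitNode records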
+ + :param str wnxml_file: Path to the plWordNet XML file to read from. + + :return: a generator over PLwordnet entities. + :rtype: generator + """ + + # The regrettably huge global storage for yielding + synsets = {} + lexunits = {} + synid_n_lexids = [] + reltypes_syn = {} + reltypes_lex = {} + # These need defaults to add instances to parent syn / lex + synrels = defaultdict(list) + lexrels = defaultdict(list) + + # Now, parse everything + for _, elem in et.iterparse(wnxml_file): + if elem.tag == u'lexical-unit': + _make_lexunit(elem, lexunits) + elif elem.tag == u'synset': + _make_synset(elem, synsets, synid_n_lexids) + elif elem.tag == u'relationtypes': + _make_reltype(elem, reltypes_syn, reltypes_lex) + elif elem.tag == u'synsetrelations': + _make_rel(elem, synrels) + elif elem.tag == u'lexicalrelations': + _make_rel(elem, lexrels) + + # Finalize units to synsets mapping + _make_units2synsets(lexunits, synid_n_lexids) + + # Now complete synsets and lexunits with relations and yield + for node in itt.chain( + _make_gen(synsets, synrels, reltypes_syn), + _filter_nosynset(_make_gen(lexunits, lexrels, reltypes_lex)), + ): + yield node + + +_this_reader_ = wnxml_reader + + +def _make_lexunit(lu_node, lu_dict): + # Only words will pl poses will be remembered + xmlpos = six.text_type(lu_node.get(u'pos')) + + if xmlpos not in _POSES: + return + + lu_id = int(lu_node.get(u'id')) + cmt_data = parse_comment_string(six.text_type(lu_node.get(u'desc'))) + # Create a temporal object which will be filled later + lu_dict[lu_id] = LexicalUnitNode( + id=lu_id, + lemma=six.text_type(lu_node.get(u'name')), + pos=_POSES[xmlpos], + variant=int(lu_node.get(u'variant')), + synset=None, + unit_index=None, + definition=cmt_data.definition, + usage_notes=cmt_data.usage, + external_links=cmt_data.links, + examples=cmt_data.examples, + examples_sources=cmt_data.examples_sources, + domain=Domain[lu_node.get(u'domain')], + related=None, + # The below properties are not stored in wnxml (at least in present) + verb_aspect=None, + emotion_markedness=None, + emotion_names=(), + emotion_valuations=(), + emotion_example_1=None, + emotion_example_2=None, + ) + + +def _make_synset(syn_node, syn_dict, snu_list): + # Only take non-abstract synsets + if six.text_type(syn_node.get(u'abstract')) != u'false': + return + + synid = int(syn_node.get(u'id')) + # Assign lexical units to synsets they belong to. 
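+    # (Each snu_list entry is (synset_id, [unit_id, ...]); a unit's position
+    # in that list later becomes its unit_index.)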
+ snu_list.append((synid, [int(uid_node.text) + for uid_node in syn_node.iter(u'unit-id')])) + # As with lexunits, related field is not yet filled + syn_dict[synid] = SynsetNode( + synid, + six.text_type(syn_node.get(u'definition')), + None, + ) + + +def _make_units2synsets(lu_dict, snu_list): + for synid, lexids in snu_list: + for uidx, uid in enumerate(lexids): + try: + lu = lu_dict[uid] + except KeyError: + _log.warning( + 'Unit %d from synset %d does not exist', + uid, + synid, + ) + else: + lu_dict[uid] = lu._replace(synset=synid, unit_index=uidx) + + +# Relation types are spelled in descriptive names +_RELTYPE_SYN = u'relacja pomiędzy synsetami' +_RELTYPE_LEX = u'relacja leksykalna' + + +def _make_reltype(reltype_node, synreltype_dict, lureltype_dict): + relid = int(reltype_node.get(u'id')) + typestr = reltype_node.get(u'type') + + if typestr == _RELTYPE_SYN: + the_dict = synreltype_dict + elif typestr == _RELTYPE_LEX: + the_dict = lureltype_dict + else: + # There is one more relation type, synonymy, but it's artificial + return + + # Remember the name so that will be inserted into the reltype storages + the_dict[relid] = six.text_type(reltype_node.get(u'name')) + + +# Relations are put into dicts indexed by parent IDs, to be later put into +# nodes. One function can handle both types. +def _make_rel(node, reldict): + # Get reltype - drop if unknown + reldict[int(node.get(u'parent'))].append(( + int(node.get(u'child')), + # Reltypes should be returned by names, not IDs + int(node.get(u'relation')), + )) + + +# As with relation, yielding is general for syn / lexes. +# Related IDs need to be added, and those not known purged. +def _make_gen(node_dict, rels_dict, reltype_dict): + for node in six.itervalues(node_dict): + related = [] + for child_id, rel_id in rels_dict.get(node.id, ()): + try: + relname = reltype_dict[rel_id] + except KeyError: + _log.warning( + 'Unknown relation %d (of %s), from %d to %d', + rel_id, + node.__class__.__name__, + node.id, + child_id, + ) + continue + + # Only remember from the related dict the items whose IDs are in + # the node dict. + if child_id in node_dict: + related.append((child_id, relname)) + related.append((relname, child_id)) + yield node._replace(related=related) + + +# Addendum to _make_gen for lexical units to filter synsetless ones +def _filter_nosynset(lu_node_gen): + for lu_node in lu_node_gen: + if lu_node.synset is None: + _log.warning('Unit %d belongs to no synset', lu_node.id) + else: + yield lu_node diff --git a/plwn/relation_aliases.tsv b/plwn/relation_aliases.tsv new file mode 100644 index 0000000000000000000000000000000000000000..b7f87a60bacacb416c9577923ca6bd8558ef3baa --- /dev/null +++ b/plwn/relation_aliases.tsv @@ -0,0 +1,5 @@ +hiperonimia hiper +hiponimia hipo +deminutywność dem +holonimia holo +meronimia mero diff --git a/plwn/relresolver.py b/plwn/relresolver.py new file mode 100644 index 0000000000000000000000000000000000000000..92eec11aef02bc9747f1d8c73f938ef779c61123 --- /dev/null +++ b/plwn/relresolver.py @@ -0,0 +1,130 @@ +from __future__ import absolute_import, division + + +from contextlib import closing +import logging + +import pkg_resources as pkgr +import six + + +__all__ = 'RelationResolver', 'get_default_relation_resolver' + + +_DEFAULT_RESOLVER_LOC = 'plwn', 'relation_aliases.tsv' +_default_resolver_obj = None + +_log = logging.getLogger(__name__) + + +class RelationResolver(object): + """ + Stores dictionary of relation name aliases to full names. 
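+
+    A short usage sketch (the alias and name strings are illustrative)::
+
+        rr = RelationResolver({u'hiper': u'hiperonimia'})
+        rr.resolve_name(u'hiper')      # -> u'hiperonimia'
+        rr.resolve_name(u'synonimia')  # unknown names pass through unchanged
+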
+ """ + + @classmethod + def from_tsv(cls, tsv_stream): + """ + Creates an instance from a TSV file. + + The first item of each line should be the full name, and every other + should be an alias (similar to ``from_reverse_dict``). + + :param tsv_stream: The stream from which TSV lines are read. + :type tsv_stream: TextIO + + :rtype: RelationResolver + """ + + adict = {} + + for line in tsv_stream: + items = line.strip().split(u'\t') + fullname = items[0].strip() + for alias in items[1:]: + adict[alias.strip()] = fullname + + return cls(adict) + + @classmethod + def from_reverse_dict(cls, rdict): + """ + Creates an instance from a dictionary mapping full names to lists of + aliases that should resolve to them. + + :type rdict: Mapping[str, List[str]] + + :rtype: RelationResolver + """ + + adict = {} + + for full, aliases in six.iteritems(rdict): + for alias in aliases: + adict[alias] = full + + return cls(adict) + + def __init__(self, aliases): + """ + :param aliases: Dictionary (or pairs sequence) mapping relation aliases + to full names. + :type aliases: Mapping[str, str] + """ + + self._aliases = dict(aliases) + + def add_alias(self, alias, fullname): + """ + Add a new alias to the dictionary: + + :param str alias: The alias. + + :param str fullname: The name the alias will resolve to. + """ + + self._aliases[alias] = fullname + + def resolve_name(self, relname): + """ + Resolve a possible alias to a full name. If ``relname`` is not a known + alias, it's returned unchanged. + + :param str relname: The relation name that may be an alias that needs + to be resolved. + + :return: ``relname`` or, if it's an alias, the full name it resolves + to. + :rtype: str + """ + + return self._aliases.get(relname, relname) + + +def get_default_relation_resolver(): + """ + Create an instance of ``RelationResolver`` that loads a file with all + default relation name aliases. + + The default aliases TSV file is located in ``plwn`` package root, as + ``relation_aliases.tsv``. + + :return: The default ``RelationResolver`` instance, initialized once on the + first call. + :rtype: RelationResolver + """ + + global _default_resolver_obj + + if _default_resolver_obj is None: + try: + with closing(pkgr.resource_stream(*_DEFAULT_RESOLVER_LOC)) \ + as tsv_in: + _default_resolver_obj = RelationResolver.from_tsv( + line.decode('utf8') for line in tsv_in + ) + except IOError: + _log.exception('Failed to load default aliases file') + _default_resolver_obj = RelationResolver({}) + + return _default_resolver_obj diff --git a/plwn/storages/__init__.py b/plwn/storages/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/plwn/storages/objects.py b/plwn/storages/objects.py new file mode 100644 index 0000000000000000000000000000000000000000..37fa7f3cf7c6debf1ed4232458e21a0b2edb801d --- /dev/null +++ b/plwn/storages/objects.py @@ -0,0 +1,518 @@ +"""Implementation which stores data in plain python objects. Should be fairly +fast to construct, but querying and memory efficiencies may not be too great. +""" + +from __future__ import absolute_import, absolute_import + + +import collections as coll +import logging +import operator as op + +import six +from six.moves import cPickle + +from ..readers import nodes as nd +from ..enums import PoS +from ..relresolver import get_default_relation_resolver +from ..utils.tupwrap import tup_wrapped, TupWrapper +from ..utils.sorting import text_key +from .. 
import bases, exceptions as exc + + +__all__ = 'PLWordNet', 'Synset', 'LexicalUnit' + + +_log = logging.getLogger(__name__) + + +class PLWordNet(bases.PLWordNetBase): + + _STORAGE_NAME = 'objects' + _SCHEMA_VERSION = 2 + + @classmethod + def from_reader(cls, reader, dump_to=None): + obj = cls() + obj.__read_data(reader) + + if dump_to is not None: + with open(dump_to, 'wb') as dump_ofs: + cPickle.dump(obj, dump_ofs, cPickle.HIGHEST_PROTOCOL) + + return obj + + @classmethod + def from_dump(cls, dump): + with open(dump, 'rb') as dump_ifs: + obj = cPickle.load(dump_ifs) + + if not isinstance(obj, cls): + raise exc.LoadException( + 'Unpickled object is not an instance of ' + repr(cls) + ) + + if not hasattr(obj, '_version') or obj._version != cls._SCHEMA_VERSION: + raise exc.DumpVersionException( + getattr(obj, '_version', None), + cls._SCHEMA_VERSION, + ) + + return obj + + @staticmethod + def __fill_id_reldict(src_node, id_rel_dict, id_set): + rels = coll.defaultdict(list) + for relname, reltarget in src_node.related: + if reltarget not in id_set: + _log.warning( + 'Target %d of relation %s from %d does not exist', + reltarget, + relname, + src_node.id, + ) + else: + rels[relname].append(reltarget) + + id_rel_dict[src_node.id] = coll.OrderedDict( + (relname, tuple(rels[relname])) + for relname in sorted(rels, key=text_key) + ) + + @staticmethod + def __gen_item_reldict(id_rel_dict, item_rel_dict, item_dict): + for src_id, rel_dict in six.iteritems(id_rel_dict): + irel_dict = coll.OrderedDict() + for relname, trg_ids in six.iteritems(rel_dict): + trg_items = [] + for trg_id in rel_dict[relname]: + try: + trg_item = item_dict[trg_id] + except KeyError: + _log.warning( + 'Target %d of relation %s from %d does not exist', + trg_id, + relname, + src_id, + ) + else: + trg_items.append(trg_item) + + if trg_items: + irel_dict[relname] = tuple(trg_items) + + if irel_dict: + item_rel_dict[src_id] = irel_dict + + def __init__(self): + """**NOTE:** This constructor should not be invoked directly. Use one + of the standard methods: ``from_dump`` or ``from_reader``. + """ + + super(PLWordNet, self).__init__() + + # Remember the version for unpickling check + self._version = self._SCHEMA_VERSION + + # Master indexes + self._synsets = coll.OrderedDict() + self._units = coll.OrderedDict() + + # Secondary indexes for lookup of units by lemma, pos and var + self._i_lem_pos_var = {} + self._i_lem_pos = coll.defaultdict(list) + self._i_lem_var = coll.defaultdict(list) + self._i_lem = coll.defaultdict(list) + self._i_pos = coll.defaultdict(list) + # No index for lookup by var! That's the slow way. + + # Relations: indexed by id and then relation names; the second one + # should be ordered. 
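+        # Final shape: {src_id: OrderedDict(relation_name -> (target, ...))}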
+ self._synrels = {} + self._lexrels = {} + + def lexical_unit_by_id(self, id_): + try: + return self._units[id_] + except KeyError: + raise exc.InvalidLexicalUnitIdentifierException(id_) + + @tup_wrapped + def lexical_units(self, lemma=None, pos=None, variant=None): + if lemma is not None and pos is not None and variant is not None: + # Yield only one unit since it must be it if it exists + try: + yield self._i_lem_pos_var[lemma, PoS(pos), variant] + except KeyError: + pass + finally: + return + + if lemma is not None and pos is not None: + retlist = self._i_lem_pos.get((lemma, PoS(pos)), ()) + elif lemma is not None and variant is not None: + retlist = self._i_lem_var.get((lemma, variant), ()) + elif lemma is not None: + retlist = self._i_lem.get(lemma, ()) + elif pos is not None: + retlist = self._i_pos.get(PoS(pos), ()) + else: + # Hoo boy, it's bad + retlist = self._select_lexunits(lemma, PoS(pos), variant) + + for lu in retlist: + yield lu + + def lexical_unit(self, lemma, pos, variant): + try: + return self._i_lem_pos_var[lemma, PoS(pos), variant] + except KeyError: + raise exc.LexicalUnitNotFound(lemma, pos, variant) + + def synset_by_id(self, id_): + try: + return self._synsets[id_] + except KeyError: + raise exc.InvalidSynsetIdentifierException(id_) + + @tup_wrapped + def synsets(self, lemma=None, pos=None, variant=None): + for lu in self.lexical_units(lemma, pos, variant): + yield lu.synset + + def synset(self, lemma, pos, variant): + try: + return self._i_lem_pos_var[lemma, PoS(pos), variant].synset + except KeyError: + raise exc.SynsetNotFound(lemma, pos, variant) + + def synset_relation_edges(self, include=None, exclude=None): + return TupWrapper(self._iter_reledges(self._synrels, include, exclude)) + + def lexical_relation_edges(self, include=None, exclude=None): + return TupWrapper(self._iter_reledges(self._lexrels, include, exclude)) + + def _select_lexunits(self, lemma, pos, variant): + # The "slow way" (indexless) of selecting lexical units + for lu in six.itervalues(self._units): + if ((lemma is None or lemma == lu._lemma) and + (pos is None or pos is lu._pos) and + (variant is None or variant == lu._var)): + yield lu + + def _iter_reledges(self, reledges, include, exclude): + # Ensure those are sets + include = frozenset( + self._rel_resolver.resolve_name(rel) for rel in include + ) if include is not None else None + exclude = frozenset( + self._rel_resolver.resolve_name(rel) for rel in exclude + ) if exclude is not None else None + + for src, reldict in six.iteritems(reledges): + for relname, targets in six.iteritems(reldict): + if ((include is None or relname in include) and + (exclude is None or relname not in exclude)): + for trg in targets: + yield bases.RelationEdge( + source=src, + relation=relname, + target=trg, + ) + + def __read_data(self, reader): + # Nodes need to be separated and sorted before being pushed to indexes. + syn_nodes = {} + ordered_synids = [] + lex_nodes = {} + # Ordered AND filtered + ordered_lex_nodes = [] + # The association will remember unit indices + s2u = coll.defaultdict(list) + # Temporary id relation dicts + id_lex_rels = {} + id_syn_rels = {} + + for node in reader: + if isinstance(node, nd.SynsetNode): + syn_nodes[node.id] = node + else: + lex_nodes[node.id] = node + + # First iterate over lex nodes to establish the unit-synset + # relationships and sort out synsets and lexunits that don't exist. 
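+        # (Synsets are materialized afterwards, only for the ids that units
+        # actually reference; unit-less synsets only get a warning.)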
+ for lex_node in six.itervalues(lex_nodes): + if lex_node.synset not in syn_nodes: + _log.warning( + 'Synset %d from unit %d does not exist', + lex_node.id, + lex_node.synset, + ) + else: + s2u[lex_node.synset].append((lex_node.unit_index, lex_node.id)) + ordered_synids.append(lex_node.synset) + ordered_lex_nodes.append(lex_node) + + # Sort by lemma! + ordered_lex_nodes.sort(key=lambda node: text_key(node.lemma)) + + # Insert lexical unit objects into ordered dict + for lex_node in ordered_lex_nodes: + self._units[lex_node.id] = LexicalUnit( + self, + lex_node.id, + lex_node.lemma, + lex_node.pos, + lex_node.variant, + lex_node.synset, + lex_node.definition, + tuple(lex_node.usage_notes), + tuple(lex_node.external_links), + tuple(lex_node.examples), + tuple(lex_node.examples_sources), + lex_node.domain, + lex_node.verb_aspect, + lex_node.emotion_markedness, + tuple(lex_node.emotion_names), + tuple(lex_node.emotion_valuations), + lex_node.emotion_example_1, + lex_node.emotion_example_2, + ) + + self.__fill_id_reldict(lex_node, id_lex_rels, lex_nodes) + + # Now, insert synsets in the right order + for synid in ordered_synids: + if synid in self._synsets: + continue + + syn_node = syn_nodes[synid] + # Sort units by index first + synunits = s2u[synid] + synunits.sort(key=op.itemgetter(0)) + + self._synsets[synid] = Synset( + self, + synid, + (it[1] for it in synunits), + syn_node.definition, + ) + + # Relations are done similarly to lex ones + self.__fill_id_reldict(syn_node, id_syn_rels, syn_nodes) + + # But what if there are synsets that have no units? + for synid in syn_nodes: + if synid not in self._synsets: + _log.warning('Synset %d has no units', synid) + + # We can convert id rel dicts now + self.__gen_item_reldict(id_lex_rels, self._lexrels, self._units) + self.__gen_item_reldict(id_syn_rels, self._synrels, self._synsets) + + # We can build indexes now + for lu in six.itervalues(self._units): + self._i_lem_pos_var[lu._lemma, lu._pos, lu._var] = lu + self._i_lem_pos[lu._lemma, lu._pos].append(lu) + self._i_lem_var[lu._lemma, lu._var].append(lu) + self._i_lem[lu._lemma].append(lu) + self._i_pos[lu._pos].append(lu) + + +class LexicalUnit(bases.LexicalUnitBase): + + __slots__ = ( + '_relr', + '_wn', + '_id', + '_lemma', + '_pos', + '_var', + '_synid', + '_def', + '_usn', + '_extl', + '_exms', + '_exms_srcs', + '_dom', + '_va', + '_emo_mark', + '_emo_names', + '_emo_valuations' + '_emo_ex1', + '_emo_ex2', + ) + + def __init__(self, + wn, + lexid, + lemma, + pos, + variant, + synid, + def_, + usn, + extl, + exms, + exms_srcs, + dom, + va, + emo_mark, + emo_names, + emo_valuations, + emo_ex1, + emo_ex2): + """**NOTE:** This constructor should not be called directly. Use + :class:`PLWordNet` methods to obtain lexical units. 
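+
+        (Apart from ``wn``, the arguments correspond to ``LexicalUnitNode``
+        fields; collection-valued ones are passed as tuples.)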
+ """ + + self._relr = get_default_relation_resolver() + + self._wn = wn + self._id = lexid + self._lemma = lemma + self._pos = pos + self._var = variant + self._synid = synid + self._def = def_ + self._usn = usn + self._extl = extl + self._exms = exms + self._exms_srcs = exms_srcs + self._dom = dom + self._va = va + self._emo_mark = emo_mark + self._emo_names = emo_names + self._emo_valuations = emo_valuations + self._emo_ex1 = emo_ex1 + self._emo_ex2 = emo_ex2 + + @property + def id(self): + return self._id + + @property + def lemma(self): + return self._lemma + + @property + def pos(self): + return self._pos + + @property + def variant(self): + return self._var + + @property + def synset(self): + return self._wn._synsets[self._synid] + + @property + def definition(self): + return self._def + + @property + def sense_examples(self): + return self._exms + + @property + def sense_examples_sources(self): + return self._exms_srcs + + @property + def external_links(self): + return self._extl + + @property + def usage_notes(self): + return self._usn + + @property + def domain(self): + return self._dom + + @property + def verb_aspect(self): + return self._va + + @property + def emotion_markedness(self): + return self._emo_mark + + @property + def emotion_names(self): + return self._emo_names + + @property + def emotion_valuations(self): + return self._emo_valuations + + @property + def emotion_example(self): + return self._emo_ex1 + + @property + def emotion_example_secondary(self): + return self._emo_ex2 + + @property + def relations(self): + # Not caching, since this is an informational method that will probably + # not be called very often. + # The rel dicts should be an ordered dict with relation names as keys. + return tuple(self._wn._lexrels[self._id]) + + def related(self, relation_name): + relname = self._rel_resolver.resolve_name(relation_name) + reldict = self._wn._lexrels[self._id] + try: + return TupWrapper(iter(reldict[relname])) + except KeyError: + raise exc.InvalidRelationNameException(relation_name) + + +class Synset(bases.SynsetBase): + + __slots__ = '_relr', '_wn', '_id', '_units', '_def' + + def __init__(self, wn, synid, unit_ids, def_): + """**NOTE:** This constructor should not be called directly. Use + :class:`PLWordNet` methods to obtain synsets. + """ + + self._relr = get_default_relation_resolver() + + self._wn = wn + self._id = synid + self._units = tuple(wn._units[uid] for uid in unit_ids) + self._def = def_ + + @property + def id(self): + return self._id + + @property + def lexical_units(self): + return self._units + + @property + def definition(self): + return self._def + + @property + def relations(self): + # Not caching, since this is an informational method that will probably + # not be called very often. + # The rel dicts should be an ordered dict with relation names as keys. + return tuple(self._wn._synrels[self._id]) + + def related(self, relation_name): + relname = self._rel_resolver.resolve_name(relation_name) + reldict = self._wn._synrels[self._id] + try: + return TupWrapper(iter(reldict[relname])) + except KeyError: + raise exc.InvalidRelationNameException(relation_name) + + +_this_storage_ = PLWordNet diff --git a/plwn/storages/sqlite.py b/plwn/storages/sqlite.py new file mode 100644 index 0000000000000000000000000000000000000000..17d0691673df46916e24da29090d8235eb52307a --- /dev/null +++ b/plwn/storages/sqlite.py @@ -0,0 +1,1179 @@ +"""Implementation that stores data from plWordNet in a sqlite databse file, +with an impromptu schema. 
+""" + +from __future__ import absolute_import, division +try: + from future_builtins import zip +except ImportError: + pass + +import sqlite3 +from collections import defaultdict +from contextlib import closing +import errno +import itertools as itt +import locale +import logging +import os +import shutil +import tempfile +import weakref + +import six + +from ..readers import nodes as nd +from ..enums import ( + PoS, + VerbAspect, + EmotionMarkedness, + EmotionName, + EmotionValuation, + Domain, +) +from ..relresolver import get_default_relation_resolver +from ..utils.tupwrap import tup_wrapped, TupWrapper +from .. import bases, exceptions as exc + + +__all__ = 'PLWordNet', 'Synset', 'LexicalUnit' + + +_log = logging.getLogger(__name__) + +# SQL script used to initialize the database. +# "locale" collation must be defined on the connection before this is executed. +_DB_SCHEMA_SCRIPT = u""" +PRAGMA foreign_keys = ON; + +-- Metadata table. Used for version number, currently. +CREATE TABLE IF NOT EXISTS plwn_meta ( + name TEXT UNIQUE NOT NULL, + value BLOB +); + +-- Tables for constant values +CREATE TABLE IF NOT EXISTS pos ( + id INTEGER PRIMARY KEY, + value TEXT UNIQUE NOT NULL +); + +CREATE TABLE IF NOT EXISTS verbaspect ( + id INTEGER PRIMARY KEY, + value TEXT UNIQUE NOT NULL +); + +CREATE TABLE IF NOT EXISTS emotionmark ( + id INTEGER PRIMARY KEY, + value TEXT UNIQUE NOT NULL +); + +CREATE TABLE IF NOT EXISTS emotionname ( + id INTEGER PRIMARY KEY, + value TEXT UNIQUE NOT NULL COLLATE locale +); + +CREATE TABLE IF NOT EXISTS emotionvaluation ( + id INTEGER PRIMARY KEY, + value TEXT UNIQUE NOT NULL COLLATE locale +); + +CREATE TABLE IF NOT EXISTS domain ( + id INTEGER PRIMARY KEY, + value TEXT UNIQUE NOT NULL COLLATE locale +); + +-- Synset only gets one simple table +CREATE TABLE IF NOT EXISTS synset ( + id INTEGER PRIMARY KEY, + definition TEXT NOT NULL COLLATE locale +); + +-- Lexical units have several tables, since they have several list-like +-- properties. They also need indexes for lookup. +CREATE TABLE IF NOT EXISTS lexicalunit ( + id INTEGER PRIMARY KEY, + lemma TEXT NOT NULL COLLATE locale, + pos INTEGER NOT NULL + REFERENCES pos (id), + variant INTEGER NOT NULL, + synset INTEGER NOT NULL + REFERENCES synset (id), + unitindex INTEGER NOT NULL, + definition TEXT NOT NULL COLLATE locale, + domain INTEGER NOT NULL + REFERENCES domain (id), + verbaspect INTEGER + REFERENCES verbaspect (id), + emotionmark INTEGER + REFERENCES emotionmark (id), + emotionexample1 TEXT COLLATE locale, + emotionexample2 TEXT COLLATE locale, + + UNIQUE (lemma, pos, variant), + -- Also, each unit needs its of place in synset + UNIQUE (synset, unitindex) +); + +-- lem-pos-var and synset-unitindex indexes (and partial ones) are +-- automatically made because of UNIQUE constraint, but additional indexes +-- need to be created. 
+CREATE INDEX IF NOT EXISTS lex_i_lem_var ON lexicalunit (lemma, variant); +CREATE INDEX IF NOT EXISTS lex_i_pos ON lexicalunit (pos); +-- No index for variant itself - it's not an useful use case + +-- Tables dependant on lexicalunit +CREATE TABLE IF NOT EXISTS senseexample ( + unitid INTEGER NOT NULL + REFERENCES lexicalunit (id), + example TEXT NOT NULL COLLATE locale, + source TEXT NOT NULL COLLATE locale +); +CREATE INDEX IF NOT EXISTS sen_i ON senseexample (unitid); + +CREATE TABLE IF NOT EXISTS externallink ( + unitid INTEGER NOT NULL + REFERENCES lexicalunit (id), + link TEXT NOT NULL COLLATE locale +); +CREATE INDEX IF NOT EXISTS link_i ON externallink (unitid); + +CREATE TABLE IF NOT EXISTS usagenote ( + unitid INTEGER NOT NULL + REFERENCES lexicalunit (id), + note TEXT NOT NULL COLLATE locale +); +CREATE INDEX IF NOT EXISTS note_i ON usagenote (unitid); + +CREATE TABLE IF NOT EXISTS unitemotionname ( + unitid INTEGER NOT NULL + REFERENCES lexicalunit (id), + nameid INTEGER NOT NULL + REFERENCES emotionname (id), + + PRIMARY KEY (unitid, nameid) +); + +CREATE TABLE IF NOT EXISTS unitemotionvaluation ( + unitid INTEGER NOT NULL + REFERENCES lexicalunit (id), + valuationid INTEGER NOT NULL + REFERENCES emotionvaluation (id), + + PRIMARY KEY (unitid, valuationid) +); + +-- Relation tables +CREATE TABLE IF NOT EXISTS synsetrelationtype ( + id INTEGER PRIMARY KEY, + name TEXT UNIQUE NOT NULL COLLATE locale +); +CREATE TABLE IF NOT EXISTS lexicalrelationtype ( + id INTEGER PRIMARY KEY, + name TEXT UNIQUE NOT NULL COLLATE locale +); + +CREATE TABLE IF NOT EXISTS synsetrelation ( + parentid INTEGER NOT NULL + REFERENCES synset (id), + relid INTEGER NOT NULL + REFERENCES synsetrelationtype (id), + childid INTEGER NOT NULL + REFERENCES synset (id), + + PRIMARY KEY (parentid, relid, childid) +); + +CREATE TABLE IF NOT EXISTS lexicalrelation ( + parentid INTEGER NOT NULL + REFERENCES lexicalunit (id), + relid INTEGER NOT NULL + REFERENCES lexicalrelationtype (id), + childid INTEGER NOT NULL + REFERENCES lexicalunit (id), + + PRIMARY KEY (parentid, relid, childid) +); +""" + + +class PLWordNet(bases.PLWordNetBase): + + _STORAGE_NAME = 'sqlite3' + _SCHEMA_VERSION = 4 + + @classmethod + def from_reader(cls, reader, dump_to=None): + plwn = cls(dump_to) + + try: + plwn.__init_db() + plwn.__read_data(reader) + except BaseException: + plwn.close() + raise + + return plwn + + @classmethod + def from_dump(cls, dump): + plwn = cls(dump) + + try: + plwn.__check_db() + except BaseException: + plwn.close() + raise + + return plwn + + @staticmethod + def _make_include_exclude(include, exclude): + """Creates ``WHERE`` clause and the parameter tuple for simple ``IN`` + and ``NOT IN`` case. + """ + + if include is not None: + whereclause = u"WHERE name IN ({})".format( + u','.join(itt.repeat(u'?', len(include))) + ) + includetuple = tuple(include) + else: + whereclause = u'' + includetuple = () + + if exclude is not None: + if not whereclause: + whereclause = u"WHERE name NOT IN ({})" + else: + whereclause += u" AND name NOT IN ({})" + + whereclause = whereclause.format( + u','.join(itt.repeat(u'?', len(exclude))) + ) + excludetuple = tuple(exclude) + else: + excludetuple = () + + return whereclause, includetuple + excludetuple + + def __init__(self, db_file=None): + """**NOTE:** This constructor should not be invoked directly. Use one + of the standard methods: ``from_dump`` or ``from_reader``. 
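+
+        A typical round trip (``some_reader`` and the paths are
+        illustrative)::
+
+            plwn = PLWordNet.from_reader(some_reader, dump_to='plwn.db')
+            plwn.close()
+            plwn = PLWordNet.from_dump('plwn.db')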
+ """ + + super(PLWordNet, self).__init__() + + if db_file is None: + self._tmp_dir = tempfile.mkdtemp(prefix='plwn_api-') + # Close the file immediately, we just need the + db_file = os.path.join(self._tmp_dir, 'db') + else: + self._tmp_dir = None + + try: + self._db = sqlite3.connect(db_file) + except BaseException: + self.__drop_tmpdir() + raise + + try: + # Init the locale collation, needs to be done for every connection. + # The str cast is for py2/3 compatibility + self._db.create_collation('locale', locale.strcoll) + except BaseException: + self._db.close() + self.__drop_tmpdir() + raise + + def close(self): + self._db.close() + self.__drop_tmpdir() + + def lexical_units(self, lemma=None, pos=None, variant=None): + return TupWrapper( + LexicalUnit(self._db, *row) + for row in self._select_lexical_units(lemma, pos, variant, True) + ) + + def lexical_unit(self, lemma, pos, variant): + return LexicalUnit( + self._db, + *self._get_one_lexical_unit( + lemma, + pos, + variant, + exc.LexicalUnitNotFound, + ) + ) + + def lexical_unit_by_id(self, id_): + with closing(self._db.cursor()) as cur: + cur.execute( + u""" + SELECT lemma, pos.value, variant, synset + FROM lexicalunit JOIN pos ON lexicalunit.pos = pos.id + WHERE lexicalunit.id = ? + """, + (id_,) + ) + row = cur.fetchone() + if row is None: + raise exc.InvalidLexicalUnitIdentifierException(id_) + return LexicalUnit(self._db, id_, *row) + + @tup_wrapped + def lexical_relation_edges(self, include=None, exclude=None): + parsed_include = frozenset( + self._rel_resolver.resolve_name(rel) for rel in include + ) if include is not None else None + + parsed_exclude = frozenset( + self._rel_resolver.resolve_name(rel) for rel in exclude + ) if exclude is not None else None + + whereclause, paramtuple = self._make_include_exclude( + parsed_include, + parsed_exclude, + ) + + with closing(self._db.cursor()) as cur: + cur.execute( + u""" + SELECT parentid, childid, name + FROM lexicalrelation + JOIN lexicalrelationtype ON relid = id + """ + whereclause, + paramtuple, + ) + + lu_q = u""" + SELECT lemma, pos.value, variant, synset + FROM lexicalunit + JOIN pos ON pos.id = lexicalunit.pos + WHERE lexicalunit.id = ? 
+ """ + + for parent_id, child_id, rel_name in cur: + with closing(self._db.cursor()) as cur2: + cur2.execute(lu_q, (parent_id,)) + par_lu = LexicalUnit( + self._db, + parent_id, + *cur2.fetchone() + ) + cur2.execute(lu_q, (child_id,)) + chl_lu = LexicalUnit( + self._db, + child_id, + *cur2.fetchone() + ) + yield bases.RelationEdge(par_lu, rel_name, chl_lu) + + def synsets(self, lemma=None, pos=None, variant=None): + synids = frozenset( + row[-1] + for row in self._select_lexical_units(lemma, pos, variant, True) + ) + return TupWrapper(Synset(self._db, synid) for synid in synids) + + def synset(self, lemma, pos, variant): + return Synset( + self._db, + self._get_one_lexical_unit( + lemma, + pos, + variant, + exc.SynsetNotFound, + )[-1], + ) + + def synset_by_id(self, id_): + with closing(self._db.cursor()) as cur: + cur.execute( + u"SELECT EXISTS (SELECT 1 FROM synset WHERE id = ?)", + (id_,), + ) + if not cur.fetchone()[0]: + raise exc.InvalidSynsetIdentifierException(id_) + return Synset(self._db, id_) + + @tup_wrapped + def synset_relation_edges(self, include=None, exclude=None): + parsed_include = frozenset( + self._rel_resolver.resolve_name(rel) for rel in include + ) if include is not None else None + + parsed_exclude = frozenset( + self._rel_resolver.resolve_name(rel) for rel in exclude + ) if exclude is not None else None + + whereclause, paramtuple = self._make_include_exclude( + parsed_include, + parsed_exclude, + ) + + with closing(self._db.cursor()) as cur: + cur.execute( + u""" + SELECT parentid, childid, name + FROM synsetrelation JOIN synsetrelationtype ON relid = id + """ + whereclause, + paramtuple, + ) + for parent_id, child_id, rel_name in cur: + yield bases.RelationEdge( + Synset(self._db, parent_id), + rel_name, + Synset(self._db, child_id), + ) + + def _select_lexical_units(self, lemma, pos, variant, defval): + with closing(self._db.cursor()) as cur: + cur.execute( + u""" + SELECT lexicalunit.id, lemma, pos.value, variant, synset + FROM lexicalunit + JOIN pos ON lexicalunit.pos = pos.id + WHERE COALESCE(lemma = :lem, :defval) + AND COALESCE(pos.value = :pos, :defval) + AND COALESCE(variant = :var, :defval) + """, + { + u'lem': lemma, + u'pos': PoS(pos).value if pos else None, + u'var': variant, + u'defval': defval, + }, + ) + for row in cur: + yield row + + def _get_one_lexical_unit(self, lemma, pos, variant, exc_class): + # False by default will force-return nothing if any is None + lu_rows = iter(self._select_lexical_units(lemma, pos, variant, False)) + try: + lu_row = next(lu_rows) + except StopIteration: + raise exc_class(lemma, pos, variant) + assert next(lu_rows, None) is None + return lu_row + + def __init_db(self): + self._db.executescript(_DB_SCHEMA_SCRIPT).close() + + with self._db: + self._db.executemany( + u"INSERT OR IGNORE INTO pos (value) VALUES (?)", + ((p.value,) for p in PoS), + ).close() + self._db.executemany( + u"INSERT OR IGNORE INTO verbaspect (value) VALUES (?)", + ((va.value,) for va in VerbAspect), + ).close() + self._db.executemany( + u"INSERT OR IGNORE INTO emotionmark (value) VALUES (?)", + ((em.value,) for em in EmotionMarkedness), + ).close() + self._db.executemany( + u"INSERT OR IGNORE INTO emotionname (value) VALUES (?)", + ((en.value,) for en in EmotionName), + ).close() + self._db.executemany( + u"INSERT OR IGNORE INTO emotionvaluation (value) VALUES (?)", + ((ev.value,) for ev in EmotionValuation), + ).close() + self._db.executemany( + u"INSERT OR IGNORE INTO domain (value) VALUES (?)", + ((dm.value,) for dm in Domain), + ).close() 
+ + # Insert version if the database is new + self._db.execute( + u""" + INSERT OR IGNORE INTO plwn_meta (name, value) + VALUES ('version', ?) + """, + (self._SCHEMA_VERSION,), + ).close() + + def __check_db(self): + with closing(self._db.cursor()) as cur: + try: + cur.execute( + u"SELECT value FROM plwn_meta WHERE name = 'version'", + ) + except sqlite3.OperationalError: + raise exc.LoadException( + 'Connected database seems not to be a PLWN database', + ) + + row = cur.fetchone() + + verval = row[0] if row is not None else None + if verval != self._SCHEMA_VERSION: + raise exc.DumpVersionException(verval, self._SCHEMA_VERSION) + + def __read_data(self, reader): + _DBBuilder(self._db)(reader) + + def __drop_tmpdir(self): + if self._tmp_dir is not None: + try: + shutil.rmtree(self._tmp_dir) + except OSError as e: + if e.errno != errno.ENOENT: + raise + + +class LexicalUnit(bases.LexicalUnitBase): + + __slots__ = ( + '_relr', + '_db', + '_id', + '_lemma', + '_pos', + '_var', + '_synid', + '_syn', + '_def', + '_usn', + '_extl', + '_exms', + '_exms_srcs', + '_dom', + '_va', + '_emo_mark', + '_emo_names', + '_emo_valuations' + '_emo_ex1', + '_emo_ex2', + ) + + # Since ``None`` is a valid value for verb_aspect, this is a sentinel value + _NO_VAL = object() + + def __init__(self, conn, id_, lemma, pos, variant, synid): + """**NOTE:** This constructor should not be called directly. Use + :class:`PLWordNet` methods to obtain lexical units. + """ + + self._relr = get_default_relation_resolver() + + self._db = conn + self._id = id_ + self._lemma = lemma + self._pos = PoS(pos) + self._var = variant + self._synid = synid + # Rest is unitialized + self._syn = self._NO_VAL + self._def = self._NO_VAL + self._usn = self._NO_VAL + self._extl = self._NO_VAL + self._exms = self._NO_VAL + self._exms_srcs = self._NO_VAL + self._dom = self._NO_VAL + self._va = self._NO_VAL + self._emo_mark = self._NO_VAL + self._emo_names = self._NO_VAL + self._emo_valuations = self._NO_VAL + self._emo_ex1 = self._NO_VAL + self._emo_ex2 = self._NO_VAL + + @property + def id(self): + return self._id + + @property + def lemma(self): + return self._lemma + + @property + def pos(self): + return self._pos + + @property + def variant(self): + return self._var + + @property + def synset(self): + if self._syn is self._NO_VAL or self._syn() is None: + syn = Synset(self._db, self._synid) + # Use weakref to avoid circular refrence to synset + self._syn = weakref.ref(syn) + return syn + return self._syn() + + @property + def definition(self): + if self._def is self._NO_VAL: + with closing(self._db.cursor()) as cur: + cur.execute( + u"SELECT definition FROM lexicalunit WHERE id = ?", + (self._id,), + ) + row = cur.fetchone() + assert row is not None + self._def = row[0] if row[0] is not None else '' + return self._def + + @property + def sense_examples(self): + if self._exms is self._NO_VAL: + with closing(self._db.cursor()) as cur: + cur.execute( + u"SELECT example FROM senseexample WHERE unitid = ?", + (self._id,), + ) + self._exms = tuple(row[0] for row in cur) + return self._exms + + @property + def sense_examples_sources(self): + if self._exms_srcs is self._NO_VAL: + with closing(self._db.cursor()) as cur: + cur.execute( + u"SELECT source FROM senseexample WHERE unitid = ?", + (self._id,), + ) + self._exms_srcs = tuple(row[0] for row in cur) + return self._exms_srcs + + @property + def external_links(self): + if self._extl is self._NO_VAL: + with closing(self._db.cursor()) as cur: + cur.execute( + u"SELECT link FROM externallink WHERE 
unitid = ?", + (self._id,), + ) + self._extl = tuple(row[0] for row in cur) + return self._extl + + @property + def usage_notes(self): + if self._usn is self._NO_VAL: + with closing(self._db.cursor()) as cur: + cur.execute( + u"SELECT note FROM usagenote WHERE unitid = ?", + (self._id,), + ) + self._usn = tuple(row[0] for row in cur) + return self._usn + + @property + def domain(self): + if self._dom is self._NO_VAL: + with closing(self._db.cursor()) as cur: + cur.execute( + u""" + SELECT domain.value + FROM lexicalunit JOIN domain + ON lexicalunit.domain = domain.id + WHERE lexicalunit.id = ? + """, + (self._id,), + ) + row = cur.fetchone() + assert row is not None + self._dom = Domain(row[0]) + return self._dom + + @property + def verb_aspect(self): + if self._va is self._NO_VAL: + with closing(self._db.cursor()) as cur: + cur.execute( + u""" + SELECT verbaspect.value + FROM lexicalunit JOIN verbaspect + ON lexicalunit.verbaspect = verbaspect.id + WHERE lexicalunit.id = ? + """, + (self._id,), + ) + row = cur.fetchone() + self._va = None if row is None else VerbAspect(row[0]) + return self._va + + @property + def emotion_markedness(self): + if self._emo_mark is self._NO_VAL: + with closing(self._db.cursor()) as cur: + cur.execute( + u""" + SELECT emotionmark.value + FROM lexicalunit JOIN emotionmark + ON lexicalunit.emotionmark = emotionmark.id + WHERE lexicalunit.id = ? + """, + (self._id,), + ) + row = cur.fetchone() + self._emo_mark = None if row is None else EmotionMarkedness(row[0]) + return self._emo_mark + + @property + def emotion_names(self): + if self._emo_names is self._NO_VAL: + with closing(self._db.cursor()) as cur: + cur.execute( + u""" + SELECT emotionname.value + FROM emotionname JOIN unitemotionname + ON emotionname.id = unitemotionname.nameid + WHERE unitemotionname.unitid = ? + ORDER BY emotionname.value + """, + (self._id,), + ) + self._emo_names = tuple(EmotionName(row[0]) for row in cur) + return self._emo_names + + @property + def emotion_valuations(self): + if self._emo_valuations is self._NO_VAL: + with closing(self._db.cursor()) as cur: + cur.execute( + u""" + SELECT emotionvaluation.value + FROM emotionvaluation JOIN unitemotionvaluation + ON emotionvaluation.id = + unitemotionvaluation.valuationid + WHERE unitemotionvaluation.unitid = ? + ORDER BY emotionvaluation.value + """, + (self._id,), + ) + self._emo_valuations = tuple( + EmotionValuation(row[0]) + for row in cur + ) + return self._emo_valuations + + @property + def emotion_example(self): + if self._emo_ex1 is self._NO_VAL: + with closing(self._db.cursor()) as cur: + cur.execute( + u"SELECT emotionexample1 FROM lexicalunit WHERE id = ?", + (self._id,), + ) + self._emo_ex1 = cur.fetchone()[0] + return self._emo_ex1 + + @property + def emotion_example_secondary(self): + if self._emo_ex2 is self._NO_VAL: + with closing(self._db.cursor()) as cur: + cur.execute( + u"SELECT emotionexample2 FROM lexicalunit WHERE id = ?", + (self._id,), + ) + self._emo_ex2 = cur.fetchone()[0] + return self._emo_ex2 + + @property + def relations(self): + # Not caching, since this is an informational method that will probably + # not be called very often + with closing(self._db.cursor()) as cur: + cur.execute( + u""" + SELECT DISTINCT name + FROM lexicalrelation JOIN lexicalrelationtype ON id = relid + WHERE parentid = ? 
+ ORDER BY name + """, + (self._id,), + ) + return tuple(row[0] for row in cur) + + def related(self, relation_name): + relname = self._relr.resolve_name(relation_name) + with closing(self._db.cursor()) as cur: + cur.execute( + u"SELECT id FROM lexicalrelationtype WHERE name = ?", + (relname,), + ) + row = cur.fetchone() + if row is None: + raise exc.InvalidRelationNameException(relation_name) + return TupWrapper(self.__related_gen(row[0])) + + def __related_gen(self, relid): + with closing(self._db.cursor()) as cur: + cur.execute( + u""" + SELECT lexicalunit.id, lemma, pos.value, variant, synset + FROM lexicalrelation + JOIN lexicalunit ON lexicalunit.id = childid + JOIN pos ON lexicalunit.pos = pos.id + WHERE parentid = ? AND relid = ? + """, + (self._id, relid), + ) + for row in cur: + yield LexicalUnit(self._db, *row) + + +class Synset(bases.SynsetBase): + + __slots__ = '_relr', '_db', '_id', '_units', '_def' + + def __init__(self, conn, syn_id): + """**NOTE:** This constructor should not be called directly. Use + :class:`PLWordNet` methods to obtain synsets. + """ + + self._relr = get_default_relation_resolver() + + self._db = conn + self._id = syn_id + + self._units = None + self._def = None + + @property + def id(self): + return self._id + + @property + def lexical_units(self): + if self._units is None: + with closing(self._db.cursor()) as cur: + cur.execute( + u""" + SELECT lexicalunit.id, lemma, pos.value, variant + FROM lexicalunit JOIN pos ON lexicalunit.pos = pos.id + WHERE synset = ? + ORDER BY unitindex + """, + (self._id,), + ) + self._units = tuple( + LexicalUnit( + self._db, + row[0], + row[1], + row[2], + row[3], + self._id, + ) + for row in cur + ) + assert self._units + return self._units + + @property + def definition(self): + if self._def is None: + with closing(self._db.cursor()) as cur: + cur.execute( + u"SELECT definition FROM synset WHERE id = ?", + (self._id,), + ) + row = cur.fetchone() + assert row is not None + self._def = row[0] if row[0] is not None else '' + return self._def + + @property + def relations(self): + # Not caching, since this is an informational method that will probably + # not be called very often + with closing(self._db.cursor()) as cur: + cur.execute( + u""" + SELECT DISTINCT name + FROM synsetrelation JOIN synsetrelationtype ON id = relid + WHERE parentid = ? + ORDER BY name + """, + (self._id,), + ) + return tuple(row[0] for row in cur) + + def related(self, relation_name): + relname = self._relr.resolve_name(relation_name) + with closing(self._db.cursor()) as cur: + cur.execute( + u"SELECT id FROM synsetrelationtype WHERE name = ?", + (relname,), + ) + row = cur.fetchone() + if row is None: + raise exc.InvalidRelationNameException(relation_name) + return TupWrapper(self.__related_gen(row[0])) + + def __related_gen(self, relid): + with closing(self._db.cursor()) as cur: + cur.execute( + u""" + SELECT childid + FROM synsetrelation + WHERE parentid = ? AND relid = ? + """, + (self._id, relid), + ) + for row in cur: + yield Synset(self._db, row[0]) + + +class _DBBuilder(object): + + def __init__(self, db): + self._db = db + # Relations need to be added later to weed out nonexistent ones targets + # and avoid foreign key failures (which are a bit obtuse in sqlite3. + self._synrels = {} + self._lexrels = {} + # Synset to lexical units relations also need to be deferred. 
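+        # (Units can only be inserted once their synset row exists: the
+        # lexicalunit.synset column is a foreign key and foreign_keys is ON.)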
+ self._synid2lexids = defaultdict(list) + # Cache IDs of constant values + with closing(db.execute(u"SELECT value, id FROM pos")) as cur: + self._posids = dict(cur) + with closing(db.execute(u"SELECT value, id FROM verbaspect")) as cur: + self._vaids = dict(cur) + with closing(db.execute(u"SELECT value, id FROM emotionmark")) as cur: + self._emids = dict(cur) + with closing(db.execute(u"SELECT value, id FROM emotionname")) as cur: + self._enids = dict(cur) + with closing(db.execute(u"SELECT value, id FROM emotionvaluation")) \ + as cur: + self._evids = dict(cur) + with closing(db.execute(u"SELECT value, id FROM domain")) as cur: + self._dmids = dict(cur) + + def __call__(self, reader): + with self._db: + for node in reader: + if isinstance(node, nd.SynsetNode): + self._insert_synset(node) + else: + self._insert_unit(node) + + with self._db: + self._finalize_units() + + with self._db: + self._prune_empty_synsets() + + with self._db: + self._finalize_rels(u'synsetrelation', self._synrels) + self._finalize_rels(u'lexicalrelation', self._lexrels) + + def _insert_synset(self, syn_node): + self._db.execute( + u"INSERT INTO synset (id, definition) VALUES (?, ?)", + (syn_node.id, syn_node.definition), + ).close() + # Related go into temp storage + self._synrels[syn_node.id] = [ + ( + self._ensure_enum_row_id( + u'synsetrelationtype', + u'id', + u'name', + relname, + ), + targetid, + ) + for relname, targetid in syn_node.related + ] + + def _insert_unit(self, lu_node): + # Unfortunately, we can't insert into DB until we have all synsets. So + # save nodes in temp dict. + self._synid2lexids[lu_node.synset].append(lu_node) + # But deal with relations + self._lexrels[lu_node.id] = [ + ( + self._ensure_enum_row_id( + u'lexicalrelationtype', + u'id', + u'name', + relname, + ), + targetid, + ) + for relname, targetid in lu_node.related + ] + + def _finalize_units(self): + # All synsets are in, can add units now. + with closing(self._db.cursor()) as cur: + for synid, lu_nodes in six.iteritems(self._synid2lexids): + for lu_node in lu_nodes: + try: + cur.execute( + u""" + INSERT INTO lexicalunit ( + id, lemma, pos, variant, + synset, unitindex, + definition, domain, verbaspect, + emotionmark, emotionexample1, emotionexample2 + ) + VALUES ( + :id, :lemma, :pos, :var, + :syn, :uidx, + :def, :dom, :va, + :emo_m, :emo_ex1, :emo_ex2 + ) + """, + { + u'id': lu_node.id, + u'lemma': lu_node.lemma, + u'pos': self._posids[lu_node.pos.value], + u'var': lu_node.variant, + u'syn': lu_node.synset, + u'uidx': lu_node.unit_index, + u'def': lu_node.definition, + u'dom': self._dmids[lu_node.domain.value], + u'va': None + if lu_node.verb_aspect is None + else self._vaids[lu_node.verb_aspect.value], + u'emo_m': None + if lu_node.emotion_markedness is None + else self._emids[ + lu_node.emotion_markedness.value + ], + u'emo_ex1': lu_node.emotion_example_1, + u'emo_ex2': lu_node.emotion_example_2, + }, + ) + except sqlite3.IntegrityError as e: + _log.warning( + 'Pair (synset=%d, unitindex=%d) of unit %d ' + 'violates: %r', + lu_node.synset, + lu_node.unit_index, + lu_node.id, + e.args, + ) + # Drop relations for this unit, if any + self._lexrels.pop(lu_node.id, None) + return + + cur.executemany( + u""" + INSERT INTO senseexample (unitid, example, source) + VALUES (?, ?, ?) + """, + ( + (lu_node.id, exm, exm_src) + for exm, exm_src in zip(lu_node.examples, + lu_node.examples_sources) + ), + ) + + cur.executemany( + u""" + INSERT INTO usagenote (unitid, note) + VALUES (?, ?) 
+                        """,
+                        ((lu_node.id, note) for note in lu_node.usage_notes),
+                    )
+
+                    cur.executemany(
+                        u"""
+                        INSERT INTO externallink (unitid, link)
+                        VALUES (?, ?)
+                        """,
+                        ((lu_node.id, link)
+                         for link in lu_node.external_links),
+                    )
+
+                    cur.executemany(
+                        u"""
+                        INSERT INTO unitemotionname (unitid, nameid)
+                        VALUES (?, ?)
+                        """,
+                        (
+                            (lu_node.id, self._enids[emo_name.value])
+                            for emo_name in lu_node.emotion_names
+                        ),
+                    )
+
+                    cur.executemany(
+                        u"""
+                        INSERT INTO unitemotionvaluation (unitid, valuationid)
+                        VALUES (?, ?)
+                        """,
+                        (
+                            (lu_node.id, self._evids[emo_val.value])
+                            for emo_val in lu_node.emotion_valuations
+                        ),
+                    )
+
+    def _ensure_enum_row_id(self, table, id_field, value_field, value):
+        select_query = u"SELECT {id} FROM {table} WHERE {value} = ?".format(
+            id=id_field,
+            table=table,
+            value=value_field,
+        )
+        with closing(self._db.cursor()) as cur:
+            cur.execute(select_query, (value,))
+            id_row = cur.fetchone()
+
+        if id_row is not None:
+            return id_row[0]
+
+        insert_query = u"INSERT INTO {table} ({value}) VALUES (?)".format(
+            table=table,
+            value=value_field,
+        )
+        with closing(self._db.cursor()) as cur:
+            cur.execute(insert_query, (value,))
+            return cur.lastrowid
+
+    def _finalize_rels(self, tablename, rels_dict):
+        ins_query = (
+            u"INSERT INTO {} (parentid, relid, childid) VALUES (?, ?, ?)"
+            .format(tablename)
+        )
+
+        with closing(self._db.cursor()) as cur:
+            for par_id, chls in six.iteritems(rels_dict):
+                for rel_id, chl_id in chls:
+                    try:
+                        cur.execute(ins_query, (par_id, rel_id, chl_id))
+                    except sqlite3.IntegrityError:
+                        _log.warning(
+                            'Relation typed %s between %d --> %d causes '
+                            'IntegrityError, dropped',
+                            tablename,
+                            par_id,
+                            chl_id,
+                        )
+
+    def _prune_empty_synsets(self):
+        with closing(self._db.cursor()) as cur:
+            cur.execute(
+                u"""
+                SELECT synset.id
+                FROM synset
+                LEFT JOIN lexicalunit ON synset.id = lexicalunit.synset
+                WHERE lexicalunit.synset IS NULL
+                """,
+            )
+            empties = tuple(row[0] for row in cur)
+
+        if not empties:
+            # All clear!
+            return
+
+        for synid in empties:
+            _log.warning('Synset %d is empty', synid)
+
+        self._db.execute(
+            u"DELETE FROM synset WHERE id IN ({})".format(
+                u','.join(u'?' * len(empties))
+            ),
+            empties,
+        ).close()
+
+
+_this_storage_ = PLWordNet
diff --git a/plwn/utils/__init__.py b/plwn/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/plwn/utils/graphmlout.py b/plwn/utils/graphmlout.py
new file mode 100644
index 0000000000000000000000000000000000000000..53ecac14bbc9b448e3a46fbfc3d4717bb34e96df
--- /dev/null
+++ b/plwn/utils/graphmlout.py
@@ -0,0 +1,801 @@
+from __future__ import absolute_import, division
+try:
+    str = unicode
+except NameError:
+    pass
+
+import collections as coll
+import functools as funct
+import json
+import xml.etree.cElementTree as et
+
+from six import iteritems
+
+from ..enums import make_values_tuple
+
+__all__ = (
+    'GraphMLWordNet',
+    'GraphMLBuilder',
+    'GRAPH_TYPE_SYNSET',
+    'GRAPH_TYPE_UNIT',
+    'GRAPH_TYPE_MIXED',
+    'UNS_HAS_LU',
+    'UNS_IN_SYN',
+)
+
+# Constants for graphml exporting (library user should just use the string
+# values).
+# They also double as prefixes for IDs.
+GRAPH_TYPE_SYNSET = 'synset'
+GRAPH_TYPE_UNIT = 'lexical_unit'
+GRAPH_TYPE_MIXED = 'mixed'
+UNS_HAS_LU = u'has_unit'
+UNS_IN_SYN = u'in_synset'
+
+
+class GraphMLWordNet(object):
+    """Stores plWordNet data as a GraphML tree.
+
+    This is an auxiliary class which usually shouldn't be constructed
+    directly.
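+
+    If one is built by hand anyway (e.g. in tests), the flow is roughly the
+    following sketch (all identifiers below are made up for illustration)::
+
+        gml = GraphMLWordNet()
+        gml.add_attribute_type(u'lu_data-lemma', u'lemma',
+                               GraphMLWordNet.DATA_TYPE_STR)
+        gml.add_node(u'lexical_unit-1', {u'lu_data-lemma': u'pies'})
+        gml.write('example.graphml')
+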
+    Use an appropriate method from :class:`plwn.bases.PLWordNet`.
+    """
+
+    #: Defines a possible type of a GraphML graph attribute. ``typename`` is
+    #: the name of the type (value of the ``attr.type`` attribute in
+    #: GraphML), and ``convert`` is a function that takes a single argument
+    #: and converts it to the string which will be the content of a ``data``
+    #: tag.
+    _DataType = coll.namedtuple('_DataType', ('typename', 'convert'))
+
+    DATA_TYPE_INT = _DataType(u'long', lambda val: str(int(val)))
+    DATA_TYPE_STR = _DataType(u'string', str)
+    DATA_TYPE_BOOL = _DataType(
+        u'boolean',
+        (lambda val: u'true' if val else u'false'),
+    )
+    DATA_TYPE_JSON = _DataType(u'string', json.dumps)
+    DATA_TYPE_ENUMVAL = _DataType(u'string', lambda val: str(val.value))
+    # Data type for an enum that can also be None.
+    DATA_TYPE_OPTENUMVAL = _DataType(
+        u'string',
+        lambda val: '' if val is None else str(val.value),
+    )
+    DATA_TYPE_ENUMSEQ = _DataType(
+        u'string',
+        lambda val: json.dumps(make_values_tuple(val)),
+    )
+
+    def __init__(self):
+        self._root = et.Element(
+            u'graphml',
+            # The commented out xmlns declaration is correct, but inserting
+            # it causes the namespace machinery of ElementTree to complicate
+            # everything. Only uncomment it if more namespaces need to be
+            # embedded in the output.
+            # {'xmlns': 'http://graphml.graphdrawing.org/xmlns'},
+        )
+        self._graph = et.SubElement(
+            self._root,
+            u'graph',
+            {u'id': u'plWordNet', u'edgedefault': u'directed'},
+        )
+        self._tree = et.ElementTree(self._root)
+        self._attr_types = {}
+
+    def add_attribute_type(self, id_, name, type_, for_=u'node'):
+        """Add an attribute type which can then be assigned to node or edge
+        instances.
+
+        :param str id_: Unique (in the whole XML) identifier of the
+            attribute type.
+        :param str name: Name of the attribute.
+        :param _DataType type_: One of the ``DATA_TYPE_*`` constants,
+            defining the type of the attribute.
+        :param str for_: Should be either "node" or "edge", depending on
+            whether it's a node attribute or an edge attribute.
+
+        :raises ValueError: If ``type_`` or ``for_`` were passed an illegal
+            value.
+        """
+
+        if not isinstance(type_, self._DataType):
+            raise ValueError('type_={!r}'.format(type_))
+        if for_ != u'node' and for_ != u'edge':
+            raise ValueError('for_={!r}'.format(for_))
+
+        self._attr_types[id_] = type_
+
+        et.SubElement(
+            self._root,
+            u'key',
+            {
+                u'id': id_,
+                u'for': for_,
+                u'attr.name': name,
+                u'attr.type': type_.typename,
+            },
+        )
+
+    def add_node(self, id_, attributes={}):
+        """Add a node to the GraphML graph.
+
+        This would be either a synset node or a lexical unit node; this
+        method doesn't distinguish between them. The caller should include
+        some way to tell them apart.
+
+        :param str id_: Unique (in the whole XML) identifier of the node.
+        :param Mapping[str,Union[int,bool,float,str]] attributes: Maps
+            attribute IDs to their values. The IDs should have been
+            previously defined by :meth:`.add_attribute_type`.
+
+        :raises KeyError: If any of the names in ``attributes`` was not
+            previously defined.
+        """
+
+        node = et.SubElement(
+            self._graph,
+            u'node',
+            {u'id': id_},
+        )
+        self._add_attributes_to(node, attributes)
+
+    def add_edge(self, id_, source, target, attributes={}):
+        """Add an edge to the GraphML graph.
+
+        An edge would normally designate a relation, but this method doesn't
+        assume that. The caller should set an appropriate attribute for
+        that.
+
+        Source and target nodes do not need to have been defined previously,
+        but they should be eventually, or the graph will be invalid.
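+
+        For example (a sketch only; the ``edge-name`` attribute id and the
+        node ids are illustrative, and the attribute must have been
+        registered with :meth:`.add_attribute_type` beforehand)::
+
+            gml.add_edge(u'syn--1--2--hiperonimia', u'synset-1',
+                         u'synset-2', {u'edge-name': u'hiperonimia'})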
+
+        :param str id_: Unique (in the whole XML) identifier of the edge.
+        :param str source: Identifier of the source node.
+        :param str target: Identifier of the target node.
+        :param Mapping[str,Union[int,bool,float,str]] attributes: Maps
+            attribute IDs to their values. The IDs should have been
+            previously defined by :meth:`.add_attribute_type`.
+
+        :raises KeyError: If any of the names in ``attributes`` was not
+            previously defined.
+        """
+
+        edge = et.SubElement(
+            self._graph,
+            u'edge',
+            {u'id': id_, u'source': source, u'target': target},
+        )
+        self._add_attributes_to(edge, attributes)
+
+    def write(self, file_):
+        """Save the GraphML representation to a file.
+
+        :param Union[str,TextIO] file_: Stream or name of the file to which
+            the graph should be written.
+        """
+
+        self._tree.write(file_, 'utf-8')
+
+    def _add_attributes_to(self, element, attributes):
+        for attr_id, attr_val in iteritems(attributes):
+            attr_type = self._attr_types[attr_id]
+            attr = et.SubElement(
+                element,
+                u'data',
+                {u'key': attr_id},
+            )
+            attr.text = attr_type.convert(attr_val)
+
+
+class GraphMLBuilder(object):
+    """Class that bridges :class:`plwn.bases.PLWordNetBase` and
+    :class:`GraphMLWordNet`, extracting data from the former and putting it
+    into the latter in the appropriate format.
+
+    This is an auxiliary class which usually shouldn't be constructed
+    directly. Use an appropriate method from :class:`plwn.bases.PLWordNet`.
+    """
+
+    _EDGE_LEX_TEMPLATE = u'lu--{}--{}--{}'
+    _EDGE_SYN_TEMPLATE = u'syn--{}--{}--{}'
+    _EDGE_UNS_TEMPLATE = u'uns--{}--{}--{}'
+
+    def __init__(self, plwn, gmlwn):
+        """
+        :param plwn: The plWordNet instance from which the data will be
+            extracted.
+        :type plwn: plwn.bases.PLWordNetBase
+        :param gmlwn: The GraphML storage which will receive data from
+            ``plwn``.
+        :type gmlwn: GraphMLWordNet
+        """
+
+        self._plwn = plwn
+        self._graphout = gmlwn
+
+        # Add attributes for relation edges. Edges are present in all graph
+        # types, so they will be needed anyway.
+        self._graphout.add_attribute_type(
+            'edge-type',
+            'type',
+            GraphMLWordNet.DATA_TYPE_STR,
+            'edge',
+        )
+        self._graphout.add_attribute_type(
+            'edge-name',
+            'name',
+            GraphMLWordNet.DATA_TYPE_STR,
+            'edge',
+        )
+
+    def synset_graph(self,
+                     prefix_ids,
+                     include_attributes,
+                     included_attributes,
+                     excluded_attributes,
+                     included_nodes,
+                     excluded_nodes,
+                     included_relations,
+                     excluded_relations):
+        """See :meth:`plwn.bases.PLWordNetBase.to_graphml` for description."""
+
+        added_attributes = (
+            self._add_synset_attrs(included_attributes, excluded_attributes)
+            if (include_attributes or
+                included_attributes is not None or
+                excluded_attributes is not None)
+            else ()
+        )
+        visited_nodes = set()
+
+        for edge in self._plwn.synset_relation_edges(
+            included_relations,
+            excluded_relations,
+        ):
+            prefixed_source = self._prefix_synset_id(
+                edge.source.id,
+                prefix_ids,
+            )
+            prefixed_target = self._prefix_synset_id(
+                edge.target.id,
+                prefix_ids,
+            )
+
+            # Add an edge only if neither of its endpoints is excluded.
+            # Nodes are added along edges, but it's not a problem if a valid
+            # node is not included here: it will eventually be included by
+            # another edge, as long as it's not completely secluded (and if
+            # it is, we don't want it anyway).
+            if self._check_include_exclude_2(
+                edge.source.id,
+                edge.target.id,
+                included_nodes,
+                excluded_nodes,
+            ):
+                if edge.source.id not in visited_nodes:
+                    visited_nodes.add(edge.source.id)
+                    self._graphout.add_node(
+                        prefixed_source,
+                        self._make_attr_dict(
+                            edge.source,
+                            added_attributes,
+                        ),
+                    )
+                if edge.target.id not in visited_nodes:
+                    visited_nodes.add(edge.target.id)
+                    self._graphout.add_node(
+                        prefixed_target,
+                        self._make_attr_dict(
+                            edge.target,
+                            added_attributes,
+                        ),
+                    )
+
+                # Now, add the edge itself.
+                self._graphout.add_edge(
+                    self._EDGE_SYN_TEMPLATE.format(
+                        prefixed_source,
+                        prefixed_target,
+                        edge.relation,
+                    ),
+                    prefixed_source,
+                    prefixed_target,
+                    {u'edge-type': u'relation', u'edge-name': edge.relation},
+                )
+
+    def lexical_unit_graph(self,
+                           prefix_ids,
+                           include_attributes,
+                           included_attributes,
+                           excluded_attributes,
+                           included_nodes,
+                           excluded_nodes,
+                           included_relations,
+                           excluded_relations):
+        """See :meth:`plwn.bases.PLWordNetBase.to_graphml` for description."""
+
+        added_attributes = (
+            self._add_lexunit_attrs(included_attributes, excluded_attributes)
+            if (include_attributes or
+                included_attributes is not None or
+                excluded_attributes is not None)
+            else ()
+        )
+        visited_nodes = set()
+
+        for edge in self._plwn.lexical_relation_edges(included_relations,
+                                                      excluded_relations):
+            prefixed_source = self._prefix_lexunit_id(
+                edge.source.id,
+                prefix_ids,
+            )
+            prefixed_target = self._prefix_lexunit_id(
+                edge.target.id,
+                prefix_ids,
+            )
+
+            if self._check_include_exclude_2(
+                edge.source.id,
+                edge.target.id,
+                included_nodes,
+                excluded_nodes,
+            ):
+                if edge.source.id not in visited_nodes:
+                    visited_nodes.add(edge.source.id)
+                    self._graphout.add_node(
+                        prefixed_source,
+                        self._make_attr_dict(
+                            edge.source,
+                            added_attributes,
+                        ),
+                    )
+                if edge.target.id not in visited_nodes:
+                    visited_nodes.add(edge.target.id)
+                    self._graphout.add_node(
+                        prefixed_target,
+                        self._make_attr_dict(
+                            edge.target,
+                            added_attributes,
+                        ),
+                    )
+
+                self._graphout.add_edge(
+                    self._EDGE_LEX_TEMPLATE.format(
+                        prefixed_source,
+                        prefixed_target,
+                        edge.relation,
+                    ),
+                    prefixed_source,
+                    prefixed_target,
+                    {u'edge-type': u'relation', u'edge-name': edge.relation},
+                )
+
+    def mixed_graph(self,
+                    include_attributes,
+                    included_synset_attributes,
+                    excluded_synset_attributes,
+                    included_lexical_unit_attributes,
+                    excluded_lexical_unit_attributes,
+                    included_synset_relations,
+                    excluded_synset_relations,
+                    included_lexical_unit_relations,
+                    excluded_lexical_unit_relations,
+                    included_synset_nodes,
+                    excluded_synset_nodes,
+                    included_lexical_unit_nodes,
+                    excluded_lexical_unit_nodes):
+        """See :meth:`plwn.bases.PLWordNetBase.to_graphml` for description."""
+
+        synset_attributes = (
+            self._add_synset_attrs(
+                included_synset_attributes,
+                excluded_synset_attributes,
+            )
+            if (include_attributes or
+                included_synset_attributes is not None or
+                excluded_synset_attributes is not None)
+            else ()
+        )
+
+        lexunit_attributes = (
+            self._add_lexunit_attrs(
+                included_lexical_unit_attributes,
+                excluded_lexical_unit_attributes,
+            )
+            if (include_attributes or
+                included_lexical_unit_attributes is not None or
+                excluded_lexical_unit_attributes is not None)
+            else ()
+        )
+
+        added_synsets = set()
+        empty_synsets = set()
+
+        # Add synset edges first, along with their (non-excluded) lexical
+        # unit nodes. Lexical units from excluded synsets are not included.
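+        # (The two passes below mean a node can be added either alongside
+        # its synset in the synset pass, or lazily in the lexical unit pass;
+        # added_synsets guards against adding a synset and its units twice,
+        # while empty_synsets caches synsets whose units were all filtered
+        # out, so edges touching them are skipped.)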
+        for syn_edge in self._plwn.synset_relation_edges(
+            included_synset_relations,
+            excluded_synset_relations,
+        ):
+
+            if self._check_include_exclude_2(
+                syn_edge.source.id,
+                syn_edge.target.id,
+                included_synset_nodes,
+                excluded_synset_nodes,
+            ):
+                self._add_mixed_synset_edge(
+                    syn_edge,
+                    synset_attributes,
+                    lexunit_attributes,
+                    added_synsets,
+                    empty_synsets,
+                    included_lexical_unit_nodes,
+                    excluded_lexical_unit_nodes,
+                )
+
+        for lex_edge in self._plwn.lexical_relation_edges(
+            included_lexical_unit_relations,
+            excluded_lexical_unit_relations,
+        ):
+
+            if self._check_include_exclude_2(
+                lex_edge.source.id,
+                lex_edge.target.id,
+                included_lexical_unit_nodes,
+                excluded_lexical_unit_nodes,
+            ):
+                self._add_mixed_lexunit_edge(
+                    lex_edge,
+                    synset_attributes,
+                    lexunit_attributes,
+                    added_synsets,
+                    empty_synsets,
+                    included_synset_nodes,
+                    excluded_synset_nodes,
+                    included_lexical_unit_nodes,
+                    excluded_lexical_unit_nodes,
+                )
+
+    def _add_mixed_synset_edge(self,
+                               syn_edge,
+                               syn_attrs,
+                               lex_attrs,
+                               added_syns,
+                               empty_syns,
+                               included_lexs,
+                               excluded_lexs):
+
+        source_units = None
+        target_units = None
+
+        # If the synsets have not yet been added, get the lexical units that
+        # belong to them. If an empty synset is encountered, remember it.
+        if (syn_edge.source.id not in added_syns and
+                syn_edge.source.id not in empty_syns):
+
+            source_units = self._make_units_of_synset(
+                syn_edge.source,
+                included_lexs,
+                excluded_lexs,
+            )
+
+            if not source_units:
+                empty_syns.add(syn_edge.source.id)
+
+        if (syn_edge.target.id not in added_syns and
+                syn_edge.target.id not in empty_syns):
+
+            target_units = self._make_units_of_synset(
+                syn_edge.target,
+                included_lexs,
+                excluded_lexs,
+            )
+
+            if not target_units:
+                empty_syns.add(syn_edge.target.id)
+
+        prefixed_syn_source = self._prefix_synset_id(
+            syn_edge.source.id,
+            True,
+        )
+        prefixed_syn_target = self._prefix_synset_id(
+            syn_edge.target.id,
+            True,
+        )
+
+        # Only add the edge if both endpoints are non-empty (this can't be
+        # checked via the *_units variables, because an endpoint wasn't
+        # necessarily added in this step).
+        if (syn_edge.source.id not in empty_syns and
+                syn_edge.target.id not in empty_syns):
+
+            # If the source or target was not yet added, its units variable
+            # holds a true value. If it holds a false one, the synset was
+            # already added earlier (had it been really empty, it would have
+            # been added to empty_syns).
+            if source_units:
+                self._graphout.add_node(
+                    prefixed_syn_source,
+                    self._make_attr_dict(
+                        syn_edge.source,
+                        syn_attrs,
+                    ),
+                )
+                self._add_units_of_synset(
+                    prefixed_syn_source,
+                    source_units,
+                    lex_attrs,
+                )
+                added_syns.add(syn_edge.source.id)
+
+            if target_units:
+                self._graphout.add_node(
+                    prefixed_syn_target,
+                    self._make_attr_dict(
+                        syn_edge.target,
+                        syn_attrs,
+                    ),
+                )
+                self._add_units_of_synset(
+                    prefixed_syn_target,
+                    target_units,
+                    lex_attrs,
+                )
+                added_syns.add(syn_edge.target.id)
+
+            self._graphout.add_edge(
+                self._EDGE_SYN_TEMPLATE.format(
+                    prefixed_syn_source,
+                    prefixed_syn_target,
+                    syn_edge.relation,
+                ),
+                prefixed_syn_source,
+                prefixed_syn_target,
+                {u'edge-type': u'relation', u'edge-name': syn_edge.relation},
+            )
+
+    def _add_mixed_lexunit_edge(self,
+                                lex_edge,
+                                syn_attrs,
+                                lex_attrs,
+                                added_syns,
+                                empty_syns,
+                                included_syns,
+                                excluded_syns,
+                                included_lexs,
+                                excluded_lexs):
+
+        source_synset = lex_edge.source.synset
+        target_synset = lex_edge.target.synset
+
+        # Check whether either lexunit's synset is empty or otherwise
+        # excluded.
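+        # (If either synset were empty, its unit nodes would never have been
+        # added, so the lexical edge would dangle.)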
+        if (self._check_include_exclude_2(source_synset.id,
+                                          target_synset.id,
+                                          included_syns,
+                                          excluded_syns) and
+                source_synset.id not in empty_syns and
+                target_synset.id not in empty_syns):
+
+            # At this point, both lexunits and their synsets are eligible
+            # for being added. And if a synset is added, all of its
+            # non-excluded lexical units get added too; this takes care of
+            # adding the currently processed lexical unit nodes to the
+            # graph. Units are filtered by the lexical unit include /
+            # exclude sets, as in the synset pass.
+            if source_synset.id not in added_syns:
+                source_units = self._make_units_of_synset(
+                    source_synset,
+                    included_lexs,
+                    excluded_lexs,
+                )
+                prefixed_syn_source = self._prefix_synset_id(
+                    source_synset.id,
+                    True,
+                )
+                self._graphout.add_node(
+                    prefixed_syn_source,
+                    self._make_attr_dict(source_synset, syn_attrs),
+                )
+                self._add_units_of_synset(
+                    prefixed_syn_source,
+                    source_units,
+                    lex_attrs,
+                )
+                added_syns.add(source_synset.id)
+
+            if target_synset.id not in added_syns:
+                target_units = self._make_units_of_synset(
+                    target_synset,
+                    included_lexs,
+                    excluded_lexs,
+                )
+                prefixed_syn_target = self._prefix_synset_id(
+                    target_synset.id,
+                    True,
+                )
+                self._graphout.add_node(
+                    prefixed_syn_target,
+                    self._make_attr_dict(
+                        target_synset,
+                        syn_attrs,
+                    ),
+                )
+                self._add_units_of_synset(
+                    prefixed_syn_target,
+                    target_units,
+                    lex_attrs,
+                )
+                added_syns.add(target_synset.id)
+
+            prefixed_lex_source = self._prefix_lexunit_id(
+                lex_edge.source.id,
+                True,
+            )
+            prefixed_lex_target = self._prefix_lexunit_id(
+                lex_edge.target.id,
+                True,
+            )
+            self._graphout.add_edge(
+                self._EDGE_LEX_TEMPLATE.format(
+                    prefixed_lex_source,
+                    prefixed_lex_target,
+                    lex_edge.relation,
+                ),
+                prefixed_lex_source,
+                prefixed_lex_target,
+                {u'edge-type': u'relation', u'edge-name': lex_edge.relation},
+            )
+
+    def _add_units_of_synset(self,
+                             prefixed_synset_id,
+                             units_of_synset,
+                             attributes):
+
+        for lu in units_of_synset:
+            prefixed_lex = self._prefix_lexunit_id(lu.id, True)
+            self._graphout.add_node(
+                prefixed_lex,
+                self._make_attr_dict(lu, attributes),
+            )
+            self._graphout.add_edge(
+                self._EDGE_UNS_TEMPLATE.format(
+                    prefixed_synset_id,
+                    prefixed_lex,
+                    UNS_HAS_LU,
+                ),
+                prefixed_synset_id,
+                prefixed_lex,
+                {u'edge-type': u'unit_and_synset', u'edge-name': UNS_HAS_LU},
+            )
+            self._graphout.add_edge(
+                self._EDGE_UNS_TEMPLATE.format(
+                    prefixed_lex,
+                    prefixed_synset_id,
+                    UNS_IN_SYN,
+                ),
+                prefixed_lex,
+                prefixed_synset_id,
+                {u'edge-type': u'unit_and_synset', u'edge-name': UNS_IN_SYN},
+            )
+
+    def _add_synset_attrs(self, included_attrs, excluded_attrs):
+        includer = _AttrIncluder(
+            self._graphout,
+            u'syn_data',
+            funct.partial(
+                self._check_include_exclude,
+                include_set=included_attrs,
+                exclude_set=excluded_attrs,
+            ),
+        )
+
+        includer(u'relations', GraphMLWordNet.DATA_TYPE_JSON)
+        includer(u'definition', GraphMLWordNet.DATA_TYPE_STR)
+
+        return includer.included_attrs
+
+    def _add_lexunit_attrs(self, included_attrs, excluded_attrs):
+        includer = _AttrIncluder(
+            self._graphout,
+            u'lu_data',
+            funct.partial(
+                self._check_include_exclude,
+                include_set=included_attrs,
+                exclude_set=excluded_attrs,
+            ),
+        )
+
+        includer(u'lemma', GraphMLWordNet.DATA_TYPE_STR)
+        includer(u'pos', GraphMLWordNet.DATA_TYPE_ENUMVAL)
+        includer(u'variant', GraphMLWordNet.DATA_TYPE_INT)
+        includer(u'definition', GraphMLWordNet.DATA_TYPE_STR)
+        includer(u'sense_examples', GraphMLWordNet.DATA_TYPE_JSON)
+        includer(u'sense_examples_sources', GraphMLWordNet.DATA_TYPE_JSON)
+        includer(u'external_links', GraphMLWordNet.DATA_TYPE_JSON)
+        includer(u'usage_notes', GraphMLWordNet.DATA_TYPE_JSON)
+        includer(u'domain', GraphMLWordNet.DATA_TYPE_ENUMVAL)
+        includer(u'relations', GraphMLWordNet.DATA_TYPE_JSON)
+        includer(u'verb_aspect', GraphMLWordNet.DATA_TYPE_OPTENUMVAL)
+        includer(u'emotion_markedness', GraphMLWordNet.DATA_TYPE_OPTENUMVAL)
+        includer(u'emotion_names', GraphMLWordNet.DATA_TYPE_ENUMSEQ)
+        includer(u'emotion_valuations', GraphMLWordNet.DATA_TYPE_ENUMSEQ)
+        includer(u'emotion_example', GraphMLWordNet.DATA_TYPE_STR)
+        includer(u'emotion_example_secondary', GraphMLWordNet.DATA_TYPE_STR)
+
+        return includer.included_attrs
+
+    @classmethod
+    def _make_units_of_synset(cls, synset, included_nodes, excluded_nodes):
+        return frozenset(lu
+                         for lu in synset.lexical_units
+                         if cls._check_include_exclude(lu.id,
+                                                       included_nodes,
+                                                       excluded_nodes))
+
+    @classmethod
+    def _prefix_synset_id(cls, id_, do_prefix):
+        return (u'{}-{}'.format(GRAPH_TYPE_SYNSET, id_)
+                if do_prefix
+                else str(id_))
+
+    @classmethod
+    def _prefix_lexunit_id(cls, id_, do_prefix):
+        return (u'{}-{}'.format(GRAPH_TYPE_UNIT, id_)
+                if do_prefix
+                else str(id_))
+
+    @staticmethod
+    def _check_include_exclude(item, include_set, exclude_set):
+        """``True`` if the item is in the include set and not in the exclude
+        set. If a set is ``None``, the corresponding check passes.
+        """
+
+        return ((include_set is None or item in include_set) and
+                (exclude_set is None or item not in exclude_set))
+
+    @staticmethod
+    def _check_include_exclude_2(item1, item2, include_set, exclude_set):
+        """Check two items at once against include/exclude (e.g. for
+        edges).
+        """
+
+        return ((include_set is None or
+                 (item1 in include_set and item2 in include_set)) and
+                (exclude_set is None or
+                 (item1 not in exclude_set and item2 not in exclude_set)))
+
+    @staticmethod
+    def _make_attr_dict(item, added_attrs):
+        # It's assumed that by the time this private method gets called,
+        # something has made sure that added_attrs contains only legal
+        # values. added_attrs should be a set of pairs: the first element is
+        # the GraphML attribute key, and the second the name of the
+        # attribute on the lexical unit / synset object.
+        return {attrkey: getattr(item, attrname)
+                for attrkey, attrname in added_attrs}
+
+
+class _AttrIncluder(object):
+    """
+    Aux class for the repetitive "check if the attribute should be included"
+    -> "store it in all required places" cycle.
+    """
+
+    def __init__(self, graphout, type_prefix, checkfunc):
+        """
+        :param GraphMLWordNet graphout: The output graph instance.
+
+        :param str type_prefix: Unique names of attributes will be prefixed
+            with this.
+
+        :param checkfunc: Callable that should take the name of an attribute
+            and return ``True`` if it should be included and ``False``
+            otherwise.
+        :type checkfunc: Callable[[str], bool]
+        """
+
+        self._graphout = graphout
+        self._prefix = type_prefix
+        self._check = checkfunc
+        self._added = set()
+
+    @property
+    def included_attrs(self):
+        return self._added
+
+    def __call__(self, attr_name, attr_type):
+        if self._check(attr_name):
+            idpair = u'{}-{}'.format(self._prefix, attr_name), attr_name
+            self._added.add(idpair)
+            self._graphout.add_attribute_type(*idpair, type_=attr_type)
diff --git a/plwn/utils/sorting.py b/plwn/utils/sorting.py
new file mode 100644
index 0000000000000000000000000000000000000000..6f4a6878ce75ae1b547854e4da75bce8b4116449
--- /dev/null
+++ b/plwn/utils/sorting.py
@@ -0,0 +1,22 @@
+"""
+Sorting keys that provide locale-dependent alphabetical sorting.
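+
+Note that ``locale.strxfrm`` respects the current ``LC_COLLATE`` setting, so
+the caller is expected to have set a locale first, e.g. with
+``locale.setlocale(locale.LC_COLLATE, '')``.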
+""" + +from __future__ import absolute_import, division + + +import locale + +import six + + +__all__ = 'text_key', + + +if six.PY2: + # Since Python 2 does not support strxfrm for unicode, encode it in UTF-8 + # before transforming. + def text_key(text): + return locale.strxfrm(text.encode('UTF-8')) +else: + text_key = locale.strxfrm diff --git a/plwn/utils/tupwrap.py b/plwn/utils/tupwrap.py new file mode 100644 index 0000000000000000000000000000000000000000..49c2ca512967e785f59c92101b5c32f5980ce8cb --- /dev/null +++ b/plwn/utils/tupwrap.py @@ -0,0 +1,51 @@ +"""Wrapper for all functions that return generators, calling the wrapped +generator will wrap the contents in a tuple (as a faster, chaining way or +``tuple(generator)``). +""" + +from __future__ import absolute_import, unicode_literals, division + + +from functools import wraps + + +__all__ = 'TupWrapper', 'tup_wrapped' + + +class TupWrapper(object): + """Wrapper class for generator objects. + + Adds a ``__call__`` method which will convert the wrapped generator to + a tuple. + """ + + __slots__ = '_gen', + + def __init__(self, generator): + self._gen = generator + + def __iter__(self): + return self._gen + + def __call__(self): + return tuple(self._gen) + + def __repr__(self): + return '{}({!r})'.format(self.__class__.__name__, self._gen) + + +def tup_wrapped(fn): + """Decorator for functions that return generators. + + The return value of the wrapped function will be wrapped by + :class:`TupWrapper`. + + This decorator is the only way to wrap around the output of generator + functions. + """ + + @wraps(fn) + def decorated(*args, **kwargs): + return TupWrapper(fn(*args, **kwargs)) + + return decorated diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000000000000000000000000000000000000..861a9f554263efb088d8636c4f17a30696e495ad --- /dev/null +++ b/setup.cfg @@ -0,0 +1,5 @@ +[egg_info] +tag_build = +tag_date = 0 +tag_svn_revision = 0 + diff --git a/setup.py b/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..eaee1df3c4f66e925b9a9b40a6ed71b464debb3d --- /dev/null +++ b/setup.py @@ -0,0 +1,28 @@ +# coding: utf8 +from setuptools import setup, find_packages +import sys + + +def install_requires(): + req = ['six>=1.10'] + # Only require enum backport in python2 (python3 has better stdlib) + if sys.version_info.major < 3: + req.append('enum34>=1.1.2') + return req + + +if __name__ == '__main__': + setup( + name='PLWN_API', + version='0.9', + description='Python API to access plWordNet lexicon', + + author='Michał Kaliński', + author_email='michal.kalinski@pwr.edu.pl', + + packages=find_packages(exclude=['tests', 'tests.*']), + package_data={'plwn': ['relation_aliases.tsv']}, + test_suite='tests.setuptools_loader.setuptools_load_tests', + + install_requires=install_requires(), + )