Commit f492369b authored by leszeks

Initial commit

parent bf7d8539
Showing 2553 additions and 0 deletions
include README-pl-beta.txt
PKG-INFO 0 → 100644
Metadata-Version: 1.0
Name: PLWN_API
Version: 0.9
Summary: Python API to access plWordNet lexicon
Home-page: UNKNOWN
Author: Michał Kaliński
Author-email: michal.kalinski@pwr.edu.pl
License: UNKNOWN
Description: UNKNOWN
Platform: UNKNOWN
Metadata-Version: 1.0
Name: PLWN-API
Version: 0.9
Summary: Python API to access plWordNet lexicon
Home-page: UNKNOWN
Author: Michał Kaliński
Author-email: michal.kalinski@pwr.edu.pl
License: UNKNOWN
Description: UNKNOWN
Platform: UNKNOWN
MANIFEST.in
README-pl-beta.txt
setup.py
PLWN_API.egg-info/PKG-INFO
PLWN_API.egg-info/SOURCES.txt
PLWN_API.egg-info/dependency_links.txt
PLWN_API.egg-info/requires.txt
PLWN_API.egg-info/top_level.txt
plwn/__init__.py
plwn/_loading.py
plwn/bases.py
plwn/enums.py
plwn/exceptions.py
plwn/relation_aliases.tsv
plwn/relresolver.py
plwn/readers/__init__.py
plwn/readers/comments.py
plwn/readers/nodes.py
plwn/readers/ubylmf.py
plwn/readers/wndb.py
plwn/readers/wnxml.py
plwn/storages/__init__.py
plwn/storages/objects.py
plwn/storages/sqlite.py
plwn/utils/__init__.py
plwn/utils/graphmlout.py
plwn/utils/sorting.py
plwn/utils/tupwrap.py
\ No newline at end of file
six>=1.10
enum34>=1.1.2
plwn
******************
**** PlWN API ****
******************
PlWN API makes it possible to:
- search for synsets and lexical units in plWordNet;
- access the properties of synsets and lexical units, as well as their relations;
- export the whole of plWordNet, or a part of it, to a graph.
This README is a short, provisional guide to the beta version of PlWN API.
The interface and the functionality may change.
==============
Initialization
==============
>>> import plwn
>>> wn = plwn.load('plwn-3.0.db', 'sqlite3')
==============
Database dumps
==============
For now, the recommended way of accessing the plWordNet database is through
dumps to SQLite files, "plwn-X.db" (where X is the plWordNet version). The
dumps should have been made available together with the API.
=============
Functionality
=============
The functionality is described by the docstrings of the plwn/bases.py module:
$ pydoc plwn.bases
Additionally, plwn/_pos.py contains the list of constant part-of-speech
values used by the API.
Following the convention adopted by Python 3, most methods that return
collections of objects return them as generators.
>>> wn.lexical_units(lemma=u'pies')
TupWrapper(<generator object <genexpr> at 0x7f1048583410>)
If the goal of a query is just to iterate over the results, nothing more is needed.
>>> for lu in wn.lexical_units(lemma=u'pies'):
...     print(lu.id)
5563
52245
...
To obtain a list (or another collection), cast the returned object.
>>> list(wn.lexical_units(lemma=u'pies'))
[<LexicalUnit id=5563 lemma=u'pies' pos=u'noun' variant=1>,
<LexicalUnit id=52245 lemma=u'pies' pos=u'noun' variant=2>,
...
]
For convenience in Python's interactive mode, the generators are wrapped in
"TupWrapper" objects, which allow casting a generator to a tuple by
"calling" the object.
>>> wn.lexical_units(lemma=u'pies')()
(<LexicalUnit id=5563 lemma=u'pies' pos=u'noun' variant=1>,
<LexicalUnit id=52245 lemma=u'pies' pos=u'noun' variant=2>,
...
)
However, when writing programs against the API, explicit casting of the
returned generators is recommended. "Explicit is better than implicit."
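For example, a program might cast the results once and then work with the
list (a minimal sketch, using only the attributes shown above):
>>> units = list(wn.lexical_units(lemma=u'pies'))
>>> ids = [lu.id for lu in units]
>>> variants = [lu.variant for lu in units]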
from ._loading import *
from .enums import PoS
# Set up logging for the package: a NullHandler keeps it silent by default
import logging as _logging
_logging.getLogger('plwn').addHandler(_logging.NullHandler())
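# Sketch: applications that do want to see plwn's log messages can attach
# their own handler to the standard logging machinery, e.g.:
#     import logging
#     logging.getLogger('plwn').addHandler(logging.StreamHandler())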
"""Defines user-facing functions that allow simple construction of
:class:`PLWordnetBase` instances, with selected storages and readers.
"""
from __future__ import absolute_import, division, print_function
from collections import namedtuple
from importlib import import_module
import textwrap as tw
import six
__all__ = 'read', 'load', 'show_source_formats', 'show_storage_formats'
_Info = namedtuple('_Info', ('desc', 'modname'))
_READERS = {
'uby-lmf': _Info('Discontinued XML-based format', 'ubylmf'),
'database': _Info(
'MySQL database of plWordNet. Only works on python 2 and requires '
'certain additional libraries. This is meant for internal use only '
'and will not work for most users. The file should contain one line '
'with SQLAlchemy URL to the database.',
'wndb',
),
'xml': _Info('The official PLWN XML format', 'wnxml'),
}
_STORAGES = {
'sqlite3': _Info(
'File database format, with a compact schema (compared to internal '
'PLWN database).',
'sqlite',
),
'objects': _Info(
'Stores data in plain python objects, dumping them in pickle format. '
'Quick to construct, but querying and memory efficiency is not '
'guaranteed.',
'objects',
),
}
# Defaults for this version
_READERS[None] = _READERS['xml']
_STORAGES[None] = _STORAGES['sqlite3']
def _imp_reader(modname):
# Pre-import the root package - py3 needs this?
import plwn.readers # noqa
return import_module('.' + modname, 'plwn.readers')._this_reader_
def _imp_storage(modname):
# Pre-import the root package - py3 needs this?
import plwn.storages # noqa
return import_module('.' + modname, 'plwn.storages')._this_storage_
def read(source_file,
source_format=None,
storage_file=None,
storage_format=None):
"""Read plWordNet data from a file and return the right
:class:`PLWordnetBase` subclass instance for the selected parameters.
Where defaults are mentioned, those values may change with each minor
version of PLWN API. If you depend on a particular format in a long-running
program, state it explicitly.
:param str source_file: Path to the file from which the plWordNet data will
be read. The required contents of the file depend on selected
``source_format``.
:param str source_format: Name of the format of data that's contained in
``source_file``. If ``None``, then the default for the current version will
be chosen.
:param str storage_file: Path to the file where the internal representation
of the storage will be dumped. It will be possible to load this file using
:func:`load`. If ``None``, then the representation will not be dumped.
:param str storage_format: Name of the format in which PLWN API will store
data in memory. Access methods provided should be the same, but their
efficiency may differ. If ``None``, then the default for the current
version will be chosen.
:rtype: PLWordnetBase
"""
stor_cls = _imp_storage(_STORAGES[storage_format].modname)
rdr = _imp_reader(_READERS[source_format].modname)
return stor_cls.from_reader(rdr(source_file), storage_file)
def load(storage_file, storage_format=None):
"""Read plWordNet data from a cached file with internal PLWN API
representation.
This function is much faster than :func:`read` if such a file is available.
:param str storage_file: Path to the file from which the cached data will
be read.
:param str storage_format: Name of the format the data is stored in. It
must match the actual format and version of schema contained in the file.
:rtype: PLWordnetBase
"""
stor_cls = _imp_storage(_STORAGES[storage_format].modname)
return stor_cls.from_dump(storage_file)
def show_source_formats():
"""Print names and short descriptions of available source file formats to
``stdout``.
This function is primarily meant to be informative in interactive shell
mode.
"""
_show(_READERS)
def show_storage_formats():
"""Print names and short descriptions of available storage formats to
``stdout``.
This function is primarily meant to be informative in interactive shell
mode.
"""
_show(_STORAGES)
def _show(dict_):
for name, info in six.iteritems(dict_):
if name is None:
continue
print(name)
print('-' * len(name))
print(tw.fill(info.desc), end='\n\n')
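# Usage sketch (the file names are hypothetical): convert a source dump into
# a storage file once, then load the faster cached form afterwards.
#
#     import plwn
#     wn = plwn.read('plwn.xml', 'xml',
#                    storage_file='plwn.db', storage_format='sqlite3')
#     wn = plwn.load('plwn.db', 'sqlite3')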
# coding: utf8
"""
Enumerated values used in plWordNet
"""
from __future__ import absolute_import, division
import re
from enum import Enum
import six
__all__ = (
'PoS',
'VerbAspect',
'EmotionMarkedness',
'EmotionName',
'EmotionValuation',
'Domain',
'make_values_tuple',
)
# Helper function for making dictionaries that translate enum instances into
# the numbers used to denote them in the plWN database.
def _fill_numtrans(enumclass, num2enum, enum2num):
for num, enuminst in enumerate(enumclass, 1):
num2enum[num] = enuminst
enum2num[enuminst] = num
def _get_from_numtrans(numtrans, num, optional):
try:
return numtrans[num]
except KeyError:
if optional:
return None
raise
# Explicit ordering is needed only in python 2.
_POS_ORDER = 'verb noun adverb adjective'
_POS_NUM2ENUM = {}
_POS_ENUM2NUM = {}
class PoS(Enum):
"""
Defines **Part of Speech** values used by plWN.
"""
if six.PY2:
__order__ = _POS_ORDER
verb = u'verb'
noun = u'noun'
adverb = u'adverb'
adjective = u'adjective'
v = verb
n = noun
adv = adverb
adj = adjective
@staticmethod
def by_db_number(number, optional=False):
return _get_from_numtrans(_POS_NUM2ENUM, number, optional)
@property
def db_number(self):
return _POS_ENUM2NUM[self]
_fill_numtrans(PoS, _POS_NUM2ENUM, _POS_ENUM2NUM)
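# A quick sketch of the resulting numbering (it follows definition order,
# starting from 1):
#     PoS.by_db_number(1) is PoS.verb
#     PoS.noun.db_number == 2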
_VA_ORDER = 'perfective imperfective predicative two_aspect'
_VA_NUM2ENUM = {}
_VA_ENUM2NUM = {}
class VerbAspect(Enum):
"""
Defines aspect values used by verbs in plWN.
"""
if six.PY2:
__order__ = _VA_ORDER
perfective = u'perf'
imperfective = u'imperf'
predicative = u'pred'
two_aspect = u'imperf.perf'
perf = perfective
imperf = imperfective
pred = predicative
two = two_aspect
# Additionally, some Polish abbreviations
dk = perfective
ndk = imperfective
@staticmethod
def by_db_number(number, optional=False):
return _get_from_numtrans(_VA_NUM2ENUM, number, optional)
@property
def db_number(self):
return _VA_ENUM2NUM[self]
_fill_numtrans(VerbAspect, _VA_NUM2ENUM, _VA_ENUM2NUM)
class EmotionMarkedness(Enum):
"""
Defines markedness of emotions associated with some lexical units.
"""
strong_positive = u'+ m'
strong_negative = u'- m'
weak_positive = u'+ s'
weak_negative = u'- s'
ambiguous = u'amb'
plus_m = strong_positive
minus_m = strong_negative
plus_s = weak_positive
minus_s = weak_negative
amb = ambiguous
@classmethod
def normalized(cls, strvalue):
"""
Return an instance of this enum, with the string value normalized with
respect to whitespace.
"""
strvalue = strvalue.strip()
# Try the one value that doesn't require regex matching
if strvalue == cls.ambiguous.value:
return cls.ambiguous
match = re.match(r'([+-])\s*([sm])', strvalue, re.U)
if not match:
# This can't be a valid string, so let the built-in exception
# raise.
return cls(strvalue)
return cls(match.group(1) + u' ' + match.group(2))
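# For example (a sketch), these all normalize to enum members:
#     EmotionMarkedness.normalized(u'+ m')
#     EmotionMarkedness.normalized(u'+m')
#     EmotionMarkedness.normalized(u'- s ')
# The first two give strong_positive, the last one weak_negative.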
class EmotionName(Enum):
"""
Possible names of emotions associated with some lexical units.
"""
joy = u'radość'
trust = u'zaufanie'
anticipation = u'cieszenie się na coś oczekiwanego'
surprise = u'zaskoczenie czymś nieprzewidywanym'
sadness = u'smutek'
anger = u'złość'
fear = u'strach'
disgust = u'wstręt'
radosc = joy
zaufanie = trust
cieszenie_sie_na = anticipation
zaskoczenie = surprise
smutek = sadness
zlosc = anger
strach = fear
wstret = disgust
class EmotionValuation(Enum):
"""
Possible valuations of emotions associated with some lexical units.
"""
usefulness = u'użyteczność'
good = u'dobro'
truth = u'prawda'
knowledge = u'wiedza'
beauty = u'piękno'
happiness = u'szczęście'
uselessness = u'nieużyteczność'
harm = u'krzywda'
ignorance = u'niewiedza'
error = u'błąd'
ugliness = u'brzydota'
unhappiness = u'nieszczęście'
uzytecznosc = usefulness
dobro = good
prawda = truth
wiedza = knowledge
piekno = beauty
szczescie = happiness
nieuzytecznosc = uselessness
krzywda = harm
niewiedza = ignorance
blad = error
brzydota = ugliness
nieszczescie = unhappiness
_DOM_ORDER = 'bhp czy wytw cech czc umy por zdarz czuj jedz grp msc cel rz ' \
'os zj rsl pos prc il zw ksz st sbst czas zwz hig zmn cumy cpor wal ' \
'cjedz dtk cwytw cczuj ruch pst cpos sp cst pog jak rel odcz grad sys ' \
'adj adv mat cdystr caku cper cdel'
_DOM_NUM2ENUM = {}
_DOM_ENUM2NUM = {}
class Domain(Enum):
"""
Wordnet domains of lexical units.
"""
if six.PY2:
__order__ = _DOM_ORDER
bhp = u'najwyższe w hierarchii'
czy = u'czynności (nazwy)'
wytw = u'wytwory ludzkie (nazwy)'
cech = u'cechy ludzi i zwierząt'
czc = u'części ciała'
umy = u'związane z myśleniem'
por = u'związane z porozumiewaniem się'
zdarz = u'zdarzenia'
czuj = u'uczucia, odczucia i emocje'
jedz = u'jedzenie'
grp = u'grupy ludzi i rzeczy'
msc = u'miejsca i umiejscowienie'
cel = u'cel działania'
rz = u'obiekty naturalne'
os = u'ludzie'
zj = u'zjawiska naturalne'
rsl = u'nazwy roślin'
pos = u'posiadanie i jego zmiana'
prc = u'procesy naturalne'
il = u'ilość, liczebność, jednoski miary'
zw = u'zwierzęta'
ksz = u'kształty'
st = u'sytuacje statyczne (stany)'
sbst = u'substancje'
czas = u'czas i stosunki czasowe'
zwz = u'związek miedzy ludźmi, rzeczami lub ideami'
hig = u'pielęgnacja ciała'
zmn = u'zmiana wielkości, temeraturym natężenia, itp.'
cumy = u'czasowniki myślenia (szeroko rozumianego)'
cpor = u'czasowniki mówienia, śpiewania itp.'
wal = u'czasowniki rywalizacji fizycznej'
cjedz = u'czasowniki jedzenia'
dtk = u'czasowniki oznacz. kontakt fizyczny ' \
u'(dotykanie, uderzenie, rycie itp.)'
cwytw = u'czasowniki oznacz. wytwarzanie czegoś'
cczuj = u'czasowniki wyrażające uczucia'
ruch = u'czasowniki ruchu'
pst = u'czasowniki postrzegania (percepcji)'
cpos = u'czasowniki posiadania i zmiany posiadania'
sp = u'czasowniki oznacz. wydarzenie i działania społeczne i polityczne'
cst = u'czasowniki stanowe'
pog = u'czasowniki oznacz. zjawiska pogodowe'
jak = u'przymiotniki jakościowe'
rel = u'przymiotniki relacyjne (rzeczownikowe)'
odcz = u'przymiotniki odczasownikowe'
grad = u'przymiotniki odprzymiotnikowe (natężenie cechy)'
sys = u'systematyka, klasyfikacja'
adj = u'PWN: all adjective clusters'
adv = u'PWN: all adverbs'
mat = u'przymiotniki materiałowe'
cdystr = u'czasownki dystrybutywne'
caku = u'czasowniki akumulatywne'
cper = u'czasowniki perduratywne'
cdel = u'czasowniki delimitatywne'
@staticmethod
def by_db_number(number, optional=False):
return _get_from_numtrans(_DOM_NUM2ENUM, number, optional)
@property
def db_number(self):
return _DOM_ENUM2NUM[self]
_fill_numtrans(Domain, _DOM_NUM2ENUM, _DOM_ENUM2NUM)
def make_values_tuple(enum_seq):
"""
Auxiliary function that converts a sequence of enums to a tuple of enum
values.
"""
return tuple(en.value for en in enum_seq)
"""Custom exceptions raised by PLWN API."""
from __future__ import absolute_import, division
__all__ = (
'PLWNAPIException',
'NotFound',
'LexicalUnitNotFound',
'SynsetNotFound',
'ReaderException',
'MalformedIdentifierException',
'LoadException',
'DumpVersionException',
'InvalidSynsetIdentifierException',
'InvalidLexicalUnitIdentifierException',
'InvalidRelationNameException',
'InvalidPoSException',
)
class PLWNAPIException(Exception):
"""Base for all exceptions in the module."""
pass
class NotFound(PLWNAPIException):
"""Base for exceptions raised when an object is not found."""
def __init__(self, lemma, pos, variant, *args):
super(NotFound, self).__init__(*args)
self.args = ('lemma={!r} pos={!r} variant={!r}'.format(
lemma,
pos,
variant,
),) + self.args
class LexicalUnitNotFound(NotFound):
"""Raised when a lexical unit is not found during lookup."""
pass
class SynsetNotFound(NotFound):
"""Raised when a synset is not found during lookup."""
pass
class ReaderException(PLWNAPIException):
"""Raised when there's an error in the format expected by a reader."""
pass
class MalformedIdentifierException(ReaderException):
"""Raised during UBY-LMF parsing, when a malformed identifier is
encountered.
"""
def __init__(self, id_):
super(MalformedIdentifierException, self).__init__(
"Malformed identifier, expected digits at the end of the original"
" id instead got {!r}"
.format(id_)
)
class LoadException(PLWNAPIException):
"""Raised when a storage can't be loaded from file."""
pass
class DumpVersionException(LoadException):
"""Raised when a dumped storage has wrong version (suggesting incompatible
format).
"""
def __init__(self, version_is, version_required):
super(DumpVersionException, self).__init__(version_is,
version_required)
self.version_is = version_is
self.version_required = version_required
def __str__(self):
return (
'Invalid schema version of dumped storage: {!r} (should be {!r})'
.format(self.version_is, self.version_required)
)
class InvalidSynsetIdentifierException(PLWNAPIException):
"""Raised when a query for a nonexistent synset ID is made."""
pass
class InvalidLexicalUnitIdentifierException(PLWNAPIException):
"""Raised when a query for a nonexistent lexical unit ID is made."""
pass
class InvalidRelationNameException(PLWNAPIException):
"""Raised when attempting to select synsets or units related by a relation
that does not exist.
"""
pass
class InvalidPoSException(PLWNAPIException):
"""Raised when a query for PoS is made, which is not one of the valid
constants.
"""
pass
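# Usage sketch: all exceptions derive from PLWNAPIException, so callers can
# catch the whole family at once. The lookup call below is hypothetical.
#
#     try:
#         unit = wn.lexical_unit(u'pies', PoS.noun, 1)
#     except NotFound:
#         unit = None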
"""Parsing strings in wordnet comment format, for readers that need to deal
with them.
Importing this module introduces dependency on wncomments.
"""
from __future__ import absolute_import, division
from collections import namedtuple
import itertools as itt
import plwn_comments as plwnc
import plwn_comments.exceptions as plwnce
import plwn_comments.utils.usage_tags as plwncu
__all__ = (
'WN_TAGS',
'NON_EXAMPLE_TAG_NAMES',
'CommentData',
'parse_comment_string',
)
#: :class:`plwn_comments.TagBank` structure that defines all kinds of comment
#: tags which are needed by PLWN API.
WN_TAGS = plwnc.TagBank()
# Usage notes
WN_TAGS.define(u'K')
# External links
WN_TAGS.define(u'L', u'{')
# Definition
WN_TAGS.define(u'D')
#: The distinction for these tags is useful, since all examples go to one
#: place.
NON_EXAMPLE_TAG_NAMES = frozenset((u'K', u'L', u'D'))
# And define those example tags
WN_TAGS.define_from(
plwncu.iter_usage_tags(),
plwncu.DEFAULT_USAGE_TAG_SURROUND,
)
#: Data tuple returned from :func:`parse_comment_string`.
CommentData = namedtuple(
'CommentData',
('examples', 'examples_sources', 'definition', 'usage', 'links'),
)
def parse_comment_string(cmt_str):
"""Parse a comment string and extract all data required by PLWN API packed
in a named tuple.
:param str cmt_str: String in PLWN comment format.
:returns: Extracted and ordered items needed by PLWN API.
:rtype: CommentData
"""
try:
cmt = plwnc.Comment.parse(cmt_str, WN_TAGS)
except plwnce.PLWNCommentsException:
# For now just make an empty comment which will make all fields unset
cmt = plwnc.Comment(WN_TAGS)
# Get all examples
examples = []
examples_src = []
for tagname, tagcontents in cmt.items():
if tagname not in NON_EXAMPLE_TAG_NAMES:
examples.extend(tagcontents)
examples_src.extend(itt.repeat(tagname, len(tagcontents)))
return CommentData(
tuple(examples),
tuple(examples_src),
cmt.get_first(u'D'),
tuple(cmt[u'K']),
tuple(cmt[u'L']),
)
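# Sketch of the returned CommentData fields (cmt is a hypothetical comment
# string in the plWordNet format):
#
#     data = parse_comment_string(cmt)
#     data.definition        # first D tag: the definition
#     data.usage             # tuple of K tags: usage notes
#     data.links             # tuple of L tags: external links
#     data.examples          # tuple of texts from all example tags
#     data.examples_sources  # parallel tuple of source tag names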
"""Those tuples are returned by readers and absorbed by storages."""
from collections import namedtuple
__all__ = 'SynsetNode', 'LexicalUnitNode'
SynsetNode = namedtuple("SynsetNode", ["id", "definition", "related"])
LexicalUnitNode = namedtuple(
"LexicalUnitNode",
["id", "lemma", "pos", "variant", "synset", "unit_index", "definition",
"usage_notes", "external_links", "examples", "examples_sources",
"domain", "related", "verb_aspect", "emotion_markedness", "emotion_names",
"emotion_valuations", "emotion_example_1", "emotion_example_2"]
)
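# A minimal sketch of a reader-produced record; the values are made up.
# Entries of "related" are (relation name, target id) pairs.
#
#     SynsetNode(id=1, definition=u'', related=((u'hiponimia', 2),))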
# FIXME Some assert statements should be converted to regular raises (asserts
# should not be used for anything other than checking for errors in the code
# itself).
from xml.etree import ElementTree
import re
import logging
from .nodes import SynsetNode, LexicalUnitNode
from .. import exceptions as exc
from ..enums import PoS, Domain
__all__ = 'ubylmf_reader',
ENCODING = 'utf-8'
_logger = logging.getLogger(__name__)
def ubylmf_reader(ubylmf_file):
"""Read PLwordnet iteratively, element by element.
:param ubylmf_file: the name of UMY-LMF file or an opened file itself.
:type ubylmf_file: str or file
:return: a generator over PLwordnet entities.
:rtype: generator
"""
context = ElementTree.iterparse(ubylmf_file) # catch only end events
context = iter(context)
# Get root elem in order to clear it after reading each elem
try:
_, root = next(context)
except StopIteration:
raise exc.ReaderException('The xml file is empty')
# Generate wordnet's elements
for _, elem in context:
entities = []
# Parse entities
if elem.tag == "LexicalEntry":
for xml_sense in elem.findall("Sense"):
# Don't move this before the if - we still want to generate tuples
# even if one sense is broken.
try:
entities.append(_make_lexicalunit(elem, xml_sense))
except Exception:
_logger.exception(
'\n%s\nIN ELEMENT\n%s',
ElementTree.tostring(xml_sense, ENCODING),
ElementTree.tostring(elem, ENCODING)
)
elif elem.tag == "Synset":
try:
entities.append(_make_synset(elem))
except Exception:
_logger.exception('\n%s', ElementTree.tostring(elem, ENCODING))
# Return entities
if entities:
root.clear()
for entity in entities:
yield entity
def _make_lexicalunit(xml_lexicalentry, xml_sense):
"""Return a lexical unit built from an xml element.
:param xml_lexicalentry: an xml element of LexicalUnit read from a file.
:type xml_lexicalentry: xml.etree.ElementTree.Element
:param xml_sense: <Sense> element that belongs to the LexicalUnit
:type xml_sense: xml.etree.ElementTree.Element
:return: a named tuple LexicalUnitNode
:rtype: LexicalUnitNode
"""
# Get id, synset and variant
lu_id = _extract_id(xml_sense.get("id"))
lu_synset = _extract_id(xml_sense.get("synset"))
lu_variant = int(xml_sense.get("index"))
# Get lemma
xml_lemma = xml_lexicalentry.find("Lemma").find("FormRepresentation")
lu_lemma = xml_lemma.get("writtenForm")
assert lu_lemma, "Lemma is empty"
# Get PoS
lu_pos = xml_lexicalentry.get("partOfSpeech")
assert lu_pos, "PoS is empty"
# Get definition - can be empty! At most 2
lu_definition, lu_usage_notes, lu_external_links = \
_extract_definitions(xml_sense)
# Get usage examples
lu_examples = []
lu_examples_sources = []
for xe in xml_sense.findall("SenseExample"):
example = xe.find("TextRepresentation").get("writtenText").strip()
if example:
exm_src_match = re.search(r'\[##([-\w]+):?\]$', example, re.U)
if exm_src_match is not None:
lu_examples.append(example[:exm_src_match.start(0)])
lu_examples_sources.append(exm_src_match.group(1))
else:
_logger.warning("Malformed sense example: %s", example)
# Get semantic labels
lu_domain = _get_domain(xml_sense)
# Get related
lu_related = []
for xsr in xml_sense.findall("SenseRelation"):
try:
lu_related.append(
(xsr.get("relName"), _extract_id(xsr.get("target")))
)
except exc.MalformedIdentifierException:
_logger.exception(
'\n%s\nIN ELEMENT\n%s\nThis relation is skipped.',
ElementTree.tostring(xsr, ENCODING),
ElementTree.tostring(xml_sense, ENCODING)
)
# Get unit index
lu_unit_index = int(_extract_id(
xml_sense.find("MonolingualExternalRef").get("externalReference"))
)
return LexicalUnitNode(
id=lu_id,
lemma=lu_lemma,
pos=PoS(lu_pos),
synset=lu_synset,
variant=lu_variant,
unit_index=lu_unit_index,
definition=lu_definition,
usage_notes=tuple(lu_usage_notes),
external_links=tuple(lu_external_links),
examples=tuple(lu_examples),
examples_sources=tuple(lu_examples_sources),
# The domain label is in format <pos>.<lang>_<name>; the last one is
# the only one we care about.
domain=Domain[lu_domain.rsplit('_', 1)[-1]],
related=tuple(lu_related),
# The below properties are never stored in uby files (at present at
# least).
verb_aspect=None,
emotion_markedness=None,
emotion_names=(),
emotion_valuations=(),
emotion_example_1=None,
emotion_example_2=None,
)
def _extract_definitions(xml_sense):
"""Extract a definition, notes and links of a LU from <Definition> tags.
:param xml_sense: <Sense> element read from an xml file.
:type xml_sense: xml.etree.ElementTree.Element
:return: the definition, usage notes and external links of the LU.
:rtype: (str or unicode, tuple, tuple)
:raises AssertionError: if there are more than 2 <Definition> tags.
"""
# Get definition - can be empty! At most 2
xml_definitions = xml_sense.findall("Definition")
lu_definition = ""
lu_usage_notes = []
lu_external_links = []
assert len(xml_definitions) <= 2, \
"Too many definitions ({:})".format(len(xml_definitions))
# There is at least one <Definition>
if xml_definitions:
children = list(xml_definitions[0])
# Check whether the first child is the real definition
if children[0].tag == "TextRepresentation":
lu_definition = children[0].get("writtenText")
# <Statement> - the rest of children
children = list(xml_definitions[1]) \
if len(xml_definitions) == 2 else []
# Get additional info
for child in children:
if child.get("statementType", "") == "usageNote":
lu_usage_notes.append(
child.find("TextRepresentation").get("writtenText")
)
if child.get("statementType", "") == "externalReference":
lu_external_links.append(
child.find("TextRepresentation").get("writtenText")
)
return lu_definition, lu_usage_notes, lu_external_links
def _get_domain(xml_sense):
"""Extract a domain of a LU from <SemanticLabel> tags.
:param xml_sense: <Sense> element read from an xml file.
:type xml_sense: xml.etree.ElementTree.Element
:return: the domain of the LU.
:rtype: str or unicode
:raises AssertionError: if there is more than one <SemanticLabel> tag,
no tag at all, or its type is different from domain.
"""
xml_semantic_labels = xml_sense.findall("SemanticLabel")
assert len(xml_semantic_labels) == 1, \
"{:} SemanticLabel found, should be 1".format(len(xml_semantic_labels))
assert xml_semantic_labels[0].get("type", "") == "domain", \
("SemanticLabel has type {:} instead of domain"
"").format(xml_semantic_labels[0].get("type").encode(ENCODING))
return xml_semantic_labels[0].get("label")
def _make_synset(xml_synset):
"""Return a synset built from an xml element.
:param xml_synset: an xml element of Synset read from a file.
:type xml_synset: xml.etree.Element
:return: a named tuple SynsetNode
:rtype: SynsetNode
"""
s_id = _extract_id(xml_synset.get("id"))
xml_def = xml_synset.find("Definition")
s_def = xml_def.find("TextRepresentation").get("writtenText") \
if xml_def is not None else ""
s_related = []
for xsr in xml_synset.findall("SynsetRelation"):
try:
s_related.append(
(xsr.get("relName"), _extract_id(xsr.get("target")))
)
except exc.MalformedIdentifierException:
_logger.exception(
'\n%s\nIN ELEMENT\n%s\nThis relation is skipped.',
ElementTree.tostring(xsr, ENCODING),
ElementTree.tostring(xml_synset, ENCODING)
)
return SynsetNode(
id=s_id,
definition=s_def,
related=tuple(s_related)
)
def _extract_id(full_id):
"""Extract only numerical identifier from the end of a full id.
:param full_id: a full identifier that has a prefix before the real id.
:type full_id: str|unicode
:return: a real, numerical id.
:rtype: int
:raises MalformedIdentifierException: if the original id doesn't end with
digits.
"""
try:
return int(re.findall(r"\d+$", full_id)[0])
except IndexError:
raise exc.MalformedIdentifierException(full_id)
_this_reader_ = ubylmf_reader
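# A sketch of _extract_id behaviour (the prefixes are made up):
#
#     _extract_id(u'plWN_Sense_12345')   -> 12345
#     _extract_id(u'no-trailing-digits') -> raises MalformedIdentifierException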
# coding: utf8
from __future__ import absolute_import, division
import collections as coll
import contextlib as ctxl
import logging
import sqlalchemy as sa
from .nodes import SynsetNode, LexicalUnitNode
from .comments import parse_comment_string
from ..enums import (
PoS,
VerbAspect,
EmotionMarkedness,
EmotionName,
EmotionValuation,
Domain,
)
from ..utils.sorting import text_key
__all__ = 'wndb_reader',
_log = logging.getLogger(__name__)
_EmotionData = coll.namedtuple(
'_EmotionData',
('mark', 'names', 'valuations', 'example_1', 'example_2'),
)
def wndb_reader(wordnet_db_url):
"""Generate UBY-LMF format compatible records directly from plWordNet
database.
sqlalchemy is required for this method to work.
:param str wordnet_db_url: URL in sqlalchemy format, pointing to a
plWordNet database.
:return: a generator over plWordNet entities.
:rtype: generator
"""
db_eng = sa.create_engine(wordnet_db_url)
db_meta = sa.MetaData(db_eng)
visited_synsets = set()
nonexistent_synsets = set()
# Define required tables
dbt_synset = sa.Table(u'synset', db_meta, autoload=True)
dbt_synrel = sa.Table(u'synsetrelation', db_meta, autoload=True)
dbt_reltype = sa.Table(u'relationtype', db_meta, autoload=True)
dbt_lexunit = sa.Table(u'lexicalunit', db_meta, autoload=True)
dbt_lexrel = sa.Table(u'lexicalrelation', db_meta, autoload=True)
dbt_uns = sa.Table(u'unitandsynset', db_meta, autoload=True)
dbt_emo = sa.Table(u'emotion', db_meta, autoload=True)
q = sa.select((
dbt_lexunit.c.ID,
dbt_lexunit.c.lemma,
dbt_lexunit.c.pos,
dbt_lexunit.c.variant,
dbt_uns.c.SYN_ID,
dbt_uns.c.unitindex,
dbt_lexunit.c.domain,
dbt_lexunit.c.comment,
dbt_lexunit.c.verb_aspect,
)).select_from(
dbt_lexunit.join(
dbt_uns,
dbt_uns.c.LEX_ID == dbt_lexunit.c.ID,
)
).where(dbt_lexunit.c.pos.between(1, 4))
with ctxl.closing(db_eng.execute(q)) as result:
for lexid, lemma, pos, variant, synid, uidx, domain, comment,\
verb_aspect in result:
if synid in nonexistent_synsets:
continue
# Select all relation children of the unit
q = sa.select(
(dbt_lexrel.c.CHILD_ID, dbt_reltype.c.name)
).select_from(
dbt_lexrel.join(
dbt_reltype,
dbt_reltype.c.ID == dbt_lexrel.c.REL_ID,
)
).where(dbt_lexrel.c.PARENT_ID == lexid)
with ctxl.closing(db_eng.execute(q)) as lex_rel_result:
# Ensure relations targets exist
lex_related = []
for lex_child_id, lex_rel_name in lex_rel_result:
q = sa.select((
sa.exists().select_from(
# This join ensures the unit belongs to
# some synset.
dbt_lexunit.join(
dbt_uns,
dbt_uns.c.LEX_ID == dbt_lexunit.c.ID,
)
).where(sa.and_(
dbt_lexunit.c.ID == lex_child_id,
dbt_lexunit.c.pos.between(1, 4),
)),
))
if db_eng.execute(q).scalar():
lex_related.append((lex_rel_name, lex_child_id))
# Now, select the unit's synset, but only once
if synid not in visited_synsets:
visited_synsets.add(synid)
q = sa.select(
(dbt_synset.c.ID, dbt_synset.c.definition)
).where(dbt_synset.c.ID == synid)
synrow = db_eng.execute(q).first()
if synrow is None:
nonexistent_synsets.add(synid)
continue
# Select all relation children of the synset
q = sa.select(
(dbt_synrel.c.CHILD_ID, dbt_reltype.c.name)
).select_from(
dbt_synrel.join(
dbt_reltype,
dbt_reltype.c.ID == dbt_synrel.c.REL_ID,
)
).where(dbt_synrel.c.PARENT_ID == synid)
with ctxl.closing(db_eng.execute(q)) as syn_rel_result:
syn_related = []
for syn_child_id, syn_rel_name in syn_rel_result:
# Ensure the child exists
q = sa.select((
sa.exists().select_from(
dbt_synset.join(
dbt_uns,
dbt_uns.c.SYN_ID == dbt_synset.c.ID,
).join(
dbt_lexunit,
dbt_lexunit.c.ID == dbt_uns.c.LEX_ID,
)
).where(sa.and_(
dbt_synset.c.ID == syn_child_id,
dbt_lexunit.c.pos.between(1, 4),
)),
))
if db_eng.execute(q).scalar():
syn_related.append((syn_rel_name, syn_child_id))
yield SynsetNode(
synid,
synrow[1] if synrow[1] is not None else u'',
tuple(syn_related),
)
# Try getting emotion annotations for the unit
emo_data = _extract_emotion_data(db_eng, dbt_emo, lexid)
# Now, parse the comment string to get some last pieces of data
cmt_data = parse_comment_string(comment
if comment is not None
else u'')
yield LexicalUnitNode(
id=lexid,
lemma=lemma,
pos=PoS.by_db_number(pos),
variant=variant,
synset=synid,
unit_index=uidx,
definition=cmt_data.definition,
usage_notes=cmt_data.usage,
external_links=cmt_data.links,
examples=cmt_data.examples,
examples_sources=cmt_data.examples_sources,
# XXX Since domains are defined as strings, the int is cast
# to unicode. It's possible, in the future to add a
# translation dict to textual representations.
domain=Domain.by_db_number(domain),
related=tuple(lex_related),
verb_aspect=VerbAspect.by_db_number(verb_aspect, True),
emotion_markedness=EmotionMarkedness.normalized(emo_data.mark)
if emo_data.mark is not None else None,
emotion_names=_make_enum_tuple(
EmotionName,
sorted(emo_data.names, key=text_key),
),
emotion_valuations=_make_enum_tuple(
EmotionValuation,
sorted(emo_data.valuations, key=text_key),
),
emotion_example_1=emo_data.example_1,
emotion_example_2=emo_data.example_2,
)
def _extract_emotion_data(db_eng, db_t_emo, unit_id):
q_emo = sa.select((
db_t_emo.c.markedness, # XXX Typo in schema
db_t_emo.c.emotions,
db_t_emo.c.valuations,
db_t_emo.c.example1,
db_t_emo.c.example2,
db_t_emo.c.unitStatus,
)).where(db_t_emo.c.lexicalunit_id == unit_id).order_by(
# "super_anotation" is a boolean 0 or 1, so descending sort will put
# the super annotation first.
db_t_emo.c.super_anotation.desc() # XXX Typo in schema
)
mark = None
names = set()
valuations = set()
example_1 = None
example_2 = None
with ctxl.closing(db_eng.execute(q_emo)) as result:
for row in result:
if not row[db_t_emo.c.unitStatus]:
return _EmotionData(
mark=None,
names=(),
valuations=(),
example_1=None,
example_2=None,
)
if mark is None:
mark = row[db_t_emo.c.markedness]
if example_1 is None:
example_1 = row[db_t_emo.c.example1]
if example_2 is None:
example_2 = row[db_t_emo.c.example2]
row_names = row[db_t_emo.c.emotions]
if row_names is not None:
names.update(
word.strip()
for word in row_names.split(u';')
)
row_valuations = row[db_t_emo.c.valuations]
if row_valuations is not None:
valuations.update(
word.strip()
for word in row_valuations.split(u';')
)
return _EmotionData(
mark=mark,
names=names,
valuations=valuations,
example_1=example_1,
example_2=example_2,
)
def _make_enum_tuple(enumtype, source):
result = []
for item in source:
try:
val = enumtype(item)
except ValueError:
_log.warning('Omitting bad value %r of enum %r', item, enumtype)
else:
result.append(val)
return tuple(result)
_this_reader_ = wndb_reader
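# Usage sketch (the database URL is hypothetical; normally _loading plumbs
# this reader into a storage):
#
#     for node in wndb_reader(u'mysql://user:password@host/plwordnet'):
#         ...  # interleaved SynsetNode and LexicalUnitNode records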
# coding: utf8
from __future__ import absolute_import, division
from collections import defaultdict
import itertools as itt
import logging
import xml.etree.ElementTree as et
import six
from .comments import parse_comment_string
from .nodes import SynsetNode, LexicalUnitNode
from ..enums import PoS, Domain
__all__ = 'wnxml_reader',
_log = logging.getLogger(__name__)
_POSES = {
u'rzeczownik': PoS.n,
u'czasownik': PoS.v,
u'przymiotnik': PoS.adj,
u'przysłówek': PoS.adv,
}
# Since etree may return either unicode or byte strings, all strings returned
# by its interfaces are wrapped with six.text_type
def wnxml_reader(wnxml_file):
"""Generate plWordNet records from the official XML file.
:param str wnxml_file: Path to the plWordNet XML file to read from.
:return: a generator over plWordNet entities.
:rtype: generator
"""
# The regrettably huge global storage for yielding
synsets = {}
lexunits = {}
synid_n_lexids = []
reltypes_syn = {}
reltypes_lex = {}
# These need defaults to add instances to parent syn / lex
synrels = defaultdict(list)
lexrels = defaultdict(list)
# Now, parse everything
for _, elem in et.iterparse(wnxml_file):
if elem.tag == u'lexical-unit':
_make_lexunit(elem, lexunits)
elif elem.tag == u'synset':
_make_synset(elem, synsets, synid_n_lexids)
elif elem.tag == u'relationtypes':
_make_reltype(elem, reltypes_syn, reltypes_lex)
elif elem.tag == u'synsetrelations':
_make_rel(elem, synrels)
elif elem.tag == u'lexicalrelations':
_make_rel(elem, lexrels)
# Finalize units to synsets mapping
_make_units2synsets(lexunits, synid_n_lexids)
# Now complete synsets and lexunits with relations and yield
for node in itt.chain(
_make_gen(synsets, synrels, reltypes_syn),
_filter_nosynset(_make_gen(lexunits, lexrels, reltypes_lex)),
):
yield node
_this_reader_ = wnxml_reader
def _make_lexunit(lu_node, lu_dict):
# Only words with Polish PoSes will be remembered
xmlpos = six.text_type(lu_node.get(u'pos'))
if xmlpos not in _POSES:
return
lu_id = int(lu_node.get(u'id'))
cmt_data = parse_comment_string(six.text_type(lu_node.get(u'desc')))
# Create a temporary object which will be filled in later
lu_dict[lu_id] = LexicalUnitNode(
id=lu_id,
lemma=six.text_type(lu_node.get(u'name')),
pos=_POSES[xmlpos],
variant=int(lu_node.get(u'variant')),
synset=None,
unit_index=None,
definition=cmt_data.definition,
usage_notes=cmt_data.usage,
external_links=cmt_data.links,
examples=cmt_data.examples,
examples_sources=cmt_data.examples_sources,
domain=Domain[lu_node.get(u'domain')],
related=None,
# The below properties are not stored in wnxml (at least at present)
verb_aspect=None,
emotion_markedness=None,
emotion_names=(),
emotion_valuations=(),
emotion_example_1=None,
emotion_example_2=None,
)
def _make_synset(syn_node, syn_dict, snu_list):
# Only take non-abstract synsets
if six.text_type(syn_node.get(u'abstract')) != u'false':
return
synid = int(syn_node.get(u'id'))
# Assign lexical units to synsets they belong to.
snu_list.append((synid, [int(uid_node.text)
for uid_node in syn_node.iter(u'unit-id')]))
# As with lexunits, the related field is not yet filled
syn_dict[synid] = SynsetNode(
synid,
six.text_type(syn_node.get(u'definition')),
None,
)
def _make_units2synsets(lu_dict, snu_list):
for synid, lexids in snu_list:
for uidx, uid in enumerate(lexids):
try:
lu = lu_dict[uid]
except KeyError:
_log.warning(
'Unit %d from synset %d does not exist',
uid,
synid,
)
else:
lu_dict[uid] = lu._replace(synset=synid, unit_index=uidx)
# Relation types are referred to by descriptive names
_RELTYPE_SYN = u'relacja pomiędzy synsetami'
_RELTYPE_LEX = u'relacja leksykalna'
def _make_reltype(reltype_node, synreltype_dict, lureltype_dict):
relid = int(reltype_node.get(u'id'))
typestr = reltype_node.get(u'type')
if typestr == _RELTYPE_SYN:
the_dict = synreltype_dict
elif typestr == _RELTYPE_LEX:
the_dict = lureltype_dict
else:
# There is one more relation type, synonymy, but it's artificial
return
# Remember the name, so that it will be inserted into the reltype storages
the_dict[relid] = six.text_type(reltype_node.get(u'name'))
# Relations are put into dicts indexed by parent IDs, to be later put into
# nodes. One function can handle both types.
def _make_rel(node, reldict):
# Get reltype - drop if unknown
reldict[int(node.get(u'parent'))].append((
int(node.get(u'child')),
# Reltypes should be returned by names, not IDs
int(node.get(u'relation')),
))
# As with relations, yielding is generic for synsets / lexunits.
# Related IDs need to be added, and unknown ones purged.
def _make_gen(node_dict, rels_dict, reltype_dict):
for node in six.itervalues(node_dict):
related = []
for child_id, rel_id in rels_dict.get(node.id, ()):
try:
relname = reltype_dict[rel_id]
except KeyError:
_log.warning(
'Unknown relation %d (of %s), from %d to %d',
rel_id,
node.__class__.__name__,
node.id,
child_id,
)
continue
# Only remember from the related dict the items whose IDs are in
# the node dict; related pairs are (relation name, target id).
if child_id in node_dict:
related.append((relname, child_id))
yield node._replace(related=related)
# Addendum to _make_gen for lexical units to filter synsetless ones
def _filter_nosynset(lu_node_gen):
for lu_node in lu_node_gen:
if lu_node.synset is None:
_log.warning('Unit %d belongs to no synset', lu_node.id)
else:
yield lu_node
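# Usage sketch (the file name is hypothetical):
#
#     for node in wnxml_reader('plwn_dump.xml'):
#         ...  # SynsetNode records first, then LexicalUnitNode records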