Commit 56487d30 authored by Arkadiusz Janz's avatar Arkadiusz Janz
a new package for CCL document manipulation - it's a convenient wrapper for corpus2

parent cce77df0
from ._base import *
""" A set of methods for operating on ccl files, especially for reading and
writing documents.
"""
import os
import corpus2
ENCODING = "utf-8"
__all__ = [
'read',
'write',
'read_from_directory',
'get_tagset',
]
def _read(tagset, ccl_path, rel_path=None):
""" A standard way to read CCL using corpus2. """
reader = corpus2.CclRelReader(
get_tagset(tagset),
ccl_path,
rel_path if rel_path else ccl_path
)
document = reader.read()
del reader
return document
def read(ccl_path, rel_path=None, tagset='nkjp'):
""" Read the given ccl file and return corpus2.Document.
Notes:
Additionally, a rel-ccl file can be provided. Information about relat-
ions is attached to the document object.
Args:
tagset: the name of the tagset that is used in the document or a tagset
object itself.
ccl_path: a path to CCL file
rel_path: a path to REL file.
Returns:
corpus2.Document: The read document.
"""
if rel_path:
return _read(tagset, ccl_path, rel_path)
return _read(tagset, ccl_path)
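# Usage sketch for `read` (a minimal example; the file names below are
# hypothetical and only for illustration):
#
#     from cclutils import read
#
#     document = read('doc.xml', rel_path='doc.rel.xml', tagset='nkjp')
#     for paragraph in document.paragraphs():
#         for sentence in paragraph.sentences():
#             for token in sentence.tokens():
#                 print(token.orth_utf8())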
def read_from_directory(path, ccl_ext='.xml', rel_ext='.rel.xml',
read_rel_files=False, tagset='nkjp',
rel_required=True):
""" Read CCL files from the given directory. Returns a generator of
corpus2.Document objects .
Notes:
Additionally, REL files can be attached by searching for them in the
same directory. To use this function set the `read_rel_files` parame
ter to True. The information about relations is added to document
object if the corresponding REL file was found.
If `rel_required` parameter is set to True, but the function could not
find a corresponding REL file, then we skip both CCL and REL files. If
`rel_required` is set to False, then the function just ignores missing
REL files, but reads CCLs. We do not search the directory recursively.
The function returns a generator of documents, because reading long
documents may take some time (and memory space!).
Args:
path: a path to a directory we want to scan.
ccl_ext: we can change file extension of our CCL files if they have
different extensions than default (the default is .xml).
rel_ext: we can change file extension of our REL files if they have
different extensions (the default is set to .rel.xml).
red_rel_files: read REL files - True, ignore REL file - False.
rel_required: if reading REL files is necessary then set this pa-
rameter to True. This will force the function to read CCL files
only if a corresponding REL file exists (and skips CCLs if RELs
do not exist.).
tagset: str|corpus2.Tagset.
Returns:
a generator of corpus2.Document objects
"""
if not os.path.isdir(path):
raise TypeError("{:} must be a directory".format(path))
    files = set(os.path.join(path, file_)
                for file_ in os.listdir(path))
    ccl_files = (file_ for file_ in files
                 if file_.endswith(ccl_ext) and
                 not file_.endswith(rel_ext))
    for ccl_file in ccl_files:
        rel_file = None
        if read_rel_files:
            rel_file = ccl_file.replace(ccl_ext, rel_ext)
            # `rel_file` is already a full path (built from `ccl_file`),
            # so it must not be joined with `path` again.
            if rel_required and not os.path.exists(rel_file):
                continue
yield _read(tagset, ccl_file, rel_file)
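# Usage sketch for `read_from_directory` (the directory name is hypothetical):
#
#     from cclutils import read_from_directory
#
#     for document in read_from_directory('corpus/', read_rel_files=True,
#                                         rel_required=False):
#         ...  # documents are yielded lazily, one at a time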
def _write(document, tagset, ccl_path, rel_path=None):
""" A standard way to save CCL files. """
writer = corpus2.TokenWriter.create_path_writer(
'ccl', ccl_path, get_tagset(tagset))
for chunk in document.paragraphs():
writer.write_chunk(chunk)
writer.finish()
if rel_path:
writer = corpus2.RelationWriter(rel_path)
writer.write(document.relations())
del writer
def write(document, ccl_path, rel_path=None, tagset='nkjp'):
""" Write the document object to the output ccl file.
Notes:
We save the relations from the document if the output path for REL file
is provided.
Args:
document: corpus2.Document object - the document to be saved.
tagset: the name of the tagset that is used in the document or a tagset
object itself.
ccl_path: a path for output CCL file to save the document.
rel_path: a path to REL file to save document relations.
"""
_write(document, tagset, ccl_path, rel_path)
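# Usage sketch: a read-modify-write round trip (the file names are
# hypothetical):
#
#     from cclutils import read, write
#
#     document = read('doc.xml', rel_path='doc.rel.xml')
#     # ... modify the document here ...
#     write(document, 'out.xml', rel_path='out.rel.xml', tagset='nkjp')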
def get_tagset(tagset):
""" Returns a tagset object.
Notes:
Its a wrapper function that creates a tagset object if the input value
is string. Otherwise it returns what it got.
Args:
tagset: a name of a tagset or a tagset object itself.
Returns:
a corpus2 tagset object.
"""
if isinstance(tagset, str):
tagset = corpus2.get_named_tagset(tagset)
return tagset
""" Helper functions for manipulating token attributes and annotations. """
from builtins import dict
from corpus2 import AnnotatedSentence_wrap_sentence as annotate_sentence
class _RaiseClass(object):
def __repr__(self):
return "<RAISE>"
RAISE = _RaiseClass()
def get_attributes(token, to_unicode=False):
"""
Get attributes of a token.
If token has no metadata, safely returns empty dict.
Args:
        token (corpus2.Token)
to_unicode (:obj:`bool`, optional): Cast keys and values to unicode.
(Default value = False)
Returns:
dict
"""
if not token.has_metadata():
return {}
metadata = token.get_metadata()
attributes = dict(metadata.attributes())
if to_unicode:
attributes = {_try_decode(key): _try_decode(value)
for (key, value) in list(attributes.items())}
return attributes
def get_attribute(token, key, default=RAISE, to_unicode=False):
"""
Get named attribute of a token.
If token has no metadata, attribute is treated as not existing.
Args:
        token (corpus2.Token)
        key (object): Attribute name, automatically cast to string.
default (:obj:`object`, optional): If given, and key not found,
returns this value instead. Raises KeyError otherwise.
to_unicode (:obj:`bool`, optional): Cast value to unicode.
(Default value = False)
Returns:
str
"""
attributes = get_attributes(token, to_unicode)
if to_unicode:
key = _try_decode(key)
if default is not RAISE:
return attributes.get(key, default)
return attributes[key]
def set_attribute(token, key, value):
"""
Set attribute of a token.
If token has no metadata, it is created automatically.
Args:
        token (corpus2.Token)
        key (object): Attribute name, automatically cast to string.
        value (object): Attribute value, automatically cast to string.
"""
if not token.has_metadata():
token.create_metadata()
metadata = token.get_metadata()
metadata.set_attribute(_try_encode(key), _try_encode(value))
def set_attributes(token, items):
"""
    Set attributes of a token.
    If token has no metadata, it is created automatically.
    Args:
        token (corpus2.Token)
        items (Mapping[object, object]): Dictionary of keys and values,
            automatically cast to string.
"""
for (key, value) in list(items.items()):
set_attribute(token, key, value)
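# Usage sketch for the attribute helpers (`token` is assumed to be a
# corpus2.Token taken from a previously read document; the attribute names
# are only illustrative):
#
#     set_attributes(token, {'sense': 'bank-1', 'source': 'manual'})
#     get_attribute(token, 'sense')                  # -> 'bank-1'
#     get_attribute(token, 'missing', default=None)  # -> None
#     get_attributes(token)                          # -> {'sense': 'bank-1', 'source': 'manual'}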
def get_annotations(sentence, token, tok_in_sent_index=None):
"""
Get annotations of a token from sentence annotation channel.
Args:
        sentence (corpus2.Sentence)
        token (corpus2.Token)
        tok_in_sent_index (int): Position of the token in the sentence.
            If present, the additional work of finding the token's index is
            skipped. If the index is known, it is recommended to provide it
            to speed up execution.
Returns:
Dict[str, int]
"""
try:
sentence.all_channels()
except AttributeError:
sentence = annotate_sentence(sentence)
    if tok_in_sent_index is not None:
        # Explicit check against None: index 0 is a valid position.
        index = tok_in_sent_index
    else:
        index = _find_token(sentence, token)
    # Building a dict directly from all_channels() leaves invalid references,
    # so each channel is retrieved by name instead.
    channels = list(sentence.all_channels())
return {
name: sentence.get_channel(name).get_segment_at(index)
for name in channels
}
def _find_token(sentence, token):
for (index, token_in_sentence) in enumerate(sentence.tokens()):
if token_in_sentence.is_same(token):
return index
raise ValueError("Token does not belong to sentence.")
def get_annotation(sentence, token, key, tok_in_sent_index=None,
default=RAISE):
"""
Get named annotation of a token from sentence annotation channel.
Args:
        sentence (corpus2.Sentence)
        token (corpus2.Token)
        key (str)
        tok_in_sent_index (int): Position of the token in the sentence.
            If present, the additional work of finding the token's index is
            skipped. If the index is known, it is recommended to provide it
            to speed up execution.
default (:obj:`object`, optional): If given, and key not found,
returns this value instead. Raises KeyError otherwise.
Returns:
int
"""
annotations = get_annotations(
sentence, token, tok_in_sent_index=tok_in_sent_index)
if default is not RAISE:
return annotations.get(key, default)
return annotations[key]
def set_annotation_for_token(sentence, token, key, value=None, set_head=False):
"""
Set annotation for a token.
Args:
        sentence (corpus2.Sentence)
        token (corpus2.Token)
        key (str): a name of the annotation channel.
        value (int, optional): the annotation (segment) number, convertible
            to an integer. If not given, a new segment index is allocated.
        set_head (bool, optional): if True, mark the token as the head of
            the segment within the channel. Defaults to False.
    """
ann_sentence = annotate_sentence(sentence)
if key not in ann_sentence.all_channels():
ann_sentence.create_channel(key)
channel = ann_sentence.get_channel(key)
token_index = _find_token(sentence, token)
    if value:
        try:
            segment = int(value)
        except (TypeError, ValueError):
            raise Exception("Wrong value type - should be an integer.")
else:
segment = channel.get_new_segment_index()
channel.set_segment_at(token_index, segment)
if set_head:
channel.set_head_at(token_index, True)
def is_head_of(sentence, token, key):
ann_sentence = annotate_sentence(sentence)
if key not in ann_sentence.all_channels():
raise Exception("Channel not found!")
channel = ann_sentence.get_channel(key)
token_index = _find_token(sentence, token)
return channel.is_head_at(token_index)
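# Usage sketch for the annotation helpers (`sentence` and `token` are assumed
# to come from a previously read document; 'ne' is just an example channel
# name):
#
#     set_annotation_for_token(sentence, token, 'ne', set_head=True)
#     get_annotation(sentence, token, 'ne')   # -> segment number (int)
#     is_head_of(sentence, token, 'ne')       # -> True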
def _try_decode(value):
try:
value = str(value)
except UnicodeEncodeError:
pass
try:
value = value.decode("utf-8")
except (UnicodeDecodeError, AttributeError):
pass
return value
def _try_encode(value):
try:
value = str(value)
except UnicodeEncodeError:
pass
try:
value = value.encode("utf-8")
except (UnicodeEncodeError, AttributeError):
pass
return value
""" A set of methods for operating on ccl files, especially for reading and
writing documents.
"""
import corpus2
ENCODING = "utf-8"
__all__ = [
'copy_chunk',
'copy_sentence',
'copy_relation'
# todo: add 'copy_token' function
]
def _new_relation_point(relation_id, channel_name, annotation_number):
return corpus2.DirectionPoint(
relation_id, channel_name, annotation_number
)
def _change_point_id(point, point_id):
return _new_relation_point(
point_id, point.channel_name(), point.annotation_number()
)
def copy_relation(relation, new_from_id=None, new_to_id=None):
""" Returns a copy of given relation object and changes its identifiers if
necessary. When no optional parameters are given we just copy the relation
without changing its properties.
Notes:
If `new_from_id` is given then the sentence identifier of `rel_from`
object will be replaced. The same holds for `new_to_id` and `rel_to`
object. If only one of them is provided then the second one will be
a direct copy of the original sentence identifier.
Args:
        relation: the relation object to copy.
new_from_id: the new identifier for the from-point of the relation.
new_to_id: the new identifier for the to-point of the relation.
Returns:
a new relation copy (corpus2.RelationPtr)
"""
relation_copy = relation.clone_shared()
    if new_from_id:
        point = _change_point_id(relation_copy.rel_from(), new_from_id)
        relation_copy.set_from(point)
    if new_to_id:
        point = _change_point_id(relation_copy.rel_to(), new_to_id)
        relation_copy.set_to(point)
return relation_copy
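# Usage sketch for `copy_relation` (`relation` is assumed to come from
# document.relations() of a previously read document; the sentence
# identifiers are hypothetical):
#
#     moved = copy_relation(relation, new_from_id='s1', new_to_id='s2')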
def copy_sentence(sentence, new_id=None):
""" Returns a copy of the given sentence and changes its identifier if
necessary. If the `new_id` is provided then the original identifier of
source sentence is replaced.
Args:
sentence: a sentence object to copy.
new_id: a new identifier for our sentence copy.
Returns:
a copy of the given sentence (corpus2.SentencePtr)
"""
sentence_copy = sentence.clone_shared()
if new_id:
sentence_copy.set_id(new_id)
return sentence_copy
def copy_chunk(chunk, copy_sentences=True, new_id=None):
""" Returns a copy of the given chunk and changes its identifier if
necessary.
Notes:
If `copy_sentences` is set to False then the sentences from source
chunk WON'T be copied, only the attributes. If `new_id` is provided
then the identifier of source chunk will be replaced.
Args:
chunk: a chunk to copy.
        copy_sentences: if set to False, the source sentences are not
            copied (only the attributes are).
new_id: the new identifier for a copy of the chunk.
Returns:
a new copy of source chunk (corpus2.Chunk).
"""
if not copy_sentences:
new_chunk = corpus2.Chunk().clone_shared()
_copy_chunk_attributes(chunk, new_chunk)
else:
new_chunk = chunk.clone_shared()
if new_id:
new_chunk.set_attribute('id', new_id)
return new_chunk
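# Usage sketch for `copy_chunk` and `copy_sentence` (`chunk` is assumed to be
# one of the paragraphs returned by document.paragraphs(); the identifiers
# are hypothetical):
#
#     full_copy = copy_chunk(chunk, new_id='ch2')
#     attrs_only = copy_chunk(chunk, copy_sentences=False, new_id='ch3')
#     sentence_copies = [copy_sentence(s) for s in chunk.sentences()]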
def _copy_chunk_attributes(source_chunk, target_chunk):
""" Copy all attributes from the source chunk to the target chunk.
Args:
source_chunk: a source chunk.
target_chunk: a target chunk.
"""
for key, value in list(source_chunk.attributes().items()):
target_chunk.set_attribute(key, value)
# todo: move somewhere else!
def sentence2str(sentence, use_bases=False, tagset='nkjp'):
""" Return corpus2.Sentence as a string.
Args:
sentence: a sentence object (corpus2.Sentence).
use_bases: if set to True, the we take base forms
instead of taking the orths.
Returns:
a string representation of the input sentence object.
"""
if isinstance(tagset, str):
tagset = corpus2.get_named_tagset(tagset)
text = []
for token in sentence.tokens():
text.append(" " if token.after_space() else "")
if not use_bases:
token_string = token.orth_utf8()
else:
token_string = token.get_preferred_lexeme(tagset).lemma_utf8()
text.append(token_string)
return "".join(text).strip()
"""
Set of helper methods for creating corpus2 Token, Lexeme and Tag objects.
"""
import corpus2
from cclutils import get_tagset
SIMPLE_TAGSET = get_tagset("simple")
def get_lexeme_strings(document, tagset, delimiter=":", include_fine=False,
lemma_only=False):
"""
Get lexeme strings from the corpus.
Args:
document: (corpus2.Document)
Returns:
List[str]: List of string represenations of tokens consistent
with include_fine and lemma_only constructor options.
"""
lexemes = (token.get_preferred_lexeme(tagset)
for paragraph in document.paragraphs()
for sentence in paragraph.sentences()
for token in sentence.tokens())
lexemes = (get_coarse_lexeme_pair(lexeme, tagset)
for lexeme in lexemes)
if not include_fine:
lexemes = [pos_lex for pos_lex in lexemes
if pos_lex[0] in ["verb", "noun", "adj", "adv"]]
lexemes = [join_lexeme(pos_lex, delimiter, lemma_only)
for pos_lex in lexemes]
return lexemes
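# Usage sketch for `get_lexeme_strings` (`document` is assumed to come from
# cclutils.read; the output values are only illustrative):
#
#     tagset = get_tagset('nkjp')
#     get_lexeme_strings(document, tagset)
#     # -> e.g. ['noun:kot', 'verb:spac', ...]
#     get_lexeme_strings(document, tagset, lemma_only=True)
#     # -> e.g. ['kot', 'spac', ...]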
def create_token_split(string, delimiter=":", tagset=SIMPLE_TAGSET):
"""
Create Token object with single Lexeme from single string
with part of speech and lemma separated by delimiter.
Args:
string (str): String of form "{pos}{delimiter}{lemma}".
delimiter (:obj:`str`, optional): Delimiter used for splitting
part of speech and lemma. CAN appear further down in lemma
but NOT in part of speech. Defaults to ":".
tagset (:obj:`corpus2.Tagset`, optional): Tagset for decoding
the string. Defaults to corpus2.get_named_tagset("simple").
Returns:
corpus2.Token: Token object with single Lexeme with given pos and lemma.
"""
(pos, lemma) = split_lexeme(string, delimiter)
return create_token(pos, lemma, tagset)
def create_token(pos, lemma, tagset=SIMPLE_TAGSET):
"""
    Create Token object with a single Lexeme from a part of speech and
    a lemma.
    Args:
        pos (str): String specifying the tagset's part of speech.
        lemma (str)
        tagset (:obj:`corpus2.Tagset`, optional): Tagset for decoding
            the string. Defaults to corpus2.get_named_tagset("simple").
    Returns:
        corpus2.Token: Token object with a single Lexeme with the given pos
            and lemma.
"""
lexeme = create_lexeme(pos, lemma, tagset)
if not lexeme:
return None
token = corpus2.Token()
token.add_lexeme(lexeme)
return token
def create_lexeme_split(string, delimiter=":", tagset=SIMPLE_TAGSET):
"""
Create Lexeme object from single string with part of speech and lemma
separated by delimiter.
Args:
string (str): String of form "{pos}{delimiter}{lemma}".
delimiter (:obj:`str`, optional): Delimiter used for splitting
part of speech and lemma. CAN appear further down in lemma
but NOT in part of speech. Defaults to ":".
tagset (:obj:`corpus2.Tagset`, optional): Tagset for decoding
the string. Defaults to corpus2.get_named_tagset("simple").
Returns:
corpus2.Lexeme: Lexeme object with given pos and lemma.
"""
if not string:
return None
(pos, lemma) = split_lexeme(string, delimiter)
return create_lexeme(pos, lemma, tagset)
def create_lexeme(pos, lemma, tagset=SIMPLE_TAGSET):
"""
Create Lexeme object from part of speech and lemma.
Args:
pos (str): String specifying tagset's part of speech.
lemma (str)
tagset (:obj:`corpus2.Tagset`, optional): Tagset for decoding
the string. Defaults to corpus2.get_named_tagset("simple").
Returns:
corpus2.Lexeme: Lexeme object with given pos and lemma.
"""
lexeme = corpus2.Lexeme()
lexeme.set_lemma_utf8(lemma)
lexeme.set_tag(create_tag(pos, tagset))
return lexeme
def create_tag(pos, tagset=SIMPLE_TAGSET):
"""
    Create Tag object from a part-of-speech string.
Args:
pos (str): String specifying tagset's part of speech.
tagset (:obj:`corpus2.Tagset`, optional): Tagset for decoding
the string. Defaults to corpus2.get_named_tagset("simple").
Returns:
corpus2.Tag: Tag object parsed from given string and tagset.
"""
if isinstance(tagset, str):
tagset = get_tagset(tagset)
return tagset.parse_simple_tag(pos)
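# Usage sketch for the creation helpers (the pos and lemma values are only
# illustrative; whether they parse depends on the tagset, "simple" by
# default):
#
#     token = create_token_split('noun:kot')   # same as create_token('noun', 'kot')
#     lexeme = create_lexeme('verb', 'spac')
#     tag = create_tag('adj')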
def _is_swig_instance(object_, type_):
"""
Check typing of an object, SWIG style.
Seriously, SWIG is so stupid that importing the same class from different
modules creates an entirely different subclass that cannot be compared
using isinstance. Yes, I am absolutely serious, it kept throwing
'maca.Lexeme != corpus2.Lexeme' at me. Where is maca coming from?
Hell if I know.
And if anyone even tries to say something like 'Well, just use duck typing,
it is more pythonic!', I will start punching puppies. First of all,
the whole concept of just throwing (or 'raising', if that is more
    'pythonic' for you) exceptions all the time is brilliant. I raise (sorry
    for the pun) my headset (for lack of a hat) to whoever came up with the
    idea of the StopIteration exception.
Second of all, you know how SWIG reacts to exceptions? You get a segfault,
you get a segfault, everyone gets a segfault! So trying to ducktype that
is about as successful as playing actual Duck Hunt on an LCD.
So I did the only thing that kinda worked and compared the type names.
Because apparently maca.Lexeme.__name__ and corpus2.Lexeme.__name__ still
yield just 'Lexeme', so at least I do not have to do partial string
comparison. Yay me, I guess.
Args:
object_ (object)
type_ (type)
Returns:
bool: True if name of type of the object is same as name of the type.
"""
return type(object_).__name__ == type_.__name__
def _token_or_lexeme(token_or_lexeme, flow=False):
if _is_swig_instance(token_or_lexeme, corpus2.Token):
return token_or_lexeme.lexemes()[0]
elif _is_swig_instance(token_or_lexeme, corpus2.Lexeme) or flow:
return token_or_lexeme
else:
raise TypeError("{} is neither Token nor Lexeme"
.format(token_or_lexeme))
def _lexeme_or_tag(lexeme_or_tag, flow=False):
if _is_swig_instance(lexeme_or_tag, corpus2.Lexeme):
return lexeme_or_tag.tag()
elif _is_swig_instance(lexeme_or_tag, corpus2.Tag) or flow:
return lexeme_or_tag
else:
raise TypeError("{} is neither Lexeme nor Tag"
.format(lexeme_or_tag))
def _token_lexeme_or_tag(token_lexeme_or_tag):
token_or_lexeme = _token_or_lexeme(token_lexeme_or_tag, True)
return _lexeme_or_tag(token_or_lexeme)
def get_lexeme_string(token_or_lexeme, tagset, delimiter=":",
main_only=False):
return delimiter.join(get_lexeme_pair(
token_or_lexeme, tagset, main_only))
def get_coarse_lexeme_string(token_or_lexeme, tagset, delimiter=":"):
return delimiter.join(get_coarse_lexeme_pair(
token_or_lexeme, tagset))
def get_lexeme_pair(token_or_lexeme, tagset, main_only=False):
return (get_pos(token_or_lexeme, tagset, main_only),
get_lexeme_lemma(token_or_lexeme))
def get_coarse_lexeme_pair(token_or_lexeme, tagset):
return (get_coarse_pos(token_or_lexeme, tagset),
get_lexeme_lemma(token_or_lexeme))
def get_lexeme_lemma(token_or_lexeme):
lexeme = _token_or_lexeme(token_or_lexeme)
lemma = lexeme.lemma_utf8()
return lemma
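# Usage sketch for the lexeme string helpers (`token` is assumed to be a
# corpus2.Token from a previously read document; the outputs are only
# illustrative):
#
#     tagset = get_tagset('nkjp')
#     get_coarse_lexeme_string(token, tagset)           # -> e.g. 'noun:kot'
#     get_lexeme_string(token, tagset, main_only=True)  # -> e.g. 'subst:kot'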
def get_pos(token_lexeme_or_tag, tagset, main_only=False):
"""
Get part of speech from the lexeme.
Args:
token_lexeme_or_tag (Union[corpus2.Token, corpus2.Lexeme, corpus2.Tag])
tagset (corpus2.Tagset)
        main_only (:obj:`bool`, optional): If True, return only the main part
            before the first comma. Defaults to False.
Returns:
str
"""
tag = _token_lexeme_or_tag(token_lexeme_or_tag)
if isinstance(tagset, str):
tagset = get_tagset(tagset)
pos = tagset.tag_to_symbol_string(tag)
if main_only:
pos = pos.split(",")[0]
return pos
def get_coarse_pos(token_lexeme_or_tag, tagset):
"""
Get coarse part of speech from the lexeme: either verb, noun, adj, adv.
Args:
token_lexeme_or_tag (Union[corpus2.Token, corpus2.Lexeme, corpus2.Tag])
tagset (corpus2.Tagset)
Returns:
        str: Coarse part of speech string. If the original part of speech
            is convertible to a coarse one, the returned string will be
            coarse as well. Otherwise the first part of the original part
            of speech is returned.
"""
fine_pos = get_pos(token_lexeme_or_tag, tagset, True)
coarse_pos = convert_to_coarse_pos(fine_pos)
return coarse_pos
def convert_to_coarse_pos(fine_pos):
"""!
Returns a corresponding coarse-grained POS for a given fine-grained POS.
@param fine_pos: fine-grained POS
@return coarse-grained POS
"""
# verbs
verb_pl_pos = ['fin', 'bedzie', 'praet', 'impt',
'inf', 'pcon', 'pant', 'imps',
'winien', 'pred', 'pact', 'ppas', 'pred']
# nouns
noun_pl_pos = ['subst', 'depr', 'ger', 'brev']
# adjectives
adj_pl_pos = ['adj', 'adja', 'adjp', 'adjc']
# adverbs
adv_pl_pos = ['adv']
if fine_pos in noun_pl_pos:
return "noun"
elif fine_pos in adj_pl_pos:
return "adj"
elif fine_pos in verb_pl_pos:
return "verb"
elif fine_pos in adv_pl_pos:
return "adv"
return fine_pos
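# Usage sketch for the coarse POS mapping (the inputs are NKJP-style
# fine-grained tags from the lists above):
#
#     convert_to_coarse_pos('subst')   # -> 'noun'
#     convert_to_coarse_pos('praet')   # -> 'verb'
#     convert_to_coarse_pos('interp')  # -> 'interp' (no coarse mapping)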
def POS_num_to_str(pos_num, short=False):
""" Convert POS number to string.
Args:
        pos_num (int): selected POS as a number (1-8).
        short (:obj:`bool`, optional): if True, return the short form
            (e.g. 'n' instead of 'noun'). Defaults to False.
    Note:
        If the POS number is out of range, None will be returned.
    Returns:
        str: string representation of the POS.
"""
poses = {
1: ('v', 'verb'),
2: ('n', 'noun'),
3: ('r', 'adv'),
4: ('a', 'adj'),
5: ('v-PWN', 'verb-PWN'),
6: ('n-PWN', 'noun-PWN'),
7: ('r-PWN', 'adv-PWN'),
8: ('a-PWN', 'adj-PWN')
}
try:
pos_short, pos_long = poses[pos_num]
except KeyError:
pos_short, pos_long = (None, None)
return pos_short if short else pos_long
def POS_str_to_num(pos_str, short=False):
""" Convert POS string to num.
Args:
pos_str (str): selected POS as string
Note:
If unknown POS string was used, None will be returned.
Returns:
int: number of selected POS
"""
try:
return next(
n for n in range(1, 9) if POS_num_to_str(n, short) == pos_str
)
except StopIteration:
return None
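# Usage sketch for the POS number/string conversions (values follow the
# mapping defined in POS_num_to_str):
#
#     POS_num_to_str(2)               # -> 'noun'
#     POS_num_to_str(2, short=True)   # -> 'n'
#     POS_str_to_num('adv')           # -> 3
#     POS_str_to_num('unknown')       # -> None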
def join_lexeme(pos_lex, delimiter=":", lemma_only=False):
"""
Return string representing the part of speech and lemma pair.
Args:
pos_lex (Tuple[str, str]): Part of speech and lemma pair.
delimiter (:obj:`str`, optional): Delimiter for joining
part of speech and lemma. Only used if lemma_only == False.
Defaults to ":".
lemma_only (:obj:`bool`, optional): Skip pos.
Returns:
str: If lemma_only, returns lemma. Otherwise returns
"{pos}{delimiter}{lemma}".
"""
if lemma_only:
return pos_lex[1]
else:
return "{}{}{}".format(pos_lex[0], delimiter, pos_lex[1])
def join_lexemes(pos_lexs, delimiter=":", lemma_only=False):
"""
    Return strings representing part of speech and lemma pairs.
    Args:
        pos_lexs (List[Tuple[str, str]]): Part of speech and lemma pairs.
        delimiter (:obj:`str`, optional): Delimiter for joining
            part of speech and lemma. Only used if lemma_only == False.
            Defaults to ":".
        lemma_only (:obj:`bool`, optional): Skip pos.
    Returns:
        List[str]: For each pair, the lemma if lemma_only, otherwise
            "{pos}{delimiter}{lemma}".
"""
return [join_lexeme(pos_lex, delimiter, lemma_only)
for pos_lex in pos_lexs]
def split_lexeme(string, delimiter):
"""
Split string into part of speech and lemma tuple
Args:
string (str): String of form "{pos}{delimiter}{lemma}".
        delimiter (str): Delimiter used for splitting part of speech and
            lemma. CAN appear further down in the lemma but NOT in the part
            of speech.
Returns:
Tuple[str, str]: Tuple with part of speech and lemma
"""
(pos, lemma) = string.split(delimiter, 1)
return (pos, lemma)
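# Usage sketch for the split/join helpers:
#
#     split_lexeme('noun:stacja:kolejowa', ':')      # -> ('noun', 'stacja:kolejowa')
#     join_lexeme(('noun', 'kot'))                   # -> 'noun:kot'
#     join_lexeme(('noun', 'kot'), lemma_only=True)  # -> 'kot'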
def split_lexemes(strings, delimiter):
"""
    Split strings into part of speech and lemma tuples.
    Args:
        strings (Iterable[str]): Strings of form "{pos}{delimiter}{lemma}".
        delimiter (str): Delimiter used for splitting part of speech and
            lemma. CAN appear further down in the lemma but NOT in the part
            of speech.
    Returns:
        List[Tuple[str, str]]: Part of speech and lemma tuples, one per
            input string.
"""
return [split_lexeme(string, delimiter)
for string in strings]