diff --git a/cclutils/__init__.py b/cclutils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d2be0a79e8d807a13130c4efed7e6d9ddebdf657 --- /dev/null +++ b/cclutils/__init__.py @@ -0,0 +1 @@ +from ._base import * diff --git a/cclutils/_base.py b/cclutils/_base.py new file mode 100644 index 0000000000000000000000000000000000000000..971c01b6cb608e4b3ab2eb3c6c6121b0fd4aee5e --- /dev/null +++ b/cclutils/_base.py @@ -0,0 +1,161 @@ +""" A set of methods for operating on ccl files, especially for reading and +writing documents. +""" + +import os +import corpus2 + +ENCODING = "utf-8" + + +__all__ = [ + 'read', + 'write', + 'read_from_directory', + 'get_tagset', +] + + +def _read(tagset, ccl_path, rel_path=None): + """ A standard way to read CCL using corpus2. """ + reader = corpus2.CclRelReader( + get_tagset(tagset), + ccl_path, + rel_path if rel_path else ccl_path + ) + document = reader.read() + del reader + return document + + +def read(ccl_path, rel_path=None, tagset='nkjp'): + """ Read the given ccl file and return corpus2.Document. + + Notes: + Additionally, a rel-ccl file can be provided. Information about relat- + ions is attached to the document object. + + Args: + tagset: the name of the tagset that is used in the document or a tagset + object itself. + ccl_path: a path to CCL file + rel_path: a path to REL file. + + Returns: + corpus2.Document: The read document. + + """ + if rel_path: + return _read(tagset, ccl_path, rel_path) + return _read(tagset, ccl_path) + + +def read_from_directory(path, ccl_ext='.xml', rel_ext='.rel.xml', + read_rel_files=False, tagset='nkjp', + rel_required=True): + """ Read CCL files from the given directory. Returns a generator of + corpus2.Document objects . + + Notes: + Additionally, REL files can be attached by searching for them in the + same directory. To use this function set the `read_rel_files` parame + ter to True. The information about relations is added to document + object if the corresponding REL file was found. + + If `rel_required` parameter is set to True, but the function could not + find a corresponding REL file, then we skip both CCL and REL files. If + `rel_required` is set to False, then the function just ignores missing + REL files, but reads CCLs. We do not search the directory recursively. + + The function returns a generator of documents, because reading long + documents may take some time (and memory space!). + + Args: + path: a path to a directory we want to scan. + ccl_ext: we can change file extension of our CCL files if they have + different extensions than default (the default is .xml). + rel_ext: we can change file extension of our REL files if they have + different extensions (the default is set to .rel.xml). + red_rel_files: read REL files - True, ignore REL file - False. + rel_required: if reading REL files is necessary then set this pa- + rameter to True. This will force the function to read CCL files + only if a corresponding REL file exists (and skips CCLs if RELs + do not exist.). + tagset: str|corpus2.Tagset. 
+
+    Returns:
+        a generator of corpus2.Document objects
+
+    """
+    if not os.path.isdir(path):
+        raise TypeError("{:} must be a directory".format(path))
+
+    files = set([os.path.join(path, file_)
+                 for file_ in os.listdir(path)])
+
+    ccl_files = (file_ for file_ in files
+                 if file_.endswith(ccl_ext) and
+                 not file_.endswith(rel_ext))
+
+    for ccl_file in ccl_files:
+        rel_file = None
+        if read_rel_files:
+            rel_file = ccl_file.replace(ccl_ext, rel_ext)
+            if rel_required:
+                # `rel_file` already contains the directory component.
+                if not os.path.exists(rel_file):
+                    continue
+        yield _read(tagset, ccl_file, rel_file)
+
+
+def _write(document, tagset, ccl_path, rel_path=None):
+    """ A standard way to save CCL files. """
+    writer = corpus2.TokenWriter.create_path_writer(
+        'ccl', ccl_path, get_tagset(tagset))
+
+    for chunk in document.paragraphs():
+        writer.write_chunk(chunk)
+    writer.finish()
+
+    if rel_path:
+        writer = corpus2.RelationWriter(rel_path)
+        writer.write(document.relations())
+
+    del writer
+
+
+def write(document, ccl_path, rel_path=None, tagset='nkjp'):
+    """ Write the document object to the output CCL file.
+
+    Notes:
+        The relations from the document are saved as well if an output path
+        for the REL file is provided.
+
+    Args:
+        document: corpus2.Document object - the document to be saved.
+        tagset: the name of the tagset that is used in the document or a tagset
+            object itself.
+        ccl_path: a path for the output CCL file.
+        rel_path: a path for the output REL file with document relations.
+
+    """
+    _write(document, tagset, ccl_path, rel_path)
+
+
+def get_tagset(tagset):
+    """ Returns a tagset object.
+
+    Notes:
+        It's a wrapper function that creates a tagset object if the input
+        value is a string. Otherwise the input object is returned unchanged.
+
+    Args:
+        tagset: a name of a tagset or a tagset object itself.
+
+    Returns:
+        a corpus2 tagset object.
+
+    """
+    if isinstance(tagset, str):
+        tagset = corpus2.get_named_tagset(tagset)
+    return tagset
diff --git a/cclutils/annotations.py b/cclutils/annotations.py
new file mode 100644
index 0000000000000000000000000000000000000000..331035bf412e9560dd2240b4744cf50dcd77fe90
--- /dev/null
+++ b/cclutils/annotations.py
@@ -0,0 +1,226 @@
+""" Helper functions for manipulating token attributes and annotations. """
+from builtins import dict
+
+from corpus2 import AnnotatedSentence_wrap_sentence as annotate_sentence
+
+
+class _RaiseClass(object):
+    def __repr__(self):
+        return "<RAISE>"
+
+
+RAISE = _RaiseClass()
+
+
+def get_attributes(token, to_unicode=False):
+    """
+    Get attributes of a token.
+
+    If the token has no metadata, an empty dict is returned.
+
+    Args:
+        token (Corpus2.token)
+        to_unicode (:obj:`bool`, optional): Cast keys and values to unicode.
+            (Default value = False)
+
+    Returns:
+        dict
+    """
+    if not token.has_metadata():
+        return {}
+    metadata = token.get_metadata()
+    attributes = dict(metadata.attributes())
+    if to_unicode:
+        attributes = {_try_decode(key): _try_decode(value)
+                      for (key, value) in list(attributes.items())}
+    return attributes
+
+
+def get_attribute(token, key, default=RAISE, to_unicode=False):
+    """
+    Get a named attribute of a token.
+
+    If the token has no metadata, the attribute is treated as missing.
+
+    Args:
+        token (Corpus2.token)
+        key (object): Attribute name, automatically cast to string.
+        default (:obj:`object`, optional): If given and the key is not found,
+            this value is returned instead; otherwise KeyError is raised.
+        to_unicode (:obj:`bool`, optional): Cast value to unicode.
+ (Default value = False) + + Returns: + str + """ + attributes = get_attributes(token, to_unicode) + if to_unicode: + key = _try_decode(key) + if default is not RAISE: + return attributes.get(key, default) + return attributes[key] + + +def set_attribute(token, key, value): + """ + Set attribute of a token. + + If token has no metadata, it is created automatically. + + Args: + token (Corpus2.token) + key (object): Attribute name, automatically casted to string. + value (object): Attribute name, automatically casted to string. + """ + if not token.has_metadata(): + token.create_metadata() + metadata = token.get_metadata() + metadata.set_attribute(_try_encode(key), _try_encode(value)) + + +def set_attributes(token, items): + """ + Set attribute of a token. + + If token has no metadata, it is created automatically. + + Args: + token (Corpus2.token) + items (Mapping[object, object]): Dictionary with keys and values, + automatically casted to string. + """ + for (key, value) in list(items.items()): + set_attribute(token, key, value) + + +def get_annotations(sentence, token, tok_in_sent_index=None): + """ + Get annotations of a token from sentence annotation channel. + + Args: + sentence (Corpus2.sentence) + token (Corpus2.token) + tok_in_sent_index (int): Position of a token in a sentence. + If present, then additional operations + related with finding token's index won't be + performed. If such index is known then it's + recommended to provide it to speed up + execution. + Returns: + Dict[str, int] + """ + try: + sentence.all_channels() + except AttributeError: + sentence = annotate_sentence(sentence) + + if tok_in_sent_index: + index = tok_in_sent_index + else: + index = _find_token(sentence, token) + # Using dict causes invalid reference, need to retrieve channel anyways + channels = list(sentence.all_channels()) + return { + name: sentence.get_channel(name).get_segment_at(index) + for name in channels + } + + +def _find_token(sentence, token): + for (index, token_in_sentence) in enumerate(sentence.tokens()): + if token_in_sentence.is_same(token): + return index + raise ValueError("Token does not belong to sentence.") + + +def get_annotation(sentence, token, key, tok_in_sent_index=None, + default=RAISE): + """ + Get named annotation of a token from sentence annotation channel. + + Args: + sentence (Corpus2.sentence) + token (Corpus2.token) + key (str) + tok_in_sent_index (int): Position of token in sentence. + If present, then additional operations + related with finding token's index won't be + performed. If such index is known then it's + recommended to provide it to speed up + execution. + default (:obj:`object`, optional): If given, and key not found, + returns this value instead. Raises KeyError otherwise. + Returns: + int + """ + annotations = get_annotations( + sentence, token, tok_in_sent_index=tok_in_sent_index) + if default is not RAISE: + return annotations.get(key, default) + return annotations[key] + + +def set_annotation_for_token(sentence, token, key, value=None, set_head=False): + """ + Set annotation for a token. 
+
+    Args:
+        sentence (Corpus2.Sentence)
+        token (Corpus2.Token)
+        key (str): a name of the annotation channel
+        value (:obj:`int`, optional): annotation (segment) number, convertible
+            to integer; if omitted, a new segment index is taken from the
+            channel.
+        set_head (:obj:`bool`, optional): mark the token as the head of its
+            segment.
+
+    """
+    ann_sentence = annotate_sentence(sentence)
+    if key not in ann_sentence.all_channels():
+        ann_sentence.create_channel(key)
+    channel = ann_sentence.get_channel(key)
+
+    token_index = _find_token(sentence, token)
+    if value:
+        try:
+            segment = int(value)
+        except (TypeError, ValueError):
+            raise ValueError("Wrong value type - should be convertible "
+                             "to an integer.")
+    else:
+        segment = channel.get_new_segment_index()
+    channel.set_segment_at(token_index, segment)
+    if set_head:
+        channel.set_head_at(token_index, True)
+
+
+def is_head_of(sentence, token, key):
+    ann_sentence = annotate_sentence(sentence)
+    if key not in ann_sentence.all_channels():
+        raise KeyError("Channel not found: {}".format(key))
+    channel = ann_sentence.get_channel(key)
+    token_index = _find_token(sentence, token)
+    return channel.is_head_at(token_index)
+
+
+def _try_decode(value):
+    try:
+        value = str(value)
+    except UnicodeEncodeError:
+        pass
+
+    try:
+        value = value.decode("utf-8")
+    except (UnicodeDecodeError, AttributeError):
+        pass
+
+    return value
+
+
+def _try_encode(value):
+    try:
+        value = str(value)
+    except UnicodeEncodeError:
+        pass
+
+    try:
+        value = value.encode("utf-8")
+    except (UnicodeEncodeError, AttributeError):
+        pass
+
+    return value
diff --git a/cclutils/copies.py b/cclutils/copies.py
new file mode 100644
index 0000000000000000000000000000000000000000..8598f37ba8f2fc3f2391f331b8fd1fe936725ae0
--- /dev/null
+++ b/cclutils/copies.py
@@ -0,0 +1,149 @@
+""" Helper functions for copying CCL chunks, sentences and relations.
+"""
+import corpus2
+
+ENCODING = "utf-8"
+
+
+__all__ = [
+    'copy_chunk',
+    'copy_sentence',
+    'copy_relation'
+    # todo: add 'copy_token' function
+]
+
+
+def _new_relation_point(relation_id, channel_name, annotation_number):
+    return corpus2.DirectionPoint(
+        relation_id, channel_name, annotation_number
+    )
+
+
+def _change_point_id(point, point_id):
+    return _new_relation_point(
+        point_id, point.channel_name(), point.annotation_number()
+    )
+
+
+def copy_relation(relation, new_from_id=None, new_to_id=None):
+    """ Returns a copy of the given relation object and changes its
+    identifiers if necessary. When no optional parameters are given the
+    relation is copied without changing its properties.
+
+    Notes:
+        If `new_from_id` is given then the sentence identifier of the
+        `rel_from` point will be replaced. The same holds for `new_to_id`
+        and the `rel_to` point. If only one of them is provided then the
+        other point keeps the original sentence identifier.
+
+    Args:
+        relation: the relation object to copy.
+        new_from_id: the new identifier for the from-point of the relation.
+        new_to_id: the new identifier for the to-point of the relation.
+
+    Returns:
+        a new relation copy (corpus2.RelationPtr)
+
+    """
+    relation_copy = relation.clone_shared()
+
+    # _change_point_id takes the point first and the new identifier second.
+    if new_from_id:
+        point = _change_point_id(relation_copy.rel_from(), new_from_id)
+        relation_copy.set_from(point)
+
+    if new_to_id:
+        point = _change_point_id(relation_copy.rel_to(), new_to_id)
+        relation_copy.set_to(point)
+
+    return relation_copy
+
+
+def copy_sentence(sentence, new_id=None):
+    """ Returns a copy of the given sentence and changes its identifier if
+    necessary. If `new_id` is provided then the original identifier of the
+    source sentence is replaced.
+
+    Args:
+        sentence: a sentence object to copy.
+ new_id: a new identifier for our sentence copy. + + Returns: + a copy of the given sentence (corpus2.SentencePtr) + + """ + sentence_copy = sentence.clone_shared() + if new_id: + sentence_copy.set_id(new_id) + return sentence_copy + + +def copy_chunk(chunk, copy_sentences=True, new_id=None): + """ Returns a copy of the given chunk and changes its identifier if + necessary. + + Notes: + If `copy_sentences` is set to False then the sentences from source + chunk WON'T be copied, only the attributes. If `new_id` is provided + then the identifier of source chunk will be replaced. + + Args: + chunk: a chunk to copy. + copy_sentences: the copy won't copy source sentences if set to + False. + new_id: the new identifier for a copy of the chunk. + + Returns: + a new copy of source chunk (corpus2.Chunk). + + """ + if not copy_sentences: + new_chunk = corpus2.Chunk().clone_shared() + _copy_chunk_attributes(chunk, new_chunk) + else: + new_chunk = chunk.clone_shared() + + if new_id: + new_chunk.set_attribute('id', new_id) + + return new_chunk + + +def _copy_chunk_attributes(source_chunk, target_chunk): + """ Copy all attributes from the source chunk to the target chunk. + + Args: + source_chunk: a source chunk. + target_chunk: a target chunk. + + """ + for key, value in list(source_chunk.attributes().items()): + target_chunk.set_attribute(key, value) + + +# todo: move somewhere else! +def sentence2str(sentence, use_bases=False, tagset='nkjp'): + """ Return corpus2.Sentence as a string. + + Args: + sentence: a sentence object (corpus2.Sentence). + use_bases: if set to True, the we take base forms + instead of taking the orths. + + Returns: + a string representation of the input sentence object. + + """ + if isinstance(tagset, str): + tagset = corpus2.get_named_tagset(tagset) + + text = [] + for token in sentence.tokens(): + text.append(" " if token.after_space() else "") + if not use_bases: + token_string = token.orth_utf8() + else: + token_string = token.get_preferred_lexeme(tagset).lemma_utf8() + text.append(token_string) + + return "".join(text).strip() diff --git a/cclutils/tokens.py b/cclutils/tokens.py new file mode 100644 index 0000000000000000000000000000000000000000..af75c1fcc7a0ad6f37e4cbf780a38b2ef5d9412f --- /dev/null +++ b/cclutils/tokens.py @@ -0,0 +1,423 @@ +""" +Set of helper methods for creating corpus2 Token, Lexeme and Tag objects. +""" +import corpus2 +from cclutils import get_tagset + +SIMPLE_TAGSET = get_tagset("simple") + + +def get_lexeme_strings(document, tagset, delimiter=":", include_fine=False, + lemma_only=False): + """ + Get lexeme strings from the corpus. + + Args: + document: (corpus2.Document) + + Returns: + List[str]: List of string represenations of tokens consistent + with include_fine and lemma_only constructor options. + """ + lexemes = (token.get_preferred_lexeme(tagset) + for paragraph in document.paragraphs() + for sentence in paragraph.sentences() + for token in sentence.tokens()) + + lexemes = (get_coarse_lexeme_pair(lexeme, tagset) + for lexeme in lexemes) + + if not include_fine: + lexemes = [pos_lex for pos_lex in lexemes + if pos_lex[0] in ["verb", "noun", "adj", "adv"]] + + lexemes = [join_lexeme(pos_lex, delimiter, lemma_only) + for pos_lex in lexemes] + return lexemes + + +def create_token_split(string, delimiter=":", tagset=SIMPLE_TAGSET): + """ + Create Token object with single Lexeme from single string + with part of speech and lemma separated by delimiter. + + Args: + string (str): String of form "{pos}{delimiter}{lemma}". 
+ delimiter (:obj:`str`, optional): Delimiter used for splitting + part of speech and lemma. CAN appear further down in lemma + but NOT in part of speech. Defaults to ":". + tagset (:obj:`corpus2.Tagset`, optional): Tagset for decoding + the string. Defaults to corpus2.get_named_tagset("simple"). + + Returns: + corpus2.Token: Token object with single Lexeme with given pos and lemma. + """ + (pos, lemma) = split_lexeme(string, delimiter) + return create_token(pos, lemma, tagset) + + +def create_token(pos, lemma, tagset=SIMPLE_TAGSET): + """ + Create Lexeme object from single string with part of speech and lemma + separated by delimiter. + + Args: + pos (str): String specifying tagset's part of speech. + lemma (str) + tagset (:obj:`corpus2.Tagset`, optional): Tagset for decoding + the string. Defaults to corpus2.get_named_tagset("simple"). + + Returns: + corpus2.Token: Lexeme object with given pos and lemma. + """ + lexeme = create_lexeme(pos, lemma, tagset) + if not lexeme: + return None + token = corpus2.Token() + token.add_lexeme(lexeme) + return token + + +def create_lexeme_split(string, delimiter=":", tagset=SIMPLE_TAGSET): + """ + Create Lexeme object from single string with part of speech and lemma + separated by delimiter. + + Args: + string (str): String of form "{pos}{delimiter}{lemma}". + delimiter (:obj:`str`, optional): Delimiter used for splitting + part of speech and lemma. CAN appear further down in lemma + but NOT in part of speech. Defaults to ":". + tagset (:obj:`corpus2.Tagset`, optional): Tagset for decoding + the string. Defaults to corpus2.get_named_tagset("simple"). + + Returns: + corpus2.Lexeme: Lexeme object with given pos and lemma. + """ + if not string: + return None + (pos, lemma) = split_lexeme(string, delimiter) + return create_lexeme(pos, lemma, tagset) + + +def create_lexeme(pos, lemma, tagset=SIMPLE_TAGSET): + """ + Create Lexeme object from part of speech and lemma. + + Args: + pos (str): String specifying tagset's part of speech. + lemma (str) + tagset (:obj:`corpus2.Tagset`, optional): Tagset for decoding + the string. Defaults to corpus2.get_named_tagset("simple"). + + Returns: + corpus2.Lexeme: Lexeme object with given pos and lemma. + """ + lexeme = corpus2.Lexeme() + lexeme.set_lemma_utf8(lemma) + lexeme.set_tag(create_tag(pos, tagset)) + return lexeme + + +def create_tag(pos, tagset=SIMPLE_TAGSET): + """ + Create Token object from string. + + Args: + pos (str): String specifying tagset's part of speech. + tagset (:obj:`corpus2.Tagset`, optional): Tagset for decoding + the string. Defaults to corpus2.get_named_tagset("simple"). + + Returns: + corpus2.Tag: Tag object parsed from given string and tagset. + """ + if isinstance(tagset, str): + tagset = get_tagset(tagset) + return tagset.parse_simple_tag(pos) + + +def _is_swig_instance(object_, type_): + """ + Check typing of an object, SWIG style. + + Seriously, SWIG is so stupid that importing the same class from different + modules creates an entirely different subclass that cannot be compared + using isinstance. Yes, I am absolutely serious, it kept throwing + 'maca.Lexeme != corpus2.Lexeme' at me. Where is maca coming from? + Hell if I know. + + And if anyone even tries to say something like 'Well, just use duck typing, + it is more pythonic!', I will start punching puppies. First of all, + the whole concept of just throwing (or 'raising', if that is more + 'pythonic' for you) exceptions all the time is brilliant. 
I raise (sorry + for the pan) my headset (for lack of a hat) to whoever came up with idea + of StopIterationException. + + Second of all, you know how SWIG reacts to exceptions? You get a segfault, + you get a segfault, everyone gets a segfault! So trying to ducktype that + is about as successful as playing actual Duck Hunt on an LCD. + + So I did the only thing that kinda worked and compared the type names. + Because apparently maca.Lexeme.__name__ and corpus2.Lexeme.__name__ still + yield just 'Lexeme', so at least I do not have to do partial string + comparison. Yay me, I guess. + + Args: + object_ (object) + type_ (type) + + Returns: + bool: True if name of type of the object is same as name of the type. + """ + return type(object_).__name__ == type_.__name__ + + +def _token_or_lexeme(token_or_lexeme, flow=False): + if _is_swig_instance(token_or_lexeme, corpus2.Token): + return token_or_lexeme.lexemes()[0] + elif _is_swig_instance(token_or_lexeme, corpus2.Lexeme) or flow: + return token_or_lexeme + else: + raise TypeError("{} is neither Token nor Lexeme" + .format(token_or_lexeme)) + + +def _lexeme_or_tag(lexeme_or_tag, flow=False): + if _is_swig_instance(lexeme_or_tag, corpus2.Lexeme): + return lexeme_or_tag.tag() + elif _is_swig_instance(lexeme_or_tag, corpus2.Tag) or flow: + return lexeme_or_tag + else: + raise TypeError("{} is neither Lexeme nor Tag" + .format(lexeme_or_tag)) + + +def _token_lexeme_or_tag(token_lexeme_or_tag): + token_or_lexeme = _token_or_lexeme(token_lexeme_or_tag, True) + return _lexeme_or_tag(token_or_lexeme) + + +def get_lexeme_string(token_or_lexeme, tagset, delimiter=":", + main_only=False): + return delimiter.join(get_lexeme_pair( + token_or_lexeme, tagset, main_only)) + + +def get_coarse_lexeme_string(token_or_lexeme, tagset, delimiter=":"): + return delimiter.join(get_coarse_lexeme_pair( + token_or_lexeme, tagset)) + + +def get_lexeme_pair(token_or_lexeme, tagset, main_only=False): + return (get_pos(token_or_lexeme, tagset, main_only), + get_lexeme_lemma(token_or_lexeme)) + + +def get_coarse_lexeme_pair(token_or_lexeme, tagset): + return (get_coarse_pos(token_or_lexeme, tagset), + get_lexeme_lemma(token_or_lexeme)) + + +def get_lexeme_lemma(token_or_lexeme): + lexeme = _token_or_lexeme(token_or_lexeme) + lemma = lexeme.lemma_utf8() + return lemma + + +def get_pos(token_lexeme_or_tag, tagset, main_only=False): + """ + Get part of speech from the lexeme. + + Args: + token_lexeme_or_tag (Union[corpus2.Token, corpus2.Lexeme, corpus2.Tag]) + tagset (corpus2.Tagset) + main_only (:obj:`bool`, optional) If True, return only the main part + before first comma. Defaults to False. + + Returns: + str + """ + tag = _token_lexeme_or_tag(token_lexeme_or_tag) + if isinstance(tagset, str): + tagset = get_tagset(tagset) + pos = tagset.tag_to_symbol_string(tag) + if main_only: + pos = pos.split(",")[0] + return pos + + +def get_coarse_pos(token_lexeme_or_tag, tagset): + """ + Get coarse part of speech from the lexeme: either verb, noun, adj, adv. + + Args: + token_lexeme_or_tag (Union[corpus2.Token, corpus2.Lexeme, corpus2.Tag]) + tagset (corpus2.Tagset) + + Returns: + str: Coarse part of speech string. If original part of speach + is convertible to coarse, the returned string will be coarse + also. Otherwise returns first part of original part of speech. + """ + fine_pos = get_pos(token_lexeme_or_tag, tagset, True) + coarse_pos = convert_to_coarse_pos(fine_pos) + return coarse_pos + + +def convert_to_coarse_pos(fine_pos): + """! 
+    Returns a corresponding coarse-grained POS for a given fine-grained POS.
+    @param fine_pos: fine-grained POS
+    @return coarse-grained POS
+    """
+    # verbs
+    verb_pl_pos = ['fin', 'bedzie', 'praet', 'impt',
+                   'inf', 'pcon', 'pant', 'imps',
+                   'winien', 'pred', 'pact', 'ppas']
+    # nouns
+    noun_pl_pos = ['subst', 'depr', 'ger', 'brev']
+    # adjectives
+    adj_pl_pos = ['adj', 'adja', 'adjp', 'adjc']
+    # adverbs
+    adv_pl_pos = ['adv']
+
+    if fine_pos in noun_pl_pos:
+        return "noun"
+    elif fine_pos in adj_pl_pos:
+        return "adj"
+    elif fine_pos in verb_pl_pos:
+        return "verb"
+    elif fine_pos in adv_pl_pos:
+        return "adv"
+    return fine_pos
+
+
+def POS_num_to_str(pos_num, short=False):
+    """ Convert a POS number to its string representation.
+
+    Args:
+        pos_num (int): selected POS as a number
+
+    Note:
+        If the POS number is out of range, None will be returned.
+
+    Returns:
+        str: string representation of the POS
+
+    """
+    poses = {
+        1: ('v', 'verb'),
+        2: ('n', 'noun'),
+        3: ('r', 'adv'),
+        4: ('a', 'adj'),
+        5: ('v-PWN', 'verb-PWN'),
+        6: ('n-PWN', 'noun-PWN'),
+        7: ('r-PWN', 'adv-PWN'),
+        8: ('a-PWN', 'adj-PWN')
+    }
+    try:
+        pos_short, pos_long = poses[pos_num]
+    except KeyError:
+        pos_short, pos_long = (None, None)
+    return pos_short if short else pos_long
+
+
+def POS_str_to_num(pos_str, short=False):
+    """ Convert a POS string to its number.
+
+    Args:
+        pos_str (str): selected POS as a string
+
+    Note:
+        If an unknown POS string is used, None will be returned.
+
+    Returns:
+        int: number of the selected POS
+
+    """
+    try:
+        return next(
+            n for n in range(1, 9) if POS_num_to_str(n, short) == pos_str
+        )
+    except StopIteration:
+        return None
+
+
+def join_lexeme(pos_lex, delimiter=":", lemma_only=False):
+    """
+    Return a string representing the part of speech and lemma pair.
+
+    Args:
+        pos_lex (Tuple[str, str]): Part of speech and lemma pair.
+        delimiter (:obj:`str`, optional): Delimiter for joining
+            part of speech and lemma. Only used if lemma_only == False.
+            Defaults to ":".
+        lemma_only (:obj:`bool`, optional): Skip the part of speech.
+
+    Returns:
+        str: If lemma_only, returns the lemma. Otherwise returns
+            "{pos}{delimiter}{lemma}".
+    """
+    if lemma_only:
+        return pos_lex[1]
+    else:
+        return "{}{}{}".format(pos_lex[0], delimiter, pos_lex[1])
+
+
+def join_lexemes(pos_lexs, delimiter=":", lemma_only=False):
+    """
+    Return strings representing the part of speech and lemma pairs.
+
+    Args:
+        pos_lexs (List[Tuple[str, str]]): Part of speech and lemma pairs.
+        delimiter (:obj:`str`, optional): Delimiter for joining
+            part of speech and lemma. Only used if lemma_only == False.
+            Defaults to ":".
+        lemma_only (:obj:`bool`, optional): Skip the part of speech.
+
+    Returns:
+        List[str]: One string per pair; the lemma if lemma_only,
+            otherwise "{pos}{delimiter}{lemma}".
+    """
+    return [join_lexeme(pos_lex, delimiter, lemma_only)
+            for pos_lex in pos_lexs]
+
+
+def split_lexeme(string, delimiter):
+    """
+    Split a string into a part of speech and lemma tuple.
+
+    Args:
+        string (str): String of form "{pos}{delimiter}{lemma}".
+        delimiter (str): Delimiter used for splitting part of speech
+            and lemma. CAN appear further down in the lemma but NOT
+            in the part of speech.
+
+    Returns:
+        Tuple[str, str]: Tuple with part of speech and lemma
+    """
+    (pos, lemma) = string.split(delimiter, 1)
+    return (pos, lemma)
+
+
+def split_lexemes(strings, delimiter):
+    """
+    Split multiple "{pos}{delimiter}{lemma}" strings into part of speech
+    and lemma tuples.
+
+    Args:
+        strings (Iterable[str]): Strings of form "{pos}{delimiter}{lemma}".
+        delimiter (str): Delimiter used for splitting part of speech
+            and lemma. CAN appear further down in the lemma but NOT
+            in the part of speech.
+
+    Returns:
+        List[Tuple[str, str]]: Part of speech and lemma tuples,
+            in input order.
+    """
+    return [split_lexeme(string, delimiter)
+            for string in strings]
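
For reviewers, a minimal usage sketch of the reading/writing API added in cclutils/_base.py. This is only a sketch: it assumes corpus2 is importable, and the paths ('doc.xml', 'doc.rel.xml', 'corpus_dir/', 'out.xml', 'out.rel.xml') are hypothetical examples; error handling is omitted.

# Minimal sketch of the cclutils read/write API; all paths are hypothetical.
import cclutils

# Read a CCL document and attach relations from an optional REL file.
document = cclutils.read('doc.xml', 'doc.rel.xml', tagset='nkjp')

# Lazily read documents from a directory, requiring matching REL files.
for doc in cclutils.read_from_directory('corpus_dir/', read_rel_files=True):
    pass  # each item is a corpus2.Document

# get_tagset accepts either a tagset name or a corpus2.Tagset object.
tagset = cclutils.get_tagset('nkjp')

# Write the document (and its relations) back to disk.
cclutils.write(document, 'out.xml', 'out.rel.xml', tagset=tagset)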
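A similar sketch for the attribute and annotation helpers in cclutils/annotations.py, continuing from the document read above; the attribute key 'sense', its value, and the channel name 'chunk_np' are illustrative placeholders, not part of the library.

from cclutils.annotations import (get_attributes, set_attribute,
                                  get_annotations, set_annotation_for_token)

# Take the first token of the first sentence of the document read above.
paragraph = next(iter(document.paragraphs()))
sentence = next(iter(paragraph.sentences()))
token = next(iter(sentence.tokens()))

# Token metadata is created on demand; keys and values are stored as strings.
set_attribute(token, 'sense', 'example-sense-1')
print(get_attributes(token, to_unicode=True))

# Annotations live in named sentence channels; 'chunk_np' is an example name.
set_annotation_for_token(sentence, token, 'chunk_np', set_head=True)
print(get_annotations(sentence, token))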
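Finally, a sketch of the pure-Python lexeme string helpers from cclutils/tokens.py together with the sentence utilities from cclutils/copies.py; 'subst', 'noun:kot' and the identifier 's1-copy' are example values, and `sentence` is the object obtained in the previous sketch.

from cclutils import get_tagset
from cclutils.copies import copy_sentence, sentence2str
from cclutils.tokens import convert_to_coarse_pos, join_lexeme, split_lexeme

# Map a fine-grained class to a coarse one and round-trip a "pos:lemma" pair.
assert convert_to_coarse_pos('subst') == 'noun'
assert join_lexeme(split_lexeme('noun:kot', ':')) == 'noun:kot'

# Copy a sentence under a new identifier and render it using base forms.
sentence_copy = copy_sentence(sentence, new_id='s1-copy')
print(sentence2str(sentence_copy, use_bases=True, tagset=get_tagset('nkjp')))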