Commit 880ea56e authored by Arkadiusz Janz's avatar Arkadiusz Janz

Merge branch 'develop' into 'master'

Develop

See merge request !4
parents f9038678 c1f5d0e3
Pipeline #2597 failed with stages
in 29 seconds
image: clarinpl/python:3.6
before_script:
- pip install tox==2.9.1
cache:
paths:
- .tox
stages:
- check_style
- push_wheel
pep8:
stage: check_style
script:
- tox -v -e pep8
docstyle:
stage: check_style
script:
- tox -v -e docstyle
push_wheel:
before_script:
- pip install twine
only:
- master
stage: push_wheel
when: on_success
script:
- python setup.py sdist bdist_wheel
- python -m twine upload
--repository-url https://pypi.clarin-pl.eu/
-u $PIPY_USER -p $PIPY_PASS dist/cclutils*.whl
......@@ -12,7 +12,10 @@ __all__ = [
'read',
'write',
'read_from_directory',
'read_chunks_it',
'read_sentences_it',
'get_tagset',
'sentence2str'
]
......@@ -206,3 +209,30 @@ def read_sentences_it(filepath, tagset='nkjp'):
yield sentence
del reader
def sentence2str(sentence, use_bases=False, tagset='nkjp'):
    """ Return corpus2.Sentence as a string.

    Tokens are joined using the token's own spacing information
    (``after_space``), so the original tokenization spacing is preserved;
    any leading separator is removed by the final ``strip``.

    Args:
        sentence: a sentence object (corpus2.Sentence).
        use_bases: if set to True, then we take base forms (lemmas)
            instead of taking the orths.
        tagset: a tagset name (str) resolved via
            corpus2.get_named_tagset, or an already-constructed tagset
            object. Only consulted when use_bases is True.

    Returns:
        a string representation of the input sentence object.
    """
    if isinstance(tagset, str):
        # Resolve a tagset name to the corpus2 tagset object once,
        # before the token loop.
        tagset = corpus2.get_named_tagset(tagset)
    text = []
    for token in sentence.tokens():
        # Reproduce the inter-token spacing recorded on the token.
        text.append(" " if token.after_space() else "")
        if not use_bases:
            token_string = token.orth_utf8()
        else:
            token_string = token.get_preferred_lexeme(tagset).lemma_utf8()
        text.append(token_string)
    return "".join(text).strip()
......@@ -9,8 +9,7 @@ ENCODING = "utf-8"
__all__ = [
'copy_chunk',
'copy_sentence',
'copy_relation',
'sentence2str'
'copy_relation'
]
......@@ -119,31 +118,3 @@ def _copy_chunk_attributes(source_chunk, target_chunk):
"""
for key, value in list(source_chunk.attributes().items()):
target_chunk.set_attribute(key, value)
# todo: move somewhere else!
def sentence2str(sentence, use_bases=False, tagset='nkjp'):
    """ Return corpus2.Sentence as a string.

    Args:
        sentence: a sentence object (corpus2.Sentence).
        use_bases: if set to True, then we take base forms (lemmas)
            instead of taking the orths.
        tagset: tagset name (resolved via corpus2.get_named_tagset)
            or an already-built tagset object.

    Returns:
        a string representation of the input sentence object.
    """
    if isinstance(tagset, str):
        tagset = corpus2.get_named_tagset(tagset)
    pieces = []
    for tok in sentence.tokens():
        # A token preceded by whitespace contributes its separator first.
        separator = " " if tok.after_space() else ""
        if use_bases:
            form = tok.get_preferred_lexeme(tagset).lemma_utf8()
        else:
            form = tok.orth_utf8()
        pieces.append(separator + form)
    return "".join(pieces).strip()
[tox]
envlist = pep8,docstyle
skipsdist = True
[testenv:pep8]
deps =
flake8
basepython = python3
commands =
flake8 {posargs}
[testenv:docstyle]
deps =
pydocstyle
basepython = python3
commands =
pydocstyle --verbose {posargs}
[flake8]
# W504 skipped because it is overeager and unnecessary
ignore = W504
show-source = True
exclude = .git,.venv,.tox,dist,doc,*egg,build,venv
import-order-style = pep8
max-line-length = 80
[pydocstyle]
# D104 Missing docstring in public package
# D203 1 blank line required before class docstring
# D213 Multi-line docstring summary should start at the second line
# D214 Section is over-indented
# D215 Section underline is over-indented
# D401 First line should be in imperative mood; try rephrasing
# D405 Section name should be properly capitalized
# D406 Section name should end with a newline
# D407 Missing dashed underline after section
# D408 Section underline should be in the line following the section’s name
# D409 Section underline should match the length of its name
# D410 Missing blank line after section
# D411 Missing blank line before section
ignore = D104,D203,D213,D214,D215,D401,D405,D406,D407,D408,D409,D410,D411
match-dir = ^(?!\.tox|venv).*
match = ^(?!setup).*\.py
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment