Commit b07f3292 authored by Bartłomiej Koptyra, committed by Mateusz Gniewkowski

Develop

parent ab493140
image: 'clarinpl/python:3.6'
cache:
  paths:
    - .tox
stages:
  - check_style
  - build
before_script:
  - pip install tox==2.9.1

pep8:
  stage: check_style
  script:
    - tox -v -e pep8

docstyle:
  stage: check_style
  script:
    - tox -v -e docstyle

build_image:
  stage: build
  image: 'docker:18.09.7'
  only:
    - master
  services:
    - 'docker:18.09.7-dind'
  before_script:
    - ''
  script:
    - docker build -t clarinpl/tokenizer .
    - echo $DOCKER_PASSWORD > pass.txt
    - cat pass.txt | docker login --username $DOCKER_USERNAME --password-stdin
    - rm pass.txt
    - docker push clarinpl/tokenizer
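The same style checks can be run locally before pushing; the commands below simply repeat what the pipeline's before_script and the two check_style jobs execute:

    pip install tox==2.9.1
    tox -v -e pep8
    tox -v -e docstyle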
FROM clarinpl/python:3.6
WORKDIR /home/worker
COPY ./src ./src
COPY ./main.py .
COPY ./requirements.txt .
RUN python3.6 -m pip install -r requirements.txt
FROM clarinpl/python:3.6
WORKDIR /home/worker
COPY ./src ./src
COPY ./main.py .
COPY ./requirements.txt .
RUN python3.6 -m pip install -r requirements.txt
CMD ["python3.6", "main.py", "service"]
[service]
tool = tokenizer
root = /samba/requests/
rabbit_host = rabbitmq
rabbit_user = test
rabbit_password = test
queue_prefix =nlp_
[tool]
workers_number = 1
processed_lines = 1000
[logging]
port = 9998
local_log_level = INFO
[logging_levels]
__main__ = INFO
[service]
tool = tokenizer
root = /samba/requests/
rabbit_host = rabbitmq
rabbit_user = test
rabbit_password = test
queue_prefix = nlp_
[tool]
workers_number = 1
processed_lines = 1000
[logging]
port = 9998
local_log_level = INFO
[logging_levels]
__main__ = INFO
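For reference, a minimal sketch of how this file might be read with Python's standard configparser. Reading the [tool] values mirrors what static_init does in the worker further down; that the [service] and [logging] sections are consumed by the nlp_ws framework itself is an assumption, not stated in the source:

    import configparser

    config = configparser.ConfigParser()
    config.read('config.ini')

    # The tokenizer code itself only touches the [tool] section;
    # [service] and [logging] presumably belong to the nlp_ws framework.
    workers_number = int(config['tool']['workers_number'])    # -> 1
    processed_lines = int(config['tool']['processed_lines'])  # -> 1000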
version: '3'
services:
  tokenizer:
    container_name: clarin_tokenizer
    build: ./
    working_dir: /home/worker
    entrypoint:
      - python3.6
      - main.py
      - service
    environment:
      - PYTHONUNBUFFERED=0
    volumes:
      - '/samba:/samba'
      - './config.ini:/home/worker/config.ini'
      - './src:/home/worker/src'
      - './main.py:/home/worker/main.py'
"""Implementation of tokenizer service."""
import argparse
import nlp_ws
from src.worker import Worker
def get_args():
"""Gets command line arguments."""
parser = argparse.ArgumentParser(description="tokenizer")
subparsers = parser.add_subparsers(dest="mode")
subparsers.required = True
subparsers.add_parser(
"service",
help="Run as a service")
return parser.parse_args()
def main():
"""Runs the program."""
args = get_args()
generators = {
"service": lambda: nlp_ws.NLPService.main(Worker),
}
gen_fn = generators.get(args.mode, lambda: None)
gen_fn()
if __name__ == "__main__":
main()
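With the single 'service' subcommand registered above, the program is started as follows, matching the CMD added to the Dockerfile and the compose entrypoint:

    python3.6 main.py service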
"""Implementation of tokenizer service."""
import argparse
import nlp_ws
from src.worker import Worker
def get_args():
"""Gets command line arguments."""
parser = argparse.ArgumentParser(description="tokenizer")
subparsers = parser.add_subparsers(dest="mode")
subparsers.required = True
subparsers.add_parser(
"service",
help="Run as a service")
return parser.parse_args()
def main():
"""Runs the program."""
args = get_args()
generators = {
"service": lambda: nlp_ws.NLPService.main(Worker),
}
gen_fn = generators.get(args.mode, lambda: None)
gen_fn()
if __name__ == "__main__":
main()
"""Implementation of nlp_worker."""
import logging
import nltk.data
import nlp_ws
import src.text_edit
_log = logging.getLogger(__name__)
class Worker(nlp_ws.NLPWorker):
"""Implements nlp_worker for tokenizer service."""
@classmethod
def static_init(cls, config):
"""One time static initialisation."""
try:
nltk.data.find('tokenizers/punkt')
except LookupError:
nltk.download('punkt')
cls.processed_lines = int(config['tool']['processed_lines'])
def process(self, input_file, task_options, output_file):
"""Separates input into sentences, applies transformations from options.
It is assumed input_file is encoded in UTF-8.
Options:
By default remove:
punctuation - 'remove'/'leave' - 'remove' removes punctuation (
from string.punctuation) from input (not including periods like
in 'ul. Sądeckiej')
By default leave:
listings - 'remove'/'leave' - 'remove' removes listings (words that
consist of a single letter or digit followed by a ')' )
emails - 'remove'/'leave'/'token' - 'remove' removes email addresses
from input, 'token' substitutes email addresses by a word 'mail'
links - 'remove'/'leave'/'token' - 'remove' removes links from input,
'token' substitutes links by a word 'link'
mentions - 'remove'/'leave'/'token' - 'remove' removes mentions from
input (e.g. @twitter_handle), 'token' substitutes mentions
by a word 'mention'
case - 'upper'/'lower'/'leave' - changes (or not) the case of the input
rm_add_char - 'all'/'special'/'leave' - 'all' changes non-ASCII
punctuation and removes all characters that are neither ASCII
characters nor polish characters, 'special' removes emoticons,
asian,russian characters, changes non-ASCII punctuation
mistyped_listings - 'remove'/'leave' - 'remove' only works if listings
option if on 'remove'. This option removes uppercase letters and
numbers at the beginning of lines and sentences if they are
recognized to be a listing by a simple logic.
"""
tokenizer = nltk.data.load('tokenizers/punkt/polish.pickle')
text_editor = src.text_edit.TextEdit(task_options, tokenizer)
with open(input_file, 'r', encoding='utf-8') as input_file, \
open(output_file, 'wt', encoding='utf-8') as output_file:
i = 0
for line in input_file:
text_editor.add_line(line)
i += 1
if i > self.processed_lines:
list_of_sentences = text_editor.process(False)
output_file.write('\n'.join(list_of_sentences))
i = 0
list_of_sentences = text_editor.process(True)
if list_of_sentences and (len(list_of_sentences) > 1 or
list_of_sentences[0] != ''):
output_file.write('\n'.join(list_of_sentences))
"""Implementation of nlp_worker."""
import logging
import nltk.data
import nlp_ws
import src.text_edit
_log = logging.getLogger(__name__)
class Worker(nlp_ws.NLPWorker):
"""Implements nlp_worker for tokenizer service."""
@classmethod
def static_init(cls, config):
"""One time static initialisation."""
try:
nltk.data.find('tokenizers/punkt')
except LookupError:
nltk.download('punkt')
cls.processed_lines = int(config['tool']['processed_lines'])
def process(self, input_file, task_options, output_file):
"""Separates input into sentences, applies transformations from options.
It is assumed input_file is encoded in UTF-8.
Options:
punctuation - 'remove'/'leave' - 'remove' removes punctuation (
from string.punctuation) from input (not including periods like
in 'ul. Sądeckiej')
listings - 'remove'/'leave' - 'remove' removes listings (words that
consist of a single letter or digit followed by a ')' )
emails - 'remove'/'leave'/'token' - 'remove' removes email addresses
from input, 'token' substitutes email addresses by a word 'mail'
links - 'remove'/'leave'/'token' - 'remove' removes links from input,
'token' substitutes links by a word 'link'
mentions - 'remove'/'leave'/'token' - 'remove' removes mentions from
input (e.g. @twitter_handle), 'token' substitutes mentions
by a word 'mention'
case - 'upper'/'lower'/'leave' - changes (or not) the case of the input
rm_add_char - 'all'/'special'/'leave' - 'all' changes non-ASCII
punctuation and removes all characters that are neither ASCII
characters nor polish characters, 'special' removes emoticons,
asian,russian characters, changes non-ASCII punctuation
mistyped_listings - 'remove'/'leave' - 'remove' only works if listings
option if on 'remove'. This option removes uppercase letters and
numbers at the beginning of lines and sentences if they are
recognized to be a listing by a simple logic.
letter_emoticons - 'remove'/'leave' - 'remove' removes emoticons made
out of letters. May delete names it is not supposed to.
For example XD in 'Adobe XD'.
repeating_punctuation - if punctuation is on 'remove' it is forced
to be on 'remove'. Changes repeating punctuation marks like
a question mark spam at the end of sentence to a like char.
"""
tokenizer = nltk.data.load('tokenizers/punkt/polish.pickle')
text_editor = src.text_edit.TextEdit(task_options, tokenizer)
with open(input_file, 'r', encoding='utf-8') as input_file, \
open(output_file, 'wt', encoding='utf-8') as output_file:
i = 0
for line in input_file:
text_editor.add_line(line)
i += 1
if i > self.processed_lines:
list_of_sentences = text_editor.process(False)
output_file.write('\n'.join(list_of_sentences))
i = 0
list_of_sentences = text_editor.process(True)
if list_of_sentences and (len(list_of_sentences) > 1 or
list_of_sentences[0] != ''):
output_file.write('\n'.join(list_of_sentences))
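As an illustration of the options documented in the docstring above, here is a hypothetical task_options mapping; the option names and values come from the docstring, while the file names and the call itself are invented for the example:

    task_options = {
        'punctuation': 'remove',  # strip string.punctuation, keep abbreviation periods
        'emails': 'token',        # replace each address with the word 'mail'
        'links': 'token',         # replace each URL with the word 'link'
        'case': 'lower',          # lowercase the whole input
    }
    # worker.process('input.txt', task_options, 'output.txt') would then
    # write the resulting sentences to output.txt, one per line.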
[tox]
envlist = pep8,docstyle
skipsdist = True

[testenv:pep8]
deps =
    flake8
basepython = python3
commands =
    flake8 {posargs}

[testenv:docstyle]
deps =
    pydocstyle
basepython = python3
commands =
    pydocstyle --verbose {posargs}

[flake8]
# W504 skipped because it is overeager and unnecessary
ignore = W504
show-source = True
exclude = .git,.venv,.tox,dist,doc,*egg,build,venv
import-order-style = pep8
max-line-length = 80

[pydocstyle]
# D104 Missing docstring in public package
# D203 1 blank line required before class docstring
# D213 Multi-line docstring summary should start at the second line
# D214 Section is over-indented
# D215 Section underline is over-indented
# D401 First line should be in imperative mood; try rephrasing
# D405 Section name should be properly capitalized
# D406 Section name should end with a newline
# D407 Missing dashed underline after section
# D408 Section underline should be in the line following the section's name
# D409 Section underline should match the length of its name
# D410 Missing blank line after section
# D411 Missing blank line before section
ignore = D104,D203,D213,D214,D215,D401,D405,D406,D407,D408,D409,D410,D411
match-dir = ^(?!\.tox|venv).*
[tox]
envlist = pep8,docstyle
skipsdist = True

[testenv:pep8]
deps =
    flake8
basepython = python3
commands =
    flake8 {posargs}

[testenv:docstyle]
deps =
    pydocstyle
basepython = python3
commands =
    pydocstyle --verbose {posargs}

[flake8]
# W504 skipped because it is overeager and unnecessary
ignore = W504
show-source = True
exclude = .git,.venv,.tox,dist,doc,*egg,build,venv
import-order-style = pep8
max-line-length = 80

[pydocstyle]
# D104 Missing docstring in public package
# D203 1 blank line required before class docstring
# D213 Multi-line docstring summary should start at the second line
# D214 Section is over-indented
# D215 Section underline is over-indented
# D401 First line should be in imperative mood; try rephrasing
# D405 Section name should be properly capitalized
# D406 Section name should end with a newline
# D407 Missing dashed underline after section
# D408 Section underline should be in the line following the section's name
# D409 Section underline should match the length of its name
# D410 Missing blank line after section
# D411 Missing blank line before section
ignore = D104,D203,D213,D214,D215,D401,D405,D406,D407,D408,D409,D410,D411
match-dir = ^(?!\.tox|venv).*
match = ^(?!setup).*\.py