nlpworkers / tokenizer · Commits

Commit b07f3292, authored Sep 14, 2020 by Bartłomiej Koptyra; committed by Mateusz Gniewkowski, Sep 14, 2020.
Develop

parent ab493140
Showing 9 changed files, with 772 additions and 727 deletions (+772, −727).
.gitlab-ci.yml        +32   −32
Dockerfile             +9    −9
config.ini            +20   −20
docker-compose.yml    +17   −17
main.py               +34   −34
requirements.txt       +1    −1
src/text_edit.py     +542  −501
src/worker.py         +74   −70
tox.ini               +43   −43
.gitlab-ci.yml

image: 'clarinpl/python:3.6'

cache:
  paths:
    - .tox

stages:
  - check_style
  - build

before_script:
  - pip install tox==2.9.1

pep8:
  stage: check_style
  script:
    - tox -v -e pep8

docstyle:
  stage: check_style
  script:
    - tox -v -e docstyle

build_image:
  stage: build
  image: 'docker:18.09.7'
  only:
    - master
  services:
    - 'docker:18.09.7-dind'
  before_script:
    - ''
  script:
    - docker build -t clarinpl/tokenizer .
    - echo $DOCKER_PASSWORD > pass.txt
    - cat pass.txt | docker login --username $DOCKER_USERNAME --password-stdin
    - rm pass.txt
    - docker push clarinpl/tokenizer
Dockerfile

FROM clarinpl/python:3.6

WORKDIR /home/worker
COPY ./src ./src
COPY ./main.py .
COPY ./requirements.txt .

RUN python3.6 -m pip install -r requirements.txt

CMD ["python3.6", "main.py", "service"]
config.ini

[service]
tool = tokenizer
root = /samba/requests/
rabbit_host = rabbitmq
rabbit_user = test
rabbit_password = test
queue_prefix = nlp_

[tool]
workers_number = 1
processed_lines = 1000

[logging]
port = 9998
local_log_level = INFO

[logging_levels]
__main__ = INFO
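A minimal sketch of how these values reach the worker: Worker.static_init in src/worker.py receives a parsed config and reads processed_lines from the [tool] section. Reading the same file directly with Python's standard configparser (the config.ini path is assumed to be the working directory):

import configparser

# Parse config.ini; configparser is in the standard library.
config = configparser.ConfigParser()
config.read('config.ini')

# Mirrors Worker.static_init: the batch size comes from [tool].
processed_lines = int(config['tool']['processed_lines'])
print(processed_lines)  # -> 1000 with the values above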
docker-compose.yml

version: '3'
services:
  tokenizer:
    container_name: clarin_tokenizer
    build: ./
    working_dir: /home/worker
    entrypoint:
      - python3.6
      - main.py
      - service
    environment:
      - PYTHONUNBUFFERED=0
    volumes:
      - '/samba:/samba'
      - './config.ini:/home/worker/config.ini'
      - './src:/home/worker/src'
      - './main.py:/home/worker/main.py'
main.py

"""Implementation of tokenizer service."""
import argparse

import nlp_ws

from src.worker import Worker


def get_args():
    """Gets command line arguments."""
    parser = argparse.ArgumentParser(description="tokenizer")
    subparsers = parser.add_subparsers(dest="mode")
    subparsers.required = True
    subparsers.add_parser("service", help="Run as a service")
    return parser.parse_args()


def main():
    """Runs the program."""
    args = get_args()
    generators = {
        "service": lambda: nlp_ws.NLPService.main(Worker),
    }
    gen_fn = generators.get(args.mode, lambda: None)
    gen_fn()


if __name__ == "__main__":
    main()
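A quick way to see the CLI contract get_args() defines: the subcommand stored in 'mode' is required, so running main.py without one exits with a usage error, while 'service' parses cleanly. A self-contained sketch (it rebuilds the same parser rather than importing main.py, so it runs without nlp_ws installed):

import argparse

parser = argparse.ArgumentParser(description="tokenizer")
subparsers = parser.add_subparsers(dest="mode")
subparsers.required = True
subparsers.add_parser("service", help="Run as a service")

print(parser.parse_args(["service"]).mode)  # -> service
try:
    parser.parse_args([])  # no mode given
except SystemExit:
    print("argparse rejects a missing mode")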
requirements.txt

nlp-ws
nltk
src/text_edit.py

(Diff collapsed in the web view; not shown here.)
src/worker.py

"""Implementation of nlp_worker."""
import logging

import nltk.data
import nlp_ws

import src.text_edit

_log = logging.getLogger(__name__)


class Worker(nlp_ws.NLPWorker):
    """Implements nlp_worker for tokenizer service."""

    @classmethod
    def static_init(cls, config):
        """One-time static initialisation."""
        try:
            nltk.data.find('tokenizers/punkt')
        except LookupError:
            nltk.download('punkt')
        cls.processed_lines = int(config['tool']['processed_lines'])

    def process(self, input_file, task_options, output_file):
        """Separates input into sentences and applies transformations from options.

        It is assumed input_file is encoded in UTF-8.

        Options:
        punctuation - 'remove'/'leave' - 'remove' removes punctuation
            (from string.punctuation) from input (not including periods
            like in 'ul. Sądeckiej')
        listings - 'remove'/'leave' - 'remove' removes listings (words
            that consist of a single letter or digit followed by a ')')
        emails - 'remove'/'leave'/'token' - 'remove' removes email
            addresses from input, 'token' substitutes email addresses
            with the word 'mail'
        links - 'remove'/'leave'/'token' - 'remove' removes links from
            input, 'token' substitutes links with the word 'link'
        mentions - 'remove'/'leave'/'token' - 'remove' removes mentions
            (e.g. @twitter_handle) from input, 'token' substitutes
            mentions with the word 'mention'
        case - 'upper'/'lower'/'leave' - changes (or preserves) the case
            of the input
        rm_add_char - 'all'/'special'/'leave' - 'all' changes non-ASCII
            punctuation and removes all characters that are neither
            ASCII nor Polish characters, 'special' removes emoticons and
            Asian and Russian characters and changes non-ASCII
            punctuation
        mistyped_listings - 'remove'/'leave' - 'remove' only works if
            the listings option is on 'remove'. This option removes
            uppercase letters and numbers at the beginning of lines and
            sentences if simple heuristics recognize them as a listing.
        letter_emoticons - 'remove'/'leave' - 'remove' removes emoticons
            made out of letters. May delete names it is not supposed to,
            for example 'XD' in 'Adobe XD'.
        repeating_punctuation - forced to 'remove' when the punctuation
            option is on 'remove'. Collapses repeated punctuation marks,
            such as a run of question marks at the end of a sentence,
            into a single such character.
        """
        tokenizer = nltk.data.load('tokenizers/punkt/polish.pickle')
        text_editor = src.text_edit.TextEdit(task_options, tokenizer)
        with open(input_file, 'r', encoding='utf-8') as input_file, \
                open(output_file, 'wt', encoding='utf-8') as output_file:
            i = 0
            for line in input_file:
                text_editor.add_line(line)
                i += 1
                if i > self.processed_lines:
                    list_of_sentences = text_editor.process(False)
                    output_file.write('\n'.join(list_of_sentences))
                    i = 0
            list_of_sentences = text_editor.process(True)
            if list_of_sentences and (len(list_of_sentences) > 1
                                      or list_of_sentences[0] != ''):
                output_file.write('\n'.join(list_of_sentences))
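A hedged sketch of the two pieces a caller interacts with here: a task_options mapping whose keys follow the Worker.process docstring above (the values chosen are illustrative, not defaults), and the Polish punkt sentence splitter that process() loads. Requires nltk; the punkt data is fetched on demand, exactly as in static_init.

import nltk

# Illustrative task_options; keys follow the Worker.process docstring,
# the values here are examples only.
task_options = {
    'punctuation': 'remove',
    'listings': 'leave',
    'emails': 'token',    # e-mail addresses become the word 'mail'
    'links': 'token',     # URLs become the word 'link'
    'mentions': 'remove',
    'case': 'lower',
    'rm_add_char': 'special',
}

# Same bootstrap as Worker.static_init: fetch punkt if it is missing.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

# The Polish sentence splitter used by Worker.process.
tokenizer = nltk.data.load('tokenizers/punkt/polish.pickle')
print(tokenizer.tokenize('To jest zdanie. A to jest drugie zdanie.'))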
tox.ini

[tox]
envlist = pep8,docstyle
skipsdist = True

[testenv:pep8]
deps =
    flake8
basepython = python3
commands =
    flake8 {posargs}

[testenv:docstyle]
deps =
    pydocstyle
basepython = python3
commands =
    pydocstyle --verbose {posargs}

[flake8]
# W504 skipped because it is overeager and unnecessary
ignore = W504
show-source = True
exclude = .git,.venv,.tox,dist,doc,*egg,build,venv
import-order-style = pep8
max-line-length = 80

[pydocstyle]
# D104 Missing docstring in public package
# D203 1 blank line required before class docstring
# D213 Multi-line docstring summary should start at the second line
# D214 Section is over-indented
# D215 Section underline is over-indented
# D401 First line should be in imperative mood; try rephrasing
# D405 Section name should be properly capitalized
# D406 Section name should end with a newline
# D407 Missing dashed underline after section
# D408 Section underline should be in the line following the section's name
# D409 Section underline should match the length of its name
# D410 Missing blank line after section
# D411 Missing blank line before section
ignore = D104,D203,D213,D214,D215,D401,D405,D406,D407,D408,D409,D410,D411
match-dir = ^(?!\.tox|venv).*
match = ^(?!setup).*\.py