Commit 26afe8e7 authored Apr 19, 2023 by Maja Jablonska

Add SentenceSplitter

Parent: 497db05f
Part of 1 merge request: !46 "Merge COMBO 3.0 into master"
Changes: 2 changed files with 82 additions and 0 deletions

- combo/data/tokenizers/__init__.py (+1, −0)
- combo/data/tokenizers/sentence_splitter.py (+81, −0)
combo/data/tokenizers/__init__.py

@@ -2,4 +2,5 @@ from .tokenizer import Tokenizer, TokenizerToken
 from .character_tokenizer import CharacterTokenizer
 from .pretrained_transformer_tokenizer import PretrainedTransformerTokenizer
 from .spacy_tokenizer import SpacyTokenizer
+from .sentence_splitter import SentenceSplitter, SpacySentenceSplitter
 from .whitespace_tokenizer import WhitespaceTokenizer
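With this export in place, downstream code can import the splitter straight from the package. A minimal usage sketch (assuming the en_core_web_sm model is installed and that get_spacy_model loads it with the requested components, as in the AllenNLP original):

from combo.data.tokenizers import SpacySentenceSplitter

# rule_based=True swaps the dependency parser for spaCy's lightweight
# `sentencizer` pipe: faster and punctuation-driven, but less accurate.
splitter = SpacySentenceSplitter(language="en_core_web_sm", rule_based=True)

print(splitter.split_sentences("COMBO parses text. It now splits sentences too."))
# expected: ['COMBO parses text.', 'It now splits sentences too.']

# batch_split_sentences pushes all texts through spacy.pipe in one pass:
batches = splitter.batch_split_sentences(["One sentence. Two sentences.", "Three."])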
combo/data/tokenizers/sentence_splitter.py (new file, mode 100644; +81, −0)
"""
Adapted from AllenNLP
https://github.com/allenai/allennlp/blob/80fb6061e568cb9d6ab5d45b661e86eb61b92c82/allennlp/data/tokenizers/sentence_splitter.py
"""
from
typing
import
List
,
Dict
,
Any
import
spacy
from
combo.utils.spacy
import
get_spacy_model
class
SentenceSplitter
:
"""
A `SentenceSplitter` splits strings into sentences.
"""
default_implementation
=
"
spacy
"
def
split_sentences
(
self
,
text
:
str
)
->
List
[
str
]:
"""
Splits a `text` :class:`str` paragraph into a list of :class:`str`, where each is a sentence.
"""
raise
NotImplementedError
def
batch_split_sentences
(
self
,
texts
:
List
[
str
])
->
List
[
List
[
str
]]:
"""
Default implementation is to just iterate over the texts and call `split_sentences`.
"""
return
[
self
.
split_sentences
(
text
)
for
text
in
texts
]
@SentenceSplitter.register
(
"
spacy
"
)
class
SpacySentenceSplitter
(
SentenceSplitter
):
"""
A `SentenceSplitter` that uses spaCy
'
s built-in sentence boundary detection.
Spacy
'
s default sentence splitter uses a dependency parse to detect sentence boundaries, so
it is slow, but accurate.
Another option is to use rule-based sentence boundary detection. It
'
s fast and has a small memory footprint,
since it uses punctuation to detect sentence boundaries. This can be activated with the `rule_based` flag.
By default, `SpacySentenceSplitter` calls the default spacy boundary detector.
Registered as a `SentenceSplitter` with name
"
spacy
"
.
"""
def
__init__
(
self
,
language
:
str
=
"
en_core_web_sm
"
,
rule_based
:
bool
=
False
)
->
None
:
self
.
_language
=
language
self
.
_rule_based
=
rule_based
# we need spacy's dependency parser if we're not using rule-based sentence boundary detection.
self
.
spacy
=
get_spacy_model
(
self
.
_language
,
parse
=
not
self
.
_rule_based
,
ner
=
False
)
self
.
_is_version_3
=
spacy
.
__version__
>=
"
3.0
"
if
rule_based
:
# we use `sentencizer`, a built-in spacy module for rule-based sentence boundary detection.
# depending on the spacy version, it could be called 'sentencizer' or 'sbd'
sbd_name
=
"
sbd
"
if
spacy
.
__version__
<
"
2.1
"
else
"
sentencizer
"
if
not
self
.
spacy
.
has_pipe
(
sbd_name
):
if
self
.
_is_version_3
:
self
.
spacy
.
add_pipe
(
sbd_name
)
else
:
sbd
=
self
.
spacy
.
create_pipe
(
sbd_name
)
self
.
spacy
.
add_pipe
(
sbd
)
def
split_sentences
(
self
,
text
:
str
)
->
List
[
str
]:
if
self
.
_is_version_3
:
return
[
sent
.
text
.
strip
()
for
sent
in
self
.
spacy
(
text
).
sents
]
else
:
return
[
sent
.
string
.
strip
()
for
sent
in
self
.
spacy
(
text
).
sents
]
def
batch_split_sentences
(
self
,
texts
:
List
[
str
])
->
List
[
List
[
str
]]:
"""
This method lets you take advantage of spacy
'
s batch processing.
"""
if
self
.
_is_version_3
:
return
[
[
sentence
.
text
.
strip
()
for
sentence
in
doc
.
sents
]
for
doc
in
self
.
spacy
.
pipe
(
texts
)
]
return
[
[
sentence
.
string
.
strip
()
for
sentence
in
doc
.
sents
]
for
doc
in
self
.
spacy
.
pipe
(
texts
)
]
def
_to_params
(
self
)
->
Dict
[
str
,
Any
]:
return
{
"
type
"
:
"
spacy
"
,
"
language
"
:
self
.
_language
,
"
rule_based
"
:
self
.
_rule_based
}
\ No newline at end of file
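One caveat worth flagging: the `@SentenceSplitter.register("spacy")` decorator assumes registration machinery that this file does not define. In the AllenNLP original, `SentenceSplitter` inherits from `Registrable`, which supplies `register` and `by_name`; here the base class is plain, so the decorator only works if an equivalent is provided elsewhere in COMBO. A hypothetical minimal sketch of such machinery (names and structure are assumptions, not COMBO's actual implementation):

from typing import Callable, Dict, Type


class Registrable:
    # Hypothetical sketch: a single flat name-to-class registry. AllenNLP's real
    # Registrable keys its registry by base class, so that different component
    # types (tokenizers, splitters, ...) registered under the same name don't collide.
    _registry: Dict[str, Type] = {}

    @classmethod
    def register(cls, name: str) -> Callable[[Type], Type]:
        def decorator(subclass: Type) -> Type:
            cls._registry[name] = subclass
            return subclass
        return decorator

    @classmethod
    def by_name(cls, name: str) -> Type:
        return cls._registry[name]

With `class SentenceSplitter(Registrable):`, the decorator above works as written, and `SentenceSplitter.by_name("spacy")` recovers `SpacySentenceSplitter`, which is presumably what `default_implementation = "spacy"` is meant to resolve through.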