Syntactic Tools / combo · Merge requests · !40

Add try/catch clause for sentences with large number of wordpieces.
Closed

Martyna Wiącek requested to merge fix/try_catch_clause_for_long_wordpiece_list into develop 3 years ago

Overview 0 · Commits 1 · Pipelines 1 · Changes 1
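The underlying issue: the indexer raises a ValueError when a sentence expands into more wordpieces than the transformer's single-sentence limit, and this merge request wraps that path in a try/except so the error is logged and the process exits with status 1 instead of propagating the exception up the call stack. A minimal sketch of the triggering condition, assuming AllenNLP's PretrainedTransformerTokenizer and an example model name (neither taken from COMBO's actual configuration):

# Illustration only: count the wordpieces a sentence expands into and compare
# against the model's limit, mirroring the length check guarded by this merge request.
from allennlp.data.tokenizers import PretrainedTransformerTokenizer

tokenizer = PretrainedTransformerTokenizer("bert-base-multilingual-cased")  # example model name
words = ["Konstantynopolitańczykowianeczka"] * 200  # artificially long sentence, word-level tokens

wordpieces, offsets = tokenizer.intra_word_tokenize(words)
limit = tokenizer.tokenizer.max_len_single_sentence  # underlying HuggingFace tokenizer

print(f"wordpieces: {len(wordpieces)}, limit: {limit}")
if len(wordpieces) > limit:
    print("This sentence would hit the ValueError handled in the diff below.")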
Merge request reports

Compare develop (base) with latest version e060a26d (1 commit, 3 years ago).

1 file: +25 −19
combo/data/token_indexers/pretrained_transformer_mismatched_indexer.py (+25 −19)
import logging
import sys
from typing import Optional, Dict, Any, List, Tuple

from allennlp import data
from allennlp.data import token_indexers, tokenizers, IndexedTokenList, vocabulary
from overrides import overrides

logger = logging.getLogger(__name__)


@data.TokenIndexer.register("pretrained_transformer_mismatched_fixed")
class PretrainedTransformerMismatchedIndexer(token_indexers.PretrainedTransformerMismatchedIndexer):
@@ -34,28 +35,33 @@ class PretrainedTransformerMismatchedIndexer(token_indexers.PretrainedTransforme
        Method is overridden in order to raise an error when the number of tokens needed to embed a sentence exceeds the
        maximal input of a model.
        """
        try:
            self._matched_indexer._add_encoding_to_vocabulary_if_needed(vocabulary)

            wordpieces, offsets = self._allennlp_tokenizer.intra_word_tokenize(
                [t.ensure_text() for t in tokens])

            if len(wordpieces) > self._tokenizer.max_len_single_sentence:
                raise ValueError("Following sentence consists of more wordpiece tokens than the model can process:\n" + \
                                 " ".join([str(x) for x in tokens[:10]]) + " ...\n" + \
                                 f"Maximal input: {self._tokenizer.max_len_single_sentence}\n" + \
                                 f"Current input: {len(wordpieces)}")

            offsets = [x if x is not None else (-1, -1) for x in offsets]

            output: IndexedTokenList = {
                "token_ids": [t.text_id for t in wordpieces],
                "mask": [True] * len(tokens),  # for original tokens (i.e. word-level)
                "type_ids": [t.type_id for t in wordpieces],
                "offsets": offsets,
                "wordpiece_mask": [True] * len(wordpieces),  # for wordpieces (i.e. subword-level)
            }

            return self._matched_indexer._postprocess_output(output)
        except ValueError as value_error:
            logger.error(value_error)
            sys.exit(1)


class PretrainedTransformerIndexer(token_indexers.PretrainedTransformerIndexer):
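For completeness, a hedged usage sketch (not part of the diff): it assumes the overridden method above is tokens_to_indices, that the subclass keeps the constructor of AllenNLP's PretrainedTransformerMismatchedIndexer, and an example model name. With this change, an over-long sentence is reported through logger.error and the process terminates with exit code 1.

# Sketch only: drive the fixed indexer with an over-long sentence.
# Model name, token text, and constructor arguments are illustrative assumptions.
from allennlp.data import Vocabulary
from allennlp.data.tokenizers import Token

from combo.data.token_indexers.pretrained_transformer_mismatched_indexer import (
    PretrainedTransformerMismatchedIndexer,
)

indexer = PretrainedTransformerMismatchedIndexer(model_name="bert-base-multilingual-cased")
vocab = Vocabulary()

tokens = [Token("słowo")] * 1000  # expands to far more wordpieces than the model accepts

# Previously the ValueError raised by the length check propagated as an unhandled
# exception; with this merge request it is caught, logged, and the process exits
# via sys.exit(1).
indexer.tokens_to_indices(tokens, vocab)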