Syntactic Tools / combo · Commits · e060a26d

Commit e060a26d, authored 3 years ago by martynawiacek

Add try/catch clause for sentences with large number of wordpieces.

Parent: 0b63a2c3
Included in 2 merge requests: !41 and !40, both titled "Add try/catch clause for sentences with large number of wordpieces."
Pipeline #4283 failed.
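In short, the commit wraps the body of the overridden `tokens_to_indices` in a try/except: when a sentence expands into more wordpieces than the transformer can take, the indexer raises a `ValueError`, logs it, and exits the process with a readable message instead of crashing deep inside the model. A minimal, self-contained sketch of that raise-log-exit pattern, assuming nothing beyond the standard library (`MAX_WORDPIECES` and `index_sentence` are hypothetical stand-ins, not code from the repository):

```python
import logging
import sys
from typing import List

logger = logging.getLogger(__name__)

MAX_WORDPIECES = 510  # hypothetical limit, standing in for tokenizer.max_len_single_sentence


def index_sentence(wordpieces: List[str]) -> List[int]:
    """Hypothetical stand-in for the indexer's tokens_to_indices body."""
    try:
        if len(wordpieces) > MAX_WORDPIECES:
            # Raised inside the try block, so control jumps to the except clause below.
            raise ValueError(
                f"Sentence has {len(wordpieces)} wordpieces; "
                f"the model accepts at most {MAX_WORDPIECES}."
            )
        return list(range(len(wordpieces)))  # placeholder for the real indexing output
    except ValueError as value_error:
        logger.error(value_error)
        sys.exit(1)  # abort with a clean error message rather than a long traceback
```

Note the design choice this mirrors: because the except clause catches the very `ValueError` the method raises, the error never propagates to the caller; the process logs it and exits with status 1.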
Showing 1 changed file: combo/data/token_indexers/pretrained_transformer_mismatched_indexer.py, with 25 additions and 19 deletions.
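The limit checked in the diff below, `max_len_single_sentence`, is a standard attribute of HuggingFace tokenizers: the model's maximum input length minus the special tokens wrapped around a single sentence. A quick illustration (the checkpoint name is just an example):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

print(tokenizer.model_max_length)             # 512 for BERT
print(tokenizer.num_special_tokens_to_add())  # 2: [CLS] and [SEP]
print(tokenizer.max_len_single_sentence)      # 510 = 512 - 2
```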
The full diff for the commit:

```diff
--- a/combo/data/token_indexers/pretrained_transformer_mismatched_indexer.py
+++ b/combo/data/token_indexers/pretrained_transformer_mismatched_indexer.py
 import logging
+import sys
 from typing import Optional, Dict, Any, List, Tuple

 from allennlp import data
 from allennlp.data import token_indexers, tokenizers, IndexedTokenList, vocabulary
 from overrides import overrides
 from typing import List

 logger = logging.getLogger(__name__)


 @data.TokenIndexer.register("pretrained_transformer_mismatched_fixed")
 class PretrainedTransformerMismatchedIndexer(token_indexers.PretrainedTransformerMismatchedIndexer):
     ...
@@ -34,28 +35,33 @@ class PretrainedTransformerMismatchedIndexer(token_indexers.PretrainedTransforme
         Method is overridden in order to raise an error when the number of tokens needed to embed a sentence exceeds
         the maximal input of the model.
         """
-        self._matched_indexer._add_encoding_to_vocabulary_if_needed(vocabulary)
-
-        wordpieces, offsets = self._allennlp_tokenizer.intra_word_tokenize(
-            [t.ensure_text() for t in tokens])
-
-        if len(wordpieces) > self._tokenizer.max_len_single_sentence:
-            raise ValueError("Following sentence consists of more wordpiece tokens than the model can process:\n" + \
-                             " ".join([str(x) for x in tokens[:10]]) + " ...\n" + \
-                             f"Maximal input: {self._tokenizer.max_len_single_sentence}\n" + \
-                             f"Current input: {len(wordpieces)}")
-
-        offsets = [x if x is not None else (-1, -1) for x in offsets]
-
-        output: IndexedTokenList = {
-            "token_ids": [t.text_id for t in wordpieces],
-            "mask": [True] * len(tokens),  # for original tokens (i.e. word-level)
-            "type_ids": [t.type_id for t in wordpieces],
-            "offsets": offsets,
-            "wordpiece_mask": [True] * len(wordpieces),  # for wordpieces (i.e. subword-level)
-        }
-
-        return self._matched_indexer._postprocess_output(output)
+        try:
+            self._matched_indexer._add_encoding_to_vocabulary_if_needed(vocabulary)
+
+            wordpieces, offsets = self._allennlp_tokenizer.intra_word_tokenize(
+                [t.ensure_text() for t in tokens])
+
+            if len(wordpieces) > self._tokenizer.max_len_single_sentence:
+                raise ValueError("Following sentence consists of more wordpiece tokens than the model can process:\n" + \
+                                 " ".join([str(x) for x in tokens[:10]]) + " ...\n" + \
+                                 f"Maximal input: {self._tokenizer.max_len_single_sentence}\n" + \
+                                 f"Current input: {len(wordpieces)}")
+
+            offsets = [x if x is not None else (-1, -1) for x in offsets]
+
+            output: IndexedTokenList = {
+                "token_ids": [t.text_id for t in wordpieces],
+                "mask": [True] * len(tokens),  # for original tokens (i.e. word-level)
+                "type_ids": [t.type_id for t in wordpieces],
+                "offsets": offsets,
+                "wordpiece_mask": [True] * len(wordpieces),  # for wordpieces (i.e. subword-level)
+            }
+
+            return self._matched_indexer._postprocess_output(output)
+        except ValueError as value_error:
+            logger.error(value_error)
+            sys.exit(1)


 class PretrainedTransformerIndexer(token_indexers.PretrainedTransformerIndexer):
     ...
```
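A hedged usage sketch of the fixed indexer: the registration name `pretrained_transformer_mismatched_fixed` comes from the decorator above, but the checkpoint name, sentence, and exact constructor arguments are illustrative assumptions, not combo's documented API:

```python
from allennlp.data import Vocabulary
from allennlp.data.tokenizers import Token

from combo.data.token_indexers.pretrained_transformer_mismatched_indexer import (
    PretrainedTransformerMismatchedIndexer,
)

# Illustrative checkpoint; any HuggingFace model with a fixed input length works.
indexer = PretrainedTransformerMismatchedIndexer(model_name="bert-base-cased")
vocab = Vocabulary()

# An artificially long sentence: a rare word can expand into many wordpieces,
# so the wordpiece count can exceed max_len_single_sentence.
tokens = [Token("antidisestablishmentarianism")] * 2000

# With this commit, the over-long input is logged via logger.error and the
# process exits with status 1 instead of failing inside the transformer.
indexed = indexer.tokens_to_indices(tokens, vocab)
```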