Project: Syntactic Tools / combo

Commit a978579d
Authored 1 year ago by Martyna Wiącek

    fixed proper division into multiwords

Parent: 5fae577a
Merge request: !47 "Fixed multiword prediction + bug that made the code write empty predictions"

Showing 1 changed file: combo/data/tokenizers/lambo_tokenizer.py (+25 additions, -11 deletions)
@@ -84,6 +84,15 @@ class LamboTokenizer(Tokenizer):
                 _reset_idx()
                 sentence_tokens = []
                 for token in sentence.tokens:
+                    if len(token.subwords) > 0 and split_subwords:
+                        # @TODO this is a very dirty fix for Lambo model's shortcomings
+                        # I noticed that for longer words with multiwords it tends to remove the last letter in the last multiword
+                        # so this is a quick workaround to fix it
+                        # check if subwords in token.subwords are consistent with token.text
+                        if "".join(token.subwords) != token.text:
+                            fixed_subwords = fix_subwords(token)
+                            token.subwords = fixed_subwords
+                        sentence_tokens.extend(_sentence_tokens(token, split_subwords))
                 tokens.append(sentence_tokens)
             else:
@@ -130,17 +139,7 @@ class LamboTokenizer(Tokenizer):
                         # check if subwords in token.subwords are consistent with token.text
                         if "".join(token.subwords) != token.text:
-                            fixed_subwords = []
-                            text_it = 0
-                            for i, subword in enumerate(token.subwords):
-                                if token.text[text_it:text_it + len(subword)] == subword:
-                                    if i == len(token.subwords) - 1 and (text_it + len(subword) < len(token.text)):
-                                        subword = token.text[text_it:]
-                                    fixed_subwords.append(subword)
-                                    text_it += len(subword)
-                                else:
-                                    fixed_subwords.append(token.text[text_it:text_it + len(subword)])
-                                    text_it += len(subword)
+                            fixed_subwords = fix_subwords(token)
                             token.subwords = fixed_subwords
                     # sentence_tokens.extend(_sentence_tokens(token, split_subwords))
                 # else:
@@ -151,3 +150,18 @@ class LamboTokenizer(Tokenizer):
             sentences.append(sentence_tokens)
         return sentences
+
+
+def fix_subwords(token: Token):
+    fixed_subwords = []
+    text_it = 0
+    for i, subword in enumerate(token.subwords):
+        if token.text[text_it:text_it + len(subword)] == subword:
+            if i == len(token.subwords) - 1 and (text_it + len(subword) < len(token.text)):
+                subword = token.text[text_it:]
+            fixed_subwords.append(subword)
+            text_it += len(subword)
+        else:
+            fixed_subwords.append(token.text[text_it:text_it + len(subword)])
+            text_it += len(subword)
+    return fixed_subwords
\ No newline at end of file
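
For context on what this commit changes: the new module-level fix_subwords helper realigns a token's subwords with token.text, stretching the last subword to cover any trailing characters that Lambo dropped. The sketch below is illustrative only and not part of the commit; the Token stub and the Polish example word are assumptions made to show the behaviour, while the helper body is copied from the diff above.

from dataclasses import dataclass, field
from typing import List


@dataclass
class Token:  # hypothetical stub; the real Token class comes from combo's data package
    text: str
    subwords: List[str] = field(default_factory=list)


def fix_subwords(token: Token):
    # body copied from the commit: walk token.text and realign each subword,
    # extending the final subword to the end of the text if characters remain
    fixed_subwords = []
    text_it = 0
    for i, subword in enumerate(token.subwords):
        if token.text[text_it:text_it + len(subword)] == subword:
            if i == len(token.subwords) - 1 and (text_it + len(subword) < len(token.text)):
                subword = token.text[text_it:]
            fixed_subwords.append(subword)
            text_it += len(subword)
        else:
            fixed_subwords.append(token.text[text_it:text_it + len(subword)])
            text_it += len(subword)
    return fixed_subwords


# Lambo-style failure case: the last multiword part is missing its final letter.
token = Token(text="zrobiłbym", subwords=["zrobił", "by"])
print(fix_subwords(token))  # ['zrobił', 'bym'] -- the last subword is stretched to the end of token.text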