Skip to content
Snippets Groups Projects
Commit a978579d authored by Martyna Wiącek
Browse files

fixed proper division into multiwords

parent 5fae577a
Branches
Tags
1 merge request: !47 "Fixed multiword prediction + bug that made the code write empty predictions"
......@@ -84,6 +84,15 @@ class LamboTokenizer(Tokenizer):
_reset_idx()
sentence_tokens = []
for token in sentence.tokens:
if len(token.subwords) > 0 and split_subwords:
# @TODO this is a very dirty fix for Lambo model's shortcomings
# I noticed that for longer words with multiwords it tends to remove the last letter in the last multiword
# so this is a quick workaround to fix it
# check if subwords in token.subwords are consistent with token.text
if "".join(token.subwords) != token.text:
fixed_subwords = fix_subwords(token)
token.subwords = fixed_subwords
sentence_tokens.extend(_sentence_tokens(token, split_subwords))
tokens.append(sentence_tokens)
else:
......@@ -130,17 +139,7 @@ class LamboTokenizer(Tokenizer):
# check if subwords in token.subwords are consistent with token.text
if "".join(token.subwords) != token.text:
fixed_subwords = []
text_it = 0
for i, subword in enumerate(token.subwords):
if token.text[text_it:text_it + len(subword)] == subword:
if i == len(token.subwords) - 1 and (text_it + len(subword) < len(token.text)):
subword = token.text[text_it:]
fixed_subwords.append(subword)
text_it += len(subword)
else:
fixed_subwords.append(token.text[text_it:text_it + len(subword)])
text_it += len(subword)
fixed_subwords = fix_subwords(token)
token.subwords = fixed_subwords
# sentence_tokens.extend(_sentence_tokens(token, split_subwords))
# else:
......@@ -151,3 +150,18 @@ class LamboTokenizer(Tokenizer):
sentences.append(sentence_tokens)
return sentences
def fix_subwords(token: "Token") -> list:
    """Re-cut ``token.text`` into pieces shaped like ``token.subwords``.

    The predicted subwords are used only for their lengths: piece ``i`` is the
    slice of ``token.text`` that starts where piece ``i - 1`` ended and spans
    ``len(token.subwords[i])`` characters.  The final piece always runs to the
    end of the text, so the returned pieces concatenate back to exactly
    ``token.text`` whenever ``token.subwords`` is non-empty.

    Args:
        token: object exposing ``text`` (a string) and ``subwords`` (a
            sequence of strings).  It is only read, never modified.

    Returns:
        A new list of strings with the same length as ``token.subwords``.
        If the subwords are collectively longer than the text, the trailing
        pieces are empty strings (slicing past the end of a string yields
        ``""``).
    """
    text = token.text
    subwords = token.subwords
    last_index = len(subwords) - 1

    fixed_subwords = []
    offset = 0
    for index, subword in enumerate(subwords):
        end = offset + len(subword)
        if index == last_index:
            # The last piece absorbs whatever text is left, whether or not the
            # predicted subword matched; otherwise trailing characters of the
            # token would be silently dropped.
            piece = text[offset:]
        else:
            # For a matching subword this slice equals the subword itself; for
            # a mismatching one the text is trusted over the prediction.
            piece = text[offset:end]
        fixed_subwords.append(piece)
        offset = end
    return fixed_subwords
\ No newline at end of file
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment