Skip to content
Snippets Groups Projects
Commit a978579d authored by Martyna Wiącek's avatar Martyna Wiącek
Browse files

Fixed the division of tokens into multiwords

parent 5fae577a
No related branches found
No related tags found
1 merge request!47Fixed multiword prediction + bug that made the code write empty predictions
...@@ -84,6 +84,15 @@ class LamboTokenizer(Tokenizer): ...@@ -84,6 +84,15 @@ class LamboTokenizer(Tokenizer):
_reset_idx() _reset_idx()
sentence_tokens = [] sentence_tokens = []
for token in sentence.tokens: for token in sentence.tokens:
if len(token.subwords) > 0 and split_subwords:
# @TODO this is a very dirty fix for Lambo model's shortcomings
# I noticed that for longer words with multiwords it tends to remove the last letter in the last multiword
# so this is a quick workaround to fix it
# check if subwords in token.subwords are consistent with token.text
if "".join(token.subwords) != token.text:
fixed_subwords = fix_subwords(token)
token.subwords = fixed_subwords
sentence_tokens.extend(_sentence_tokens(token, split_subwords)) sentence_tokens.extend(_sentence_tokens(token, split_subwords))
tokens.append(sentence_tokens) tokens.append(sentence_tokens)
else: else:
...@@ -130,6 +139,20 @@ class LamboTokenizer(Tokenizer): ...@@ -130,6 +139,20 @@ class LamboTokenizer(Tokenizer):
# check if subwords in token.subwords are consistent with token.text # check if subwords in token.subwords are consistent with token.text
if "".join(token.subwords) != token.text: if "".join(token.subwords) != token.text:
fixed_subwords = fix_subwords(token)
token.subwords = fixed_subwords
# sentence_tokens.extend(_sentence_tokens(token, split_subwords))
# else:
sentence_tokens.extend(_sentence_tokens(token, split_subwords))
if not turns:
sentences.append(sentence_tokens)
if turns:
sentences.append(sentence_tokens)
return sentences
def fix_subwords(token: Token):
fixed_subwords = [] fixed_subwords = []
text_it = 0 text_it = 0
for i, subword in enumerate(token.subwords): for i, subword in enumerate(token.subwords):
...@@ -141,13 +164,4 @@ class LamboTokenizer(Tokenizer): ...@@ -141,13 +164,4 @@ class LamboTokenizer(Tokenizer):
else: else:
fixed_subwords.append(token.text[text_it:text_it + len(subword)]) fixed_subwords.append(token.text[text_it:text_it + len(subword)])
text_it += len(subword) text_it += len(subword)
token.subwords = fixed_subwords return fixed_subwords
# sentence_tokens.extend(_sentence_tokens(token, split_subwords)) \ No newline at end of file
# else:
sentence_tokens.extend(_sentence_tokens(token, split_subwords))
if not turns:
sentences.append(sentence_tokens)
if turns:
sentences.append(sentence_tokens)
return sentences
Loading…
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment