Skip to content
Snippets Groups Projects
Commit 39b3f106 authored by Jarema Radom
Browse files

fix for bpe related decoding

parent 2b98e2b1
Branches en-ru-support
1 merge request: !16 "S3 synchronization and CI"
Pipeline #3398 passed with stages
in 2 minutes and 44 seconds
......@@ -35,6 +35,8 @@ def decode(tokens, labels_decoded, tokenizer, bpe=False):
for label, token in zip(labels_decoded, tokens):
if bpe:
token_str = tokenizer.decode(token)
if token_str.startswith(" "):
token_str = token_str[1:]
else:
token_str = tokenizer.convert_ids_to_tokens([token])[0]
if token_str == "[PAD]":
......@@ -43,8 +45,7 @@ def decode(tokens, labels_decoded, tokenizer, bpe=False):
word.append(token_str.replace("##", ""))
else:
if len(word) > 0:
if not bpe or word_end != ' ':
word.append(word_end)
word.append(word_end)
text_recovered.append("".join(word))
word = []
if label.startswith("__ALL_UPPER__"):
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment