Commit 39b3f106 authored by Jarema Radom

fix for bpe related decoding

parent 2b98e2b1
Pipeline #3398 passed with stages
in 2 minutes and 44 seconds
...@@ -35,6 +35,8 @@ def decode(tokens, labels_decoded, tokenizer, bpe=False): ...@@ -35,6 +35,8 @@ def decode(tokens, labels_decoded, tokenizer, bpe=False):
for label, token in zip(labels_decoded, tokens): for label, token in zip(labels_decoded, tokens):
if bpe: if bpe:
token_str = tokenizer.decode(token) token_str = tokenizer.decode(token)
if token_str.startswith(" "):
token_str = token_str[1:]
else: else:
token_str = tokenizer.convert_ids_to_tokens([token])[0] token_str = tokenizer.convert_ids_to_tokens([token])[0]
if token_str == "[PAD]": if token_str == "[PAD]":
...@@ -43,8 +45,7 @@ def decode(tokens, labels_decoded, tokenizer, bpe=False): ...@@ -43,8 +45,7 @@ def decode(tokens, labels_decoded, tokenizer, bpe=False):
word.append(token_str.replace("##", "")) word.append(token_str.replace("##", ""))
else: else:
if len(word) > 0: if len(word) > 0:
if not bpe or word_end != ' ': word.append(word_end)
word.append(word_end)
text_recovered.append("".join(word)) text_recovered.append("".join(word))
word = [] word = []
if label.startswith("__ALL_UPPER__"): if label.startswith("__ALL_UPPER__"):
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment