From a978579d2c19c75b944a676c350d0a40c2059942 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martyna=20Wi=C4=85cek?= <martyna.wiacek@ipipan.waw.pl>
Date: Sun, 4 Feb 2024 00:16:20 +0100
Subject: [PATCH] Fix incorrect division into multiwords

---
 combo/data/tokenizers/lambo_tokenizer.py | 36 ++++++++++++++++--------
 1 file changed, 25 insertions(+), 11 deletions(-)

diff --git a/combo/data/tokenizers/lambo_tokenizer.py b/combo/data/tokenizers/lambo_tokenizer.py
index e88f098..abb4e33 100644
--- a/combo/data/tokenizers/lambo_tokenizer.py
+++ b/combo/data/tokenizers/lambo_tokenizer.py
@@ -84,6 +84,15 @@ class LamboTokenizer(Tokenizer):
                 _reset_idx()
                 sentence_tokens = []
                 for token in sentence.tokens:
+                    if len(token.subwords) > 0 and split_subwords:
+                        # @TODO: quick workaround for a shortcoming of the Lambo model:
+                        # for longer words with multiwords it tends to remove the last letter
+                        # of the last multiword, so the subwords are repaired against token.text here
+
+                        # check if subwords in token.subwords are consistent with token.text
+                        if "".join(token.subwords) != token.text:
+                            fixed_subwords = fix_subwords(token)
+                            token.subwords = fixed_subwords
                     sentence_tokens.extend(_sentence_tokens(token, split_subwords))
                 tokens.append(sentence_tokens)
         else:
@@ -130,17 +139,7 @@ class LamboTokenizer(Tokenizer):
 
                     # check if subwords in token.subwords are consistent with token.text
                     if "".join(token.subwords) != token.text:
-                        fixed_subwords = []
-                        text_it = 0
-                        for i, subword in enumerate(token.subwords):
-                            if token.text[text_it:text_it + len(subword)] == subword:
-                                if i == len(token.subwords) - 1 and (text_it + len(subword) < len(token.text)):
-                                    subword = token.text[text_it:]
-                                fixed_subwords.append(subword)
-                                text_it += len(subword)
-                            else:
-                                fixed_subwords.append(token.text[text_it:text_it + len(subword)])
-                                text_it += len(subword)
+                        fixed_subwords = fix_subwords(token)
                         token.subwords = fixed_subwords
                 #     sentence_tokens.extend(_sentence_tokens(token, split_subwords))
                 # else:
@@ -151,3 +150,18 @@ class LamboTokenizer(Tokenizer):
             sentences.append(sentence_tokens)
 
         return sentences
+
+
+def fix_subwords(token: Token):
+    fixed_subwords = []
+    text_it = 0
+    for i, subword in enumerate(token.subwords):
+        if token.text[text_it:text_it + len(subword)] == subword:
+            if i == len(token.subwords) - 1 and (text_it + len(subword) < len(token.text)):
+                subword = token.text[text_it:]
+            fixed_subwords.append(subword)
+            text_it += len(subword)
+        else:
+            fixed_subwords.append(token.text[text_it:text_it + len(subword)])
+            text_it += len(subword)
+    return fixed_subwords
\ No newline at end of file
--
GitLab
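
For reference, below is a minimal, self-contained sketch of the realignment logic this patch factors out into fix_subwords(). The Token dataclass is a hypothetical stand-in for combo's own Token type (only the text and subwords attributes used by the helper are modelled), and the example word is illustrative of the failure mode described in the diff; the function body itself mirrors the helper added by the patch.

from dataclasses import dataclass
from typing import List


@dataclass
class Token:
    # Hypothetical stand-in for combo's Token: only the two attributes
    # read and written by fix_subwords are modelled here.
    text: str
    subwords: List[str]


def fix_subwords(token: Token) -> List[str]:
    # Realign token.subwords against token.text: each subword that no longer
    # matches is replaced by the slice of token.text it should cover, and the
    # last subword is extended to absorb any trailing characters that the
    # Lambo model dropped.
    fixed_subwords = []
    text_it = 0
    for i, subword in enumerate(token.subwords):
        if token.text[text_it:text_it + len(subword)] == subword:
            # Last subword that matches but leaves text uncovered: extend it
            # to the end of the token text.
            if i == len(token.subwords) - 1 and (text_it + len(subword) < len(token.text)):
                subword = token.text[text_it:]
            fixed_subwords.append(subword)
            text_it += len(subword)
        else:
            # Mismatch: take the same-length slice of the token text instead.
            fixed_subwords.append(token.text[text_it:text_it + len(subword)])
            text_it += len(subword)
    return fixed_subwords


# The failure mode described in the diff: the last letter of the final
# multiword part is missing, so the subwords no longer concatenate to
# the token text.
token = Token(text="gdybyśmy", subwords=["gdyby", "śm"])
assert "".join(token.subwords) != token.text
print(fix_subwords(token))  # ['gdyby', 'śmy']

Realigning against token.text restores the invariant the patch checks for ("".join(token.subwords) == token.text) while keeping the original subword boundaries wherever they already match.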