From a978579d2c19c75b944a676c350d0a40c2059942 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martyna=20Wi=C4=85cek?= <martyna.wiacek@ipipan.waw.pl>
Date: Sun, 4 Feb 2024 00:16:20 +0100
Subject: [PATCH] Fix subword division for multiword tokens

---
 combo/data/tokenizers/lambo_tokenizer.py | 36 ++++++++++++++++--------
 1 file changed, 25 insertions(+), 11 deletions(-)

diff --git a/combo/data/tokenizers/lambo_tokenizer.py b/combo/data/tokenizers/lambo_tokenizer.py
index e88f098..abb4e33 100644
--- a/combo/data/tokenizers/lambo_tokenizer.py
+++ b/combo/data/tokenizers/lambo_tokenizer.py
@@ -84,6 +84,15 @@ class LamboTokenizer(Tokenizer):
                     _reset_idx()
                     sentence_tokens = []
                     for token in sentence.tokens:
+                        if len(token.subwords) > 0 and split_subwords:
+                            # @TODO this is a dirty workaround for a Lambo model shortcoming:
+                            # for longer words split into multiwords, the model tends to drop the
+                            # last letter of the final multiword, so we re-derive subwords from token.text
+
+                            # check if subwords in token.subwords are consistent with token.text
+                            if "".join(token.subwords) != token.text:
+                                fixed_subwords = fix_subwords(token)
+                                token.subwords = fixed_subwords
                         sentence_tokens.extend(_sentence_tokens(token, split_subwords))
                     tokens.append(sentence_tokens)
         else:
@@ -130,17 +139,7 @@ class LamboTokenizer(Tokenizer):
 
                         # check if subwords in token.subwords are consistent with token.text
                         if "".join(token.subwords) != token.text:
-                            fixed_subwords = []
-                            text_it = 0
-                            for i, subword in enumerate(token.subwords):
-                                if token.text[text_it:text_it + len(subword)] == subword:
-                                    if i == len(token.subwords) - 1 and (text_it + len(subword) < len(token.text)):
-                                        subword = token.text[text_it:]
-                                    fixed_subwords.append(subword)
-                                    text_it += len(subword)
-                                else:
-                                    fixed_subwords.append(token.text[text_it:text_it + len(subword)])
-                                    text_it += len(subword)
+                            fixed_subwords = fix_subwords(token)
                             token.subwords = fixed_subwords
                         # sentence_tokens.extend(_sentence_tokens(token, split_subwords))
                     # else:
@@ -151,3 +150,18 @@ class LamboTokenizer(Tokenizer):
                 sentences.append(sentence_tokens)
 
         return sentences
+
+
+def fix_subwords(token: Token):
+    fixed_subwords = []
+    text_it = 0
+    for i, subword in enumerate(token.subwords):
+        if token.text[text_it:text_it + len(subword)] == subword:
+            if i == len(token.subwords) - 1 and (text_it + len(subword) < len(token.text)):
+                subword = token.text[text_it:]
+            fixed_subwords.append(subword)
+            text_it += len(subword)
+        else:
+            fixed_subwords.append(token.text[text_it:text_it + len(subword)])
+            text_it += len(subword)
+    return fixed_subwords
\ No newline at end of file
-- 
GitLab