Trim too long tokens.

83dcdfdf · Michał Marcińczuk · 5ea7c9c4 · 83dcdfdf
Commit 83dcdfdf authored 2 years ago by Michał Marcińczuk
--- a/poldeepner2/utils/sequences.py
+++ b/poldeepner2/utils/sequences.py
@@ -112,8 +112,9 @@ class FeatureGenerator:
            labels = ["O"] * len(tokens)
        for word, label_1 in zip(tokens, labels):
            subtokens = self.encode_method(word.strip())
-            # Temporal hack to shorten token to max 14 subtokens
+            if len(subtokens) > 6:
-            subtokens = subtokens[:6]
+                logging.warning(f"Token {word} was truncated to 6 subtokens: {subtokens}")
+                subtokens = subtokens[:6]
            if len(subtokens) == 0:
                replacement = "x" * len(word.strip())
                logging.warning(f"Token '{word}' has no subwords. It was replaced with '{replacement}'")