From 5ea7c9c4dab175bf093eecf41f472143606ec5b1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20Marci=C5=84czuk?= <marcinczuk@gmail.com>
Date: Thu, 20 Oct 2022 11:01:16 +0200
Subject: [PATCH] Trim too-long tokens.

---
 poldeepner2/utils/sequences.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/poldeepner2/utils/sequences.py b/poldeepner2/utils/sequences.py
index c3cd4c6..00535ae 100644
--- a/poldeepner2/utils/sequences.py
+++ b/poldeepner2/utils/sequences.py
@@ -112,6 +112,8 @@ class FeatureGenerator:
             labels = ["O"] * len(tokens)
         for word, label_1 in zip(tokens, labels):
             subtokens = self.encode_method(word.strip())
+            # Temporary hack: trim each token to at most 6 subtokens
+            subtokens = subtokens[:6]
             if len(subtokens) == 0:
                 replacement = "x" * len(word.strip())
                 logging.warning(f"Token '{word}' has no subwords. It was replaced with '{replacement}'")
@@ -225,7 +227,8 @@ class FeatureGeneratorWindowContext(FeatureGenerator):
                 sentence_end = 1 if idx + 1 == len(sentence_tokens_features.tokens) else 0
 
                 if token_features.length() + 1 > self.max_segment_length:
-                    raise Exception("Single token has move subtokens than the max_segment_length limit.")
+                    raise Exception(f"Single token has more subtokens than the max_segment_length limit. "
+                                    f"Token: {token_features.tokens}. Length: {token_features.length()}")
 
                 if token_features.length() + segment_features.length() + sentence_end > self.max_segment_length:
                     segment_features = SegmentFeatures()
-- 
GitLab