From 5ea7c9c4dab175bf093eecf41f472143606ec5b1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20Marci=C5=84czuk?= <marcinczuk@gmail.com>
Date: Thu, 20 Oct 2022 11:01:16 +0200
Subject: [PATCH] Trim too long tokens.

---
 poldeepner2/utils/sequences.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/poldeepner2/utils/sequences.py b/poldeepner2/utils/sequences.py
index c3cd4c6..00535ae 100644
--- a/poldeepner2/utils/sequences.py
+++ b/poldeepner2/utils/sequences.py
@@ -112,6 +112,8 @@ class FeatureGenerator:
         labels = ["O"] * len(tokens)
         for word, label_1 in zip(tokens, labels):
             subtokens = self.encode_method(word.strip())
+            # Temporary hack to shorten token to max 6 subtokens
+            subtokens = subtokens[:6]
             if len(subtokens) == 0:
                 replacement = "x" * len(word.strip())
                 logging.warning(f"Token '{word}' has no subwords. It was replaced with '{replacement}'")
@@ -225,7 +227,8 @@ class FeatureGeneratorWindowContext(FeatureGenerator):
             sentence_end = 1 if idx + 1 == len(sentence_tokens_features.tokens) else 0
 
             if token_features.length() + 1 > self.max_segment_length:
-                raise Exception("Single token has move subtokens than the max_segment_length limit.")
+                raise Exception(f"Single token has more subtokens than the max_segment_length limit. "
+                                f"Token: {token_features.tokens}. Length: {token_features.length()}")
 
             if token_features.length() + segment_features.length() + sentence_end > self.max_segment_length:
                 segment_features = SegmentFeatures()
-- 
GitLab