diff --git a/poldeepner2/utils/sequences.py b/poldeepner2/utils/sequences.py index c3cd4c64d01c5caff33fe64972d61cd6fe256647..00535aea85e64b4abafd1d4143312afd6ac1b893 100644 --- a/poldeepner2/utils/sequences.py +++ b/poldeepner2/utils/sequences.py @@ -112,6 +112,8 @@ class FeatureGenerator: labels = ["O"] * len(tokens) for word, label_1 in zip(tokens, labels): subtokens = self.encode_method(word.strip()) + # Temporary hack to shorten token to max 6 subtokens + subtokens = subtokens[:6] if len(subtokens) == 0: replacement = "x" * len(word.strip()) logging.warning(f"Token '{word}' has no subwords. It was replaced with '{replacement}'") @@ -225,7 +227,8 @@ class FeatureGeneratorWindowContext(FeatureGenerator): sentence_end = 1 if idx + 1 == len(sentence_tokens_features.tokens) else 0 if token_features.length() + 1 > self.max_segment_length: - raise Exception("Single token has move subtokens than the max_segment_length limit.") + raise Exception(f"Single token has more subtokens than the max_segment_length limit. " + f"Token: {token_features.tokens}. Length: {token_features.length()}") if token_features.length() + segment_features.length() + sentence_end > self.max_segment_length: segment_features = SegmentFeatures()