Skip to content
Snippets Groups Projects
Commit 5ea7c9c4 authored by Michał Marcińczuk's avatar Michał Marcińczuk
Browse files

Trim too long tokens.

parent 6da40619
1 merge request: !41 Dev v07
Pipeline #6119 failed with stage
in 3 minutes and 4 seconds
......@@ -112,6 +112,8 @@ class FeatureGenerator:
labels = ["O"] * len(tokens)
for word, label_1 in zip(tokens, labels):
subtokens = self.encode_method(word.strip())
# Temporary hack: truncate each token to at most 6 subtokens
subtokens = subtokens[:6]
if len(subtokens) == 0:
replacement = "x" * len(word.strip())
logging.warning(f"Token '{word}' has no subwords. It was replaced with '{replacement}'")
......@@ -225,7 +227,8 @@ class FeatureGeneratorWindowContext(FeatureGenerator):
sentence_end = 1 if idx + 1 == len(sentence_tokens_features.tokens) else 0
if token_features.length() + 1 > self.max_segment_length:
raise Exception("Single token has move subtokens than the max_segment_length limit.")
raise Exception(f"Single token has more subtokens than the max_segment_length limit. "
f"Token: {token_features.tokens}. Length: {token_features.length()}")
if token_features.length() + segment_features.length() + sentence_end > self.max_segment_length:
segment_features = SegmentFeatures()
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment