Skip to content
Snippets Groups Projects
Commit 5ea7c9c4 authored by Michał Marcińczuk's avatar Michał Marcińczuk
Browse files

Trim too long tokens.

parent 6da40619
No related branches found
No related tags found
1 merge request!41Dev v07
Pipeline #6119 failed
...@@ -112,6 +112,8 @@ class FeatureGenerator: ...@@ -112,6 +112,8 @@ class FeatureGenerator:
labels = ["O"] * len(tokens) labels = ["O"] * len(tokens)
for word, label_1 in zip(tokens, labels): for word, label_1 in zip(tokens, labels):
subtokens = self.encode_method(word.strip()) subtokens = self.encode_method(word.strip())
# Temporary hack to shorten token to max 6 subtokens
subtokens = subtokens[:6]
if len(subtokens) == 0: if len(subtokens) == 0:
replacement = "x" * len(word.strip()) replacement = "x" * len(word.strip())
logging.warning(f"Token '{word}' has no subwords. It was replaced with '{replacement}'") logging.warning(f"Token '{word}' has no subwords. It was replaced with '{replacement}'")
...@@ -225,7 +227,8 @@ class FeatureGeneratorWindowContext(FeatureGenerator): ...@@ -225,7 +227,8 @@ class FeatureGeneratorWindowContext(FeatureGenerator):
sentence_end = 1 if idx + 1 == len(sentence_tokens_features.tokens) else 0 sentence_end = 1 if idx + 1 == len(sentence_tokens_features.tokens) else 0
if token_features.length() + 1 > self.max_segment_length: if token_features.length() + 1 > self.max_segment_length:
raise Exception("Single token has move subtokens than the max_segment_length limit.") raise Exception(f"Single token has more subtokens than the max_segment_length limit. "
f"Token: {token_features.tokens}. Length: {token_features.length()}")
if token_features.length() + segment_features.length() + sentence_end > self.max_segment_length: if token_features.length() + segment_features.length() + sentence_end > self.max_segment_length:
segment_features = SegmentFeatures() segment_features = SegmentFeatures()
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment