Skip to content
Snippets Groups Projects
Commit 83dcdfdf authored by Michał Marcińczuk's avatar Michał Marcińczuk
Browse files

Trim too long tokens.

parent 5ea7c9c4
1 merge request: !41 Dev v07
Pipeline #6120 failed with stage
in 1 minute and 47 seconds
...@@ -112,8 +112,9 @@ class FeatureGenerator: ...@@ -112,8 +112,9 @@ class FeatureGenerator:
labels = ["O"] * len(tokens) labels = ["O"] * len(tokens)
for word, label_1 in zip(tokens, labels): for word, label_1 in zip(tokens, labels):
subtokens = self.encode_method(word.strip()) subtokens = self.encode_method(word.strip())
# Temporal hack to shorten token to max 14 subtokens if len(subtokens) > 6:
subtokens = subtokens[:6] logging.warning(f"Token {word} was truncated to 6 subtokens: {subtokens}")
subtokens = subtokens[:6]
if len(subtokens) == 0: if len(subtokens) == 0:
replacement = "x" * len(word.strip()) replacement = "x" * len(word.strip())
logging.warning(f"Token '{word}' has no subwords. It was replaced with '{replacement}'") logging.warning(f"Token '{word}' has no subwords. It was replaced with '{replacement}'")
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment