From a08dc86d4f25c1180a5bf12346927b8916ee49d2 Mon Sep 17 00:00:00 2001 From: Maja Jablonska <majajjablonska@gmail.com> Date: Wed, 22 Nov 2023 14:56:54 +1100 Subject: [PATCH] Add multiword support to lambo_tokenizer.py --- combo/data/tokenizers/lambo_tokenizer.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/combo/data/tokenizers/lambo_tokenizer.py b/combo/data/tokenizers/lambo_tokenizer.py index c5f4451..8d4e4e6 100644 --- a/combo/data/tokenizers/lambo_tokenizer.py +++ b/combo/data/tokenizers/lambo_tokenizer.py @@ -43,9 +43,16 @@ class LamboTokenizer(Tokenizer): document = self.__tokenizer.segment(text) sentences = [] + sentence_tokens = [] for turn in document.turns: for sentence in turn.sentences: - sentences.append([t.text for t in sentence.tokens]) + sentence_tokens = [] + for token in sentence.tokens: + if len(token.subwords) > 0: + sentence_tokens.extend([s for s in token.subwords]) + else: + sentence_tokens.append(token.text) + sentences.append(sentence_tokens) return sentences -- GitLab