diff --git a/combo/data/tokenizers/lambo_tokenizer.py b/combo/data/tokenizers/lambo_tokenizer.py
index c5f4451034ec614cacfc08429be9bdbd644b7229..8d4e4e6c8faca685f0554bab031c308288bdc97b 100644
--- a/combo/data/tokenizers/lambo_tokenizer.py
+++ b/combo/data/tokenizers/lambo_tokenizer.py
@@ -43,9 +43,16 @@ class LamboTokenizer(Tokenizer):
         document = self.__tokenizer.segment(text)
         sentences = []
+        sentence_tokens = []
 
         for turn in document.turns:
             for sentence in turn.sentences:
-                sentences.append([t.text for t in sentence.tokens])
+                sentence_tokens = []
+                for token in sentence.tokens:
+                    if len(token.subwords) > 0:
+                        sentence_tokens.extend([s for s in token.subwords])
+                    else:
+                        sentence_tokens.append(token.text)
+                sentences.append(sentence_tokens)
 
         return sentences
 
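For context, here is a minimal, self-contained sketch of the expansion logic this diff introduces. It uses a hypothetical stand-in `Token` dataclass rather than the real LAMBO token class, and an `expand` helper that is not part of the codebase; it only illustrates how a token carrying subwords is flattened into its subword strings while plain tokens keep their surface text.

```python
from dataclasses import dataclass, field
from typing import List


@dataclass
class Token:
    # Stand-in for a LAMBO token: `subwords` is non-empty when the
    # tokenizer has split the surface form into smaller units.
    text: str
    subwords: List[str] = field(default_factory=list)


def expand(tokens: List[Token]) -> List[str]:
    # Mirrors the loop added in the diff: prefer subwords when present,
    # otherwise fall back to the token's surface text.
    sentence_tokens: List[str] = []
    for token in tokens:
        if len(token.subwords) > 0:
            sentence_tokens.extend(token.subwords)
        else:
            sentence_tokens.append(token.text)
    return sentence_tokens


if __name__ == "__main__":
    sentence = [Token("don't", subwords=["do", "n't"]), Token("stop")]
    print(expand(sentence))  # ['do', "n't", 'stop']
```

With the old code, the first token above would surface as the single string "don't"; with the change, its two subwords are emitted instead, so downstream consumers see the segmentation produced by LAMBO's subword splitting.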