Commit a08dc86d authored by Maja Jablonska

Add multiword support to lambo_tokenizer.py

parent 7ca1bc2a
Merge request !46: Merge COMBO 3.0 into master
@@ -43,9 +43,16 @@ class LamboTokenizer(Tokenizer):
         document = self.__tokenizer.segment(text)
         sentences = []
+        sentence_tokens = []
         for turn in document.turns:
             for sentence in turn.sentences:
-                sentences.append([t.text for t in sentence.tokens])
+                sentence_tokens = []
+                for token in sentence.tokens:
+                    if len(token.subwords) > 0:
+                        sentence_tokens.extend([s for s in token.subwords])
+                    else:
+                        sentence_tokens.append(token.text)
+                sentences.append(sentence_tokens)
         return sentences
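
For context, here is a minimal, self-contained sketch of the behaviour the new loop introduces: tokens that carry subwords (multiword tokens) are expanded into those subwords, while ordinary tokens keep their surface text. The Token and Sentence dataclasses below are hypothetical stand-ins for the LAMBO segmenter's objects, and the example split of "dámelo" is illustrative, not taken from the library.

from dataclasses import dataclass, field
from typing import List


@dataclass
class Token:
    # Hypothetical stand-in for a LAMBO token: surface text plus optional subwords.
    text: str
    subwords: List[str] = field(default_factory=list)


@dataclass
class Sentence:
    # Hypothetical stand-in for a LAMBO sentence.
    tokens: List[Token]


def flatten_sentences(parsed: List[Sentence]) -> List[List[str]]:
    """Expand multiword tokens into their subwords, mirroring the patched loop."""
    sentences = []
    for sentence in parsed:
        sentence_tokens = []
        for token in sentence.tokens:
            if len(token.subwords) > 0:
                # Multiword token: emit its subwords instead of the surface form.
                sentence_tokens.extend(token.subwords)
            else:
                sentence_tokens.append(token.text)
        sentences.append(sentence_tokens)
    return sentences


# Illustrative usage (assumed split, for demonstration only):
print(flatten_sentences([Sentence([Token("dámelo", ["da", "me", "lo"]), Token(".")])]))
# -> [['da', 'me', 'lo', '.']]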