diff --git a/combo/utils/lambo.py b/combo/utils/lambo.py
index 990b03f57ec6d310ba802e46d52afd4537bac1b6..75284ef9a85272b0856c39ef4c71eccf479f16dc 100644
--- a/combo/utils/lambo.py
+++ b/combo/utils/lambo.py
@@ -6,7 +6,7 @@ from lambo.segmenter.lambo import Lambo
 
 
 class LamboTokenizer(Tokenizer):
-    def __init__(self, model: str = "en",) -> None:
+    def __init__(self, model: str = "LAMBO_no_pretraining-UD_Polish-PDB",) -> None:
         self.lambo=Lambo.get(model)
 
     # Simple tokenisation: ignoring sentence split
@@ -20,13 +20,13 @@ class LamboTokenizer(Tokenizer):
         return result
 
     # Full segmentation: divide into sentences and tokens
-    def segment(self, text: str) -> List[List[Token]]:
+    def segment(self, text: str) -> List[List[str]]:
         result = []
         document = self.lambo.segment(text)
         for turn in document.turns:
             for sentence in turn.sentences:
                 resultS=[]
                 for token in sentence.tokens:
-                    resultS.append(Token(token.text))
+                    resultS.append(token.text)
                 result.append(resultS)
         return result
\ No newline at end of file