From ba8acc86771fec803d2e39ee59373d12c01156ff Mon Sep 17 00:00:00 2001
From: Piotr <piotr.m.przybyla@gmail.com>
Date: Wed, 19 Oct 2022 09:44:19 +0200
Subject: [PATCH] LAMBO integration working.

---
 combo/utils/lambo.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/combo/utils/lambo.py b/combo/utils/lambo.py
index 990b03f..75284ef 100644
--- a/combo/utils/lambo.py
+++ b/combo/utils/lambo.py
@@ -6,7 +6,7 @@ from lambo.segmenter.lambo import Lambo
 
 class LamboTokenizer(Tokenizer):
 
-    def __init__(self, model: str = "en",) -> None:
+    def __init__(self, model: str = "LAMBO_no_pretraining-UD_Polish-PDB",) -> None:
         self.lambo=Lambo.get(model)
 
     # Simple tokenisation: ignoring sentence split
@@ -20,13 +20,13 @@ class LamboTokenizer(Tokenizer):
         return result
 
     # Full segmentation: divide into sentences and tokens
-    def segment(self, text: str) -> List[List[Token]]:
+    def segment(self, text: str) -> List[List[str]]:
         result = []
         document = self.lambo.segment(text)
         for turn in document.turns:
             for sentence in turn.sentences:
                 resultS=[]
                 for token in sentence.tokens:
-                    resultS.append(Token(token.text))
+                    resultS.append(token.text)
                 result.append(resultS)
         return result
\ No newline at end of file
-- 
GitLab
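
Usage note (not part of the patch): a minimal sketch of how the changed
segment() signature is meant to be called after this commit. The import path
combo.utils.lambo and the local availability of the new default LAMBO model
are assumptions read off the diff above, not verified here.

    from combo.utils.lambo import LamboTokenizer

    # Defaults to the new "LAMBO_no_pretraining-UD_Polish-PDB" model.
    tokenizer = LamboTokenizer()

    # segment() now returns List[List[str]]: one inner list of plain token
    # strings per sentence, instead of wrapping each string in a Token object.
    sentences = tokenizer.segment("Ala ma kota. Ala lubi koty.")
    for sentence in sentences:
        print(sentence)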