LAMBO integration working.

e14a6660 · Piotr · Lukasz Pszenny · 03fd1def · e14a6660
Commit e14a6660, authored 2 years ago by Piotr; committed by Lukasz Pszenny 2 years ago.
--- a/combo/utils/lambo.py
+++ b/combo/utils/lambo.py
@@ -6,7 +6,7 @@ from lambo.segmenter.lambo import Lambo
 class LamboTokenizer(Tokenizer):
-    def __init__(self, model: str = "en",) -> None:
+    def __init__(self, model: str = "LAMBO_no_pretraining-UD_Polish-PDB",) -> None:
        self.lambo=Lambo.get(model)
    # Simple tokenisation: ignoring sentence split
@@ -20,13 +20,13 @@ class LamboTokenizer(Tokenizer):
        return result
    # Full segmentation: divide into sentences and tokens
-    def segment(self, text: str) -> List[List[Token]]:
+    def segment(self, text: str) -> List[List[str]]:
        result = []
        document = self.lambo.segment(text)
        for turn in document.turns:
            for sentence in turn.sentences:
                resultS=[]
                for token in sentence.tokens:
-                    resultS.append(Token(token.text))
+                    resultS.append(token.text)
                result.append(resultS)
        return result
\ No newline at end of file