Skip to content
Snippets Groups Projects
Commit ba8acc86 authored by Piotr
Browse files

LAMBO integration working.

parent ac6cab41
Branches
Tags
No related merge requests found
Pipeline #6087 failed in 5 minutes and 55 seconds
......@@ -6,7 +6,7 @@ from lambo.segmenter.lambo import Lambo
class LamboTokenizer(Tokenizer):
def __init__(self, model: str = "en",) -> None:
def __init__(self, model: str = "LAMBO_no_pretraining-UD_Polish-PDB",) -> None:
self.lambo=Lambo.get(model)
# Simple tokenisation: ignoring sentence split
......@@ -20,13 +20,13 @@ class LamboTokenizer(Tokenizer):
return result
# Full segmentation: divide into sentences and tokens
def segment(self, text: str) -> List[List[Token]]:
def segment(self, text: str) -> List[List[str]]:
result = []
document = self.lambo.segment(text)
for turn in document.turns:
for sentence in turn.sentences:
resultS=[]
for token in sentence.tokens:
resultS.append(Token(token.text))
resultS.append(token.text)
result.append(resultS)
return result
\ No newline at end of file
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment