Skip to content
Snippets Groups Projects
Commit ba8acc86 authored by Piotr
Browse files

LAMBO integration working.

parent ac6cab41
No related merge requests found
Pipeline #6087 failed in 5 minutes and 55 seconds
...@@ -6,7 +6,7 @@ from lambo.segmenter.lambo import Lambo ...@@ -6,7 +6,7 @@ from lambo.segmenter.lambo import Lambo
class LamboTokenizer(Tokenizer): class LamboTokenizer(Tokenizer):
def __init__(self, model: str = "en",) -> None: def __init__(self, model: str = "LAMBO_no_pretraining-UD_Polish-PDB",) -> None:
self.lambo=Lambo.get(model) self.lambo=Lambo.get(model)
# Simple tokenisation: ignoring sentence split # Simple tokenisation: ignoring sentence split
...@@ -20,13 +20,13 @@ class LamboTokenizer(Tokenizer): ...@@ -20,13 +20,13 @@ class LamboTokenizer(Tokenizer):
return result return result
# Full segmentation: divide into sentences and tokens # Full segmentation: divide into sentences and tokens
def segment(self, text: str) -> List[List[Token]]: def segment(self, text: str) -> List[List[str]]:
result = [] result = []
document = self.lambo.segment(text) document = self.lambo.segment(text)
for turn in document.turns: for turn in document.turns:
for sentence in turn.sentences: for sentence in turn.sentences:
resultS=[] resultS=[]
for token in sentence.tokens: for token in sentence.tokens:
resultS.append(Token(token.text)) resultS.append(token.text)
result.append(resultS) result.append(resultS)
return result return result
\ No newline at end of file
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment