Skip to content
Snippets Groups Projects
Commit e14a6660 authored by Piotr, committed by Lukasz Pszenny
Browse files

LAMBO integration working.

parent 03fd1def
Branches
No related merge requests found
...@@ -6,7 +6,7 @@ from lambo.segmenter.lambo import Lambo ...@@ -6,7 +6,7 @@ from lambo.segmenter.lambo import Lambo
class LamboTokenizer(Tokenizer): class LamboTokenizer(Tokenizer):
def __init__(self, model: str = "en",) -> None: def __init__(self, model: str = "LAMBO_no_pretraining-UD_Polish-PDB",) -> None:
self.lambo=Lambo.get(model) self.lambo=Lambo.get(model)
# Simple tokenisation: ignoring sentence split # Simple tokenisation: ignoring sentence split
...@@ -20,13 +20,13 @@ class LamboTokenizer(Tokenizer): ...@@ -20,13 +20,13 @@ class LamboTokenizer(Tokenizer):
return result return result
# Full segmentation: divide into sentences and tokens # Full segmentation: divide into sentences and tokens
def segment(self, text: str) -> List[List[Token]]: def segment(self, text: str) -> List[List[str]]:
result = [] result = []
document = self.lambo.segment(text) document = self.lambo.segment(text)
for turn in document.turns: for turn in document.turns:
for sentence in turn.sentences: for sentence in turn.sentences:
resultS=[] resultS=[]
for token in sentence.tokens: for token in sentence.tokens:
resultS.append(Token(token.text)) resultS.append(token.text)
result.append(resultS) result.append(resultS)
return result return result
\ No newline at end of file
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment