Skip to content
Snippets Groups Projects
Commit 03fd1def authored by piotrmp's avatar piotrmp Committed by Lukasz Pszenny
Browse files

LAMBO segmentation prototype.

parent de83b195
Branches
No related tags found
No related merge requests found
......@@ -59,6 +59,10 @@ class COMBO(predictor.Predictor):
def predict(self, sentence: Union[str, List[str], List[List[str]], List[data.Sentence]]):
if isinstance(sentence, str):
if isinstance(self._tokenizer,lambo.LamboTokenizer):
segmented = self._tokenizer.segment(sentence)
return self.predict(segmented)
else:
return self.predict_json({"sentence": sentence})
elif isinstance(sentence, list):
if len(sentence) == 0:
......
......@@ -2,12 +2,31 @@ from typing import List
from allennlp.data.tokenizers.tokenizer import Tokenizer
from allennlp.data.tokenizers.token_class import Token
from lambo.segmenter.lambo import Lambo
class LamboTokenizer(Tokenizer):
    """AllenNLP tokenizer backed by a pretrained LAMBO segmentation model.

    Wraps ``lambo.segmenter.lambo.Lambo`` so that its document/turn/sentence/
    token hierarchy can be consumed either as a flat token stream
    (``tokenize``) or grouped by sentence (``segment``).
    """

    def __init__(self, model: str = "en",) -> None:
        # Download/load the pretrained LAMBO model for the given model name
        # (e.g. a language code such as "en").
        self.lambo = Lambo.get(model)

    def _sentences(self, text: str):
        """Segment *text* with LAMBO and yield its sentence objects in order.

        Shared traversal for ``tokenize`` and ``segment``: a LAMBO document
        is a list of turns, each holding a list of sentences.
        """
        document = self.lambo.segment(text)
        for turn in document.turns:
            for sentence in turn.sentences:
                yield sentence

    # Simple tokenisation: ignoring sentence split
    def tokenize(self, text: str) -> List[Token]:
        """Return all tokens of *text* as a flat list, discarding sentence
        boundaries."""
        return [
            Token(token.text)
            for sentence in self._sentences(text)
            for token in sentence.tokens
        ]

    # Full segmentation: divide into sentences and tokens
    def segment(self, text: str) -> List[List[Token]]:
        """Return the tokens of *text* grouped into one list per sentence."""
        return [
            [Token(token.text) for token in sentence.tokens]
            for sentence in self._sentences(text)
        ]
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment