Skip to content
Snippets Groups Projects
Commit ac6cab41 authored by piotrmp's avatar piotrmp
Browse files

LAMBO segmentation prototype.

parent 045232b5
No related branches found
No related tags found
No related merge requests found
Pipeline #6081 failed
......@@ -59,6 +59,10 @@ class COMBO(predictor.Predictor):
def predict(self, sentence: Union[str, List[str], List[List[str]], List[data.Sentence]]):
if isinstance(sentence, str):
if isinstance(self._tokenizer,lambo.LamboTokenizer):
segmented = self._tokenizer.segment(sentence)
return self.predict(segmented)
else:
return self.predict_json({"sentence": sentence})
elif isinstance(sentence, list):
if len(sentence) == 0:
......
......@@ -2,12 +2,31 @@ from typing import List
from allennlp.data.tokenizers.tokenizer import Tokenizer
from allennlp.data.tokenizers.token_class import Token
from lambo.segmenter.lambo import Lambo
class LamboTokenizer(Tokenizer):
    """AllenNLP ``Tokenizer`` backed by a LAMBO segmentation model.

    Wraps a pretrained ``Lambo`` segmenter so it can be used wherever an
    AllenNLP ``Tokenizer`` is expected. LAMBO structures a document as
    turns -> sentences -> tokens; the two public methods below flatten
    that structure to different depths.
    """

    def __init__(self, model: str = "en") -> None:
        """Load the pretrained LAMBO model identified by *model* (e.g. ``"en"``)."""
        # NOTE(review): Lambo.get presumably downloads/loads a pretrained
        # model by name — confirm against the lambo package documentation.
        self.lambo = Lambo.get(model)

    def tokenize(self, text: str) -> List[Token]:
        """Simple tokenisation: return a flat list of tokens, ignoring sentence splits."""
        document = self.lambo.segment(text)
        return [
            Token(token.text)
            for turn in document.turns
            for sentence in turn.sentences
            for token in sentence.tokens
        ]

    def segment(self, text: str) -> List[List[Token]]:
        """Full segmentation: divide *text* into sentences, each a list of tokens."""
        document = self.lambo.segment(text)
        return [
            [Token(token.text) for token in sentence.tokens]
            for turn in document.turns
            for sentence in turn.sentences
        ]
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment