diff --git a/combo/predict.py b/combo/predict.py
index fcc8fff031b4d55ef10878d0f6b5766067146207..1481bf54466153290d01db4a2c83848f9a309d44 100644
--- a/combo/predict.py
+++ b/combo/predict.py
@@ -59,7 +59,11 @@ class COMBO(predictor.Predictor):
 
     def predict(self, sentence: Union[str, List[str], List[List[str]], List[data.Sentence]]):
         if isinstance(sentence, str):
-            return self.predict_json({"sentence": sentence})
+            if isinstance(self._tokenizer, lambo.LamboTokenizer):
+                segmented = self._tokenizer.segment(sentence)
+                return self.predict(segmented)
+            else:
+                return self.predict_json({"sentence": sentence})
         elif isinstance(sentence, list):
             if len(sentence) == 0:
                 return []
diff --git a/combo/utils/lambo.py b/combo/utils/lambo.py
index 5493a2e39d22bb6e33b3b88aec9448e8144826f7..990b03f57ec6d310ba802e46d52afd4537bac1b6 100644
--- a/combo/utils/lambo.py
+++ b/combo/utils/lambo.py
@@ -2,12 +2,31 @@ from typing import List
 
 from allennlp.data.tokenizers.tokenizer import Tokenizer
 from allennlp.data.tokenizers.token_class import Token
+from lambo.segmenter.lambo import Lambo
 
 
 class LamboTokenizer(Tokenizer):
-    def __init__(self, language: str = "??",) -> None:
-        self.language = language
+    def __init__(self, model: str = "en") -> None:
+        self.lambo = Lambo.get(model)
 
+    # Simple tokenisation: flatten the document, ignoring sentence splits
     def tokenize(self, text: str) -> List[Token]:
-        #TODO
-        return None
\ No newline at end of file
+        result = []
+        document = self.lambo.segment(text)
+        for turn in document.turns:
+            for sentence in turn.sentences:
+                for token in sentence.tokens:
+                    result.append(Token(token.text))
+        return result
+
+    # Full segmentation: divide the text into sentences and tokens
+    def segment(self, text: str) -> List[List[Token]]:
+        result = []
+        document = self.lambo.segment(text)
+        for turn in document.turns:
+            for sentence in turn.sentences:
+                sentence_tokens = []
+                for token in sentence.tokens:
+                    sentence_tokens.append(Token(token.text))
+                result.append(sentence_tokens)
+        return result
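
For reference, a minimal usage sketch of the new tokenizer. It assumes the `lambo` package is installed and that `Lambo.get("en")` can fetch the pretrained English model, as the patch's default does; the sample text is arbitrary.

```python
from combo.utils.lambo import LamboTokenizer

# Fetches the pretrained LAMBO segmentation model ("en" by default).
tokenizer = LamboTokenizer("en")

text = "The quick brown fox jumps over the lazy dog. It barely notices."

# tokenize(): one flat list of allennlp Token objects, sentence splits discarded.
tokens = tokenizer.tokenize(text)
print([token.text for token in tokens])

# segment(): a list of sentences, each itself a list of Token objects;
# this is the nested shape that the updated COMBO.predict() recurses on
# when it receives a raw string and its tokenizer is a LamboTokenizer.
sentences = tokenizer.segment(text)
print([[token.text for token in sentence] for sentence in sentences])
```

With this in place, calling `predict(raw_text)` on a COMBO predictor whose `_tokenizer` is a `LamboTokenizer` first segments the text with LAMBO and then re-dispatches on the pre-segmented sentences, so only non-LAMBO tokenizers still fall through to `predict_json`.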