From ac6cab412b1c295159c2924e239be32bbdb63885 Mon Sep 17 00:00:00 2001
From: piotrmp <piotr.m.przybyla@gmail.com>
Date: Tue, 18 Oct 2022 17:08:58 +0200
Subject: [PATCH] LAMBO segmentation prototype.

---
 combo/predict.py     |  6 +++++-
 combo/utils/lambo.py | 27 +++++++++++++++++++++++----
 2 files changed, 28 insertions(+), 5 deletions(-)

diff --git a/combo/predict.py b/combo/predict.py
index fcc8fff..1481bf5 100644
--- a/combo/predict.py
+++ b/combo/predict.py
@@ -59,7 +59,11 @@ class COMBO(predictor.Predictor):
 
     def predict(self, sentence: Union[str, List[str], List[List[str]], List[data.Sentence]]):
         if isinstance(sentence, str):
-            return self.predict_json({"sentence": sentence})
+            if isinstance(self._tokenizer, lambo.LamboTokenizer):
+                segmented = self._tokenizer.segment(sentence)
+                return self.predict(segmented)
+            else:
+                return self.predict_json({"sentence": sentence})
         elif isinstance(sentence, list):
             if len(sentence) == 0:
                 return []
diff --git a/combo/utils/lambo.py b/combo/utils/lambo.py
index 5493a2e..990b03f 100644
--- a/combo/utils/lambo.py
+++ b/combo/utils/lambo.py
@@ -2,12 +2,31 @@ from typing import List
 
 from allennlp.data.tokenizers.tokenizer import Tokenizer
 from allennlp.data.tokenizers.token_class import Token
+from lambo.segmenter.lambo import Lambo
 
 
 class LamboTokenizer(Tokenizer):
-    def __init__(self, language: str = "??",) -> None:
-        self.language = language
+    def __init__(self, model: str = "en",) -> None:
+        self.lambo = Lambo.get(model)
 
+    # Simple tokenisation: ignoring sentence split
     def tokenize(self, text: str) -> List[Token]:
-        #TODO
-        return None
\ No newline at end of file
+        result = []
+        document = self.lambo.segment(text)
+        for turn in document.turns:
+            for sentence in turn.sentences:
+                for token in sentence.tokens:
+                    result.append(Token(token.text))
+        return result
+
+    # Full segmentation: divide into sentences and tokens
+    def segment(self, text: str) -> List[List[Token]]:
+        result = []
+        document = self.lambo.segment(text)
+        for turn in document.turns:
+            for sentence in turn.sentences:
+                sentence_tokens = []
+                for token in sentence.tokens:
+                    sentence_tokens.append(Token(token.text))
+                result.append(sentence_tokens)
+        return result
\ No newline at end of file
--
GitLab
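
Note for reviewers: a minimal sketch of how the new code path is meant to be exercised, assuming the lambo package is installed and its Lambo.get()/segment() behave as the patch uses them; the model name "en" and the input text are illustrative only, not part of the patch.

# Hypothetical usage sketch for the new LamboTokenizer (not part of the patch).
from combo.utils.lambo import LamboTokenizer

# Loads the pretrained LAMBO segmentation model for the given language code
# (assumption: "en" is a valid model name for Lambo.get()).
tokenizer = LamboTokenizer("en")

# Flat tokenisation: one list of Tokens, sentence boundaries ignored.
tokens = tokenizer.tokenize("Hello there. How are you?")
print([t.text for t in tokens])

# Full segmentation: one list of Tokens per detected sentence.
sentences = tokenizer.segment("Hello there. How are you?")
print([[t.text for t in s] for s in sentences])

Design note: the change in combo/predict.py makes COMBO.predict() recurse on the segmented output when it receives a raw string and its tokenizer is a LamboTokenizer, so string input is funnelled into the same list-of-sentences branch that already handles pre-segmented input.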