From ac6cab412b1c295159c2924e239be32bbdb63885 Mon Sep 17 00:00:00 2001
From: piotrmp <piotr.m.przybyla@gmail.com>
Date: Tue, 18 Oct 2022 17:08:58 +0200
Subject: [PATCH] LAMBO segmentation prototype.

---
 combo/predict.py     |  6 +++++-
 combo/utils/lambo.py | 27 +++++++++++++++++++++++----
 2 files changed, 28 insertions(+), 5 deletions(-)

diff --git a/combo/predict.py b/combo/predict.py
index fcc8fff..1481bf5 100644
--- a/combo/predict.py
+++ b/combo/predict.py
@@ -59,7 +59,11 @@ class COMBO(predictor.Predictor):
 
     def predict(self, sentence: Union[str, List[str], List[List[str]], List[data.Sentence]]):
         if isinstance(sentence, str):
-            return self.predict_json({"sentence": sentence})
+            if isinstance(self._tokenizer, lambo.LamboTokenizer):
+                segmented = self._tokenizer.segment(sentence)
+                return self.predict(segmented)
+            else:
+                return self.predict_json({"sentence": sentence})
         elif isinstance(sentence, list):
             if len(sentence) == 0:
                 return []
diff --git a/combo/utils/lambo.py b/combo/utils/lambo.py
index 5493a2e..990b03f 100644
--- a/combo/utils/lambo.py
+++ b/combo/utils/lambo.py
@@ -2,12 +2,31 @@ from typing import List
 
 from allennlp.data.tokenizers.tokenizer import Tokenizer
 from allennlp.data.tokenizers.token_class import Token
+from lambo.segmenter.lambo import Lambo
 
 class LamboTokenizer(Tokenizer):
 
-    def __init__(self, language: str = "??",) -> None:
-        self.language = language
+    def __init__(self, model: str = "en") -> None:
+        self.lambo = Lambo.get(model)
 
+    # Simple tokenisation: ignoring sentence split
     def tokenize(self, text: str) -> List[Token]:
-        #TODO
-        return None
\ No newline at end of file
+        result = []
+        document = self.lambo.segment(text)
+        for turn in document.turns:
+            for sentence in turn.sentences:
+                for token in sentence.tokens:
+                    result.append(Token(token.text))
+        return result
+
+    # Full segmentation: divide into sentences and tokens
+    def segment(self, text: str) -> List[List[Token]]:
+        result = []
+        document = self.lambo.segment(text)
+        for turn in document.turns:
+            for sentence in turn.sentences:
+                resultS = []
+                for token in sentence.tokens:
+                    resultS.append(Token(token.text))
+                result.append(resultS)
+        return result
\ No newline at end of file
-- 
GitLab