Skip to content
Snippets Groups Projects
Commit 03fd1def authored by piotrmp's avatar piotrmp Committed by Lukasz Pszenny
Browse files

LAMBO segmentation prototype.

parent de83b195
No related merge requests found
...@@ -59,7 +59,11 @@ class COMBO(predictor.Predictor): ...@@ -59,7 +59,11 @@ class COMBO(predictor.Predictor):
def predict(self, sentence: Union[str, List[str], List[List[str]], List[data.Sentence]]): def predict(self, sentence: Union[str, List[str], List[List[str]], List[data.Sentence]]):
if isinstance(sentence, str): if isinstance(sentence, str):
return self.predict_json({"sentence": sentence}) if isinstance(self._tokenizer,lambo.LamboTokenizer):
segmented = self._tokenizer.segment(sentence)
return self.predict(segmented)
else:
return self.predict_json({"sentence": sentence})
elif isinstance(sentence, list): elif isinstance(sentence, list):
if len(sentence) == 0: if len(sentence) == 0:
return [] return []
......
...@@ -2,12 +2,31 @@ from typing import List ...@@ -2,12 +2,31 @@ from typing import List
from allennlp.data.tokenizers.tokenizer import Tokenizer from allennlp.data.tokenizers.tokenizer import Tokenizer
from allennlp.data.tokenizers.token_class import Token from allennlp.data.tokenizers.token_class import Token
from lambo.segmenter.lambo import Lambo
class LamboTokenizer(Tokenizer):
    """AllenNLP tokenizer backed by a pretrained LAMBO segmentation model.

    LAMBO performs joint sentence splitting and tokenisation; this wrapper
    exposes both a flat token view (``tokenize``) and a per-sentence view
    (``segment``) over the same underlying segmentation.
    """

    def __init__(self, model: str = "en",) -> None:
        # Load (downloading if necessary) the pretrained LAMBO model for the
        # given language/model code — NOTE(review): `Lambo.get` semantics
        # assumed from the lambo package; confirm against its documentation.
        self.lambo = Lambo.get(model)

    def tokenize(self, text: str) -> List[Token]:
        """Simple tokenisation: a flat list of tokens, ignoring sentence splits.

        Implemented by flattening ``segment`` so both methods share a single
        traversal of the LAMBO document instead of duplicating the loop nest.
        """
        return [token for sentence in self.segment(text) for token in sentence]

    def segment(self, text: str) -> List[List[Token]]:
        """Full segmentation: divide *text* into sentences, each a token list.

        Returns one inner list per LAMBO sentence, across all turns of the
        segmented document, with each LAMBO token wrapped as an AllenNLP
        ``Token``.
        """
        document = self.lambo.segment(text)
        return [
            [Token(token.text) for token in sentence.tokens]
            for turn in document.turns
            for sentence in turn.sentences
        ]
\ No newline at end of file
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment