Skip to content
Snippets Groups Projects
Commit 03fd1def authored by piotrmp's avatar piotrmp Committed by Lukasz Pszenny
Browse files

LAMBO segmentation prototype.

parent de83b195
No related branches found
No related tags found
No related merge requests found
...@@ -59,6 +59,10 @@ class COMBO(predictor.Predictor): ...@@ -59,6 +59,10 @@ class COMBO(predictor.Predictor):
def predict(self, sentence: Union[str, List[str], List[List[str]], List[data.Sentence]]): def predict(self, sentence: Union[str, List[str], List[List[str]], List[data.Sentence]]):
if isinstance(sentence, str): if isinstance(sentence, str):
if isinstance(self._tokenizer,lambo.LamboTokenizer):
segmented = self._tokenizer.segment(sentence)
return self.predict(segmented)
else:
return self.predict_json({"sentence": sentence}) return self.predict_json({"sentence": sentence})
elif isinstance(sentence, list): elif isinstance(sentence, list):
if len(sentence) == 0: if len(sentence) == 0:
......
...@@ -2,12 +2,31 @@ from typing import List ...@@ -2,12 +2,31 @@ from typing import List
from allennlp.data.tokenizers.tokenizer import Tokenizer from allennlp.data.tokenizers.tokenizer import Tokenizer
from allennlp.data.tokenizers.token_class import Token from allennlp.data.tokenizers.token_class import Token
from lambo.segmenter.lambo import Lambo
class LamboTokenizer(Tokenizer):
    """Tokenizer backed by the LAMBO segmentation model.

    Wraps a pretrained ``Lambo`` segmenter and exposes both flat
    tokenisation (``tokenize``) and sentence-aware segmentation
    (``segment``).
    """

    def __init__(self, model: str = "en",) -> None:
        # Fetch the pretrained LAMBO model for the given model/language name.
        self.lambo = Lambo.get(model)

    # Simple tokenisation: ignoring sentence split
    def tokenize(self, text: str) -> List[Token]:
        """Tokenise ``text`` into a single flat list of tokens.

        Sentence boundaries reported by LAMBO are discarded; this is
        simply the flattened output of :meth:`segment`.
        """
        return [token for sentence in self.segment(text) for token in sentence]

    # Full segmentation: divide into sentences and tokens
    def segment(self, text: str) -> List[List[Token]]:
        """Divide ``text`` into sentences, each a list of tokens.

        Sentences from all turns of the LAMBO document are collected
        into one flat list of sentences, in document order.
        """
        sentences = []
        document = self.lambo.segment(text)
        for turn in document.turns:
            for sentence in turn.sentences:
                # Wrap each LAMBO token's surface text in an AllenNLP Token.
                sentences.append([Token(token.text) for token in sentence.tokens])
        return sentences
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment