Skip to content
Snippets Groups Projects
Commit ac6cab41 authored by piotrmp's avatar piotrmp
Browse files

LAMBO segmentation prototype.

parent 045232b5
Branches
No related merge requests found
Pipeline #6081 failed with stage
in 5 minutes and 19 seconds
......@@ -59,7 +59,11 @@ class COMBO(predictor.Predictor):
def predict(self, sentence: Union[str, List[str], List[List[str]], List[data.Sentence]]):
if isinstance(sentence, str):
return self.predict_json({"sentence": sentence})
if isinstance(self._tokenizer,lambo.LamboTokenizer):
segmented = self._tokenizer.segment(sentence)
return self.predict(segmented)
else:
return self.predict_json({"sentence": sentence})
elif isinstance(sentence, list):
if len(sentence) == 0:
return []
......
......@@ -2,12 +2,31 @@ from typing import List
from allennlp.data.tokenizers.tokenizer import Tokenizer
from allennlp.data.tokenizers.token_class import Token
from lambo.segmenter.lambo import Lambo
class LamboTokenizer(Tokenizer):
    """AllenNLP tokenizer backed by a LAMBO segmentation model.

    The underlying LAMBO model splits raw text into turns, sentences and
    tokens; this wrapper exposes either a flat token stream (``tokenize``)
    or sentence-grouped tokens (``segment``).
    """

    def __init__(self, model: str = "en",) -> None:
        # Load the pretrained LAMBO model for the given model/language name.
        # NOTE(review): default was changed from "??" to "en" in this commit.
        self.lambo = Lambo.get(model)

    # Simple tokenisation: ignoring sentence split
    def tokenize(self, text: str) -> List[Token]:
        """Return a flat list of tokens for ``text``, ignoring sentence boundaries."""
        document = self.lambo.segment(text)
        return [
            Token(token.text)
            for turn in document.turns
            for sentence in turn.sentences
            for token in sentence.tokens
        ]

    # Full segmentation: divide into sentences and tokens
    def segment(self, text: str) -> List[List[Token]]:
        """Return tokens grouped by sentence: one inner list per LAMBO sentence."""
        document = self.lambo.segment(text)
        return [
            [Token(token.text) for token in sentence.tokens]
            for turn in document.turns
            for sentence in turn.sentences
        ]
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment