diff --git a/combo/main.py b/combo/main.py index 79253999d11d7d38d2eb5c8aed7ceeffad31a283..5a2066c20fd1db98d2d88a3581200bb968356632 100644 --- a/combo/main.py +++ b/combo/main.py @@ -86,8 +86,8 @@ flags.DEFINE_integer(name="batch_size", default=1, flags.DEFINE_boolean(name="silent", default=True, help="Silent prediction to file (without printing to console).") flags.DEFINE_enum(name="predictor_name", default="combo-spacy", - enum_values=["combo", "combo-spacy", "lambo"], - help="Use predictor with whitespace or spacy tokenizer.") + enum_values=["combo", "combo-spacy", "combo-lambo"], + help="Use predictor with whitespace, spacy or LAMBO tokenizer.") def run(_): diff --git a/combo/predict.py b/combo/predict.py index 83b030ff41a5026672a2e555115698980d00de77..803928fff4bb531a626754a89982b6910a374c9c 100644 --- a/combo/predict.py +++ b/combo/predict.py @@ -10,15 +10,15 @@ from allennlp.data import tokenizers from allennlp.predictors import predictor from overrides import overrides -from combo import data from combo.data import sentence2conllu, tokens2conllu, conllu2sentence -from combo.utils import download, graph +from combo.utils import download, graph, lambo logger = logging.getLogger(__name__) @predictor.Predictor.register("combo") @predictor.Predictor.register("combo-spacy", constructor="with_spacy_tokenizer") +@predictor.Predictor.register("combo-lambo", constructor="with_lambo_tokenizer") class COMBO(predictor.Predictor): def __init__(self, @@ -230,6 +230,11 @@ class COMBO(predictor.Predictor): def with_spacy_tokenizer(cls, model: models.Model, dataset_reader: allen_data.DatasetReader): return cls(model, dataset_reader, tokenizers.SpacyTokenizer()) + + @classmethod + def with_lambo_tokenizer(cls, model: models.Model, + dataset_reader: allen_data.DatasetReader): + return cls(model, dataset_reader, lambo.LamboTokenizer()) @classmethod def from_pretrained(cls, path: str, tokenizer=tokenizers.SpacyTokenizer(), diff --git a/combo/utils/lambo.py b/combo/utils/lambo.py 
new file mode 100644
index 0000000000000000000000000000000000000000..5493a2e39d22bb6e33b3b88aec9448e8144826f7
--- /dev/null
+++ b/combo/utils/lambo.py
@@ -0,0 +1,33 @@
+from typing import List
+
+from allennlp.data.tokenizers.tokenizer import Tokenizer
+from allennlp.data.tokenizers.token_class import Token
+
+
+class LamboTokenizer(Tokenizer):
+    """Placeholder tokenizer meant to back the LAMBO-based predictor.
+
+    Only the constructor is functional so far. Tokenization itself is not
+    implemented yet, so ``tokenize`` fails loudly instead of handing callers
+    a ``None`` that would only break later, away from the real cause.
+    """
+
+    def __init__(self, language: str = "??") -> None:
+        """Store the language identifier on the instance.
+
+        :param language: language identifier kept as ``self.language``; the
+            default ``"??"`` looks like a placeholder value (TODO confirm).
+        """
+        super().__init__()
+        self.language = language
+
+    def tokenize(self, text: str) -> List[Token]:
+        """Split ``text`` into tokens (not implemented yet).
+
+        :param text: raw text to tokenize.
+        :raises NotImplementedError: always, until tokenization is written.
+        """
+        # TODO: implement tokenization and return a list of Token objects.
+        raise NotImplementedError(
+            "LamboTokenizer.tokenize is not implemented yet"
+        )