From ebc411b04618ffe19807fd48e7a38de5d61a7709 Mon Sep 17 00:00:00 2001
From: Piotr <piotr.m.przybyla@gmail.com>
Date: Thu, 13 Oct 2022 14:37:27 +0200
Subject: [PATCH] LAMBO Tokenization skeleton.

---
 combo/main.py        |  4 ++--
 combo/predict.py     |  9 +++++++--
 combo/utils/lambo.py | 13 +++++++++++++
 3 files changed, 22 insertions(+), 4 deletions(-)
 create mode 100644 combo/utils/lambo.py

diff --git a/combo/main.py b/combo/main.py
index 7925399..5a2066c 100644
--- a/combo/main.py
+++ b/combo/main.py
@@ -86,8 +86,8 @@ flags.DEFINE_integer(name="batch_size", default=1,
 flags.DEFINE_boolean(name="silent", default=True,
                      help="Silent prediction to file (without printing to console).")
 flags.DEFINE_enum(name="predictor_name", default="combo-spacy",
-                  enum_values=["combo", "combo-spacy", "lambo"],
-                  help="Use predictor with whitespace or spacy tokenizer.")
+                  enum_values=["combo", "combo-spacy", "combo-lambo"],
+                  help="Use predictor with whitespace, spacy or LAMBO tokenizer.")
 
 
 def run(_):
diff --git a/combo/predict.py b/combo/predict.py
index 83b030f..803928f 100644
--- a/combo/predict.py
+++ b/combo/predict.py
@@ -10,15 +10,15 @@ from allennlp.data import tokenizers
 from allennlp.predictors import predictor
 from overrides import overrides
 
-from combo import data
 from combo.data import sentence2conllu, tokens2conllu, conllu2sentence
-from combo.utils import download, graph
+from combo.utils import download, graph, lambo
 
 logger = logging.getLogger(__name__)
 
 
 @predictor.Predictor.register("combo")
 @predictor.Predictor.register("combo-spacy", constructor="with_spacy_tokenizer")
+@predictor.Predictor.register("combo-lambo", constructor="with_lambo_tokenizer")
 class COMBO(predictor.Predictor):
 
     def __init__(self,
@@ -230,6 +230,11 @@ class COMBO(predictor.Predictor):
     def with_spacy_tokenizer(cls, model: models.Model,
                              dataset_reader: allen_data.DatasetReader):
         return cls(model, dataset_reader, tokenizers.SpacyTokenizer())
+
+    @classmethod
+    def with_lambo_tokenizer(cls, model: models.Model,
+                             dataset_reader: allen_data.DatasetReader):
+        return cls(model, dataset_reader, lambo.LamboTokenizer())
 
     @classmethod
     def from_pretrained(cls, path: str, tokenizer=tokenizers.SpacyTokenizer(),
diff --git a/combo/utils/lambo.py b/combo/utils/lambo.py
new file mode 100644
index 0000000..5493a2e
--- /dev/null
+++ b/combo/utils/lambo.py
@@ -0,0 +1,13 @@
+from typing import List
+
+from allennlp.data.tokenizers.tokenizer import Tokenizer
+from allennlp.data.tokenizers.token_class import Token
+
+class LamboTokenizer(Tokenizer):
+
+    def __init__(self, language: str = "??",) -> None:
+        self.language = language
+
+    def tokenize(self, text: str) -> List[Token]:
+        #TODO
+        return None
\ No newline at end of file
--
GitLab
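
Note (not part of the patch): the LamboTokenizer added above is only a skeleton, and
tokenize() currently returns None. The sketch below shows one possible way the TODO
could later be filled in. It assumes the LAMBO segmenter exposes a Lambo.get(language)
loader and a segment(text) method returning a document of turns, sentences and tokens;
those names, the lambo.segmenter.lambo import path and the "English" default are
assumptions about the LAMBO API, not something confirmed by this commit.

# Sketch only -- the LAMBO calls below (Lambo.get, segment, and the
# document -> turns -> sentences -> tokens attributes) are assumed, not taken
# from this commit; check them against the LAMBO library before relying on them.
from typing import List

from allennlp.data.tokenizers.token_class import Token
from allennlp.data.tokenizers.tokenizer import Tokenizer
from lambo.segmenter.lambo import Lambo  # assumed import path


class LamboTokenizer(Tokenizer):

    def __init__(self, language: str = "English") -> None:
        self.language = language
        # Load the pretrained LAMBO segmentation model for the chosen language.
        self._lambo = Lambo.get(language)

    def tokenize(self, text: str) -> List[Token]:
        # Segment the raw text and flatten the turn -> sentence -> token
        # hierarchy into a single list of AllenNLP Token objects.
        document = self._lambo.segment(text)
        tokens: List[Token] = []
        for turn in document.turns:
            for sentence in turn.sentences:
                for token in sentence.tokens:
                    tokens.append(Token(token.text))
        return tokens

With the predictor registered as "combo-lambo" (as in the patch), it would then be
selected with --predictor_name combo-lambo or constructed directly via
COMBO.with_lambo_tokenizer(model, dataset_reader).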