Commit ebc411b0 authored by Piotr, committed by Lukasz Pszenny
LAMBO Tokenization skeleton.

parent 21b65381
@@ -86,8 +86,8 @@ flags.DEFINE_integer(name="batch_size", default=1,
 flags.DEFINE_boolean(name="silent", default=True,
                      help="Silent prediction to file (without printing to console).")
 flags.DEFINE_enum(name="predictor_name", default="combo-spacy",
-                  enum_values=["combo", "combo-spacy", "lambo"],
-                  help="Use predictor with whitespace or spacy tokenizer.")
+                  enum_values=["combo", "combo-spacy", "combo-lambo"],
+                  help="Use predictor with whitespace, spacy or LAMBO tokenizer.")


 def run(_):
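For orientation, the new enum value is what a prediction run passes to select the LAMBO-based predictor. The invocation below is only a sketch: --predictor_name and its values come from this diff, while the entry point and the remaining flags are assumed to follow COMBO's usual prediction command line and may differ.

combo --mode predict \
      --model_path model.tar.gz \
      --input_file input.txt \
      --output_file output.conllu \
      --predictor_name combo-lambo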
@@ -10,15 +10,15 @@ from allennlp.data import tokenizers
 from allennlp.predictors import predictor
 from overrides import overrides

 from combo import data
 from combo.data import sentence2conllu, tokens2conllu, conllu2sentence
-from combo.utils import download, graph
+from combo.utils import download, graph, lambo

 logger = logging.getLogger(__name__)


 @predictor.Predictor.register("combo")
 @predictor.Predictor.register("combo-spacy", constructor="with_spacy_tokenizer")
+@predictor.Predictor.register("combo-lambo", constructor="with_lambo_tokenizer")
 class COMBO(predictor.Predictor):

     def __init__(self,
@@ -230,6 +230,11 @@ class COMBO(predictor.Predictor):
     def with_spacy_tokenizer(cls, model: models.Model,
                              dataset_reader: allen_data.DatasetReader):
         return cls(model, dataset_reader, tokenizers.SpacyTokenizer())

+    @classmethod
+    def with_lambo_tokenizer(cls, model: models.Model,
+                             dataset_reader: allen_data.DatasetReader):
+        return cls(model, dataset_reader, lambo.LamboTokenizer())
+
     @classmethod
     def from_pretrained(cls, path: str, tokenizer=tokenizers.SpacyTokenizer(),
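To make the wiring concrete, here is a minimal sketch of how the constructor added above would be reached in code. The names model and dataset_reader stand for objects the caller has already loaded (e.g. from an AllenNLP model archive), and the archive path is a placeholder, not something defined by this commit.

from combo import predict
from combo.utils import lambo

# Direct use of the constructor registered as "combo-lambo": built exactly like
# the spaCy variant, but with a LamboTokenizer doing the segmentation.
nlp = predict.COMBO.with_lambo_tokenizer(model, dataset_reader)

# Equivalent route through from_pretrained, whose tokenizer argument (see the
# signature above) accepts any AllenNLP Tokenizer, including the new one.
nlp = predict.COMBO.from_pretrained("model.tar.gz", tokenizer=lambo.LamboTokenizer())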
from typing import List

from allennlp.data.tokenizers.tokenizer import Tokenizer
from allennlp.data.tokenizers.token_class import Token


class LamboTokenizer(Tokenizer):

    def __init__(self, language: str = "??",) -> None:
        self.language = language

    def tokenize(self, text: str) -> List[Token]:
        # TODO
        return None
\ No newline at end of file
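Since tokenize is still a TODO (and returning None would break callers expecting List[Token]), the following is only an illustrative placeholder that satisfies the AllenNLP Tokenizer contract; the whitespace split stands in for the actual LAMBO segmentation, whose API is not part of this commit.

from typing import List

from allennlp.data.tokenizers.token_class import Token
from allennlp.data.tokenizers.tokenizer import Tokenizer


class LamboTokenizer(Tokenizer):
    # Placeholder sketch: a real implementation would run LAMBO here and map
    # its segments to AllenNLP Token objects.

    def __init__(self, language: str = "??") -> None:
        self.language = language

    def tokenize(self, text: str) -> List[Token]:
        # Stand-in segmentation so the method honours its List[Token] signature.
        return [Token(text=word) for word in text.split()]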