Skip to content
Snippets Groups Projects
Commit ebc411b0 authored by Piotr's avatar Piotr Committed by Lukasz Pszenny
Browse files

LAMBO Tokenization skeleton.

parent 21b65381
No related branches found
No related tags found
No related merge requests found
...@@ -86,8 +86,8 @@ flags.DEFINE_integer(name="batch_size", default=1, ...@@ -86,8 +86,8 @@ flags.DEFINE_integer(name="batch_size", default=1,
flags.DEFINE_boolean(name="silent", default=True, flags.DEFINE_boolean(name="silent", default=True,
help="Silent prediction to file (without printing to console).") help="Silent prediction to file (without printing to console).")
flags.DEFINE_enum(name="predictor_name", default="combo-spacy", flags.DEFINE_enum(name="predictor_name", default="combo-spacy",
enum_values=["combo", "combo-spacy", "lambo"], enum_values=["combo", "combo-spacy", "combo-lambo"],
help="Use predictor with whitespace or spacy tokenizer.") help="Use predictor with whitespace, spacy or LAMBO tokenizer.")
def run(_): def run(_):
......
...@@ -10,15 +10,15 @@ from allennlp.data import tokenizers ...@@ -10,15 +10,15 @@ from allennlp.data import tokenizers
from allennlp.predictors import predictor from allennlp.predictors import predictor
from overrides import overrides from overrides import overrides
from combo import data
from combo.data import sentence2conllu, tokens2conllu, conllu2sentence from combo.data import sentence2conllu, tokens2conllu, conllu2sentence
from combo.utils import download, graph from combo.utils import download, graph, lambo
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@predictor.Predictor.register("combo") @predictor.Predictor.register("combo")
@predictor.Predictor.register("combo-spacy", constructor="with_spacy_tokenizer") @predictor.Predictor.register("combo-spacy", constructor="with_spacy_tokenizer")
@predictor.Predictor.register("combo-lambo", constructor="with_lambo_tokenizer")
class COMBO(predictor.Predictor): class COMBO(predictor.Predictor):
def __init__(self, def __init__(self,
...@@ -231,6 +231,11 @@ class COMBO(predictor.Predictor): ...@@ -231,6 +231,11 @@ class COMBO(predictor.Predictor):
dataset_reader: allen_data.DatasetReader): dataset_reader: allen_data.DatasetReader):
return cls(model, dataset_reader, tokenizers.SpacyTokenizer()) return cls(model, dataset_reader, tokenizers.SpacyTokenizer())
@classmethod
def with_lambo_tokenizer(cls, model: models.Model,
                         dataset_reader: allen_data.DatasetReader):
    """Alternate constructor that equips the predictor with a LAMBO tokenizer.

    Creates a ``lambo.LamboTokenizer`` using its default arguments and
    passes it, together with ``model`` and ``dataset_reader``, to ``cls``.

    :param model: model forwarded unchanged to the class constructor.
    :param dataset_reader: dataset reader forwarded unchanged to the class
        constructor.
    :return: the instance produced by ``cls(model, dataset_reader, tokenizer)``.
    """
    lambo_tokenizer = lambo.LamboTokenizer()
    return cls(model, dataset_reader, lambo_tokenizer)
@classmethod @classmethod
def from_pretrained(cls, path: str, tokenizer=tokenizers.SpacyTokenizer(), def from_pretrained(cls, path: str, tokenizer=tokenizers.SpacyTokenizer(),
batch_size: int = 1024, batch_size: int = 1024,
......
from typing import List
from allennlp.data.tokenizers.tokenizer import Tokenizer
from allennlp.data.tokenizers.token_class import Token
class LamboTokenizer(Tokenizer):
    """Skeleton tokenizer intended to be backed by LAMBO.

    Only the constructor is functional; ``tokenize`` is a placeholder that
    has not been implemented yet.
    """

    def __init__(self, language: str = "??") -> None:
        """Store the language setting for later use.

        :param language: language identifier kept on ``self.language``.
            NOTE(review): the default ``"??"`` looks like a placeholder for
            "unknown language" -- confirm once the LAMBO backend is wired in.
        """
        self.language = language

    def tokenize(self, text: str) -> List[Token]:
        """Split ``text`` into tokens (not implemented yet).

        :param text: raw text to tokenize.
        :return: list of ``Token`` objects once implemented.
        :raises NotImplementedError: always, until the LAMBO-based
            tokenization is implemented.
        """
        # TODO: implement LAMBO-based tokenization.
        # Fail loudly instead of returning None: the declared return type is
        # List[Token], and a None result would only surface later as an
        # unrelated TypeError in whatever code iterates over the tokens.
        raise NotImplementedError(
            "LamboTokenizer.tokenize is not implemented yet."
        )
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment