diff --git a/combo/main.py b/combo/main.py
index 5a2066c20fd1db98d2d88a3581200bb968356632..0bc6126b0787ba9cf362c5022b41100c18159576 100644
--- a/combo/main.py
+++ b/combo/main.py
@@ -88,7 +88,8 @@ flags.DEFINE_boolean(name="silent", default=True,
 flags.DEFINE_enum(name="predictor_name", default="combo-spacy",
                   enum_values=["combo", "combo-spacy", "combo-lambo"],
                   help="Use predictor with whitespace, spacy or LAMBO tokenizer.")
-
+flags.DEFINE_string(name="lambo_model_name", default="en",
+                    help="LAMBO model name (if LAMBO used for segmentation).")
 
 def run(_):
     """Run model."""
@@ -175,7 +176,7 @@ def _get_predictor() -> predictors.Predictor:
     )
 
     return predictors.Predictor.from_archive(
-        archive, FLAGS.predictor_name
+        archive, FLAGS.predictor_name, extra_args={"lambo_model_name": FLAGS.lambo_model_name}
     )
 
 
diff --git a/combo/predict.py b/combo/predict.py
index 1481bf54466153290d01db4a2c83848f9a309d44..68bdef3dae6ea701cfd0bb6b88620166b4877bae 100644
--- a/combo/predict.py
+++ b/combo/predict.py
@@ -238,8 +238,8 @@ class COMBO(predictor.Predictor):
 
     @classmethod
     def with_lambo_tokenizer(cls, model: models.Model,
-                             dataset_reader: allen_data.DatasetReader):
-        return cls(model, dataset_reader, lambo.LamboTokenizer())
+                             dataset_reader: allen_data.DatasetReader, lambo_model_name: str):
+        return cls(model, dataset_reader, lambo.LamboTokenizer(lambo_model_name))
 
     @classmethod
     def from_pretrained(cls, path: str, tokenizer=tokenizers.SpacyTokenizer(),
diff --git a/combo/utils/lambo.py b/combo/utils/lambo.py
index 75284ef9a85272b0856c39ef4c71eccf479f16dc..92ef8e1399f95ea8caf0bbf8f0763325c6c7a355 100644
--- a/combo/utils/lambo.py
+++ b/combo/utils/lambo.py
@@ -6,7 +6,7 @@ from lambo.segmenter.lambo import Lambo
 
 class LamboTokenizer(Tokenizer):
-    def __init__(self, model: str = "LAMBO_no_pretraining-UD_Polish-PDB",) -> None:
+    def __init__(self, model: str) -> None:
         self.lambo=Lambo.get(model)
 
     # Simple tokenisation: ignoring sentence split
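
A minimal usage sketch of the new argument path, not part of the diff: it only uses APIs visible above (LamboTokenizer, COMBO.from_pretrained), assumes from_pretrained forwards the tokenizer to the predictor as its signature suggests, and the archive path "model.tar.gz" is a hypothetical placeholder.

    from combo import predict
    from combo.utils import lambo

    # LamboTokenizer now requires an explicit LAMBO model name;
    # "en" mirrors the default of the new --lambo_model_name flag.
    tokenizer = lambo.LamboTokenizer("en")

    # Roughly what selecting --predictor_name combo-lambo --lambo_model_name en
    # does on the CLI: the predictor segments its input with LAMBO.
    nlp = predict.COMBO.from_pretrained("model.tar.gz", tokenizer=tokenizer)  # hypothetical archive path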