From 799257d7d4a1099341dfd3294f73648a55e31939 Mon Sep 17 00:00:00 2001
From: piotrmp <piotr.m.przybyla@gmail.com>
Date: Fri, 4 Nov 2022 10:32:00 +0100
Subject: [PATCH] Command-line options for LAMBO segmentation.

---
 combo/main.py        | 5 +++--
 combo/predict.py     | 4 ++--
 combo/utils/lambo.py | 2 +-
 3 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/combo/main.py b/combo/main.py
index 5a2066c..0bc6126 100644
--- a/combo/main.py
+++ b/combo/main.py
@@ -88,7 +88,8 @@ flags.DEFINE_boolean(name="silent", default=True,
 flags.DEFINE_enum(name="predictor_name", default="combo-spacy",
                   enum_values=["combo", "combo-spacy", "combo-lambo"],
                   help="Use predictor with whitespace, spacy or LAMBO tokenizer.")
-
+flags.DEFINE_string(name="lambo_model_name", default="en",
+                    help="LAMBO model name (if LAMBO used for segmentation).")
 
 def run(_):
     """Run model."""
@@ -175,7 +176,7 @@ def _get_predictor() -> predictors.Predictor:
         )
 
     return predictors.Predictor.from_archive(
-        archive, FLAGS.predictor_name
+        archive, FLAGS.predictor_name, extra_args= {"lambo_model_name" : FLAGS.lambo_model_name}
     )
 
 
diff --git a/combo/predict.py b/combo/predict.py
index 1481bf5..68bdef3 100644
--- a/combo/predict.py
+++ b/combo/predict.py
@@ -238,8 +238,8 @@ class COMBO(predictor.Predictor):
 
     @classmethod
     def with_lambo_tokenizer(cls, model: models.Model,
-                             dataset_reader: allen_data.DatasetReader):
-        return cls(model, dataset_reader, lambo.LamboTokenizer())
+                             dataset_reader: allen_data.DatasetReader, lambo_model_name : str):
+        return cls(model, dataset_reader, lambo.LamboTokenizer(lambo_model_name))
 
     @classmethod
     def from_pretrained(cls, path: str, tokenizer=tokenizers.SpacyTokenizer(),
diff --git a/combo/utils/lambo.py b/combo/utils/lambo.py
index 75284ef..92ef8e1 100644
--- a/combo/utils/lambo.py
+++ b/combo/utils/lambo.py
@@ -6,7 +6,7 @@ from lambo.segmenter.lambo import Lambo
 
 class LamboTokenizer(Tokenizer):
 
-    def __init__(self, model: str = "LAMBO_no_pretraining-UD_Polish-PDB",) -> None:
+    def __init__(self, model: str) -> None:
         self.lambo=Lambo.get(model)
 
     # Simple tokenisation: ignoring sentence split
-- 
GitLab