"""Implementation of nlp_worker.""" from hydra import initialize, compose from hydra.utils import instantiate class Worker: def __init__( self, configuration="ccl", default_language="pl", default_replacer="tag" ) -> None: self._last_config = None self._pipeline = None self._configuration = configuration self._default_language = default_language self._default_replacer = default_replacer super().__init__() def _prepare_pipeline(self, task_options): language = task_options.get("language", self._default_language) replace_method = task_options.get("method", self._default_replacer) overrides = [ "language=" + language, "replacers=" + replace_method, "configuration=" + self._configuration, ] assert language in ["pl"] assert replace_method in ["delete", "tag", "pseudo"] config_hash = hash(tuple(overrides)) if self._last_config != config_hash: with initialize(config_path="../config", version_base="1.1"): cfg = compose(config_name="config", overrides=overrides) self._pipeline = instantiate(cfg["pipeline"]) return self._pipeline def process(self, input_file, task_options, output_file): """Anonymizes input text. It is assumed input_file is encoded in UTF-8. Options: method - 'delete'/'tag'/'pseudo' - 'delete' deletes selected tokens, 'tag' replaces selected tokens with arbitrary tags, 'pseudo' replaces selected tokens with a random token that language - 'pl' - language of the input text. As of now only Polish is supported. """ pipeline = self._prepare_pipeline(task_options) with open(output_file, "w", encoding="utf-8") as f: result = pipeline.run(input_file) f.write(result)