diff --git a/README.md b/README.md index 8c04942f6368752c73198db230cbfe78eccd66e9..410ddbd1631c134c7aa643d1b90f69e2e552c4ec 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ # Punctuator -A service that automatically adds punctuation to raw word-stream (eg. from speech2text) for polish language. +A service that automatically adds punctuation to raw word-stream (eg. from speech2text) for Polish, Russian and English languages. **Example input**: > według webometrycznego rankingu uniwersytetów świata ze stycznia 2019 pokazującego zaangażowanie instytucji akademickich w internecie uczelnia zajmuje 5 miejsce w polsce wśród uczelni technicznych a na świecie 964 wśród wszystkich typów uczelni w rankingu szkół wyższych perspektyw politechnika wrocławska zajęła w 2019 roku 3 miejsce wśród uczelni technicznych oraz 6 miejsce spośród wszystkich uczelni akademickich w polsce @@ -18,8 +18,9 @@ overlap = 20 ; The number of tokens from the environment that will be taken at i ``` ## LPMN +Punctuator has one argument `language` with options: `pl`, `ru`, `en`: ``` -filedir(/users/michal.pogoda)|any2txt|punctuator +filedir(/users/michal.pogoda)|any2txt|punctuator({"language":"en"}) ``` ## Mountpoints diff --git a/worker.py b/worker.py index 4275942fa790eab700069d14453fd943d9fc1904..d51300c3368ceaaf2b38ec1bd3cfbbd27001a29c 100644 --- a/worker.py +++ b/worker.py @@ -53,22 +53,22 @@ class Worker(nlp_ws.NLPWorker): self.model_path_ru = self.config["model_path_ru"] self.model_path_en = self.config["model_path_en"] self.tool_pl, self.model_pl, self.tokenizer_pl, self.mapping_pl \ - = self.initialize_model(self.model_path_pl, self.device) + = self.initialize_model('pl-PL', self.model_path_pl, self.device) self.tool_en, self.model_en, self.tokenizer_en, self.mapping_en \ - = self.initialize_model(self.model_path_en, 'cpu') + = self.initialize_model('en-US', self.model_path_en, 'cpu') self.tool_ru, self.model_ru, self.tokenizer_ru, self.mapping_ru \ - = 
self.initialize_model(self.model_path_ru, 'cpu') + = self.initialize_model('ru', self.model_path_ru, 'cpu') self.current_model = self.model_path_pl def process( self, input_path: str, task_options: dict, output_path: str ) -> None: - - if task_options['language'] != self.current_model: - self.pass_device(task_options['language']) - self.current_model = task_options['language'] + if task_options['language'] == 'en': + bpe = True + else: + bpe = False tool, model, tokenizer, mapping = self.get_setup_for_language( - self.current_model) + task_options['language']) with open(input_path, "r") as f: text = f.read() @@ -109,29 +109,27 @@ class Worker(nlp_ws.NLPWorker): ): tokens += tokenized["input_ids"][0, combine_mask].numpy().tolist() - text_out = decode(tokens, labels, tokenizer, - self.current_model != self.model_path_pl) + text_out = decode(tokens, labels, tokenizer, bpe) text_out = _post_process(text_out, tool) with open(output_path, "w") as f: f.write(text_out) - def initialize_model(self, model_path: str, device: str): - tool = language_tool_python.LanguageTool(self.languagetool_map - [model_path]) + def initialize_model(self, lang, model_path: str, device: str): + tool = language_tool_python.LanguageTool(lang) model = AutoModelForTokenClassification.from_pretrained( model_path ).to(device) tokenizer = AutoTokenizer.from_pretrained(model_path) mapping = {} - with open(f"{model_path}/classes.json", "r") as f: + with open(f"{model_path}/classes.json", "r") as f: mapping = json.load(f) mapping = list(mapping.keys()) return tool, model, tokenizer, mapping def get_setup_for_language(self, language): - if language == 'model_path_ru': + if language == 'ru': return self.tool_ru, self.model_ru, self.tokenizer_ru, self.mapping_ru - elif language == 'model_path_en': + elif language == 'en': return self.tool_en, self.model_en, self.tokenizer_en, self.mapping_en else: return self.tool_pl, self.model_pl, self.tokenizer_pl, self.mapping_pl