"""Utils for scripts."""
import pathlib
import subprocess
from typing import Optional, Sequence, Union

# Maps a language code to the transformer used for that language. Values are
# either a model identifier or a local directory path.
LANG2TRANSFORMER = {
    "en": "bert-base-cased",
    "pl": "allegro/herbert-base-cased",
    "zh": "bert-base-chinese",
    "fi": "TurkuNLP/bert-base-finnish-cased-v1",
    "ko": "kykim/bert-kor-base",
    "de": "dbmdz/bert-base-german-cased",
    "ar": "aubmindlab/bert-base-arabertv2",
    "eu": "ixa-ehu/berteus-base-cased",
    "tr": "dbmdz/bert-base-turkish-cased",
    "bg": "iarfmoose/roberta-base-bulgarian",
    "nl": "GroNLP/bert-base-dutch-cased",
    "fr": "camembert-base",
    "it": "dbmdz/bert-base-italian-cased",
    "ru": "blinoff/roberta-base-russian-v0",
    "sv": "KB/bert-base-swedish-cased",
    "uk": "/tmp/lustre_shared/mklimasz/transformers/wikibert-base-uk-cased/",
    "ta": "/tmp/lustre_shared/mklimasz/transformers/wikibert-base-ta-cased/",
    "sk": "/tmp/lustre_shared/mklimasz/transformers/wikibert-base-sk-cased/",
    "lt": "/tmp/lustre_shared/mklimasz/transformers/wikibert-base-lt-cased/",
    "lv": "/tmp/lustre_shared/mklimasz/transformers/wikibert-base-lv-cased/",
    "cs": "/tmp/lustre_shared/mklimasz/transformers/wikibert-base-cs-cased/",
    "et": "/tmp/lustre_shared/mklimasz/transformers/etwiki-bert/",
    # Reference links for the entries stored as local paths above:
    # "uk": http://dl.turkunlp.org/wikibert/wikibert-base-uk-cased/
    # "ta": http://dl.turkunlp.org/wikibert/wikibert-base-ta-cased/
    # "sk": http://dl.turkunlp.org/wikibert/wikibert-base-sk-cased/
    # "lt": http://dl.turkunlp.org/wikibert/wikibert-base-lt-cased/
    # "lv": http://dl.turkunlp.org/wikibert/wikibert-base-lv-cased/
    # "et": http://dl.turkunlp.org/estonian-bert/etwiki-bert/pytorch/
    # "cs": https://github.com/kiv-air/Czert https://arxiv.org/pdf/2103.13031.pdf
}


def execute_command(
    command: Union[str, Sequence[str]],
    output_file: Optional[Union[str, pathlib.Path]] = None,
) -> None:
    """Run an external command, optionally redirecting its stdout to a file.

    Args:
        command: Either a single string, which is split on whitespace into
            arguments (no shell quoting is interpreted), or an already split
            sequence of arguments. Use the sequence form when an argument may
            contain whitespace, e.g. a path with spaces.
        output_file: If truthy, the file is opened for writing (truncating
            any existing content) and receives the command's stdout.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status.
    """
    if isinstance(command, str):
        # str.split() without arguments never yields empty tokens, so no
        # additional filtering is needed.
        args = command.split()
    else:
        args = [str(part) for part in command]

    if output_file:
        with open(output_file, "w") as stream:
            subprocess.run(args, check=True, stdout=stream)
    else:
        subprocess.run(args, check=True)


def path_to_str(path: pathlib.Path) -> str:
    """Return the resolved (absolute) form of *path* as a string."""
    return str(path.resolve())


def collapse_nodes(data_dir: pathlib.Path, treebank_file: pathlib.Path, output: str) -> None:
    """Run ``enhanced_collapse_empty_nodes.pl`` on a treebank file.

    The perl script is looked up at ``data_dir / "tools"`` and its stdout is
    written to *output*. Nothing is done when *output* already exists.

    Args:
        data_dir: Directory containing the ``tools`` sub-directory with the
            perl script.
        treebank_file: Treebank file passed to the script as its argument.
        output: Path of the file that receives the script's stdout.

    Raises:
        subprocess.CalledProcessError: If the perl script exits with a
            non-zero status.
    """
    output_path = pathlib.Path(output)
    if output_path.exists():
        return

    script = data_dir / "tools" / "enhanced_collapse_empty_nodes.pl"
    try:
        # Pass the arguments as a list so paths containing whitespace are not
        # broken apart by whitespace splitting.
        execute_command(
            ["perl", path_to_str(script), path_to_str(treebank_file)],
            output,
        )
    except BaseException:
        # The output file is created before the command starts. If the run
        # fails, remove the empty/partial file so that the existence check
        # above does not mistake it for a finished result on the next call.
        try:
            output_path.unlink()
        except FileNotFoundError:
            pass
        raise