# Skip to content
# Snippets Groups Projects
# utils.py 1.77 KiB
"""Utils for scripts."""
import pathlib
import subprocess

# Lookup table keyed by two-letter language code; each value is the name of a
# transformer model (presumably the one used for that language — confirm with
# the scripts that read this table). Several languages share the multilingual
# "xlm-roberta-large" entry, the rest name a language-specific model.
# NOTE(review): the values look like Hugging Face Hub model identifiers —
# verify against the code that loads them.
LANG2TRANSFORMER = {
    "en": "bert-base-cased",
    "pl": "allegro/herbert-large-cased",
    "zh": "bert-base-chinese",
    "fi": "TurkuNLP/bert-base-finnish-cased-v1",
    "ko": "kykim/bert-kor-base",
    "de": "dbmdz/bert-base-german-cased",
    "ar": "aubmindlab/bert-base-arabertv2",
    "eu": "ixa-ehu/berteus-base-cased",
    "tr": "dbmdz/bert-base-turkish-cased",
    "bg": "xlm-roberta-large",
    "nl": "xlm-roberta-large",
    "fr": "camembert-base",
    "it": "xlm-roberta-large",
    "ru": "xlm-roberta-large",
    "sv": "xlm-roberta-large",
    "uk": "xlm-roberta-large",
    "ta": "xlm-roberta-large",
    "sk": "xlm-roberta-large",
    "lt": "xlm-roberta-large",
    "lv": "xlm-roberta-large",
    "cs": "xlm-roberta-large",
    "et": "xlm-roberta-large",
}


def execute_command(command, output_file=None):
    """Run an external command, optionally capturing its stdout in a file.

    Args:
        command: Either a single string, which is split on whitespace into
            the argument list (no shell is involved and no quoting is
            interpreted), or an already-split sequence of arguments. Use the
            sequence form for arguments that contain whitespace, e.g. paths
            with spaces.
        output_file: Path of a file that receives the command's standard
            output. When falsy, the child inherits this process's stdout.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status. In that case ``output_file`` is removed so that a partial
            result is never mistaken for a finished one.
    """
    if isinstance(command, (str, bytes)):
        # str.split() with no separator never yields empty items, so no
        # further filtering is required.
        argv = command.split()
    else:
        argv = list(command)

    if not output_file:
        subprocess.run(argv, check=True)
        return

    # Open outside the try block: if opening itself fails, nothing was written
    # and there is nothing of ours to clean up.
    stream = open(output_file, "w")
    try:
        with stream:
            subprocess.run(argv, check=True, stdout=stream)
    except BaseException:
        # The file is closed by now; drop the incomplete output and re-raise.
        leftover = pathlib.Path(output_file)
        if leftover.exists():
            leftover.unlink()
        raise


def path_to_str(path: pathlib.Path) -> str:
    """Return ``path`` as a string after passing it through ``Path.resolve``."""
    resolved = path.resolve()
    return str(resolved)


def _run_perl_script(script: pathlib.Path, treebank_file: pathlib.Path, output: str) -> None:
    """Run ``perl <script> <treebank_file>`` and store its stdout in ``output``.

    Nothing is executed when ``output`` already exists. Arguments are handed
    to ``subprocess.run`` as a list, so script and treebank paths may contain
    whitespace. If the run fails, the incomplete ``output`` file is removed;
    otherwise the existence check above would treat it as a finished result
    on the next call.

    Raises:
        subprocess.CalledProcessError: If perl exits with a non-zero status.
    """
    output_path = pathlib.Path(output)
    if output_path.exists():
        return

    argv = ["perl", str(script.resolve()), str(treebank_file.resolve())]
    # Open outside the try block: if opening fails, no file of ours exists.
    sink = output_path.open("w")
    try:
        with sink:
            subprocess.run(argv, check=True, stdout=sink)
    except BaseException:
        # The file is closed by now; drop the partial output and re-raise.
        if output_path.exists():
            output_path.unlink()
        raise


def collapse_nodes(data_dir: pathlib.Path, treebank_file: pathlib.Path, output: str):
    """Run ``enhanced_collapse_empty_nodes.pl`` from ``data_dir`` on a treebank.

    Args:
        data_dir: Directory that contains the Perl script.
        treebank_file: File passed to the script as its only argument.
        output: Path that receives the script's stdout. If it already exists
            the script is not run.

    Raises:
        subprocess.CalledProcessError: If the script exits with a non-zero
            status (no output file is left behind in that case).
    """
    _run_perl_script(data_dir / "enhanced_collapse_empty_nodes.pl", treebank_file, output)


def quick_fix(data_dir: pathlib.Path, treebank_file: pathlib.Path, output: str):
    """Run ``conllu-quick-fix.pl`` from ``data_dir`` on a treebank.

    Args:
        data_dir: Directory that contains the Perl script.
        treebank_file: File passed to the script as its only argument.
        output: Path that receives the script's stdout. If it already exists
            the script is not run.

    Raises:
        subprocess.CalledProcessError: If the script exits with a non-zero
            status (no output file is left behind in that case).
    """
    _run_perl_script(data_dir / "conllu-quick-fix.pl", treebank_file, output)