Skip to content
Snippets Groups Projects
Select Git revision
  • master
  • deanonimzer
  • v2 protected
  • v1 protected
  • develop protected
5 results

cli.py

Blame
  • utils.py 1.77 KiB
    """Utils for scripts."""
    import pathlib
    import subprocess
    
    # Maps a two-letter language code to the name of the transformer model used
    # for that language. Several languages share "xlm-roberta-large".
    # NOTE(review): the values look like Hugging Face Hub model identifiers --
    # confirm against the code that consumes this mapping.
    LANG2TRANSFORMER = {
        "en": "bert-base-cased",
        "pl": "allegro/herbert-large-cased",
        "zh": "bert-base-chinese",
        "fi": "TurkuNLP/bert-base-finnish-cased-v1",
        "ko": "kykim/bert-kor-base",
        "de": "dbmdz/bert-base-german-cased",
        "ar": "aubmindlab/bert-base-arabertv2",
        "eu": "ixa-ehu/berteus-base-cased",
        "tr": "dbmdz/bert-base-turkish-cased",
        "bg": "xlm-roberta-large",
        "nl": "xlm-roberta-large",
        "fr": "camembert-base",
        "it": "xlm-roberta-large",
        "ru": "xlm-roberta-large",
        "sv": "xlm-roberta-large",
        "uk": "xlm-roberta-large",
        "ta": "xlm-roberta-large",
        "sk": "xlm-roberta-large",
        "lt": "xlm-roberta-large",
        "lv": "xlm-roberta-large",
        "cs": "xlm-roberta-large",
        "et": "xlm-roberta-large",
    }
    
    
    def execute_command(command, output_file=None):
        """Run an external command, optionally redirecting its stdout to a file.

        Args:
            command: Either a single string, which is split on whitespace into
                the argument list, or an already-split sequence of arguments
                (use this form when an argument contains spaces). Sequence
                items are converted with ``str``.
            output_file: Path of the file that receives the command's stdout.
                When falsy, stdout is inherited from the current process.

        Raises:
            subprocess.CalledProcessError: If the command exits with a
                non-zero status.
        """
        if isinstance(command, str):
            # str.split() without arguments never yields empty tokens.
            args = command.split()
        else:
            args = [str(part) for part in command]

        if not output_file:
            subprocess.run(args, check=True)
            return

        with open(output_file, "w") as out:
            try:
                subprocess.run(args, check=True, stdout=out)
            except BaseException:
                # A failed or interrupted run leaves an empty/partial file
                # behind; callers that skip work when the output already exists
                # would then mistake it for a finished result. Remove it.
                out.close()  # some platforms cannot delete an open file
                try:
                    pathlib.Path(output_file).unlink()
                except OSError:
                    pass
                raise
    
    
    def path_to_str(path: pathlib.Path) -> str:
        """Resolve *path* and return the result as a plain string."""
        resolved = path.resolve()
        return str(resolved)
    
    
    def collapse_nodes(data_dir: pathlib.Path, treebank_file: pathlib.Path, output: str):
        """Run ``enhanced_collapse_empty_nodes.pl`` on a treebank file.

        The Perl script is looked up inside *data_dir*, is given the resolved
        *treebank_file* as its only argument, and has its stdout written to
        *output*. Nothing is executed when *output* already exists.
        """
        if pathlib.Path(output).exists():
            # An earlier run already produced this file; keep it as is.
            return

        script = path_to_str(data_dir / 'enhanced_collapse_empty_nodes.pl')
        treebank = path_to_str(treebank_file)
        execute_command(f"perl {script} {treebank}", output)
    
    
    def quick_fix(data_dir: pathlib.Path, treebank_file: pathlib.Path, output: str):
        """Run ``conllu-quick-fix.pl`` on a treebank file.

        The Perl script is looked up inside *data_dir*, is given the resolved
        *treebank_file* as its only argument, and has its stdout written to
        *output*. Nothing is executed when *output* already exists.
        """
        if pathlib.Path(output).exists():
            # An earlier run already produced this file; keep it as is.
            return

        script = path_to_str(data_dir / 'conllu-quick-fix.pl')
        treebank = path_to_str(treebank_file)
        execute_command(f"perl {script} {treebank}", output)