From c7a903ad9dabd16a9cb6c5033139035c049e0d8c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Pogoda?= <mipo57@e-science.pl> Date: Mon, 12 Dec 2022 14:08:41 +0100 Subject: [PATCH] Added option to run from cli --- .gitignore | 2 + .../marek_kowalski_pojechal_do_wroclawia.ccl | 44 +++++++++++++++++++ requirements.txt | 1 + scripts/cli.py | 43 ++++++++++++++++++ src/anonymizers/polish_anonymizer.py | 2 +- 5 files changed, 91 insertions(+), 1 deletion(-) create mode 100644 example_inputs/marek_kowalski_pojechal_do_wroclawia.ccl create mode 100644 scripts/cli.py diff --git a/.gitignore b/.gitignore index f1d48cf..1a63a49 100644 --- a/.gitignore +++ b/.gitignore @@ -139,3 +139,5 @@ cython_debug/ .vscode *.ipynb + +/test.txt \ No newline at end of file diff --git a/example_inputs/marek_kowalski_pojechal_do_wroclawia.ccl b/example_inputs/marek_kowalski_pojechal_do_wroclawia.ccl new file mode 100644 index 0000000..d8db042 --- /dev/null +++ b/example_inputs/marek_kowalski_pojechal_do_wroclawia.ccl @@ -0,0 +1,44 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE chunkList SYSTEM "ccl.dtd"> +<chunkList> + <chunk type="p" id="ch1"> + <sentence id="s1"> + <tok> + <orth>Marek</orth> + <lex disamb="1"><base>Marek</base><ctag>subst:sg:nom:m1</ctag></lex> + <lex disamb="1"><base>marek</base><ctag>subst:sg:nom:m1</ctag></lex> + <ann chan="person_first_nam" head="1">1</ann> + <ann chan="person_last_nam">0</ann> + <ann chan="city_nam">0</ann> + </tok> + <tok> + <orth>Kowalski</orth> + <lex disamb="1"><base>Kowalski</base><ctag>subst:sg:nom:m1</ctag></lex> + <ann chan="person_first_nam">0</ann> + <ann chan="person_last_nam" head="1">1</ann> + <ann chan="city_nam">0</ann> + </tok> + <tok> + <orth>pojechał</orth> + <lex disamb="1"><base>pojechać</base><ctag>praet:sg:m1:perf</ctag></lex> + <ann chan="person_first_nam">0</ann> + <ann chan="person_last_nam">0</ann> + <ann chan="city_nam">0</ann> + </tok> + <tok> + <orth>do</orth> + <lex disamb="1"><base>do</base><ctag>prep:gen</ctag></lex> + <ann chan="person_first_nam">0</ann> + <ann chan="person_last_nam">0</ann> + <ann chan="city_nam">0</ann> + </tok> + <tok> + <orth>Wrocławia</orth> + <lex disamb="1"><base>Wrocław</base><ctag>subst:sg:gen:m3</ctag></lex> + <ann chan="person_first_nam">0</ann> + <ann chan="person_last_nam">0</ann> + <ann chan="city_nam" head="1">1</ann> + </tok> + </sentence> + </chunk> +</chunkList> \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 9022646..f7260eb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ +--index-url https://pypi.clarin-pl.eu/simple/ nlp-ws regex==2020.10.28 Babel==2.8.0 \ No newline at end of file diff --git a/scripts/cli.py b/scripts/cli.py new file mode 100644 index 0000000..9ee5bea --- /dev/null +++ b/scripts/cli.py @@ -0,0 +1,43 @@ +"""Implementation of anonymizer service.""" +import argparse +from src.worker import Worker + + +from src.anonymizers.polish_anonymizer import PolishAnonymizer +from src.anonymizers.english_anonymizer import EnglishAnonymizer +from src.anonymizers.russian_anonymizer import RussianAnonymizer + + +def get_args(): + """Gets command line arguments.""" + parser = argparse.ArgumentParser(description="anonymizer") + + parser.add_argument( + "-l", "--language", help="Language of the input text", default="pl" + ) + parser.add_argument("-m", "--method", help="Anonymization method", default="tag", choices=["delete", "tag", "pseudo"]) + parser.add_argument("input_file", help="Path to input file") + parser.add_argument("output_file", help="Path to output file") + + return parser.parse_args() + + +def main(): + """Runs the program.""" + args = get_args() + + task_options = { + "method": args.method, + } + + anonymizers = { + "pl": PolishAnonymizer, + "en": EnglishAnonymizer, + "ru": RussianAnonymizer, + } + anon = anonymizers.get(args.language, PolishAnonymizer)(task_options) + anon.process(args.input_file, args.output_file) + + +if __name__ == "__main__": + main() diff --git a/src/anonymizers/polish_anonymizer.py b/src/anonymizers/polish_anonymizer.py index a9d0b25..60f9c50 100644 --- a/src/anonymizers/polish_anonymizer.py +++ b/src/anonymizers/polish_anonymizer.py @@ -79,7 +79,7 @@ class PolishAnonymizer(BaseAnonymizer): self._pseudo_ann_list = list() self._load_file() - def _load_file(self, filename='pl_dict.txt'): + def _load_file(self, filename='dictionaries/pl_dict.txt'): with open(filename, 'r', encoding='utf-8') as f: for line in f.readlines(): l_list = line.split() -- GitLab