From c7a903ad9dabd16a9cb6c5033139035c049e0d8c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20Pogoda?= <mipo57@e-science.pl>
Date: Mon, 12 Dec 2022 14:08:41 +0100
Subject: [PATCH] Added option to run from cli

---
 .gitignore                                    |  2 +
 .../marek_kowalski_pojechal_do_wroclawia.ccl  | 44 +++++++++++++++++++
 requirements.txt                              |  1 +
 scripts/cli.py                                | 43 ++++++++++++++++++
 src/anonymizers/polish_anonymizer.py          |  2 +-
 5 files changed, 91 insertions(+), 1 deletion(-)
 create mode 100644 example_inputs/marek_kowalski_pojechal_do_wroclawia.ccl
 create mode 100644 scripts/cli.py

diff --git a/.gitignore b/.gitignore
index f1d48cf..1a63a49 100644
--- a/.gitignore
+++ b/.gitignore
@@ -139,3 +139,5 @@ cython_debug/
 
 .vscode
 *.ipynb
+
+/test.txt
\ No newline at end of file
diff --git a/example_inputs/marek_kowalski_pojechal_do_wroclawia.ccl b/example_inputs/marek_kowalski_pojechal_do_wroclawia.ccl
new file mode 100644
index 0000000..d8db042
--- /dev/null
+++ b/example_inputs/marek_kowalski_pojechal_do_wroclawia.ccl
@@ -0,0 +1,44 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE chunkList SYSTEM "ccl.dtd">
+<chunkList>
+ <chunk type="p" id="ch1">
+  <sentence id="s1">
+   <tok>
+    <orth>Marek</orth>
+    <lex disamb="1"><base>Marek</base><ctag>subst:sg:nom:m1</ctag></lex>
+    <lex disamb="1"><base>marek</base><ctag>subst:sg:nom:m1</ctag></lex>
+    <ann chan="person_first_nam" head="1">1</ann>
+    <ann chan="person_last_nam">0</ann>
+    <ann chan="city_nam">0</ann>
+   </tok>
+   <tok>
+    <orth>Kowalski</orth>
+    <lex disamb="1"><base>Kowalski</base><ctag>subst:sg:nom:m1</ctag></lex>
+    <ann chan="person_first_nam">0</ann>
+    <ann chan="person_last_nam" head="1">1</ann>
+    <ann chan="city_nam">0</ann>
+   </tok>
+   <tok>
+    <orth>pojechał</orth>
+    <lex disamb="1"><base>pojechać</base><ctag>praet:sg:m1:perf</ctag></lex>
+    <ann chan="person_first_nam">0</ann>
+    <ann chan="person_last_nam">0</ann>
+    <ann chan="city_nam">0</ann>
+   </tok>
+   <tok>
+    <orth>do</orth>
+    <lex disamb="1"><base>do</base><ctag>prep:gen</ctag></lex>
+    <ann chan="person_first_nam">0</ann>
+    <ann chan="person_last_nam">0</ann>
+    <ann chan="city_nam">0</ann>
+   </tok>
+   <tok>
+    <orth>Wrocławia</orth>
+    <lex disamb="1"><base>Wrocław</base><ctag>subst:sg:gen:m3</ctag></lex>
+    <ann chan="person_first_nam">0</ann>
+    <ann chan="person_last_nam">0</ann>
+    <ann chan="city_nam" head="1">1</ann>
+   </tok>
+  </sentence>
+ </chunk>
+</chunkList>
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 9022646..f7260eb 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
+--index-url https://pypi.clarin-pl.eu/simple/
 nlp-ws
 regex==2020.10.28
 Babel==2.8.0
\ No newline at end of file
diff --git a/scripts/cli.py b/scripts/cli.py
new file mode 100644
index 0000000..9ee5bea
--- /dev/null
+++ b/scripts/cli.py
@@ -0,0 +1,43 @@
+"""Command-line entry point for running the anonymizer on a file."""
+import argparse
+from src.worker import Worker
+
+
+from src.anonymizers.polish_anonymizer import PolishAnonymizer
+from src.anonymizers.english_anonymizer import EnglishAnonymizer
+from src.anonymizers.russian_anonymizer import RussianAnonymizer
+
+
+def get_args():
+    """Parse CLI arguments into a Namespace; invalid choices exit with a usage error."""
+    parser = argparse.ArgumentParser(description="anonymizer")
+    # Restrict --language to supported codes so a typo cannot silently pick the wrong anonymizer.
+    parser.add_argument(
+        "-l", "--language", help="Language of the input text", default="pl",
+        choices=["pl", "en", "ru"],
+    )
+    parser.add_argument("-m", "--method", help="Anonymization method", default="tag", choices=["delete", "tag", "pseudo"])
+    parser.add_argument("input_file", help="Path to input file")
+    parser.add_argument("output_file", help="Path to output file")
+    return parser.parse_args()
+
+
+def main():
+    """Entry point: build the anonymizer chosen on the CLI and run it on the files."""
+    cli_args = get_args()
+
+    options = {"method": cli_args.method}
+
+    # Map each language code to its anonymizer class.
+    by_language = {
+        "pl": PolishAnonymizer,
+        "en": EnglishAnonymizer,
+        "ru": RussianAnonymizer,
+    }
+    # Codes missing from the map fall back to the Polish anonymizer.
+    anonymizer_cls = by_language.get(cli_args.language, PolishAnonymizer)
+    anonymizer_cls(options).process(cli_args.input_file, cli_args.output_file)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/anonymizers/polish_anonymizer.py b/src/anonymizers/polish_anonymizer.py
index a9d0b25..60f9c50 100644
--- a/src/anonymizers/polish_anonymizer.py
+++ b/src/anonymizers/polish_anonymizer.py
@@ -79,7 +79,7 @@ class PolishAnonymizer(BaseAnonymizer):
         self._pseudo_ann_list = list()
         self._load_file()
 
-    def _load_file(self, filename='pl_dict.txt'):
+    def _load_file(self, filename='dictionaries/pl_dict.txt'):
         with open(filename, 'r', encoding='utf-8') as f:
             for line in f.readlines():
                 l_list = line.split()
-- 
GitLab