From 0456a3f2defd5217f20f5c93e76c1bebc1bad167 Mon Sep 17 00:00:00 2001
From: Michał Pogoda <mipo57@e-science.pl>
Date: Fri, 18 Aug 2023 14:12:08 +0200
Subject: [PATCH] Remove legacy script

---
 utility/NELex2_to_wiki.py | 113 --------------------------------------
 1 file changed, 113 deletions(-)
 delete mode 100644 utility/NELex2_to_wiki.py

diff --git a/utility/NELex2_to_wiki.py b/utility/NELex2_to_wiki.py
deleted file mode 100644
index 704e900..0000000
--- a/utility/NELex2_to_wiki.py
+++ /dev/null
@@ -1,113 +0,0 @@
-"""Convert NELexicon into wiki used by anonymizer.
-
-Requires morfeusz2 to be installed.
-"""
-
-import morfeusz2
-
-morf = morfeusz2.Morfeusz(expand_tags=True)
-
-_file_to_liner_dispatch = {
-    "nam_liv_person": "person_first_nam",
-    "nam_liv_person_last": "person_last_nam",
-    "nam_fac_road": "road_nam",
-    "nam_loc_gpe_city": "city_nam",
-    "nam_org_group_team": "country_nam",
-}
-
-_allowed_genders = ["f", "m1", "m2", "m3", "n"]
-
-
-def _create_wiki():
-    with open("wiki.txt", "wt+", encoding="utf-8") as f:
-        _add_gender(f)
-        _last_names(f)
-
-
-def _add_gender(
-    output, file_name="nelexicon2/extra/wiktionary-forms-with-bases-and-tags.txt"
-):
-    with open(file_name, "r", encoding="utf-8") as f:
-        _form_dict = dict()
-        for line in f:
-            l_list = line.split()
-            cat = l_list[0]
-            if cat in _file_to_liner_dispatch:
-                cat_name = cat
-                length = int((len(l_list) - 2) / 2)
-                gen_name = " ".join(l_list[(1 + length) : (1 + 2 * length)])
-                flx_name = " ".join(l_list[1 : (1 + length)])
-                flex = l_list[-1]
-                if cat_name not in _form_dict:
-                    _form_dict[cat_name] = dict()
-                if length not in _form_dict[cat_name]:
-                    _form_dict[cat_name][length] = dict()
-                if gen_name not in _form_dict[cat_name][length]:
-                    _form_dict[cat_name][length][gen_name] = dict()
-                if flex not in _form_dict[cat_name][length][gen_name]:
-                    _form_dict[cat_name][length][gen_name][flex] = flx_name
-                    name = gen_name.split(" ")[0]
-                    generate = morf.generate(name)
-                    flex_split = generate[0][2].split(":")
-                    if len(flex_split) > 3:
-                        gender = flex_split[3]
-                        new_flex = flex + ":" + gender
-                        output.write(
-                            cat
-                            + "\t"
-                            + flx_name
-                            + "\t"
-                            + gen_name
-                            + "\t"
-                            + new_flex
-                            + "\n"
-                        )
-
-
-def _last_names(output):
-    dict_list = list()
-    with open("nelexicon2/extra/wikipedia-liner2.txt", "rt", encoding="utf-8") as f:
-        for line in f:
-            line = line.strip()
-            line_l = line.split("\t")
-            if line_l[0] == "nam_liv_person_last":
-                line_l = line_l[1]
-                line_l.split(" ")
-                line_len = len(line_l)
-                if type(line_l) == list() and line_len > 1:
-                    dictionary = dict()
-                    for word in line_l:
-                        gen = morf.generate(word)
-                        for w in gen:
-                            tag_list = w[2].split(":")
-                            if len(tag_list) > 3:
-                                tag = tag_list[1] + ":" + tag_list[2]
-                                if tag not in dictionary:
-                                    dictionary[tag] = w[0]
-                                else:
-                                    dictionary[tag] += " " + w[0]
-                    for key in dictionary:
-                        if len(dictionary[key].split(" ")) == line_len:
-                            d = dictionary[key]
-                            dict_list.append(d)
-                else:
-                    word = line_l[0] if type(line_l) == list() else line_l
-                    generate = morf.generate(word)
-                    for g in generate:
-                        if len(g) > 4 and "nazwisko" in g[3]:
-                            dict_list.append(g)
-    for word in dict_list:
-        d = word
-        line = (
-            "nam_liv_person_last"
-            + "\t"
-            + d[0].split(":")[0]
-            + "\t"
-            + d[1].split(":")[0]
-            + "\t"
-            + ":".join(d[2].split(":")[1:])
-        )
-        output.write(line + "\n")
-
-
-_create_wiki()
-- 
GitLab