From 0456a3f2defd5217f20f5c93e76c1bebc1bad167 Mon Sep 17 00:00:00 2001
From: Michał Pogoda <mipo57@e-science.pl>
Date: Fri, 18 Aug 2023 14:12:08 +0200
Subject: [PATCH] Remove legacy script

---
 utility/NELex2_to_wiki.py | 113 --------------------------------------
 1 file changed, 113 deletions(-)
 delete mode 100644 utility/NELex2_to_wiki.py

diff --git a/utility/NELex2_to_wiki.py b/utility/NELex2_to_wiki.py
deleted file mode 100644
index 704e900..0000000
--- a/utility/NELex2_to_wiki.py
+++ /dev/null
@@ -1,113 +0,0 @@
-"""Convert NELexicon into wiki used by anonymizer.
-
-Requires morfeusz2 to be installed.
-"""
-
-import morfeusz2
-
-morf = morfeusz2.Morfeusz(expand_tags=True)
-
-_file_to_liner_dispatch = {
-    "nam_liv_person": "person_first_nam",
-    "nam_liv_person_last": "person_last_nam",
-    "nam_fac_road": "road_nam",
-    "nam_loc_gpe_city": "city_nam",
-    "nam_org_group_team": "country_nam",
-}
-
-_allowed_genders = ["f", "m1", "m2", "m3", "n"]
-
-
-def _create_wiki():
-    with open("wiki.txt", "wt+", encoding="utf-8") as f:
-        _add_gender(f)
-        _last_names(f)
-
-
-def _add_gender(
-    output, file_name="nelexicon2/extra/wiktionary-forms-with-bases-and-tags.txt"
-):
-    with open(file_name, "r", encoding="utf-8") as f:
-        _form_dict = dict()
-        for line in f:
-            l_list = line.split()
-            cat = l_list[0]
-            if cat in _file_to_liner_dispatch:
-                cat_name = cat
-                length = int((len(l_list) - 2) / 2)
-                gen_name = " ".join(l_list[(1 + length) : (1 + 2 * length)])
-                flx_name = " ".join(l_list[1 : (1 + length)])
-                flex = l_list[-1]
-                if cat_name not in _form_dict:
-                    _form_dict[cat_name] = dict()
-                if length not in _form_dict[cat_name]:
-                    _form_dict[cat_name][length] = dict()
-                if gen_name not in _form_dict[cat_name][length]:
-                    _form_dict[cat_name][length][gen_name] = dict()
-                if flex not in _form_dict[cat_name][length][gen_name]:
-                    _form_dict[cat_name][length][gen_name][flex] = flx_name
-                    name = gen_name.split(" ")[0]
-                    generate = morf.generate(name)
-                    flex_split = generate[0][2].split(":")
-                    if len(flex_split) > 3:
-                        gender = flex_split[3]
-                        new_flex = flex + ":" + gender
-                        output.write(
-                            cat
-                            + "\t"
-                            + flx_name
-                            + "\t"
-                            + gen_name
-                            + "\t"
-                            + new_flex
-                            + "\n"
-                        )
-
-
-def _last_names(output):
-    dict_list = list()
-    with open("nelexicon2/extra/wikipedia-liner2.txt", "rt", encoding="utf-8") as f:
-        for line in f:
-            line = line.strip()
-            line_l = line.split("\t")
-            if line_l[0] == "nam_liv_person_last":
-                line_l = line_l[1]
-                line_l.split(" ")
-                line_len = len(line_l)
-                if type(line_l) == list() and line_len > 1:
-                    dictionary = dict()
-                    for word in line_l:
-                        gen = morf.generate(word)
-                        for w in gen:
-                            tag_list = w[2].split(":")
-                            if len(tag_list) > 3:
-                                tag = tag_list[1] + ":" + tag_list[2]
-                                if tag not in dictionary:
-                                    dictionary[tag] = w[0]
-                                else:
-                                    dictionary[tag] += " " + w[0]
-                    for key in dictionary:
-                        if len(dictionary[key].split(" ")) == line_len:
-                            d = dictionary[key]
-                            dict_list.append(d)
-                else:
-                    word = line_l[0] if type(line_l) == list() else line_l
-                    generate = morf.generate(word)
-                    for g in generate:
-                        if len(g) > 4 and "nazwisko" in g[3]:
-                            dict_list.append(g)
-    for word in dict_list:
-        d = word
-        line = (
-            "nam_liv_person_last"
-            + "\t"
-            + d[0].split(":")[0]
-            + "\t"
-            + d[1].split(":")[0]
-            + "\t"
-            + ":".join(d[2].split(":")[1:])
-        )
-        output.write(line + "\n")
-
-
-_create_wiki()
-- 
GitLab