Skip to content
Snippets Groups Projects
Commit 0456a3f2 authored by Michał Pogoda
Browse files

Remove legacy script

parent 992bf6c1
No related merge requests found
Pipeline #14173 passed with stages
in 40 seconds
"""Convert NELexicon into wiki used by anonymizer.
Requires morfeusz2 to be installed.
"""
import morfeusz2
# Shared Morfeusz2 instance; the functions in this file call morf.generate()
# on it to obtain inflected forms and their tags.
# NOTE(review): expand_tags=True presumably makes every returned tag fully
# spelled out (one value per position) — confirm against the Morfeusz2 docs.
morf = morfeusz2.Morfeusz(expand_tags=True)

# Maps NELexicon category names to target labels. In this file only the keys
# are consulted (as the category filter in _add_gender); the values are not
# read here.
# NOTE(review): "nam_org_group_team" -> "country_nam" looks inconsistent with
# the other pairs — verify the mapping against the consumer of these labels.
_file_to_liner_dispatch = {
    "nam_liv_person": "person_first_nam",
    "nam_liv_person_last": "person_last_nam",
    "nam_fac_road": "road_nam",
    "nam_loc_gpe_city": "city_nam",
    "nam_org_group_team": "country_nam",
}

# Gender markers. Not referenced by any code visible in this file.
_allowed_genders = ["f", "m1", "m2", "m3", "n"]
def _create_wiki():
    """Generate ``wiki.txt`` from the NELexicon sources.

    Opens ``wiki.txt`` in ``"wt+"`` mode (existing content is truncated) and
    passes the open file to ``_add_gender`` and then to ``_last_names``, so
    the rows of both converters end up in the same file, in that order.
    """
    with open("wiki.txt", "wt+", encoding="utf-8") as wiki_file:
        _add_gender(wiki_file)
        _last_names(wiki_file)
def _add_gender(
    output, file_name="nelexicon2/extra/wiktionary-forms-with-bases-and-tags.txt"
):
    """Write inflected forms with an extra tag part taken from Morfeusz.

    Every input line is split on whitespace and read as
    ``<category> <n form tokens> <n base tokens> <tag>``. Only lines whose
    category is a key of ``_file_to_liner_dispatch`` are used, and each
    (category, n, base, tag) combination is processed once. For such a line
    the first base token is passed to ``morf.generate``; if the tag of the
    first returned interpretation has more than three ``:``-separated parts,
    its fourth part (treated as the gender) is appended to the line's tag and
    the row ``category<TAB>form<TAB>base<TAB>tag:gender`` is written.

    Args:
        output: Object with a ``write(str)`` method that receives the rows.
        file_name: Path of the UTF-8 text file to read.
    """
    # NOTE(review): the nesting below was reconstructed from a copy of this
    # file whose indentation had been stripped; it assumes generation runs
    # only for first-seen (category, n, base, tag) keys — confirm.
    seen = set()
    with open(file_name, "r", encoding="utf-8") as forms_file:
        for line in forms_file:
            tokens = line.split()
            # A blank line has no category token; skip it instead of failing
            # with IndexError on tokens[0].
            if not tokens:
                continue
            cat = tokens[0]
            if cat not in _file_to_liner_dispatch:
                continue

            # int(x / 2) truncates toward zero, so a line holding only the
            # category yields 0 here (floor division would give -1).
            length = int((len(tokens) - 2) / 2)
            flx_name = " ".join(tokens[1 : 1 + length])
            gen_name = " ".join(tokens[1 + length : 1 + 2 * length])
            flex = tokens[-1]

            # A flat set of keys replaces the former four-level nested dict,
            # which was only ever used to detect repeats.
            key = (cat, length, gen_name, flex)
            if key in seen:
                continue
            seen.add(key)

            generated = morf.generate(gen_name.split(" ")[0])
            # Nothing generated for this base: there is no tag to read.
            if not generated:
                continue
            flex_split = generated[0][2].split(":")
            # NOTE(review): the fourth tag part is not checked against
            # _allowed_genders — confirm whether that filter was intended.
            if len(flex_split) > 3:
                gender = flex_split[3]
                output.write(f"{cat}\t{flx_name}\t{gen_name}\t{flex}:{gender}\n")
def _last_names(output):
    """Write inflected surname rows generated by Morfeusz.

    Reads ``nelexicon2/extra/wikipedia-liner2.txt`` as tab-separated lines and
    uses those whose first field is ``nam_liv_person_last``. The second field
    is passed as a whole to ``morf.generate``; every returned interpretation
    that has more than four elements and whose fourth element contains
    ``"nazwisko"`` produces one row::

        nam_liv_person_last<TAB>e0<TAB>e1<TAB>e2

    where ``e0``/``e1`` are the first two elements cut at their first ``:``
    and ``e2`` is the third element (the tag) without its first
    ``:``-separated part. All rows are written after the input file has been
    read completely.

    Args:
        output: Object with a ``write(str)`` method that receives the rows.
    """
    surname_forms = []
    with open("nelexicon2/extra/wikipedia-liner2.txt", "rt", encoding="utf-8") as f:
        for line in f:
            fields = line.strip().split("\t")
            # Skip other categories, and lines that carry the category but no
            # name field (these used to fail with IndexError).
            if len(fields) < 2 or fields[0] != "nam_liv_person_last":
                continue

            # The name is generated as one string. An earlier per-word path
            # for multi-word names was guarded by `type(x) == list()`, which
            # compares a type with an empty list and is never true, so it
            # never ran; it has been dropped to match actual behaviour.
            # NOTE(review): decide whether multi-word surnames need dedicated
            # handling.
            surname_forms.extend(
                interp
                for interp in morf.generate(fields[1])
                if len(interp) > 4 and "nazwisko" in interp[3]
            )

    for interp in surname_forms:
        row = "\t".join(
            [
                "nam_liv_person_last",
                interp[0].split(":")[0],
                interp[1].split(":")[0],
                ":".join(interp[2].split(":")[1:]),
            ]
        )
        output.write(row + "\n")
# Script entry point: runs unconditionally at module top level, so the
# conversion also starts when this file is imported.
_create_wiki()
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment