Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
A
anonymizer
Manage
Activity
Members
Labels
Plan
Issues
0
Issue boards
Milestones
Wiki
Redmine
Code
Merge requests
0
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
nlpworkers
anonymizer
Commits
0456a3f2
Commit
0456a3f2
authored
1 year ago
by
Michał Pogoda
Browse files
Options
Downloads
Patches
Plain Diff
Remove legacy script
parent
992bf6c1
No related merge requests found
Pipeline
#14173
passed with stages
in 40 seconds
Changes
1
Pipelines
2
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
utility/NELex2_to_wiki.py
+0
-113
0 additions, 113 deletions
utility/NELex2_to_wiki.py
with
0 additions
and
113 deletions
utility/NELex2_to_wiki.py
deleted
100644 → 0
+
0
−
113
View file @
992bf6c1
"""
Convert NELexicon into wiki used by anonymizer.
Requires morfeusz2 to be installed.
"""
import
morfeusz2
# Single Morfeusz instance shared by every helper in this script.
# NOTE(review): expand_tags=True presumably makes each returned interpretation
# carry one fully expanded tag; the code below only ever splits tags on ":" --
# confirm against the Morfeusz 2 documentation.
morf = morfeusz2.Morfeusz(expand_tags=True)

# NELexicon category names that _add_gender accepts.  In the code visible here
# only the keys are consulted (membership test); the mapped values are never
# read.
# NOTE(review): "nam_org_group_team" -> "country_nam" looks like a mismatch;
# verify against the consumer of this mapping.
_file_to_liner_dispatch = {
    "nam_liv_person": "person_first_nam",
    "nam_liv_person_last": "person_last_nam",
    "nam_fac_road": "road_nam",
    "nam_loc_gpe_city": "city_nam",
    "nam_org_group_team": "country_nam",
}

# Gender markers.  Not referenced anywhere in the code visible here.
_allowed_genders = ["f", "m1", "m2", "m3", "n"]
def _create_wiki():
    """Build ``wiki.txt`` (relative to the current working directory).

    The file is opened once in ``"wt+"`` mode, which truncates any existing
    content, and is then filled by ``_add_gender`` followed by
    ``_last_names``, in that order.
    """
    writers = (_add_gender, _last_names)
    with open("wiki.txt", "wt+", encoding="utf-8") as wiki_file:
        for write_section in writers:
            write_section(wiki_file)
def _add_gender(
    output, file_name="nelexicon2/extra/wiktionary-forms-with-bases-and-tags.txt"
):
    """Write gender-annotated inflection rows to *output*.

    Every line of *file_name* is split on whitespace.  Lines whose first token
    is a key of ``_file_to_liner_dispatch`` are read as::

        category  form_1 .. form_N  base_1 .. base_N  tag

    For the first occurrence of each ``(category, N, base, tag)`` combination
    the first word of the base is passed to ``morf.generate``; when the tag of
    the first returned interpretation has more than three ``:``-separated
    fields, the fourth one (used here as the gender) is appended to the tag
    and the row ``category<TAB>form<TAB>base<TAB>tag:gender`` is written.

    Blank lines, lines too short to contain a form/base pair, and bases for
    which Morfeusz returns no interpretation are skipped instead of raising
    ``IndexError``.

    Args:
        output: Writable text stream that receives the rows.
        file_name: Path of the whitespace-separated forms file.
    """
    # NOTE(review): the nesting of the original was reconstructed from a
    # whitespace-stripped copy; the generate/write step is assumed to run only
    # for combinations not seen before.
    seen = set()
    with open(file_name, "r", encoding="utf-8") as f:
        for line in f:
            tokens = line.split()
            # A blank line yields no tokens at all.
            if not tokens or tokens[0] not in _file_to_liner_dispatch:
                continue
            cat = tokens[0]

            # Tokens between the category and the trailing tag are split
            # evenly: first half inflected form, second half base form.
            length = int((len(tokens) - 2) / 2)
            if length < 1:
                # No room for a form/base pair; nothing meaningful to emit.
                continue
            flx_name = " ".join(tokens[1:1 + length])
            gen_name = " ".join(tokens[1 + length:1 + 2 * length])
            flex = tokens[-1]

            key = (cat, length, gen_name, flex)
            if key in seen:
                continue
            seen.add(key)

            # Only the first word of the base is looked up.
            interpretations = morf.generate(gen_name.split(" ")[0])
            if not interpretations:
                continue
            tag_fields = interpretations[0][2].split(":")
            if len(tag_fields) > 3:
                new_flex = flex + ":" + tag_fields[3]
                output.write("\t".join((cat, flx_name, gen_name, new_flex)) + "\n")
def _surname_rows(word):
    """Yield (form, base, tag) for each surname interpretation of one word."""
    for interp in morf.generate(word):
        # Keep only interpretations whose fourth field mentions "nazwisko".
        if len(interp) > 4 and "nazwisko" in interp[3]:
            yield (
                interp[0].split(":")[0],
                interp[1].split(":")[0],
                # Drop the first tag field, keep the rest.
                ":".join(interp[2].split(":")[1:]),
            )


def _multiword_surname_rows(base, words):
    """Yield (form, base, tag) for a surname made of several words."""
    forms_by_tag = {}
    for word in words:
        for interp in morf.generate(word):
            tag_fields = interp[2].split(":")
            if len(tag_fields) > 3:
                # Group by the second and third tag fields only.
                tag = tag_fields[1] + ":" + tag_fields[2]
                forms_by_tag.setdefault(tag, []).append(interp[0])
    for tag, forms in forms_by_tag.items():
        # Keep a tag only when it collected exactly one form per word.
        if len(forms) == len(words):
            yield " ".join(forms), base, tag


def _last_names(output):
    """Append inflected surname rows to *output*.

    Reads ``nelexicon2/extra/wikipedia-liner2.txt`` as tab-separated text and
    processes the rows whose first column is ``nam_liv_person_last``; the
    second column is taken as the name.  Single-word names are expanded with
    ``morf.generate`` and filtered to interpretations marked ``nazwisko``.
    Names containing spaces are inflected word by word and recombined per
    tag.  Every result is written as
    ``nam_liv_person_last<TAB>form<TAB>base<TAB>tag`` after the whole input
    has been read.

    Fixes relative to the previous version: the result of splitting the name
    into words was discarded and the ``type(x) == list()`` checks were always
    false, so multi-word names were never handled as such; that branch also
    stored plain strings that the writer then indexed as tuples.  Rows
    without a name column are now skipped instead of raising ``IndexError``.

    Args:
        output: Writable text stream that receives the rows.
    """
    # NOTE(review): multi-word rows carry only the two grouped tag fields and
    # are not filtered by "nazwisko", mirroring the original unreachable
    # branch -- confirm this is the format the consumer expects.
    rows = []
    with open(
        "nelexicon2/extra/wikipedia-liner2.txt", "rt", encoding="utf-8"
    ) as f:
        for line in f:
            fields = line.strip().split("\t")
            if len(fields) < 2 or fields[0] != "nam_liv_person_last":
                continue
            base = fields[1]
            words = base.split(" ")
            if len(words) > 1:
                rows.extend(_multiword_surname_rows(base, words))
            else:
                rows.extend(_surname_rows(base))

    for form, base, tag in rows:
        output.write(
            "nam_liv_person_last" + "\t" + form + "\t" + base + "\t" + tag + "\n"
        )
_create_wiki
()
This diff is collapsed.
Click to expand it.
Preview
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment