Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
A
anonymizer
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Redmine
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container registry
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
GitLab community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
nlpworkers
anonymizer
Merge requests
!3
Code refactor + date/url/user/phone/email anonymization
Code
Review changes
Check out branch
Download
Patches
Plain diff
Expand sidebar
Merged
Code refactor + date/url/user/phone/email anonymization
code_refactor
into
develop
Overview
0
Commits
6
Pipelines
1
Changes
8
Merged
Code refactor + date/url/user/phone/email anonymization
Norbert Ropiak
requested to merge
code_refactor
into
develop
Mar 11, 2021
Overview
0
Commits
6
Pipelines
1
Changes
8
0
0
Merge request reports
Compare
develop
version 1
2f865af2
Mar 11, 2021
develop (base)
and
version 1
latest version
2f865af2
6 commits,
Mar 11, 2021
version 1
2f865af2
5 commits,
Mar 11, 2021
8 files
+
597
−
237
Inline
Compare changes
Side-by-side
Inline
Show whitespace changes
Show one file at a time
Files
8
src/anonymizer.py
+
114
−
222
View file @ 2f865af2
Edit in single-file editor
Open in Web IDE
Show full file
"""
Implementation of anonymizer functionality.
"""
import
re
from
string
import
punctuation
,
ascii_lowercase
,
ascii_uppercase
,
digits
import
random
from
src.generators
import
(
generate_pseudo_email
,
generate_pseudo_phone_number
,
generate_pseudo_user
,
generate_pseudo_website
,
generate_phone_number_tag
,
generate_pseudo_date
)
import
regex
class
Anonymizer
:
"""
Class used to edit sentences based on options.
"""
email_regex
=
regex
.
compile
(
r
'
(?P<local_part>[a-z0-9!#$%&\'*+/=?^_`{|}~-]+
'
r
'
(?:\.[a-z0-9!#$%&\'*+/=?^_`{|}~-]+)*@)
'
r
'
(?P<domain>(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+)
'
r
'
(?P<tld>[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)
'
,
regex
.
I
)
user_regex
=
regex
.
compile
(
r
'
\B(?P<username>\@[\w\-]+)
'
)
_website_exceptions
=
[
'
m.in
'
]
website_regex
=
regex
.
compile
(
r
'
\b(?:{})\b(*SKIP)(*FAIL)|
'
.
format
(
'
|
'
.
join
(
_website_exceptions
))
+
r
'
(?:(?P<protocol>(?:(?:https?|ftp):)?\/\/)?
'
r
'
(?P<auth>\S+(?::\S*)?@)?
'
r
'
(?P<host>(?!(?:10|127)(?:\.\d{1,3}){3})
'
r
'
(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})
'
r
'
(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})
'
r
'
(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])
'
r
'
(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}
'
r
'
(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))
'
r
'
|
'
r
'
((?:[a-z0-9\u00a1-\uffff][a-z0-9\u00a1-\uffff_-]{0,62})?
'
r
'
[a-z0-9\u00a1-\uffff]\.)+)
'
r
'
(?P<tld>[a-z\u00a1-\uffff]{2,}\.??)
'
r
'
(?P<port>:\d{2,5})?
'
r
'
(?P<path>[/?#]\S*)?)
'
,
regex
.
UNICODE
|
regex
.
I
)
phone_number_regex
=
regex
.
compile
(
r
'
(?P<country_code>(00[1-9]\d?)|(\(?([+\d]{2,3})\)?)[- ]??)?
'
r
'
(?P<number>(\d[- ]??){9,10})
'
)
date_regex
=
regex
.
compile
(
r
'
\b(?P<day_or_month_year>
'
r
'
(?P<day_month1>[0-3]?\d)(?P<punct1>[ \t\-\./,]{1,2})
'
r
'
(?P<day_month2>[0-3]?\d)(?P<punct2>[ \t\-\./,]{1,2})
'
r
'
(?P<year1>\d{4}|\d{2}))\b|
'
r
'
\b(?P<year_month_or_day>(?P<year2>\d{4}|\d{2})
'
r
'
(?P<punct3>[ \t\-\./,]{1,2})(?P<day_month3>[0-3]?\d)
'
r
'
(?P<punct4>[ \t\-\./,]{1,2})(?P<day_month4>[0-3]?\d))\b|
'
r
'
(?P<month_in_words>
'
r
'
(?:(?P<day1>[0-3]?\d)(?P<punct5>[ \t\-\./,]{0,2}))?
'
r
'
\b(?P<month>Sty(?:|cze[nń]|cznia)|Lut(?:|y|ego)|Mar(?:|zec|ca)|
'
r
'
Kwi(?:|ecie[nń]|etnia)|Maj(?:|a)|Cze(?:|rwiec|rwca)|Lip(?:|iec|ca)
'
r
'
|Sie(?:|rpie[nń]|rpnia)|Wrz(?:|esie[nń]|e[śs]nia)
'
r
'
|Pa[zź](?:|dziernik|dziernika)|Lis(?:|topad|stopada)
'
r
'
|Gru(?:|dzie[nń]|dnia))\b
'
r
'
((?:(?P<punct7>[ \t\-\./,]{0,2})(?P<day2>[0-3]?\d))
'
r
'
(?:(?P<punct8>[ \t\-\./,]{1,2})(?P<year4>\d{4}|\d{2}))|
'
r
'
(?:(?P<punct6>[ \t\-\./,]{0,2})(?P<year3>\d{4}|\d{2})))?)
'
,
regex
.
I
)
_file_to_liner_dispatch
=
{
'
nam_liv_person
'
:
'
person_first_nam
'
,
'
nam_liv_person_last
'
:
'
person_last_nam
'
,
@@ -33,9 +88,24 @@ class Anonymizer:
self
.
_mail_token
=
'
[MAIL]
'
self
.
_user_token
=
'
@[USER]
'
self
.
_website_token
=
'
[WWW]
'
self
.
_digits_token
=
'
[DIGITS]
'
self
.
_date_token
=
'
[DATE]
'
self
.
_default_token
=
'
[INNE]
'
self
.
_form_dict
=
dict
()
self
.
_pseudo_ann_list
=
list
()
# Order is important, first more specific
self
.
_category_anonymisation
=
{
'
user
'
:
(
self
.
user_regex
,
self
.
_user_token
,
generate_pseudo_user
),
'
email
'
:
(
self
.
email_regex
,
self
.
_mail_token
,
generate_pseudo_email
),
'
website
'
:
(
self
.
website_regex
,
self
.
_website_token
,
generate_pseudo_website
),
'
date
'
:
(
self
.
date_regex
,
self
.
_date_token
,
generate_pseudo_date
),
'
phone_number
'
:
(
self
.
phone_number_regex
,
self
.
_digits_token
,
generate_pseudo_phone_number
),
}
self
.
_load_file
()
def
_load_file
(
self
,
file_name
=
'
wiki.txt
'
):
@@ -80,20 +150,13 @@ class Anonymizer:
def
_process_sentence
(
self
,
string_builder
):
string_builder
=
self
.
_handle_pseudo_ann
(
string_builder
)
return
self
.
_anonoymize_phone_number
(
self
.
_anonoymize_website
(
self
.
_anonoymize_user
(
self
.
_anonoymize_email
(
''
.
join
(
string_builder
)
)
)
)
)
sentence
=
''
.
join
(
string_builder
)
return
self
.
_anonymize
(
sentence
)
def
_process_word
(
self
,
id
,
text
,
tag
,
ann
):
for
annotation
in
ann
:
if
annotation
[
1
]
!=
0
:
text
=
self
.
_handle_annotated
(
id
,
text
,
tag
,
an
notation
[
0
]
)
for
chan
,
value
in
ann
:
if
value
!=
0
:
text
=
self
.
_handle_annotated
(
id
,
text
,
tag
,
ch
an
)
break
return
text
@@ -194,220 +257,49 @@ class Anonymizer:
chan
=
ann_subtree
.
attrib
[
"
chan
"
]
return
chan
,
value
@staticmethod
def
_get_random_character
(
digit
=
False
,
upper
=
False
):
return
random
.
choice
(
digits
)
if
digit
\
else
random
.
choice
(
ascii_uppercase
)
\
if
upper
else
random
.
choice
(
ascii_lowercase
)
@staticmethod
def
_generate_pseudo_email
(
email
):
new_mail
=
[]
it
=
iter
(
email
)
top_domain_len
=
email
.
rfind
(
'
.
'
)
i
=
0
for
char
in
it
:
if
char
==
'
@
'
:
new_mail
.
append
(
char
)
i
+=
1
break
elif
char
in
punctuation
:
new_mail
.
append
(
char
)
else
:
new_mail
.
append
(
Anonymizer
.
_get_random_character
(
char
.
isdigit
(),
char
.
isupper
()
)
)
i
+=
1
for
char
in
it
:
if
char
==
'
.
'
:
if
i
==
top_domain_len
:
new_mail
.
append
(
char
)
break
new_mail
.
append
(
char
)
elif
char
in
punctuation
:
new_mail
.
append
(
char
)
else
:
new_mail
.
append
(
Anonymizer
.
_get_random_character
(
char
.
isdigit
(),
char
.
isupper
()
)
)
i
+=
1
for
char
in
it
:
new_mail
.
append
(
char
)
return
r
''
.
join
(
new_mail
)
@staticmethod
def
_generate_pseudo_user
(
user
):
it
=
iter
(
user
)
new_user
=
[]
for
char
in
it
:
if
char
in
punctuation
:
new_user
.
append
(
char
)
else
:
new_user
.
append
(
Anonymizer
.
_get_random_character
(
char
.
isdigit
(),
char
.
isupper
()
)
)
return
r
''
.
join
(
new_user
)
@staticmethod
def
_generate_pseudo_website
(
link
):
it
=
iter
(
link
)
new_link
=
[]
if
link
[
0
:
4
].
lower
()
==
'
http
'
:
slashes
=
0
for
char
in
it
:
if
char
==
'
/
'
:
slashes
+=
1
new_link
.
append
(
char
)
if
slashes
==
2
:
break
for
char
in
it
:
if
char
==
'
/
'
:
new_link
.
append
(
char
)
break
else
:
new_link
.
append
(
char
)
for
char
in
it
:
if
char
in
punctuation
:
new_link
.
append
(
char
)
else
:
new_link
.
append
(
Anonymizer
.
_get_random_character
(
char
.
isdigit
(),
char
.
isupper
()
)
)
return
r
''
.
join
(
new_link
)
@staticmethod
def
_generate_pseudo_phone_number
(
number
):
new_number
=
[]
length
=
len
(
number
)
it
=
iter
(
number
)
if
number
[
0
]
==
'
+
'
:
how_many
=
length
-
9
for
j
in
range
(
0
,
how_many
):
new_number
.
append
(
next
(
it
))
elif
number
[
0
]
==
'
0
'
and
number
[
1
]
==
'
0
'
\
and
number
[
length
-
10
]
==
'
'
:
for
j
in
range
(
0
,
length
-
10
):
new_number
.
append
(
next
(
it
))
elif
number
[
0
]
==
'
(
'
and
number
[
1
]
==
'
0
'
and
number
[
4
]
==
'
)
'
:
for
j
in
range
(
0
,
2
):
new_number
.
append
(
next
(
it
))
for
char
in
it
:
if
char
.
isdigit
():
new_number
.
append
(
Anonymizer
.
_get_random_character
(
digit
=
True
))
else
:
new_number
.
append
(
char
)
return
r
''
.
join
(
new_number
)
def
_generate_phone_number_tag
(
self
,
number
):
new_number
=
number
.
split
(
'
'
)
for
i
in
range
(
len
(
new_number
)):
new_number
[
i
]
=
self
.
_default_token
return
r
'
'
.
join
(
new_number
)
def
_anonoymize_email
(
self
,
sentence
):
"""
Handles removal/changing of emails addresses.
"""
email_regex
=
r
'
[\w\.-]+@[\w\.-]+\.\w{2,4}
'
def
_anonymize
(
self
,
sentence
):
if
self
.
_method
==
'
delete
'
:
sentence
=
re
.
sub
(
email_regex
,
''
,
sentence
)
for
pattern
,
_
,
_
in
self
.
_category_anonymisation
.
values
():
sentence
=
regex
.
sub
(
pattern
,
''
,
sentence
)
elif
self
.
_method
==
'
tag
'
:
sentence
=
re
.
sub
(
email_regex
,
self
.
_mail_token
,
sentence
)
sentence
=
self
.
_tagging
(
sentence
)
elif
self
.
_method
==
'
pseudo
'
:
matches
=
re
.
findall
(
email_regex
,
sentence
)
for
match
in
matches
:
sentence
=
re
.
sub
(
re
.
escape
(
match
),
self
.
_generate_pseudo_email
(
match
),
sentence
)
sentence
=
self
.
_pseudonymization
(
sentence
)
return
sentence
def
_anonoymize_user
(
self
,
sentence
):
"""
Handles removal/change of users.
"""
user_regex
=
r
'
\B\@([\w\-]+)
'
if
self
.
_method
==
'
delete
'
:
sentence
=
re
.
sub
(
user_regex
,
''
,
sentence
)
elif
self
.
_method
==
'
tag
'
:
sentence
=
re
.
sub
(
user_regex
,
self
.
_user_token
,
sentence
)
elif
self
.
_method
==
'
pseudo
'
:
matches
=
re
.
findall
(
user_regex
,
sentence
)
for
match
in
matches
:
sentence
=
re
.
sub
(
re
.
escape
(
match
),
self
.
_generate_pseudo_user
(
match
),
sentence
)
return
sentence
def
_tagging
(
self
,
sentence
):
for
category
in
self
.
_category_anonymisation
:
pattern
,
token
,
_
=
self
.
_category_anonymisation
[
category
]
def
_anonoymize_website
(
self
,
sentence
):
"""
Handles removal/change of links.
"""
link_regex
=
r
'
(((h|H)(t|T)(t|T)(p|P)(s|S)?:\/\/(?:www\.|(?!www)))?
'
\
r
'
[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]+\.(?:(?!(h|H)
'
\
r
'
(t|T)(t|T)(p|P)(s|S)?:\/\/))[^\s]{2,}|www\.[a-zA-Z0-9]
'
\
r
'
[a-zA-Z0-9-]+[a-zA-Z0-9]\.(?:(?!(h|H)(t|T)(t|T)(p|P)
'
\
r
'
(s|S)?:\/\/))[^\s]{2,}|(h|H)(t|T)(t|T)(p|P)(s|S)?:\/\/
'
\
r
'
(?:www\.|(?!www))[a-zA-Z0-9]+\.(?:(?!(h|H)(t|T)(t|T)
'
\
r
'
(p|P)(s|S)?:\/\/))[^\s]{2,}|www\.[a-zA-Z0-9]+\.
'
\
r
'
(?:(?!(h|H)(t|T)(t|T)(p|P)(s|S)?:\/\/))[^\s]{2,})
'
if
self
.
_method
==
'
delete
'
:
sentence
=
re
.
sub
(
link_regex
,
''
,
sentence
)
elif
self
.
_method
==
'
tag
'
:
sentence
=
re
.
sub
(
link_regex
,
self
.
_website_token
,
sentence
)
elif
self
.
_method
==
'
pseudo
'
:
matches
=
re
.
findall
(
link_regex
,
sentence
)
for
match
in
matches
:
for
val
in
match
:
if
val
!=
''
:
match
=
val
break
sentence
=
re
.
sub
(
re
.
escape
(
match
),
self
.
_generate_pseudo_website
(
match
),
sentence
)
if
category
==
'
phone_number
'
:
matches
=
[
m
for
m
in
pattern
.
finditer
(
sentence
)]
for
match
in
matches
:
tag
=
generate_phone_number_tag
(
match
.
groupdict
(
''
),
token
)
replace_match
=
match
.
group
(
0
)
sentence
=
regex
.
sub
(
regex
.
escape
(
replace_match
),
tag
,
sentence
)
else
:
sentence
=
regex
.
sub
(
pattern
,
token
,
sentence
)
return
sentence
def
_anonoymize_phone_number
(
self
,
sentence
):
"""
Handles removal/change of links.
"""
phone_number_regex
=
r
'
(((\+[1-9]\d{0,2}|00[1-9]\d{0,2}) ?)?(\d{9}))
'
\
r
'
|((\+[1-9]\d{0,2} |00[1-9]\d{0,2} )?
'
\
r
'
(\d{3} \d{3} \d{3}))|(\(0\d{2}\) \d{2} \d{2}
'
\
r
'
\d{3})|(\(\d{2}\) \d{2} \d{3} \d{2})
'
if
self
.
_method
==
'
delete
'
:
sentence
=
re
.
sub
(
phone_number_regex
,
''
,
sentence
)
elif
self
.
_method
==
'
tag
'
:
matches
=
re
.
findall
(
phone_number_regex
,
sentence
)
for
match
in
matches
:
for
val
in
match
:
if
val
!=
''
:
match
=
val
break
sentence
=
re
.
sub
(
re
.
escape
(
match
),
self
.
_generate_phone_number_tag
(
match
),
sentence
)
elif
self
.
_method
==
'
pseudo
'
:
matches
=
re
.
findall
(
phone_number_regex
,
sentence
)
for
match
in
matches
:
for
val
in
match
:
if
val
!=
''
:
match
=
val
break
sentence
=
re
.
sub
(
re
.
escape
(
match
),
self
.
_generate_pseudo_phone_number
(
match
),
sentence
)
def
_pseudonymization
(
self
,
sentence
):
sentence_after_regex
=
sentence
to_replace
=
[]
for
category
in
self
.
_category_anonymisation
:
pattern
,
_
,
generator
=
self
.
_category_anonymisation
[
category
]
for
match
in
pattern
.
finditer
(
sentence_after_regex
):
if
not
match
:
continue
to_replace
.
append
((
match
,
generator
))
sentence_after_regex
=
regex
.
sub
(
regex
.
escape
(
match
.
group
(
0
)),
''
,
sentence_after_regex
)
for
match
,
generator
in
to_replace
:
replace_match
=
match
.
group
(
0
)
pseudo_string
=
generator
(
match
.
groupdict
(
''
))
sentence
=
regex
.
sub
(
regex
.
escape
(
replace_match
),
pseudo_string
,
sentence
)
return
sentence
Loading