Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
A
anonymizer
Manage
Activity
Members
Labels
Plan
Issues
0
Issue boards
Milestones
Wiki
Redmine
Code
Merge requests
0
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
nlpworkers
anonymizer
Commits
773f8011
Commit
773f8011
authored
4 years ago
by
Bartłomiej Koptyra
Browse files
Options
Downloads
Patches
Plain Diff
Added some emails and users handling.
parent
90b0d778
Branches
Branches containing commit
2 merge requests
!2
Develop
,
!1
Develop
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
src/anonymizer.py
+71
-1
71 additions, 1 deletion
src/anonymizer.py
with
71 additions
and
1 deletion
src/anonymizer.py
+
71
−
1
View file @
773f8011
"""
Implementation of anonymizer functionality.
"""
import
re
from
string
import
punctuation
,
ascii_lowercase
,
ascii_uppercase
import
random
class Anonymizer:
    """Edit tokens so that e-mail addresses and user mentions are hidden.

    The strategy is selected with the ``method`` task option:

    * ``'delete'`` -- remove every match from the token,
    * ``'tag'`` -- replace every match with a fixed placeholder,
    * ``'pseudo'`` -- replace characters with random letters while keeping
      punctuation and letter case, so the result has the same shape.

    Any other ``method`` value leaves tokens unchanged.
    """

    # Compiled once at class creation; the pattern text is unchanged.
    _EMAIL_REGEX = re.compile(r'[\w\.-]+@[\w\.-]+\.\w{2,4}')
    _MENTION_REGEX = re.compile(r'\B\@([\w\-]+)')

    def __init__(self, task_options):
        """Read the anonymization method from *task_options*.

        :param task_options: mapping with an optional ``'method'`` key;
            defaults to ``'delete'`` when the key is missing.
        """
        self._method = task_options.get('method', 'delete')
        # NOTE(review): the public ``method`` attribute is kept next to
        # ``_method`` so that any caller still reading it keeps working.
        self.method = self._method
        self._mail_token = '[MAIL]'
        self._user_token = '@[USER]'

    def process(self):
        """Process the input (work in progress).

        NOTE(review): ``ctag`` is not defined anywhere in the visible code,
        so calling this method raises ``NameError`` as written -- confirm
        against the full file before relying on it.
        """
        if ctag == 'ign':
            # check whether it is a nickname, and then an e-mail
            # check whether it is a proper name like mBank? not sure
            print()

    @staticmethod
    def _get_random_chatacter(upper=False):
        """Return one random ASCII letter, upper-case when *upper* is true."""
        return random.choice(ascii_uppercase if upper else ascii_lowercase)

    def _pseudonymize(self, text):
        """Return *text* with every non-punctuation character randomized.

        Punctuation is copied verbatim; every other character is replaced
        by a random ASCII letter that is upper-case only if the original
        character was upper-case.
        """
        return ''.join(
            char if char in punctuation
            else self._get_random_chatacter(char.isupper())
            for char in text
        )

    def _generate_pseudo_email(self, email):
        """Return a random look-alike of *email*.

        The part before the first ``@`` and the domain labels are
        randomized; the ``@`` and the top-level domain (everything after
        the last dot of the domain) are kept verbatim.  A domain without
        any dot is randomized as a whole.

        :param email: e-mail address to pseudonymize.
        :return: string of the same length as *email*.
        """
        local, at_sign, domain = email.partition('@')
        if '.' in domain:
            # Split on the LAST dot so that only the top-level domain
            # survives; earlier dots are punctuation and stay in place.
            labels, dot, top_domain = domain.rpartition('.')
        else:
            labels, dot, top_domain = domain, '', ''
        return ''.join((
            self._pseudonymize(local),
            at_sign,
            self._pseudonymize(labels),
            dot,
            top_domain,
        ))

    def _generate_pseudo_user(self, user):
        """Return a random look-alike of the user mention *user*.

        The first character (the leading ``@`` of a mention) is kept and
        the remainder is randomized.  An empty string is returned as is.

        :param user: mention to pseudonymize.
        :return: string of the same length as *user*.
        """
        # Slicing (instead of ``next`` on an iterator) keeps empty input
        # from raising ``StopIteration``.
        return user[:1] + self._pseudonymize(user[1:])

    def _anonoymize_email(self, token):
        """Handle removal/changing of e-mail addresses in *token*.

        :param token: text that may contain e-mail addresses.
        :return: the token processed according to the configured method.
        """
        if self._method == 'delete':
            token = self._EMAIL_REGEX.sub(r'', token)
        elif self._method == 'tag':
            token = self._EMAIL_REGEX.sub(self._mail_token, token)
        elif self._method == 'pseudo':
            # The whole token is treated as the address here, unlike the
            # regex-based branches above.
            token = self._generate_pseudo_email(token)
        return token

    def _anonoymize_user(self, token):
        """Handle removal/change of user mentions in *token*.

        :param token: text that may contain ``@user`` mentions.
        :return: the token processed according to the configured method.
        """
        if self._method == 'delete':
            token = self._MENTION_REGEX.sub(r'', token)
        elif self._method == 'tag':
            token = self._MENTION_REGEX.sub(self._user_token, token)
        elif self._method == 'pseudo':
            # The whole token is treated as the mention here, unlike the
            # regex-based branches above.
            token = self._generate_pseudo_user(token)
        return token
This diff is collapsed.
Click to expand it.
Preview
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment