Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
A
anonymizer
Manage
Activity
Members
Labels
Plan
Issues
0
Issue boards
Milestones
Wiki
Redmine
Code
Merge requests
0
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
nlpworkers
anonymizer
Commits
f879f239
Commit
f879f239
authored
1 year ago
by
Paweł Walkowiak
Browse files
Options
Downloads
Plain Diff
Merge branch 'fix1' into 'master'
Fix anonymizer errors See merge request
!13
parents
c96cc8e3
ba2b1885
Branches
Branches containing commit
1 merge request
!13
Fix anonymizer errors
Pipeline
#15494
passed with stages
in 1 minute and 37 seconds
Changes
3
Pipelines
2
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
src/dictionaries/morphosyntactic/ner_file.py
+60
-20
60 additions, 20 deletions
src/dictionaries/morphosyntactic/ner_file.py
src/replacers/date_replacer.py
+4
-2
4 additions, 2 deletions
src/replacers/date_replacer.py
src/replacers/ner_replacer.py
+1
-1
1 addition, 1 deletion
src/replacers/ner_replacer.py
with
65 additions
and
23 deletions
src/dictionaries/morphosyntactic/ner_file.py
+
60
−
20
View file @
f879f239
"""
Module responsible for Morphosyntactic dict that uses a tsv file with NER tags.
"""
import
random
import
string
from
collections
import
defaultdict
from
typing
import
List
,
Optional
,
Type
,
Dict
from
src.detections
import
DETECTION_CLASSES_MAP
,
Detection
,
MorphosyntacticInfoMixin
from
src.dictionaries.morphosyntactic.interface
import
MorphosyntacticDictionary
import
logging
_log
=
logging
.
getLogger
(
__name__
)
class
NERFileMorphosyntacticDictionary
(
MorphosyntacticDictionary
):
"""
Morphosyntactic dictionary that uses a tsv file with NER tags as a source.
...
...
@@ -52,6 +57,9 @@ class NERFileMorphosyntacticDictionary(MorphosyntacticDictionary):
ner_tag
,
word
,
lemma
,
morpho_tag
=
line
.
split
(
"
\t
"
)
replacement_dictionary
[
ner_tag
][
morpho_tag
][
lemma
]
=
word
replacement_dictionary
=
{
k
:
dict
(
v
)
for
k
,
v
in
replacement_dictionary
.
items
()
if
v
}
# freeze dict
return
replacement_dictionary
def
get_supported_detection_classes
(
self
)
->
List
[
Type
[
Detection
]]:
...
...
@@ -92,21 +100,39 @@ class NERFileMorphosyntacticDictionary(MorphosyntacticDictionary):
if
original_entry_type_name
in
self
.
_dictionary
:
entry_type
=
original_entry_type_name
if
morpho_tag
in
self
.
_dictionary
[
entry_type
]:
lemma
=
random
.
choice
(
list
(
self
.
_dictionary
[
entry_type
][
morpho_tag
].
keys
())
)
word
=
self
.
_dictionary
[
entry_type
][
morpho_tag
][
lemma
]
else
:
morpho_tag
=
random
.
choice
(
list
(
self
.
_dictionary
[
entry_type
].
keys
())
)
lemma
=
random
.
choice
(
list
(
self
.
_dictionary
[
entry_type
][
morpho_tag
].
keys
())
)
word
=
lemma
try
:
if
entry_type
in
self
.
_dictionary
\
and
morpho_tag
in
self
.
_dictionary
[
entry_type
]
\
and
len
(
list
(
self
.
_dictionary
[
entry_type
][
morpho_tag
].
keys
())
)
>
0
:
lemma
=
random
.
choice
(
list
(
self
.
_dictionary
[
entry_type
][
morpho_tag
].
keys
())
)
word
=
self
.
_dictionary
[
entry_type
][
morpho_tag
][
lemma
]
elif
morpho_tag
==
"
ign
"
:
# unknown form
letters
=
string
.
ascii_lowercase
size
=
random
.
randint
(
3
,
5
)
lemma
=
""
.
join
(
random
.
sample
(
list
(
letters
),
size
)).
upper
()
word
=
lemma
else
:
morpho_tag
=
random
.
choice
(
list
(
self
.
_dictionary
[
entry_type
].
keys
())
)
lemma
=
random
.
choice
(
list
(
self
.
_dictionary
[
entry_type
][
morpho_tag
].
keys
()
)
)
word
=
lemma
except
IndexError
as
exp
:
_log
.
error
(
f
"
IndexError entry_type
"
f
"
{
entry_type
}
morpho_tag
{
morpho_tag
}
"
)
_log
.
error
(
exp
)
_log
.
error
(
f
"
Dictionary
{
self
.
_dictionary
[
entry_type
][
morpho_tag
]
}
"
)
if
word
is
None
and
self
.
_always_replace
:
entry_type
=
random
.
choice
(
list
(
self
.
_dictionary
.
keys
()))
...
...
@@ -157,11 +183,19 @@ class NERFileMorphosyntacticDictionary(MorphosyntacticDictionary):
original_entries
)
possible_lemmas
=
set
(
self
.
_dictionary
[
detection_type
][
required_tags
[
0
]].
keys
())
possible_lemmas
=
set
(
self
.
_dictionary
[
detection_type
][
required_tags
[
0
]].
keys
()
)
\
if
detection_type
in
self
.
_dictionary
\
and
required_tags
[
0
]
in
self
.
_dictionary
[
detection_type
]
\
else
set
()
for
tag
in
required_tags
[
1
:]:
possible_lemmas
.
intersection_update
(
self
.
_dictionary
[
detection_type
][
tag
].
keys
()
)
keys
=
self
.
_dictionary
[
detection_type
][
tag
].
keys
()
\
if
detection_type
in
self
.
_dictionary
\
and
tag
in
self
.
_dictionary
[
detection_type
]
\
else
set
()
if
keys
:
possible_lemmas
.
intersection_update
(
keys
)
if
len
(
possible_lemmas
)
==
0
:
return
[
self
.
get_random_replacement
(
original_entries
[
0
])]
*
len
(
...
...
@@ -174,7 +208,13 @@ class NERFileMorphosyntacticDictionary(MorphosyntacticDictionary):
for
entry
in
original_entries
:
if
isinstance
(
entry
,
MorphosyntacticInfoMixin
):
morpho_tag
=
entry
.
morpho_tag
word
=
self
.
_dictionary
[
detection_type
][
morpho_tag
][
lemma
]
if
detection_type
in
self
.
_dictionary
\
and
morpho_tag
in
self
.
_dictionary
[
detection_type
]
\
and
lemma
in
\
self
.
_dictionary
[
detection_type
][
morpho_tag
]:
word
=
self
.
_dictionary
[
detection_type
][
morpho_tag
][
lemma
]
else
:
word
=
lemma
else
:
word
=
lemma
...
...
This diff is collapsed.
Click to expand it.
src/replacers/date_replacer.py
+
4
−
2
View file @
f879f239
...
...
@@ -92,8 +92,10 @@ class DateReplacer(ReplacerInterface):
month_name
=
months_map
[
random_month
]
replacement
.
append
(
month_name
)
elif
entry
[
0
]
==
DateDetection
.
AnnotationPart
.
OTHER
:
replacement
.
append
(
entry
[
1
])
if
entry
[
1
]
is
not
None
:
replacement
.
append
(
entry
[
1
])
else
:
raise
ValueError
(
f
"
Unknown format entry:
{
entry
}
"
)
replacement
=
""
.
join
(
replacement
)
already_replaced
[
text
[
start
:
end
]]
=
replacement
...
...
This diff is collapsed.
Click to expand it.
src/replacers/ner_replacer.py
+
1
−
1
View file @
f879f239
...
...
@@ -61,7 +61,7 @@ class NERReplacer(ReplacerInterface):
)
morpho_detections
[
key
].
append
(
detection
)
else
:
key
=
(
text
[
start
:
end
],
detection
.
TYPE_NAME
)
key
=
(
text
[
start
:
end
],
detection
_entry
.
TYPE_NAME
)
non_morpho_detections
[
key
].
append
(
detection
)
# Replace morphosyntactic detections
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment