Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
A
anonymizer
Manage
Activity
Members
Labels
Plan
Issues
0
Issue boards
Milestones
Wiki
Redmine
Code
Merge requests
0
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
nlpworkers
anonymizer
Compare revisions
c96cc8e3f63c0a6702df6fe7803abd3d98b92840 to f879f23953c8c15cfa10ad606daba4e66f4a17da
Compare revisions
Changes are shown as if the source revision was being merged into the target revision.
Learn more about comparing revisions.
Source
nlpworkers/anonymizer
Select target project
No results found
f879f23953c8c15cfa10ad606daba4e66f4a17da
Select Git revision
Branches
deanonimzer
develop
master
v1
v2
Swap
Target
nlpworkers/anonymizer
Select target project
nlpworkers/anonymizer
1 result
c96cc8e3f63c0a6702df6fe7803abd3d98b92840
Select Git revision
Branches
deanonimzer
develop
master
v1
v2
Show changes
Only incoming changes from source
Include changes to target since source was created
Compare
Commits on Source (2)
Fix anonymizer errors
· ba2b1885
Paweł Walkowiak
authored
1 year ago
ba2b1885
Merge branch 'fix1' into 'master'
· f879f239
Paweł Walkowiak
authored
1 year ago
Fix anonymizer errors. See merge request !13
f879f239
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
src/dictionaries/morphosyntactic/ner_file.py
+60
-20
60 additions, 20 deletions
src/dictionaries/morphosyntactic/ner_file.py
src/replacers/date_replacer.py
+4
-2
4 additions, 2 deletions
src/replacers/date_replacer.py
src/replacers/ner_replacer.py
+1
-1
1 addition, 1 deletion
src/replacers/ner_replacer.py
with
65 additions
and
23 deletions
src/dictionaries/morphosyntactic/ner_file.py
View file @
f879f239
"""
Module responsible for Morphosyntactic dict that uses a tsv file with NER tags.
"""
import
random
import
string
from
collections
import
defaultdict
from
typing
import
List
,
Optional
,
Type
,
Dict
from
src.detections
import
DETECTION_CLASSES_MAP
,
Detection
,
MorphosyntacticInfoMixin
from
src.dictionaries.morphosyntactic.interface
import
MorphosyntacticDictionary
import
logging
_log
=
logging
.
getLogger
(
__name__
)
class
NERFileMorphosyntacticDictionary
(
MorphosyntacticDictionary
):
"""
Morphosyntactic dictionary that uses a tsv file with NER tags as a source.
...
...
@@ -52,6 +57,9 @@ class NERFileMorphosyntacticDictionary(MorphosyntacticDictionary):
ner_tag
,
word
,
lemma
,
morpho_tag
=
line
.
split
(
"
\t
"
)
replacement_dictionary
[
ner_tag
][
morpho_tag
][
lemma
]
=
word
replacement_dictionary
=
{
k
:
dict
(
v
)
for
k
,
v
in
replacement_dictionary
.
items
()
if
v
}
# freeze dict
return
replacement_dictionary
def
get_supported_detection_classes
(
self
)
->
List
[
Type
[
Detection
]]:
...
...
@@ -92,21 +100,39 @@ class NERFileMorphosyntacticDictionary(MorphosyntacticDictionary):
if
original_entry_type_name
in
self
.
_dictionary
:
entry_type
=
original_entry_type_name
if
morpho_tag
in
self
.
_dictionary
[
entry_type
]:
lemma
=
random
.
choice
(
list
(
self
.
_dictionary
[
entry_type
][
morpho_tag
].
keys
())
)
word
=
self
.
_dictionary
[
entry_type
][
morpho_tag
][
lemma
]
else
:
morpho_tag
=
random
.
choice
(
list
(
self
.
_dictionary
[
entry_type
].
keys
())
)
lemma
=
random
.
choice
(
list
(
self
.
_dictionary
[
entry_type
][
morpho_tag
].
keys
())
)
word
=
lemma
try
:
if
entry_type
in
self
.
_dictionary
\
and
morpho_tag
in
self
.
_dictionary
[
entry_type
]
\
and
len
(
list
(
self
.
_dictionary
[
entry_type
][
morpho_tag
].
keys
())
)
>
0
:
lemma
=
random
.
choice
(
list
(
self
.
_dictionary
[
entry_type
][
morpho_tag
].
keys
())
)
word
=
self
.
_dictionary
[
entry_type
][
morpho_tag
][
lemma
]
elif
morpho_tag
==
"
ign
"
:
# unknown form
letters
=
string
.
ascii_lowercase
size
=
random
.
randint
(
3
,
5
)
lemma
=
""
.
join
(
random
.
sample
(
list
(
letters
),
size
)).
upper
()
word
=
lemma
else
:
morpho_tag
=
random
.
choice
(
list
(
self
.
_dictionary
[
entry_type
].
keys
())
)
lemma
=
random
.
choice
(
list
(
self
.
_dictionary
[
entry_type
][
morpho_tag
].
keys
()
)
)
word
=
lemma
except
IndexError
as
exp
:
_log
.
error
(
f
"
IndexError entry_type
"
f
"
{
entry_type
}
morpho_tag
{
morpho_tag
}
"
)
_log
.
error
(
exp
)
_log
.
error
(
f
"
Dictionary
{
self
.
_dictionary
[
entry_type
][
morpho_tag
]
}
"
)
if
word
is
None
and
self
.
_always_replace
:
entry_type
=
random
.
choice
(
list
(
self
.
_dictionary
.
keys
()))
...
...
@@ -157,11 +183,19 @@ class NERFileMorphosyntacticDictionary(MorphosyntacticDictionary):
original_entries
)
possible_lemmas
=
set
(
self
.
_dictionary
[
detection_type
][
required_tags
[
0
]].
keys
())
possible_lemmas
=
set
(
self
.
_dictionary
[
detection_type
][
required_tags
[
0
]].
keys
()
)
\
if
detection_type
in
self
.
_dictionary
\
and
required_tags
[
0
]
in
self
.
_dictionary
[
detection_type
]
\
else
set
()
for
tag
in
required_tags
[
1
:]:
possible_lemmas
.
intersection_update
(
self
.
_dictionary
[
detection_type
][
tag
].
keys
()
)
keys
=
self
.
_dictionary
[
detection_type
][
tag
].
keys
()
\
if
detection_type
in
self
.
_dictionary
\
and
tag
in
self
.
_dictionary
[
detection_type
]
\
else
set
()
if
keys
:
possible_lemmas
.
intersection_update
(
keys
)
if
len
(
possible_lemmas
)
==
0
:
return
[
self
.
get_random_replacement
(
original_entries
[
0
])]
*
len
(
...
...
@@ -174,7 +208,13 @@ class NERFileMorphosyntacticDictionary(MorphosyntacticDictionary):
for
entry
in
original_entries
:
if
isinstance
(
entry
,
MorphosyntacticInfoMixin
):
morpho_tag
=
entry
.
morpho_tag
word
=
self
.
_dictionary
[
detection_type
][
morpho_tag
][
lemma
]
if
detection_type
in
self
.
_dictionary
\
and
morpho_tag
in
self
.
_dictionary
[
detection_type
]
\
and
lemma
in
\
self
.
_dictionary
[
detection_type
][
morpho_tag
]:
word
=
self
.
_dictionary
[
detection_type
][
morpho_tag
][
lemma
]
else
:
word
=
lemma
else
:
word
=
lemma
...
...
This diff is collapsed.
Click to expand it.
src/replacers/date_replacer.py
View file @
f879f239
...
...
@@ -92,8 +92,10 @@ class DateReplacer(ReplacerInterface):
month_name
=
months_map
[
random_month
]
replacement
.
append
(
month_name
)
elif
entry
[
0
]
==
DateDetection
.
AnnotationPart
.
OTHER
:
replacement
.
append
(
entry
[
1
])
if
entry
[
1
]
is
not
None
:
replacement
.
append
(
entry
[
1
])
else
:
raise
ValueError
(
f
"
Unknown format entry:
{
entry
}
"
)
replacement
=
""
.
join
(
replacement
)
already_replaced
[
text
[
start
:
end
]]
=
replacement
...
...
This diff is collapsed.
Click to expand it.
src/replacers/ner_replacer.py
View file @
f879f239
...
...
@@ -61,7 +61,7 @@ class NERReplacer(ReplacerInterface):
)
morpho_detections
[
key
].
append
(
detection
)
else
:
key
=
(
text
[
start
:
end
],
detection
.
TYPE_NAME
)
key
=
(
text
[
start
:
end
],
detection
_entry
.
TYPE_NAME
)
non_morpho_detections
[
key
].
append
(
detection
)
# Replace morphosyntactic detections
...
...
This diff is collapsed.
Click to expand it.