Skip to content
Snippets Groups Projects

Support for English and Russian language

6 open threads

Merge request reports

Loading
Loading

Activity

Filter activity
  • Approvals
  • Assignees & reviewers
  • Comments (from bots)
  • Comments (from users)
  • Commits & branches
  • Edits
  • Labels
  • Lock status
  • Mentions
  • Merge request status
  • Tracking
23 24 'tag' replaces selected tokens with arbitrary tags, 'pseudo'
24 25 replaces selected tokens with a random token that
25 26 """
26 anon = Anonymizer(task_options)
27 ccl_handler = CCLHandler(input_file)
28 ccl_handler.process(output_file, anon.unmarshallers)
27 lang = task_options.get('language', 'pl')
28 if lang == 'pl':
  • 172 if ann_len > 1:
    173 consume(pseudo_ann_iter, ann_len - 1)
    174 if math.ceil(len(new_text) / 2) > ann_len:
    175 shifted_id += len(new_text) - ann_len
    176 self._pseudo_ann_list.clear()
    177 return string_builder
    178
    179 def _anonymize(self, sentence):
    180 if self._method == 'delete':
    181 for pattern, _, _ in self._category_anonymisation.values():
    182 sentence = regex.sub(pattern, '', sentence)
    183 elif self._method == 'tag':
    184 sentence = self._tagging(sentence)
    185 elif self._method == 'pseudo':
    186 sentence = self._pseudonymization(sentence)
    187 return sentence
  • 188
    189 def _tagging(self, sentence):
    190 for category in self._category_anonymisation:
    191 pattern, token, _ = self._category_anonymisation[category]
    192
    193 if category == 'phone_number':
    194 matches = [m for m in pattern.finditer(sentence)]
    195 for match in matches:
    196 tag = generate_phone_number_tag(match.groupdict(''), token)
    197 replace_match = match.group(0)
    198 sentence = regex.sub(regex.escape(replace_match),
    199 tag, sentence)
    200 else:
    201 sentence = regex.sub(pattern, token, sentence)
    202 return sentence
    203
  • 209 for match in pattern.finditer(sentence_after_regex):
    210 if not match:
    211 continue
    212 to_replace.append((match, generator))
    213 sentence_after_regex = regex.sub(regex.escape(match.group(0)),
    214 '', sentence_after_regex)
    215
    216 for match, generator in to_replace:
    217 replace_match = match.group(0)
    218 pseudo_string = generator(match.groupdict(''))
    219 sentence = regex.sub(
    220 regex.escape(replace_match),
    221 pseudo_string,
    222 sentence
    223 )
    224 return sentence
  • 5 5 COPY ./main.py .
    6 6 COPY ./requirements.txt .
    7 7 COPY ./wiki.txt .
    8 COPY ./en_dict.txt .
    9 COPY ./ru_dict.txt .
  • 104
    105 def _handle_annotated(self, id, text, tag, ann):
    106 if self._method == 'delete':
    107 return ''
    108 elif self._method == 'tag':
    109 if ann in self._liner_to_tag_dispatch:
    110 return self._liner_to_tag_dispatch[ann]
    111 elif self._method == 'pseudo':
    112 if ann in self._form_dict:
    113 self._pseudo_ann_list.append((id, text, tag, ann))
    114 return text
    115
    116 def _process_sentence(self, string_builder):
    117 string_builder = self._handle_pseudo_ann(string_builder)
    118 sentence = ''.join(string_builder)
    119 return self._anonymize(sentence)
  • Norbert Ropiak added 1 commit

    added 1 commit

    • cbce317a - fix minor bugs with regexes, move files to appropriate folders

    Compare with previous version

  • Norbert Ropiak added 1 commit

    added 1 commit

    Compare with previous version

  • Norbert Ropiak added 1 commit

    added 1 commit

    Compare with previous version

  • Szymon Ciombor mentioned in commit 1677a12c

    mentioned in commit 1677a12c

  • Please register or sign in to reply
    Loading