Support for English and Russian languages
6 open threads
6 open threads
Merge request reports
Activity
added 4 commits
- 559cb540...dfc755e6 - 3 commits from branch master
- f0ec034f - resolve conflicts
-
559cb540...dfc755e6 - 3 commits from branch
assigned to @szymekc
23 24 'tag' replaces selected tokens with arbitrary tags, 'pseudo' 24 25 replaces selected tokens with a random token that 25 26 """ 26 anon = Anonymizer(task_options) 27 ccl_handler = CCLHandler(input_file) 28 ccl_handler.process(output_file, anon.unmarshallers) 27 lang = task_options.get('language', 'pl') 28 if lang == 'pl': changed this line in version 3 of the diff
172 if ann_len > 1: 173 consume(pseudo_ann_iter, ann_len - 1) 174 if math.ceil(len(new_text) / 2) > ann_len: 175 shifted_id += len(new_text) - ann_len 176 self._pseudo_ann_list.clear() 177 return string_builder 178 179 def _anonymize(self, sentence): 180 if self._method == 'delete': 181 for pattern, _, _ in self._category_anonymisation.values(): 182 sentence = regex.sub(pattern, '', sentence) 183 elif self._method == 'tag': 184 sentence = self._tagging(sentence) 185 elif self._method == 'pseudo': 186 sentence = self._pseudonymization(sentence) 187 return sentence changed this line in version 3 of the diff
188 189 def _tagging(self, sentence): 190 for category in self._category_anonymisation: 191 pattern, token, _ = self._category_anonymisation[category] 192 193 if category == 'phone_number': 194 matches = [m for m in pattern.finditer(sentence)] 195 for match in matches: 196 tag = generate_phone_number_tag(match.groupdict(''), token) 197 replace_match = match.group(0) 198 sentence = regex.sub(regex.escape(replace_match), 199 tag, sentence) 200 else: 201 sentence = regex.sub(pattern, token, sentence) 202 return sentence 203 changed this line in version 3 of the diff
209 for match in pattern.finditer(sentence_after_regex): 210 if not match: 211 continue 212 to_replace.append((match, generator)) 213 sentence_after_regex = regex.sub(regex.escape(match.group(0)), 214 '', sentence_after_regex) 215 216 for match, generator in to_replace: 217 replace_match = match.group(0) 218 pseudo_string = generator(match.groupdict('')) 219 sentence = regex.sub( 220 regex.escape(replace_match), 221 pseudo_string, 222 sentence 223 ) 224 return sentence changed this line in version 3 of the diff
5 5 COPY ./main.py . 6 6 COPY ./requirements.txt . 7 7 COPY ./wiki.txt . 8 COPY ./en_dict.txt . 9 COPY ./ru_dict.txt . changed this line in version 3 of the diff
104 105 def _handle_annotated(self, id, text, tag, ann): 106 if self._method == 'delete': 107 return '' 108 elif self._method == 'tag': 109 if ann in self._liner_to_tag_dispatch: 110 return self._liner_to_tag_dispatch[ann] 111 elif self._method == 'pseudo': 112 if ann in self._form_dict: 113 self._pseudo_ann_list.append((id, text, tag, ann)) 114 return text 115 116 def _process_sentence(self, string_builder): 117 string_builder = self._handle_pseudo_ann(string_builder) 118 sentence = ''.join(string_builder) 119 return self._anonymize(sentence) changed this line in version 3 of the diff
added 1 commit
- cbce317a - fix minor bugs with regexes, move files to appropriate folders
mentioned in commit 1677a12c
Please register or sign in to reply