Skip to content
Snippets Groups Projects
Commit e25eadab authored by Michał Pogoda's avatar Michał Pogoda
Browse files

Implement support for wiktorner

parent 63784ace
Branches
2 merge requests!10Anonimizer v2,!7Better coverage
Pipeline #7844 failed with stage
in 21 seconds
Showing
with 8418 additions and 8285 deletions
defaults:
- paths: default
- detectors: all
- replacers: tag
- suppressor: order_based
- input_parser: ccl
- pipeline: default
- _self_
language: "pl"
\ No newline at end of file
- configuration: ccl
- _self_
\ No newline at end of file
# @package _global_
defaults:
- /paths: default
- /detectors: all
- /replacers: tag
- /suppressor: order_based
- /input_parser: ccl
- /pipeline: default
- _self_
language: "pl"
\ No newline at end of file
# @package _global_
defaults:
- /paths: default
- /detectors: all_ner_kpwr_ext
- /replacers: tag
- /suppressor: order_based
- /input_parser: wiktor_ner
- /pipeline: sequential_jsonl
- _self_
language: "pl"
\ No newline at end of file
defaults:
- date
- email
- ner
- ner_n5
- phone
- url
- user
\ No newline at end of file
- user
- number
\ No newline at end of file
defaults:
- date
- email
- ner_kpwr_ext
- phone
- url
- user
- number
\ No newline at end of file
ner:
_target_: src.detectors.ner.NerDetector
language: ${language}
\ No newline at end of file
ner:
_target_: src.detectors.ner.NerDetector
language: ${language}
detection_mapping:
"nam_liv_person_first": "name"
"nam_liv_person_last": "surname"
"nam_fac_road": "street_name"
"nam_loc_gpe_city": "city"
"nam_loc_gpe_country": "country"
"nam_loc_gpe_admin1": "country" # TODO: Implement better mapping for this
"nam_loc_historical_region": "country" # TODO: Implement better mapping for this
ner:
_target_: src.detectors.ner.NerDetector
language: ${language}
detection_mapping:
"person_first_nam": "name"
"person_last_nam": "surname"
"road_nam": "street_name"
"city_nam": "city"
"country_nam": "country"
\ No newline at end of file
_target_: src.input_parsers.wiktor_ner.WiktorNERInputParser
\ No newline at end of file
_target_: src.pipeline.sequential_jsonl.SequentialJSONLPipeline
input_parser: ${input_parser}
detectors: ${detectors}
suppressor: ${suppressor}
replacers: ${replacers}
\ No newline at end of file
ner:
_target_: src.replacers.ner_replacer.NERReplacer
dictionary:
_target_: src.dictionaries.morphosyntactic.pl_ner.PlNERMorphosyntacticDictionary
_target_: src.dictionaries.morphosyntactic.ner_file.NERFileMorphosyntacticDictionary
dictionary_path: ${paths.dictionaries_path}/pl_dict.txt
\ No newline at end of file
source diff could not be displayed: it is too large. Options to address this: view the blob.
{"filename": "bb4a16ff-33de-4478-939d-12db67d750b1","text": "ROZDZIAŁ I. CO NIECO O SAMEJ PIPIDÓWCE Przede wszystkim muszę uprzedzić z góry czytelników, aby się daremnie nie trudzili nad szukaniem wyżej wyrażonego miasteczka na mapach Galicji i Lodomerii, bo go tam nie znajdą. Nie dlatego, jakoby Pipidówka nie istniała w rzeczywistości i była tylko wytworem fantazji autora, ale po prostu dlatego, że mieszkańcy owego sławnego grodu, urosłszy z czasem w ambicję, uważali tę nazwę jako ubliżającą ich powadze i podali do c. k. namiestnictwa pokorną prośbę o pozwolenie zamienienia jej na inną. Podobne zamiany nazwisk praktykują się dość często w Galicji, szczególnie u pojedynczych osób, które nie czując się na siłach uszlachetnienia sobą, swymi czynami własnego nazwiska, chcą nazwiskiem uszlachetnić siebie, i tak np.","tokens": [{"index": 1,"position": [0,8],"orth": "ROZDZIAŁ","lexemes": [{"lemma": "rozdział","mstag": "subst:sg:nom:m3","disamb": true}]},{"index": 2,"position": [9,10],"orth": "I","lexemes": [{"lemma": "I","mstag": "adj:sg:nom:m3:pos","disamb": true}]},{"index": 3,"position": [10,11],"orth": ".","lexemes": [{"lemma": ".","mstag": "interp","disamb": true}]},{"index": 4,"position": [12,14],"orth": "CO","lexemes": [{"lemma": "co","mstag": "conj","disamb": true}]},{"index": 5,"position": [15,20],"orth": "NIECO","lexemes": [{"lemma": "nieco","mstag": "adv","disamb": true}]},{"index": 6,"position": [21,22],"orth": "O","lexemes": [{"lemma": "o","mstag": "prep:loc","disamb": true}]},{"index": 7,"position": [23,28],"orth": "SAMEJ","lexemes": [{"lemma": "sam","mstag": "adj:sg:loc:f:pos","disamb": true}]},{"index": 8,"position": [29,38],"orth": "PIPIDÓWCE","lexemes": [{"lemma": "Pipidówka","mstag": "subst:sg:loc:f","disamb": true}]},{"index": 9,"position": [39,45],"orth": "Przede","lexemes": [{"lemma": "przed","mstag": "prep:inst:wok","disamb": true}]},{"index": 10,"position": [46,55],"orth": "wszystkim","lexemes": [{"lemma": "wszystko","mstag": 
"subst:sg:inst:n","disamb": true}]},{"index": 11,"position": [56,61],"orth": "muszę","lexemes": [{"lemma": "musieć","mstag": "fin:sg:pri:imperf","disamb": true}]},{"index": 12,"position": [62,71],"orth": "uprzedzić","lexemes": [{"lemma": "uprzedzić","mstag": "inf:perf","disamb": true}]},{"index": 13,"position": [72,73],"orth": "z","lexemes": [{"lemma": "z","mstag": "prep:gen:nwok","disamb": true}]},{"index": 14,"position": [74,78],"orth": "góry","lexemes": [{"lemma": "góra","mstag": "subst:sg:gen:f","disamb": true}]},{"index": 15,"position": [79,90],"orth": "czytelników","lexemes": [{"lemma": "czytelnik","mstag": "subst:pl:gen:m1","disamb": true}]},{"index": 16,"position": [90,91],"orth": ",","lexemes": [{"lemma": ",","mstag": "interp","disamb": true}]},{"index": 17,"position": [92,95],"orth": "aby","lexemes": [{"lemma": "aby","mstag": "comp","disamb": true}]},{"index": 18,"position": [96,99],"orth": "się","lexemes": [{"lemma": "się","mstag": "qub","disamb": true}]},{"index": 19,"position": [100,108],"orth": "daremnie","lexemes": [{"lemma": "daremnie","mstag": "adv:pos","disamb": true}]},{"index": 20,"position": [109,112],"orth": "nie","lexemes": [{"lemma": "nie","mstag": "qub","disamb": true}]},{"index": 21,"position": [113,121],"orth": "trudzili","lexemes": [{"lemma": "trudzić","mstag": "praet:pl:m1:imperf","disamb": true}]},{"index": 22,"position": [122,125],"orth": "nad","lexemes": [{"lemma": "nad","mstag": "prep:inst:nwok","disamb": true}]},{"index": 23,"position": [126,135],"orth": "szukaniem","lexemes": [{"lemma": "szukać","mstag": "ger:sg:inst:n:imperf:aff","disamb": true}]},{"index": 24,"position": [136,141],"orth": "wyżej","lexemes": [{"lemma": "wysoko","mstag": "adv:com","disamb": true}]},{"index": 25,"position": [142,152],"orth": "wyrażonego","lexemes": [{"lemma": "wyrazić","mstag": "ppas:sg:gen:n:perf:aff","disamb": true}]},{"index": 26,"position": [153,163],"orth": "miasteczka","lexemes": [{"lemma": "miasteczko","mstag": "subst:sg:gen:n","disamb": 
true}]},{"index": 27,"position": [164,166],"orth": "na","lexemes": [{"lemma": "na","mstag": "prep:loc","disamb": true}]},{"index": 28,"position": [167,173],"orth": "mapach","lexemes": [{"lemma": "mapa","mstag": "subst:pl:loc:f","disamb": true}]},{"index": 29,"position": [174,181],"orth": "Galicji","lexemes": [{"lemma": "Galicja","mstag": "subst:sg:gen:f","disamb": true}]},{"index": 30,"position": [182,183],"orth": "i","lexemes": [{"lemma": "i","mstag": "conj","disamb": true}]},{"index": 31,"position": [184,193],"orth": "Lodomerii","lexemes": [{"lemma": "Lodomerii","mstag": "ign","disamb": true}]},{"index": 32,"position": [193,194],"orth": ",","lexemes": [{"lemma": ",","mstag": "interp","disamb": true}]},{"index": 33,"position": [195,197],"orth": "bo","lexemes": [{"lemma": "bo","mstag": "comp","disamb": true}]},{"index": 34,"position": [198,200],"orth": "go","lexemes": [{"lemma": "on","mstag": "ppron3:sg:gen:m1:ter:nakc:npraep","disamb": true}]},{"index": 35,"position": [201,204],"orth": "tam","lexemes": [{"lemma": "tam","mstag": "adv","disamb": true}]},{"index": 36,"position": [205,208],"orth": "nie","lexemes": [{"lemma": "nie","mstag": "qub","disamb": true}]},{"index": 37,"position": [209,215],"orth": "znajdą","lexemes": [{"lemma": "znaleźć","mstag": "fin:pl:ter:perf","disamb": true}]},{"index": 38,"position": [215,216],"orth": ".","lexemes": [{"lemma": ".","mstag": "interp","disamb": true}]},{"index": 39,"position": [217,220],"orth": "Nie","lexemes": [{"lemma": "nie","mstag": "qub","disamb": true}]},{"index": 40,"position": [221,228],"orth": "dlatego","lexemes": [{"lemma": "dlatego","mstag": "adv","disamb": true}]},{"index": 41,"position": [228,229],"orth": ",","lexemes": [{"lemma": ",","mstag": "interp","disamb": true}]},{"index": 42,"position": [230,236],"orth": "jakoby","lexemes": [{"lemma": "jakoby","mstag": "comp","disamb": true}]},{"index": 43,"position": [237,246],"orth": "Pipidówka","lexemes": [{"lemma": "Pipidówka","mstag": "subst:sg:nom:f","disamb": 
true}]},{"index": 44,"position": [247,250],"orth": "nie","lexemes": [{"lemma": "nie","mstag": "qub","disamb": true}]},{"index": 45,"position": [251,259],"orth": "istniała","lexemes": [{"lemma": "istnieć","mstag": "praet:sg:f:imperf","disamb": true}]},{"index": 46,"position": [260,261],"orth": "w","lexemes": [{"lemma": "w","mstag": "prep:loc:nwok","disamb": true}]},{"index": 47,"position": [262,276],"orth": "rzeczywistości","lexemes": [{"lemma": "rzeczywistość","mstag": "subst:sg:loc:f","disamb": true}]},{"index": 48,"position": [277,278],"orth": "i","lexemes": [{"lemma": "i","mstag": "conj","disamb": true}]},{"index": 49,"position": [279,283],"orth": "była","lexemes": [{"lemma": "być","mstag": "praet:sg:f:imperf","disamb": true}]},{"index": 50,"position": [284,289],"orth": "tylko","lexemes": [{"lemma": "tylko","mstag": "qub","disamb": true}]},{"index": 51,"position": [290,298],"orth": "wytworem","lexemes": [{"lemma": "wytwór","mstag": "subst:sg:inst:m3","disamb": true}]},{"index": 52,"position": [299,307],"orth": "fantazji","lexemes": [{"lemma": "fantazja","mstag": "subst:sg:gen:f","disamb": true}]},{"index": 53,"position": [308,314],"orth": "autora","lexemes": [{"lemma": "autor","mstag": "subst:sg:gen:m1","disamb": true}]},{"index": 54,"position": [314,315],"orth": ",","lexemes": [{"lemma": ",","mstag": "interp","disamb": true}]},{"index": 55,"position": [316,319],"orth": "ale","lexemes": [{"lemma": "ale","mstag": "conj","disamb": true}]},{"index": 56,"position": [320,322],"orth": "po","lexemes": [{"lemma": "po","mstag": "prep:acc","disamb": true}]},{"index": 57,"position": [323,329],"orth": "prostu","lexemes": [{"lemma": "prosty","mstag": "adjp","disamb": true}]},{"index": 58,"position": [330,337],"orth": "dlatego","lexemes": [{"lemma": "dlatego","mstag": "adv","disamb": true}]},{"index": 59,"position": [337,338],"orth": ",","lexemes": [{"lemma": ",","mstag": "interp","disamb": true}]},{"index": 60,"position": [339,341],"orth": "że","lexemes": [{"lemma": 
"że","mstag": "comp","disamb": true}]},{"index": 61,"position": [342,352],"orth": "mieszkańcy","lexemes": [{"lemma": "mieszkaniec","mstag": "subst:pl:nom:m1","disamb": true}]},{"index": 62,"position": [353,358],"orth": "owego","lexemes": [{"lemma": "ów","mstag": "adj:sg:gen:m3:pos","disamb": true}]},{"index": 63,"position": [359,367],"orth": "sławnego","lexemes": [{"lemma": "sławny","mstag": "adj:sg:gen:m3:pos","disamb": true}]},{"index": 64,"position": [368,373],"orth": "grodu","lexemes": [{"lemma": "gród","mstag": "subst:sg:gen:m3","disamb": true}]},{"index": 65,"position": [373,374],"orth": ",","lexemes": [{"lemma": ",","mstag": "interp","disamb": true}]},{"index": 66,"position": [375,383],"orth": "urosłszy","lexemes": [{"lemma": "urosłszy","mstag": "ign","disamb": true}]},{"index": 67,"position": [384,385],"orth": "z","lexemes": [{"lemma": "z","mstag": "prep:inst:nwok","disamb": true}]},{"index": 68,"position": [386,392],"orth": "czasem","lexemes": [{"lemma": "czas","mstag": "subst:sg:inst:m3","disamb": true}]},{"index": 69,"position": [393,394],"orth": "w","lexemes": [{"lemma": "w","mstag": "prep:acc:nwok","disamb": true}]},{"index": 70,"position": [395,402],"orth": "ambicję","lexemes": [{"lemma": "ambicja","mstag": "subst:sg:acc:f","disamb": true}]},{"index": 71,"position": [402,403],"orth": ",","lexemes": [{"lemma": ",","mstag": "interp","disamb": true}]},{"index": 72,"position": [404,411],"orth": "uważali","lexemes": [{"lemma": "uważać","mstag": "praet:pl:m1:imperf","disamb": true}]},{"index": 73,"position": [412,414],"orth": "tę","lexemes": [{"lemma": "ten","mstag": "adj:sg:acc:f:pos","disamb": true}]},{"index": 74,"position": [415,420],"orth": "nazwę","lexemes": [{"lemma": "nazwa","mstag": "subst:sg:acc:f","disamb": true}]},{"index": 75,"position": [421,425],"orth": "jako","lexemes": [{"lemma": "jako","mstag": "conj","disamb": true}]},{"index": 76,"position": [426,436],"orth": "ubliżającą","lexemes": [{"lemma": "ubliżający","mstag": 
"adj:sg:acc:f:pos","disamb": true}]},{"index": 77,"position": [437,440],"orth": "ich","lexemes": [{"lemma": "on","mstag": "ppron3:pl:gen:m1:ter:akc:npraep","disamb": true}]},{"index": 78,"position": [441,448],"orth": "powadze","lexemes": [{"lemma": "powaga","mstag": "subst:sg:loc:f","disamb": true}]},{"index": 79,"position": [449,450],"orth": "i","lexemes": [{"lemma": "i","mstag": "conj","disamb": true}]},{"index": 80,"position": [451,457],"orth": "podali","lexemes": [{"lemma": "podać","mstag": "praet:pl:m1:perf","disamb": true}]},{"index": 81,"position": [458,460],"orth": "do","lexemes": [{"lemma": "do","mstag": "prep:gen","disamb": true}]},{"index": 82,"position": [461,462],"orth": "c","lexemes": [{"lemma": "c","mstag": "subst:sg:gen:f","disamb": true}]},{"index": 83,"position": [462,463],"orth": ".","lexemes": [{"lemma": ".","mstag": "interp","disamb": true}]},{"index": 84,"position": [464,465],"orth": "k","lexemes": [{"lemma": "K","mstag": "brev:pun","disamb": true}]},{"index": 85,"position": [465,466],"orth": ".","lexemes": [{"lemma": ".","mstag": "interp","disamb": true}]},{"index": 86,"position": [467,480],"orth": "namiestnictwa","lexemes": [{"lemma": "namiestnictwo","mstag": "subst:sg:gen:n","disamb": true}]},{"index": 87,"position": [481,488],"orth": "pokorną","lexemes": [{"lemma": "pokorny","mstag": "adj:sg:acc:f:pos","disamb": true}]},{"index": 88,"position": [489,495],"orth": "prośbę","lexemes": [{"lemma": "prośba","mstag": "subst:sg:acc:f","disamb": true}]},{"index": 89,"position": [496,497],"orth": "o","lexemes": [{"lemma": "o","mstag": "prep:acc","disamb": true}]},{"index": 90,"position": [498,508],"orth": "pozwolenie","lexemes": [{"lemma": "pozwolić","mstag": "ger:sg:acc:n:perf:aff","disamb": true}]},{"index": 91,"position": [509,520],"orth": "zamienienia","lexemes": [{"lemma": "zamienić","mstag": "ger:sg:gen:n:perf:aff","disamb": true}]},{"index": 92,"position": [521,524],"orth": "jej","lexemes": [{"lemma": "on","mstag": 
"ppron3:sg:gen:f:ter:akc:npraep","disamb": true}]},{"index": 93,"position": [525,527],"orth": "na","lexemes": [{"lemma": "na","mstag": "prep:acc","disamb": true}]},{"index": 94,"position": [528,532],"orth": "inną","lexemes": [{"lemma": "inny","mstag": "adj:sg:acc:f:pos","disamb": true}]},{"index": 95,"position": [532,533],"orth": ".","lexemes": [{"lemma": ".","mstag": "interp","disamb": true}]},{"index": 96,"position": [534,541],"orth": "Podobne","lexemes": [{"lemma": "podobny","mstag": "adj:pl:nom:f:pos","disamb": true}]},{"index": 97,"position": [542,549],"orth": "zamiany","lexemes": [{"lemma": "zamiana","mstag": "subst:pl:nom:f","disamb": true}]},{"index": 98,"position": [550,557],"orth": "nazwisk","lexemes": [{"lemma": "nazwisko","mstag": "subst:pl:gen:n","disamb": true}]},{"index": 99,"position": [558,568],"orth": "praktykują","lexemes": [{"lemma": "praktykować","mstag": "fin:pl:ter:imperf","disamb": true}]},{"index": 100,"position": [569,572],"orth": "się","lexemes": [{"lemma": "się","mstag": "qub","disamb": true}]},{"index": 101,"position": [573,577],"orth": "dość","lexemes": [{"lemma": "dość","mstag": "qub","disamb": true}]},{"index": 102,"position": [578,584],"orth": "często","lexemes": [{"lemma": "często","mstag": "adv:pos","disamb": true}]},{"index": 103,"position": [585,586],"orth": "w","lexemes": [{"lemma": "w","mstag": "prep:loc:nwok","disamb": true}]},{"index": 104,"position": [587,594],"orth": "Galicji","lexemes": [{"lemma": "Galicja","mstag": "subst:sg:loc:f","disamb": true}]},{"index": 105,"position": [594,595],"orth": ",","lexemes": [{"lemma": ",","mstag": "interp","disamb": true}]},{"index": 106,"position": [596,607],"orth": "szczególnie","lexemes": [{"lemma": "szczególnie","mstag": "qub","disamb": true}]},{"index": 107,"position": [608,609],"orth": "u","lexemes": [{"lemma": "u","mstag": "prep:gen","disamb": true}]},{"index": 108,"position": [610,622],"orth": "pojedynczych","lexemes": [{"lemma": "pojedynczy","mstag": "adj:pl:gen:f:pos","disamb": 
true}]},{"index": 109,"position": [623,627],"orth": "osób","lexemes": [{"lemma": "osoba","mstag": "subst:pl:gen:f","disamb": true}]},{"index": 110,"position": [627,628],"orth": ",","lexemes": [{"lemma": ",","mstag": "interp","disamb": true}]},{"index": 111,"position": [629,634],"orth": "które","lexemes": [{"lemma": "który","mstag": "adj:pl:nom:f:pos","disamb": true}]},{"index": 112,"position": [635,638],"orth": "nie","lexemes": [{"lemma": "nie","mstag": "qub","disamb": true}]},{"index": 113,"position": [639,645],"orth": "czując","lexemes": [{"lemma": "czuć","mstag": "pcon:imperf","disamb": true}]},{"index": 114,"position": [646,649],"orth": "się","lexemes": [{"lemma": "się","mstag": "qub","disamb": true}]},{"index": 115,"position": [650,652],"orth": "na","lexemes": [{"lemma": "na","mstag": "prep:loc","disamb": true}]},{"index": 116,"position": [653,659],"orth": "siłach","lexemes": [{"lemma": "siła","mstag": "subst:pl:loc:f","disamb": true}]},{"index": 117,"position": [660,675],"orth": "uszlachetnienia","lexemes": [{"lemma": "uszlachetnić","mstag": "ger:sg:gen:n:perf:aff","disamb": true}]},{"index": 118,"position": [676,680],"orth": "sobą","lexemes": [{"lemma": "siebie","mstag": "siebie:inst","disamb": true}]},{"index": 119,"position": [680,681],"orth": ",","lexemes": [{"lemma": ",","mstag": "interp","disamb": true}]},{"index": 120,"position": [682,687],"orth": "swymi","lexemes": [{"lemma": "swój","mstag": "adj:pl:inst:m3:pos","disamb": true}]},{"index": 121,"position": [688,695],"orth": "czynami","lexemes": [{"lemma": "czyn","mstag": "subst:pl:inst:m3","disamb": true}]},{"index": 122,"position": [696,704],"orth": "własnego","lexemes": [{"lemma": "własny","mstag": "adj:sg:gen:n:pos","disamb": true}]},{"index": 123,"position": [705,713],"orth": "nazwiska","lexemes": [{"lemma": "nazwisko","mstag": "subst:sg:gen:n","disamb": true}]},{"index": 124,"position": [713,714],"orth": ",","lexemes": [{"lemma": ",","mstag": "interp","disamb": true}]},{"index": 125,"position": 
[715,719],"orth": "chcą","lexemes": [{"lemma": "chcieć","mstag": "fin:pl:ter:imperf","disamb": true}]},{"index": 126,"position": [720,730],"orth": "nazwiskiem","lexemes": [{"lemma": "nazwisko","mstag": "subst:sg:inst:n","disamb": true}]},{"index": 127,"position": [731,743],"orth": "uszlachetnić","lexemes": [{"lemma": "uszlachetnić","mstag": "inf:perf","disamb": true}]},{"index": 128,"position": [744,750],"orth": "siebie","lexemes": [{"lemma": "siebie","mstag": "siebie:acc","disamb": true}]},{"index": 129,"position": [750,751],"orth": ",","lexemes": [{"lemma": ",","mstag": "interp","disamb": true}]},{"index": 130,"position": [752,753],"orth": "i","lexemes": [{"lemma": "i","mstag": "conj","disamb": true}]},{"index": 131,"position": [754,757],"orth": "tak","lexemes": [{"lemma": "tak","mstag": "adv:pos","disamb": true}]},{"index": 132,"position": [758,760],"orth": "np","lexemes": [{"lemma": "na przykład","mstag": "brev:pun","disamb": true}]},{"index": 133,"position": [760,761],"orth": ".","lexemes": [{"lemma": ".","mstag": "interp","disamb": true}]}],"entities": [{"text": "Galicji","type": "nam_loc_historical_region","tokens": [28,29],"positions": [174,181]},{"text": "Lodomerii","type": "nam_loc_gpe_admin1","tokens": [30,31],"positions": [184,193]},{"text": "Pipidówka","type": "nam_loc_gpe_city","tokens": [42,43],"positions": [237,246]},{"text": "Galicji","type": "nam_loc_gpe_admin1","tokens": [103,104],"positions": [587,594]}]}
\ No newline at end of file
tmp.ipynb
\ No newline at end of file
from src.detections.detection import *
from src.detections.date import *
\ No newline at end of file
from src.detections.date import *
from src.utils.subclasses import get_sublcasses
# Registry of detection classes keyed by each class's TYPE_NAME attribute.
# The classes come from get_sublcasses(Detection); presumably that helper
# yields the subclasses of Detection -- verify against src.utils.subclasses.
DETECTION_CLASSES_MAP = dict(
    (candidate.TYPE_NAME, candidate) for candidate in get_sublcasses(Detection)
)
\ No newline at end of file
......@@ -2,6 +2,7 @@ from src.detections.detection import Detection
from typing import List, Tuple, Optional
class DateDetection(Detection):
TYPE_NAME = "date"
class AnnotationPart:
TWO_DIGITS_DAY = "DD"
ONE_DIGIT_DAY = "D"
......@@ -19,7 +20,7 @@ class DateDetection(Detection):
:type format: Optional[List[Tuple[str, str]]]
"""
super().__init__("date")
super().__init__()
self.format = format
......
......@@ -3,9 +3,10 @@ from typing import Optional
@dataclass
class Detection:
def __init__(self, type_name: str) -> None:
self._type_name = type_name
TYPE_NAME = "detection"
def __init__(self) -> None:
pass
def __hash__(self) -> int:
return (type(self), *(self.__dict__.values())).__hash__()
......@@ -19,53 +20,78 @@ class MorphosyntacticInfoMixin:
return self._morpho_tag
class NameDetection(MorphosyntacticInfoMixin, Detection):
    """Detection registered under the type name ``"name"``.

    :param morpho_tag: optional morphosyntactic tag forwarded to the next
        initialiser in the MRO
    """

    TYPE_NAME = "name"

    def __init__(self, morpho_tag: Optional[str] = None) -> None:
        # The type name lives in TYPE_NAME, so the obsolete ``type_name``
        # keyword is no longer forwarded and the initialiser runs only once.
        super().__init__(morpho_tag=morpho_tag)
class SurnameDetection(MorphosyntacticInfoMixin, Detection):
    """Detection registered under the type name ``"surname"``.

    :param morpho_tag: optional morphosyntactic tag forwarded to the next
        initialiser in the MRO
    """

    TYPE_NAME = "surname"

    def __init__(self, morpho_tag: Optional[str] = None) -> None:
        # The type name lives in TYPE_NAME, so the obsolete ``type_name``
        # keyword is no longer forwarded and the initialiser runs only once.
        super().__init__(morpho_tag=morpho_tag)
class StreetNameDetection(MorphosyntacticInfoMixin, Detection):
    """Detection registered under the type name ``"street_name"``.

    :param morpho_tag: optional morphosyntactic tag forwarded to the next
        initialiser in the MRO
    """

    TYPE_NAME = "street_name"

    def __init__(self, morpho_tag: Optional[str] = None) -> None:
        # The type name lives in TYPE_NAME, so the obsolete ``type_name``
        # keyword is no longer forwarded and the initialiser runs only once.
        super().__init__(morpho_tag=morpho_tag)
class CityDetection(MorphosyntacticInfoMixin, Detection):
    """Detection registered under the type name ``"city"``.

    :param morpho_tag: optional morphosyntactic tag forwarded to the next
        initialiser in the MRO
    """

    TYPE_NAME = "city"

    def __init__(self, morpho_tag: Optional[str] = None) -> None:
        # The type name lives in TYPE_NAME, so the obsolete ``type_name``
        # keyword is no longer forwarded and the initialiser runs only once.
        super().__init__(morpho_tag=morpho_tag)
class CountryDetection(MorphosyntacticInfoMixin, Detection):
    """Detection registered under the type name ``"country"``.

    :param morpho_tag: optional morphosyntactic tag forwarded to the next
        initialiser in the MRO
    """

    TYPE_NAME = "country"

    def __init__(self, morpho_tag: Optional[str] = None) -> None:
        # The type name lives in TYPE_NAME, so the obsolete ``type_name``
        # keyword is no longer forwarded and the initialiser runs only once.
        super().__init__(morpho_tag=morpho_tag)
class UrlDetection(Detection):
    """Detection registered under the type name ``"url"``."""

    TYPE_NAME = "url"

    def __init__(self) -> None:
        # The type name is the TYPE_NAME class attribute; the obsolete
        # positional type-name argument to the base initialiser is dropped.
        super().__init__()
class UserDetection(Detection):
    """Detection registered under the type name ``"user"``."""

    TYPE_NAME = "user"

    def __init__(self) -> None:
        # The type name is the TYPE_NAME class attribute; the obsolete
        # positional type-name argument to the base initialiser is dropped.
        super().__init__()
class EmailDetection(Detection):
    """Detection registered under the type name ``"email"``."""

    TYPE_NAME = "email"

    def __init__(self) -> None:
        # The type name is the TYPE_NAME class attribute; the obsolete
        # positional type-name argument to the base initialiser is dropped.
        super().__init__()
class NumberDetection(Detection):
    """Detection registered under the type name ``"number"``."""

    TYPE_NAME = "number"

    def __init__(self) -> None:
        # The type name is the TYPE_NAME class attribute; the obsolete
        # positional type-name argument to the base initialiser is dropped.
        super().__init__()
class PhoneNumberDetection(NumberDetection):
    """Detection registered under the type name ``"phone_number"``.

    Subclasses NumberDetection and overrides only TYPE_NAME.
    """

    TYPE_NAME = "phone_number"

    def __init__(self) -> None:
        # The per-instance ``_type_name`` assignment is dropped: TYPE_NAME
        # carries the type name, and an extra instance attribute would leak
        # into ``__dict__`` for this detection class only.
        super().__init__()
class TINDetection(Detection):  # Tax Identification Number
    """Detection registered under the type name ``"tin"``."""

    TYPE_NAME = "tin"

    def __init__(self) -> None:
        # The type name is the TYPE_NAME class attribute; the obsolete
        # positional type-name argument to the base initialiser is dropped.
        super().__init__()
class KRSDetection(Detection):  # National Court Register
    """Detection registered under the type name ``"krs"``."""

    TYPE_NAME = "krs"

    def __init__(self) -> None:
        # The type name is the TYPE_NAME class attribute; the obsolete
        # positional type-name argument to the base initialiser is dropped.
        super().__init__()
class OtherDetection(Detection):  # Non standard entity
    """Detection registered under the type name ``"other"``."""

    TYPE_NAME = "other"

    def __init__(self) -> None:
        # The type name is the TYPE_NAME class attribute; the obsolete
        # positional type-name argument to the base initialiser is dropped.
        super().__init__()
\ No newline at end of file
from typing import List, Dict, Any, Tuple
from src.detectors.ner.pl_liner_n5 import detect_ner_pl_liner_n5
from src.detectors.interface import Detector
from src.detections import Detection
from src.annotations import Annotation
from src.detections import Detection, MorphosyntacticInfoMixin
from src.annotations import Annotation, NerAnnotation, MorphosyntacticAnnotation
from src.detections import DETECTION_CLASSES_MAP
class NerDetector(Detector):
    """Convert NER annotations into detection objects via a configurable mapping."""

    def __init__(self, detection_mapping: Dict[str, str], language: str = "pl") -> None:
        """
        :param detection_mapping: maps NER annotation types to detection type
            names; the names are looked up in DETECTION_CLASSES_MAP
        :type detection_mapping: Dict[str, str]
        :param language: language code, stored on the instance
        :type language: str
        """
        self._language = language
        self._detection_mapping = detection_mapping

    def detect(
        self, text: str, annotations: List[Tuple[int, int, Annotation]]
    ) -> List[Tuple[int, int, Detection]]:
        """
        Build detections for every NER annotation whose type is mapped.

        :param text: the analysed text (not used by this detector)
        :type text: str
        :param annotations: (start, end, annotation) triples
        :type annotations: List[Tuple[int, int, Annotation]]
        :return: (start, end, detection) triples
        :rtype: List[Tuple[int, int, Detection]]
        :raises KeyError: if a mapped detection type name is missing from
            DETECTION_CLASSES_MAP
        """
        # First pass: gather all morphosyntactic tags and the mapped NER spans,
        # so the tag lookup below does not depend on the annotation order.
        morpho_tags = {}
        ner_detections = []
        for start, end, annotation in annotations:
            if isinstance(annotation, MorphosyntacticAnnotation):
                morpho_tags[(start, end)] = annotation.morphosyntactic_tag
            elif isinstance(annotation, NerAnnotation):
                ner_type = annotation.ner_type
                if ner_type in self._detection_mapping:
                    detection_class = DETECTION_CLASSES_MAP[
                        self._detection_mapping[ner_type]
                    ]
                    ner_detections.append((start, end, detection_class))

        # Second pass: instantiate detections, passing the morphosyntactic tag
        # only to classes that support it and only for exactly matching spans.
        result = []
        for start, end, detection_class in ner_detections:
            kwargs = {}
            if (
                issubclass(detection_class, MorphosyntacticInfoMixin)
                and (start, end) in morpho_tags
            ):
                kwargs["morpho_tag"] = morpho_tags[(start, end)]
            result.append((start, end, detection_class(**kwargs)))
        return result


def detect_ner(
    annotations: List[Tuple[int, int, Annotation]], language: str
) -> List[Tuple[int, int, str]]:
    """
    Language-dispatching helper; no longer called by NerDetector.detect.

    :param annotations: (start, end, annotation) triples
    :param language: language code; only "pl" is supported
    :return: whatever detect_ner_pl_liner_n5 returns for the annotations
    :raises NotImplementedError: for any language other than "pl"
    """
    if language == "pl":
        return detect_ner_pl_liner_n5(annotations)
    raise NotImplementedError(f"Language {language} is not supported.")
from typing import List, Tuple, Dict
from src.utils.utils import subdict
from src.detections import OtherDetection, Detection
from src.mappings.ner_pl_n5_mapping import NER_PL_N5_MAPPING
from src.annotations import Annotation, NerAnnotation, MorphosyntacticAnnotation
def detect_ner_pl_liner_n5(
    annotations: List[Tuple[int, int, Annotation]],
) -> List[Tuple[int, int, Detection]]:
    """
    Detects ner entities in the text based on liner_n5 NER ontology.

    :param annotations: (start, end, annotation) triples; only NerAnnotation
        and MorphosyntacticAnnotation instances are used
    :type annotations: List[Tuple[int, int, Annotation]]
    :return: a list of (start, end, detection) tuples, one per NER annotation
        whose type is a key of NER_PL_N5_MAPPING
    :rtype: List[Tuple[int, int, Detection]]
    """
    ner_annotations = []
    morpho_tags = {}
    for start, end, annotation in annotations:
        if (
            isinstance(annotation, NerAnnotation)
            and annotation.ner_type in NER_PL_N5_MAPPING
        ):
            ner_annotations.append((start, end, annotation))
        # Deliberately not ``elif``: an annotation that is both kinds is
        # recorded in both collections.
        if isinstance(annotation, MorphosyntacticAnnotation):
            morpho_tags[(start, end)] = annotation.morphosyntactic_tag

    # Every kept annotation has a mapped type, so the mapping is indexed
    # directly; the tag is None when no morphosyntactic annotation covers
    # exactly the same span.
    return [
        (
            start,
            end,
            NER_PL_N5_MAPPING[ner_annotation.ner_type](
                morpho_tag=morpho_tags.get((start, end))
            ),
        )
        for start, end, ner_annotation in ner_annotations
    ]
from typing import Dict, List, Optional, Type
from collections import defaultdict
from src.detections import Detection, MorphosyntacticInfoMixin, DETECTION_CLASSES_MAP
from src.dictionaries.morphosyntactic.interface import MorphosyntacticDictionary
import random
class NERFileMorphosyntacticDictionary(MorphosyntacticDictionary):
    """Replacement dictionary loaded from a tab-separated file.

    Each non-empty line of the file holds four tab-separated fields:
    ``ner_tag``, ``word``, ``lemma`` and ``morpho_tag``. Entries are stored as
    ``dictionary[ner_tag][morpho_tag][lemma] = word``.
    """

    def __init__(
        self,
        dictionary_path: Optional[str] = None,
        always_replace: bool = True,
    ) -> None:
        """
        :param dictionary_path: path to the dictionary file; when None the
            dictionary starts empty
        :param always_replace: when True, fall back to a random entry of any
            type if no entry matches the detection's type and tag
        """
        super().__init__()
        self._dictionary = None
        self._always_replace = always_replace
        self._from_file(dictionary_path)

    def _from_file(
        self, path_to_dictionary: Optional[str]
    ) -> None:
        """
        Load the dictionary from ``path_to_dictionary`` (UTF-8 encoded).

        :raises ValueError: if a non-empty line does not have exactly four
            tab-separated fields
        """
        replacement_dictionary = defaultdict(lambda: defaultdict(dict))

        # The constructor's default path is None; treat it as "no file"
        # instead of failing inside open().
        if path_to_dictionary is not None:
            with open(path_to_dictionary, "r", encoding="utf-8") as file:
                for line_number, line in enumerate(file, start=1):
                    line = line.strip()
                    if not line:
                        # Blank lines (e.g. a trailing newline) carry no entry.
                        continue
                    fields = line.split("\t")
                    if len(fields) != 4:
                        raise ValueError(
                            f"{path_to_dictionary}:{line_number}: expected 4 "
                            f"tab-separated fields, got {len(fields)}"
                        )
                    ner_tag, word, lemma, morpho_tag = fields
                    replacement_dictionary[ner_tag][morpho_tag][lemma] = word

        self._dictionary = replacement_dictionary

    def get_supported_detection_classes(self) -> List[Type[Detection]]:
        """
        Returns a list of supported detection classes

        :raises KeyError: if the file contains a tag that is not a key of
            DETECTION_CLASSES_MAP
        """
        return [DETECTION_CLASSES_MAP[name] for name in self._dictionary]

    def get_random_replacement(self, original_entry: Detection) -> Optional[str]:
        """
        Pick a random replacement word for ``original_entry``.

        A word stored under the entry's TYPE_NAME and morphosyntactic tag is
        preferred. If none exists and ``always_replace`` is set, a word from a
        random type and tag is returned instead.

        :return: the replacement word, or None if nothing can be chosen
        """
        if not self._dictionary:
            # Nothing to choose from; random.choice would raise on empty input.
            return None

        original_entry_type = type(original_entry)
        original_entry_type_name = original_entry_type.TYPE_NAME

        result = None
        if issubclass(original_entry_type, MorphosyntacticInfoMixin):
            morpho_tag = original_entry.morpho_tag
            # Membership tests (not indexing) keep the defaultdict from
            # creating empty entries as a side effect of the lookup.
            if (
                original_entry_type_name in self._dictionary
                and morpho_tag in self._dictionary[original_entry_type_name]
            ):
                result = random.choice(
                    list(self._dictionary[original_entry_type_name][morpho_tag].values())
                )

        if result is None and self._always_replace:
            random_type = random.choice(list(self._dictionary.keys()))
            random_tag = random.choice(list(self._dictionary[random_type].keys()))
            result = random.choice(
                list(self._dictionary[random_type][random_tag].values())
            )

        return result
\ No newline at end of file
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment