Skip to content
Snippets Groups Projects
Commit e25eadab authored by Michał Pogoda's avatar Michał Pogoda
Browse files

Implement support for wiktorner

parent 63784ace
No related branches found
No related tags found
2 merge requests!10Anonimizer v2,!7Better coverage
Pipeline #7844 failed
Showing
with 8418 additions and 8285 deletions
defaults:
- paths: default
- detectors: all
- replacers: tag
- suppressor: order_based
- input_parser: ccl
- pipeline: default
- configuration: ccl
- _self_
\ No newline at end of file
language: "pl"
\ No newline at end of file
# @package _global_
defaults:
- /paths: default
- /detectors: all
- /replacers: tag
- /suppressor: order_based
- /input_parser: ccl
- /pipeline: default
- _self_
language: "pl"
\ No newline at end of file
# @package _global_
defaults:
- /paths: default
- /detectors: all_ner_kpwr_ext
- /replacers: tag
- /suppressor: order_based
- /input_parser: wiktor_ner
- /pipeline: sequential_jsonl
- _self_
language: "pl"
\ No newline at end of file
defaults:
- date
- email
- ner
- ner_n5
- phone
- url
- user
- number
\ No newline at end of file
defaults:
- date
- email
- ner_kpwr_ext
- phone
- url
- user
- number
\ No newline at end of file
ner:
_target_: src.detectors.ner.NerDetector
language: ${language}
\ No newline at end of file
ner:
_target_: src.detectors.ner.NerDetector
language: ${language}
detection_mapping:
"nam_liv_person_first": "name"
"nam_liv_person_last": "surname"
"nam_fac_road": "street_name"
"nam_loc_gpe_city": "city"
"nam_loc_gpe_country": "country"
"nam_loc_gpe_admin1": "country" # TODO: Implement better mapping for this
"nam_loc_historical_region": "country" # TODO: Implement better mapping for this
ner:
_target_: src.detectors.ner.NerDetector
language: ${language}
detection_mapping:
"person_first_nam": "name"
"person_last_nam": "surname"
"road_nam": "street_name"
"city_nam": "city"
"country_nam": "country"
\ No newline at end of file
_target_: src.input_parsers.wiktor_ner.WiktorNERInputParser
\ No newline at end of file
_target_: src.pipeline.sequential_jsonl.SequentialJSONLPipeline
input_parser: ${input_parser}
detectors: ${detectors}
suppressor: ${suppressor}
replacers: ${replacers}
\ No newline at end of file
ner:
_target_: src.replacers.ner_replacer.NERReplacer
dictionary:
_target_: src.dictionaries.morphosyntactic.pl_ner.PlNERMorphosyntacticDictionary
_target_: src.dictionaries.morphosyntactic.ner_file.NERFileMorphosyntacticDictionary
dictionary_path: ${paths.dictionaries_path}/pl_dict.txt
\ No newline at end of file
This diff is collapsed.
{"filename": "bb4a16ff-33de-4478-939d-12db67d750b1","text": "ROZDZIAŁ I. CO NIECO O SAMEJ PIPIDÓWCE Przede wszystkim muszę uprzedzić z góry czytelników, aby się daremnie nie trudzili nad szukaniem wyżej wyrażonego miasteczka na mapach Galicji i Lodomerii, bo go tam nie znajdą. Nie dlatego, jakoby Pipidówka nie istniała w rzeczywistości i była tylko wytworem fantazji autora, ale po prostu dlatego, że mieszkańcy owego sławnego grodu, urosłszy z czasem w ambicję, uważali tę nazwę jako ubliżającą ich powadze i podali do c. k. namiestnictwa pokorną prośbę o pozwolenie zamienienia jej na inną. Podobne zamiany nazwisk praktykują się dość często w Galicji, szczególnie u pojedynczych osób, które nie czując się na siłach uszlachetnienia sobą, swymi czynami własnego nazwiska, chcą nazwiskiem uszlachetnić siebie, i tak np.","tokens": [{"index": 1,"position": [0,8],"orth": "ROZDZIAŁ","lexemes": [{"lemma": "rozdział","mstag": "subst:sg:nom:m3","disamb": true}]},{"index": 2,"position": [9,10],"orth": "I","lexemes": [{"lemma": "I","mstag": "adj:sg:nom:m3:pos","disamb": true}]},{"index": 3,"position": [10,11],"orth": ".","lexemes": [{"lemma": ".","mstag": "interp","disamb": true}]},{"index": 4,"position": [12,14],"orth": "CO","lexemes": [{"lemma": "co","mstag": "conj","disamb": true}]},{"index": 5,"position": [15,20],"orth": "NIECO","lexemes": [{"lemma": "nieco","mstag": "adv","disamb": true}]},{"index": 6,"position": [21,22],"orth": "O","lexemes": [{"lemma": "o","mstag": "prep:loc","disamb": true}]},{"index": 7,"position": [23,28],"orth": "SAMEJ","lexemes": [{"lemma": "sam","mstag": "adj:sg:loc:f:pos","disamb": true}]},{"index": 8,"position": [29,38],"orth": "PIPIDÓWCE","lexemes": [{"lemma": "Pipidówka","mstag": "subst:sg:loc:f","disamb": true}]},{"index": 9,"position": [39,45],"orth": "Przede","lexemes": [{"lemma": "przed","mstag": "prep:inst:wok","disamb": true}]},{"index": 10,"position": [46,55],"orth": "wszystkim","lexemes": [{"lemma": "wszystko","mstag": 
"subst:sg:inst:n","disamb": true}]},{"index": 11,"position": [56,61],"orth": "muszę","lexemes": [{"lemma": "musieć","mstag": "fin:sg:pri:imperf","disamb": true}]},{"index": 12,"position": [62,71],"orth": "uprzedzić","lexemes": [{"lemma": "uprzedzić","mstag": "inf:perf","disamb": true}]},{"index": 13,"position": [72,73],"orth": "z","lexemes": [{"lemma": "z","mstag": "prep:gen:nwok","disamb": true}]},{"index": 14,"position": [74,78],"orth": "góry","lexemes": [{"lemma": "góra","mstag": "subst:sg:gen:f","disamb": true}]},{"index": 15,"position": [79,90],"orth": "czytelników","lexemes": [{"lemma": "czytelnik","mstag": "subst:pl:gen:m1","disamb": true}]},{"index": 16,"position": [90,91],"orth": ",","lexemes": [{"lemma": ",","mstag": "interp","disamb": true}]},{"index": 17,"position": [92,95],"orth": "aby","lexemes": [{"lemma": "aby","mstag": "comp","disamb": true}]},{"index": 18,"position": [96,99],"orth": "się","lexemes": [{"lemma": "się","mstag": "qub","disamb": true}]},{"index": 19,"position": [100,108],"orth": "daremnie","lexemes": [{"lemma": "daremnie","mstag": "adv:pos","disamb": true}]},{"index": 20,"position": [109,112],"orth": "nie","lexemes": [{"lemma": "nie","mstag": "qub","disamb": true}]},{"index": 21,"position": [113,121],"orth": "trudzili","lexemes": [{"lemma": "trudzić","mstag": "praet:pl:m1:imperf","disamb": true}]},{"index": 22,"position": [122,125],"orth": "nad","lexemes": [{"lemma": "nad","mstag": "prep:inst:nwok","disamb": true}]},{"index": 23,"position": [126,135],"orth": "szukaniem","lexemes": [{"lemma": "szukać","mstag": "ger:sg:inst:n:imperf:aff","disamb": true}]},{"index": 24,"position": [136,141],"orth": "wyżej","lexemes": [{"lemma": "wysoko","mstag": "adv:com","disamb": true}]},{"index": 25,"position": [142,152],"orth": "wyrażonego","lexemes": [{"lemma": "wyrazić","mstag": "ppas:sg:gen:n:perf:aff","disamb": true}]},{"index": 26,"position": [153,163],"orth": "miasteczka","lexemes": [{"lemma": "miasteczko","mstag": "subst:sg:gen:n","disamb": 
true}]},{"index": 27,"position": [164,166],"orth": "na","lexemes": [{"lemma": "na","mstag": "prep:loc","disamb": true}]},{"index": 28,"position": [167,173],"orth": "mapach","lexemes": [{"lemma": "mapa","mstag": "subst:pl:loc:f","disamb": true}]},{"index": 29,"position": [174,181],"orth": "Galicji","lexemes": [{"lemma": "Galicja","mstag": "subst:sg:gen:f","disamb": true}]},{"index": 30,"position": [182,183],"orth": "i","lexemes": [{"lemma": "i","mstag": "conj","disamb": true}]},{"index": 31,"position": [184,193],"orth": "Lodomerii","lexemes": [{"lemma": "Lodomerii","mstag": "ign","disamb": true}]},{"index": 32,"position": [193,194],"orth": ",","lexemes": [{"lemma": ",","mstag": "interp","disamb": true}]},{"index": 33,"position": [195,197],"orth": "bo","lexemes": [{"lemma": "bo","mstag": "comp","disamb": true}]},{"index": 34,"position": [198,200],"orth": "go","lexemes": [{"lemma": "on","mstag": "ppron3:sg:gen:m1:ter:nakc:npraep","disamb": true}]},{"index": 35,"position": [201,204],"orth": "tam","lexemes": [{"lemma": "tam","mstag": "adv","disamb": true}]},{"index": 36,"position": [205,208],"orth": "nie","lexemes": [{"lemma": "nie","mstag": "qub","disamb": true}]},{"index": 37,"position": [209,215],"orth": "znajdą","lexemes": [{"lemma": "znaleźć","mstag": "fin:pl:ter:perf","disamb": true}]},{"index": 38,"position": [215,216],"orth": ".","lexemes": [{"lemma": ".","mstag": "interp","disamb": true}]},{"index": 39,"position": [217,220],"orth": "Nie","lexemes": [{"lemma": "nie","mstag": "qub","disamb": true}]},{"index": 40,"position": [221,228],"orth": "dlatego","lexemes": [{"lemma": "dlatego","mstag": "adv","disamb": true}]},{"index": 41,"position": [228,229],"orth": ",","lexemes": [{"lemma": ",","mstag": "interp","disamb": true}]},{"index": 42,"position": [230,236],"orth": "jakoby","lexemes": [{"lemma": "jakoby","mstag": "comp","disamb": true}]},{"index": 43,"position": [237,246],"orth": "Pipidówka","lexemes": [{"lemma": "Pipidówka","mstag": "subst:sg:nom:f","disamb": 
true}]},{"index": 44,"position": [247,250],"orth": "nie","lexemes": [{"lemma": "nie","mstag": "qub","disamb": true}]},{"index": 45,"position": [251,259],"orth": "istniała","lexemes": [{"lemma": "istnieć","mstag": "praet:sg:f:imperf","disamb": true}]},{"index": 46,"position": [260,261],"orth": "w","lexemes": [{"lemma": "w","mstag": "prep:loc:nwok","disamb": true}]},{"index": 47,"position": [262,276],"orth": "rzeczywistości","lexemes": [{"lemma": "rzeczywistość","mstag": "subst:sg:loc:f","disamb": true}]},{"index": 48,"position": [277,278],"orth": "i","lexemes": [{"lemma": "i","mstag": "conj","disamb": true}]},{"index": 49,"position": [279,283],"orth": "była","lexemes": [{"lemma": "być","mstag": "praet:sg:f:imperf","disamb": true}]},{"index": 50,"position": [284,289],"orth": "tylko","lexemes": [{"lemma": "tylko","mstag": "qub","disamb": true}]},{"index": 51,"position": [290,298],"orth": "wytworem","lexemes": [{"lemma": "wytwór","mstag": "subst:sg:inst:m3","disamb": true}]},{"index": 52,"position": [299,307],"orth": "fantazji","lexemes": [{"lemma": "fantazja","mstag": "subst:sg:gen:f","disamb": true}]},{"index": 53,"position": [308,314],"orth": "autora","lexemes": [{"lemma": "autor","mstag": "subst:sg:gen:m1","disamb": true}]},{"index": 54,"position": [314,315],"orth": ",","lexemes": [{"lemma": ",","mstag": "interp","disamb": true}]},{"index": 55,"position": [316,319],"orth": "ale","lexemes": [{"lemma": "ale","mstag": "conj","disamb": true}]},{"index": 56,"position": [320,322],"orth": "po","lexemes": [{"lemma": "po","mstag": "prep:acc","disamb": true}]},{"index": 57,"position": [323,329],"orth": "prostu","lexemes": [{"lemma": "prosty","mstag": "adjp","disamb": true}]},{"index": 58,"position": [330,337],"orth": "dlatego","lexemes": [{"lemma": "dlatego","mstag": "adv","disamb": true}]},{"index": 59,"position": [337,338],"orth": ",","lexemes": [{"lemma": ",","mstag": "interp","disamb": true}]},{"index": 60,"position": [339,341],"orth": "że","lexemes": [{"lemma": 
"że","mstag": "comp","disamb": true}]},{"index": 61,"position": [342,352],"orth": "mieszkańcy","lexemes": [{"lemma": "mieszkaniec","mstag": "subst:pl:nom:m1","disamb": true}]},{"index": 62,"position": [353,358],"orth": "owego","lexemes": [{"lemma": "ów","mstag": "adj:sg:gen:m3:pos","disamb": true}]},{"index": 63,"position": [359,367],"orth": "sławnego","lexemes": [{"lemma": "sławny","mstag": "adj:sg:gen:m3:pos","disamb": true}]},{"index": 64,"position": [368,373],"orth": "grodu","lexemes": [{"lemma": "gród","mstag": "subst:sg:gen:m3","disamb": true}]},{"index": 65,"position": [373,374],"orth": ",","lexemes": [{"lemma": ",","mstag": "interp","disamb": true}]},{"index": 66,"position": [375,383],"orth": "urosłszy","lexemes": [{"lemma": "urosłszy","mstag": "ign","disamb": true}]},{"index": 67,"position": [384,385],"orth": "z","lexemes": [{"lemma": "z","mstag": "prep:inst:nwok","disamb": true}]},{"index": 68,"position": [386,392],"orth": "czasem","lexemes": [{"lemma": "czas","mstag": "subst:sg:inst:m3","disamb": true}]},{"index": 69,"position": [393,394],"orth": "w","lexemes": [{"lemma": "w","mstag": "prep:acc:nwok","disamb": true}]},{"index": 70,"position": [395,402],"orth": "ambicję","lexemes": [{"lemma": "ambicja","mstag": "subst:sg:acc:f","disamb": true}]},{"index": 71,"position": [402,403],"orth": ",","lexemes": [{"lemma": ",","mstag": "interp","disamb": true}]},{"index": 72,"position": [404,411],"orth": "uważali","lexemes": [{"lemma": "uważać","mstag": "praet:pl:m1:imperf","disamb": true}]},{"index": 73,"position": [412,414],"orth": "tę","lexemes": [{"lemma": "ten","mstag": "adj:sg:acc:f:pos","disamb": true}]},{"index": 74,"position": [415,420],"orth": "nazwę","lexemes": [{"lemma": "nazwa","mstag": "subst:sg:acc:f","disamb": true}]},{"index": 75,"position": [421,425],"orth": "jako","lexemes": [{"lemma": "jako","mstag": "conj","disamb": true}]},{"index": 76,"position": [426,436],"orth": "ubliżającą","lexemes": [{"lemma": "ubliżający","mstag": 
"adj:sg:acc:f:pos","disamb": true}]},{"index": 77,"position": [437,440],"orth": "ich","lexemes": [{"lemma": "on","mstag": "ppron3:pl:gen:m1:ter:akc:npraep","disamb": true}]},{"index": 78,"position": [441,448],"orth": "powadze","lexemes": [{"lemma": "powaga","mstag": "subst:sg:loc:f","disamb": true}]},{"index": 79,"position": [449,450],"orth": "i","lexemes": [{"lemma": "i","mstag": "conj","disamb": true}]},{"index": 80,"position": [451,457],"orth": "podali","lexemes": [{"lemma": "podać","mstag": "praet:pl:m1:perf","disamb": true}]},{"index": 81,"position": [458,460],"orth": "do","lexemes": [{"lemma": "do","mstag": "prep:gen","disamb": true}]},{"index": 82,"position": [461,462],"orth": "c","lexemes": [{"lemma": "c","mstag": "subst:sg:gen:f","disamb": true}]},{"index": 83,"position": [462,463],"orth": ".","lexemes": [{"lemma": ".","mstag": "interp","disamb": true}]},{"index": 84,"position": [464,465],"orth": "k","lexemes": [{"lemma": "K","mstag": "brev:pun","disamb": true}]},{"index": 85,"position": [465,466],"orth": ".","lexemes": [{"lemma": ".","mstag": "interp","disamb": true}]},{"index": 86,"position": [467,480],"orth": "namiestnictwa","lexemes": [{"lemma": "namiestnictwo","mstag": "subst:sg:gen:n","disamb": true}]},{"index": 87,"position": [481,488],"orth": "pokorną","lexemes": [{"lemma": "pokorny","mstag": "adj:sg:acc:f:pos","disamb": true}]},{"index": 88,"position": [489,495],"orth": "prośbę","lexemes": [{"lemma": "prośba","mstag": "subst:sg:acc:f","disamb": true}]},{"index": 89,"position": [496,497],"orth": "o","lexemes": [{"lemma": "o","mstag": "prep:acc","disamb": true}]},{"index": 90,"position": [498,508],"orth": "pozwolenie","lexemes": [{"lemma": "pozwolić","mstag": "ger:sg:acc:n:perf:aff","disamb": true}]},{"index": 91,"position": [509,520],"orth": "zamienienia","lexemes": [{"lemma": "zamienić","mstag": "ger:sg:gen:n:perf:aff","disamb": true}]},{"index": 92,"position": [521,524],"orth": "jej","lexemes": [{"lemma": "on","mstag": 
"ppron3:sg:gen:f:ter:akc:npraep","disamb": true}]},{"index": 93,"position": [525,527],"orth": "na","lexemes": [{"lemma": "na","mstag": "prep:acc","disamb": true}]},{"index": 94,"position": [528,532],"orth": "inną","lexemes": [{"lemma": "inny","mstag": "adj:sg:acc:f:pos","disamb": true}]},{"index": 95,"position": [532,533],"orth": ".","lexemes": [{"lemma": ".","mstag": "interp","disamb": true}]},{"index": 96,"position": [534,541],"orth": "Podobne","lexemes": [{"lemma": "podobny","mstag": "adj:pl:nom:f:pos","disamb": true}]},{"index": 97,"position": [542,549],"orth": "zamiany","lexemes": [{"lemma": "zamiana","mstag": "subst:pl:nom:f","disamb": true}]},{"index": 98,"position": [550,557],"orth": "nazwisk","lexemes": [{"lemma": "nazwisko","mstag": "subst:pl:gen:n","disamb": true}]},{"index": 99,"position": [558,568],"orth": "praktykują","lexemes": [{"lemma": "praktykować","mstag": "fin:pl:ter:imperf","disamb": true}]},{"index": 100,"position": [569,572],"orth": "się","lexemes": [{"lemma": "się","mstag": "qub","disamb": true}]},{"index": 101,"position": [573,577],"orth": "dość","lexemes": [{"lemma": "dość","mstag": "qub","disamb": true}]},{"index": 102,"position": [578,584],"orth": "często","lexemes": [{"lemma": "często","mstag": "adv:pos","disamb": true}]},{"index": 103,"position": [585,586],"orth": "w","lexemes": [{"lemma": "w","mstag": "prep:loc:nwok","disamb": true}]},{"index": 104,"position": [587,594],"orth": "Galicji","lexemes": [{"lemma": "Galicja","mstag": "subst:sg:loc:f","disamb": true}]},{"index": 105,"position": [594,595],"orth": ",","lexemes": [{"lemma": ",","mstag": "interp","disamb": true}]},{"index": 106,"position": [596,607],"orth": "szczególnie","lexemes": [{"lemma": "szczególnie","mstag": "qub","disamb": true}]},{"index": 107,"position": [608,609],"orth": "u","lexemes": [{"lemma": "u","mstag": "prep:gen","disamb": true}]},{"index": 108,"position": [610,622],"orth": "pojedynczych","lexemes": [{"lemma": "pojedynczy","mstag": "adj:pl:gen:f:pos","disamb": 
true}]},{"index": 109,"position": [623,627],"orth": "osób","lexemes": [{"lemma": "osoba","mstag": "subst:pl:gen:f","disamb": true}]},{"index": 110,"position": [627,628],"orth": ",","lexemes": [{"lemma": ",","mstag": "interp","disamb": true}]},{"index": 111,"position": [629,634],"orth": "które","lexemes": [{"lemma": "który","mstag": "adj:pl:nom:f:pos","disamb": true}]},{"index": 112,"position": [635,638],"orth": "nie","lexemes": [{"lemma": "nie","mstag": "qub","disamb": true}]},{"index": 113,"position": [639,645],"orth": "czując","lexemes": [{"lemma": "czuć","mstag": "pcon:imperf","disamb": true}]},{"index": 114,"position": [646,649],"orth": "się","lexemes": [{"lemma": "się","mstag": "qub","disamb": true}]},{"index": 115,"position": [650,652],"orth": "na","lexemes": [{"lemma": "na","mstag": "prep:loc","disamb": true}]},{"index": 116,"position": [653,659],"orth": "siłach","lexemes": [{"lemma": "siła","mstag": "subst:pl:loc:f","disamb": true}]},{"index": 117,"position": [660,675],"orth": "uszlachetnienia","lexemes": [{"lemma": "uszlachetnić","mstag": "ger:sg:gen:n:perf:aff","disamb": true}]},{"index": 118,"position": [676,680],"orth": "sobą","lexemes": [{"lemma": "siebie","mstag": "siebie:inst","disamb": true}]},{"index": 119,"position": [680,681],"orth": ",","lexemes": [{"lemma": ",","mstag": "interp","disamb": true}]},{"index": 120,"position": [682,687],"orth": "swymi","lexemes": [{"lemma": "swój","mstag": "adj:pl:inst:m3:pos","disamb": true}]},{"index": 121,"position": [688,695],"orth": "czynami","lexemes": [{"lemma": "czyn","mstag": "subst:pl:inst:m3","disamb": true}]},{"index": 122,"position": [696,704],"orth": "własnego","lexemes": [{"lemma": "własny","mstag": "adj:sg:gen:n:pos","disamb": true}]},{"index": 123,"position": [705,713],"orth": "nazwiska","lexemes": [{"lemma": "nazwisko","mstag": "subst:sg:gen:n","disamb": true}]},{"index": 124,"position": [713,714],"orth": ",","lexemes": [{"lemma": ",","mstag": "interp","disamb": true}]},{"index": 125,"position": 
[715,719],"orth": "chcą","lexemes": [{"lemma": "chcieć","mstag": "fin:pl:ter:imperf","disamb": true}]},{"index": 126,"position": [720,730],"orth": "nazwiskiem","lexemes": [{"lemma": "nazwisko","mstag": "subst:sg:inst:n","disamb": true}]},{"index": 127,"position": [731,743],"orth": "uszlachetnić","lexemes": [{"lemma": "uszlachetnić","mstag": "inf:perf","disamb": true}]},{"index": 128,"position": [744,750],"orth": "siebie","lexemes": [{"lemma": "siebie","mstag": "siebie:acc","disamb": true}]},{"index": 129,"position": [750,751],"orth": ",","lexemes": [{"lemma": ",","mstag": "interp","disamb": true}]},{"index": 130,"position": [752,753],"orth": "i","lexemes": [{"lemma": "i","mstag": "conj","disamb": true}]},{"index": 131,"position": [754,757],"orth": "tak","lexemes": [{"lemma": "tak","mstag": "adv:pos","disamb": true}]},{"index": 132,"position": [758,760],"orth": "np","lexemes": [{"lemma": "na przykład","mstag": "brev:pun","disamb": true}]},{"index": 133,"position": [760,761],"orth": ".","lexemes": [{"lemma": ".","mstag": "interp","disamb": true}]}],"entities": [{"text": "Galicji","type": "nam_loc_historical_region","tokens": [28,29],"positions": [174,181]},{"text": "Lodomerii","type": "nam_loc_gpe_admin1","tokens": [30,31],"positions": [184,193]},{"text": "Pipidówka","type": "nam_loc_gpe_city","tokens": [42,43],"positions": [237,246]},{"text": "Galicji","type": "nam_loc_gpe_admin1","tokens": [103,104],"positions": [587,594]}]}
\ No newline at end of file
tmp.ipynb
\ No newline at end of file
from src.detections.detection import *
from src.detections.date import *
from src.utils.subclasses import get_sublcasses
# Registry used to look a detection class up by its type name: every class
# returned by get_sublcasses(Detection) is stored under its class-level
# TYPE_NAME. If two classes share a TYPE_NAME, the one yielded last wins.
# NOTE(review): presumably this relies on the detection modules imported
# above being loaded first so their classes are discoverable - confirm
# against get_sublcasses.
DETECTION_CLASSES_MAP = dict(
    (subclass.TYPE_NAME, subclass) for subclass in get_sublcasses(Detection)
)
\ No newline at end of file
......@@ -2,6 +2,7 @@ from src.detections.detection import Detection
from typing import List, Tuple, Optional
class DateDetection(Detection):
TYPE_NAME = "date"
class AnnotationPart:
TWO_DIGITS_DAY = "DD"
ONE_DIGIT_DAY = "D"
......@@ -19,7 +20,7 @@ class DateDetection(Detection):
:type format: Optional[List[Tuple[str, str]]]
"""
super().__init__("date")
super().__init__()
self.format = format
......
......@@ -3,8 +3,9 @@ from typing import Optional
@dataclass
class Detection:
def __init__(self, type_name: str) -> None:
self._type_name = type_name
TYPE_NAME = "detection"
def __init__(self) -> None:
pass
def __hash__(self) -> int:
return (type(self), *(self.__dict__.values())).__hash__()
......@@ -19,53 +20,78 @@ class MorphosyntacticInfoMixin:
return self._morpho_tag
class NameDetection(MorphosyntacticInfoMixin, Detection):
TYPE_NAME = "name"
def __init__(self, morpho_tag: Optional[str] = None) -> None:
super().__init__(morpho_tag=morpho_tag, type_name="name")
super().__init__(morpho_tag=morpho_tag)
class SurnameDetection(MorphosyntacticInfoMixin, Detection):
TYPE_NAME = "surname"
def __init__(self, morpho_tag: Optional[str] = None) -> None:
super().__init__(morpho_tag=morpho_tag, type_name="surname")
super().__init__(morpho_tag=morpho_tag)
class StreetNameDetection(MorphosyntacticInfoMixin, Detection):
TYPE_NAME = "street_name"
def __init__(self, morpho_tag: Optional[str] = None) -> None:
super().__init__(morpho_tag=morpho_tag, type_name="street_name")
super().__init__(morpho_tag=morpho_tag)
class CityDetection(MorphosyntacticInfoMixin, Detection):
TYPE_NAME = "city"
def __init__(self, morpho_tag: Optional[str] = None) -> None:
super().__init__(morpho_tag=morpho_tag, type_name="city")
super().__init__(morpho_tag=morpho_tag)
class CountryDetection(MorphosyntacticInfoMixin, Detection):
TYPE_NAME = "country"
def __init__(self, morpho_tag: Optional[str] = None) -> None:
super().__init__(morpho_tag=morpho_tag, type_name="country")
super().__init__(morpho_tag=morpho_tag)
class UrlDetection(Detection):
TYPE_NAME = "url"
def __init__(self) -> None:
super().__init__("url")
super().__init__()
class UserDetection(Detection):
TYPE_NAME = "user"
def __init__(self) -> None:
super().__init__("user")
super().__init__()
class EmailDetection(Detection):
TYPE_NAME = "email"
def __init__(self) -> None:
super().__init__("email")
super().__init__()
class NumberDetection(Detection):
TYPE_NAME = "number"
def __init__(self) -> None:
super().__init__("number")
super().__init__()
class PhoneNumberDetection(NumberDetection):
TYPE_NAME = "phone_number"
def __init__(self) -> None:
super().__init__()
self._type_name = "phone_number"
class TINDetection(Detection): # Tax Identification Number
TYPE_NAME = "tin"
def __init__(self) -> None:
super().__init__("tin")
super().__init__()
class KRSDetection(Detection): # National Court Register
TYPE_NAME = "krs"
def __init__(self) -> None:
super().__init__("krs")
super().__init__()
class OtherDetection(Detection): # Non standard entity
TYPE_NAME = "other"
def __init__(self) -> None:
super().__init__("other")
\ No newline at end of file
super().__init__()
\ No newline at end of file
from typing import List, Dict, Any, Tuple
from src.detectors.ner.pl_liner_n5 import detect_ner_pl_liner_n5
from src.detectors.interface import Detector
from src.detections import Detection
from src.annotations import Annotation
from src.detections import Detection, MorphosyntacticInfoMixin
from src.annotations import Annotation, NerAnnotation, MorphosyntacticAnnotation
from src.detections import DETECTION_CLASSES_MAP
class NerDetector(Detector):
def __init__(self, language: str = "pl") -> None:
def __init__(self, detection_mapping: Dict[str, str], language: str = "pl") -> None:
self._language = language
self._detection_mapping = detection_mapping
def detect(
self, text: str, annotations: List[Tuple[int, int, Annotation]]
) -> List[Tuple[int, int, str]]:
return detect_ner(annotations, self._language)
morpho_tags = dict()
ner_detections = []
def detect_ner(
annotations: List[Tuple[int, int, Annotation]], language: str
) -> List[Tuple[int, int, str]]:
if language == "pl":
return detect_ner_pl_liner_n5(annotations)
else:
raise NotImplementedError(f"Language {language} is not supported.")
for annotation in annotations:
start, end, annotation = annotation
if isinstance(annotation, MorphosyntacticAnnotation):
morpho_tags[(start, end)] = annotation.morphosyntactic_tag
elif isinstance(annotation, NerAnnotation):
ner_type = annotation.ner_type
if ner_type in self._detection_mapping:
detection_class = DETECTION_CLASSES_MAP[self._detection_mapping[ner_type]]
ner_detections.append((start, end, detection_class))
result = []
for start, end, ner_detection in ner_detections:
kwargs = dict()
if issubclass(ner_detection, MorphosyntacticInfoMixin):
if (start, end) in morpho_tags:
kwargs["morpho_tag"] = morpho_tags[(start, end)]
result.append((start, end, ner_detection(**kwargs)))
return result
from typing import List, Tuple, Dict
from src.utils.utils import subdict
from src.detections import OtherDetection, Detection
from src.mappings.ner_pl_n5_mapping import NER_PL_N5_MAPPING
from src.annotations import Annotation, NerAnnotation, MorphosyntacticAnnotation
def detect_ner_pl_liner_n5(
    annotations: List[Tuple[int, int, Annotation]],
) -> List[Tuple[int, int, Detection]]:
    """
    Detects ner entities in the text based on liner_n5 NER ontology.

    Only NER annotations whose ``ner_type`` is a key of ``NER_PL_N5_MAPPING``
    produce a detection. When a morphosyntactic annotation covers exactly the
    same ``(start, end)`` span, its tag is passed to the detection as
    ``morpho_tag``; otherwise ``morpho_tag`` is ``None``.

    :param annotations: a list of (start, end, annotation) tuples
    :type annotations: List[Tuple[int, int, Annotation]]

    :return: a list of tuples containing (start, end, detection)
    :rtype: List[Tuple[int, int, Detection]]
    """
    ner_annotations = []
    morpho_tags = {}

    for start, end, annotation in annotations:
        # Two independent checks (not elif): an annotation that is both a NER
        # and a morphosyntactic annotation contributes to both collections.
        if (
            isinstance(annotation, NerAnnotation)
            and annotation.ner_type in NER_PL_N5_MAPPING
        ):
            ner_annotations.append((start, end, annotation))

        if isinstance(annotation, MorphosyntacticAnnotation):
            # If several tags share a span, the last one seen wins.
            morpho_tags[(start, end)] = annotation.morphosyntactic_tag

    # Every collected annotation has a mapped type, so the mapping is indexed
    # directly; no fallback detection class is needed here.
    return [
        (
            start,
            end,
            NER_PL_N5_MAPPING[ner_annotation.ner_type](
                morpho_tag=morpho_tags.get((start, end))
            ),
        )
        for start, end, ner_annotation in ner_annotations
    ]
from typing import Dict, List, Optional, Type
from collections import defaultdict
from src.detections import Detection, MorphosyntacticInfoMixin, DETECTION_CLASSES_MAP
from src.dictionaries.morphosyntactic.interface import MorphosyntacticDictionary
import random
class NERFileMorphosyntacticDictionary(MorphosyntacticDictionary):
    """
    Replacement dictionary loaded from a tab-separated text file.

    Every non-empty line of the file holds four tab-separated fields:
    ``ner_tag``, ``word``, ``lemma`` and ``morpho_tag``. Entries are stored as
    ``ner_tag -> morpho_tag -> lemma -> word``.
    """

    def __init__(
        self,
        dictionary_path: Optional[str] = None,
        always_replace=True,
    ) -> None:
        """
        :param dictionary_path: path to the dictionary file; when None the
            dictionary starts empty instead of failing on ``open(None)``
        :type dictionary_path: Optional[str]
        :param always_replace: when True, fall back to a random dictionary
            entry if no entry matches the detection type and morpho tag
        :type always_replace: bool
        """
        super().__init__()
        self._dictionary = defaultdict(lambda: defaultdict(dict))
        self._always_replace = always_replace

        if dictionary_path is not None:
            self._from_file(dictionary_path)

    def _from_file(
        self, path_to_dictionary: str
    ) -> None:
        """
        Loads the dictionary from a file, replacing the current content.

        :param path_to_dictionary: path to the tab-separated dictionary file
        :type path_to_dictionary: str

        :raises ValueError: if a non-empty line does not have exactly 4 fields
        """
        replacement_dictionary = defaultdict(lambda: defaultdict(dict))

        with open(path_to_dictionary, "r", encoding="utf-8") as file:
            for line_number, line in enumerate(file, start=1):
                line = line.strip()
                if not line:
                    # Blank lines (e.g. a trailing newline) carry no entry.
                    continue

                fields = line.split("\t")
                if len(fields) != 4:
                    raise ValueError(
                        f"{path_to_dictionary}:{line_number}: expected 4 "
                        f"tab-separated fields, got {len(fields)}"
                    )

                ner_tag, word, lemma, morpho_tag = fields
                replacement_dictionary[ner_tag][morpho_tag][lemma] = word

        self._dictionary = replacement_dictionary

    def get_supported_detection_classes(self) -> List[Type[Detection]]:
        """
        Returns a list of supported detection classes

        :raises KeyError: if the dictionary contains a tag that is not a key
            of DETECTION_CLASSES_MAP
        """
        return [DETECTION_CLASSES_MAP[name] for name in self._dictionary]

    def get_random_replacement(self, original_entry: Detection) -> Optional[str]:
        """
        Returns a random replacement word for the given detection.

        A word stored under the detection's type name and morpho tag is
        preferred. If there is none and ``always_replace`` is set, a word is
        drawn from a randomly chosen type and tag instead.

        :param original_entry: detection to find a replacement for
        :type original_entry: Detection

        :return: replacement word, or None when nothing can be drawn
        :rtype: Optional[str]
        """
        original_entry_type = type(original_entry)
        original_entry_type_name = original_entry_type.TYPE_NAME

        if issubclass(original_entry_type, MorphosyntacticInfoMixin):
            # .get() is used so that lookups never create empty defaultdict
            # entries for unknown types or tags.
            candidates = self._dictionary.get(original_entry_type_name, {}).get(
                original_entry.morpho_tag
            )
            if candidates:
                return random.choice(list(candidates.values()))

        # An empty dictionary has nothing to draw from; random.choice would
        # raise IndexError on an empty sequence, so return None instead.
        if self._always_replace and self._dictionary:
            random_type = random.choice(list(self._dictionary.keys()))
            random_tag = random.choice(list(self._dictionary[random_type].keys()))
            return random.choice(
                list(self._dictionary[random_type][random_tag].values())
            )

        return None
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment