From 63784aceda776eece84c0b7aee7a0fb8b34f81c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Pogoda?= <mipo57@e-science.pl> Date: Thu, 26 Jan 2023 15:00:20 +0100 Subject: [PATCH] Working polish version --- config/config.yaml | 1 + config/detectors/number.yaml | 2 + config/paths/default.yaml | 3 +- config/replacers/ner.yaml | 3 +- config/replacers/number.yaml | 2 + config/replacers/pseudo.yaml | 1 + dictionaries/pl_dict.txt | 1 + .../marek_kowalski_pojechal_do_wroclawia.ccl | 84 ++++++++++++++++--- src/annotations/__init__.py | 1 + src/annotations/annotations.py | 14 ++++ src/detections/detection.py | 14 ++-- src/detectors/ner/ner.py | 7 +- src/detectors/ner/pl_liner_n5.py | 24 +++--- src/detectors/number/__init__.py | 1 + src/detectors/number/number.py | 25 ++++++ src/dictionaries/morphosyntactic/interface.py | 8 +- src/dictionaries/morphosyntactic/pl_ner.py | 29 +++++-- src/input_parsers/ccl.py | 27 +++--- src/replacers/ner_replacer.py | 4 + src/replacers/number_replacer.py | 43 ++++++++++ tests/detectors/ner/test_pl_liner_n5.py | 28 ++++--- tests/input_parsers/test_ccl.py | 26 +++--- 22 files changed, 272 insertions(+), 76 deletions(-) create mode 100644 config/detectors/number.yaml create mode 100644 config/replacers/number.yaml create mode 100644 src/annotations/__init__.py create mode 100644 src/annotations/annotations.py create mode 100644 src/detectors/number/__init__.py create mode 100644 src/detectors/number/number.py create mode 100644 src/replacers/number_replacer.py diff --git a/config/config.yaml b/config/config.yaml index d4c077e..071d64b 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -1,4 +1,5 @@ defaults: + - paths: default - detectors: all - replacers: tag - suppressor: order_based diff --git a/config/detectors/number.yaml b/config/detectors/number.yaml new file mode 100644 index 0000000..e7714ca --- /dev/null +++ b/config/detectors/number.yaml @@ -0,0 +1,2 @@ +number: + _target_: src.detectors.number.NumberDetector \ No newline at end of file diff --git a/config/paths/default.yaml b/config/paths/default.yaml index 657a167..f903ce8 100644 --- a/config/paths/default.yaml +++ b/config/paths/default.yaml @@ -1 +1,2 @@ -dictionaries_path: dictionaries \ No newline at end of file +root_path: ./ +dictionaries_path: ${paths.root_path}/dictionaries \ No newline at end of file diff --git a/config/replacers/ner.yaml b/config/replacers/ner.yaml index 8d20018..e8e6cb9 100644 --- a/config/replacers/ner.yaml +++ b/config/replacers/ner.yaml @@ -2,4 +2,5 @@ ner: _target_: src.replacers.ner_replacer.NERReplacer dictionary: _target_: src.dictionaries.morphosyntactic.pl_ner.PlNERMorphosyntacticDictionary - dictionary_path: ${paths.dictionaries_path}/pl_dict.txt \ No newline at end of file + dictionary_path: ${paths.dictionaries_path}/pl_dict.txt + \ No newline at end of file diff --git a/config/replacers/number.yaml b/config/replacers/number.yaml new file mode 100644 index 0000000..0d494fd --- /dev/null +++ b/config/replacers/number.yaml @@ -0,0 +1,2 @@ +number: + _target_: src.replacers.number_replacer.NumberReplacer \ No newline at end of file diff --git a/config/replacers/pseudo.yaml b/config/replacers/pseudo.yaml index 5c4a301..4538b57 100644 --- a/config/replacers/pseudo.yaml +++ b/config/replacers/pseudo.yaml @@ -3,4 +3,5 @@ defaults: - email - ner - user + - number - tag # Fallback to tag replacement if no other replacement is found \ No newline at end of file diff --git a/dictionaries/pl_dict.txt b/dictionaries/pl_dict.txt index ad0cb84..281114c 100644 --- a/dictionaries/pl_dict.txt +++ b/dictionaries/pl_dict.txt @@ -2456,6 +2456,7 @@ nam_loc_gpe_city Helsinek Helsinki pl:gen:n nam_loc_gpe_city Helsinkami Helsinki pl:inst:n nam_loc_gpe_city Helsinkach Helsinki pl:loc:n nam_loc_gpe_city Helsinkom Helsinki pl:dat:n +nam_liv_person Henryk Henryk sg:nom:m1 nam_liv_person Henryka Henryk sg:acc:m3 nam_liv_person Henrykowi Henryk sg:dat:m3 nam_liv_person Henrykiem Henryk sg:inst:m3 diff --git a/example_inputs/marek_kowalski_pojechal_do_wroclawia.ccl b/example_inputs/marek_kowalski_pojechal_do_wroclawia.ccl index f1459ba..453acb9 100644 --- a/example_inputs/marek_kowalski_pojechal_do_wroclawia.ccl +++ b/example_inputs/marek_kowalski_pojechal_do_wroclawia.ccl @@ -4,9 +4,8 @@ <chunk type="p" id="ch1"> <sentence id="s1"> <tok> - <orth>Marek</orth> - <lex disamb="1"><base>Marek</base><ctag>subst:sg:nom:m1</ctag></lex> - <lex disamb="1"><base>marek</base><ctag>subst:sg:nom:m1</ctag></lex> + <orth>Jan</orth> + <lex disamb="1"><base>Jan</base><ctag>subst:sg:nom:m1</ctag></lex> <ann chan="person_first_nam" head="1">1</ann> <ann chan="person_last_nam">0</ann> <ann chan="city_nam">0</ann> @@ -19,34 +18,95 @@ <ann chan="city_nam">0</ann> </tok> <tok> - <orth>pojechał</orth> - <lex disamb="1"><base>pojechać</base><ctag>praet:sg:m1:perf</ctag></lex> + <orth>(</orth> + <lex disamb="1"><base>(</base><ctag>interp</ctag></lex> <ann chan="person_first_nam">0</ann> <ann chan="person_last_nam">0</ann> <ann chan="city_nam">0</ann> </tok> + <ns/> <tok> - <orth>do</orth> - <lex disamb="1"><base>do</base><ctag>prep:gen</ctag></lex> + <orth>numer</orth> + <lex disamb="1"><base>numer</base><ctag>subst:sg:nom:m3</ctag></lex> <ann chan="person_first_nam">0</ann> <ann chan="person_last_nam">0</ann> <ann chan="city_nam">0</ann> </tok> <tok> - <orth>Wrocławia</orth> - <lex disamb="1"><base>Wrocław</base><ctag>subst:sg:gen:m3</ctag></lex> + <orth>telefonu</orth> + <lex disamb="1"><base>telefon</base><ctag>subst:sg:gen:m3</ctag></lex> <ann chan="person_first_nam">0</ann> <ann chan="person_last_nam">0</ann> - <ann chan="city_nam" head="1">1</ann> + <ann chan="city_nam">0</ann> + </tok> + <tok> + <orth>123</orth> + <lex disamb="1"><base>123</base><ctag>num:pl:nom:m1:rec</ctag></lex> + <ann chan="person_first_nam">0</ann> + <ann chan="person_last_nam">0</ann> + <ann chan="city_nam">0</ann> + </tok> + <ns/> + <tok> + <orth>-</orth> + <lex disamb="1"><base>-</base><ctag>interp</ctag></lex> + <ann chan="person_first_nam">0</ann> + <ann chan="person_last_nam">0</ann> + <ann chan="city_nam">0</ann> + </tok> + <ns/> + <tok> + <orth>456</orth> + <lex disamb="1"><base>456</base><ctag>num:pl:nom:m1:rec</ctag></lex> + <ann chan="person_first_nam">0</ann> + <ann chan="person_last_nam">0</ann> + <ann chan="city_nam">0</ann> </tok> <ns/> <tok> - <orth>.</orth> - <lex disamb="1"><base>.</base><ctag>interp</ctag></lex> + <orth>-</orth> + <lex disamb="1"><base>-</base><ctag>interp</ctag></lex> <ann chan="person_first_nam">0</ann> <ann chan="person_last_nam">0</ann> <ann chan="city_nam">0</ann> </tok> + <ns/> + <tok> + <orth>789</orth> + <lex disamb="1"><base>789</base><ctag>num:pl:nom:m1:rec</ctag></lex> + <ann chan="person_first_nam">0</ann> + <ann chan="person_last_nam">0</ann> + <ann chan="city_nam">0</ann> + </tok> + <ns/> + <tok> + <orth>)</orth> + <lex disamb="1"><base>)</base><ctag>interp</ctag></lex> + <ann chan="person_first_nam">0</ann> + <ann chan="person_last_nam">0</ann> + <ann chan="city_nam">0</ann> + </tok> + <tok> + <orth>miesza</orth> + <lex disamb="1"><base>mieszać</base><ctag>fin:sg:ter:imperf</ctag></lex> + <ann chan="person_first_nam">0</ann> + <ann chan="person_last_nam">0</ann> + <ann chan="city_nam">0</ann> + </tok> + <tok> + <orth>we</orth> + <lex disamb="1"><base>w</base><ctag>prep:acc:wok</ctag></lex> + <ann chan="person_first_nam">0</ann> + <ann chan="person_last_nam">0</ann> + <ann chan="city_nam">0</ann> + </tok> + <tok> + <orth>Wrocławiu</orth> + <lex disamb="1"><base>Wrocław</base><ctag>subst:sg:loc:m3</ctag></lex> + <ann chan="person_first_nam">0</ann> + <ann chan="person_last_nam">0</ann> + <ann chan="city_nam" head="1">1</ann> + </tok> </sentence> </chunk> </chunkList> \ No newline at end of file diff --git a/src/annotations/__init__.py b/src/annotations/__init__.py new file mode 100644 index 0000000..d09a852 --- /dev/null +++ b/src/annotations/__init__.py @@ -0,0 +1 @@ +from src.annotations.annotations import * \ No newline at end of file diff --git a/src/annotations/annotations.py b/src/annotations/annotations.py new file mode 100644 index 0000000..fab8b10 --- /dev/null +++ b/src/annotations/annotations.py @@ -0,0 +1,14 @@ +from dataclasses import dataclass + +@dataclass +class Annotation: + def __hash__(self) -> int: + return (type(self), *(self.__dict__.values())).__hash__() + +class MorphosyntacticAnnotation(Annotation): + def __init__(self, morphosyntactic_tag) -> None: + self.morphosyntactic_tag = morphosyntactic_tag + +class NerAnnotation(Annotation): + def __init__(self, ner_type: str) -> None: + self.ner_type = ner_type \ No newline at end of file diff --git a/src/detections/detection.py b/src/detections/detection.py index 59d712d..79abbde 100644 --- a/src/detections/detection.py +++ b/src/detections/detection.py @@ -7,7 +7,7 @@ class Detection: self._type_name = type_name def __hash__(self) -> int: - return tuple(self.__dict__.values()).__hash__() + return (type(self), *(self.__dict__.values())).__hash__() class MorphosyntacticInfoMixin: def __init__(self, morpho_tag: str, *args, **kwargs) -> None: @@ -38,10 +38,6 @@ class CountryDetection(MorphosyntacticInfoMixin, Detection): def __init__(self, morpho_tag: Optional[str] = None) -> None: super().__init__(morpho_tag=morpho_tag, type_name="country") -class PhoneNumberDetection(Detection): - def __init__(self) -> None: - super().__init__("phone_number") - class UrlDetection(Detection): def __init__(self) -> None: super().__init__("url") @@ -54,6 +50,14 @@ class EmailDetection(Detection): def __init__(self) -> None: super().__init__("email") +class NumberDetection(Detection): + def __init__(self) -> None: + super().__init__("number") +class PhoneNumberDetection(NumberDetection): + def __init__(self) -> None: + super().__init__() + self._type_name = "phone_number" + class TINDetection(Detection): # Tax Identification Number def __init__(self) -> None: super().__init__("tin") diff --git a/src/detectors/ner/ner.py b/src/detectors/ner/ner.py index 6c4ae8a..1a4fdad 100644 --- a/src/detectors/ner/ner.py +++ b/src/detectors/ner/ner.py @@ -2,6 +2,7 @@ from typing import List, Dict, Any, Tuple from src.detectors.ner.pl_liner_n5 import detect_ner_pl_liner_n5 from src.detectors.interface import Detector from src.detections import Detection +from src.annotations import Annotation class NerDetector(Detector): @@ -9,15 +10,15 @@ class NerDetector(Detector): self._language = language def detect( - self, text: str, annotations: Dict[str, List[Tuple[int, int, Any]]] + self, text: str, annotations: List[Tuple[int, int, Annotation]] ) -> List[Tuple[int, int, str]]: return detect_ner(annotations, self._language) def detect_ner( - ccl_annotations: Dict[str, List[Tuple[int, int, Detection]]], language: str + annotations: List[Tuple[int, int, Annotation]], language: str ) -> List[Tuple[int, int, str]]: if language == "pl": - return detect_ner_pl_liner_n5(ccl_annotations) + return detect_ner_pl_liner_n5(annotations) else: raise NotImplementedError(f"Language {language} is not supported.") diff --git a/src/detectors/ner/pl_liner_n5.py b/src/detectors/ner/pl_liner_n5.py index d51cfa9..f11b67a 100644 --- a/src/detectors/ner/pl_liner_n5.py +++ b/src/detectors/ner/pl_liner_n5.py @@ -2,9 +2,10 @@ from typing import List, Tuple, Dict from src.utils.utils import subdict from src.detections import OtherDetection, Detection from src.mappings.ner_pl_n5_mapping import NER_PL_N5_MAPPING +from src.annotations import Annotation, NerAnnotation, MorphosyntacticAnnotation def detect_ner_pl_liner_n5( - ccl_annotations: Dict[str, List[Tuple[int, int, Detection]]] + annotations: List[Tuple[int, int, Annotation]], ) -> List[Tuple[int, int, str]]: """ Detects ner entities in the text based on liner_n5 NER ontology. @@ -14,14 +15,17 @@ def detect_ner_pl_liner_n5( :return: a list of tuples containing (start, end, entity_type) :rtype: List[Tuple[int, int, Annotation]] """ - names = subdict( - ccl_annotations, - list(NER_PL_N5_MAPPING.keys()), - all_must_be_present=False, - ) - + + ner_anotations = [] + ner_annotation_tags = dict() + for annotation in annotations: + if issubclass(annotation[2].__class__, NerAnnotation): + if annotation[2].ner_type in NER_PL_N5_MAPPING.keys(): + ner_anotations.append(annotation) + if issubclass(annotation[2].__class__, MorphosyntacticAnnotation): + ner_annotation_tags[(annotation[0], annotation[1])] = annotation[2].morphosyntactic_tag + return [ - (start, end, NER_PL_N5_MAPPING.get(entity_type, OtherDetection)()) - for entity_type, entity in names.items() - for start, end, _ in entity + (start, end, NER_PL_N5_MAPPING.get(ner_annotation.ner_type, OtherDetection)(morpho_tag=ner_annotation_tags.get((start, end), None))) + for start, end, ner_annotation in ner_anotations ] diff --git a/src/detectors/number/__init__.py b/src/detectors/number/__init__.py new file mode 100644 index 0000000..7d72f52 --- /dev/null +++ b/src/detectors/number/__init__.py @@ -0,0 +1 @@ +from src.detectors.number.number import NumberDetector \ No newline at end of file diff --git a/src/detectors/number/number.py b/src/detectors/number/number.py new file mode 100644 index 0000000..3fbfa55 --- /dev/null +++ b/src/detectors/number/number.py @@ -0,0 +1,25 @@ +import regex as re +from typing import List, Dict, Any, Tuple +from src.detections import NumberDetection +from src.detectors.interface import Detector + +NUMBER_REGEX = re.compile( + r"\d+[^a-zA-Z\d]*\d*", + re.I, +) + +class NumberDetector(Detector): + def __init__(self) -> None: + super().__init__() + + def detect( + self, text: str, annotations: Dict[str, List[Tuple[int, int, Any]]] + ) -> List[Tuple[int, int, NumberDetection]]: + NUMBER_REGEX.finditer(text) + numbers = [] + + for number in numbers: + numbers.append((number.start(), number.end(), NumberDetection())) + + + return numbers \ No newline at end of file diff --git a/src/dictionaries/morphosyntactic/interface.py b/src/dictionaries/morphosyntactic/interface.py index 3f8a66b..f8d9fa7 100644 --- a/src/dictionaries/morphosyntactic/interface.py +++ b/src/dictionaries/morphosyntactic/interface.py @@ -1,7 +1,13 @@ from src.detections import Detection -from typing import Optional +from typing import Optional, List, Type class MorphosyntacticDictionary: + def get_supported_detection_classes(self) -> List[Type[Detection]]: + """ + Returns a list of supported detection classes + """ + raise NotImplementedError() + def get_random_replacement(self, original_entry: Detection) -> Optional[str]: """ Returns a random replacement for the original entry diff --git a/src/dictionaries/morphosyntactic/pl_ner.py b/src/dictionaries/morphosyntactic/pl_ner.py index d25beae..d3b1160 100644 --- a/src/dictionaries/morphosyntactic/pl_ner.py +++ b/src/dictionaries/morphosyntactic/pl_ner.py @@ -4,12 +4,26 @@ from src.detections import Detection, OtherDetection, MorphosyntacticInfoMixin from src.dictionaries.morphosyntactic.interface import MorphosyntacticDictionary import random +from src.detections import ( + NameDetection, + SurnameDetection, + StreetNameDetection, + CityDetection, + CountryDetection, +) + +NER_PL_N5_MAPPING = { + "nam_liv_person": NameDetection, + "nam_liv_person_last": SurnameDetection, + "nam_fac_road": StreetNameDetection, + "nam_loc_gpe_city": CityDetection, + "nam_org_group_team": CountryDetection, +} class PlNERMorphosyntacticDictionary(MorphosyntacticDictionary): def __init__( self, dictionary_path: Optional[str] = None, - annotation_mapping: Optional[Dict[str, Type[Detection]]] = None, list: Optional[List[Tuple[Detection, str, str, str]]] = None, always_replace=True, ) -> None: @@ -18,8 +32,7 @@ class PlNERMorphosyntacticDictionary(MorphosyntacticDictionary): self._always_replace = always_replace if dictionary_path is not None: - assert annotation_mapping is not None - self._from_file(dictionary_path, annotation_mapping) + self._from_file(dictionary_path, NER_PL_N5_MAPPING) elif list is not None: self._from_list(list) else: @@ -36,6 +49,12 @@ class PlNERMorphosyntacticDictionary(MorphosyntacticDictionary): self._dictionary = defaultdict(lambda: defaultdict(dict)) for annotation, word, lemma, morpho_tag in list: self._dictionary[annotation][morpho_tag][lemma] = word + + def get_supported_detection_classes(self) -> List[Type[Detection]]: + """ + Returns a list of supported detection classes + """ + return list(self._dictionary.keys()) def get_random_replacement(self, original_entry: Detection) -> Optional[str]: original_entry_type = type(original_entry) @@ -43,7 +62,7 @@ class PlNERMorphosyntacticDictionary(MorphosyntacticDictionary): result = None if issubclass(original_entry_type, MorphosyntacticInfoMixin): - morpho_tag = original_entry.morpho_tag + morpho_tag = ":".join(original_entry.morpho_tag.split(":")[1:]) if ( original_entry_type in self._dictionary @@ -103,7 +122,7 @@ def load_pl_ner_replacements_dictionary( ner_tag, word, lemma, morpho_tag = line.split("\t") if ner_mapping is not None: - ner_tag = ner_mapping.get(ner_tag, OtherDetection)() + ner_tag = ner_mapping.get(ner_tag, OtherDetection) replacement_dictionary[ner_tag][morpho_tag][lemma] = word diff --git a/src/input_parsers/ccl.py b/src/input_parsers/ccl.py index 7b8bb7e..7847043 100644 --- a/src/input_parsers/ccl.py +++ b/src/input_parsers/ccl.py @@ -3,12 +3,13 @@ from lxml import etree from collections import defaultdict # from src.annotation_types_old import from src.input_parsers.interface import InputParser +from src.annotations import Annotation, MorphosyntacticAnnotation, NerAnnotation class CCLInputParser(InputParser): def __init__(self) -> None: super().__init__() - def parse(self, path_to_input: str) -> Tuple[str, Dict[str, List[Tuple[int, int, str]]]]: + def parse(self, path_to_input: str) -> List[Tuple[int, int, Annotation]]: """Parse CCL string into text and annotations. Annotations are returned as a dictionary with channel name as a key and list of tuples. @@ -17,16 +18,19 @@ class CCLInputParser(InputParser): path_to_input (str): Path to file containing CCL. Returns: - Tuple[str, Dict[str, List[Tuple[int, int, str]]]]: Text and annotations. + Tuple[str, Dict[str, List[Tuple[int, int, Annotation]]]]: Text and annotations. """ with open(path_to_input, 'r') as f: ccl = f.read() ccl_tree = etree.fromstring(ccl.strip().encode('utf-8')) - results = defaultdict(list) + results = [] text = "" + ner_annotations = [] + morphosyntactic_annotations = [] + # First token is assumed to not have space before it last_was_ns = True @@ -43,7 +47,7 @@ class CCLInputParser(InputParser): for lex in token.xpath('./lex'): if lex.attrib['disamb'] == "1": ctag = lex.xpath('./ctag')[0] - # results[AnnotationTypes.MORPHOSYNTACTIC_TAG].append((start, end, ctag.text)) + morphosyntactic_annotations.append((start, end, MorphosyntacticAnnotation(ctag.text))) break @@ -56,20 +60,17 @@ class CCLInputParser(InputParser): is_head = "head" in ann.attrib and ann.attrib['head'] == "1" if is_head: - results[channel].append((start, end, word)) - else: - if last_was_ns: - new_word = results[channel][-1][2] + word - else: - new_word = results[channel][-1][2] + " " + word - - old_start = results[channel][-1][0] + ner_annotations.append((start, end, NerAnnotation(channel))) + else: + old_start = ner_annotations[-1][0] - results[channel][-1] = (old_start, end, new_word) + ner_annotations[-1] = (old_start, end, ner_annotations[-1][2]) last_was_ns = False text += word elif token.tag == 'ns': last_was_ns = True + results = ner_annotations + morphosyntactic_annotations + return text, results \ No newline at end of file diff --git a/src/replacers/ner_replacer.py b/src/replacers/ner_replacer.py index 214f0b7..6804451 100644 --- a/src/replacers/ner_replacer.py +++ b/src/replacers/ner_replacer.py @@ -20,6 +20,10 @@ class NERReplacer(ReplacerInterface): already_replaced = dict() for item in detections: + if type(item[2]) not in self._dictionary.get_supported_detection_classes(): + not_processed.append(item) + continue + start, end, detection = item key = (text[start:end], type(detection)) diff --git a/src/replacers/number_replacer.py b/src/replacers/number_replacer.py new file mode 100644 index 0000000..d0c8f9e --- /dev/null +++ b/src/replacers/number_replacer.py @@ -0,0 +1,43 @@ +from typing import List, Tuple +from src.detections import ( + Detection, + NumberDetection, +) +from src.string_replacements import replace_and_update +from src.replacers.interface import ReplacerInterface +import random +import string + +def randomize_digits_in_text(text: str) -> str: + result = "" + + for c in text: + if c.isdigit(): + result += random.choice(string.digits) + else: + result += c + + return result + +class NumberReplacer(ReplacerInterface): + def __init__(self): + pass + + def replace(self, text: str, detections: List[Tuple[int, int, Detection]]) -> Tuple[str, List[Tuple[int, int, Detection]]]: + replacements = [] + not_processed = [] + + already_replaced = dict() + + for item in detections: + start, end, detection = item + + if isinstance(detection, NumberDetection): + if text[start:end] not in already_replaced: + already_replaced[text[start:end]] = randomize_digits_in_text(text[start:end]) + + replacements.append((start, end, already_replaced[text[start:end]])) + else: + not_processed.append(item) + + return replace_and_update(text, replacements, not_processed) \ No newline at end of file diff --git a/tests/detectors/ner/test_pl_liner_n5.py b/tests/detectors/ner/test_pl_liner_n5.py index 7af941c..08f77a9 100644 --- a/tests/detectors/ner/test_pl_liner_n5.py +++ b/tests/detectors/ner/test_pl_liner_n5.py @@ -1,23 +1,29 @@ +from src.annotations import NerAnnotation, MorphosyntacticAnnotation from src.detections import NameDetection, SurnameDetection, CityDetection from src.detectors.ner import NerDetector def test_detect_names_pl_liner_n5(): detector = NerDetector("pl") - ccl_annotations = { - 'person_first_nam': [(10, 16, 'Marian'), (100, 109, 'Magdalena')], - 'person_last_nam': [(30, 35, 'Nowak')], - 'city_nam': [(50, 59, 'Wrocławiu')], - 'some_other_annotation': [(120, 124, 'zowd')], - } + annotations = [ + (10, 16, NerAnnotation("person_first_nam")), + (100, 109, NerAnnotation("person_first_nam")), + (30, 35, NerAnnotation("person_last_nam")), + (50, 59, NerAnnotation("city_nam")), + (120, 124, NerAnnotation("some_other_annotation")), + (10, 16, MorphosyntacticAnnotation("1")), + (100, 109, MorphosyntacticAnnotation("2")), + (30, 35, MorphosyntacticAnnotation("3")), + (120, 124, MorphosyntacticAnnotation("some_other_morphosyntactic_annotation")), + ] - result = detector.detect("", ccl_annotations) + result = detector.detect("", annotations) expected = [ - (10, 16, NameDetection()), - (100, 109, NameDetection()), - (30, 35, SurnameDetection()), - (50, 59, CityDetection()), + (10, 16, NameDetection(morpho_tag="1")), + (100, 109, NameDetection(morpho_tag="2")), + (30, 35, SurnameDetection(morpho_tag="3")), + (50, 59, CityDetection(morpho_tag=None)), ] assert set(result) == set(expected) \ No newline at end of file diff --git a/tests/input_parsers/test_ccl.py b/tests/input_parsers/test_ccl.py index ec78647..498e68c 100644 --- a/tests/input_parsers/test_ccl.py +++ b/tests/input_parsers/test_ccl.py @@ -1,6 +1,7 @@ # from src.annotation_types_old import AnnotationTypes from src.input_parsers.ccl import CCLInputParser from tempfile import NamedTemporaryFile +from src.annotations import NerAnnotation, MorphosyntacticAnnotation example_ccl = """<?xml version="1.0" encoding="UTF-8"?> <!DOCTYPE chunkList SYSTEM "ccl.dtd"> @@ -50,7 +51,6 @@ example_ccl = """<?xml version="1.0" encoding="UTF-8"?> </chunkList> """ - def test_ccl_input_parser(): parser = CCLInputParser() @@ -60,16 +60,14 @@ def test_ccl_input_parser(): text, annotations = parser.parse(f.name) assert text == "Marek Kowalski pojechał do Wrocławia." - - # assert set(annotations.keys()) == set(["nam_liv", "nam_loc", AnnotationTypes.MORPHOSYNTACTIC_TAG]) - - assert annotations["nam_liv"] == [(0, 14, "Marek Kowalski")] - assert annotations["nam_loc"] == [(27, 36, "Wrocławia")] - # assert annotations[AnnotationTypes.MORPHOSYNTACTIC_TAG] == [ - # (0, 5, "subst:sg:nom:m1"), - # (6, 14, "subst:sg:nom:m1"), - # (15, 23, "praet:sg:m1:perf"), - # (24, 26, "prep:gen"), - # (27, 36, "subst:sg:gen:m3"), - # (36, 37, "interp"), - # ] + assert len(annotations) == 8 + + assert (0, 14, NerAnnotation("nam_liv")) in annotations + assert (27, 36, NerAnnotation("nam_loc")) in annotations + + assert (0, 5, MorphosyntacticAnnotation("subst:sg:nom:m1")) in annotations + assert (6, 14, MorphosyntacticAnnotation("subst:sg:nom:m1")) in annotations + assert (15, 23, MorphosyntacticAnnotation("praet:sg:m1:perf")) in annotations + assert (24, 26, MorphosyntacticAnnotation("prep:gen")) in annotations + assert (27, 36, MorphosyntacticAnnotation("subst:sg:gen:m3")) in annotations + assert (36, 37, MorphosyntacticAnnotation("interp")) in annotations \ No newline at end of file -- GitLab