diff --git a/CHANGELOG.md b/CHANGELOG.md index c3723831efe7af9858da71cd38d48cec22900699..8930d549a859f437a4cf3b5cf0b17b806b135c43 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,10 @@ # PolDeepNer2 Changelog +## 0.8.1 +### Added +- Pre-trained model pdn2-v08-nkjp-large + ## 0.8.0 ### Changed - Fixes and improvements to handle the PolEval 2018 model. diff --git a/poldeepner2/__init__.py b/poldeepner2/__init__.py index c1f802779b94adb643f0cd80886e06e2bae55f4d..1c5ce1bc23eef37f94a236f5145b575c07ee71cc 100644 --- a/poldeepner2/__init__.py +++ b/poldeepner2/__init__.py @@ -2,25 +2,21 @@ import os from pathlib import Path from poldeepner2.models import PolDeepNer2 +from poldeepner2.pipeline.tokenization import Tokenizer from poldeepner2.utils.file_utils import download_file resources = { - "pdn2-v07-kpwr-n82-base-01": { - "url": "https://s3.clarin-pl.eu/users/czuk/_public/pdn2/v07/pdn2-v07-kpwr-n82-base-01.zip", + "pdn2-v08-nkjp-large": { + "url": "https://s3.clarin-pl.eu/workers/pdn2/pdn2-v08-nkjp-large.zip", "compression": "zip", - "extractToSubfolder": False - }, - "pdn2-v07-cen-n82-base-01": { - "url": "https://s3.clarin-pl.eu/users/czuk/_public/pdn2/v07/pdn2-v07-cen-n82-base-01.zip", - "compression": "zip", - "extractToSubfolder": False - }, + "extractToSubfolder": False, + } } -def load(path_or_name: str, device: str = None, resources_path: str = ".resources") -> PolDeepNer2: +def get_model_or_download(path_or_name: str, resources_path: str = ".resources") -> str: if Path(path_or_name).exists(): path = path_or_name else: @@ -32,4 +28,10 @@ def load(path_or_name: str, device: str = None, resources_path: str = ".resource extract_to_subfolder) else: raise ValueError(f"Unknown resource name or invalid path: {path_or_name}") - return PolDeepNer2(path, device=device) + return path + + +def load(path_or_name: str, device: str = None, tokenizer: Tokenizer = None, resources_path: str = ".resources") \ + -> PolDeepNer2: + path = get_model_or_download(path_or_name, resources_path) + return PolDeepNer2(path, device=device, tokenizer=tokenizer) diff --git a/poldeepner2/model/hf_for_token_calssification.py b/poldeepner2/model/hf_for_token_calssification.py index c2d51507e1c78232899a77c7cb641d6185e30647..6d985cf8bb5d52c086fbe9a99587b634ecc85464 100644 --- a/poldeepner2/model/hf_for_token_calssification.py +++ b/poldeepner2/model/hf_for_token_calssification.py @@ -36,10 +36,13 @@ class Pdn2TokenClassification(nn.Module): super().__init__() root = Path(path) + self.metadata = {} + if config: self.config = config else: self.load_config(root) + self.load_metadata(root) if device: self.config.device = device setup_seed(self.config.seed) @@ -142,3 +145,9 @@ class Pdn2TokenClassification(nn.Module): def load_config(self, path: str): with open(str(Path(path) / 'pdn2_config.yml'), 'r') as f: self.config = yaml.load(f, Loader=yaml.Loader) + + def load_metadata(self, path: str): + path = Path(path) / 'pdn2_metadata.yml' + if path.exists(): + with open(str(path), 'r') as f: + self.metadata = yaml.load(f, Loader=yaml.Loader) diff --git a/poldeepner2/models.py b/poldeepner2/models.py index edb4dd647a50bbd64e1d6d6833088fa5660a2756..62e8b51423e16b0eb2a6a2073e74b75b217fa746 100644 --- a/poldeepner2/models.py +++ b/poldeepner2/models.py @@ -1,6 +1,6 @@ import logging import os -from typing import List +from typing import List, Dict import torch import tqdm @@ -24,11 +24,13 @@ class PolDeepNer2: processor_annotations: List[ProcessorAnnotations] = None, device: str = None): if not os.path.exists(path): raise ValueError("Model not found on path '%s'" % path) - self.model = Pdn2TokenClassification(path, device=device) self.tokenizer = tokenizer if tokenizer is not None else TokenizerFast() self.processors_annotations = processor_annotations if processor_annotations else [split_hashtags] + def metadata(self) -> Dict[str, str]: + return self.model.metadata + def process(self, sentences: [[str]], show_progress: bool = False): """ @param sentences -- array of arrays of words, [['Jan', 'z', 'Warszawy'], ['IBM', 'i', 'Apple']] diff --git a/process_poleval.py b/process_poleval.py index 6afb03d0f06454b62ac8fce5dbfde4c4046bdbba..ab2b17647ada2eb46aec12d37848ea164a5edb21 100644 --- a/process_poleval.py +++ b/process_poleval.py @@ -8,8 +8,7 @@ import time from tqdm import tqdm -from poldeepner2.models import PolDeepNer2 -from poldeepner2.pipeline.tokenization import TokenizerFast +import poldeepner2 from poldeepner2.utils.data_utils import get_poleval_dict, wrap_annotations @@ -23,9 +22,15 @@ def merge_sentences(sentences: [[str]]): def main(model_path: str, input: str, output: str, device: str, do_merge_sentences: bool = True): print("Loading the NER model ...") t0 = time.perf_counter() - ner = PolDeepNer2(model_path, device=device, tokenizer=TokenizerFast()) + ner = poldeepner2.load(model_path, device=device) time_load = time.perf_counter() - t0 + if ner.metadata(): + print("\nModel metadata\n--------------") + for key, value in ner.metadata().items(): + print(f"{key:11}: {value}") + print("") + time_preprocess, time_ner, data_size = 0, 0, 0 dict_list = [] @@ -65,7 +70,7 @@ def parse_args(): help='path to a file with a list of files') parser.add_argument('--output', required=True, metavar='PATH', help='path to a json output file') - parser.add_argument('--model', required=True, metavar='PATH', + parser.add_argument('--model', required=False, metavar='PATH', default="pdn2-v08-nkjp-large", help='model name or path to a model name') parser.add_argument('--device', required=False, default="cpu", metavar='cpu|cuda', help='device type used for processing') diff --git a/setup.py b/setup.py index d6782b3baabfc9b373f26cbf8f5ddb97f565cc3c..5c99b7e710594fc601dae33467ba85ef499df128 100644 --- a/setup.py +++ b/setup.py @@ -17,7 +17,7 @@ install_requires = [ setuptools.setup( name="poldeepner2", - version="0.8.0", + version="0.8.1", author="Michał Marcińczuk", author_email="marcinczuk@gmail.com", description="PolDeepNer2 is a tool for sequence labeling tasks based on "