Commit 74dd2978 authored by Michał Marcińczuk

Merge branch 'dev_81' into 'master'

Added model pdn2-v08-nkjp-large

See merge request !42
parents f5de5762 05697c21
Pipeline #9151 passed with stages in 1 minute and 59 seconds
@@ -2,6 +2,10 @@
 # PolDeepNer2 Changelog
+## 0.8.1
+### Added
+- Pre-trained model pdn2-v08-nkjp-large
+
 ## 0.8.0
 ### Changed
 - Fixes and improvements to handle the PolEval 2018 model.
@@ -2,25 +2,21 @@ import os
 from pathlib import Path

 from poldeepner2.models import PolDeepNer2
+from poldeepner2.pipeline.tokenization import Tokenizer
 from poldeepner2.utils.file_utils import download_file

 resources = {
-    "pdn2-v07-kpwr-n82-base-01": {
-        "url": "https://s3.clarin-pl.eu/users/czuk/_public/pdn2/v07/pdn2-v07-kpwr-n82-base-01.zip",
-        "compression": "zip",
-        "extractToSubfolder": False
-    },
-    "pdn2-v07-cen-n82-base-01": {
-        "url": "https://s3.clarin-pl.eu/users/czuk/_public/pdn2/v07/pdn2-v07-cen-n82-base-01.zip",
-        "compression": "zip",
-        "extractToSubfolder": False
-    },
+    "pdn2-v08-nkjp-large": {
+        "url": "https://s3.clarin-pl.eu/workers/pdn2/pdn2-v08-nkjp-large.zip",
+        "compression": "zip",
+        "extractToSubfolder": False,
+    }
 }


-def load(path_or_name: str, device: str = None, resources_path: str = ".resources") -> PolDeepNer2:
+def get_model_or_download(path_or_name: str, resources_path: str = ".resources") -> str:
     if Path(path_or_name).exists():
         path = path_or_name
     else:

@@ -32,4 +28,10 @@ def load(path_or_name: str, device: str = None, resources_path: str = ".resources") -> PolDeepNer2:
                           extract_to_subfolder)
         else:
             raise ValueError(f"Unknown resource name or invalid path: {path_or_name}")
-    return PolDeepNer2(path, device=device)
+    return path
+
+
+def load(path_or_name: str, device: str = None, tokenizer: Tokenizer = None, resources_path: str = ".resources") \
+        -> PolDeepNer2:
+    path = get_model_or_download(path_or_name, resources_path)
+    return PolDeepNer2(path, device=device, tokenizer=tokenizer)
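For illustration, a minimal usage sketch of the loading helpers above. It assumes network access to the S3 URL listed in the resources dict and reuses the sample sentences from the process docstring shown further down; nothing else is taken from outside this merge request.

import poldeepner2

# load() resolves the name through the resources dict: on first use it downloads
# and unpacks pdn2-v08-nkjp-large into .resources/, then builds a PolDeepNer2 instance.
ner = poldeepner2.load("pdn2-v08-nkjp-large", device="cpu")

# process() expects pre-tokenized sentences (lists of words).
annotations = ner.process([["Jan", "z", "Warszawy"], ["IBM", "i", "Apple"]])
print(annotations)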
@@ -36,10 +36,13 @@ class Pdn2TokenClassification(nn.Module):
         super().__init__()
         root = Path(path)
+        self.metadata = {}
         if config:
             self.config = config
         else:
             self.load_config(root)
+        self.load_metadata(root)
         if device:
             self.config.device = device
         setup_seed(self.config.seed)

@@ -142,3 +145,9 @@ class Pdn2TokenClassification(nn.Module):
     def load_config(self, path: str):
         with open(str(Path(path) / 'pdn2_config.yml'), 'r') as f:
             self.config = yaml.load(f, Loader=yaml.Loader)
+
+    def load_metadata(self, path: str):
+        path = Path(path) / 'pdn2_metadata.yml'
+        if path.exists():
+            with open(str(path), 'r') as f:
+                self.metadata = yaml.load(f, Loader=yaml.Loader)
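The load_metadata method above falls back to an empty dict when pdn2_metadata.yml is missing. A standalone sketch of the same YAML round trip; the metadata keys below are made up, since the real contents shipped with pdn2-v08-nkjp-large are not shown in this diff.

from pathlib import Path
import yaml

# Hypothetical metadata file; the actual keys are model-specific.
Path("pdn2_metadata.yml").write_text(yaml.dump({"name": "pdn2-v08-nkjp-large", "version": "0.8"}))

# Same pattern as load_metadata: missing file -> empty dict, otherwise the parsed YAML mapping.
metadata = {}
path = Path("pdn2_metadata.yml")
if path.exists():
    with open(str(path), "r") as f:
        metadata = yaml.load(f, Loader=yaml.Loader)
print(metadata)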
 import logging
 import os
-from typing import List
+from typing import List, Dict

 import torch
 import tqdm

@@ -24,11 +24,13 @@ class PolDeepNer2:
                  processor_annotations: List[ProcessorAnnotations] = None, device: str = None):
         if not os.path.exists(path):
             raise ValueError("Model not found on path '%s'" % path)

         self.model = Pdn2TokenClassification(path, device=device)
         self.tokenizer = tokenizer if tokenizer is not None else TokenizerFast()
         self.processors_annotations = processor_annotations if processor_annotations else [split_hashtags]

+    def metadata(self) -> Dict[str, str]:
+        return self.model.metadata
+
     def process(self, sentences: [[str]], show_progress: bool = False):
         """
         @param sentences -- array of arrays of words, [['Jan', 'z', 'Warszawy'], ['IBM', 'i', 'Apple']]
@@ -8,8 +8,7 @@ import time
 from tqdm import tqdm

-from poldeepner2.models import PolDeepNer2
-from poldeepner2.pipeline.tokenization import TokenizerFast
+import poldeepner2
 from poldeepner2.utils.data_utils import get_poleval_dict, wrap_annotations

@@ -23,9 +22,15 @@ def merge_sentences(sentences: [[str]]):
 def main(model_path: str, input: str, output: str, device: str, do_merge_sentences: bool = True):
     print("Loading the NER model ...")
     t0 = time.perf_counter()
-    ner = PolDeepNer2(model_path, device=device, tokenizer=TokenizerFast())
+    ner = poldeepner2.load(model_path, device=device)
     time_load = time.perf_counter() - t0

+    if ner.metadata():
+        print("\nModel metadata\n--------------")
+        for key, value in ner.metadata().items():
+            print(f"{key:11}: {value}")
+        print("")
+
     time_preprocess, time_ner, data_size = 0, 0, 0
     dict_list = []

@@ -65,7 +70,7 @@ def parse_args():
                         help='path to a file with a list of files')
     parser.add_argument('--output', required=True, metavar='PATH',
                         help='path to a json output file')
-    parser.add_argument('--model', required=True, metavar='PATH',
+    parser.add_argument('--model', required=False, metavar='PATH', default="pdn2-v08-nkjp-large",
                         help='model name or path to a model name')
     parser.add_argument('--device', required=False, default="cpu",
                         metavar='cpu|cuda', help='device type used for processing')
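With the new default, running the evaluation script without --model selects the packaged pdn2-v08-nkjp-large. A tiny argparse sketch of just that behavior; the parser below is a stripped-down stand-in for the script's parse_args, not the full argument list.

import argparse

# Stand-in parser reproducing only the changed --model option.
parser = argparse.ArgumentParser()
parser.add_argument('--model', required=False, metavar='PATH', default="pdn2-v08-nkjp-large",
                    help='model name or path to a model name')

# No --model given on the command line -> the new pre-trained model name is used.
print(parser.parse_args([]).model)  # pdn2-v08-nkjp-large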
@@ -17,7 +17,7 @@ install_requires = [
 setuptools.setup(
     name="poldeepner2",
-    version="0.8.0",
+    version="0.8.1",
     author="Michał Marcińczuk",
     author_email="marcinczuk@gmail.com",
     description="PolDeepNer2 is a tool for sequence labeling tasks based on "