Skip to content
Snippets Groups Projects
Commit c730fc7e authored by Wiktor Walentynowicz's avatar Wiktor Walentynowicz :construction_worker_tone1:
Browse files

Conflicts resolved.

parents 2067c441 f46ac273
Branches
Tags
3 merge requests!39Version 0.7.0,!38Version 0.7,!37Refactoring model serialization and deserialization. When loading a model the...
Pipeline #4738 passed
Showing
with 15495 additions and 114976 deletions
poldeepner2/__pycache__/*
data/POLEVAL-NER_GOLD.json
dist
/.resources/
......@@ -3,7 +3,7 @@ image: "python:3.6"
before_script:
- python --version
- pip install -r requirements.txt
- python -m spacy download pl_core_news_sm
- pip install -r requirements-dev.txt
stages:
- test
......
......@@ -2,6 +2,17 @@
# PolDeepNer2 Changelog
## 0.6.6
### Added
- Script for batch training.
### Changed
- Sequences longer than max_seq_length are split into several subsequences.
### Removed
- Model watch by wandb.
## 0.6.5
### Added
- Model for German.
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
data_train: data/cen_n82/train.txt
data_tune: data/cen_n82/valid.txt
data_test: data/cen_n82/test.txt
pretrained_path:
- hf:allegro/herbert-large-cased
- hf:allegro/herbert-base-cased
- hf:sdadas/polish-roberta-base-v1
- hf:sdadas/polish-roberta-large-v1
- hf:sdadas/polish-roberta-base-v2
- hf:sdadas/polish-roberta-large-v2
max_seq_length: 256
num_train_epochs:
# - 5
# - 10
- 20
- 50
- 100
seed:
- 101
- 102
- 103
- 104
- 105
warmup_proportion: 0.0
learning_rate: 6e-5
train_batch_size: 8
dropout: 0.3
wandb: ner-cen-n82-ttt-epochs
squeeze:
output_dir: dev
......@@ -8,8 +8,9 @@ import os
import time
# from time import time F811 redefinition of unused 'time'
from poldeepner2.models import PolDeepNer2
import poldeepner2
from poldeepner2.utils.data_utils import read_tsv
from poldeepner2.utils.seed import setup_seed
from poldeepner2.utils.sequence_labeling import classification_report
......@@ -22,14 +23,18 @@ def main(args):
"""
print("Loading the NER model ...")
ner = PolDeepNer2.load(
model=args.model,
pretrained_path=args.pretrained_path,
device=args.device,
max_seq_length=args.max_seq_length,
squeeze=args.squeeze,
# seed=args.seed
)
ner = poldeepner2.load(args.model, device=args.device)
for param in ["device", "max_seq_length", "squeeze"]:
value = args.__dict__.get(param, None)
if value is not None:
value_default = ner.model.config.__dict__.get(param)
if str(value) != str(value_default):
print(f"Forced change of the parameter: {param} '{value_default}' => '{value}'")
ner.model.config.__dict__[param] = value
if args.seed is not None:
setup_seed(args.seed)
print("Processing ...")
sentences_labels = read_tsv(os.path.join(args.input), True)
......@@ -48,12 +53,9 @@ def main(args):
print(report)
print(f"Total time : {time_processing:>8.4} second(s)")
print(f"Data size: : {data_size/1000000:>8.4} "
f"M characters")
print(f"Speed: : "
f"{data_size / 1000000 / (time_processing/60):>8.4} "
f"M characters/minute")
print(f"Number of token labels : {len(ner.label_list):>8} ")
print(f"Data size: : {data_size/1000000:>8.4} M characters")
print(f"Speed: : {data_size / 1000000 / (time_processing/60):>8.4} M characters/minute")
print(f"Number of token labels : {len(ner.model.config.labels):>8} ")
def parse_args():
......@@ -64,23 +66,17 @@ def parse_args():
"""
parser = argparse.ArgumentParser(
description='Process a single TSV with a NER model')
parser.add_argument('--input', required=True, metavar='PATH',
help='path to a file with a list of files')
parser.add_argument('--model', required=True, metavar='PATH',
help='path to NER model')
parser.add_argument('--pretrained_path', required=False, metavar='PATH',
help='pretrained XLM-Roberta model path')
parser.add_argument('--max_seq_length', required=False, default=512,
metavar='N', type=int,
help='the maximum total input sequence length after '
'WordPiece tokenization.')
parser.add_argument('--device', required=False, default="cpu",
metavar='cpu|cuda',
help='device type used for processing')
parser.add_argument('--squeeze', required=False, default=False,
action="store_true",
help='try to squeeze multiple examples into one Input '
'Feature')
parser.add_argument('--input', required=True, metavar='PATH', help='path to a file with a list of files')
parser.add_argument('--model', required=True, metavar='PATH', help='path or name of the model')
parser.add_argument('--max_seq_length', required=False, default=None, metavar='N', type=int,
help='override default values of the max_seq_length')
parser.add_argument('--device', default=None, metavar='cpu|cuda',
help='override default value of the device')
group = parser.add_mutually_exclusive_group(required=False)
group.add_argument("--squeeze", dest="squeeze", default=None, action='store_true')
group.add_argument("--no-squeeze", dest="squeeze", default=None, action='store_false')
parser.add_argument('--seed', required=False, default=None, metavar='N', type=int,
help='a seed used to initialize a number generator')
return parser.parse_args()
......
import os
from pathlib import Path
from poldeepner2.models import PolDeepNer2
from poldeepner2.utils.file_utils import download_file
# Registry of downloadable models, keyed by the name accepted by load().
# For each entry load() passes "url", the target path, "compression" and
# "extractToSubfolder" (False when absent) to download_file().
# NOTE(review): presumably "extractToSubfolder" controls whether the archive
# is unpacked into its own directory -- confirm against download_file().
resources = {
"pdn2-v07-kpwr-n82-base-01": {
"url": "https://s3.clarin-pl.eu/users/czuk/_public/pdn2/v07/pdn2-v07-kpwr-n82-base-01.zip",
"compression": "zip",
"extractToSubfolder": False
},
"pdn2-v07-cen-n82-base-01": {
"url": "https://s3.clarin-pl.eu/users/czuk/_public/pdn2/v07/pdn2-v07-cen-n82-base-01.zip",
"compression": "zip",
"extractToSubfolder": False
},
}
def load(path_or_name: str, device: str = None, resources_path: str = ".resources") -> PolDeepNer2:
    """Create a PolDeepNer2 instance from a filesystem path or a registered model name.

    Args:
        path_or_name: An existing path to a model, or a key of ``resources``.
        device: Forwarded to ``PolDeepNer2`` as the ``device`` keyword.
        resources_path: Directory under which named models are looked up and
            downloaded.

    Returns:
        A ``PolDeepNer2`` built from the resolved path.

    Raises:
        ValueError: If ``path_or_name`` is neither an existing path, nor present
            under ``resources_path``, nor a key of ``resources``.
    """
    # An existing filesystem path always takes precedence over a registry name.
    if Path(path_or_name).exists():
        return PolDeepNer2(path_or_name, device=device)

    local_path = os.path.join(resources_path, path_or_name)
    if os.path.exists(local_path):
        # Already present under resources_path -- nothing to download.
        return PolDeepNer2(local_path, device=device)

    if path_or_name not in resources:
        raise ValueError(f"Unknown resource name or invalid path: {path_or_name}")

    entry = resources[path_or_name]
    to_subfolder = entry.get("extractToSubfolder", False)
    download_file(entry["url"], local_path, entry["compression"], to_subfolder)
    return PolDeepNer2(local_path, device=device)
"""A message of shame -- documentation must be completed."""
from transformers import AutoModel, AutoTokenizer
from pathlib import Path
import yaml
from typing import List
import torch
from attr import dataclass
from dataclasses import field
from transformers import AutoModel, AutoTokenizer, AutoConfig
import torch.nn as nn
import torch.nn.functional as F
from poldeepner2.utils.seed import setup_seed
class HfModelForTokenClassification(nn.Module):
"""A message of shame -- documentation must be completed."""
def __init__(self, pretrained_path,
n_labels, hidden_size=768,
dropout_p=0.2, label_ignore_idx=0,
head_init_range=0.04, device='cuda'):
"""A message of shame -- documentation must be completed.
Args:
pretrained_path:A message of shame -- documentation must be
completed.
n_labels:A message of shame -- documentation must be completed.
hidden_size:A message of shame -- documentation must be completed.
dropout_p:A message of shame -- documentation must be completed.
label_ignore_idx:A message of shame -- documentation must be
completed.
head_init_range:A message of shame -- documentation must be
completed.
device:A message of shame -- documentation must be completed.
"""
super().__init__()
@dataclass
class Pdn2ModelConfiguration:
labels: List[str] = field(default_factory=list)
hidden_size: int = 768
dropout_p: float = 0.2
label_ignore_idx: int = 0
head_init_range: float = 0.04
device: str = 'cuda'
max_seq_length: int = 256
squeeze: bool = True
seed: int = 101
self.n_labels = n_labels
def label_count(self) -> int:
return len(self.labels) + 1
self.linear_1 = nn.Linear(hidden_size, hidden_size)
self.classification_head = nn.Linear(hidden_size, n_labels)
self.label_ignore_idx = label_ignore_idx
self.tokenizer = AutoTokenizer.from_pretrained(pretrained_path)
class Pdn2TokenClassification(nn.Module):
self.model = AutoModel.from_pretrained(pretrained_path)
def __init__(self, path: str, config: Pdn2ModelConfiguration = None, device: str = None):
self.dropout = nn.Dropout(dropout_p)
self.device = device
super().__init__()
root = Path(path)
# initializing classification head
self.classification_head.weight.data.normal_(mean=0.0,
std=head_init_range)
if config:
self.config = config
else:
self.load_config(root)
if device:
self.config.device = device
setup_seed(self.config.seed)
self.linear_1 = nn.Linear(self.config.hidden_size, self.config.hidden_size)
self.classification_head = nn.Linear(self.config.hidden_size, self.config.label_count())
self.label_ignore_idx = self.config.label_ignore_idx
self.dropout = nn.Dropout(self.config.dropout_p)
self.classification_head.weight.data.normal_(mean=0.0, std=self.config.head_init_range)
self.tokenizer = AutoTokenizer.from_pretrained(path)
if Pdn2TokenClassification.get_model_path(root).exists():
config = AutoConfig.from_pretrained(path)
self.model = AutoModel.from_config(config)
state_dict = torch.load(open(str(Pdn2TokenClassification.get_model_path(root)), 'rb'),
map_location=self.config.device)
self.load_state_dict(state_dict)
self.eval()
self.to(self.config.device)
else:
self.model = AutoModel.from_pretrained(path)
def forward(self, inputs_ids, labels, labels_mask, valid_mask):
"""Computes a forward pass through the sequence tagging model.
......@@ -76,7 +93,7 @@ class HfModelForTokenClassification(nn.Module):
if labels_mask is not None:
active_loss = valid_mask.view(-1) == 1
active_logits = logits.view(-1, self.n_labels)[active_loss]
active_logits = logits.view(-1, self.config.label_count())[active_loss]
active_labels = labels.view(-1)[active_loss]
loss = loss_fct(active_logits, active_labels)
else:
......@@ -99,3 +116,23 @@ class HfModelForTokenClassification(nn.Module):
tensor_ids = self.tokenizer.encode(s)
# remove <s> and </s> ids
return tensor_ids[1:-1]
@staticmethod
def get_model_path(root: Path) -> Path:
return root / 'pdn2_model.pt'
def save(self, path: str):
    """Serialize the model into the directory ``path``.

    The directory (including missing parents) is created when needed. The
    following artifacts are written into it: the module state dict (at the
    location given by ``get_model_path``), the PDN2 configuration (via
    ``save_config``), the configuration of the wrapped transformer model and
    the tokenizer files (both via their ``save_pretrained`` methods).

    Args:
        path: Target directory.
    """
    root = Path(path)
    root.mkdir(parents=True, exist_ok=True)
    # Open the weights file in a context manager so the handle is flushed and
    # closed deterministically instead of being left to the garbage collector.
    with open(str(Pdn2TokenClassification.get_model_path(root)), 'wb') as weights_file:
        torch.save(self.state_dict(), weights_file)
    self.save_config(path)
    self.model.config.save_pretrained(str(root))
    self.tokenizer.save_pretrained(str(root))
def save_config(self, path: str):
    """Dump ``self.config`` as YAML into ``pdn2_config.yml`` inside directory ``path``.

    Args:
        path: Directory that receives the configuration file.
    """
    config_file = Path(path) / 'pdn2_config.yml'
    with config_file.open('w') as stream:
        yaml.dump(self.config, stream)
def load_config(self, path: str):
    """Read ``pdn2_config.yml`` from directory ``path`` and store the result in ``self.config``.

    Args:
        path: Directory that contains the configuration file.
    """
    config_file = Path(path) / 'pdn2_config.yml'
    # NOTE(review): yaml.load with FullLoader builds Python objects from the
    # file and is not meant for untrusted input -- confirm that model
    # directories (including downloaded ones) always come from trusted sources.
    with config_file.open('r') as stream:
        self.config = yaml.load(stream, Loader=yaml.FullLoader)
......@@ -2,241 +2,40 @@
import logging
import os
import codecs
from pathlib import Path
from typing import List
import torch
import tqdm
from poldeepner2.data.document import Document
from poldeepner2.pipeline.lemmatization import ProcessorAnnotations, \
AnnotationLemmatizerPolem
from poldeepner2.pipeline.lemmatization import ProcessorAnnotations, AnnotationLemmatizerPolem
from poldeepner2.utils.annotation import AnnotationText
from poldeepner2.utils.file_utils import download_file
from poldeepner2.utils.seed import setup_seed
from poldeepner2.pipeline import tokenization
from poldeepner2.pipeline.tokenization import TokenizerSpaces, Tokenizer
from poldeepner2.utils.data_utils import InputExample, \
convert_examples_to_features, create_dataset, read_params, \
from poldeepner2.pipeline.tokenization import Tokenizer, TokenizerFast
from poldeepner2.utils.data_utils import InputExample, create_dataset, \
wrap_annotations, align_tokens_with_text
from poldeepner2.model.xlmr_for_token_classification \
import XLMRForTokenClassification
from poldeepner2.model.hf_for_token_calssification \
import HfModelForTokenClassification
from poldeepner2.model.hf_for_token_calssification import Pdn2TokenClassification
from torch.utils.data.dataloader import DataLoader
resources = {
"roberta_base_fairseq": {
"url": "https://github.com/sdadas/polish-roberta/releases/download"
"/models/roberta_base_fairseq.zip",
"compression": "zip",
"extractToSubfolder": True
},
"roberta_large_fairseq": {
"url": "https://github.com/sdadas/polish-roberta/releases/download"
"/models/roberta_large_fairseq.zip",
"compression": "zip",
"extractToSubfolder": True
},
"kpwr_n82_base": {
"url": "https://minio.clarin-pl.eu/public/models/poldeepner2"
"/kpwr_n82_base.zip",
"compression": "zip"
},
"kpwr_n82_large": {
"url": "https://minio.clarin-pl.eu/public/models/poldeepner2"
"/kpwr_n82_large.zip",
"compression": "zip"
},
"cen_n82_base": {
"url": "https://minio.clarin-pl.eu/public/models/poldeepner2"
"/cen_n82_base.zip",
"compression": "zip"
},
"cen_n82_large": {
"url": "https://minio.clarin-pl.eu/public/models/poldeepner2"
"/cen_n82_large.zip",
"compression": "zip"
},
"nkjp_base": {
"url": "https://minio.clarin-pl.eu/public/models/poldeepner2"
"/nkjp_base.zip",
"compression": "zip"
},
"nkjp_base_sq": {
"url": "https://minio.clarin-pl.eu/public/models/poldeepner2"
"/nkjp_base_sq.zip",
"compression": "zip"
},
"pdn2_conll_english_large": {
"url": "https://minio.clarin-pl.eu/public/models/poldeepner2"
"/pdn2_conll_english_large.zip",
"compression": "zip"
},
"nkjp_herbert_large_sq": {
"url": "https://minio.clarin-pl.eu/public/models/poldeepner2"
"/nkjp_herbert_large_sq.zip",
"compression": "zip"
},
"pdn2_cen_n82_roberta_large_sq_krnnt_cuda.pdn2": {
"url": "https://minio.clarin-pl.eu/public/models/poldeepner2"
"/pdn2_cen_n82_roberta_large_sq_krnnt_cuda.pdn2",
"compression": None
},
"pdn2_nkjp_herbert_large_sq_fast_cuda.pdn2": {
"url": "https://minio.clarin-pl.eu/public/models/poldeepner2"
"/pdn2_nkjp_herbert_large_sq_fast_cuda.pdn2",
"compression": None
},
"pdn2_ner_de_germeval_xmlr_base_cuda0.pdn2": {
"url": "https://minio.clarin-pl.eu/public/models/poldeepner2"
"/pdn2_ner_de_germeval_xmlr_base_cuda0.pdn2",
"compression": None
}
}
from poldeepner2.utils.preprocess import split_hashtags
from poldeepner2.utils.sequences import convert_examples_to_features
class PolDeepNer2:
"""A message of shame -- documentation must be completed."""
def __init__(self, model_path: str,
pretrained_path: str,
device="cpu",
squeeze=False,
max_seq_length=256,
tokenizer=TokenizerSpaces(),
processor_annotations: [ProcessorAnnotations] = [],
seed=377):
"""A message of shame -- documentation must be completed.
Args:
model_path:A message of shame -- documentation must be completed.
pretrained_path:A message of shame -- documentation must be
completed.
device:A message of shame -- documentation must be completed.
squeeze:A message of shame -- documentation must be completed.
max_seq_length:A message of shame -- documentation must be
completed.
tokenizer:A message of shame -- documentation must be completed.
processor_annotations:A message of shame -- documentation must be
completed.
seed:A message of shame -- documentation must be completed.
"""
setup_seed(seed)
if not os.path.exists(model_path):
raise ValueError("Model not found on path '%s'" % model_path)
dropout, num_labels, label_list = read_params(model_path)
self.label_list = label_list
hidden_size = 1024 if 'large' in pretrained_path else 768
if pretrained_path.startswith('hf:'):
pretrained_dir = pretrained_path[len('hf:'):]
model = HfModelForTokenClassification(
pretrained_path=pretrained_dir, n_labels=num_labels,
hidden_size=hidden_size, dropout_p=dropout,
device=device)
else:
pretrained_dir = pretrained_path
if ":" in pretrained_dir:
pretrained_dir = pretrained_dir.split(":")[1]
if not os.path.exists(pretrained_dir):
raise ValueError("RoBERTa language model not found on path "
"'%s'" % pretrained_dir)
model = XLMRForTokenClassification(
pretrained_path=pretrained_dir, n_labels=num_labels,
dropout_p=dropout, device=device,
hidden_size=hidden_size)
logging.info(f"Device: {device}")
state_dict = torch.load(
open(os.path.join(model_path, 'model.pt'), 'rb'),
map_location=device)
model.load_state_dict(state_dict)
model.eval()
model.to(device)
self.model = model
self.device = device
self.squeeze = squeeze
self.max_seq_length = max_seq_length
self.tokenizer = tokenizer
self.processors_annotations = processor_annotations
@staticmethod
def load_labels(path):
"""A message of shame -- documentation must be completed.
Args:
path:A message of shame -- documentation must be completed.
Returns:A message of shame -- documentation must be completed.
"""
return [line.strip() for line in
codecs.open(path, "r", "utf8").readlines()
if len(line.strip()) > 0]
@staticmethod
def load(model: str,
resources_path: str = ".models",
pretrained_path: str = None,
device: str = "cpu",
squeeze: bool = False,
max_seq_length: int = 256,
tokenizer: Tokenizer = None,
processor_annotations: [ProcessorAnnotations] = [],
seed=377) -> "PolDeepNer2":
"""A message of shame -- documentation must be completed.
def __init__(self, path: str, tokenizer: Tokenizer = None,
processor_annotations: List[ProcessorAnnotations] = None, device: str = None):
if not os.path.exists(path):
raise ValueError("Model not found on path '%s'" % path)
Args:
model:A message of shame -- documentation must be completed.
resources_path:A message of shame -- documentation must be
completed.
pretrained_path:A message of shame -- documentation must be
completed.
device:A message of shame -- documentation must be completed.
squeeze:A message of shame -- documentation must be completed.
max_seq_length:A message of shame -- documentation must be
completed.
tokenizer:A message of shame -- documentation must be completed.
processor_annotations:A message of shame -- documentation must be
completed.
seed:A message of shame -- documentation must be completed.
Returns:A message of shame -- documentation must be completed.
self.model = Pdn2TokenClassification(path, device=device)
self.tokenizer = tokenizer if tokenizer is not None else TokenizerFast()
self.processors_annotations = processor_annotations if processor_annotations else [split_hashtags]
def process(self, sentences: [[str]], show_progress: bool = False):
"""
if Path(model).suffix == ".pdn2":
if model.startswith("pdn2_"):
model_path = ModelFactory.get_resource(model, resources_path)
else:
model_path = model
pdn2 = torch.load(model_path)
if tokenizer is not None:
pdn2.tokenizer = tokenizer
return pdn2
else:
tokenizer = TokenizerSpaces() if tokenizer is None else tokenizer
return PolDeepNer2(model_path=model,
pretrained_path=pretrained_path, device=device,
squeeze=squeeze,
max_seq_length=max_seq_length,
tokenizer=tokenizer,
processor_annotations=processor_annotations,
seed=seed)
def process(self, sentences: [[str]], show_progress=False):
"""A message of shame -- documentation must be completed.
@param sentences -- array of array of words, [['Jan', 'z',
'Warszawy'], ['IBM', 'i', 'Apple']] @param max_seq_length -- the
maximum total input sequence length after WordPiece tokenization
@param squeeze -- boolean enabling squeezing multiple sentences into
one Input Feature
@param sentences -- array of arrays of words, [['Jan', 'z', 'Warszawy'], ['IBM', 'i', 'Apple']]
@param show_progress -- print progress of processing in terminal
"""
examples = []
for idx, tokens in enumerate(sentences):
......@@ -247,15 +46,14 @@ class PolDeepNer2:
examples.append(InputExample(guid=guid, text_a=text_a,
text_b=text_b, label=label))
eval_features = convert_examples_to_features(examples, self.label_list,
self.max_seq_length,
self.model.encode_word,
self.squeeze)
eval_features = convert_examples_to_features(examples, self.model.config.labels,
self.model.config.max_seq_length,
self.model.encode_word, self.model.config.squeeze)
eval_dataset = create_dataset(eval_features)
eval_dataloader = DataLoader(eval_dataset, batch_size=1)
y_pred = []
label_map = {i: label for i, label in enumerate(self.label_list, 1)}
label_map = {i: label for i, label in enumerate(self.model.config.labels, 1)}
if show_progress:
outer = tqdm.tqdm(total=len(eval_dataloader), desc='Processing',
......@@ -263,9 +61,9 @@ class PolDeepNer2:
for input_ids, label_ids, l_mask, valid_ids in eval_dataloader:
if show_progress:
outer.update(1)
input_ids = input_ids.to(self.device)
label_ids = label_ids.to(self.device)
valid_ids = valid_ids.to(self.device)
input_ids = input_ids.to(self.model.config.device)
label_ids = label_ids.to(self.model.config.device)
valid_ids = valid_ids.to(self.model.config.device)
with torch.no_grad():
logits = self.model(input_ids, labels=None, labels_mask=None,
......@@ -355,176 +153,3 @@ class PolDeepNer2:
"""
return self.process(tokens)
class ModelFactory:
    """Base class of the model factories; also hosts the shared resource lookup."""

    @staticmethod
    def get_resource(name, resources_path):
        """Return the local path of a resource, downloading it when it is absent.

        Args:
            name: Resource name; it is joined onto ``resources_path``.
            resources_path: Directory in which resources are stored.

        Returns:
            The joined path ``resources_path/name``.

        Raises:
            ValueError: If the joined path does not exist and ``name`` is not
                a key of ``resources``.
        """
        local_path = os.path.join(resources_path, name)
        if os.path.exists(local_path):
            return local_path
        if name not in resources:
            raise ValueError(f"Unknown resource name or invalid path: "
                             f"{name}")
        entry = resources[name]
        to_subfolder = entry.get("extractToSubfolder", False)
        download_file(entry["url"], local_path, entry["compression"],
                      to_subfolder)
        return local_path

    def load(self, device="cpu", resources_path="models"):
        """Placeholder implementation; it ignores its arguments.

        Args:
            device: Unused here.
            resources_path: Unused here.

        Returns:
            Always ``None``.
        """
        return None
class ModelFactoryGeneric(ModelFactory):
    """Factory that resolves both the NER model and the language model as resources."""

    def __init__(self, roberta_name, model_name, squeeze=False,
                 tokenizer="spacy-ext"):
        """Store the factory settings.

        Args:
            roberta_name: Resource name of the language model.
            model_name: Resource name of the NER model.
            squeeze: Value forwarded to ``PolDeepNer2`` as ``squeeze``.
            tokenizer: Tokenizer type name handed to ``tokenization.load``.
        """
        self.roberta_name, self.model_name = roberta_name, model_name
        self.squeeze, self.tokenizer = squeeze, tokenizer

    def load(self, device="cpu", resources_path="models"):
        """Resolve both resources and build the model.

        Args:
            device: Value forwarded to ``PolDeepNer2`` as ``device``.
            resources_path: Directory used for the resource lookup.

        Returns:
            A ``PolDeepNer2`` instance.
        """
        model_path = ModelFactory.get_resource(self.model_name,
                                               resources_path)
        pretrained_path = ModelFactory.get_resource(self.roberta_name,
                                                    resources_path)
        word_tokenizer = tokenization.load(self.tokenizer)
        return PolDeepNer2(model_path,
                           pretrained_path,
                           device=device,
                           squeeze=self.squeeze,
                           tokenizer=word_tokenizer)
class ModelFactoryAutoModel(ModelFactory):
    """Factory that resolves only the NER model as a resource.

    The language model name is handed to ``PolDeepNer2`` unchanged.
    """

    def __init__(self, roberta_name, model_name, squeeze=False,
                 tokenizer="spacy-ext"):
        """Store the factory settings.

        Args:
            roberta_name: Language model identifier passed through as-is.
            model_name: Resource name of the NER model.
            squeeze: Value forwarded to ``PolDeepNer2`` as ``squeeze``.
            tokenizer: Tokenizer type name handed to ``tokenization.load``.
        """
        self.roberta_name, self.model_name = roberta_name, model_name
        self.squeeze, self.tokenizer = squeeze, tokenizer

    def load(self, device="cpu", resources_path="models"):
        """Resolve the NER model resource and build the model.

        Args:
            device: Value forwarded to ``PolDeepNer2`` as ``device``.
            resources_path: Directory used for the resource lookup.

        Returns:
            A ``PolDeepNer2`` instance.
        """
        model_path = ModelFactory.get_resource(self.model_name,
                                               resources_path)
        word_tokenizer = tokenization.load(self.tokenizer)
        return PolDeepNer2(model_path,
                           self.roberta_name,
                           device=device,
                           squeeze=self.squeeze,
                           tokenizer=word_tokenizer)
# Maps public model names to the factory that builds them. The positional
# factory arguments are (roberta_name, model_name, squeeze, tokenizer).
# ModelFactoryGeneric entries resolve the language model as a resource;
# ModelFactoryAutoModel entries pass the "hf:..." identifier through unchanged.
pdn2_models = {
"kpwr-n82-base":
ModelFactoryGeneric("roberta_base_fairseq", "kpwr_n82_base",
False, "spacy-ext"),
"kpwr-n82-large":
ModelFactoryGeneric("roberta_large_fairseq", "kpwr_n82_large",
False, "spacy-ext"),
"cen-n82-base":
ModelFactoryGeneric("roberta_base_fairseq", "cen_n82_base",
False, "spacy-ext"),
"cen-n82-large":
ModelFactoryGeneric("roberta_large_fairseq", "cen_n82_large",
False, "spacy-ext"),
"nkjp-base":
ModelFactoryGeneric("roberta_base_fairseq", "nkjp_base",
False, "spacy-ext"),
"nkjp-base-sq":
ModelFactoryGeneric("roberta_base_fairseq", "nkjp_base_sq",
True, "spacy-ext"),
"conll-english-large-sq":
ModelFactoryAutoModel("hf:xlm-roberta-large-finetuned-conll03-english",
"pdn2_conll_english_large", True, "spacy-ext"),
"nkjp-herbert-large-sq":
ModelFactoryAutoModel("hf:allegro/herbert-large-cased",
"nkjp_herbert_large_sq", True, "spacy-ext")
}
def get_models_names():
    """List the names under which models are registered.

    Returns:
        The keys view of ``pdn2_models``.
    """
    registered_names = pdn2_models.keys()
    return registered_names
def load(name: str, device="cpu", resources_path="models") -> PolDeepNer2:
    """Build the model registered under ``name``.

    Args:
        name: Key of ``pdn2_models``.
        device: Passed to the factory's ``load`` method.
        resources_path: Passed to the factory's ``load`` method.

    Returns:
        Whatever the selected factory's ``load`` returns.

    Raises:
        ValueError: If ``name`` is not a key of ``pdn2_models``.
    """
    if name in pdn2_models:
        factory = pdn2_models[name]
        return factory.load(device, resources_path)
    raise ValueError(f"Unknown PDN2 model: {name}. Expected: "
                     f"{get_models_names()}")
......@@ -3,7 +3,6 @@
import re
import requests
import spacy
from poldeepner2.data.token import Token
from poldeepner2.utils.preprocess import split_hashtags, split_leading_name, \
......@@ -129,96 +128,6 @@ class TokenizerSpaces(Tokenizer):
return [re.sub(r"\s+", " ", text.strip()).split(" ") for text in texts]
class TokenizerSpacy(Tokenizer):
    """Sentence splitter and word tokenizer backed by the spaCy pipeline ``pl_core_news_sm``."""

    def __init__(self):
        """Load the spaCy pipeline."""
        self.nlp = spacy.load("pl_core_news_sm")

    def tokenize(self, texts: [str]) -> [[str]]:
        """Split texts into sentences of tokens.

        Whitespace runs in every text are collapsed to single spaces and
        texts that end up empty are dropped before spaCy is run.

        Args:
            texts: Raw input texts.

        Returns:
            One list of token strings per sentence reported by spaCy, in
            document order.
        """
        normalized = (" ".join(text.split()) for text in texts)
        non_empty = [text for text in normalized if len(text)]
        return [
            [token.text for token in sentence]
            for doc in self.nlp.pipe(non_empty, disable=["tagger", "ner"])
            for sentence in doc.sents
        ]
class TokenizerSpacyExt(Tokenizer):
    """spaCy-based tokenizer that repairs some sentence boundaries.

    A sentence reported by spaCy is merged with the following one when it
    ends with a closing quote or bracket, an empty token, a known
    abbreviation followed by a dot, or a single-letter name initial followed
    by a dot.
    """

    def __init__(self):
        """Load the spaCy pipeline and the list of non-terminal abbreviations."""
        self.nlp = spacy.load("pl_core_news_sm")
        # Abbreviations whose trailing dot does not end a sentence.
        self.abbrev_no_eos = {"tzw", "np", "m.in", "tj"}

    def tokenize(self, texts: [str]) -> [[str]]:
        """Split texts into sentences of tokens.

        Whitespace runs are collapsed and empty texts dropped before spaCy is
        run. The merged sentences are post-processed with ``split_hashtags``,
        ``split_leading_name`` and ``split_underscore``.

        Args:
            texts: Raw input texts.

        Returns:
            One list of token strings per (merged) sentence.
        """
        # Final tokens after which the spaCy boundary is not trusted.
        # Built once instead of once per sentence.
        no_break_after = {'"', ")", ""}

        out = []
        texts = [" ".join(text.split()) for text in texts]
        texts = [text for text in texts if text]
        for doc in self.nlp.pipe(texts, disable=["tagger", "ner"]):
            current = []
            for sentence in doc.sents:
                current.extend(t.text for t in sentence)
                if not current:
                    # Nothing collected yet; checking first avoids indexing
                    # an empty list below.
                    continue
                if current[-1] in no_break_after \
                        or self.is_ended_with_abbrev(current) \
                        or self.is_ended_with_name_initial(current):
                    # Not a real boundary: keep accumulating tokens.
                    continue
                out.append(current)
                current = []
            if current:
                # Flush tokens left over after the last sentence of the text.
                out.append(current)
        out = [split_hashtags(sentence) for sentence in out]
        out = [split_leading_name(sentence) for sentence in out]
        out = [split_underscore(sentence) for sentence in out]
        return out

    def is_ended_with_abbrev(self, sequence: [str]) -> bool:
        """Tell whether the sequence ends with a known abbreviation and a dot.

        Args:
            sequence: Tokens collected so far.

        Returns:
            True when the last token is "." and the one before it is in
            ``abbrev_no_eos``.
        """
        return len(sequence) > 1 and sequence[-1] == "." \
            and sequence[-2] in self.abbrev_no_eos

    def is_ended_with_name_initial(self, sequence: [str]) -> bool:
        """Tell whether the sequence ends with a single-letter initial and a dot.

        Args:
            sequence: Tokens collected so far.

        Returns:
            True when the last token is "." and the one before it is a
            single uppercase alphabetic character.
        """
        return len(sequence) > 1 and sequence[-1] == "." \
            and len(sequence[-2]) == 1 \
            and sequence[-2].isupper() and sequence[-2].isalpha()
class TokenizerKrnnt(Tokenizer):
"""A message of shame -- documentation must be completed."""
......@@ -318,10 +227,6 @@ def load(tokenizer_type: str) -> Tokenizer:
"""
if tokenizer_type == "space":
return TokenizerSpaces()
elif tokenizer_type == "spacy":
return TokenizerSpacy()
elif tokenizer_type == "spacy-ext":
return TokenizerSpacyExt()
elif tokenizer_type == "krnnt":
return TokenizerKrnnt()
elif tokenizer_type == "fast":
......@@ -330,4 +235,4 @@ def load(tokenizer_type: str) -> Tokenizer:
raise ValueError(f"Unknown tokenizer type: {tokenizer_type}")
names = ["space", "spacy", "spacy-ext", "krnnt", "fast"]
names = ["space", "krnnt", "fast"]
"""A message of shame -- documentation must be completed."""
import codecs
import json
import logging
from typing import List
import torch
......@@ -32,133 +31,15 @@ class InputExample(object):
self.label = label
class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self, input_ids, input_mask, label_id, valid_ids=None,
                 label_mask=None):
        """Store the given feature sequences unchanged on the instance.

        Args:
            input_ids: Stored as ``self.input_ids``.
            input_mask: Stored as ``self.input_mask``.
            label_id: Stored as ``self.label_id``.
            valid_ids: Stored as ``self.valid_ids``; defaults to None.
            label_mask: Stored as ``self.label_mask``; defaults to None.
        """
        self.input_ids, self.input_mask = input_ids, input_mask
        self.label_id = label_id
        self.valid_ids, self.label_mask = valid_ids, label_mask
class TokenFeatures(InputFeatures):
    """Features of one token that may span several input ids."""

    def __init__(self, input_ids, label_id):
        """Build per-id sequences in which only the first position is marked.

        Args:
            input_ids: Ids the token was encoded into.
            label_id: Label assigned to the first position; every following
                position receives ``LABEL_IGNORE_ID``.
        """
        n_following = len(input_ids) - 1
        self.input_ids = input_ids
        # Mask and validity flag are 1 for the first id and 0 for the rest.
        self.input_mask = [1] + [0] * n_following
        self.label_id = [label_id] + [LABEL_IGNORE_ID] * n_following
        self.valid_ids = [1] + [0] * n_following

    def length(self):
        """Return the number of input ids of the token."""
        return len(self.input_ids)
class SequenceFeatures(InputFeatures):
    """Features of a whole sequence, built up incrementally from tokens."""

    # Names of the parallel lists that are always extended together.
    _COLUMNS = ("input_ids", "input_mask", "label_id", "valid_ids")

    def __init__(self):
        """Create the empty parallel lists and add the opening <s> entry."""
        for column in self._COLUMNS:
            setattr(self, column, [])
        self.append(0, 0, LABEL_IGNORE_ID, 0)  # adding <s>

    def append(self, token_id: int, input_mask: int, label_id: int,
               valid_id: int):
        """Append a single position to all four parallel lists.

        Args:
            token_id: Value appended to ``input_ids``.
            input_mask: Value appended to ``input_mask``.
            label_id: Value appended to ``label_id``.
            valid_id: Value appended to ``valid_ids``.
        """
        values = (token_id, input_mask, label_id, valid_id)
        for column, value in zip(self._COLUMNS, values):
            getattr(self, column).append(value)

    def add_token(self, token: TokenFeatures):
        """Extend all four parallel lists with the lists of ``token``.

        Args:
            token: Token features whose lists are appended.
        """
        for column in self._COLUMNS:
            getattr(self, column).extend(getattr(token, column))

    def close_and_fill(self, max_length=128):
        """Add the closing </s> entry and pad the sequence up to ``max_length``.

        Args:
            max_length: Padding entries are added while the sequence is
                shorter than this value.
        """
        self.append(2, 0, LABEL_IGNORE_ID, 0)  # adding </s>
        while len(self.input_ids) < max_length:
            self.append(1, 0, LABEL_IGNORE_ID, 0)  # adding padding

    def length(self):
        """Return the current number of positions in the sequence."""
        return len(self.input_ids)
class NerProcessor:
"""Processor for the CoNLL-2003 data set."""
def get_examples(self, data_path, data_type="data"):
    """Read one data file and turn its content into examples.

    Args:
        data_path: Path handed to ``self._read_file``.
        data_type: Tag forwarded unchanged to ``self._create_examples``.

    Returns:
        Whatever ``self._create_examples`` builds from the file content.
    """
    raw_content = self._read_file(data_path)
    return self._create_examples(raw_content, data_type)
def get_examples(self, data_path: List[str], data_type="data") -> List[InputExample]:
    """Read examples from one or more data files.

    Args:
        data_path: List of file paths. A single path given as a plain string
            is accepted as well and treated as a one-element list.
        data_type: Tag forwarded unchanged to ``self._create_examples``.

    Returns:
        Examples of all files, concatenated in the order of the paths.
    """
    # A bare string would otherwise be iterated character by character and
    # every character would be opened as a file.
    paths = [data_path] if isinstance(data_path, str) else data_path
    examples = []
    print(data_path)
    for path in paths:
        examples.extend(self._create_examples(self._read_file(path), data_type))
    return examples
def get_labels(self, paths):
"""A message of shame -- documentation must be completed.
......@@ -171,7 +52,7 @@ class NerProcessor:
"""
label_set = set([])
for path in paths:
examples = self._create_examples(self._read_iob(path, tag_column_index), "data")
examples = self.get_examples(path)
label_set.update(NerProcessor._get_labels(examples))
return sorted(list(label_set))
......@@ -272,213 +153,6 @@ class NerProcessor:
return sorted(list(label_set))
def convert_examples_to_features(examples, label_list, max_seq_length,
                                 encode_method, squeeze=False):
    """Convert examples into features, optionally packing several examples.

    Args:
        examples: Examples to convert.
        label_list: Label names, forwarded to the selected converter.
        max_seq_length: Sequence length, forwarded to the selected converter.
        encode_method: Callable mapping a word to its sub-token ids,
            forwarded to the selected converter.
        squeeze: When truthy ``convert_examples_to_features_sq`` is used,
            otherwise ``convert_examples_to_features_nosq``.

    Returns:
        The result of the selected converter.
    """
    converter = (convert_examples_to_features_sq if squeeze
                 else convert_examples_to_features_nosq)
    return converter(examples, label_list, max_seq_length, encode_method)
def convert_examples_to_features_sq(examples, label_list, max_seq_length,
                                    encode_method):
    """Converts a set of examples into XLMR compatible format.

    Squeezes more than one example into a feature provided enough space:
    consecutive examples are collected in a pending buffer which is flushed
    through ``append_pending`` as soon as the next example would not fit.

    Args:
        examples: Examples exposing ``text_a`` (space separated words) and
            ``label`` (one label per word).
        label_list: Label names; they are mapped to ids starting at 1, id 0
            is reserved for the internal "IGNORE" label.
        max_seq_length: Length of every produced feature. An example with
            ``max_seq_length - 1`` or more sub-tokens is trimmed to
            ``max_seq_length - 2`` sub-tokens.
        encode_method: Callable mapping a word to its sub-token ids.

    Returns:
        List of the features produced by ``append_pending``; the last one
        holds whatever was still pending after the final example.
    """
    ignored_label = "IGNORE"
    label_map = {label: i for i, label in enumerate(label_list, 1)}
    label_map[ignored_label] = 0  # 0 label is to be ignored
    keep = max_seq_length - 2

    # Buffers of the feature that is currently being packed.
    pending_token_ids = []
    pending_input_mask = []
    pending_label_ids = []
    pending_valid = []
    pending_label_mask = []

    features = []
    for ex_index, example in enumerate(examples):
        words = example.text_a.split(' ')
        word_labels = example.label

        token_ids, labels, valid, label_mask = [], [], [], []
        for position, word in enumerate(words):
            pieces = encode_method(word.strip())  # word token ids
            token_ids.extend(pieces)  # all sentence token ids
            word_label = word_labels[position]
            extra = len(pieces) - 1
            if extra >= 0:
                # only the first BPE token of each word is labelled
                labels.extend([word_label] + [ignored_label] * extra)
                valid.extend([1] + [0] * extra)
                label_mask.extend([1] + [0] * extra)

        if len(token_ids) >= max_seq_length - 1:  # trim extra tokens
            logging.warning(
                f"Trimmed tokens: {len(token_ids)} to {max_seq_length}")
            token_ids = token_ids[:keep]
            labels = labels[:keep]
            valid = valid[:keep]
            label_mask = label_mask[:keep]

        assert len(token_ids) == len(labels)
        assert len(valid) == len(labels)

        label_ids = [label_map[name] for name in labels]

        assert len(token_ids) == len(label_ids)
        assert len(valid) == len(label_ids)

        input_mask = [1] * len(token_ids)

        if len(token_ids) + len(pending_token_ids) <= max_seq_length:
            # The example still fits into the pending feature.
            pending_token_ids.extend(token_ids)
            pending_input_mask.extend(input_mask)
            pending_label_ids.extend(label_ids)
            pending_valid.extend(valid)
            pending_label_mask.extend(label_mask)
        else:
            # Flush what was collected so far and start over with this example.
            flushed = append_pending(ignored_label, pending_token_ids,
                                     pending_input_mask, pending_label_ids,
                                     pending_valid, pending_label_mask,
                                     max_seq_length, label_map, ex_index,
                                     example)
            features.append(flushed)
            pending_token_ids = token_ids
            pending_input_mask = input_mask
            pending_label_ids = label_ids
            pending_valid = valid
            pending_label_mask = label_mask

    last = append_pending(ignored_label, pending_token_ids,
                          pending_input_mask, pending_label_ids,
                          pending_valid, pending_label_mask, max_seq_length,
                          label_map)
    features.append(last)
    return features
def append_pending(ignored_label, pending_token_ids, pending_input_mask,
                   pending_label_ids, pending_valid,
                   pending_label_mask, max_seq_length, label_map,
                   ex_index=None, example=None):
    """Pad the pending buffers in place and wrap them in ``InputFeatures``.

    Args:
        ignored_label: Key of ``label_map`` whose id is used for padding.
        pending_token_ids: Token ids collected so far; padded with 1.
        pending_input_mask: Input mask collected so far; padded with 0.
        pending_label_ids: Label ids collected so far; padded with the id of
            ``ignored_label``.
        pending_valid: Valid flags collected so far; padded with 0.
        pending_label_mask: Label mask collected so far; padded with 0.
        max_seq_length: Length every list must have after padding.
        label_map: Mapping from label name to label id.
        ex_index: Not used by this function.
        example: Not used by this function.

    Returns:
        ``InputFeatures`` built from the (mutated) lists passed in.

    Raises:
        AssertionError: If any list does not end up with exactly
            ``max_seq_length`` elements.
    """
    # Pad all five lists together until the token ids reach the target size.
    for _ in range(max_seq_length - len(pending_token_ids)):
        pending_token_ids.append(1)  # token padding idx
        pending_input_mask.append(0)
        pending_label_ids.append(label_map[ignored_label])  # label ignore idx
        pending_valid.append(0)
        pending_label_mask.append(0)

    # Labels may still be shorter than the token ids; top them up separately.
    for _ in range(max_seq_length - len(pending_label_ids)):
        pending_label_ids.append(label_map[ignored_label])
        pending_label_mask.append(0)

    assert len(pending_token_ids) == max_seq_length
    assert len(pending_input_mask) == max_seq_length
    assert len(pending_label_ids) == max_seq_length
    assert len(pending_valid) == max_seq_length
    assert len(pending_label_mask) == max_seq_length

    return InputFeatures(
        input_ids=pending_token_ids,
        input_mask=pending_input_mask,
        label_id=pending_label_ids,
        valid_ids=pending_valid,
        label_mask=pending_label_mask,
    )
def convert_examples_to_features_nosq(examples, label_list, max_seq_length,
                                      encode_method) -> List[InputFeatures]:
    """Converts a set of examples into XLMR compatible format.

    * Labels are only assigned to the positions corresponding to the first
      BPE token of each word.
    * Other positions are labeled with ``LABEL_IGNORE_ID``.
    * Examples are never merged; an example that does not fit into
      ``max_seq_length`` is continued in a new sequence.

    Args:
        examples: Examples exposing ``text_a`` (space separated words) and
            ``label`` (one label per word).
        label_list: Label names; they are mapped to ids starting at 1.
        max_seq_length: Length every sequence is padded to.
        encode_method: Callable mapping a word to its sub-token ids.

    Returns:
        List of closed and padded ``SequenceFeatures``.
    """
    label_map = {label: i for i, label in enumerate(label_list, 1)}
    features = []

    for example in examples:
        word_features = [
            TokenFeatures(encode_method(word.strip()), label_map[word_label])
            for word, word_label in zip(example.text_a.split(' '),
                                        example.label)
        ]

        current = SequenceFeatures()
        for word_feature in word_features:
            # Start a new sequence when the word would not fit any more.
            if current.length() + word_feature.length() + 2 >= max_seq_length:
                current.close_and_fill(max_seq_length)
                features.append(current)
                current = SequenceFeatures()
            current.add_token(word_feature)

        # A sequence of length 1 holds nothing but the opening <s>.
        if current.length() > 1:
            current.close_and_fill(max_seq_length)
            features.append(current)

    return features
def create_dataset(features) -> TensorDataset:
"""A message of shame -- documentation must be completed.
......@@ -652,24 +326,6 @@ def read_json(path):
return json_out
def save_params(model_path, dropout, num_labels, label_list):
    """Write the model parameters to ``<model_path>/params.json``.

    Args:
        model_path: Directory (as a string) in which ``params.json`` is
            created or overwritten.
        dropout: Value stored under the ``dropout`` key.
        num_labels: Value stored under the ``num_labels`` key.
        label_list: Value stored under the ``label_list`` key.
    """
    params = {
        'dropout': dropout,
        'num_labels': num_labels,
        'label_list': label_list,
    }
    target = model_path + '/params.json'
    with open(target, 'w') as handle:
        json.dump(params, handle)
def read_tsv(filename, with_labels=False):
"""A message of shame -- documentation must be completed.
......@@ -805,7 +461,6 @@ def iob2_to_iob(iob2_text):
iob2_list = []
iob1_list = []
# separate_skip = False NOT USED
tags_to_separate = []
for line in iob2_text.split('\n'):
......
"""A message of shame -- documentation must be completed."""
import numpy as np
import torch
import random
def setup_seed(n=377):
def setup_seed(n=101):
"""A message of shame -- documentation must be completed.
Args:
......@@ -12,5 +13,6 @@ def setup_seed(n=377):
Returns: A message of shame -- documentation must be completed.
"""
torch.manual_seed(n)
random.seed(n)
np.random.seed(n)
torch.manual_seed(n)
......@@ -342,13 +342,10 @@ def performance_measure(y_true, y_pred):
if any(isinstance(s, list) for s in y_true):
y_true = [item for sublist in y_true for item in sublist]
y_pred = [item for sublist in y_pred for item in sublist]
performace_dict['TP'] = sum(y_t == y_p for y_t, y_p in zip(y_true, y_pred)
if ((y_t != 'O') or (y_p != 'O')))
performace_dict['TP'] = sum(y_t == y_p for y_t, y_p in zip(y_true, y_pred) if ((y_t != 'O') or (y_p != 'O')))
performace_dict['FP'] = sum(y_t != y_p for y_t, y_p in zip(y_true, y_pred))
performace_dict['FN'] = sum(((y_t != 'O') and (y_p == 'O'))
for y_t, y_p in zip(y_true, y_pred))
performace_dict['TN'] = sum((y_t == y_p == 'O')
for y_t, y_p in zip(y_true, y_pred))
performace_dict['FN'] = sum(((y_t != 'O') and (y_p == 'O')) for y_t, y_p in zip(y_true, y_pred))
performace_dict['TN'] = sum((y_t == y_p == 'O') for y_t, y_p in zip(y_true, y_pred))
return performace_dict
......
import logging
from typing import List, Dict, Any
from poldeepner2.utils.data_utils import LABEL_IGNORE_ID, InputExample
class InputFeatures(object):
    """A single set of features of data.

    Plain container; the constructor stores its arguments unchanged.
    """

    def __init__(self, input_ids, input_mask, label_id, valid_ids=None, label_mask=None):
        """Store the feature lists.

        Args:
            input_ids: Stored as ``self.input_ids``.
            input_mask: Stored as ``self.input_mask``.
            label_id: Stored as ``self.label_id``.
            valid_ids: Stored as ``self.valid_ids``; defaults to ``None``.
            label_mask: Stored as ``self.label_mask``; defaults to ``None``.
        """
        self.input_ids, self.input_mask = input_ids, input_mask
        self.label_id = label_id
        self.valid_ids, self.label_mask = valid_ids, label_mask
class TokenFeatures(InputFeatures):
    """Features of a single word that was split into one or more sub-tokens.

    Only the first sub-token is flagged with ``1`` in ``input_mask`` and
    ``valid_ids`` and carries the word label; every following sub-token gets
    ``0`` flags and ``LABEL_IGNORE_ID``. ``label_mask`` is not set here.
    """

    def __init__(self, input_ids, label_id):
        """Build the per-sub-token lists for one word.

        Args:
            input_ids: Sub-token ids of the word. Expected to be non-empty;
                for an empty list the mask/label lists still get one element.
            label_id: Label id assigned to the first sub-token.
        """
        continuation = len(input_ids) - 1
        first_only = [1] + [0] * continuation
        self.input_ids = input_ids
        # Separate copies so the two flag lists never share one list object.
        self.input_mask = list(first_only)
        self.valid_ids = list(first_only)
        self.label_id = [label_id] + [LABEL_IGNORE_ID] * continuation

    def length(self):
        """Return the number of sub-tokens of the word."""
        return len(self.input_ids)
class SequenceFeatures(InputFeatures):
    """Features of one model input sequence assembled token by token.

    A fresh sequence starts with id 0 (``<s>``). ``close_and_fill`` appends
    id 2 (``</s>``) and then pads with id 1 up to the requested length.
    """

    def __init__(self):
        """Create an empty sequence that already contains the ``<s>`` entry."""
        self.input_ids, self.input_mask = [], []
        self.label_id, self.valid_ids = [], []
        self.append(0, 0, LABEL_IGNORE_ID, 0)  # adding <s>

    def append(self, token_id: int, input_mask: int, label_id: int, valid_id: int):
        """Append a single position to all four parallel lists."""
        columns = (
            (self.input_ids, token_id),
            (self.input_mask, input_mask),
            (self.label_id, label_id),
            (self.valid_ids, valid_id),
        )
        for column, value in columns:
            column.append(value)

    def add_token(self, token: TokenFeatures):
        """Concatenate the four lists of a word onto this sequence."""
        self.input_ids += token.input_ids
        self.input_mask += token.input_mask
        self.label_id += token.label_id
        self.valid_ids += token.valid_ids

    def close_and_fill(self, max_length=128):
        """Append ``</s>`` and pad the sequence to ``max_length`` positions.

        A sequence that is already ``max_length`` or longer after the closing
        entry is left unpadded (it is not truncated).
        """
        self.append(2, 0, LABEL_IGNORE_ID, 0)  # adding </s>
        for _ in range(max_length - len(self.input_ids)):
            self.append(1, 0, LABEL_IGNORE_ID, 0)  # adding padding

    def length(self):
        """Return the current number of positions in the sequence."""
        return len(self.input_ids)
def convert_examples_to_features(examples: List[InputExample], label_list: List[str], max_seq_length: int,
                                 encode_method: Any, squeeze=False) \
        -> List[InputFeatures]:
    """Convert examples into features, optionally packing several examples.

    Args:
        examples: Examples to convert.
        label_list: Label names, forwarded to the selected converter.
        max_seq_length: Sequence length, forwarded to the selected converter.
        encode_method: Callable mapping a word to its sub-token ids.
        squeeze: When truthy ``convert_examples_to_features_sq`` is used,
            otherwise ``convert_examples_to_features_nosq``.

    Returns:
        The result of the selected converter.
    """
    converter = convert_examples_to_features_sq if squeeze else convert_examples_to_features_nosq
    return converter(examples, label_list, max_seq_length, encode_method)
def tokens_and_labels_into_token_features(tokens: List[str], labels: List[str], encode_method,
                                          label_map: Dict[str, int]) -> List[TokenFeatures]:
    """Encode words and pair them with their label ids.

    Words for which ``encode_method`` yields no sub-tokens are reported with
    a warning and left out of the result together with their label.

    Args:
        tokens: Words of one example.
        labels: Label names, aligned with ``tokens`` (pairs are formed with
            ``zip``, so surplus items of the longer list are ignored).
        encode_method: Callable mapping a stripped word to its sub-token ids.
        label_map: Mapping from label name to label id.

    Returns:
        One ``TokenFeatures`` per word that produced at least one sub-token.
    """
    token_features = []
    for word, word_label in zip(tokens, labels):
        pieces = encode_method(word.strip())
        if len(pieces) != 0:
            token_features.append(TokenFeatures(pieces, label_map[word_label]))
        else:
            logging.warning(f"Token '{word}' has no subwords")
    return token_features
def convert_examples_to_features_sq(examples: List[InputExample], label_list: List[str], max_seq_length: int,
                                    encode_method: Any) -> List[InputFeatures]:
    """Convert examples into sequences, packing examples densely.

    The token features of all examples are concatenated into one stream,
    which is then cut into sequences of at most ``max_seq_length`` positions,
    so a sequence may contain words of several examples.

    Args:
        examples: Examples exposing ``text_a`` (space separated words) and
            ``label`` (one label per word).
        label_list: Label names; they are mapped to ids starting at 1.
        max_seq_length: Length every sequence is padded to.
        encode_method: Callable mapping a word to its sub-token ids.

    Returns:
        List of closed and padded ``SequenceFeatures``; empty when no example
        yields any token features.
    """
    label_map = {label: i for i, label in enumerate(label_list, 1)}

    # The set of "sequence ending" tokens that used to be collected here was
    # never read, and its ``tfs[-1]`` lookup raised IndexError while no token
    # features had been collected yet, so it is gone.
    tfs = []
    for example in examples:
        textlist = example.text_a.split(' ')
        labellist = example.label
        tfs.extend(tokens_and_labels_into_token_features(textlist, labellist, encode_method, label_map))

    features = []
    sf = SequenceFeatures()
    for tf in tfs:
        # +1 keeps one position free for the closing </s>.
        if sf.length() + tf.length() + 1 > max_seq_length:
            sf.close_and_fill(max_seq_length)
            features.append(sf)
            sf = SequenceFeatures()
        sf.add_token(tf)

    # A sequence of length 1 holds nothing but the opening <s>.
    if sf.length() > 1:
        sf.close_and_fill(max_seq_length)
        features.append(sf)

    return features
def convert_examples_to_features_nosq(examples: List[InputExample], label_list: List[str], max_seq_length: int,
                                      encode_method: Any) -> List[InputFeatures]:
    """Convert examples into sequences without merging examples.

    Every example starts a new sequence; an example that does not fit into
    ``max_seq_length`` positions is continued in further sequences.

    Args:
        examples: Examples exposing ``text_a`` (space separated words) and
            ``label`` (one label per word).
        label_list: Label names; they are mapped to ids starting at 1.
        max_seq_length: Length every sequence is padded to.
        encode_method: Callable mapping a word to its sub-token ids.

    Returns:
        List of closed and padded ``SequenceFeatures``.
    """
    label_ids = {name: idx for idx, name in enumerate(label_list, 1)}
    sequences = []

    for example in examples:
        word_features = tokens_and_labels_into_token_features(
            example.text_a.split(' '), example.label, encode_method, label_ids)

        current = SequenceFeatures()
        for word_feature in word_features:
            # +1 keeps one position free for the closing </s>.
            if current.length() + word_feature.length() + 1 > max_seq_length:
                current.close_and_fill(max_seq_length)
                sequences.append(current)
                current = SequenceFeatures()
            current.add_token(word_feature)

        # A sequence of length 1 holds nothing but the opening <s>.
        if current.length() > 1:
            current.close_and_fill(max_seq_length)
            sequences.append(current)

    return sequences
"""A message of shame -- documentation must be completed."""
import torch
from torch.utils.data import SequentialSampler, DataLoader
from poldeepner2.utils.sequence_labeling import classification_report, f1_score
from poldeepner2.utils.sequence_labeling import classification_report, f1_score, precision_score, recall_score
def add_xlmr_args(parser):
"""Adds training and validation arguments to the passed parser."""
parser.add_argument("--data_train", default=None, type=str,
required=True)
parser.add_argument("--data_tune", default=None, type=str,
required=False)
parser.add_argument("--data_test", default=None, type=str,
required=False)
parser.add_argument("--pretrained_path", default=None, type=str,
required=True,
help="pretrained XLM-Roberta model path with model "
"name as prefix, "
"a.e automodel:allegro/herbert-large-cased")
def add_train_args(parser):
parser.add_argument("--data_train", default=[], type=str, action="append", required=True)
parser.add_argument("--data_tune", default=[], type=str, action="append", required=False)
parser.add_argument("--data_test", default=[], type=str, action="append", required=False)
parser.add_argument("--pretrained_path", default=None, type=str, required=True,
help="pretrained XLM-Roberta model path with model name as prefix, a.e automodel:allegro/herbert-large-cased")
parser.add_argument("--output_dir", default=None, type=str, required=True,
help="The output directory where the model "
"predictions and checkpoints will be written.")
help="The output directory where the model predictions and checkpoints will be written.")
parser.add_argument("--cache_dir", default="", type=str,
help="Where do you want to store the pre-trained "
"models downloaded from s3")
help="Where do you want to store the pre-trained models downloaded from s3")
parser.add_argument("--device", default="cuda:0", type=str,
help="Which device should be used to train the model")
parser.add_argument("--transfer", default=None, type=str,
help="Load weights from target model (transfer "
"learning)")
parser.add_argument("--max_seq_length", default=128, type=int,
help="The maximum total input sequence length after "
"WordPiece tokenization. \n "
"Sequences longer than this will be truncated, "
"and sequences shorter \n "
help="The maximum total input sequence length after WordPiece tokenization. \n"
"Sequences longer than this will be truncated, and sequences shorter \n"
"than this will be padded.")
parser.add_argument("--do_eval", action='store_true',
help="Whether to run eval or not.")
parser.add_argument("--do_lower_case", action='store_true',
help="Set this flag if you are using an uncased "
"model.")
help="Set this flag if you are using an uncased model.")
parser.add_argument("--train_batch_size", default=32, type=int,
help="Total batch size for training.")
parser.add_argument("--eval_batch_size", default=32, type=int,
......@@ -48,8 +33,7 @@ def add_xlmr_args(parser):
parser.add_argument("--num_train_epochs", default=3, type=int,
help="Total number of training epochs to perform.")
parser.add_argument("--warmup_proportion", default=0.1, type=float,
help="Proportion of training to perform linear "
"learning rate warmup for. "
help="Proportion of training to perform linear learning rate warmup for. "
"E.g., 0.1 = 10%% of training.")
parser.add_argument("--weight_decay", default=0.01, type=float,
help="Weight deay if we apply some.")
......@@ -60,42 +44,35 @@ def add_xlmr_args(parser):
parser.add_argument('--seed', type=int, default=42,
help="random seed for initialization")
parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
help="Number of updates steps to accumulate before "
"performing a backward/update pass.")
help="Number of updates steps to accumulate before performing a backward/update pass.")
parser.add_argument('--fp16', action='store_true',
help="Whether to use 16-bit float precision instead "
"of 32-bit")
help="Whether to use 16-bit float precision instead of 32-bit")
parser.add_argument('--fp16_opt_level', type=str, default='O1',
help="For fp16: Apex AMP optimization level "
"selected in ['O0', 'O1', 'O2', and 'O3']. "
"See details at https://nvidia.github.io/apex"
"/amp.html")
help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
"See details at https://nvidia.github.io/apex/amp.html")
parser.add_argument('--loss_scale', type=float, default=0,
help="Loss scaling to improve fp16 numeric "
"stability. Only used when fp16 set to True.\n "
help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
"0 (default value): dynamic loss scaling.\n"
"Positive power of 2: static loss scaling "
"value.\n")
"Positive power of 2: static loss scaling value.\n")
parser.add_argument('--dropout',
type=float, default=0.3,
help="training dropout probability")
parser.add_argument('--freeze_model', action='store_true', default=False,
help="whether to freeze the XLM-R base model and "
"train only the classification heads")
parser.add_argument('--epoch_save_model', action='store_true',
default=False,
help="whether to freeze the XLM-R base model and train only the classification heads")
parser.add_argument('--epoch_save_model', action='store_true', default=False,
help="save model for every epoch")
parser.add_argument('--squeeze', default=False, action="store_true",
help="try to squeeze multiple examples into one "
"Input Feature")
help="try to squeeze multiple examples into one Input Feature")
parser.add_argument('--training_mix', default=False, action="store_true",
help="user dense and sparse representation of the data to train the model")
parser.add_argument('--wandb', type=str,
help="Wandb project id. If present the training "
"data will be logged using wandb api.")
help="Wandb project id. If present the training data will be logged using wandb api.")
parser.add_argument('--hidden_size', type=int, default=None,
help="size of the hidden layer (language model output size)")
return parser
def evaluate_model(model, eval_dataset, label_list, batch_size, device,
model_name='Roberta'):
def evaluate_model(model, eval_dataset, label_list, batch_size, device, model_name='Roberta'):
"""
Evaluates an NER model on the eval_dataset provided.
Returns:
......@@ -104,78 +81,7 @@ def evaluate_model(model, eval_dataset, label_list, batch_size, device,
"""
# Run prediction for full data
eval_sampler = SequentialSampler(eval_dataset)
eval_dataloader = DataLoader(
eval_dataset, sampler=eval_sampler, batch_size=batch_size)
model.eval() # turn of dropout
y_true = []
y_pred = []
label_map = {i: label for i, label in enumerate(label_list, 1)}
#print(f'label_list: {label_list}')
#print(f'label_map: {label_map}')
for input_ids, label_ids, l_mask, valid_ids in eval_dataloader:
# print(f'eval_sampler {eval_sampler}')
# print(f'eval_dataset {eval_dataset}')
# print(f'input_ids {input_ids}')
# print(f'label_ids {label_ids}')
input_ids = input_ids.to(device)
label_ids = label_ids.to(device)
valid_ids = valid_ids.to(device)
#print(f'valid_ids {label_ids}')
l_mask = l_mask.to(device)
with torch.no_grad():
if model_name == 'Roberta':
logits = model(input_ids, labels=None, labels_mask=None,
valid_mask=valid_ids)
else:
logits = model(input_ids, return_dict=True).logits
#print(f'logits1: {logits}')
logits = torch.argmax(logits, dim=2)
#print(f'logits11: {logits}')
logits = logits.detach().cpu().numpy()
label_ids = label_ids.cpu().numpy()
for i, cur_label in enumerate(label_ids):
#print(f'i {i}, cur_label:{cur_label}')
temp_1 = []
temp_2 = []
for j, m in enumerate(cur_label):
if valid_ids[i][j]: #and logits[i][j]: # if it's a valid label
temp_1.append(label_map[m])
if logits[i][j]:
index = label_map[logits[i][j]] # for debug
temp_2.append(index)
#assert len(temp_1) == len(temp_2)
y_true.append(temp_1)
y_pred.append(temp_2)
report = classification_report(y_true, y_pred, digits=4)
f1 = f1_score(y_true, y_pred, average='Macro')
return f1, report
def predict_model(model, eval_dataset, label_list, batch_size, device,
report=True):
"""
Evaluates an NER model on the eval_dataset provided.
Returns:
F1_score: Macro-average f1_score on the evaluation dataset.
Report: detailed classification report
"""
# Run prediction for full data
eval_sampler = SequentialSampler(eval_dataset)
eval_dataloader = DataLoader(
eval_dataset, sampler=eval_sampler, batch_size=batch_size)
eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=batch_size)
model.eval() # turn of dropout
......@@ -185,17 +91,13 @@ def predict_model(model, eval_dataset, label_list, batch_size, device,
label_map = {i: label for i, label in enumerate(label_list, 1)}
for input_ids, label_ids, l_mask, valid_ids in eval_dataloader:
input_ids = input_ids.to(device)
label_ids = label_ids.to(device)
valid_ids = valid_ids.to(device)
l_mask = l_mask.to(device)
with torch.no_grad():
if model_name == 'Roberta':
logits = model(input_ids, labels=None, labels_mask=None,
valid_mask=valid_ids)
logits = model(input_ids, labels=None, labels_mask=None, valid_mask=valid_ids)
else:
logits = model(input_ids, return_dict=True).logits
......@@ -213,63 +115,13 @@ def predict_model(model, eval_dataset, label_list, batch_size, device,
temp_2.append(label_map[logits[i][j]])
assert len(temp_1) == len(temp_2)
y_true.append(temp_1)
y_pred.append(temp_2)
# All labels are joined into a single sequence to merge annotations which were split between sequences.
y_true.extend(temp_1)
y_pred.extend(temp_2)
report = classification_report(y_true, y_pred, digits=4)
report = classification_report([y_true], [y_pred], digits=4)
f1 = f1_score(y_true, y_pred, average='Macro')
precision = precision_score(y_true, y_pred, average='Macro')
recall = recall_score(y_true, y_pred, average='Macro')
return f1, report
def predict_model(model, eval_dataset, label_list, batch_size, device,
                  report=True):
    """Run the model over a dataset and collect the predicted labels.

    The dataset is always processed one item at a time; ``batch_size`` is
    accepted but not used.

    Args:
        model: Model called as ``model(input_ids, labels=None,
            labels_mask=None, valid_mask=valid_ids)``; its output is reduced
            with ``argmax`` over dimension 2.
        eval_dataset: Dataset yielding ``(input_ids, label_ids, l_mask,
            valid_ids)`` tuples.
        label_list: Label names; label id ``i`` (starting at 1) maps to
            ``label_list[i - 1]``.
        batch_size: Unused.
        device: Device the batch tensors are moved to.
        report: When truthy a classification report is computed; it is not
            part of the return value.

    Returns:
        One list of predicted label names per dataset item, restricted to the
        positions whose ``valid_ids`` entry is set.
    """
    # Run prediction for full data
    loader = DataLoader(eval_dataset, batch_size=1)

    model.eval()  # turn off dropout

    id_to_label = {idx: name for idx, name in enumerate(label_list, 1)}
    y_true, y_pred = [], []

    for input_ids, label_ids, l_mask, valid_ids in loader:
        input_ids = input_ids.to(device)
        label_ids = label_ids.to(device)
        valid_ids = valid_ids.to(device)
        l_mask = l_mask.to(device)

        with torch.no_grad():
            logits = model(input_ids, labels=None, labels_mask=None,
                           valid_mask=valid_ids)

        predicted_ids = torch.argmax(logits, dim=2).detach().cpu().numpy()
        gold_ids = label_ids.cpu().numpy()

        for row, gold_row in enumerate(gold_ids):
            gold_names, predicted_names = [], []
            for col, gold_id in enumerate(gold_row):
                if not valid_ids[row][col]:  # skip positions without a label
                    continue
                gold_names.append(id_to_label[gold_id])
                predicted_names.append(id_to_label[predicted_ids[row][col]])

            assert len(gold_names) == len(predicted_names)
            y_true.append(gold_names)
            y_pred.append(predicted_names)

    if report:
        report = classification_report(y_true, y_pred, digits=4)
    return y_pred
return f1, precision, recall, report
pytest~=6.0.1
numpy==1.19.4
fairseq==0.9.0
pytorch-transformers
seqeval==0.0.12
pytest~=6.0.1
tqdm
console-menu
fastapi==0.61.1
uvicorn==0.12.2
pandas==1.1.1
spacy==2.3.2
wandb==0.10.7
transformers==4.16.2
torch==1.9.0
......
"""A message of shame -- documentation must be completed."""
import poldeepner2
from poldeepner2.models import PolDeepNer2
model = "pdn2_nkjp_herbert_large_sq_fast_cuda.pdn2"
resources_path = "/tmp"
ner = PolDeepNer2.load(model=model, resources_path=resources_path)
path_or_name = "pdn2-v07-cen-n82-base-01"
ner = poldeepner2.load(path_or_name)
sentences = ["Marek Nowak z Politechniki Wrocławskiej mieszka przy ul. "
"Sądeckiej.",
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment