Skip to content
Snippets Groups Projects
Commit 6861972b authored by Michał Marcińczuk's avatar Michał Marcińczuk
Browse files

Option to output more than one label per token.

parent ada9e8fd
No related branches found
No related tags found
1 merge request!41Dev v07
Pipeline #6496 failed
"""A message of shame -- documentation must be completed."""
from pathlib import Path
import yaml
from typing import List
......@@ -24,6 +22,7 @@ class Pdn2ModelConfiguration:
max_seq_length: int = 256
sequence_generator: str = None
seed: int = 101
output_top_k: int = 1
def label_count(self) -> int:
    """Return the number of output classes for the model head.

    This is one more than the configured label list — presumably the extra
    slot is the outside/'O' class (confirm against the classifier layer size).
    """
    configured = len(self.labels)
    return configured + 1
......@@ -103,14 +102,8 @@ class Pdn2TokenClassification(nn.Module):
return logits
def encode_word(self, s):
"""Takes a string and returns a list of token ids.
Args:
self:A message of shame -- documentation must be completed.
s:A message of shame -- documentation must be completed.
Returns:A message of shame -- documentation must be completed.
"""
Takes a string and returns a list of token ids.
"""
tensor_ids = self.tokenizer.encode(s)
# remove <s> and </s> ids
......@@ -134,4 +127,4 @@ class Pdn2TokenClassification(nn.Module):
def load_config(self, path: str):
    """Load the model configuration from 'pdn2_config.yml' under *path*.

    Args:
        path: Directory that contains the pdn2_config.yml file.

    Side effects:
        Sets ``self.config`` to the parsed YAML content.
    """
    with open(str(Path(path) / 'pdn2_config.yml'), 'r') as f:
        # FullLoader resolves standard YAML tags but, unlike yaml.Loader,
        # refuses arbitrary Python object construction — safer for config
        # files that may not be fully trusted.
        self.config = yaml.load(f, Loader=yaml.FullLoader)
"""A message of shame -- documentation must be completed."""
import logging
import os
from typing import List
......@@ -21,7 +18,6 @@ from poldeepner2.utils.sequences import FeatureGeneratorFactory
class PolDeepNer2:
"""A message of shame -- documentation must be completed."""
def __init__(self, path: str, tokenizer: Tokenizer = None,
processor_annotations: List[ProcessorAnnotations] = None, device: str = None):
......@@ -40,6 +36,7 @@ class PolDeepNer2:
examples = [InputExample(guid=str(idx), tokens=tokens, labels=["O"] * len(tokens))
for idx, tokens in enumerate(sentences)]
assert self.model.config.sequence_generator != "union", "In the inference mode the sequence_generator cannot be union"
gen = FeatureGeneratorFactory.create(self.model.config.sequence_generator,
label_list=self.model.config.labels,
max_seq_length=self.model.config.max_seq_length,
......@@ -67,6 +64,27 @@ class PolDeepNer2:
with torch.no_grad():
logits = self.model(input_ids, labels=None, valid_mask=valid_ids)
if self.model.config.output_top_k > 1:
logits = logits.detach().cpu().numpy()
valid_ids = valid_ids.detach().cpu().numpy()
for idx, (scores, is_valid) in enumerate(zip(logits[0], valid_ids[0])):
if is_valid:
label_ids_scores = [(score, label_id) for label_id, score in enumerate(scores)]
label_ids_scores = sorted(label_ids_scores, reverse=True)
score, label_id = label_ids_scores[0]
label_name = label_map[label_id]
labels = [label_name]
labels_score = [(label_name, score)]
if label_name != "O":
for topn in range(1, self.model.config.output_top_k):
score, label_id = label_ids_scores[topn]
label_name = label_map[label_id]
if label_name == "O":
break
labels_score.append((label_name, score))
labels.append(label_name)
y_pred.append("#".join(labels))
else:
logits = torch.argmax(logits, dim=2)
logits = logits.detach().cpu().numpy()
label_ids = label_ids.cpu().numpy()
......@@ -77,6 +95,7 @@ class PolDeepNer2:
y_pred.append(label_map[logits[i][j]])
token_count = sum([len(s) for s in sentences])
assert token_count == len(y_pred), \
f"The number of returned labels differ from the number of " \
f"tokens. Number of tokens: {token_count}, " \
......@@ -95,7 +114,7 @@ class PolDeepNer2:
return sentences_y_pred, stats
def process_text(self, text: str) -> [AnnotationText]:
    """Run NER over raw *text* and align the annotations with character offsets.

    Args:
        text: Raw input text, e.g. "John lives in New York. Mary lives in Chicago".

    Returns:
        A list of AnnotationText objects aligned with the original text.
    """
    sentences = self.tokenizer.tokenize([text])
    # process() returns a (labels, stats) pair; only the labels are needed
    # for building annotations.
    predictions = self.process(sentences)
    annotations = wrap_annotations(predictions[0])
    return align_tokens_with_text(text, sentences, annotations)
def process_document(self, text: str) -> Document:
"""A message of shame -- documentation must be completed.
"""
Process given texts and return Document structure representing the
result of processing.
"""
polem = AnnotationLemmatizerPolem()
......@@ -145,8 +162,7 @@ class PolDeepNer2:
return document
def process_tokenized(self, tokens: [[str]]) -> [[str]]:
"""A message of shame -- documentation must be completed.
"""
@tokens: Array of sentences. Each sentence is an array of words.
[["John", "lives", "in", "New", "York"],
["Mary", "lives", "in", "Chicago"]]
......
......@@ -23,7 +23,6 @@ class NerProcessor:
def get_examples(self, data_path: List[str], data_type="data") -> List[InputExample]:
    """Read every file in *data_path* and convert its content into InputExamples.

    Args:
        data_path: Paths of the data files to read.
        data_type: Tag forwarded to example creation (default "data").

    Returns:
        All examples from all files, concatenated in input order.
    """
    # Debug print of the input paths removed — it polluted stdout on every call.
    examples = []
    for path in data_path:
        examples.extend(self._create_examples(self._read_file(path), data_type))
    return examples
......
"""A message of shame -- documentation must be completed."""
import logging
from random import choice
import torch
......@@ -118,6 +119,10 @@ def evaluate_model(model, eval_dataset, label_list, batch_size, device, model_na
for j, m in enumerate(cur_label):
if valid_ids[i][j]: # if it's a valid label
temp_1.append(label_map[m])
if logits[i][j] not in label_map:
logging.error(f"Key {logits[i][j]} not found in {label_map}. Used default value 'O'")
temp_2.append("O")
else:
temp_2.append(label_map[logits[i][j]])
assert len(temp_1) == len(temp_2)
......
"""A message of shame -- documentation must be completed."""
from __future__ import absolute_import, division, print_function
import argparse
import logging
import os
from poldeepner2.models import PolDeepNer2
import poldeepner2
from poldeepner2.utils.data_utils import read_tsv, save_tsv
from poldeepner2.utils.seed import setup_seed
from poldeepner2.utils.sequences import FeatureGeneratorFactory
def main(args):
    """Run the NER model over a TSV file of sentences and save the predictions.

    Args:
        args: Parsed command-line arguments (input, output, model, device,
            max_seq_length, sequence_generator, output_top_k, seed).
    """
    print("Loading the NER model ...")
    ner = poldeepner2.load(args.model, device=args.device)

    # Allow the command line to override selected values stored in the
    # model's own configuration.
    for param in ["device", "max_seq_length", "sequence_generator", "output_top_k"]:
        value = args.__dict__.get(param, None)
        if value is not None:
            value_default = ner.model.config.__dict__.get(param)
            if str(value) != str(value_default):
                print(f"Forced change of the parameter: {param} '{value_default}' => '{value}'")
            ner.model.config.__dict__[param] = value

    if args.seed is not None:
        setup_seed(args.seed)

    logging.info("Processing ...")
    sentences_labels = read_tsv(os.path.join(args.input))
    sentences = [sentence[0] for sentence in sentences_labels]
    logging.info(f"Number of sentences to process: {len(sentences)}")
    # process() returns a (labels, stats) pair; only the labels are written out.
    predictions, stats = ner.process(sentences, args.max_seq_length)
    save_tsv(os.path.join(args.output), sentences, predictions)
    logging.info("done.")
def parse_args():
    """Define and parse command-line arguments for the TSV processing script.

    Returns:
        argparse.Namespace with the parsed options.
    """
    parser = argparse.ArgumentParser(
        description='Process a single TSV with a NER model')
    # NOTE(review): the merged diff contained duplicate definitions of
    # --input/--model/--max_seq_length/--device, which argparse rejects at
    # runtime; each option is defined exactly once below.
    parser.add_argument('--input', required=True, metavar='PATH', help='path to a file with a list of files')
    parser.add_argument('--model', required=True, metavar='PATH', help='path or name of the model')
    parser.add_argument('--output', required=True, metavar='PATH',
                        help='path to a json output file')
    parser.add_argument('--max_seq_length', required=False, default=None, metavar='N', type=int,
                        help='override default values of the max_seq_length')
    parser.add_argument('--device', default=None, metavar='cpu|cuda',
                        help='override default value of the device')
    parser.add_argument('--sequence-generator', type=str, choices=FeatureGeneratorFactory.methods,
                        help="method of sequence generation", default=None, required=False)
    parser.add_argument('--seed', required=False, default=None, metavar='N', type=int,
                        help='a seed used to initialize a number generator')
    parser.add_argument('--output-top-k', required=False, default=None, metavar='N', type=int,
                        help='output top k labels for each token')
    return parser.parse_args()
if __name__ == "__main__":
logging.basicConfig(level=logging.INFO)
cliargs = parse_args()
try:
main(cliargs)
......
......@@ -3,9 +3,10 @@ pytorch-transformers
seqeval==0.0.12
tqdm
fastapi==0.61.1
PyYAML==5.3.1
uvicorn==0.12.2
pandas==1.1.1
wandb==0.10.7
transformers==4.16.2
torch==1.9.0
torchsummary==1.5.1
\ No newline at end of file
......@@ -6,32 +6,37 @@ import argparse
from fastapi import FastAPI
from typing import List, Optional
import poldeepner2
from poldeepner2.data.token import Token
from poldeepner2.models import PolDeepNer2
from poldeepner2.pipeline.tokenization import TokenizerSpaces, load, names
from pydantic import BaseModel
from starlette.middleware.cors import CORSMiddleware
from poldeepner2.utils.annotation import Annotation
from poldeepner2.utils.sequences import FeatureGeneratorFactory
class PredictionReq(BaseModel):
    """Request payload for the prediction endpoints."""

    # Raw text to process; sentences are split on newlines by the endpoints.
    text: str
    # Tokenizer name; the merged diff carried both the old 'spacy' default and
    # the new 'fast' default — the post-commit value 'fast' is kept.
    tokenization: Optional[str] = 'fast'
class Prediction(BaseModel):
    """Response payload of the /predict endpoint."""
    # Original input text as submitted by the client.
    text: str
    # One list of token strings per sentence.
    tokens: List[List[str]]
    # One list of predicted labels per sentence — presumably aligned
    # one-to-one with *tokens* (confirm against process_tokenized output).
    doc: List[List[str]]
class ResponseToken(BaseModel):
"""A message of shame -- documentation must be completed."""
class Entity(BaseModel):
    """A single named entity: its surface text and its label."""
    text: str
    label: str


class Entities(BaseModel):
    """Response payload of the /entities endpoint: a flat list of entities."""
    entities: List[Entity]
class ResponseToken(BaseModel):
orth: str
lemma: str
start: int
......@@ -40,18 +45,11 @@ class ResponseToken(BaseModel):
@staticmethod
def generate(token: Token):
"""A message of shame -- documentation must be completed.
Args:
token: A message of shame -- documentation must be completed.
Returns:A message of shame -- documentation must be completed.
"""
return {"orth": token.orth, "lemma": token.lemma, "start": token.start,
"end": token.end, "eos": token.eos}
class ResponseAnnotation(BaseModel):
"""A message of shame -- documentation must be completed."""
......@@ -95,28 +93,25 @@ class Server:
allow_headers=['*'])
global spacyTokenizer
spacyTokenizer = load('spacy')
spacyTokenizer = load('fast')
@app.post('/predict', response_model=Prediction)
async def predict(pred_req: PredictionReq):
    """Tokenize the request text and return per-token NER labels.

    Sentences are split on newlines; tokenization uses the module-level
    tokenizer configured at server start-up.
    """
    text = pred_req.text
    sentences = text.split('\n')
    tokens = spacyTokenizer.tokenize(sentences)
    # process_tokenized returns a (labels, stats) pair; only the labels
    # go into the response.
    output, stats = ner.process_tokenized(tokens)
    return Prediction(text=text, tokens=tokens, doc=output)
@app.post('/entities', response_model=Entities)
async def entities(pred_req: PredictionReq):
    # Run the full text pipeline (tokenization + NER + alignment) and
    # expose each annotation as a (text, label) entity.
    text = pred_req.text
    annotations = ner.process_text(text)
    entities = [Entity(text=an.text, label=an.label) for an in annotations]
    return Entities(entities=entities)
@app.post('/polem', response_model=ResponsePolem)
async def polem(pred_req: PredictionReq):
"""A message of shame -- documentation must be completed.
Returns:A message of shame -- documentation must be completed.
"""
text = pred_req.text
doc = ner.process_document(text)
return {"text": text,
......@@ -126,31 +121,21 @@ class Server:
def parse_args():
    """Define and parse command-line arguments for the NER REST server.

    Returns:
        argparse.Namespace with the parsed options.
    """
    parser = argparse.ArgumentParser(
        description='Process a single TSV with a NER model')
    # NOTE(review): the merged diff carried duplicate --max_seq_length/--device
    # definitions (old defaults vs new override-style None defaults); each
    # option is defined exactly once below, keeping the post-commit form.
    parser.add_argument('--model', required=True, metavar='PATH',
                        help='path to NER model')
    parser.add_argument('--processes', help='number of processes', default=1)
    parser.add_argument('--tokenization', required=False, default="spacy-ext",
                        choices=names,
                        help='Tokenization method')
    parser.add_argument('--max_seq_length', required=False, default=None, metavar='N', type=int,
                        help='override default values of the max_seq_length')
    parser.add_argument('--device', default=None, metavar='cpu|cuda',
                        help='override default value of the device')
    parser.add_argument('--sequence-generator', type=str, choices=FeatureGeneratorFactory.methods,
                        help="method of sequence generation", default=None, required=False)
    parser.add_argument('--seed', required=False, default=None, metavar='N', type=int,
                        help='a seed used to initialize a number generator')
    parser.add_argument('--output-top-k', required=False, default=None, metavar='N', type=int,
                        help='output top k labels for each token')
    parser.add_argument('--host', required=False, default="0.0.0.0")
    parser.add_argument('--port', required=False, default=8001, type=int)
    return parser.parse_args()
......@@ -159,18 +144,20 @@ def parse_args():
server = Server()

if __name__ == "__main__":
    args = parse_args()
    try:
        # `global` is redundant at module scope but kept from the original;
        # `ner` is read by the endpoint handlers as a module-level global.
        global ner
        ner = poldeepner2.load(args.model, device=args.device)
        # Allow the command line to override selected values stored in the
        # model's own configuration.
        for param in ["device", "max_seq_length", "sequence_generator", "output_top_k"]:
            value = args.__dict__.get(param, None)
            if value is not None:
                value_default = ner.model.config.__dict__.get(param)
                if str(value) != str(value_default):
                    print(f"Forced change of the parameter: {param} '{value_default}' => '{value}'")
                ner.model.config.__dict__[param] = value
        # threaded=True, processes=cliargs.processes
        uvicorn.run(server.app, host=args.host, port=args.port, log_level="info")
    except ValueError as er:
        print("[ERROR] %s" % er)
......@@ -12,12 +12,12 @@ install_requires = [
"pandas==1.1.1",
"transformers==4.2.1",
"tqdm",
"torch==1.12.1"
]
setuptools.setup(
name="poldeepner2",
version="0.7.1",
author="Michał Marcińczuk",
author_email="michal.marcinczuk@pwr.edu.pl",
description="PolDeepNer2 is a tool for sequence labeling tasks based on "
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment