diff --git a/poldeepner2/model/hf_for_token_calssification.py b/poldeepner2/model/hf_for_token_calssification.py index fecc8b3d930b50277e99ef8464bc2cd7d51f5435..92a428f5d1e8f18a6006756b42fba2ca848da4b7 100644 --- a/poldeepner2/model/hf_for_token_calssification.py +++ b/poldeepner2/model/hf_for_token_calssification.py @@ -1,5 +1,3 @@ -"""A message of shame -- documentation must be completed.""" - from pathlib import Path import yaml from typing import List @@ -24,6 +22,7 @@ class Pdn2ModelConfiguration: max_seq_length: int = 256 sequence_generator: str = None seed: int = 101 + output_top_k: int = 1 def label_count(self) -> int: return len(self.labels) + 1 @@ -103,14 +102,8 @@ class Pdn2TokenClassification(nn.Module): return logits def encode_word(self, s): - """Takes a string and returns a list of token ids. - - Args: - self:A message of shame -- documentation must be completed. - s:A message of shame -- documentation must be completed. - - Returns:A message of shame -- documentation must be completed. - + """ + Takes a string and returns a list of token ids. """ tensor_ids = self.tokenizer.encode(s) # remove <s> and </s> ids @@ -134,4 +127,4 @@ class Pdn2TokenClassification(nn.Module): def load_config(self, path: str): with open(str(Path(path) / 'pdn2_config.yml'), 'r') as f: - self.config = yaml.load(f, Loader=yaml.FullLoader) + self.config = yaml.load(f, Loader=yaml.Loader) diff --git a/poldeepner2/models.py b/poldeepner2/models.py index c63794f0fb469d0b7f2ba013216773746a28c169..832331a36bbded7669a186ad582e07fd45ae5dc3 100644 --- a/poldeepner2/models.py +++ b/poldeepner2/models.py @@ -1,6 +1,3 @@ -"""A message of shame -- documentation must be completed.""" - -import logging import os from typing import List @@ -21,7 +18,6 @@ from poldeepner2.utils.sequences import FeatureGeneratorFactory class PolDeepNer2: - """A message of shame -- documentation must be completed.""" def __init__(self, path: str, tokenizer: Tokenizer = None, processor_annotations: List[ProcessorAnnotations] = None, device: str = None): @@ -40,6 +36,7 @@ class PolDeepNer2: examples = [InputExample(guid=str(idx), tokens=tokens, labels=["O"] * len(tokens)) for idx, tokens in enumerate(sentences)] + assert self.model.config.sequence_generator != "union", "In the inference mode the sequence_generator cannot be union" gen = FeatureGeneratorFactory.create(self.model.config.sequence_generator, label_list=self.model.config.labels, max_seq_length=self.model.config.max_seq_length, @@ -67,16 +64,38 @@ class PolDeepNer2: with torch.no_grad(): logits = self.model(input_ids, labels=None, valid_mask=valid_ids) - logits = torch.argmax(logits, dim=2) - logits = logits.detach().cpu().numpy() - label_ids = label_ids.cpu().numpy() - - for i, cur_label in enumerate(label_ids): - for j, m in enumerate(cur_label): - if valid_ids[i][j]: - y_pred.append(label_map[logits[i][j]]) + if self.model.config.output_top_k > 1: + logits = logits.detach().cpu().numpy() + valid_ids = valid_ids.detach().cpu().numpy() + for idx, (scores, is_valid) in enumerate(zip(logits[0], valid_ids[0])): + if is_valid: + label_ids_scores = [(score, label_id) for label_id, score in enumerate(scores)] + label_ids_scores = sorted(label_ids_scores, reverse=True) + score, label_id = label_ids_scores[0] + label_name = label_map[label_id] + labels = [label_name] + labels_score = [(label_name, score)] + if label_name != "O": + for topn in range(1, self.model.config.output_top_k): + score, label_id = label_ids_scores[topn] + label_name = label_map[label_id] + if label_name == "O": + break + labels_score.append((label_name, score)) + labels.append(label_name) + y_pred.append("#".join(labels)) + else: + logits = torch.argmax(logits, dim=2) + logits = logits.detach().cpu().numpy() + label_ids = label_ids.cpu().numpy() + + for i, cur_label in enumerate(label_ids): + for j, m in enumerate(cur_label): + if valid_ids[i][j]: + y_pred.append(label_map[logits[i][j]]) token_count = sum([len(s) for s in sentences]) + assert token_count == len(y_pred), \ f"The number of returned labels differ from the number of " \ f"tokens. Number of tokens: {token_count}, " \ @@ -95,7 +114,7 @@ class PolDeepNer2: return sentences_y_pred, stats def process_text(self, text: str) -> [AnnotationText]: - """A message of shame -- documentation must be completed. + """ @texts: Array of sentences. Each sentence is a string. "John lives in New York. Mary lives in Chicago" @@ -108,15 +127,13 @@ class PolDeepNer2: """ sentences = self.tokenizer.tokenize([text]) predictions = self.process(sentences) - annotations = wrap_annotations(predictions) + annotations = wrap_annotations(predictions[0]) return align_tokens_with_text(text, sentences, annotations) def process_document(self, text: str) -> Document: - """A message of shame -- documentation must be completed. - + """ Process given texts and return Document structure representing the result of processing. - """ polem = AnnotationLemmatizerPolem() @@ -145,8 +162,7 @@ class PolDeepNer2: return document def process_tokenized(self, tokens: [[str]]) -> [[str]]: - """A message of shame -- documentation must be completed. - + """ @tokens: Array of sentences. Each sentence is an array of words. [["John", "lives", "in", "New", "York"], ["Mary", "lives", "in", "Chicago"]] diff --git a/poldeepner2/utils/data_utils.py b/poldeepner2/utils/data_utils.py index f855f00ca2a39a352404805b39333e08bfc7b484..b3506408aa432960ffccdf18a924d131fe9a690e 100644 --- a/poldeepner2/utils/data_utils.py +++ b/poldeepner2/utils/data_utils.py @@ -23,7 +23,6 @@ class NerProcessor: def get_examples(self, data_path: List[str], data_type="data") -> List[InputExample]: examples = [] - print(data_path) for path in data_path: examples.extend(self._create_examples(self._read_file(path), data_type)) return examples diff --git a/poldeepner2/utils/train_utils.py b/poldeepner2/utils/train_utils.py index 27302d5995316bc64fb85888fcb08e744b829de5..871e0f1294f9b077e26cda5584e13e741a7b3042 100644 --- a/poldeepner2/utils/train_utils.py +++ b/poldeepner2/utils/train_utils.py @@ -1,4 +1,5 @@ """A message of shame -- documentation must be completed.""" +import logging from random import choice import torch @@ -118,7 +119,11 @@ def evaluate_model(model, eval_dataset, label_list, batch_size, device, model_na for j, m in enumerate(cur_label): if valid_ids[i][j]: # if it's a valid label temp_1.append(label_map[m]) - temp_2.append(label_map[logits[i][j]]) + if logits[i][j] not in label_map: + logging.error(f"Key {logits[i][j]} not found in {label_map}. Used default value 'O'") + temp_2.append("O") + else: + temp_2.append(label_map[logits[i][j]]) assert len(temp_1) == len(temp_2) # All labels are joined into a single sequence to merge annotations which were split between sequences. diff --git a/process_tsv.py b/process_tsv.py index 4fe0d70787a3e939f6f6f574c9bfc19b55529d09..4376ccac9fa68027f68a0d8b5418df0511bf4b43 100644 --- a/process_tsv.py +++ b/process_tsv.py @@ -1,76 +1,63 @@ -"""A message of shame -- documentation must be completed.""" - from __future__ import absolute_import, division, print_function import argparse import logging import os -from poldeepner2.models import PolDeepNer2 +import poldeepner2 from poldeepner2.utils.data_utils import read_tsv, save_tsv +from poldeepner2.utils.seed import setup_seed +from poldeepner2.utils.sequences import FeatureGeneratorFactory def main(args): - """A message of shame -- documentation must be completed. + print("Loading the NER model ...") + + ner = poldeepner2.load(args.model, device=args.device) - Args: - args:A message of shame -- documentation must be completed. + for param in ["device", "max_seq_length", "sequence_generator", "output_top_k"]: + value = args.__dict__.get(param, None) + if value is not None: + value_default = ner.model.config.__dict__.get(param) + if str(value) != str(value_default): + print(f"Forced change of the parameter: {param} '{value_default}' => '{value}'") + ner.model.config.__dict__[param] = value - """ - logging.info("Loading the NER model ...") - # ner = PolDeepNer2(args.model, args.pretrained_path, args.device, - # args.squeeze, args.max_seq_length) - ner = PolDeepNer2.load( - model=args.model, - pretrained_path=args.pretrained_path, - device=args.device, - max_seq_length=args.max_seq_length, - squeeze=args.squeeze, - # seed=args.seed - ) + if args.seed is not None: + setup_seed(args.seed) logging.info("Processing ...") sentences_labels = read_tsv(os.path.join(args.input)) sentences = [sentence[0] for sentence in sentences_labels] logging.info(f"Number of sentences to process: {len(sentences)}") - predictions = ner.process(sentences, args.max_seq_length) + predictions, stats = ner.process(sentences, args.max_seq_length) save_tsv(os.path.join(args.output), sentences, predictions) logging.info("done.") def parse_args(): - """A message of shame -- documentation must be completed. - - Returns: A message of shame -- documentation must be completed. - - """ parser = argparse.ArgumentParser( description='Process a single TSV with a NER model') - parser.add_argument('--input', required=True, metavar='PATH', - help='path to a file with a list of files') + parser.add_argument('--input', required=True, metavar='PATH', help='path to a file with a list of files') + parser.add_argument('--model', required=True, metavar='PATH', help='path or name of the model') parser.add_argument('--output', required=True, metavar='PATH', help='path to a json output file') - parser.add_argument('--model', required=True, metavar='PATH', - help='path to NER model') - parser.add_argument('--pretrained_path', required=False, metavar='PATH', - help='pretrained XLM-Roberta model path') - parser.add_argument('--max_seq_length', required=False, default=256, - metavar='N', type=int, - help='the maximum total input sequence length after ' - 'WordPiece tokenization.') - parser.add_argument('--device', required=False, default="cpu", - metavar='cpu|cuda', - help='device type used for processing') - parser.add_argument('--squeeze', required=False, default=False, - help='try to squeeze multiple examples into one ' - 'Input Feature') + parser.add_argument('--max_seq_length', required=False, default=None, metavar='N', type=int, + help='override default values of the max_seq_length') + parser.add_argument('--device', default=None, metavar='cpu|cuda', + help='override default value of the device') + parser.add_argument('--sequence-generator', type=str, choices=FeatureGeneratorFactory.methods, + help="method of sequence generation", default=None, required=False) + parser.add_argument('--seed', required=False, default=None, metavar='N', type=int, + help='a seed used to initialize a number generator') + parser.add_argument('--output-top-k', required=False, default=None, metavar='N', type=int, + help='output top k labels for each token') return parser.parse_args() if __name__ == "__main__": - logging.basicConfig(level=logging.INFO) cliargs = parse_args() try: main(cliargs) diff --git a/requirements.txt b/requirements.txt index 47053e257f8c6f5d6c799c64b209c16dead15662..83fed8f2c32135d6322b31f82457262e4c5909f1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,9 +3,10 @@ pytorch-transformers seqeval==0.0.12 tqdm fastapi==0.61.1 +PyYAML==5.3.1 uvicorn==0.12.2 pandas==1.1.1 wandb==0.10.7 transformers==4.16.2 -torch==1.9.0. +torch==1.9.0 torchsummary==1.5.1 \ No newline at end of file diff --git a/server.py b/server.py index 644d7f8ab6aa629182ae7b062680d7ad403515ba..0a5a2e0c496c30b4007bf81afa17cffab3866f5b 100644 --- a/server.py +++ b/server.py @@ -6,32 +6,37 @@ import argparse from fastapi import FastAPI from typing import List, Optional +import poldeepner2 from poldeepner2.data.token import Token -from poldeepner2.models import PolDeepNer2 from poldeepner2.pipeline.tokenization import TokenizerSpaces, load, names from pydantic import BaseModel from starlette.middleware.cors import CORSMiddleware from poldeepner2.utils.annotation import Annotation +from poldeepner2.utils.sequences import FeatureGeneratorFactory class PredictionReq(BaseModel): - """A message of shame -- documentation must be completed.""" - text: str - tokenization: Optional[str] = 'spacy' + tokenization: Optional[str] = 'fast' class Prediction(BaseModel): - """A message of shame -- documentation must be completed.""" - text: str + tokens: List[List[str]] doc: List[List[str]] -class ResponseToken(BaseModel): - """A message of shame -- documentation must be completed.""" +class Entity(BaseModel): + text: str + label: str + +class Entities(BaseModel): + entities: List[Entity] + + +class ResponseToken(BaseModel): orth: str lemma: str start: int @@ -40,18 +45,11 @@ class ResponseToken(BaseModel): @staticmethod def generate(token: Token): - """A message of shame -- documentation must be completed. - - Args: - token: A message of shame -- documentation must be completed. - - Returns:A message of shame -- documentation must be completed. - - """ return {"orth": token.orth, "lemma": token.lemma, "start": token.start, "end": token.end, "eos": token.eos} + class ResponseAnnotation(BaseModel): """A message of shame -- documentation must be completed.""" @@ -95,28 +93,25 @@ class Server: allow_headers=['*']) global spacyTokenizer - spacyTokenizer = load('spacy') + spacyTokenizer = load('fast') @app.post('/predict', response_model=Prediction) async def predict(pred_req: PredictionReq): - """A message of shame -- documentation must be completed. - - Returns:A message of shame -- documentation must be completed. - - """ text = pred_req.text sentences = text.split('\n') tokens = spacyTokenizer.tokenize(sentences) - output = ner.process_tokenized(tokens) - return {"text": text, "entities": output} + output, stats = ner.process_tokenized(tokens) + return Prediction(text=text, tokens=tokens, doc=output) + + @app.post('/entities', response_model=Entities) + async def entities(pred_req: PredictionReq): + text = pred_req.text + annotations = ner.process_text(text) + entities = [Entity(text=an.text, label=an.label) for an in annotations] + return Entities(entities=entities) @app.post('/polem', response_model=ResponsePolem) async def polem(pred_req: PredictionReq): - """A message of shame -- documentation must be completed. - - Returns:A message of shame -- documentation must be completed. - - """ text = pred_req.text doc = ner.process_document(text) return {"text": text, @@ -126,31 +121,21 @@ class Server: def parse_args(): - """A message of shame -- documentation must be completed.""" parser = argparse.ArgumentParser( description='Process a single TSV with a NER model') parser.add_argument('--model', required=True, metavar='PATH', help='path to NER model') - parser.add_argument('--device', required=False, default="cpu", - metavar='cpu|cuda', - help='device type used for processing') - parser.add_argument('--max_seq_length', required=False, default=256, - metavar='N', type=int, - help='the maximum total input sequence length after ' - 'WordPiece tokenization.') - parser.add_argument('--pretrained_path', required=False, metavar='PATH', - default=None, - help='pretrained XLM-Roberta model path with model ' - 'name as prefix, ' - 'a.e automodel:allegro/herbert-large-cased') - parser.add_argument('--processes', help='number of processes', default=1) - parser.add_argument('--tokenization', required=False, default="spacy-ext", - choices=names, - help='Tokenization method') - parser.add_argument('--squeeze', required=False, default=False, - action="store_true", - help='try to squeeze multiple examples into one ' - 'Input Feature') + parser.add_argument('--max_seq_length', required=False, default=None, metavar='N', type=int, + help='override default values of the max_seq_length') + parser.add_argument('--device', default=None, metavar='cpu|cuda', + help='override default value of the device') + parser.add_argument('--sequence-generator', type=str, choices=FeatureGeneratorFactory.methods, + help="method of sequence generation", default=None, required=False) + parser.add_argument('--seed', required=False, default=None, metavar='N', type=int, + help='a seed used to initialize a number generator') + parser.add_argument('--output-top-k', required=False, default=None, metavar='N', type=int, + help='output top k labels for each token') + parser.add_argument('--host', required=False, default="0.0.0.0") parser.add_argument('--port', required=False, default=8001, type=int) return parser.parse_args() @@ -159,18 +144,20 @@ def parse_args(): server = Server() if __name__ == "__main__": - cliargs = parse_args() + args = parse_args() try: global ner - ner = PolDeepNer2.load(cliargs.model, - pretrained_path=cliargs.pretrained_path, - device=cliargs.device, - max_seq_length=cliargs.max_seq_length, - squeeze=cliargs.squeeze, - tokenizer=TokenizerSpaces()) + ner = poldeepner2.load(args.model, device=args.device) + + for param in ["device", "max_seq_length", "sequence_generator", "output_top_k"]: + value = args.__dict__.get(param, None) + if value is not None: + value_default = ner.model.config.__dict__.get(param) + if str(value) != str(value_default): + print(f"Forced change of the parameter: {param} '{value_default}' => '{value}'") + ner.model.config.__dict__[param] = value # threaded=True, processes=cliargs.processes - uvicorn.run(server.app, host=cliargs.host, port=cliargs.port, - log_level="info") + uvicorn.run(server.app, host=args.host, port=args.port, log_level="info") except ValueError as er: print("[ERROR] %s" % er) diff --git a/setup.py b/setup.py index 2764b4e498c5450da99a6d4d46144040b472b6f8..a28ed2e3fb65953827babc28ae2c4026b0a614d7 100644 --- a/setup.py +++ b/setup.py @@ -12,12 +12,12 @@ install_requires = [ "pandas==1.1.1", "transformers==4.2.1", "tqdm", - "torch==1.7.1" + "torch==1.12.1" ] setuptools.setup( name="poldeepner2", - version="0.8.0-alpha.1+001", + version="0.7.1", author="Michał Marcińczuk", author_email="michal.marcinczuk@pwr.edu.pl", description="PolDeepNer2 is a tool for sequence labeling tasks based on "