diff --git a/CHANGELOG.md b/CHANGELOG.md
index bf2c25dbbc47a8b70f116d7d608cb26e6d1229c4..a1f5cef19f7026411fcc9063c96a6e8dc19896e6 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,11 @@
 
 # PolDeepNer2 Changelog
 
+## 0.6.4
+### Added
+- Script for processing plain text files.
+
+
 ## 0.6.3
 ### Changed
 - Refactored method for converting sentences into features (`convert_examples_to_features_nosq`)
diff --git a/main.py b/main.py
index 77cb2ecbe9c13baa779e280b7eb8bc24bf4c5b19..b01ec4240fa3cd87c8003824094a1dabaeb3f1b3 100644
--- a/main.py
+++ b/main.py
@@ -87,9 +87,7 @@ def main():
     t0 = time.time()
     train_examples = processor.get_examples(args.data_train, "train")
     logger.info(f"Training data was loaded in {time.time()-t0} second(s)")
-    num_train_optimization_steps = int(
-        len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
-
+
     # preparing model configs
     hidden_size = 1024 if 'large' in args.pretrained_path else (2048 if 'xl' in args.pretrained_path else 768)
     device = args.device
@@ -130,6 +128,16 @@ def main():
 
     if args.wandb:
         wandb.watch(model)
+    train_features = convert_examples_to_features(
+        train_examples, label_list, args.max_seq_length, model.encode_word, args.squeeze)
+
+    # ToDo: Add as a parameter
+    # train_features.extend(convert_examples_to_features(
+    #     train_examples, label_list, args.max_seq_length, model.encode_word, not args.squeeze))
+
+    num_train_optimization_steps = int(
+        len(train_features) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
+
     no_decay = ['bias', 'final_layer_norm.weight']
     params = list(model.named_parameters())
 
@@ -165,9 +173,6 @@ def main():
             model, optimizer, opt_level=args.fp16_opt_level)
 
     # Train the model
-    train_features = convert_examples_to_features(
-        train_examples, label_list, args.max_seq_length, model.encode_word, args.squeeze)
-
     logger.info("***** Running training *****")
     logger.info(" Num examples = %d", len(train_examples))
     logger.info(" Batch size = %d", args.train_batch_size)
diff --git a/process_texts.py b/process_texts.py
new file mode 100644
index 0000000000000000000000000000000000000000..382bafbafa85220bd12aa626bed337e9944e354d
--- /dev/null
+++ b/process_texts.py
@@ -0,0 +1,111 @@
+from __future__ import absolute_import, division, print_function
+
+import codecs
+import logging
+import argparse
+import time
+import glob
+import os
+from pathlib import Path
+
+from tqdm import tqdm
+
+from poldeepner2.models import PolDeepNer2
+from poldeepner2.pipeline import tokenization
+from poldeepner2.utils.data_utils import wrap_annotations
+
+
+def flatten(list_of_lists):
+    flat_list = []
+    for sublist in list_of_lists:
+        flat_list.extend(sublist)
+    return [flat_list]
+
+
+def read_content_autobom(path: str) -> str:
+    """Read a text file, transparently handling an optional UTF-8 BOM."""
+    head_size = min(32, os.path.getsize(path))
+    with open(path, 'rb') as f:
+        head = f.read(head_size)
+    encoding = 'utf-8-sig' if head.startswith(codecs.BOM_UTF8) else 'utf-8'
+    with open(path, "r", encoding=encoding) as f:
+        return f.read()
+
+
+def main(args):
+    print("Loading the NER model ...")
+    t0 = time.time()
+    if args.pretrained_path:
+        tokenizer = tokenization.load(args.tokenization)
+        ner = PolDeepNer2(args.model, args.pretrained_path, device=args.device, max_seq_length=args.max_seq_length,
+                          squeeze=args.squeeze, seed=args.seed, tokenizer=tokenizer)
+    else:
+        ner = PolDeepNer2.load(args.model, device=args.device, resources_path=".models")
+        if args.max_seq_length:
+            ner.max_seq_length = args.max_seq_length
+    time_load = time.time() - t0
+
+    time_preprocess = 0
+    time_ner = 0
+    data_size = 0
+
+    for path in tqdm(glob.glob(args.input + "/*.txt")):
+        content = read_content_autobom(path)
+        data_size += len(content)
+        texts = content.split('\n')
+
+        t0 = time.time()
+        tokenized_sentences = ner.tokenizer.tokenize(texts)
+        time_preprocess += (time.time() - t0)
+
+        t0 = time.time()
+        predictions = ner.process(tokenized_sentences)
+        predictions = flatten(predictions)
+        tokenized_sentences = flatten(tokenized_sentences)
+        annotations = wrap_annotations(predictions)
+        time_ner += (time.time() - t0)
+
+        output = Path(args.output) / Path(path).name
+        with open(output, "w", encoding="utf-8") as fout:
+            for an in annotations:
+                text = " ".join([tokenized_sentences[0][n] for n in an.token_ids])
+                token_start = min(an.token_ids)
+                token_end = max(an.token_ids)
+                fout.write(f"{an.annotation}\t{token_start}\t{token_end}\t{text}\n")
+
+    print(f"Model loading time : {time_load:8.4} second(s)")
+    print(f"Data preprocessing time : {time_preprocess:8.4} second(s)")
+    print(f"Data NE recognition time : {time_ner:8.4} second(s)")
+    print(f"Total time : {time_load+time_preprocess+time_ner:8.4} second(s)")
+    print(f"Data size : {data_size/1000000:8.4}M characters")
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description='Process a set of plain text files from a given folder. The output is saved to another folder.')
+    parser.add_argument('--input', required=True, metavar='PATH', help='path to an input folder with texts')
+    parser.add_argument('--output', required=True, metavar='PATH', help='path to an output folder')
+    parser.add_argument('--model', required=True, metavar='PATH', help='model name or path to a model')
+
+    # Required if the pretrained_path is given
+    parser.add_argument('--pretrained_path', required=False, metavar='PATH', help='pretrained XLM-Roberta model path')
+    parser.add_argument('--max_seq_length', required=False, default=None, metavar='N', type=int,
+                        help='the maximum total input sequence length after WordPiece tokenization.')
+    parser.add_argument('--device', required=False, default="cpu", metavar='cpu|cuda',
+                        help='device type used for processing')
+    parser.add_argument('--tokenization', required=False, default="spacy-ext", choices=tokenization.names,
+                        help='tokenization method')
+    parser.add_argument('--squeeze', required=False, default=False, action="store_true",
+                        help='try to squeeze multiple examples into one Input Feature')
+    parser.add_argument('--seed', required=False, default=377, metavar='N', type=int,
+                        help='a seed used to initialize the random number generator')
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    logging.basicConfig(level=logging.DEBUG, filemode="w")
+    args = parse_args()
+    try:
+        main(args)
+    except ValueError as er:
+        print("[ERROR] %s" % er)
diff --git a/setup.py b/setup.py
index 64d7c754c2b0e74359f867879befc3f79ff73fb3..5c9e547fb3e6d63a009fc3475b800db72d818a42 100644
--- a/setup.py
+++ b/setup.py
@@ -19,7 +19,7 @@ install_requires = [
 
 setuptools.setup(
     name="poldeepner2",
-    version="0.6.3",
+    version="0.6.4",
     author="Michał Marcińczuk",
     author_email="michal.marcinczuk@pwr.edu.pl",
     description="PolDeepNer2 is a tool for sequence labeling tasks based on transformer language models.",
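
For reference, a minimal sketch of the same pipeline used programmatically, following the calls made in process_texts.py above. The model identifier, the example sentence, and the presence of a local ".models" cache directory are illustrative assumptions, not values taken from this change.

    from poldeepner2.models import PolDeepNer2
    from poldeepner2.utils.data_utils import wrap_annotations

    # Placeholder model name; replace with a released PolDeepNer2 model name or a local path.
    ner = PolDeepNer2.load("MODEL_NAME_OR_PATH", device="cpu", resources_path=".models")

    # Tokenize raw text, run recognition, and wrap the per-token labels into annotation spans.
    # process_texts.py additionally flattens the per-sentence predictions of a whole file before
    # calling wrap_annotations; with a single sentence the shapes are already equivalent.
    sentences = ner.tokenizer.tokenize(["Jan Kowalski mieszka we Wrocławiu."])
    predictions = ner.process(sentences)
    annotations = wrap_annotations(predictions)
    for an in annotations:
        print(an.annotation, min(an.token_ids), max(an.token_ids))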