Commit fbd6e12e authored by Wiktor Walentynowicz

Merge branch 'dev_plain_text_processing' into 'master'

Dev plain text processing

See merge request !27
parents 9678eae7 f7943c73
1 merge request: !27 Dev plain text processing
Pipeline #4131 passed
@@ -2,6 +2,11 @@
 # PolDeepNer2 Changelog
+## 0.6.4
+### Added
+- Script for processing files with plain text.
 ## 0.6.3
 ### Changed
 - Refactored method for converting sentences into features (`convert_examples_to_features_nosq`)
@@ -87,8 +87,6 @@ def main():
     t0 = time.time()
     train_examples = processor.get_examples(args.data_train, "train")
     logger.info(f"Training data was loaded in {time.time()-t0} second(s)")
-    num_train_optimization_steps = int(
-        len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
     # preparing model configs
     hidden_size = 1024 if 'large' in args.pretrained_path else (2048 if 'xl' in args.pretrained_path else 768)
@@ -130,6 +128,16 @@ def main():
     if args.wandb:
         wandb.watch(model)
+    train_features = convert_examples_to_features(
+        train_examples, label_list, args.max_seq_length, model.encode_word, args.squeeze)
+    # ToDo: Add as a parameter
+    # train_features.extend(convert_examples_to_features(
+    #     train_examples, label_list, args.max_seq_length, model.encode_word, not args.squeeze))
+    num_train_optimization_steps = int(
+        len(train_features) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
     no_decay = ['bias', 'final_layer_norm.weight']
     params = list(model.named_parameters())
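The two hunks above move the `num_train_optimization_steps` computation so that it runs after `convert_examples_to_features` and is based on `len(train_features)` rather than `len(train_examples)`. A minimal sketch of why the two counts can differ when `--squeeze` packs several sentences into a single input feature (all numbers below are illustrative, not taken from the repository):

# Illustrative only: the example and feature counts are hypothetical.
num_examples = 1000   # sentences read from the training set
num_features = 640    # assumed count after squeezing several sentences into one feature
train_batch_size = 16
gradient_accumulation_steps = 1
num_train_epochs = 3

# Same formula as in the hunk above, evaluated on both counts.
steps_from_examples = int(num_examples / train_batch_size / gradient_accumulation_steps) * num_train_epochs
steps_from_features = int(num_features / train_batch_size / gradient_accumulation_steps) * num_train_epochs
print(steps_from_examples, steps_from_features)  # 186 120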
@@ -165,9 +173,6 @@ def main():
         model, optimizer, opt_level=args.fp16_opt_level)
     # Train the model
-    train_features = convert_examples_to_features(
-        train_examples, label_list, args.max_seq_length, model.encode_word, args.squeeze)
     logger.info("***** Running training *****")
     logger.info("  Num examples = %d", len(train_examples))
     logger.info("  Batch size = %d", args.train_batch_size)
from __future__ import absolute_import, division, print_function

import codecs
import logging
import argparse
import time
import glob
import os
from pathlib import Path

from tqdm import tqdm

from poldeepner2.models import PolDeepNer2
from poldeepner2.pipeline import tokenization
from poldeepner2.utils.data_utils import wrap_annotations
def flatten(list_of_lists):
    """Flatten a list of lists into a single flat list wrapped in a one-element list."""
    flat_list = []
    for sublist in list_of_lists:
        flat_list.extend(sublist)
    return [flat_list]
def read_content_autobom(path: str) -> str:
    """Read a UTF-8 text file, detecting and skipping an optional UTF-8 BOM."""
    head_size = min(32, os.path.getsize(path))
    with open(path, 'rb') as f:
        head = f.read(head_size)
    encoding = 'utf-8-sig' if head.startswith(codecs.BOM_UTF8) else 'utf-8'
    with open(path, "r", encoding=encoding) as f:
        return f.read()
def main(args):
    print("Loading the NER model ...")
    t0 = time.time()
    if args.pretrained_path:
        tokenizer = tokenization.load(args.tokenization)
        ner = PolDeepNer2(args.model, args.pretrained_path, device=args.device, max_seq_length=args.max_seq_length,
                          squeeze=args.squeeze, seed=args.seed, tokenizer=tokenizer)
    else:
        ner = PolDeepNer2.load(args.model, device=args.device, resources_path=".models")

    if args.max_seq_length:
        ner.max_seq_length = args.max_seq_length
    time_load = time.time() - t0

    time_preprocess = 0
    time_ner = 0
    data_size = 0

    for path in tqdm(glob.glob(args.input + "/*.txt")):
        content = read_content_autobom(path)
        data_size += len(content)
        texts = content.split('\n')

        t0 = time.time()
        tokenized_sentences = ner.tokenizer.tokenize(texts)
        time_preprocess += (time.time() - t0)

        t0 = time.time()
        predictions = ner.process(tokenized_sentences)
        predictions = flatten(predictions)
        tokenized_sentences = flatten(tokenized_sentences)
        annotations = wrap_annotations(predictions)
        time_ner += (time.time() - t0)

        output = Path(args.output) / Path(path).name
        with open(output, "w") as fout:
            for an in annotations:
                text = " ".join([tokenized_sentences[0][n] for n in an.token_ids])
                token_start = min(an.token_ids)
                token_end = max(an.token_ids)
                fout.write(f"{an.annotation}\t{token_start}\t{token_end}\t{text}\n")
print(f"Model loading time : {time_load:8.4} second(s)")
print(f"Data preprocessing time : {time_preprocess:8.4} second(s)")
print(f"Data NE recognition time : {time_ner:8.4} second(s)")
print(f"Total time : {time_load+time_preprocess+time_ner:8.4} second(s)")
print(f"Data size: : {data_size/1000000:8.4}M characters")
def parse_args():
    parser = argparse.ArgumentParser(
        description='Process a set of plain text files from a given folder. The output is saved to another folder.')
    parser.add_argument('--input', required=True, metavar='PATH', help='path to an input folder with texts')
    parser.add_argument('--output', required=True, metavar='PATH', help='path to an output folder')
    parser.add_argument('--model', required=True, metavar='PATH', help='model name or path to a model')
    # Required if the pretrained_path is given
    parser.add_argument('--pretrained_path', required=False, metavar='PATH', help='pretrained XLM-Roberta model path')
    parser.add_argument('--max_seq_length', required=False, default=None, metavar='N', type=int,
                        help='the maximum total input sequence length after WordPiece tokenization')
    parser.add_argument('--device', required=False, default="cpu", metavar='cpu|cuda',
                        help='device type used for processing')
    parser.add_argument('--tokenization', required=False, default="spacy-ext", choices=tokenization.names,
                        help='tokenization method')
    parser.add_argument('--squeeze', required=False, default=False, action="store_true",
                        help='try to squeeze multiple examples into one Input Feature')
    parser.add_argument('--seed', required=False, default=377, metavar='N', type=int,
                        help='a seed used to initialize the random number generator')
    return parser.parse_args()
if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG, filemode="w")
    args = parse_args()
    try:
        main(args)
    except ValueError as er:
        print("[ERROR] %s" % er)
@@ -19,7 +19,7 @@ install_requires = [
 setuptools.setup(
     name="poldeepner2",
-    version="0.6.3",
+    version="0.6.4",
     author="Michał Marcińczuk",
     author_email="michal.marcinczuk@pwr.edu.pl",
     description="PolDeepNer2 is a tool for sequence labeling tasks based on transformer language models.",