"""A message of shame -- documentation must be completed."""
from __future__ import absolute_import, division, print_function
import logging
import pathlib
import argparse
import codecs
import os
import json
import time
from typing import List
from tqdm import tqdm
from poldeepner2.models import PolDeepNer2
from poldeepner2.pipeline import tokenization
from poldeepner2.utils.data_utils import get_poleval_dict, wrap_annotations
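# Example invocation (the file names and model path are illustrative only):
#   python process_poleval.py --input poleval_questions.json \
#       --output poleval_ner.json --model /path/to/model --device cuda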
def merge_sentences(sentences: List[List[str]]):
    """Flatten a list of tokenized sentences into a single sentence.

    Args:
        sentences: a list of sentences, each a list of tokens.

    Returns:
        A one-element list containing all tokens concatenated in order.

    """
    flat_list = []
    for sentence in sentences:
        flat_list.extend(sentence)
    return [flat_list]
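# A minimal sketch of merge_sentences behaviour (tokens are illustrative):
#   merge_sentences([["Ala", "ma"], ["kota", "."]])
#   returns [["Ala", "ma", "kota", "."]]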
def main(args):
    """Run NER on the input documents and write a PolEval 2018 JSON file.

    Args:
        args: parsed command-line arguments (see parse_args).

    """
print("Loading the NER model ...")
t0 = time.time()
    if args.tokenization:
        tokenizer = tokenization.load(args.tokenization)
    else:
        tokenizer = None
ner = PolDeepNer2.load(
model=args.model,
pretrained_path=args.pretrained_path,
device=args.device,
max_seq_length=args.max_seq_length,
squeeze=args.squeeze,
seed=args.seed,
tokenizer=tokenizer
)
time_load = time.time() - t0
time_preprocess = 0
time_ner = 0
data_size = 0
dict_list = []
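    # Expected input structure, inferred from the fields accessed below
    # (an assumption based on this script, not a documented schema):
    #   {"questions": [{"input": {"fname": "...", "fileContent": "..."}},
    #                  ...]}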
with open(
os.path.join(pathlib.Path(__file__).parent.absolute(), args.input),
encoding='UTF-8') as f:
sentences = json.load(f)['questions']
    for sentence in tqdm(sentences):
        doc_id = sentence['input']['fname'].replace("/home/a.wawer/poleval/",
                                                    "")
file_content = sentence['input']['fileContent']
data_size += len(file_content)
texts = file_content.split('\n')
t0 = time.time()
tokenized_sentences = ner.tokenizer.tokenize(texts)
time_preprocess += (time.time() - t0)
t0 = time.time()
predictions = ner.process(tokenized_sentences)
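        # Optionally collapse all sentences of the document into one, so
        # that wrapped annotations may span original sentence boundaries.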
if args.merge:
predictions = merge_sentences(predictions)
tokenized_sentences = merge_sentences(tokenized_sentences)
annotations = wrap_annotations(predictions)
        dict_list.append(
            get_poleval_dict(doc_id, file_content, tokenized_sentences,
                             annotations))
time_ner += (time.time() - t0)
    with codecs.open(args.output, "w", "utf8") as out:
        out.write(json.dumps(dict_list, indent=4))
print(f"Model loading time : {time_load:8.4} second(s)")
print(f"Data preprocessing time : {time_preprocess:8.4} second(s)")
print(f"Data NE recognition time : {time_ner:8.4} second(s)")
print(f'Total time : '
f'{time_load + time_preprocess + time_ner:8.4} second(s)')
print(f"Data size: : "
f"{data_size / 1000000:8.4}M characters")
def parse_args():
    """Parse command-line arguments.

    Returns:
        An argparse.Namespace with the parsed options.

    """
    parser = argparse.ArgumentParser(
        description='Process a JSON file with documents and save the '
                    'recognized named entities as a single JSON file in '
                    'the PolEval 2018 NER format')
    parser.add_argument('--input', required=True, metavar='PATH',
                        help='path to a JSON file with documents to process')
parser.add_argument('--output', required=True, metavar='PATH',
help='path to a json output file')
    parser.add_argument('--model', required=True, metavar='PATH',
                        help='model name or path to the model')
parser.add_argument('--pretrained_path', required=False, metavar='PATH',
help='pretrained XLM-Roberta model path')
parser.add_argument('--max_seq_length', required=False, default=256,
metavar='N', type=int,
help='the maximum total input sequence length after '
'WordPiece tokenization.')
parser.add_argument('--device', required=False, default="cpu",
metavar='cpu|cuda',
help='device type used for processing')
parser.add_argument('--tokenization', required=False, default=None,
choices=tokenization.names,
help='Tokenization method')
parser.add_argument('--squeeze', required=False, default=False,
action="store_true",
help='try to squeeze multiple examples into one '
'Input Feature')
    parser.add_argument('--seed', required=False, default=377, metavar='N',
                        type=int,
                        help='a seed used to initialize the random number '
                             'generator')
parser.add_argument('--merge', required=False, default=False,
action="store_true",
help='merge sentences into a single sentence before '
'wrapping labels into annotations')
return parser.parse_args()
if __name__ == "__main__":
logging.basicConfig(level=logging.DEBUG, filemode="w")
args = parse_args()
main(args)