process_poleval.py
    """A message of shame -- documentation must be completed."""
    
    from __future__ import absolute_import, division, print_function
    
    import logging
    import pathlib
    import argparse
    import codecs
    import os
    import json
    import time
    from typing import List
    
    from tqdm import tqdm
    
    from poldeepner2.models import PolDeepNer2
    from poldeepner2.pipeline import tokenization
    from poldeepner2.utils.data_utils import get_poleval_dict, wrap_annotations
    
    
    def merge_sentences(sentences: List[List[str]]) -> List[List[str]]:
        """A message of shame -- documentation must be completed.
    
        Args:
            sentences:A message of shame -- documentation must be completed.
    
        Returns:A message of shame -- documentation must be completed.
    
        """
        flat_list = []
        for sentence in sentences:
            flat_list.extend(sentence)
        return [flat_list]
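
    # Example: merge_sentences flattens per-sentence token lists into a
    # single one-element batch, e.g.
    #   merge_sentences([["Ala", "ma"], ["kota", "."]])
    #   -> [["Ala", "ma", "kota", "."]]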
    
    
    def main(args):
        """A message of shame -- documentation must be completed.
    
        Args:
            args:A message of shame -- documentation must be completed.
    
        Returns:A message of shame -- documentation must be completed.
    
        """
        print("Loading the NER model ...")
        t0 = time.time()
    
        tokenizer = (tokenization.load(args.tokenization)
                     if args.tokenization else None)
        ner = PolDeepNer2.load(
            model=args.model,
            pretrained_path=args.pretrained_path,
            device=args.device,
            max_seq_length=args.max_seq_length,
            squeeze=args.squeeze,
            seed=args.seed,
            tokenizer=tokenizer
        )
    
        time_load = time.time() - t0
    
        time_preprocess = 0
        time_ner = 0
        data_size = 0
    
        dict_list = []
    
        with open(
                os.path.join(pathlib.Path(__file__).parent.absolute(), args.input),
                encoding='UTF-8') as f:
            sentences = json.load(f)['questions']
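            # Each question record carries the source file name and its raw
            # text content; the machine-specific path prefix is stripped to
            # form the document id.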
            for sentence in tqdm(sentences):
                doc_id = sentence['input']['fname'].replace(
                    "/home/a.wawer/poleval/", "")
                file_content = sentence['input']['fileContent']
                data_size += len(file_content)
                texts = file_content.split('\n')
    
                t0 = time.time()
                tokenized_sentences = ner.tokenizer.tokenize(texts)
                time_preprocess += (time.time() - t0)
    
                t0 = time.time()
                predictions = ner.process(tokenized_sentences)
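                # With --merge, predictions and tokens from all sentences are
                # flattened into a single sequence before the labels are
                # wrapped into annotations.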
                if args.merge:
                    predictions = merge_sentences(predictions)
                    tokenized_sentences = merge_sentences(tokenized_sentences)
                annotations = wrap_annotations(predictions)
                dict_list.append(
                    get_poleval_dict(doc_id, file_content,
                                     tokenized_sentences, annotations))
                time_ner += (time.time() - t0)
    
        with codecs.open(args.output, "w", "utf8") as f_out:
            f_out.write(json.dumps(dict_list, indent=4))
    
        print(f"Model loading time          : {time_load:8.4} second(s)")
        print(f"Data preprocessing time     : {time_preprocess:8.4} second(s)")
        print(f"Data NE recognition time    : {time_ner:8.4} second(s)")
        print(f'Total time                  : '
              f'{time_load + time_preprocess + time_ner:8.4} second(s)')
        print(f"Data size:                  : "
              f"{data_size / 1000000:8.4}M characters")
    
    
    def parse_args():
        """A message of shame -- documentation must be completed.
    
        Returns:A message of shame -- documentation must be completed.
    
        """
        parser = argparse.ArgumentParser(
            description='Process a PolEval input file with PolDeepNer2 and '
                        'write the output as a single json file in the '
                        'PolEval 2018 NER format')
        parser.add_argument('--input', required=True, metavar='PATH',
                            help='path to a json file with input documents')
        parser.add_argument('--output', required=True, metavar='PATH',
                            help='path to a json output file')
        parser.add_argument('--model', required=True, metavar='PATH',
                            help='model name or path to a model')
        parser.add_argument('--pretrained_path', required=False, metavar='PATH',
                            help='pretrained XLM-Roberta model path')
        parser.add_argument('--max_seq_length', required=False, default=256,
                            metavar='N', type=int,
                            help='the maximum total input sequence length after '
                                 'WordPiece tokenization.')
        parser.add_argument('--device', required=False, default="cpu",
                            metavar='cpu|cuda',
                            help='device type used for processing')
        parser.add_argument('--tokenization', required=False, default=None,
                            choices=tokenization.names,
                            help='Tokenization method')
        parser.add_argument('--squeeze', required=False, default=False,
                            action="store_true",
                            help='try to squeeze multiple examples into one '
                                 'Input Feature')
        parser.add_argument('--seed', required=False, default=377, metavar='N',
                            type=int,
                            help='a seed used to initialize the random number '
                                 'generator')
        parser.add_argument('--merge', required=False, default=False,
                            action="store_true",
                            help='merge sentences into a single sentence before '
                                 'wrapping labels into annotations')
        return parser.parse_args()
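
    # Example invocation (input/output paths and model name are illustrative):
    #   python process_poleval.py --input poleval_questions.json \
    #       --output results.json --model pdn2_base --device cuda --merge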
    
    
    if __name__ == "__main__":
        logging.basicConfig(level=logging.DEBUG)
        args = parse_args()
        main(args)