diff --git a/CHANGELOG.md b/CHANGELOG.md index 5d7cd87773a46c98797d378a195ef8d818df1728..60459e3413a835a1885fdbe6471271c8aa5b7a68 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,10 @@ # PolDeepNer2 Changelog +## 0.6.7 +### Changed +- Fixed tokens with empty subtokens. + ## 0.6.6 ### Added - Script for batch training. diff --git a/Dockerfiles/base/Dockerfile b/Dockerfiles/base/Dockerfile index 10f145e7ecd551a17a88a0ddab8b734f5e797b39..5a90bf1b8b9c6dabc778296fb801eeae0d8ef013 100644 --- a/Dockerfiles/base/Dockerfile +++ b/Dockerfiles/base/Dockerfile @@ -10,7 +10,7 @@ ENV LANG en_US.UTF-8 ENV LANGUAGE en_US:en ENV LC_ALL en_US.UTF-8 -# Python 3.6 +# Python 3.8 #RUN apt-get install -y software-properties-common vim RUN apt-get install -y python3.8 python3-pip RUN python3.8 --version diff --git a/Dockerfiles/kpwr_n82_base/Dockerfile b/Dockerfiles/kpwr_n82_base/Dockerfile index 509714a39b1681b2b4069db24e86754ce14c4b25..905a9c74fe3e7e101fbc40fbee9f9ece40f81f02 100644 --- a/Dockerfiles/kpwr_n82_base/Dockerfile +++ b/Dockerfiles/kpwr_n82_base/Dockerfile @@ -8,4 +8,4 @@ RUN rm kpwr_n82_base.zip EXPOSE 8000 -CMD python3.6 server.py --model models/kpwr_n82_base/kpwr_n82_base --pretrained_path xlmr:models/roberta_base_fairseq +CMD python3.8 server.py --model models/kpwr_n82_base/kpwr_n82_base --pretrained_path xlmr:models/roberta_base_fairseq diff --git a/Dockerfiles/kpwr_n82_large/Dockerfile b/Dockerfiles/kpwr_n82_large/Dockerfile index e7759a7010478dfa8deb8131158361cae6fba07b..63bf6f62981ca3bbf0dde9c8ceb15022833258af 100644 --- a/Dockerfiles/kpwr_n82_large/Dockerfile +++ b/Dockerfiles/kpwr_n82_large/Dockerfile @@ -8,4 +8,4 @@ RUN rm roberta_large_fairseq.zip EXPOSE 8000 -CMD python3.6 server.py --model models/kpwr_n82_large/kpwr_n82_large --pretrained_path xlmr:models/roberta_base_fairseq +CMD python3.8 server.py --model models/kpwr_n82_large/kpwr_n82_large --pretrained_path xlmr:models/roberta_base_fairseq diff --git a/Dockerfiles/merged-base/Dockerfile b/Dockerfiles/merged-base/Dockerfile index f7590f453d92af4760be8d3de85172e76b73fe9d..7ac3c86fad02f4cfbf75c0843add55cb368becb0 100644 --- a/Dockerfiles/merged-base/Dockerfile +++ b/Dockerfiles/merged-base/Dockerfile @@ -10,9 +10,9 @@ ENV LANG en_US.UTF-8 ENV LANGUAGE en_US:en ENV LC_ALL en_US.UTF-8 -# Python 3.6 +# Python 3.8 RUN apt-get install -y software-properties-common vim -RUN apt-get install -y python3.6 python3-pip +RUN apt-get install -y python3.8 python3-pip # update pip RUN pip3 install pip --upgrade @@ -22,7 +22,7 @@ RUN pip3 install wheel WORKDIR "/poldeepner2" ADD ./requirements.txt /poldeepner2/requirements.txt RUN pip3 install -r requirements.txt -RUN python3.6 -m spacy download pl_core_news_sm +RUN python3.8 -m spacy download pl_core_news_sm RUN apt-get install -y wget RUN apt-get install -y unzip @@ -43,4 +43,4 @@ COPY . . 
EXPOSE 8000 -CMD python3.6 server.py --model models/kpwr_n82_base/kpwr_n82_base --pretrained_path xlmr:models/roberta_base_fairseq +CMD python3.8 server.py --model models/kpwr_n82_base/kpwr_n82_base --pretrained_path xlmr:models/roberta_base_fairseq diff --git a/Dockerfiles/merged-large/Dockerfile b/Dockerfiles/merged-large/Dockerfile index 425df77af810526ecfa8bc8ca5d6c904104db03f..1592e7d6ec1425c7aea725541309bd541801dafa 100644 --- a/Dockerfiles/merged-large/Dockerfile +++ b/Dockerfiles/merged-large/Dockerfile @@ -10,9 +10,9 @@ ENV LANG en_US.UTF-8 ENV LANGUAGE en_US:en ENV LC_ALL en_US.UTF-8 -# Python 3.6 +# Python 3.8 RUN apt-get install -y software-properties-common vim -RUN apt-get install -y python3.6 python3-pip +RUN apt-get install -y python3.8 python3-pip # update pip RUN pip3 install pip --upgrade @@ -22,7 +22,7 @@ RUN pip3 install wheel WORKDIR "/poldeepner2" ADD ./requirements.txt /poldeepner2/requirements.txt RUN pip3 install -r requirements.txt -RUN python3.6 -m spacy download pl_core_news_sm +RUN python3.8 -m spacy download pl_core_news_sm RUN apt-get install -y wget RUN apt-get install -y unzip @@ -43,4 +43,4 @@ COPY . . EXPOSE 8000 -CMD python3.6 server.py --model models/kpwr_n82_large/kpwr_n82_large --pretrained_path xlmr:models/roberta_base_fairseq +CMD python3.8 server.py --model models/kpwr_n82_large/kpwr_n82_large --pretrained_path xlmr:models/roberta_base_fairseq diff --git a/Dockerfiles/nkjp_base/Dockerfile b/Dockerfiles/nkjp_base/Dockerfile index c44168190ccb10b0e03c0d5f711b899685d3c144..6950e343cfdee53d8a521d1615fd397153c506f3 100644 --- a/Dockerfiles/nkjp_base/Dockerfile +++ b/Dockerfiles/nkjp_base/Dockerfile @@ -8,4 +8,4 @@ RUN rm nkjp_base.zip EXPOSE 8000 -CMD python3.6 server.py --model models/nkjp_base/nkjp_base --pretrained_path xlmr:models/roberta_base_fairseq +CMD python3.8 server.py --model models/nkjp_base/nkjp_base --pretrained_path xlmr:models/roberta_base_fairseq diff --git a/README.md b/README.md index e1fef629dbdfcd915882eecba958b242b5220283..0fc14621ebb22e7febfdb031195a961411a22313 100644 --- a/README.md +++ b/README.md @@ -18,9 +18,9 @@ It offers a set of pretrained models for Polish. The main features are: ### Requirements -* Python 3.6 +* Python 3.8 * CUDA 10.0+ -* PyTorch 1.7 +* PyTorch 1.9 ### Virtual environment @@ -29,7 +29,7 @@ It offers a set of pretrained models for Polish. The main features are: ``` sudo apt-get install python3-pip python3-dev python-virtualenv sudo pip install -U pip -virtualenv -p python3.6 venv +virtualenv -p python3.8 venv source venv/bin/activate pip install -U pip pip install -r requirements.txt @@ -38,9 +38,9 @@ pip install -r requirements.txt #### Conda ``` -conda create -n pdn2 python=3.6 +conda create -n pdn2 python=3.8 conda activate pdn2 -conda install -c anaconda cudatoolkit=10.1 +conda install -c anaconda cudatoolkit=10.2 conda install -c anaconda cudnn pip install -r requirements.txt ``` diff --git a/augment_dataset.py b/augment_dataset.py index 29c4fd3c6e10126de40f45715a37f8c094589a92..53e06f0a8c9c3d9303b380cebb7225936f88ec5e 100644 --- a/augment_dataset.py +++ b/augment_dataset.py @@ -1,3 +1,5 @@ +"""A message of shame -- documentation must be completed.""" + from __future__ import absolute_import, division, print_function import argparse @@ -9,12 +11,26 @@ from poldeepner2.utils.data_utils import read_tsv def write_sentence(fout: str, tokens: List[str], labels: List[str]): + """A message of shame -- documentation must be completed. 
+ + Args: + fout: str + tokens: List[str] + labels: List[str] + + """ for token, label in zip(tokens, labels): fout.write("%s\t%s\n" % (token, label)) fout.write("\n") def main(args): + """A message of shame -- documentation must be completed. + + Args: + args:A message of shame -- documentation must be completed. + + """ sentences_labels = read_tsv(args.input, True) with codecs.open(args.output, "w", "utf8") as fout: for sentence, labels in sentences_labels: @@ -23,22 +39,33 @@ def main(args): if args.upper: logging.info("Augment data — upper case") for sentence, labels in sentences_labels: - write_sentence(fout, [token.upper() for token in sentence], labels) + write_sentence(fout, [token.upper() for token in sentence], + labels) if args.lower: logging.info("Augment data — lower case") for sentence, labels in sentences_labels: - write_sentence(fout, [token.lower() for token in sentence], labels) + write_sentence(fout, [token.lower() for token in sentence], + labels) def parse_args(): + """A message of shame -- documentation must be completed. + + Returns: parser.parse_args() + + """ parser = argparse.ArgumentParser( description='Process a single TSV with a NER model') - parser.add_argument('--input', required=True, metavar='PATH', help='path to a TSV file') - parser.add_argument('--output', required=True, metavar='PATH', help='path to save the augmented dataset') - parser.add_argument('--lower', required=False, default=False, action="store_true", + parser.add_argument('--input', required=True, metavar='PATH', + help='path to a TSV file') + parser.add_argument('--output', required=True, metavar='PATH', + help='path to save the augmented dataset') + parser.add_argument('--lower', required=False, default=False, + action="store_true", help='augment lower-case data') - parser.add_argument('--upper', required=False, default=False, action="store_true", + parser.add_argument('--upper', required=False, default=False, + action="store_true", help='augment upper-case data') return parser.parse_args() diff --git a/config.cfg b/config.cfg new file mode 100644 index 0000000000000000000000000000000000000000..78e00ba3dcaaeab11e65a7a1921e7e424b3f91ef --- /dev/null +++ b/config.cfg @@ -0,0 +1,55 @@ +[model] +device = cpu +gpu_num = 0 +path = /mnt/sda/pdn2scripts/nkjp_base +pretrained_path = /mnt/sda/pdn2scripts/roberta_base + +[predict] +device = cpu +save_to_file = true +path = /mnt/sda/pdn2scripts/roberta_base +max_seq_len = 100 +path_to_save = predict_res.txt + +[evaluate] +device = cpu +gpu_num = 0 +path = E:/ClarinProjects/nkjp_base +pretrained_path = ./roberta_base +squeeze = false +max_seq_len = 100 +hidden_size = 32 +dropout = 0.05 + +[data] +tag_column_index = 3 +eval_path = data/coNLL-2003/test.txt +pred_path = tests/resources/text_krakow.txt + +[train] +adam_epsilon = 0.1 +data_test = data/coNLL-2003/test.txt +data_train = data/coNLL-2003/train.txt +data_tune = data/coNLL-2003/valid.txt +device = cuda +dropout = 0.05 +epoch_save_model = True +eval_batch_size = 16 +fp16 = false +fp16_opt_level = '' +freeze_model = True +gradient_accumulation_steps = 5 +hidden_size = 32 +learning_rate = 0.001 +max_grad_norm = 5 +max_seq_length = 32 +num_train_epochs = 100 +output_dir = test_res +pretrained_path = /mnt/sda/pdn2scripts/roberta_base +seed = 42 +squeeze = true +train_batch_size = 16 +training_mix = False +transfer = None +warmup_proportion = 0.3 +weight_decay = 0.1 diff --git a/core/poldeepner.py b/core/poldeepner.py new file mode 100644 index
0000000000000000000000000000000000000000..317c5f2b53791afd26a38550d32920190e75bfe2 --- /dev/null +++ b/core/poldeepner.py @@ -0,0 +1,161 @@ +"""A message of shame -- documentation must be completed.""" + +import codecs +import os +import torch +# import tqdm NOT USED +from torch.utils.data.dataloader import DataLoader + +from core.model.xlmr_for_token_classification import XLMRForTokenClassification +from core.utils.data_utils import InputExample, convert_examples_to_features, \ + create_dataset, read_params, wrap_annotations, align_tokens_with_text +from core.utils.tokenization import TokenizerSpaces + + +class PolDeepNer2: + """A message of shame -- documentation must be completed.""" + + def __init__(self, model_path, pretrained_path, + device="cpu", squeeze=False, max_seq_length=256, + tokenizer=TokenizerSpaces()): + """A message of shame -- documentation must be completed. + + Args: + model_path:A message of shame -- documentation must be completed. + pretrained_path:A message of shame -- documentation must be + completed. + device:A message of shame -- documentation must be completed. + squeeze:A message of shame -- documentation must be completed. + max_seq_length:A message of shame -- documentation must be + completed. + tokenizer:A message of shame -- documentation must be completed. + + """ + if not os.path.exists(model_path): + + raise ValueError("Model not found on path '%s'" % model_path) + + if not os.path.exists(pretrained_path): + raise ValueError("RoBERTa language model not found on path '%s'" + % pretrained_path) + + dropout, num_labels, label_list = read_params(model_path) + self.label_list = label_list + model = XLMRForTokenClassification(pretrained_path=pretrained_path, + n_labels=len(self.label_list) + 1, + dropout_p=dropout, + device=device, + hidden_size=768 + if 'base' in pretrained_path + else 1024) + state_dict = torch.load( + open(os.path.join(model_path, 'model.pt'), 'rb')) + model.load_state_dict(state_dict) + model.eval() + model.to(device) + self.model = model + self.device = device + self.squeeze = squeeze + self.max_seq_length = max_seq_length + self.tokenizer = tokenizer + + @staticmethod + def load_labels(path): + """A message of shame -- documentation must be completed. + + Args: + path:A message of shame -- documentation must be completed. + + Returns:A message of shame -- documentation must be completed. + + """ + return [line.strip() for line in codecs.open( + path, "r", "utf8").readlines() if len(line.strip()) > 0] + + def process(self, sentences): + """A message of shame -- documentation must be completed. 
+ + @param sentences -- array of array of words, + [['Jan', 'z', 'Warszawy'], ['IBM', 'i', 'Apple']] + @param max_seq_length -- the maximum total input sequence length after + WordPiece tokenization + @param squeeze -- boolean enabling squeezing multiple sentences into + one Input Feature + """ + examples = [] + for idx, tokens in enumerate(sentences): + guid = str(idx) + text_a = ' '.join(tokens) + label = ["O"] * len(tokens) + examples.append(InputExample(guid=guid, text_a=text_a, + text_b=None, label=label)) + + eval_features = convert_examples_to_features(examples, + self.label_list, + self.max_seq_length, + self.model.encode_word, + self.squeeze) + eval_dataset = create_dataset(eval_features) + eval_dataloader = DataLoader(eval_dataset, batch_size=1) + + y_pred = [] + sum_pred = [] + label_map = {i: label for i, label in enumerate(self.label_list, 1)} + + for input_ids, label_ids, l_mask, valid_ids in eval_dataloader: + input_ids = input_ids.to(self.device) + label_ids = label_ids.to(self.device) + valid_ids = valid_ids.to(self.device) + + with torch.no_grad(): + logits = self.model(input_ids, labels=None, + labels_mask=None, valid_mask=valid_ids) + + logits = torch.argmax(logits, dim=2) + logits = logits.detach().cpu().numpy() + label_ids = label_ids.cpu().numpy() + for i, cur_label in enumerate(label_ids): + temp_1 = [] + temp_2 = [] + for j, m in enumerate(cur_label): + if valid_ids[i][j]: + temp_1.append(label_map[m]) + temp_2.append(label_map[logits[i][j]]) + assert len(temp_1) == len(temp_2) + if self.squeeze: + sum_pred.extend(temp_2) + else: + y_pred.append(temp_2) + pointer = 0 + for sentence in sentences: + y_pred.append(sum_pred[pointer: (pointer + len(sentence))]) + pointer += len(sentence) + return y_pred + + def process_text(self, text: str): + """A message of shame -- documentation must be completed. + + @texts: Array of sentences. Each sentence is a string. + "John lives in New York. Mary lives in Chicago" + + return: [(PER, 0, 4, "John"), (LOC, 14, 22, "New York"), + (PER, 24, 28, "Mary"), (LOC, 38, 45, "Chicago")] + """ + sentences = self.tokenizer.tokenize([text]) + predictions = self.process(sentences) + annotations = wrap_annotations(predictions) + return align_tokens_with_text(text, sentences, annotations) + + def process_tokenized(self, tokens: [[str]], text: str): + """A message of shame -- documentation must be completed. + + @tokens: Array of sentences. Each sentence is an array of words. + [["John", "lives", "in", "New", "York"], + ["Mary", "lives", "in", "Chicago"]] + + return: annotations aligned with the given text, as produced by + align_tokens_with_text (see process_text) + """ + predictions = self.process(tokens) + annotations = wrap_annotations(predictions) + return align_tokens_with_text(text, tokens, annotations) diff --git a/evaluate_tsv.py b/evaluate_tsv.py index c346cb762b845490e16c809088c9a7c2b7791729..54eac10016c88007f18d65b0eae75ab35cf82b26 100644 --- a/evaluate_tsv.py +++ b/evaluate_tsv.py @@ -1,10 +1,12 @@ +"""A message of shame -- documentation must be completed.""" + from __future__ import absolute_import, division, print_function import argparse import os -from time import time import time +# from time import time F811 redefinition of unused 'time' import poldeepner2 from poldeepner2.utils.data_utils import read_tsv @@ -13,6 +15,12 @@ from poldeepner2.utils.sequence_labeling import classification_report def main(args): + """A message of shame -- documentation must be completed. + + Args: + args:A message of shame -- documentation must be completed.
+ + """ print("Loading the NER model ...") ner = poldeepner2.load(args.model, device=args.device) @@ -51,6 +59,11 @@ def main(args): def parse_args(): + """A message of shame -- documentation must be completed. + + Returns: parser.parse_args() + + """ parser = argparse.ArgumentParser( description='Process a single TSV with a NER model') parser.add_argument('--input', required=True, metavar='PATH', help='path to a file with a list of files') diff --git a/evaluator.py b/evaluator.py new file mode 100644 index 0000000000000000000000000000000000000000..006204ddfe844d3787b3c91fa179895e862ad37c --- /dev/null +++ b/evaluator.py @@ -0,0 +1,91 @@ +"""Script for evaluating models on a pre-defined set of data.""" + +import configparser +import os +import time + +from poldeepner2.utils.data_utils import NerProcessor, create_dataset, \ + convert_examples_to_features +from poldeepner2.utils.train_utils import evaluate_model + + +def main(): + config_file = "config.cfg" + config = configparser.ConfigParser() + config.read(config_file) + + pretrained_model = config['evaluate']['pretrained_path'] + + device = config['evaluate']['device'] + squeeze = config.getboolean('evaluate', 'squeeze') + tag_column_index = config.getint('data', 'tag_column_index') + processor = NerProcessor() + + data_path = config['data']['eval_path'] + datasets = [data_path] + + labels_list = \ + processor.get_labels(datasets, config.getint('data', + 'tag_column_index')) + + num_labels = len(labels_list) + 1 + hidden_size = config.getint('evaluate', 'hidden_size') + dropout = config.getfloat('train', 'dropout') + + hidden_size = 1024 if 'large' in pretrained_model \ + else (768 if 'base' in pretrained_model else hidden_size) + device = device + + pretrained_path = config['model']['pretrained_path'] + + if pretrained_path.startswith("hf:"): + from poldeepner2.model.hf_for_token_calssification \ + import HfModelForTokenClassification + pretrained_dir = pretrained_path.split(':')[1] + model = HfModelForTokenClassification( + pretrained_path=pretrained_dir, n_labels=num_labels, + hidden_size=hidden_size, dropout_p=dropout, + device=device) + elif pretrained_path.startswith("mt5:"): + from poldeepner2.model.mt5_for_token_calssification \ + import Mt5ModelForTokenClassification + variant = pretrained_path.split(':')[1] + model = Mt5ModelForTokenClassification( + variant=variant, n_labels=num_labels, + hidden_size=hidden_size, dropout_p=dropout, device=device) + else: + from poldeepner2.model.xlmr_for_token_classification \ + import XLMRForTokenClassification + pretrained_dir = pretrained_path + if ":" in pretrained_dir: + pretrained_dir = pretrained_dir.split(':')[1] + if not os.path.exists(pretrained_dir): + raise ValueError("RoBERTa language model not found on path '%s'" + % pretrained_dir) + + model = XLMRForTokenClassification( + pretrained_path=pretrained_dir, n_labels=num_labels, + hidden_size=hidden_size, dropout_p=dropout, + device=device) + + max_seq_len = config.getint('evaluate', 'max_seq_len') + + eval_examples = processor.get_examples(datasets[0], tag_column_index, + 'eval') + + eval_features = convert_examples_to_features( + eval_examples, labels_list, max_seq_len, model.encode_word, + squeeze=squeeze) + + eval_data = create_dataset(eval_features) + + time_start = time.time() + f1, report = evaluate_model(model, eval_data, labels_list, 16, device) + time_end = time.time() + print(f' f1: {f1}') + print(f' report {report}') + print(f'time {time_end - time_start}') + + +if __name__ == "__main__": + main() diff --git 
a/poldeepner2/data/document.py b/poldeepner2/data/document.py index 750a743f9438a74eb6506a237542942d3ef8bc7f..2b0400df34ea086ef98cb6f9de2592389afef2de 100644 --- a/poldeepner2/data/document.py +++ b/poldeepner2/data/document.py @@ -1,11 +1,25 @@ +"""A message of shame -- documentation must be completed.""" + from poldeepner2.data.span import Span from poldeepner2.data.token import Token from poldeepner2.utils.annotation import Annotation class Document: + """A message of shame -- documentation must be completed.""" + + def __init__(self, content: str, + tokens: [Token] = [], sentences: [Span] = [], + annotations: [Annotation] = []): + """A message of shame -- documentation must be completed. + + Args: + content:A message of shame -- documentation must be completed. + tokens:A message of shame -- documentation must be completed. + sentences:A message of shame -- documentation must be completed. + annotations:A message of shame -- documentation must be completed. - def __init__(self, content: str, tokens: [Token] = [], sentences: [Span] = [], annotations: [Annotation] = []): + """ self.content = content self.tokens = tokens self.annotations = annotations diff --git a/poldeepner2/data/span.py b/poldeepner2/data/span.py index 36c1307a0c9bbdac0ec042678cf43efed2541e5c..d13e043d90705db299fd21a1e8e931d77f88a581 100644 --- a/poldeepner2/data/span.py +++ b/poldeepner2/data/span.py @@ -1,16 +1,26 @@ +"""A message of shame -- documentation must be completed.""" + from dataclasses import dataclass @dataclass class Span: - """ + """A message of shame -- documentation must be completed. + Args: - orth (str): start (int): Index of the first token. end (int): Index of the last token +1. + """ + start: int end: int def __str__(self): + """A message of shame -- documentation must be completed. + + Returns:A message of shame -- documentation must be completed. + + """ return f"Span(begin={self.begin},end={self.end})" diff --git a/poldeepner2/data/token.py b/poldeepner2/data/token.py index a4733b925601bdcbb1ebaba9ca0ec509806cf132..c8120a6aea0dd119c54c4b6985b4718cb96e2219 100644 --- a/poldeepner2/data/token.py +++ b/poldeepner2/data/token.py @@ -1,9 +1,12 @@ +"""A message of shame -- documentation must be completed.""" + from dataclasses import dataclass @dataclass class Token: - """ + """A message of shame -- documentation must be completed. + Args: orth (str): start (int): Index of the first orth character in the original text. @@ -12,7 +15,9 @@ class Token: ws (str): White spaces after the token in the original text. morph (str): eos (str): True if the token ends a sentence. + """ + orth: str start: int end: int @@ -22,4 +27,9 @@ class Token: eos: bool = False def __str__(self): + """A message of shame -- documentation must be completed. + + Returns:A message of shame -- documentation must be completed. + + """ return f"Token(orth={self.orth},lemma={self.lemma},morph={self.morph})" diff --git a/poldeepner2/io/debug.py b/poldeepner2/io/debug.py index 3b4c9aa7c6c7fe86aeeb81f03dd34906f664843e..269c9ff7327e167fccd9d711cf141d99e798ca85 100644 --- a/poldeepner2/io/debug.py +++ b/poldeepner2/io/debug.py @@ -1,7 +1,17 @@ +"""A message of shame -- documentation must be completed.""" + import logging def debug_tokens_and_labels(tokenized_sentences, predictions): + """A message of shame -- documentation must be completed. + + Args: + tokenized_sentences:A message of shame -- documentation must be + completed.
+ predictions:A message of shame -- documentation must be completed. + + """ for tokens, labels in zip(tokenized_sentences, predictions): for token, label in zip(tokens, labels): logging.debug(f"TOKENIZATION: {token}\t{label}") diff --git a/poldeepner2/model/hf_for_token_calssification.py b/poldeepner2/model/hf_for_token_calssification.py index fab1dcf3cdca323a840c304bd97efb36750b4f22..504a0c864cbfcd12530b2d32bfb11d4c6d82efb7 100644 --- a/poldeepner2/model/hf_for_token_calssification.py +++ b/poldeepner2/model/hf_for_token_calssification.py @@ -1,3 +1,5 @@ +"""A message of shame -- documentation must be completed.""" + from pathlib import Path import yaml from typing import List @@ -63,22 +65,24 @@ class Pdn2TokenClassification(nn.Module): self.model = AutoModel.from_pretrained(path) def forward(self, inputs_ids, labels, labels_mask, valid_mask): - ''' - Computes a forward pass through the sequence tagging model. + """Computes a forward pass through the sequence tagging model. + Args: inputs_ids: tensor of size (bsz, max_seq_len). padding idx = 1 labels: tensor of size (bsz, max_seq_len) - labels_mask and valid_mask: indicate where loss gradients should be propagated and where + labels_mask: indicates where loss gradients should be + propagated and where labels should be ignored + valid_mask: indicates which positions carry valid labels and + should contribute to the loss Returns : logits: unnormalized model outputs. loss: Cross Entropy loss between labels and logits - ''' + """ self.model.train() - transformer_out = self.model(inputs_ids, return_dict=True)[0] + transformer_out = self.model(inputs_ids, return_dict=True)[0] out_1 = F.relu(self.linear_1(transformer_out)) out_1 = self.dropout(out_1) logits = self.classification_head(out_1) @@ -100,8 +104,14 @@ class Pdn2TokenClassification(nn.Module): return logits def encode_word(self, s): - """ - takes a string and returns a list of token ids + """Takes a string and returns a list of token ids. + + Args: + self:A message of shame -- documentation must be completed. + s:A message of shame -- documentation must be completed. + + Returns:A message of shame -- documentation must be completed. + """ tensor_ids = self.tokenizer.encode(s) # remove <s> and </s> ids diff --git a/poldeepner2/model/mt5_for_token_calssification.py b/poldeepner2/model/mt5_for_token_calssification.py new file mode 100644 index 0000000000000000000000000000000000000000..1a2e4aa8a7308e510d0238dc49a4030c5aa6af3e --- /dev/null +++ b/poldeepner2/model/mt5_for_token_calssification.py @@ -0,0 +1,104 @@ +"""A message of shame -- documentation must be completed.""" + +import torch.nn as nn +import torch.nn.functional as F +from transformers import T5Tokenizer, T5ForConditionalGeneration + + +class Mt5ModelForTokenClassification(nn.Module): + """A message of shame -- documentation must be completed.""" + + def __init__(self, variant, n_labels, hidden_size=768, dropout_p=0.2, + label_ignore_idx=0, + head_init_range=0.04, device='cuda'): + """A message of shame -- documentation must be completed. + + Args: + variant: A message of shame -- documentation must be completed. + n_labels: A message of shame -- documentation must be completed. + hidden_size: A message of shame -- documentation must be completed. + dropout_p: A message of shame -- documentation must be completed. + label_ignore_idx: A message of shame -- documentation must be + completed. + head_init_range: A message of shame -- documentation must be + completed. + device: A message of shame -- documentation must be completed.
+ + """ + super().__init__() + + self.n_labels = n_labels + + self.linear_1 = nn.Linear(hidden_size, hidden_size) + self.classification_head = nn.Linear(hidden_size, n_labels) + + self.label_ignore_idx = label_ignore_idx + + # self.tokenizer = AutoTokenizer.from_pretrained(pretrained_path) + # self.model = AutoModel.from_pretrained(pretrained_path) + self.tokenizer = T5Tokenizer.from_pretrained( + f"google/mt5-{variant}") + self.model = T5ForConditionalGeneration.from_pretrained( + f'google/mt5-{variant}') + + self.dropout = nn.Dropout(dropout_p) + self.device = device + + # initializing classification head + self.classification_head.weight.data.normal_(mean=0.0, + std=head_init_range) + + def forward(self, inputs_ids, labels, + labels_mask, valid_mask): + """Computes a forward pass through the sequence tagging model. + + Args: + inputs_ids: tensor of size (bsz, max_seq_len). padding idx = 1 + labels: tensor of size (bsz, max_seq_len) + labels_mask: indicate where loss gradients should be + propagated and where + labels should be ignored + valid_mask: A message of shame -- documentation must be completed. + + Returns : + logits: unnormalized model outputs. + loss: Cross Entropy loss between labels and logits + + """ + self.model.train() + + transformer_out = self.model.encoder(input_ids=inputs_ids, + return_dict=True)[0] + out_1 = F.relu(self.linear_1(transformer_out)) + out_1 = self.dropout(out_1) + logits = self.classification_head(out_1) + + if labels is not None: + loss_fct = nn.CrossEntropyLoss(ignore_index=self.label_ignore_idx) + # Only keep active parts of the loss + if labels_mask is not None: + active_loss = valid_mask.view(-1) == 1 + + active_logits = logits.view(-1, self.n_labels)[active_loss] + active_labels = labels.view(-1)[active_loss] + loss = loss_fct(active_logits, active_labels) + else: + loss = loss_fct( + logits.view(-1, self.n_labels), labels.view(-1)) + return loss + else: + return logits + + def encode_word(self, s): + """Takes a string and returns a list of token ids. + + Args: + self:A message of shame -- documentation must be completed. + s:A message of shame -- documentation must be completed. + + Returns:A message of shame -- documentation must be completed. + + """ + tensor_ids = self.tokenizer.encode(s) + # remove last special + return tensor_ids[0:-1] diff --git a/poldeepner2/model/xlmr_for_token_classification.py b/poldeepner2/model/xlmr_for_token_classification.py new file mode 100644 index 0000000000000000000000000000000000000000..dce3481934501400cfc681aa452d58d5a9b31ca4 --- /dev/null +++ b/poldeepner2/model/xlmr_for_token_classification.py @@ -0,0 +1,93 @@ +"""A message of shame -- documentation must be completed.""" + +from fairseq.models.roberta import XLMRModel +import torch.nn as nn +import torch.nn.functional as F + + +class XLMRForTokenClassification(nn.Module): + """A message of shame -- documentation must be completed.""" + + def __init__(self, pretrained_path, n_labels, + hidden_size, dropout_p=0.2, label_ignore_idx=0, + head_init_range=0.04, device='cuda'): + """A message of shame -- documentation must be completed. + + Args: + pretrained_path:A message of shame -- documentation must be + completed. + n_labels:A message of shame -- documentation must be completed. + hidden_size:A message of shame -- documentation must be completed. + dropout_p:A message of shame -- documentation must be completed. + label_ignore_idx:A message of shame -- documentation must be + completed. 
+ head_init_range:A message of shame -- documentation must be + completed. + device:A message of shame -- documentation must be completed. + + """ + super().__init__() + + self.n_labels = n_labels + self.linear_1 = nn.Linear(hidden_size, hidden_size) + self.classification_head = nn.Linear(hidden_size, n_labels) + self.label_ignore_idx = label_ignore_idx + + self.xlmr = XLMRModel.from_pretrained(pretrained_path) + self.model = self.xlmr.model + self.dropout = nn.Dropout(dropout_p) + self.device = device + + # initializing classification head + self.classification_head.weight.data.normal_(mean=0.0, + std=head_init_range) + + def forward(self, inputs_ids, labels, labels_mask, valid_mask): + """Computes a forward pass through the sequence tagging model. + + Args: + inputs_ids: tensor of size (bsz, max_seq_len). padding idx = 1 + labels: tensor of size (bsz, max_seq_len) + labels_mask: indicates where loss gradients should be + propagated and where labels should be ignored + valid_mask: indicates which positions carry valid labels and + should contribute to the loss + + Returns : + logits: unnormalized model outputs. + loss: Cross Entropy loss between labels and logits + + """ + transformer_out, _ = self.model(inputs_ids, features_only=True) + + out_1 = F.relu(self.linear_1(transformer_out)) + out_1 = self.dropout(out_1) + logits = self.classification_head(out_1) + if labels is not None: + loss_fct = nn.CrossEntropyLoss(ignore_index=self.label_ignore_idx) + # Only keep active parts of the loss + if labels_mask is not None: + active_loss = valid_mask.view(-1) == 1 + active_logits = logits.view(-1, self.n_labels)[active_loss] + active_labels = labels.view(-1)[active_loss] + loss = loss_fct(active_logits, active_labels) + else: + loss = loss_fct( + logits.view(-1, self.n_labels), labels.view(-1)) + return loss + else: + return logits + + def encode_word(self, s): + """Takes a string and returns a list of token ids. + + Args: + self:A message of shame -- documentation must be completed. + s:A message of shame -- documentation must be completed. + + Returns:A message of shame -- documentation must be completed.
+ + """ + tensor_ids = self.xlmr.encode(s) + # remove <s> and </s> ids + return tensor_ids.cpu().numpy().tolist()[1:-1] diff --git a/poldeepner2/models.py b/poldeepner2/models.py index 81e84a8636cde37f4972936a809eba5ece29d05f..368257bde4eaccfa55102137283ec146a56207d1 100644 --- a/poldeepner2/models.py +++ b/poldeepner2/models.py @@ -1,3 +1,6 @@ +"""A message of shame -- documentation must be completed.""" + +import logging import os from typing import List @@ -18,6 +21,7 @@ from poldeepner2.utils.sequences import convert_examples_to_features class PolDeepNer2: + """A message of shame -- documentation must be completed.""" def __init__(self, path: str, tokenizer: Tokenizer = None, processor_annotations: List[ProcessorAnnotations] = None, device: str = None): @@ -39,7 +43,8 @@ class PolDeepNer2: text_a = ' '.join(tokens) text_b = None label = ["O"] * len(tokens) - examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + examples.append(InputExample(guid=guid, text_a=text_a, + text_b=text_b, label=label)) eval_features = convert_examples_to_features(examples, self.model.config.labels, self.model.config.max_seq_length, @@ -51,7 +56,8 @@ class PolDeepNer2: label_map = {i: label for i, label in enumerate(self.model.config.labels, 1)} if show_progress: - outer = tqdm.tqdm(total=len(eval_dataloader), desc='Processing', position=0) + outer = tqdm.tqdm(total=len(eval_dataloader), desc='Processing', + position=0) for input_ids, label_ids, l_mask, valid_ids in eval_dataloader: if show_progress: outer.update(1) @@ -60,7 +66,8 @@ class PolDeepNer2: valid_ids = valid_ids.to(self.model.config.device) with torch.no_grad(): - logits = self.model(input_ids, labels=None, labels_mask=None, valid_mask=valid_ids) + logits = self.model(input_ids, labels=None, labels_mask=None, + valid_mask=valid_ids) logits = torch.argmax(logits, dim=2) logits = logits.detach().cpu().numpy() @@ -73,7 +80,8 @@ class PolDeepNer2: token_count = sum([len(s) for s in sentences]) assert token_count == len(y_pred), \ - f"The number of returned labels differ from the number of tokens. Number of tokens: {token_count}, " \ + f"The number of returned labels differ from the number of " \ + f"tokens. Number of tokens: {token_count}, " \ f"number of labels: {len(y_pred)}" sentences_y_pred = [] @@ -84,7 +92,8 @@ class PolDeepNer2: return sentences_y_pred def process_text(self, text: str) -> [AnnotationText]: - """ + """A message of shame -- documentation must be completed. + @texts: Array of sentences. Each sentence is a string. "John lives in New York. Mary lives in Chicago" @@ -92,6 +101,7 @@ class PolDeepNer2: AnnotationText(14, 22, "LOC", "New York"), AnnotationText(24, 28, "PER", "Mary"), AnnotationText(38, 45, "LOC", "Chicago")] + """ sentences = self.tokenizer.tokenize([text]) predictions = self.process(sentences) @@ -99,8 +109,11 @@ class PolDeepNer2: return align_tokens_with_text(text, sentences, annotations) def process_document(self, text: str) -> Document: - """ - Process given texts and return Document structure representing the result of processing. + """A message of shame -- documentation must be completed. + + Process given texts and return Document structure representing the + result of processing. + """ polem = AnnotationLemmatizerPolem() @@ -129,10 +142,14 @@ class PolDeepNer2: return document def process_tokenized(self, tokens: [[str]]) -> [[str]]: - """ + """A message of shame -- documentation must be completed. + @tokens: Array of sentences. Each sentence is an array of words. 
- [["John", "lives", "in", "New", "York"], ["Mary", "lives", "in", "Chicago"]] + [["John", "lives", "in", "New", "York"], + ["Mary", "lives", "in", "Chicago"]] + + return: [["B-PER", "O", "O", "B-LOC", "I-LOC"], + ["B-PER", "O", "O", "B-LOC"]] - return: [["B-PER", "O", "O", "B-LOC", "I-LOC"], ["B-PER", "O", "O", "B-LOC"]] """ return self.process(tokens) diff --git a/poldeepner2/pipeline/__init__.py b/poldeepner2/pipeline/__init__.py index 8b137891791fe96927ad78e64b0aad7bded08bdc..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 100644 --- a/poldeepner2/pipeline/__init__.py +++ b/poldeepner2/pipeline/__init__.py @@ -1 +0,0 @@ - diff --git a/poldeepner2/pipeline/lemmatization.py b/poldeepner2/pipeline/lemmatization.py index 2883abf5ef18ac4f08c0b9773843b831f3947ac3..af27e7c08276731255388d95210c471d8fd3b2ad 100644 --- a/poldeepner2/pipeline/lemmatization.py +++ b/poldeepner2/pipeline/lemmatization.py @@ -1,3 +1,5 @@ +"""A message of shame -- documentation must be completed.""" + import logging import requests @@ -6,22 +8,45 @@ from poldeepner2.utils.annotation import Annotation class ProcessorAnnotations: + """A message of shame -- documentation must be completed.""" def process(self, annotations: [Annotation]): + """A message of shame -- documentation must be completed. + + Args: + annotations:A message of shame -- documentation must be completed. + + """ pass class AnnotationLemmatizerPolem (ProcessorAnnotations): + """A message of shame -- documentation must be completed.""" def __init__(self): + """A message of shame -- documentation must be completed.""" self.url = 'http://localhost:8000' pass def process(self, annotations: [Annotation]): + """A message of shame -- documentation must be completed. + + Args: + annotations:A message of shame -- documentation must be completed. + + """ for an in annotations: an.lemma = self.lemmatize(an) def lemmatize(self, annotation: Annotation): + """A message of shame -- documentation must be completed. + + Args: + annotation:A message of shame -- documentation must be completed. + + Returns:A message of shame -- documentation must be completed. + + """ orths = [an.orth for an in annotation.tokens] lemmas = [an.lemma for an in annotation.tokens] spaces = [str(len(an.ws) > 0) for an in annotation.tokens] @@ -41,4 +66,3 @@ class AnnotationLemmatizerPolem (ProcessorAnnotations): except Exception as ex: logging.error(ex) return None - diff --git a/poldeepner2/pipeline/tokenization.py b/poldeepner2/pipeline/tokenization.py index cab50694da28232940e2f0112f04ed5535b3cb1d..3cc7bce7fcd0fa186d67f925deb967d31b4531ee 100644 --- a/poldeepner2/pipeline/tokenization.py +++ b/poldeepner2/pipeline/tokenization.py @@ -1,18 +1,37 @@ +"""A message of shame -- documentation must be completed.""" + import re import requests from poldeepner2.data.token import Token -from poldeepner2.utils.preprocess import split_hashtags, split_leading_name, split_underscore +from poldeepner2.utils.preprocess import split_hashtags, split_leading_name, \ + split_underscore class Tokenizer: + """A message of shame -- documentation must be completed.""" def tokenize(self, texts: [str]) -> [[Token]]: + """A message of shame -- documentation must be completed. + + Args: + texts:A message of shame -- documentation must be completed. + + Returns:A message of shame -- documentation must be completed. + + """ return [] @staticmethod def align_tokens_with_text(text: str, sentences: [[Token]]): + """A message of shame -- documentation must be completed. 
+ + Args: + text: A message of shame -- documentation must be completed. + sentences: A message of shame -- documentation must be completed. + + """ idx = 0 for sentence in sentences: for token in sentence: @@ -23,19 +42,30 @@ class Tokenizer: class TokenizerFast(Tokenizer): + """A message of shame -- documentation must be completed.""" def __init__(self): + """A message of shame -- documentation must be completed.""" self.pattern_tokens = re.compile(r"(\W)") self.abbrev_no_eos = set(["tzw", "np", "m.in", "tj"]) def tokenize(self, texts: [str]) -> [[str]]: + """A message of shame -- documentation must be completed. + + Args: + texts:A message of shame -- documentation must be completed. + + Returns:A message of shame -- documentation must be completed. + + """ out = [] for text in texts: text_out = [] tokens = self.pattern_tokens.split(text.strip()) tokens = [w for w in tokens if len(w.strip()) > 0] interp_ends = set(".?!") - ends = [idx + 1 for idx, w in enumerate(tokens) if w in interp_ends or idx == len(tokens) - 1] + ends = [idx + 1 for idx, w in enumerate(tokens) + if w in interp_ends or idx == len(tokens) - 1] for sent_start, sent_end in zip([0] + ends[:-1], ends): text_out.append(tokens[sent_start:sent_end]) @@ -58,22 +88,58 @@ class TokenizerFast(Tokenizer): return out def is_ended_with_abbrev(self, sequence: [str]) -> bool: - return len(sequence) > 1 and sequence[-1] == "." and sequence[-2] in self.abbrev_no_eos + """A message of shame -- documentation must be completed. + + Args: + sequence:A message of shame -- documentation must be completed. + + Returns:A message of shame -- documentation must be completed. + + """ + return len(sequence) > 1 and sequence[-1] == "." \ + and sequence[-2] in self.abbrev_no_eos def is_ended_with_name_initial(self, sequence: [str]) -> bool: - return len(sequence) > 1 and sequence[-1] == "." and len(sequence[-2]) == 1 \ - and sequence[-2].isupper() and sequence[-2].isalpha() + """A message of shame -- documentation must be completed. + + Args: + sequence: A message of shame -- documentation must be completed. + + Returns:A message of shame -- documentation must be completed. + + """ + return len(sequence) > 1 and sequence[-1] == "." \ + and len(sequence[-2]) == 1 \ + and sequence[-2].isupper() and sequence[-2].isalpha() class TokenizerSpaces(Tokenizer): + """A message of shame -- documentation must be completed.""" def tokenize(self, texts: [str]) -> [[str]]: + """A message of shame -- documentation must be completed. + + Args: + texts:A message of shame -- documentation must be completed. + + Returns:A message of shame -- documentation must be completed. + + """ return [re.sub(r"\s+", " ", text.strip()).split(" ") for text in texts] class TokenizerKrnnt(Tokenizer): + """A message of shame -- documentation must be completed.""" def tokenize(self, texts: [str]) -> [[str]]: + """A message of shame -- documentation must be completed. + + Args: + texts:A message of shame -- documentation must be completed. + + Returns:A message of shame -- documentation must be completed. + + """ out = [] for text in texts: sentences = TokenizerKrnnt.request(text) @@ -85,6 +151,14 @@ class TokenizerKrnnt(Tokenizer): return out def tokenize_tokens(self, texts: [str]) -> [[Token]]: + """A message of shame -- documentation must be completed. + + Args: + texts:A message of shame -- documentation must be completed. + + Returns:A message of shame -- documentation must be completed. 
+ + """ out = [] for text in texts: sentences = TokenizerKrnnt.request(text) @@ -93,6 +167,14 @@ class TokenizerKrnnt(Tokenizer): @staticmethod def request(text: str): + """A message of shame -- documentation must be completed. + + Args: + text:A message of shame -- documentation must be completed. + + Returns:A message of shame -- documentation must be completed. + + """ url = 'http://localhost:9003' x = requests.post(url, data=text.encode('utf-8')) tokens = TokenizerKrnnt.parse_krnnt_output(x.text) @@ -101,6 +183,14 @@ class TokenizerKrnnt(Tokenizer): @staticmethod def parse_krnnt_output(output): + """A message of shame -- documentation must be completed. + + Args: + output:A message of shame -- documentation must be completed. + + Returns:A message of shame -- documentation must be completed. + + """ sentences = [] tokens = [] orth = None @@ -117,7 +207,9 @@ class TokenizerKrnnt(Tokenizer): if len(tokens) > 0: tokens[-1].ws = "" if parts[1] == "none" else " " elif parts[0] == "" and orth is not None: - tokens.append(Token(orth, 0, 0, lemma=parts[1], morph=parts[2], ws="")) + tokens.append(Token(orth, 0, + 0, lemma=parts[1], + morph=parts[2], ws="")) orth = None if len(tokens) > 0: tokens[-1].eos = True @@ -126,6 +218,13 @@ class TokenizerKrnnt(Tokenizer): def load(tokenizer_type: str) -> Tokenizer: + """A message of shame -- documentation must be completed. + + Args: + tokenizer_type: strA message of shame -- documentation must be + completed. + + """ if tokenizer_type == "space": return TokenizerSpaces() elif tokenizer_type == "krnnt": diff --git a/poldeepner2/utils/annotation.py b/poldeepner2/utils/annotation.py index 8e8c23b016a936731cb066a4f96cd7c658bd28f8..a3685f7ac1880a8c0685a7c5ffd756f279aabe9f 100644 --- a/poldeepner2/utils/annotation.py +++ b/poldeepner2/utils/annotation.py @@ -1,11 +1,23 @@ +"""A message of shame -- documentation must be completed.""" from dataclasses import dataclass from poldeepner2.data.token import Token class Annotation: + """A message of shame -- documentation must be completed.""" - def __init__(self, label, sid: int = None, token_id: int = None, tokens: [Token] = []): + def __init__(self, label, sid: int = None, token_id: int = None, + tokens: [Token] = []): + """A message of shame -- documentation must be completed. + + Args: + label: A message of shame -- documentation must be completed. + sid: A message of shame -- documentation must be completed. + token_id: A message of shame -- documentation must be completed. + tokens: A message of shame -- documentation must be completed. + + """ self.sentence_id = sid self.token_ids = [token_id] if token_id is not None else [] self.tokens = tokens @@ -13,37 +25,87 @@ class Annotation: self.lemma = "" def add_id(self, id): + """A message of shame -- documentation must be completed. + + Args: + id: A message of shame -- documentation must be completed. + + """ self.token_ids.append(id) def add_token(self, token: Token): + """A message of shame -- documentation must be completed. + + Args: + token:A message of shame -- documentation must be completed. + + """ self.tokens.append(token) def get_text(self): + """A message of shame -- documentation must be completed. + + Returns: A message of shame -- documentation must be completed. + + """ return "".join([t.orth + t.ws for t in self.tokens]).strip() def __str__(self): + """A message of shame -- documentation must be completed. + + Returns:A message of shame -- documentation must be completed. 
+ + """ return self.annotation def __eq__(self, other): - return self.annotation == other.annotation and self.token_ids[0] == other.token_ids[0] and \ - self.token_ids[-1] == other.token_ids[-1] and self.sentence_id == other.sentence_id + """A message of shame -- documentation must be completed. + + Args: + other:A message of shame -- documentation must be completed. + + Returns:A message of shame -- documentation must be completed. + + """ + return self.annotation == other.annotation \ + and self.token_ids[0] == other.token_ids[0] \ + and self.token_ids[-1] == other.token_ids[-1] \ + and self.sentence_id == other.sentence_id def __hash__(self): - return hash(self.annotation + str(self.sentence_id) + str(self.token_ids[0]) + str(self.token_ids[-1])) + """A message of shame -- documentation must be completed. + + Returns:A message of shame -- documentation must be completed. + + """ + return hash(self.annotation + str(self.sentence_id) + + str(self.token_ids[0]) + str(self.token_ids[-1])) @property def annotation_length(self): + """A message of shame -- documentation must be completed. + + Returns:A message of shame -- documentation must be completed. + + """ return self.token_ids[-1] - self.token_ids[0] @dataclass class AnnotationText: + """A message of shame -- documentation must be completed.""" + start: int end: int label: str text: str def dict(self): + """A message of shame -- documentation must be completed. + + Returns:A message of shame -- documentation must be completed. + + """ return { 'begin': self.start, 'end': self.end, diff --git a/poldeepner2/utils/data_utils.py b/poldeepner2/utils/data_utils.py index fba29064064e46bf145a329d3b303e6165516a75..08520b99bf6762f36bfb820fdbf13e0635df1d66 100644 --- a/poldeepner2/utils/data_utils.py +++ b/poldeepner2/utils/data_utils.py @@ -1,3 +1,4 @@ +"""A message of shame -- documentation must be completed.""" import codecs import json from typing import List @@ -7,7 +8,6 @@ from torch.utils.data import TensorDataset from poldeepner2.utils.annotation import Annotation, AnnotationText - LABEL_IGNORE_ID = 0 @@ -17,14 +17,13 @@ class InputExample(object): def __init__(self, guid, text_a, text_b=None, label=None): """Constructs a InputExample. - Args: - guid: Unique id for the example. - text_a: string. The untokenized text of the first sequence. For single - sequence tasks, only this sequence must be specified. - text_b: (Optional) string. The untokenized text of the second sequence. - Only must be specified for sequence pair tasks. - label: (Optional) string. The label of the example. This should be - specified for train and dev examples, but not for test examples. + Args: guid: Unique id for the example. text_a: string. The + untokenized text of the first sequence. For single sequence tasks, + only this sequence must be specified. text_b: (Optional) string. The + untokenized text of the second sequence. Only must be specified for + sequence pair tasks. label: (Optional) string. The label of the + example. This should be specified for train and dev examples, + but not for test examples. """ self.guid = guid self.text_a = text_a @@ -43,6 +42,14 @@ class NerProcessor: return examples def get_labels(self, paths): + """A message of shame -- documentation must be completed. + + Args: + paths: A message of shame -- documentation must be completed. + + Returns: A message of shame -- documentation must be completed. 
+ + """ label_set = set([]) for path in paths: examples = self.get_examples(path) @@ -50,13 +57,22 @@ class NerProcessor: return sorted(list(label_set)) def _read_file(self, filename): + """A message of shame -- documentation must be completed. + + Args: + filename: A message of shame -- documentation must be completed. + + Returns: A message of shame -- documentation must be completed. + + """ f = open(filename) data = [] sentence = [] label = [] - + for i, line in enumerate(f, 1): - if not line.strip() or len(line) == 0 or line.startswith('-DOCSTART') or line[0] == "\n": + if not line.strip() or len(line) == 0 or \ + line.startswith('-DOCSTART') or line[0] == "\n": if len(sentence) > 0: data.append((sentence, label)) sentence = [] @@ -64,7 +80,8 @@ class NerProcessor: continue splits = line.split() - assert len(splits) >= 2, "error on line {}. Found {} splits".format(i, len(splits)) + assert len(splits) >= 2, "error on line {}. Found {} " \ + "splits".format(i, len(splits)) word, tag = splits[0], splits[-1] sentence.append(word.strip()) label.append(tag.strip()) @@ -72,7 +89,44 @@ class NerProcessor: data.append((sentence, label)) return data + def _read_iob(self, filename, column_index): + + data = [] + sentence = [] + label = [] + with open(filename, encoding='utf-8') as f: + for i, line in enumerate(f, 1): + line = line.strip('\n') + + # check if begining of the file or empty line + if line.startswith('-DOCSTART') or len(line) == 0: + if len(sentence) > 0: + data.append((sentence, label)) + sentence = [] + label = [] + continue + + splits = line.split() + assert len(splits) >= 2, "error on line {}. Found {} splits".format( + i, len(splits)) + + word, tag = splits[0], splits[column_index] + sentence.append(word) + label.append(tag) + if len(sentence) > 0: + data.append((sentence, label)) + return data + def _create_examples(self, lines, set_type): + """A message of shame -- documentation must be completed. + + Args: + lines: A message of shame -- documentation must be completed. + set_type: A message of shame -- documentation must be completed. + + Returns: A message of shame -- documentation must be completed. + + """ examples = [] for i, (sentence, label) in enumerate(lines): guid = "%s-%s" % (set_type, i) @@ -85,6 +139,14 @@ class NerProcessor: @staticmethod def _get_labels(sentences): + """A message of shame -- documentation must be completed. + + Args: + sentences: A message of shame -- documentation must be completed. + + Returns: A message of shame -- documentation must be completed. + + """ label_set = set([]) for t in sentences: label_set.update(t.label) @@ -92,14 +154,34 @@ class NerProcessor: def create_dataset(features) -> TensorDataset: - all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) - all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long) - all_valid_ids = torch.tensor([f.valid_ids for f in features], dtype=torch.long) + """A message of shame -- documentation must be completed. + + Args: + features: A message of shame -- documentation must be completed. + + Returns: A message of shame -- documentation must be completed. + + """ + all_input_ids = torch.tensor([f.input_ids for f in features], + dtype=torch.long) + all_label_ids = torch.tensor([f.label_id for f in features], + dtype=torch.long) + all_valid_ids = torch.tensor([f.valid_ids for f in features], + dtype=torch.long) # ToDo: at some point the TensorDataset should be reduced to three items. 
- return TensorDataset(all_input_ids, all_label_ids, all_valid_ids, all_valid_ids) + return TensorDataset(all_input_ids, all_label_ids, all_valid_ids, + all_valid_ids) def wrap_annotations(sentences) -> [Annotation]: + """A message of shame -- documentation must be completed. + + Args: + sentences: A message of shame -- documentation must be completed. + + Returns: A message of shame -- documentation must be completed. + + """ annotations = [] tid = 0 for sid, labels in enumerate(sentences): @@ -108,7 +190,8 @@ def wrap_annotations(sentences) -> [Annotation]: current_token_annotations = {} for ann in label.split('#'): type = ann[2:] - if 'B-' in ann or ('I-' in ann and type not in last_token_annotations): + if 'B-' in ann or ('I-' in ann and + type not in last_token_annotations): an = Annotation(type, sid, tid) current_token_annotations[type] = an annotations.append(an) @@ -122,6 +205,15 @@ def wrap_annotations(sentences) -> [Annotation]: def align_tokens_to_text(sentences: [[str]], text): + """A message of shame -- documentation must be completed. + + Args: + sentences: A message of shame -- documentation must be completed. + text: A message of shame -- documentation must be completed. + + Returns: A message of shame -- documentation must be completed. + + """ offsets = [] tid = 0 text = text.lower() @@ -131,7 +223,8 @@ def align_tokens_to_text(sentences: [[str]], text): for t in s: start = text.find(t.lower(), tid) if start == -1: - raise Exception(f"Could not align tokens to text: {t} in '{text}") + raise Exception(f"Could not align tokens to text: {t} in '" + f"{text}'") end = start + len(t) offsets.append((start, end)) tid = end @@ -139,32 +232,49 @@ def align_tokens_to_text(sentences: [[str]], text): def align_tokens_with_text(text, sentences, annotations) -> [AnnotationText]: + """A message of shame -- documentation must be completed. + + Args: + text: A message of shame -- documentation must be completed. + sentences: A message of shame -- documentation must be completed. + annotations: A message of shame -- documentation must be completed. + + Returns: A message of shame -- documentation must be completed. + + """ offsets = align_tokens_to_text(sentences, text) output = [] for an in annotations: begin = offsets[an.token_ids[0]][0] end = offsets[an.token_ids[-1]][1] orth = text[begin:end] - output.append(AnnotationText(begin, end, an.annotation.replace("-", "_"), orth)) + output.append(AnnotationText(begin, end, an.annotation.replace("-", + "_"), + orth)) return output def get_poleval_dict(doc_id, text, sentences, annotations): - ''' Returns PolEval dict + """A message of shame -- documentation must be completed. + + Returns PolEval dict + { - text: - id: - answers: + text: the raw text of the document + id: the document identifier + answers: annotations in the PolEval format, one per line
} - Note that arguments it takes is FILE, PATH, FILE as utils.load_data_and_labels opens file itself - ''' + Note that arguments it takes is FILE, PATH, FILE as + utils.load_data_and_labels opens file itself + + """ offsets = align_tokens_to_text(sentences, text) answers = [] for an in annotations: begin = offsets[an.token_ids[0]][0] end = offsets[an.token_ids[-1]][1] orth = text[begin:end] - answers.append("%s %d %d\t%s" % (an.annotation.replace("-", "_"), begin, end, orth)) + answers.append("%s %d %d\t%s" % (an.annotation.replace("-", "_"), + begin, end, orth)) return ({ 'text': text, 'id': doc_id, @@ -173,17 +283,41 @@ def get_poleval_dict(doc_id, text, sentences, annotations): def read_params(path): + """A message of shame -- documentation must be completed. + + Args: + path: A message of shame -- documentation must be completed. + + Returns: A message of shame -- documentation must be completed. + + """ data = read_params_json(path) return data['dropout'], data['num_labels'], data['label_list'] def read_params_json(path): + """A message of shame -- documentation must be completed. + + Args: + path: A message of shame -- documentation must be completed. + + Returns: A message of shame -- documentation must be completed. + + """ with open(path + '/params.json') as json_file: data = json.load(json_file) return data def read_json(path): + """A message of shame -- documentation must be completed. + + Args: + path: A message of shame -- documentation must be completed. + + Returns: A message of shame -- documentation must be completed. + + """ json_out = {} with open(path, encoding='utf-8') as f: data = json.load(f) @@ -193,11 +327,20 @@ def read_json(path): def read_tsv(filename, with_labels=False): + """A message of shame -- documentation must be completed. + + Args: + filename: A message of shame -- documentation must be completed. + with_labels: A message of shame -- documentation must be completed. + + Returns: A message of shame -- documentation must be completed. + + """ f = open(filename, encoding="utf-8") data = [] sentence = [] label = [] - + for i, line in enumerate(f, 1): if not line.strip() or len(line) == 0 \ or line.startswith('-DOCSTART') or line[0] == "\n": @@ -219,6 +362,14 @@ def read_tsv(filename, with_labels=False): def save_tsv(output_path, sentences, predictions): + """A message of shame -- documentation must be completed. + + Args: + output_path: A message of shame -- documentation must be completed. + sentences: A message of shame -- documentation must be completed. + predictions: A message of shame -- documentation must be completed. + + """ with codecs.open(output_path, "w", "utf8") as fout: assert len(sentences) == len(predictions) for tokens, labels in zip(sentences, predictions): @@ -228,6 +379,14 @@ def save_tsv(output_path, sentences, predictions): def get_dict_for_record(json_ann): + """A message of shame -- documentation must be completed. + + Args: + json_ann: A message of shame -- documentation must be completed. + + Returns: A message of shame -- documentation must be completed. 
+ + """ token_dict = {} derives = 0 for ann in json_ann['data']['brat'].split('\n'): @@ -237,21 +396,32 @@ def get_dict_for_record(json_ann): token = ann.split('\t')[-1] if token in token_dict.keys(): - token_dict[token] = ''.join([token_dict[token],'#',annotation]) + token_dict[token] = ''.join([token_dict[token], '#', + annotation]) else: - token_dict[token] = annotation + token_dict[token] = annotation else: derives += 1 return token_dict, derives def map_json_to_iob(json_ann, iob): + """A message of shame -- documentation must be completed. + + Args: + json_ann: A message of shame -- documentation must be completed. + iob: A message of shame -- documentation must be completed. + + Returns: A message of shame -- documentation must be completed. + + """ token_dict, derives = get_dict_for_record(json_ann) successfully_added = 0 out_iob = '' with open(iob, encoding='utf-8') as f: for line in f: - if not line.strip() or len(line) == 0 or line.startswith('-DOCSTART') or line[0] == "\n": + if not line.strip() or len(line) == 0 or \ + line.startswith('-DOCSTART') or line[0] == "\n": out_iob += line else: splitline = line.split('\t') @@ -266,6 +436,15 @@ def map_json_to_iob(json_ann, iob): def has_same_neighbour(annotation, next_annotations): + """A message of shame -- documentation must be completed. + + Args: + annotation: A message of shame -- documentation must be completed. + next_annotations: A message of shame -- documentation must be completed. + + Returns: A message of shame -- documentation must be completed. + + """ if next_annotations == ['O']: return False searched_ann = 'B-{0}'.format(annotation) @@ -273,6 +452,12 @@ def has_same_neighbour(annotation, next_annotations): def iob2_to_iob(iob2_text): + """A message of shame -- documentation must be completed. + + Args: + iob2_text:A message of shame -- documentation must be completed. 
+ + """ iob2_list = [] iob1_list = [] @@ -291,11 +476,12 @@ def iob2_to_iob(iob2_text): else: current_ann.append(ann) iob1_list.append((line[0], '#'.join(current_ann))) - elif i == len(iob2_list)-1: + elif i == len(iob2_list) - 1: for ann in line[1]: split = ann.split('-') if split[0] == 'B': - if len(tags_to_separate) > 0 and split[1] in tags_to_separate: + if len(tags_to_separate) > 0 and split[1] \ + in tags_to_separate: current_ann.append(ann) tags_to_separate.remove(split[1]) else: @@ -307,18 +493,19 @@ def iob2_to_iob(iob2_text): for ann in line[1]: split = ann.split('-') if split[0] == 'B': - if len(tags_to_separate) > 0 and split[1] in tags_to_separate: + if len(tags_to_separate) > 0 and split[1] \ + in tags_to_separate: current_ann.append(ann) tags_to_separate.remove(split[1]) else: current_ann.append('I-{0}'.format(split[1])) - if has_same_neighbour(split[1], iob2_list[i+1][1]): + if has_same_neighbour(split[1], iob2_list[i + 1][1]): tags_to_separate.append(split[1]) elif split[0] == 'I': current_ann.append(ann) - if has_same_neighbour(split[1], iob2_list[i+1][1]): + if has_same_neighbour(split[1], iob2_list[i + 1][1]): tags_to_separate.append(split[1]) else: current_ann.append(ann) iob1_list.append((line[0], '#'.join(current_ann))) - return '\n'.join(map(lambda x: '{} {}'.format(x[0], x[1]) , iob1_list)) + return '\n'.join(map(lambda x: '{} {}'.format(x[0], x[1]), iob1_list)) diff --git a/poldeepner2/utils/file_utils.py b/poldeepner2/utils/file_utils.py index 45c6d5d1f5bc12216ded68fcb38c5697b9e340fa..ad08a6e5932092dc3240a3b09ae09ad53d162ed6 100644 --- a/poldeepner2/utils/file_utils.py +++ b/poldeepner2/utils/file_utils.py @@ -1,3 +1,4 @@ +"""A message of shame -- documentation must be completed.""" from urllib.request import urlopen import requests import os @@ -7,7 +8,8 @@ from tqdm import tqdm def download_from_url(url, dst): - """ + """A message of shame -- documentation must be completed. + @param: url to download file @param: dst place to put the file """ @@ -36,24 +38,51 @@ def download_from_url(url, dst): def unpack_gz(path, output): + """A message of shame -- documentation must be completed. + + Args: + path: A message of shame -- documentation must be completed. + output: A message of shame -- documentation must be completed. + + """ with tarfile.open(path, 'r') as tar: - for member in tqdm(iterable=tar.getmembers(), total=len(tar.getmembers())): + for member in tqdm(iterable=tar.getmembers(), + total=len(tar.getmembers())): tar.extract(path=output, member=member) def unpack_zip(path, output): + """A message of shame -- documentation must be completed. + + Args: + path: A message of shame -- documentation must be completed. + output: A message of shame -- documentation must be completed. + + """ with ZipFile(path, 'r') as zipObj: zipObj.extractall(output) def download_file(url, path, compression, extract_to_subfolder=False): + """A message of shame -- documentation must be completed. + + Args: + url: A message of shame -- documentation must be completed. + path: A message of shame -- documentation must be completed. + compression: A message of shame -- documentation must be completed. + extract_to_subfolder: A message of shame -- documentation must be + completed. + + """ ext = "" if compression is None else '.' 
+ compression download_from_url(url, path + ext) if compression == 'zip': - unpack_zip(f'{path}.zip', path if extract_to_subfolder else os.path.dirname(path)) + unpack_zip(f'{path}.zip', + path if extract_to_subfolder else os.path.dirname(path)) os.remove(f"{path}{ext}") elif compression == 'tar.gz': - unpack_gz(f'path.tar.gz', path if extract_to_subfolder else os.path.dirname(path)) + unpack_gz(f"{path}.tar.gz", + path if extract_to_subfolder else os.path.dirname(path)) os.remove(f"{path}{ext}") elif compression is None: pass diff --git a/poldeepner2/utils/preprocess.py b/poldeepner2/utils/preprocess.py index 020aaa848a005f2999e42a070e9bac737b73d5a5..d8613c32cdc7ff39cd95041f171ba2788110efeb 100644 --- a/poldeepner2/utils/preprocess.py +++ b/poldeepner2/utils/preprocess.py @@ -1,13 +1,23 @@ +"""A message of shame -- documentation must be completed.""" import re def split_hashtags(tokens): + """A message of shame -- documentation must be completed. + + Args: + tokens: A message of shame -- documentation must be completed. + + Returns: A message of shame -- documentation must be completed. + + """ output = [] i = 0 while i < len(tokens): - if tokens[i] == "#" and i+1 < len(tokens) and re.fullmatch(r"([A-Z][a-z]+)([A-Z][a-z]+)+", tokens[i+1]): + if tokens[i] == "#" and i + 1 < len(tokens) and \ + re.fullmatch(r"([A-Z][a-z]+)([A-Z][a-z]+)+", tokens[i + 1]): output.append("#") - for m in re.findall(r"([A-Z][a-z]+)", tokens[i+1]): + for m in re.findall(r"([A-Z][a-z]+)", tokens[i + 1]): output.append(str(m)) i += 2 else: @@ -17,7 +27,16 @@ def split_hashtags(tokens): def split_leading_name(tokens): - if len(tokens) > 1 and re.fullmatch(r"([A-Z][a-z]+)([A-Z][a-z]+)+", tokens[0]) and tokens[1] == ":": + """A message of shame -- documentation must be completed. + + Args: + tokens: A message of shame -- documentation must be completed. + + Returns: A message of shame -- documentation must be completed. + + """ + if len(tokens) > 1 and re.fullmatch(r"([A-Z][a-z]+)([A-Z][a-z]+)+", + tokens[0]) and tokens[1] == ":": output = [] for m in re.findall(r"([A-Z][a-z]+)", tokens[0]): output.append(str(m)) @@ -28,6 +47,14 @@ def split_leading_name(tokens): def split_underscore(tokens): + """A message of shame -- documentation must be completed. + + Args: + tokens: A message of shame -- documentation must be completed. + + Returns: A message of shame -- documentation must be completed. + + """ output = [] for token in tokens: if "_" in token: diff --git a/poldeepner2/utils/seed.py b/poldeepner2/utils/seed.py index 7a2b4614246b9681c6ea1dddd70eeedd960b5c1d..dd022c54dd7e694f756e3a38f5e0ea129683896a 100644 --- a/poldeepner2/utils/seed.py +++ b/poldeepner2/utils/seed.py @@ -1,9 +1,18 @@ +"""A message of shame -- documentation must be completed.""" import numpy as np import torch import random def setup_seed(n=101): + """A message of shame -- documentation must be completed. + + Args: + n: A message of shame -- documentation must be completed. + + Returns: A message of shame -- documentation must be completed. 
+
+    """
    random.seed(n)
    np.random.seed(n)
    torch.manual_seed(n)
diff --git a/poldeepner2/utils/sequence_labeling.py b/poldeepner2/utils/sequence_labeling.py
index 8f6410c373606231a80a288510d252e10f35c27c..9aa83f4d59c6bf5bc8e98312475a0be7ece79aec 100644
--- a/poldeepner2/utils/sequence_labeling.py
+++ b/poldeepner2/utils/sequence_labeling.py
@@ -1,6 +1,8 @@
-"""Metrics to assess performance on sequence labeling task given prediction
-Functions named as ``*_score`` return a scalar value to maximize: the higher
-the better
+"""Metrics to assess performance on a sequence labeling task, given predictions.
+
+Functions named as ``*_score`` return a scalar value to maximize: the higher,
+the better.
+
 """

 from __future__ import absolute_import
@@ -16,6 +18,7 @@ def get_entities(seq, suffix=False):
     """Gets entities from sequence.

     Args:
+        suffix: if True, labels end with the tag, e.g. ``PER-B`` instead of ``B-PER``.
         seq (list): sequence of labels.

     Returns:
@@ -27,6 +30,7 @@ def get_entities(seq, suffix=False):
     >>> seq = ['B-PER', 'I-PER', 'O', 'B-LOC']
     >>> get_entities(seq)
     [('PER', 0, 1), ('LOC', 3, 3)]
+
     """
     # for nested list
     if any(isinstance(s, list) for s in seq):
@@ -37,32 +41,43 @@ def get_entities(seq, suffix=False):
     for i, chunk in enumerate(seq + ['O']):
         for et in existing_tags:
             et['continued'] = False
-        active_types = map(lambda x: x['type'] ,existing_tags)
+        active_types = [et['type'] for et in existing_tags]
         i_chunk = []
         if '#' in chunk:
             for ann in chunk.split('#'):
-                i_chunk.append(ann)
+                i_chunk.append(ann)
         else:
-            i_chunk.append(chunk)
+            i_chunk.append(chunk)
         for subchunk in i_chunk:
             tag, type_ = get_tag_type(suffix, subchunk)
-            if start_of_chunk(tag,type_) and (tag == 'B' or type_ not in active_types):
-                existing_tags.append( {'begin': i, 'continued': True, 'type': type_} )
+            if start_of_chunk(tag, type_) and (tag == 'B' or type_ not in
+                                               active_types):
+                existing_tags.append({'begin': i, 'continued': True,
+                                      'type': type_})
             if tag == 'I':
                 for et in existing_tags:
                     if et['type'] == type_:
                         et['continued'] = True
         notFinished = []
         for et in existing_tags:
-            if et['continued'] :
+            if et['continued']:
                 notFinished.append(et)
             else:
-                chunks.append((et['type'], et['begin'], i-1))
+                chunks.append((et['type'], et['begin'], i - 1))
         existing_tags = notFinished
     return chunks


 def get_tag_type(suffix, chunk):
+    """Split a label into its tag and entity type.
+
+    Args:
+        suffix: if True, labels have the form ``TYPE-TAG``; otherwise
+            ``TAG-TYPE``.
+        chunk: a single label, e.g. ``B-PER``.
+
+    Returns: a (tag, type) tuple, e.g. ``('B', 'PER')``.
+
+    """
     if suffix:
         tag = chunk[-1]
         type_ = chunk.split('-')[0]
@@ -83,18 +98,27 @@ def end_of_chunk(tag, prev_type, type_, prev_tag):

     Returns:
         chunk_end: boolean.
+ """ chunk_end = False - if prev_tag == 'E': chunk_end = True - if prev_tag == 'S': chunk_end = True + if prev_tag == 'E': + chunk_end = True + if prev_tag == 'S': + chunk_end = True - if prev_tag == 'B' and tag == 'B': chunk_end = True - if prev_tag == 'B' and tag == 'S': chunk_end = True - if prev_tag == 'B' and tag == 'O': chunk_end = True - if prev_tag == 'I' and tag == 'B': chunk_end = True - if prev_tag == 'I' and tag == 'S': chunk_end = True - if prev_tag == 'I' and tag == 'O': chunk_end = True + if prev_tag == 'B' and tag == 'B': + chunk_end = True + if prev_tag == 'B' and tag == 'S': + chunk_end = True + if prev_tag == 'B' and tag == 'O': + chunk_end = True + if prev_tag == 'I' and tag == 'B': + chunk_end = True + if prev_tag == 'I' and tag == 'S': + chunk_end = True + if prev_tag == 'I' and tag == 'O': + chunk_end = True if prev_tag != 'O' and prev_tag != '.' and prev_type != type_: chunk_end = True @@ -113,18 +137,27 @@ def start_of_chunk(tag, type_, prev_type=None, prev_tag='O'): Returns: chunk_start: boolean. + """ chunk_start = False - if tag == 'B': chunk_start = True - if tag == 'S': chunk_start = True + if tag == 'B': + chunk_start = True + if tag == 'S': + chunk_start = True - if prev_tag == 'E' and tag == 'E': chunk_start = True - if prev_tag == 'E' and tag == 'I': chunk_start = True - if prev_tag == 'S' and tag == 'E': chunk_start = True - if prev_tag == 'S' and tag == 'I': chunk_start = True - if prev_tag == 'O' and tag == 'E': chunk_start = True - if prev_tag == 'O' and tag == 'I': chunk_start = True + if prev_tag == 'E' and tag == 'E': + chunk_start = True + if prev_tag == 'E' and tag == 'I': + chunk_start = True + if prev_tag == 'S' and tag == 'E': + chunk_start = True + if prev_tag == 'S' and tag == 'I': + chunk_start = True + if prev_tag == 'O' and tag == 'E': + chunk_start = True + if prev_tag == 'O' and tag == 'I': + chunk_start = True if tag != 'O' and tag != '.' and prev_type != type_: chunk_start = True @@ -143,6 +176,7 @@ def f1_score(y_true, y_pred, average='micro', suffix=False): F1 = 2 * (precision * recall) / (precision + recall) Args: + average: y_true : 2d array. Ground truth (correct) target values. y_pred : 2d array. Estimated targets as returned by a tagger. 
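A note on the '#' handling visible in get_entities above: unlike stock seqeval, a single token may carry several stacked annotations joined with '#', which PolDeepNer2 emits for nested entities. A minimal sketch of the expected behaviour; the import path follows this repository's layout and the package is assumed to be on PYTHONPATH:

from poldeepner2.utils.sequence_labeling import get_entities

# Token 0 opens both a PER and a LOC chunk; token 1 continues only PER.
labels = ['B-PER#B-LOC', 'I-PER', 'O']

# LOC covers token 0 only; PER spans tokens 0-1.
assert set(get_entities(labels)) == {('LOC', 0, 0), ('PER', 0, 1)}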
@@ -151,10 +185,13 @@ def f1_score(y_true, y_pred, average='micro', suffix=False):

     Example:
         >>> from seqeval.metrics import f1_score
-        >>> y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
-        >>> y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
+        >>> y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'],
+        ...           ['B-PER', 'I-PER', 'O']]
+        >>> y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'],
+        ...           ['B-PER', 'I-PER', 'O']]
         >>> f1_score(y_true, y_pred)
         0.50
+
     """
     true_entities = set(get_entities(y_true, suffix))
     pred_entities = set(get_entities(y_pred, suffix))
@@ -186,16 +223,19 @@ def accuracy_score(y_true, y_pred):

     Example:
         >>> from seqeval.metrics import accuracy_score
-        >>> y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
-        >>> y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
+        >>> y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'],
+        ...           ['B-PER', 'I-PER', 'O']]
+        >>> y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'],
+        ...           ['B-PER', 'I-PER', 'O']]
         >>> accuracy_score(y_true, y_pred)
         0.80
+
     """
     if any(isinstance(s, list) for s in y_true):
         y_true = [item for sublist in y_true for item in sublist]
         y_pred = [item for sublist in y_pred for item in sublist]

-    nb_correct = sum(y_t==y_p for y_t, y_p in zip(y_true, y_pred))
+    nb_correct = sum(y_t == y_p for y_t, y_p in zip(y_true, y_pred))
     nb_true = len(y_true)

     score = nb_correct / nb_true
@@ -208,7 +248,8 @@ def precision_score(y_true, y_pred, average='micro', suffix=False):

     The precision is the ratio ``tp / (tp + fp)`` where ``tp`` is the number of
     true positives and ``fp`` the number of false positives. The precision is
-    intuitively the ability of the classifier not to label as positive a sample.
+    intuitively the ability of the classifier not to label as positive a
+    sample.

     The best value is 1 and the worst value is 0.
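All of the ``*_score`` functions in this module reduce to set operations over the (type, start, end) tuples produced by get_entities. A minimal sketch of the micro-averaged computation, mirroring the function bodies above (assuming the package is importable):

from poldeepner2.utils.sequence_labeling import get_entities

y_true = [['O', 'B-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
y_pred = [['B-MISC', 'I-MISC', 'O', 'O'], ['B-PER', 'I-PER', 'O']]

# An entity counts as correct only if both its span and type match exactly.
true_entities = set(get_entities(y_true))
pred_entities = set(get_entities(y_pred))

tp = len(true_entities & pred_entities)
precision = tp / len(pred_entities)   # tp / (tp + fp)
recall = tp / len(true_entities)      # tp / (tp + fn)
f1 = 2 * precision * recall / (precision + recall)
assert (precision, recall, f1) == (0.5, 0.5, 0.5)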
@@ -221,10 +262,13 @@ def precision_score(y_true, y_pred, average='micro', suffix=False):

     Example:
         >>> from seqeval.metrics import precision_score
-        >>> y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
-        >>> y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
+        >>> y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'],
+        ...           ['B-PER', 'I-PER', 'O']]
+        >>> y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'],
+        ...           ['B-PER', 'I-PER', 'O']]
         >>> precision_score(y_true, y_pred)
         0.50
+
     """
     true_entities = set(get_entities(y_true, suffix))
     pred_entities = set(get_entities(y_pred, suffix))
@@ -255,10 +299,13 @@ def recall_score(y_true, y_pred, average='micro', suffix=False):

     Example:
         >>> from seqeval.metrics import recall_score
-        >>> y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
-        >>> y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
+        >>> y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'],
+        ...           ['B-PER', 'I-PER', 'O']]
+        >>> y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'],
+        ...           ['B-PER', 'I-PER', 'O']]
         >>> recall_score(y_true, y_pred)
         0.50
+
     """
     true_entities = set(get_entities(y_true, suffix))
     pred_entities = set(get_entities(y_pred, suffix))
@@ -272,8 +319,7 @@ def recall_score(y_true, y_pred, average='micro', suffix=False):


 def performance_measure(y_true, y_pred):
-    """
-    Compute the performance metrics: TP, FP, FN, TN
+    """Compute the performance metrics: TP, FP, FN, TN.

     Args:
         y_true : 2d array. Ground truth (correct) target values.
@@ -284,10 +330,13 @@

     Example:
         >>> from seqeval.metrics import performance_measure
-        >>> y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'O', 'B-ORG'], ['B-PER', 'I-PER', 'O']]
-        >>> y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O', 'O'], ['B-PER', 'I-PER', 'O']]
+        >>> y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'O', 'B-ORG'],
+        ...           ['B-PER', 'I-PER', 'O']]
+        >>> y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O', 'O'],
+        ...           ['B-PER', 'I-PER', 'O']]
         >>> performance_measure(y_true, y_pred)
         (3, 3, 1, 4)
+
     """
     performace_dict = dict()
     if any(isinstance(s, list) for s in y_true):
@@ -307,15 +356,19 @@ def classification_report(y_true, y_pred, digits=2, suffix=False):

     Args:
         y_true : 2d array. Ground truth (correct) target values.
         y_pred : 2d array. Estimated targets as returned by a classifier.
-        digits : int. Number of digits for formatting output floating point values.
+        digits : int. Number of digits for formatting output floating point
+        values.

     Returns:
-        report : string. Text summary of the precision, recall, F1 score for each class.
+        report : string. Text summary of the precision, recall, F1 score for
+        each class.
     Examples:
         >>> from seqeval.metrics import classification_report
-        >>> y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
-        >>> y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
+        >>> y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'],
+        ...           ['B-PER', 'I-PER', 'O']]
+        >>> y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'],
+        ...           ['B-PER', 'I-PER', 'O']]
         >>> print(classification_report(y_true, y_pred))
                      precision    recall  f1-score   support
         <BLANKLINE>
@@ -325,6 +378,7 @@ def classification_report(y_true, y_pred, digits=2, suffix=False):
        micro avg       0.50      0.50      0.50         2
        macro avg       0.50      0.50      0.50         2
     <BLANKLINE>
+
     """
     true_entities = set(get_entities(y_true, suffix))
     pred_entities = set(get_entities(y_pred, suffix))
@@ -359,7 +413,8 @@ def classification_report(y_true, y_pred, digits=2, suffix=False):
         r = nb_correct / nb_true if nb_true > 0 else 0
         f1 = 2 * p * r / (p + r) if p + r > 0 else 0

-        report += row_fmt.format(*[type_name, p, r, f1, nb_true], width=width, digits=digits)
+        report += row_fmt.format(*[type_name, p, r, f1, nb_true], width=width,
+                                 digits=digits)

         ps.append(p)
         rs.append(r)
diff --git a/poldeepner2/utils/sequences.py b/poldeepner2/utils/sequences.py
index 884e1cee8ab65e96c6ad460372080ecb3a981dcb..25bfd2d0548727ebb18b23b7e14f45d220a6a7ed 100644
--- a/poldeepner2/utils/sequences.py
+++ b/poldeepner2/utils/sequences.py
@@ -80,8 +80,10 @@ def tokens_and_labels_into_token_features(tokens: List[str], labels: List[str],
     for word, label_1 in zip(tokens, labels):
         subtokens = encode_method(word.strip())
         if len(subtokens) == 0:
-            logging.warning(f"Token '{word}' has no subwords")
-            continue
+            replacement = "x" * len(word.strip())
+            logging.warning(f"Token '{word}' has no subwords. "
+                            f"It was replaced with '{replacement}'")
+            subtokens = encode_method(replacement)
         tfs.append(TokenFeatures(subtokens, label_map[label_1]))
     return tfs

diff --git a/poldeepner2/utils/train_utils.py b/poldeepner2/utils/train_utils.py
index d8063b9649a4aab75db4266db2ea45f2962abd9b..facd3535904cd1b015a1a5650b0e9d08d919df95 100644
--- a/poldeepner2/utils/train_utils.py
+++ b/poldeepner2/utils/train_utils.py
@@ -1,3 +1,4 @@
+"""Helpers used for training and evaluating sequence labeling models."""
 import torch
 from torch.utils.data import SequentialSampler, DataLoader
 from poldeepner2.utils.sequence_labeling import classification_report, f1_score, precision_score, recall_score
diff --git a/poleval_ner_test.py b/poleval_ner_test.py
index 1c45cf1528af99bc573d05c2c4ff21f69e203abb..0bba6d5c64c8e6e76ee24be6ad7b3662e19efa2f 100644
--- a/poleval_ner_test.py
+++ b/poleval_ner_test.py
@@ -1,7 +1,22 @@
-import sys, json, getopt
+"""Evaluation script for the PolEval 2018 NER task."""
+
+import getopt
+import json
+import sys
+
 from dateutil import parser


 def overlap(offsetsa, offsetsb):
+    """A message of shame -- documentation must be completed.
+
+    Args:
+        offsetsa:A message of shame -- documentation must be completed.
+ offsetsb:A message of shame -- documentation must be completed. + + Returns: (int(start1) == int(start2)) and (int(end1) == int(end2)) + + """ try: start1, end1 = offsetsa.split('_') start2, end2 = offsetsb.split('_') @@ -19,11 +44,31 @@ def exact(offsetsa, offsetsb): print(offsetsb) return (int(start1) == int(start2)) and (int(end1) == int(end2)) -# this to ensure we get rid of derived types when loading entities (redundant otherwise) + +# this to ensure we get rid of derived types when loading entities +# (redundant otherwise) def removeDerivs(annots): - return { (a,c) for a,c in annots if c.find('derivType') < 0 } + """A message of shame -- documentation must be completed. + + Args: + annots:A message of shame -- documentation must be completed. + + Returns:A message of shame -- documentation must be completed. + + """ + return {(a, c) for a, c in annots if c.find('derivType') < 0} + def compareTextsOverlap(eGold, eModel): + """A message of shame -- documentation must be completed. + + Args: + eGold:A message of shame -- documentation must be completed. + eModel:A message of shame -- documentation must be completed. + + Returns: [tp, fp, fn] + + """ eGold = removeDerivs(eGold) eModel = removeDerivs(eModel) tp, fp, fn = 0, 0, 0 @@ -36,7 +81,17 @@ def compareTextsOverlap(eGold, eModel): fn = len(eGold) - tp return [tp, fp, fn] + def compareTextsExact(eGold, eModel): + """A message of shame -- documentation must be completed. + + Args: + eGold:A message of shame -- documentation must be completed. + eModel:A message of shame -- documentation must be completed. + + Returns: [tp, fp, fn] + + """ eGold = removeDerivs(eGold) eModel = removeDerivs(eModel) tp, fp, fn = 0, 0, 0 @@ -49,35 +104,59 @@ def compareTextsExact(eGold, eModel): fn = len(eGold) - tp return [tp, fp, fn] + def makeAnnsFormat(inputDoc, cols, htype): + """A message of shame -- documentation must be completed. + + Args: + inputDoc:A message of shame -- documentation must be completed. + cols:A message of shame -- documentation must be completed. + htype:A message of shame -- documentation must be completed. + + Returns: z_anns + + """ z_anns = [] for ben in inputDoc.split('\n'): pcs = ben.split('\t') try: - if len(pcs)==cols: + if len(pcs) == cols: cat, ofrom, oto = pcs[-2].split(' ') - z_anns.append( [ofrom+"_"+oto, cat] ) + z_anns.append([ofrom + "_" + oto, cat]) except ValueError: # handling fragmented entity, two strategies: - if htype=='merge': + if htype == 'merge': # take start and end, use as a single big entity cat, ofrom, ignored, oto = pcs[-2].split(' ') - z_anns.append( [ofrom+"_"+oto, cat] ) - if htype=='split': + z_anns.append([ofrom + "_" + oto, cat]) + if htype == 'split': # split into two entities catAndOffsets1, offsets2 = pcs[-2].split(';') cat, ofrom, oto = catAndOffsets1.split(' ') - z_anns.append( [ofrom+"_"+oto, cat] ) + z_anns.append([ofrom + "_" + oto, cat]) ofrom, oto = offsets2.split(' ') - z_anns.append( [ofrom+"_"+oto, cat] ) + z_anns.append([ofrom + "_" + oto, cat]) return z_anns + # compute micro F1 scores for exact and overlap matches -# htype parameter reflects two possible strategies for handling fragmented entities ("split" or "merge") +# htype parameter reflects two possible strategies for handling fragmented +# entities ("split" or "merge") def computeScores(goldfile, userfile, htype="split"): + """A message of shame -- documentation must be completed. + + Args: + goldfile:A message of shame -- documentation must be completed. 
+ userfile:A message of shame -- documentation must be completed. + htype:A message of shame -- documentation must be completed. - global_tp_ov = 0 ; global_fp_ov = 0 ; global_fn_ov = 0 - global_tp_ex = 0 ; global_fp_ex = 0 ; global_fn_ex = 0 + """ + global_tp_ov = 0 + global_fp_ov = 0 + global_fn_ov = 0 + global_tp_ex = 0 + global_fp_ex = 0 + global_fn_ex = 0 idsToAnnsUser = {} with open(userfile) as json_data: @@ -89,7 +168,7 @@ def computeScores(goldfile, userfile, htype="split"): else: idsToAnnsUser[userjson[nr]['id']] = '' - found = 0; + found = 0 nonfound = 0 idsToAnnsGold = {} @@ -97,33 +176,40 @@ def computeScores(goldfile, userfile, htype="split"): goldjson = json.load(json_data) for nr in range(len(goldjson['questions'])): - idGold = '/'.join(goldjson['questions'][nr]['input']['fname'].split('/')[4:]) + idGold = '/'.join( + goldjson['questions'][nr]['input']['fname'].split('/')[4:]) # print(idGold) if idGold in idsToAnnsUser: found += 1 # find the most recent answer: if len(goldjson['questions'][nr]['answers']) > 1: - maximum = parser.parse('1900-01-02T14:22:41.439308+00:00'); + maximum = parser.parse('1900-01-02T14:22:41.439308+00:00') index = 0 - for i, value in enumerate(goldjson['questions'][nr]['answers']): - value = parser.parse(goldjson['questions'][nr]['answers'][i]['created']) + for i, value in enumerate( + goldjson['questions'][nr]['answers']): + value = parser.parse( + goldjson['questions'][nr]['answers'][i]['created']) if value > maximum: maximum = value index = i - idsToAnnsGold[idGold] = goldjson['questions'][nr]['answers'][index]['data']['brat'] + idsToAnnsGold[idGold] = \ + goldjson['questions'][nr]['answers'][index]['data']['brat'] else: - idsToAnnsGold[idGold] = goldjson['questions'][nr]['answers'][0]['data']['brat'] + idsToAnnsGold[idGold] = \ + goldjson['questions'][nr]['answers'][0]['data']['brat'] # overlap scores: - ovtp = compareTextsOverlap(makeAnnsFormat(idsToAnnsGold[idGold], 3, htype), - makeAnnsFormat(idsToAnnsUser[idGold], 2, htype)) + ovtp = compareTextsOverlap( + makeAnnsFormat(idsToAnnsGold[idGold], 3, htype), + makeAnnsFormat(idsToAnnsUser[idGold], 2, htype)) global_tp_ov += ovtp[0] global_fp_ov += ovtp[1] global_fn_ov += ovtp[2] # exact match scores: - extp = compareTextsExact(makeAnnsFormat(idsToAnnsGold[idGold], 3, htype), - makeAnnsFormat(idsToAnnsUser[idGold], 2, htype)) + extp = compareTextsExact( + makeAnnsFormat(idsToAnnsGold[idGold], 3, htype), + makeAnnsFormat(idsToAnnsUser[idGold], 2, htype)) global_tp_ex += extp[0] global_fp_ex += extp[1] global_fn_ex += extp[2] @@ -133,26 +219,34 @@ def computeScores(goldfile, userfile, htype="split"): nonfound += 1 print(userfile) - print("Nr of documents identified by ID in both data sets: "+str(found)+", not identified (left out): "+str(nonfound)) + print("Nr of documents identified by ID in both data sets: " + str( + found) + ", not identified (left out): " + str(nonfound)) prec = float(global_tp_ov) / float(global_fp_ov + global_tp_ov) recall = float(global_tp_ov) / float(global_fn_ov + global_tp_ov) f1o = float(2 * prec * recall) / float(prec + recall) - print("OVERLAP precision: %0.3f recall: %0.3f F1: %0.3f " % (prec, recall, f1o)) + print("OVERLAP precision: %0.3f recall: %0.3f F1: %0.3f " % ( + prec, recall, f1o)) prec = float(global_tp_ex) / float(global_fp_ex + global_tp_ex) recall = float(global_tp_ex) / float(global_fn_ex + global_tp_ex) f1e = float(2 * prec * recall) / float(prec + recall) - print("EXACT precision: %0.3f recall: %0.3f F1: %0.3f " % (prec, recall, f1e)) + print("EXACT 
precision: %0.3f recall: %0.3f F1: %0.3f " % ( + prec, recall, f1e)) - print("Final score: %0.3f" % (f1o*0.8 + f1e*0.2)) - - print("Exact TP=%d ; FP=%d; FN=%d" % (global_tp_ex, global_fp_ex, global_fn_ex)) + print("Final score: %0.3f" % (f1o * 0.8 + f1e * 0.2)) + print("Exact TP=%d ; FP=%d; FN=%d" % ( + global_tp_ex, global_fp_ex, global_fn_ex)) def main(argv): + """A message of shame -- documentation must be completed. + + Args: + argv:A message of shame -- documentation must be completed. + """ goldfile = 'POLEVAL-NER_GOLD.json' userfile = '' try: @@ -162,18 +256,19 @@ def main(argv): sys.exit(2) for opt, arg in opts: - if opt == '-h': - print('poleval_ner_test.py -g <goldfile> -u <userfile>') - sys.exit() - elif opt in ("-u", "--userfile"): - userfile = arg - elif opt in ("-g", "--goldfile"): - goldfile = arg + if opt == '-h': + print('poleval_ner_test.py -g <goldfile> -u <userfile>') + sys.exit() + elif opt in ("-u", "--userfile"): + userfile = arg + elif opt in ("-g", "--goldfile"): + goldfile = arg print('gold file is: ' + goldfile) - print('user file is: '+ userfile) + print('user file is: ' + userfile) computeScores(goldfile, userfile, htype="split") + if __name__ == "__main__": main(sys.argv[1:]) diff --git a/poleval_ner_test_v2.py b/poleval_ner_test_v2.py index 83f46207346aa118d8d32ab0f9aadcf2a760c4c0..d86cadc216a23c9f6d8ed5de4a5963b6fc592cff 100644 --- a/poleval_ner_test_v2.py +++ b/poleval_ner_test_v2.py @@ -1,8 +1,12 @@ -import sys, json, getopt +"""A message of shame -- documentation must be completed.""" + +import getopt +import json +import sys -from tqdm import tqdm from attr import dataclass from dateutil import parser +from tqdm import tqdm """ Source: http://poleval.pl/tasks/ @@ -11,11 +15,20 @@ Source: http://poleval.pl/tasks/ @dataclass class CategoryNormalizer: + """A message of shame -- documentation must be completed.""" lower: bool = False only_main: bool = False def normalize(self, name): + """A message of shame -- documentation must be completed. + + Args: + name:A message of shame -- documentation must be completed. + + Returns:A message of shame -- documentation must be completed. + + """ if self.lower: name = name.lower() if self.only_main: @@ -24,6 +37,15 @@ class CategoryNormalizer: def overlap(offsetsa, offsetsb): + """A message of shame -- documentation must be completed. + + Args: + offsetsa:A message of shame -- documentation must be completed. + offsetsb:A message of shame -- documentation must be completed. + + Returns:A message of shame -- documentation must be completed. + + """ try: start1, end1 = offsetsa.split('_') start2, end2 = offsetsb.split('_') @@ -34,6 +56,15 @@ def overlap(offsetsa, offsetsb): def exact(offsetsa, offsetsb): + """A message of shame -- documentation must be completed. + + Args: + offsetsa:A message of shame -- documentation must be completed. + offsetsb:A message of shame -- documentation must be completed. + + Returns:A message of shame -- documentation must be completed. + + """ try: start1, end1 = offsetsa.split('_') start2, end2 = offsetsb.split('_') @@ -43,12 +74,30 @@ def exact(offsetsa, offsetsb): return (int(start1) == int(start2)) and (int(end1) == int(end2)) -# this to ensure we get rid of derived types when loading entities (redundant otherwise) +# this to ensure we get rid of derived types when loading entities ( +# redundant otherwise) def removeDerivs(annots): - return { (a,c) for a,c in annots if c.find('derivType') < 0 } + """A message of shame -- documentation must be completed. 
+ + Args: + annots:A message of shame -- documentation must be completed. + + Returns:A message of shame -- documentation must be completed. + + """ + return {(a, c) for a, c in annots if c.find('derivType') < 0} def getAnnotatonText(content, spans): + """A message of shame -- documentation must be completed. + + Args: + content:A message of shame -- documentation must be completed. + spans:A message of shame -- documentation must be completed. + + Returns:A message of shame -- documentation must be completed. + + """ texts = [] for span in spans.split(";"): range = span.split("_") @@ -57,12 +106,24 @@ def getAnnotatonText(content, spans): def compareTextsOverlap(eGold, eModel, content, cn: CategoryNormalizer): + """A message of shame -- documentation must be completed. + + Args: + eGold:A message of shame -- documentation must be completed. + eModel:A message of shame -- documentation must be completed. + content:A message of shame -- documentation must be completed. + cn:A message of shame -- documentation must be completed. + + Returns:A message of shame -- documentation must be completed. + + """ eGold = removeDerivs(eGold) eModel = removeDerivs(eModel) tp, fp, fn = 0, 0, 0 for (offsets_gold, cat_gold) in eGold: for (offsets_model, cat_model) in eModel: - if overlap(offsets_gold, offsets_model) and cn.normalize(cat_gold) == cn.normalize(cat_model): + if overlap(offsets_gold, offsets_model) and cn.normalize( + cat_gold) == cn.normalize(cat_model): tp += 1 break fp = len(eModel) - tp @@ -71,12 +132,23 @@ def compareTextsOverlap(eGold, eModel, content, cn: CategoryNormalizer): def compareTextsExact(eGold, eModel, cn: CategoryNormalizer): + """A message of shame -- documentation must be completed. + + Args: + eGold:A message of shame -- documentation must be completed. + eModel:A message of shame -- documentation must be completed. + cn:A message of shame -- documentation must be completed. + + Returns:A message of shame -- documentation must be completed. + + """ eGold = removeDerivs(eGold) eModel = removeDerivs(eModel) tp, fp, fn = 0, 0, 0 for (offsets_gold, cat_gold) in eGold: for (offsets_model, cat_model) in eModel: - if exact(offsets_gold, offsets_model) and cn.normalize(cat_gold) == cn.normalize(cat_model): + if exact(offsets_gold, offsets_model) and cn.normalize( + cat_gold) == cn.normalize(cat_model): tp += 1 break fp = len(eModel) - tp @@ -85,34 +157,60 @@ def compareTextsExact(eGold, eModel, cn: CategoryNormalizer): def makeAnnsFormat(inputDoc, cols, htype): + """A message of shame -- documentation must be completed. + + Args: + inputDoc:A message of shame -- documentation must be completed. + cols:A message of shame -- documentation must be completed. + htype:A message of shame -- documentation must be completed. + + Returns:A message of shame -- documentation must be completed. 
+ + """ z_anns = [] for ben in inputDoc.split('\n'): pcs = ben.split('\t') try: - if len(pcs)==cols: + if len(pcs) == cols: cat, ofrom, oto = pcs[-2].split(' ') - z_anns.append( [ofrom+"_"+oto, cat] ) + z_anns.append([ofrom + "_" + oto, cat]) except ValueError: # handling fragmented entity, two strategies: - if htype=='merge': + if htype == 'merge': # take start and end, use as a single big entity cat, ofrom, ignored, oto = pcs[-2].split(' ') - z_anns.append( [ofrom+"_"+oto, cat] ) - if htype=='split': + z_anns.append([ofrom + "_" + oto, cat]) + if htype == 'split': # split into two entities catAndOffsets1, offsets2 = pcs[-2].split(';') cat, ofrom, oto = catAndOffsets1.split(' ') - z_anns.append( [ofrom+"_"+oto, cat] ) + z_anns.append([ofrom + "_" + oto, cat]) ofrom, oto = offsets2.split(' ') - z_anns.append( [ofrom+"_"+oto, cat] ) + z_anns.append([ofrom + "_" + oto, cat]) return z_anns -# compute micro F1 scores for exact and overlap matches -# htype parameter reflects two possible strategies for handling fragmented entities ("split" or "merge") -def computeScores(goldfile, userfile, cn: CategoryNormalizer, htype="split", types=None): - global_tp_ov = 0 ; global_fp_ov = 0 ; global_fn_ov = 0 - global_tp_ex = 0 ; global_fp_ex = 0 ; global_fn_ex = 0 +# compute micro F1 scores for exact and overlap matches htype parameter +# reflects two possible strategies for handling fragmented entities ("split" +# or "merge") +def computeScores(goldfile, userfile, cn: CategoryNormalizer, htype="split", + types=None): + """A message of shame -- documentation must be completed. + + Args: + goldfile:A message of shame -- documentation must be completed. + userfile:A message of shame -- documentation must be completed. + cn:A message of shame -- documentation must be completed. + htype:A message of shame -- documentation must be completed. + types:A message of shame -- documentation must be completed. 
+ + """ + global_tp_ov = 0 + global_fp_ov = 0 + global_fn_ov = 0 + global_tp_ex = 0 + global_fp_ex = 0 + global_fn_ex = 0 idsToAnnsUser = {} with open(userfile) as json_data: @@ -132,29 +230,36 @@ def computeScores(goldfile, userfile, cn: CategoryNormalizer, htype="split", typ goldjson = json.load(json_data) for nr in tqdm(range(len(goldjson['questions']))): - idGold = '/'.join(goldjson['questions'][nr]['input']['fname'].split('/')[4:]) + idGold = '/'.join( + goldjson['questions'][nr]['input']['fname'].split('/')[4:]) if idGold in idsToAnnsUser: found += 1 # find the most recent answer: if len(goldjson['questions'][nr]['answers']) > 1: - maximum = parser.parse('1900-01-02T14:22:41.439308+00:00'); + maximum = parser.parse('1900-01-02T14:22:41.439308+00:00') index = 0 - for i, value in enumerate(goldjson['questions'][nr]['answers']): - value = parser.parse(goldjson['questions'][nr]['answers'][i]['created']) + for i, value in enumerate( + goldjson['questions'][nr]['answers']): + value = parser.parse( + goldjson['questions'][nr]['answers'][i]['created']) if value > maximum: maximum = value index = i - idsToAnnsGold[idGold] = goldjson['questions'][nr]['answers'][index]['data']['brat'] + idsToAnnsGold[idGold] = \ + goldjson['questions'][nr]['answers'][index]['data']['brat'] else: content = goldjson['questions'][nr]['input']['fileContent'] - idsToAnnsGold[idGold] = goldjson['questions'][nr]['answers'][0]['data']['brat'] + idsToAnnsGold[idGold] = \ + goldjson['questions'][nr]['answers'][0]['data']['brat'] gold = makeAnnsFormat(idsToAnnsGold[idGold], 3, htype) user = makeAnnsFormat(idsToAnnsUser[idGold], 2, htype) if types is not None: - gold = [(span, type) for span, type in gold if type in types] - user = [(span, type) for span, type in user if type in types] + gold = [(span, type) for span, type in gold if + type in types] + user = [(span, type) for span, type in user if + type in types] # overlap scores: ovtp = compareTextsOverlap(gold, user, content, cn) @@ -173,55 +278,66 @@ def computeScores(goldfile, userfile, cn: CategoryNormalizer, htype="split", typ nonfound += 1 print(userfile) - print("Nr of documents identified by ID in both data sets: "+str(found)+", not identified (left out): "+str(nonfound)) + print("Nr of documents identified by ID in both data sets: " + str( + found) + ", not identified (left out): " + str(nonfound)) if types is not None: print("NE types to evaluate: " + ", ".join(types)) prec = float(global_tp_ov) / float(global_fp_ov + global_tp_ov) recall = float(global_tp_ov) / float(global_fn_ov + global_tp_ov) f1o = float(2 * prec * recall) / float(prec + recall) - print("OVERLAP precision: %0.3f recall: %0.3f F1: %0.3f " % (prec, recall, f1o)) + print("OVERLAP precision: %0.3f recall: %0.3f F1: %0.3f " % ( + prec, recall, f1o)) prec = float(global_tp_ex) / float(global_fp_ex + global_tp_ex) recall = float(global_tp_ex) / float(global_fn_ex + global_tp_ex) f1e = float(2 * prec * recall) / float(prec + recall) - print("EXACT precision: %0.3f recall: %0.3f F1: %0.3f " % (prec, recall, f1e)) + print("EXACT precision: %0.3f recall: %0.3f F1: %0.3f " % ( + prec, recall, f1e)) - print("Final score: %0.3f" % (f1o*0.8 + f1e*0.2)) + print("Final score: %0.3f" % (f1o * 0.8 + f1e * 0.2)) - print("Exact TP=%d ; FP=%d; FN=%d" % (global_tp_ex, global_fp_ex, global_fn_ex)) + print("Exact TP=%d ; FP=%d; FN=%d" % ( + global_tp_ex, global_fp_ex, global_fn_ex)) def main(argv): + """A message of shame -- documentation must be completed. 
+ + Args: + argv:A message of shame -- documentation must be completed. + """ goldfile = 'POLEVAL-NER_GOLD.json' userfile = '' lower = False main_categories = False try: - opts, args = getopt.getopt(argv, "g:u:m:h:l", ["goldfile=", "userfile=", "categories-main", "categories-lower"]) + opts, args = getopt.getopt(argv, "g:u:m:h:l", + ["goldfile=", "userfile=", + "categories-main", "categories-lower"]) except getopt.GetoptError: print('poleval_ner_test.py -g <inputfile> -u <userfile>') sys.exit(2) for opt, arg in opts: - if opt == '-h': - print('poleval_ner_test.py -g <goldfile> -u <userfile>') - sys.exit() - elif opt in ("-u", "--userfile"): - userfile = arg - elif opt in ("-g", "--goldfile"): - goldfile = arg - elif opt in ("-m", "--categories-main"): - main_categories = True - elif opt in ("-l", "--categories-lower"): - lower = True + if opt == '-h': + print('poleval_ner_test.py -g <goldfile> -u <userfile>') + sys.exit() + elif opt in ("-u", "--userfile"): + userfile = arg + elif opt in ("-g", "--goldfile"): + goldfile = arg + elif opt in ("-m", "--categories-main"): + main_categories = True + elif opt in ("-l", "--categories-lower"): + lower = True print('gold file is: ' + goldfile) print('user file is: ' + userfile) - types = None - #types = set(["date"]) + # types = None NOT USED + # types = set(["date"]) category_normalizer = CategoryNormalizer(lower, main_categories) diff --git a/poleval_to_iob.py b/poleval_to_iob.py index 106fb27e3d9a53acb3ebb90fd3e7411fa9ed76a8..0f0f19a5b1109adc83f2919473ed37493f5649ed 100644 --- a/poleval_to_iob.py +++ b/poleval_to_iob.py @@ -1,3 +1,5 @@ +"""A message of shame -- documentation must be completed.""" + from __future__ import absolute_import, division, print_function import argparse @@ -8,19 +10,35 @@ from poldeepner2.utils.data_utils import read_json, map_json_to_iob def get_id(ini_file): + """A message of shame -- documentation must be completed. + + Args: + ini_file:A message of shame -- documentation must be completed. + + Returns:A message of shame -- documentation must be completed. + + """ for line in codecs.open(ini_file, "r", "utf8"): if 'id = ' in line: return line.replace('id = ', '') + def main(args): + """A message of shame -- documentation must be completed. + + Args: + args:A message of shame -- documentation must be completed. + Returns:A message of shame -- documentation must be completed. 
+
+    """
     print("Loading the NER model ...")
     json_ann = read_json(args.json)
     parent = os.path.dirname(args.input)
     paths = codecs.open(args.input, "r", "utf8").readlines()
     if not os.path.exists(args.output):
         os.makedirs(args.output)
-    output_dir = args.output.replace('index.list','')
+    output_dir = args.output.replace('index.list', '')
     paths_count = len(paths)
     global_success = 0
     global_failed = 0
@@ -32,24 +50,39 @@ def main(args):
         path = os.path.dirname(abs_path)
         doc_id = get_id(os.path.join(path, name + ".ini")).split('/')[-1]
         print("%d from %d: %s" % (n, paths_count, doc_id))
-        mapped_iob, success, failed, derives = map_json_to_iob(json_ann[doc_id], os.path.join(path, name + '.iob'))
+        mapped_iob, success, failed, derives = map_json_to_iob(
+            json_ann[doc_id], os.path.join(path, name + '.iob'))
         global_success += success
         global_failed += failed
         global_derives += derives
         codecs.open(output_dir + name + '.iob', "w", "utf8").write(mapped_iob)
-    print("Successed:{}, Failed: {}, Derives skipped {}".format(global_success, global_failed, global_derives))
+    print("Succeeded: {}, Failed: {}, Derives skipped: {}".format(
+        global_success,
+        global_failed,
+        global_derives))
+

 def parse_args():
+    """Parse command-line arguments.
+
+    Returns: the parsed arguments as an argparse.Namespace.
+
+    """
     parser = argparse.ArgumentParser(
-        description='Convert set of IOB files into a single json file in PolEval 2018 NER format')
-    parser.add_argument('--input', required=True, metavar='PATH', help='path to input')
-    parser.add_argument('--output', required=True, metavar='PATH', help='path to output directory')
-    parser.add_argument('--json', required=True, metavar='PATH', help='path to json')
+        description='Convert set of IOB files into a single json file in '
+                    'PolEval 2018 NER format')
+    parser.add_argument('--input', required=True, metavar='PATH',
+                        help='path to input')
+    parser.add_argument('--output', required=True, metavar='PATH',
+                        help='path to output directory')
+    parser.add_argument('--json', required=True, metavar='PATH',
+                        help='path to json')
     return parser.parse_args()

+
 if __name__ == "__main__":
     args = parse_args()
     try:
         main(args)
     except ValueError as er:
-        print("[ERROR] %s" % er)
\ No newline at end of file
+        print("[ERROR] %s" % er)
diff --git a/predictor.py b/predictor.py
new file mode 100644
index 0000000000000000000000000000000000000000..3d7151ad64d70cab1ba9beb6d59fc5f2a476ea4e
--- /dev/null
+++ b/predictor.py
@@ -0,0 +1,44 @@
+"""Script for tagging raw data."""
+
+import configparser
+
+from poldeepner2.models import PolDeepNer2
+
+
+def main():
+    # TODO: serialize this config together with the model
+    # (e.g. as JSON)
+    config_file = "config.cfg"
+    config = configparser.ConfigParser()
+    config.read(config_file)
+
+    model = config['model']['path']
+    pretrained_model = config['model']['pretrained_path']
+
+    ner = PolDeepNer2.load(model=model, pretrained_path=pretrained_model)
+
+    data_path = config['data']['pred_path']
+    with open(data_path) as f:
+        data = f.readlines()
+
+    if not config.getboolean('predict', 'save_to_file'):
+        for sentence in data:
+            if sentence != '\n':
+                print(sentence)
+                text_prediction = ner.process_text(sentence)
+                for pred in text_prediction:
+                    print(f'{pred.text}, {pred.label}')
+
+    else:
+        with open(config['predict']['path_to_save'], 'w+') as f_res:
+            for sentence in data:
+                if sentence != '\n':
+                    text_prediction = ner.process_text(sentence)
+                    for pred in text_prediction:
+                        f_res.write(f'{pred.text}, {pred.label}\n')
+                else:
+                    f_res.write('\n')
+
+
+if __name__ == "__main__":
+    main()
diff --git a/process_poleval.py b/process_poleval.py
index a4c221992b899e3ef609d1f77be3ad7ef27b1f97..6d327b378c8d9433fb2cb11b9a6f7abe725d729d 100644
--- a/process_poleval.py
+++ b/process_poleval.py
@@ -1,3 +1,5 @@
+"""Tag the PolEval 2018 NER data and write the results in PolEval format."""
+
 from __future__ import absolute_import, division, print_function

 import logging
@@ -10,13 +12,20 @@ import time

 from tqdm import tqdm

-import poldeepner2
 from poldeepner2.models import PolDeepNer2
 from poldeepner2.pipeline import tokenization
 from poldeepner2.utils.data_utils import get_poleval_dict, wrap_annotations


 def merge_sentences(sentences: [[str]]):
+    """Flatten a list of tokenized sentences into a single token list.
+
+    Args:
+        sentences: a list of tokenized sentences.
+
+    Returns: a single flat list of tokens.
+
+    """
     flat_list = []
     for lit in sentences:
         flat_list.extend(lit)
@@ -24,16 +33,30 @@ def merge_sentences(sentences: [[str]]):


 def main(args):
+    """Process the PolEval documents with the NER model.
+
+    Args:
+        args: parsed command-line arguments.
+
+    Returns: None; the results are written to args.output.
+
+    """
     print("Loading the NER model ...")
     t0 = time.time()
-    # if args.pretrained_path:
-    #     #ner = PolDeepNer2(args.model, args.pretrained_path, device=args.device, max_seq_length=args.max_seq_length,
-    #     #                  squeeze=args.squeeze, seed=args.seed, tokenizer=tokenizer)
-    # else:
-    #     # ner = poldeepner2.models.load(args.model, device=args.device, resources_path=".models")
-    #     # ner.max_seq_length = args.max_seq_length
-
-    tokenizer = tokenization.load(args.tokenization) if args.tokenization else None
+    # if args.pretrained_path:
+    #     ner = PolDeepNer2(args.model, args.pretrained_path,
+    #                       device=args.device,
+    #                       max_seq_length=args.max_seq_length,
+    #                       squeeze=args.squeeze, seed=args.seed,
+    #                       tokenizer=tokenizer)
+    # else:
+    #     ner = poldeepner2.models.load(args.model, device=args.device,
+    #                                   resources_path=".models")
+    #     ner.max_seq_length = args.max_seq_length
+
+    tokenizer = tokenization.load(args.tokenization) \
+        if args.tokenization \
+        else None
     ner = PolDeepNer2.load(
         model=args.model,
         pretrained_path=args.pretrained_path,
@@ -44,7 +67,6 @@ def main(args):
         tokenizer=tokenizer
     )

-
     time_load = time.time() - t0

     time_preprocess = 0
@@ -53,10 +75,13 @@ def main(args):

     dict_list = []

-    with open(os.path.join(pathlib.Path(__file__).parent.absolute(), args.input), encoding='UTF-8') as f:
+    with open(
+        os.path.join(pathlib.Path(__file__).parent.absolute(), args.input),
+            encoding='UTF-8') as f:
         sentences = json.load(f)['questions']
     for i, sentence in tqdm(enumerate(sentences), total=len(sentences)):
-        id = sentence['input']['fname'].replace("/home/a.wawer/poleval/", "")
+        id = sentence['input']['fname'].replace("/home/a.wawer/poleval/",
+                                                "")
         file_content = sentence['input']['fileContent']
         data_size += len(file_content)
         texts = file_content.split('\n')
@@ -71,37 +96,61 @@ def main(args):
         predictions = merge_sentences(predictions)
         tokenized_sentences = merge_sentences(tokenized_sentences)
         annotations = wrap_annotations(predictions)
-        dict_list.append(get_poleval_dict(id, file_content, tokenized_sentences, annotations))
+        dict_list.append(
+            get_poleval_dict(id, file_content, tokenized_sentences,
+                             annotations))
         time_ner += (time.time() - t0)

-    codecs.open(args.output, "w", "utf8").write(json.dumps(dict_list, indent=4))
+    codecs.open(args.output, "w", "utf8").write(
+        json.dumps(dict_list, indent=4))
print(f"Model loading time : {time_load:8.4} second(s)") print(f"Data preprocessing time : {time_preprocess:8.4} second(s)") print(f"Data NE recognition time : {time_ner:8.4} second(s)") - print(f"Total time : {time_load+time_preprocess+time_ner:8.4} second(s)") - print(f"Data size: : {data_size/1000000:8.4}M characters") + print(f'Total time : ' + f'{time_load + time_preprocess + time_ner:8.4} second(s)') + print(f"Data size: : " + f"{data_size / 1000000:8.4}M characters") def parse_args(): + """A message of shame -- documentation must be completed. + + Returns:A message of shame -- documentation must be completed. + + """ parser = argparse.ArgumentParser( - description='Convert set of IOB files into a single json file in PolEval 2018 NER format') - parser.add_argument('--input', required=True, metavar='PATH', help='path to a file with a list of files') - parser.add_argument('--output', required=True, metavar='PATH', help='path to a json output file') - parser.add_argument('--model', required=True, metavar='PATH', help='model name or path to a model name') - parser.add_argument('--pretrained_path', required=False, metavar='PATH', help='pretrained XLM-Roberta model path') - parser.add_argument('--max_seq_length', required=False, default=256, metavar='N', type=int, - help='the maximum total input sequence length after WordPiece tokenization.') - parser.add_argument('--device', required=False, default="cpu", metavar='cpu|cuda', + description='Convert set of IOB files into a single json file in ' + 'PolEval 2018 NER format') + parser.add_argument('--input', required=True, metavar='PATH', + help='path to a file with a list of files') + parser.add_argument('--output', required=True, metavar='PATH', + help='path to a json output file') + parser.add_argument('--model', required=True, metavar='PATH', + help='model name or path to a model name') + parser.add_argument('--pretrained_path', required=False, metavar='PATH', + help='pretrained XLM-Roberta model path') + parser.add_argument('--max_seq_length', required=False, default=256, + metavar='N', type=int, + help='the maximum total input sequence length after ' + 'WordPiece tokenization.') + parser.add_argument('--device', required=False, default="cpu", + metavar='cpu|cuda', help='device type used for processing') - parser.add_argument('--tokenization', required=False, default=None, choices=tokenization.names, + parser.add_argument('--tokenization', required=False, default=None, + choices=tokenization.names, help='Tokenization method') - parser.add_argument('--squeeze', required=False, default=False, action="store_true", - help='try to squeeze multiple examples into one Input Feature') - parser.add_argument('--seed', required=False, default=377, metavar='N', type=int, + parser.add_argument('--squeeze', required=False, default=False, + action="store_true", + help='try to squeeze multiple examples into one ' + 'Input Feature') + parser.add_argument('--seed', required=False, default=377, metavar='N', + type=int, help='a seed used to initialize a number generator') - parser.add_argument('--merge', required=False, default=False, action="store_true", - help='merge sentences into a single sentence before wrapping labels into annotations') + parser.add_argument('--merge', required=False, default=False, + action="store_true", + help='merge sentences into a single sentence before ' + 'wrapping labels into annotations') return parser.parse_args() diff --git a/process_poleval_pretokenized.py b/process_poleval_pretokenized.py index 
175fd0d6f90881d3873dc750f8e5225af990c4df..63ac26be9a9b77416f8057fe7f8777226e5e3f9a 100644 --- a/process_poleval_pretokenized.py +++ b/process_poleval_pretokenized.py @@ -1,3 +1,5 @@ +"""A message of shame -- documentation must be completed.""" + from __future__ import absolute_import, division, print_function import argparse @@ -7,17 +9,34 @@ import json import logging from poldeepner2.models import PolDeepNer2 -from poldeepner2.utils.data_utils import get_poleval_dict, read_tsv, wrap_annotations +from poldeepner2.utils.data_utils import get_poleval_dict, read_tsv, \ + wrap_annotations from poldeepner2.utils.preprocess import split_hashtags, split_leading_name def get_id(ini_file): + """A message of shame -- documentation must be completed. + + Args: + ini_file:A message of shame -- documentation must be completed. + + Returns:A message of shame -- documentation must be completed. + + """ for line in codecs.open(ini_file, "r", "utf8"): if 'id = ' in line: return line.replace('id = ', '') def load_document(abs_path): + """A message of shame -- documentation must be completed. + + Args: + abs_path:A message of shame -- documentation must be completed. + + Returns:A message of shame -- documentation must be completed. + + """ namext = os.path.basename(abs_path) name = os.path.splitext(namext)[0] path = os.path.dirname(abs_path) @@ -29,12 +48,20 @@ def load_document(abs_path): def main(args): + """A message of shame -- documentation must be completed. + + Args: + args:A message of shame -- documentation must be completed. + + """ print("Loading the NER model ...") - ner = PolDeepNer2(args.model, args.pretrained_path, args.device, max_seq_length=args.max_seq_length, + ner = PolDeepNer2(args.model, args.pretrained_path, + args.device, max_seq_length=args.max_seq_length, squeeze=args.squeeze) parent = os.path.dirname(args.input) dict_list = [] - for n, rel_path in enumerate(codecs.open(args.input, "r", "utf8").readlines()): + for n, rel_path in enumerate( + codecs.open(args.input, "r", "utf8").readlines()): abs_path = os.path.abspath(os.path.join(parent, rel_path.strip())) doc_id, text, sentences_raw = load_document(abs_path) print("Processing %d: %s" % (n, doc_id)) @@ -42,24 +69,43 @@ def main(args): sentences = [split_leading_name(sentence) for sentence in sentences] predictions = ner.process(sentences) annotations = wrap_annotations(predictions) - dict_list.append(get_poleval_dict(doc_id, text, sentences, annotations)) + dict_list.append( + get_poleval_dict(doc_id, text, sentences, annotations)) # debug_tokens_and_labels(sentences_raw, predictions) - codecs.open(args.output, "w", "utf8").write(json.dumps(dict_list, indent=4)) + codecs.open( + args.output, "w", "utf8").write(json.dumps(dict_list, indent=4)) def parse_args(): + """A message of shame -- documentation must be completed. + + Returns:A message of shame -- documentation must be completed. 
+ + """ parser = argparse.ArgumentParser( - description='Convert set of IOB files into a single json file in PolEval 2018 NER format') - parser.add_argument('--input', required=True, metavar='PATH', help='path to a file with a list of files') - parser.add_argument('--output', required=True, metavar='PATH', help='path to a json output file') - parser.add_argument('--model', required=True, metavar='PATH', help='path to NER model') - parser.add_argument('--pretrained_path', required=True, metavar='PATH', help='pretrained XLM-Roberta model path') - parser.add_argument('--max_seq_length', required=False, default=256, metavar='N', type=int, - help='the maximum total input sequence length after WordPiece tokenization.') - parser.add_argument('--device', required=False, default="cpu", metavar='cpu|cuda', + description='Convert set of IOB files into a single json file in ' + 'PolEval 2018 NER format') + parser.add_argument('--input', required=True, + metavar='PATH', help='path to a file with a list of ' + 'files') + parser.add_argument('--output', required=True, + metavar='PATH', help='path to a json output file') + parser.add_argument('--model', required=True, + metavar='PATH', help='path to NER model') + parser.add_argument('--pretrained_path', required=True, + metavar='PATH', help='pretrained XLM-Roberta model ' + 'path') + parser.add_argument('--max_seq_length', required=False, + default=256, metavar='N', + type=int, help='the maximum total input sequence ' + 'length after WordPiece tokenization.') + parser.add_argument('--device', required=False, + default="cpu", metavar='cpu|cuda', help='device type used for processing') - parser.add_argument('--squeeze', required=False, default=False, action="store_true", - help='try to squeeze multiple examples into one Input Feature') + parser.add_argument('--squeeze', required=False, + default=False, action="store_true", + help='try to squeeze multiple examples into one ' + 'Input Feature') return parser.parse_args() diff --git a/process_texts.py b/process_texts.py index c0dc34738a82b5df226a82783c31e30ad6f7d142..f2a69b55fb6bc3803aa012ca51bc93ddc5d1c71f 100644 --- a/process_texts.py +++ b/process_texts.py @@ -1,3 +1,5 @@ +"""A message of shame -- documentation must be completed.""" + from __future__ import absolute_import, division, print_function import codecs @@ -8,7 +10,7 @@ import glob import os from pathlib import Path -import tqdm +# import tqdm F811 redefinition of unused 'tqdm' from tqdm import tqdm @@ -18,6 +20,14 @@ from poldeepner2.utils.data_utils import wrap_annotations def flatten(list_of_lists): + """A message of shame -- documentation must be completed. + + Args: + list_of_lists:A message of shame -- documentation must be completed. + + Returns:A message of shame -- documentation must be completed. + + """ flat_list = [] for lit in list_of_lists: flat_list.extend(lit) @@ -25,6 +35,14 @@ def flatten(list_of_lists): def read_content_autobom(path: str) -> str: + """A message of shame -- documentation must be completed. + + Args: + path:A message of shame -- documentation must be completed. + + Returns:A message of shame -- documentation must be completed. + + """ bytes = min(32, os.path.getsize(path)) content = open(path, 'rb').read(bytes) if content.startswith(codecs.BOM_UTF8): @@ -35,14 +53,26 @@ def read_content_autobom(path: str) -> str: def main(args): + """A message of shame -- documentation must be completed. + + Args: + args:A message of shame -- documentation must be completed. 
+ + Returns:A message of shame -- documentation must be completed. + + """ print("Loading the NER model ...") t0 = time.time() if args.pretrained_path: tokenizer = tokenization.load(args.tokenization) - ner = PolDeepNer2(args.model, args.pretrained_path, device=args.device, max_seq_length=args.max_seq_length, - squeeze=args.squeeze, seed=args.seed, tokenizer=tokenizer) + ner = PolDeepNer2(args.model, args.pretrained_path, + device=args.device, + max_seq_length=args.max_seq_length, + squeeze=args.squeeze, seed=args.seed, + tokenizer=tokenizer) else: - ner = PolDeepNer2.load(args.model, device=args.device, resources_path=".models") + ner = PolDeepNer2.load(args.model, device=args.device, + resources_path=".models") if args.max_seq_length: ner.max_seq_length = args.max_seq_length if tokenization: @@ -72,36 +102,56 @@ def main(args): output = Path(args.output) / Path(path).name with open(output, "w") as fout: for an in annotations: - text = " ".join([tokenized_sentences[0][n] for n in an.token_ids]) + text = " ".join( + [tokenized_sentences[0][n] for n in an.token_ids]) token_start = min(an.token_ids) token_end = max(an.token_ids) - fout.write(f"{an.annotation}\t{token_start}\t{token_end}\t{text}\n") + fout.write( + f"{an.annotation}\t{token_start}\t{token_end}\t{text}\n") print(f"Model loading time : {time_load:8.4} second(s)") print(f"Data preprocessing time : {time_preprocess:8.4} second(s)") print(f"Data NE recognition time : {time_ner:8.4} second(s)") - print(f"Total time : {time_load+time_preprocess+time_ner:8.4} second(s)") + print(f"Total time : " + f"{time_load+time_preprocess+time_ner:8.4} second(s)") print(f"Data size: : {data_size/1000000:8.4}M characters") def parse_args(): + """A message of shame -- documentation must be completed. + + Returns:A message of shame -- documentation must be completed. + + """ parser = argparse.ArgumentParser( - description='Process a set of plain text files from given folder. The output is save to another folder.') - parser.add_argument('--input', required=True, metavar='PATH', help='path to an input folder with texts') - parser.add_argument('--output', required=True, metavar='PATH', help='path to an output folder') - parser.add_argument('--model', required=True, metavar='PATH', help='model name or path to a model name') + description='Process a set of plain text files from given folder. 
'
+                'The output is saved to another folder.')
+    parser.add_argument('--input', required=True, metavar='PATH',
+                        help='path to an input folder with texts')
+    parser.add_argument('--output', required=True, metavar='PATH',
+                        help='path to an output folder')
+    parser.add_argument('--model', required=True, metavar='PATH',
+                        help='model name or path to a model name')
     # Required if the pretrained_path is given
-    parser.add_argument('--pretrained_path', required=False, metavar='PATH', help='pretrained XLM-Roberta model path')
-    parser.add_argument('--max_seq_length', required=False, default=None, metavar='N', type=int,
-                        help='the maximum total input sequence length after WordPiece tokenization.')
-    parser.add_argument('--device', required=False, default="cpu", metavar='cpu|cuda',
+    parser.add_argument('--pretrained_path', required=False, metavar='PATH',
+                        help='pretrained XLM-Roberta model path')
+    parser.add_argument('--max_seq_length', required=False, default=None,
+                        metavar='N', type=int,
+                        help='the maximum total input sequence length after '
+                             'WordPiece tokenization.')
+    parser.add_argument('--device', required=False, default="cpu",
+                        metavar='cpu|cuda',
                         help='device type used for processing')
-    parser.add_argument('--tokenization', required=False, default=None, choices=tokenization.names,
+    parser.add_argument('--tokenization', required=False, default=None,
+                        choices=tokenization.names,
                         help='Tokenization method')
-    parser.add_argument('--squeeze', required=False, default=False, action="store_true",
-                        help='try to squeeze multiple examples into one Input Feature')
-    parser.add_argument('--seed', required=False, default=377, metavar='N', type=int,
+    parser.add_argument('--squeeze', required=False, default=False,
+                        action="store_true",
+                        help='try to squeeze multiple examples into one '
+                             'Input Feature')
+    parser.add_argument('--seed', required=False, default=377,
+                        metavar='N', type=int,
                         help='a seed used to initialize a number generator')
     return parser.parse_args()
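For orientation, the batch entry point above is driven entirely by these flags; a typical invocation might look like this (a sketch; the model name and the folder paths are placeholders):

```
python3.8 process_texts.py --input texts/ --output out/ \
    --model pdn2-v07-cen-n82-base-01 --device cuda
```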
diff --git a/process_tsv.py b/process_tsv.py
index b3218c71127bba12a8777993a3463486e53f6770..4fe0d70787a3e939f6f6f574c9bfc19b55529d09 100644
--- a/process_tsv.py
+++ b/process_tsv.py
@@ -1,3 +1,5 @@
+"""A message of shame -- documentation must be completed."""
+
 from __future__ import absolute_import, division, print_function
 
 import argparse
@@ -9,17 +11,24 @@ from poldeepner2.utils.data_utils import read_tsv, save_tsv
 
 
 def main(args):
+    """A message of shame -- documentation must be completed.
+
+    Args:
+        args:A message of shame -- documentation must be completed.
+
+    """
     logging.info("Loading the NER model ...")
-    #ner = PolDeepNer2(args.model, args.pretrained_path, args.device, args.squeeze, args.max_seq_length)
+    # ner = PolDeepNer2(args.model, args.pretrained_path, args.device,
+    #                   args.squeeze, args.max_seq_length)
     ner = PolDeepNer2.load(
         model=args.model,
         pretrained_path=args.pretrained_path,
         device=args.device,
         max_seq_length=args.max_seq_length,
         squeeze=args.squeeze,
-        #seed=args.seed
+        # seed=args.seed
     )
-
+
     logging.info("Processing ...")
     sentences_labels = read_tsv(os.path.join(args.input))
     sentences = [sentence[0] for sentence in sentences_labels]
@@ -32,18 +41,32 @@ def main(args):
 
 
 def parse_args():
+    """A message of shame -- documentation must be completed.
+
+    Returns: A message of shame -- documentation must be completed.
+
+    """
     parser = argparse.ArgumentParser(
         description='Process a single TSV with a NER model')
-    parser.add_argument('--input', required=True, metavar='PATH', help='path to a file with a list of files')
-    parser.add_argument('--output', required=True, metavar='PATH', help='path to a json output file')
-    parser.add_argument('--model', required=True, metavar='PATH', help='path to NER model')
-    parser.add_argument('--pretrained_path', required=False, metavar='PATH', help='pretrained XLM-Roberta model path')
-    parser.add_argument('--max_seq_length', required=False, default=256, metavar='N', type=int,
-                        help='the maximum total input sequence length after WordPiece tokenization.')
-    parser.add_argument('--device', required=False, default="cpu", metavar='cpu|cuda',
+    parser.add_argument('--input', required=True, metavar='PATH',
+                        help='path to a file with a list of files')
+    parser.add_argument('--output', required=True, metavar='PATH',
+                        help='path to a json output file')
+    parser.add_argument('--model', required=True, metavar='PATH',
+                        help='path to NER model')
+    parser.add_argument('--pretrained_path', required=False, metavar='PATH',
+                        help='pretrained XLM-Roberta model path')
+    parser.add_argument('--max_seq_length', required=False, default=256,
+                        metavar='N', type=int,
+                        help='the maximum total input sequence length after '
+                             'WordPiece tokenization.')
+    parser.add_argument('--device', required=False, default="cpu",
+                        metavar='cpu|cuda',
                         help='device type used for processing')
-    parser.add_argument('--squeeze', required=False, default=False,
-                        help='try to squeeze multiple examples into one Input Feature')
+    parser.add_argument('--squeeze', required=False, default=False,
+                        action="store_true",
+                        help='try to squeeze multiple examples into one '
+                             'Input Feature')
     return parser.parse_args()
diff --git a/requirements.txt b/requirements.txt
index d48e2ff8692d575721b8daaa517cb0f65bdb65cd..47053e257f8c6f5d6c799c64b209c16dead15662 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,7 +5,7 @@ tqdm
 fastapi==0.61.1
 uvicorn==0.12.2
 pandas==1.1.1
-wandb==0.10.7
-transformers==4.0.1
-torch==1.7.1
+wandb==0.10.7
+transformers==4.16.2
+torch==1.9.0
 torchsummary==1.5.1
\ No newline at end of file
diff --git a/sample.py b/sample.py
index 5314ab3885f6ee68362634c3ff13abecf8374ba6..59958407b02ac42cea8274e74ad8122d17cb4208 100644
--- a/sample.py
+++ b/sample.py
@@ -3,11 +3,12 @@ import poldeepner2
 path_or_name = "pdn2-v07-cen-n82-base-01"
 ner = poldeepner2.load(path_or_name)
 
-sentences = ["Marek Nowak z Politechniki Wrocławskiej mieszka przy ul. Sądeckiej.",
+sentences = ["Marek Nowak z Politechniki Wrocławskiej mieszka przy ul. "
+             "Sądeckiej.",
              "#PoselAdamNowak Co Pan myśli na temat fuzji Orlenu i Lotosu?"]
 
 for sentence in sentences:
-    print("-"*20)
+    print("-" * 20)
     print(sentence)
     for name in ner.process_text(sentence):
         name_range = "%d:%d" % (name.start, name.end)
diff --git a/sample_conll.py b/sample_conll.py
index ab19a053817bbfcfe9ffb01ce2888a729a9f7711..2cc31aee32cd3fa38b5a5aeed96c7d9339d9fce4 100644
--- a/sample_conll.py
+++ b/sample_conll.py
@@ -1,11 +1,17 @@
+"""A message of shame -- documentation must be completed."""
+
 import poldeepner2.models
 
-ner = poldeepner2.models.load("conll-english-large-sq", device="cuda:0", resources_path="/tmp")
+ner = poldeepner2.models.load("conll-english-large-sq", device="cuda:0",
+                              resources_path="/tmp")
 
-sentences = ["""(CNN)In a new criminal court case against a woman alleged to have entered the US Capitol on January 6, the FBI noted that a tipster raised the possibility of a laptop being stolen from House Speaker Nancy Pelosi's office to potentially sell to Russia."""]
+sentences = ["""(CNN)In a new criminal court case against a woman alleged to
+have entered the US Capitol on January 6, the FBI noted that a tipster
+raised the possibility of a laptop being stolen from House Speaker Nancy
+Pelosi's office to potentially sell to Russia."""]
 
 for sentence in sentences:
-    print("-"*20)
+    print("-" * 20)
     print(sentence)
     for name in ner.process_text(sentence):
         name_range = "%d:%d" % (name.start, name.end)
diff --git a/sample_polem.py b/sample_polem.py
index 02f101bc4977825bc0419587a40011ec50c88ec5..11a4cedd6b68cd3429a49d997029bc63d0584336 100644
--- a/sample_polem.py
+++ b/sample_polem.py
@@ -1,29 +1,36 @@
+"""A message of shame -- documentation must be completed."""
+
 import time
 
 from poldeepner2.models import PolDeepNer2, ModelFactory
 
 resources_path = "../poldeepner2_models"
 
 t0 = time.time()
-model = ModelFactory.get_resource("pdn2_cen_n82_roberta_large_sq_krnnt_cuda.pdn2", resources_path)
+model = ModelFactory.get_resource("pdn2_cen_n82_roberta_large_sq_krnnt_cuda"
+                                  ".pdn2", resources_path)
 ner = PolDeepNer2.load(model)
-time_model = time.time()-t0
+time_model = time.time() - t0
 
-sentences = ["Spotkałem Marka Nowaka na Politechnice Wrocławskiej, który pracuje w Intelu.",
+sentences = ["Spotkałem Marka Nowaka na Politechnice Wrocławskiej, który "
+             "pracuje w Intelu.",
              "Wczoraj mieliśmy kontrolę Naczelnej Izby Skarbowej.",
-             open("tests/resources/text_krakow.txt", "r", encoding="utf-8").read()]
+             open("tests/resources/text_krakow.txt", "r",
+                  encoding="utf-8").read()]
 
 token_count = 0
 t0 = time.time()
 for sentence in sentences:
-    print("-"*20)
+    print("-" * 20)
     print(sentence.strip())
     doc = ner.process_document(sentence)
     token_count += len(doc.tokens)
     for name in doc.annotations:
         name_range = "%d:%d" % (name.start, name.end)
-        char_range = "%d:%d" % (doc.tokens[name.start].start, doc.tokens[name.end - 1].end)
-        print(f"{name_range:<8} {char_range:<12} {name.label:<25} {name.get_text():<25} {name.lemma}")
+        char_range = "%d:%d" % (doc.tokens[name.start].start,
+                                doc.tokens[name.end - 1].end)
+        print(f"{name_range:<8} {char_range:<12} {name.label:<25} "
+              f"{name.get_text():<25} {name.lemma}")
 
 print()
 print()
diff --git a/scripts/config.cfg b/scripts/config.cfg
new file mode 100644
index 0000000000000000000000000000000000000000..9ac6aabd6e4ac56bb40aa8c4f59f4e170ced81cf
--- /dev/null
+++ b/scripts/config.cfg
@@ -0,0 +1,40 @@
+[model]
+path =
+cpu_or_gpu = cpu
+gpu_num = 0
+
+[predict]
+data_path =
+save_to_file = yes
+
+[data]
+# index of the column with NER tags in the TSV datasets (read by trainer.py)
+tag_column_index =
+
+[train]
+adam_epsilon =
+data_test =
+data_train =
+data_tune =
+device = gpu
+dropout = 0.05
+epoch_save_model = 5
+eval_batch_size = 16
+fp16 = false
+fp16_opt_level =
+freeze_model =
+gradient_accumulation_steps =
+hidden_size = 32
+learning_rate = 0.001
+max_grad_norm =
+max_seq_length = 32
+num_train_epochs = 100
+output_dir =
+pretrained_path =
+seed = 42
+squeeze =
+train_batch_size = 16
+training_mix = 0.5
+transfer =
+warmup_proportion =
+weight_decay = 0.1
diff --git a/scripts/evaluator.py b/scripts/evaluator.py
new file mode 100644
index 0000000000000000000000000000000000000000..983f1127af2685a7e73f81ca2ba2b5168b02948a
--- /dev/null
+++ b/scripts/evaluator.py
@@ -0,0 +1,46 @@
+"""Script for evaluating models on a pre-defined set of data."""
+
+import configparser
+from sklearn.metrics import accuracy_score
+from poldeepner2.models import PolDeepNer2
+from poldeepner2.utils.data_utils import NerProcessor
+
+
+def main():
+    """Evaluate the configured model on the configured dataset."""
+    # TODO: serialize the config together with the model (e.g. as JSON)
+    config_file = "config.cfg"
+    config = configparser.ConfigParser()
+    config.read(config_file)
+
+    model = config['model']['path']
+
+    ner = PolDeepNer2.load(model=model)
+
+    data_path = config['predict']['data_path']
+    processor = NerProcessor()
+
+    # Prediction
+    data = processor.get_examples(data_path)
+    prediction_labels = []
+    for sentence in data:
+        print(sentence)
+        prediction = ner.process_text(sentence)
+        print(prediction)
+
+        # predicted label
+        predict_label = prediction[2][2]
+        prediction_labels.append(predict_label)
+
+    # Comparing
+    true_labels = processor.get_labels(data_path)
+
+    eval_res = accuracy_score(true_labels, prediction_labels)
+    print(eval_res)
+
+
+if __name__ == "__main__":
+    try:
+        main()
+    except ValueError as er:
+        print("[ERROR] %s" % er)
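Two editorial notes on the new files above. First, both `trainer.py` and `scripts/evaluator.py` read `config.cfg` through `configparser`, which returns raw strings: the empty fields, and entries such as `epoch_save_model = 5` or `training_mix = 0.5` (read with `getboolean`) and `device = gpu` (not a valid torch device string; "cpu"/"cuda" are), need adjusting before the scripts run. A minimal sketch of how the values are consumed:

```
import configparser

config = configparser.ConfigParser()
config.read("scripts/config.cfg")

learning_rate = config.getfloat("train", "learning_rate")  # 0.001
epochs = config.getint("train", "num_train_epochs")        # 100
squeeze = config.getboolean("train", "squeeze")            # ValueError while empty
```

Second, the token-level `accuracy_score` in `evaluator.py` is dominated by `O` labels and can look high even when entities are wrong. If an entity-level score is wanted, seqeval's F1 over the IOB sequences is a common alternative (a sketch; seqeval is an extra dependency, not in requirements.txt):

```
from seqeval.metrics import f1_score

y_true = [["B-PER", "I-PER", "O", "B-LOC"]]
y_pred = [["B-PER", "I-PER", "O", "O"]]
print(f1_score(y_true, y_pred))  # ~0.67: one of two gold entities found
```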
diff --git a/server.py b/server.py
index 90c130db99ac830ef2ea309eff7a0ef9c73efdbd..644d7f8ab6aa629182ae7b062680d7ad403515ba 100644
--- a/server.py
+++ b/server.py
@@ -1,8 +1,10 @@
+"""A message of shame -- documentation must be completed."""
+
 from __future__ import absolute_import, division, print_function
 import uvicorn
 
 import argparse
 from fastapi import FastAPI
-from typing import Dict, List, Optional
+from typing import List, Optional
 
 from poldeepner2.data.token import Token
 from poldeepner2.models import PolDeepNer2
@@ -14,16 +16,22 @@ from poldeepner2.utils.annotation import Annotation
 
 
 class PredictionReq(BaseModel):
+    """A message of shame -- documentation must be completed."""
+
     text: str
     tokenization: Optional[str] = 'spacy'
 
 
 class Prediction(BaseModel):
+    """A message of shame -- documentation must be completed."""
+
     text: str
     doc: List[List[str]]
 
 
 class ResponseToken(BaseModel):
+    """A message of shame -- documentation must be completed."""
+
     orth: str
     lemma: str
     start: int
@@ -32,10 +40,21 @@ class ResponseToken(BaseModel):
 
     @staticmethod
     def generate(token: Token):
-        return {"orth": token.orth, "lemma": token.lemma, "start": token.start, "end": token.end, "eos": token.eos}
+        """A message of shame -- documentation must be completed.
+
+        Args:
+            token: A message of shame -- documentation must be completed.
+
+        Returns:A message of shame -- documentation must be completed.
+
+        """
+        return {"orth": token.orth, "lemma": token.lemma, "start": token.start,
+                "end": token.end, "eos": token.eos}
 
 
 class ResponseAnnotation(BaseModel):
+    """A message of shame -- documentation must be completed."""
+
     text: str
     label: str
     lemma: str
@@ -44,29 +63,47 @@ class ResponseAnnotation(BaseModel):
 
     @staticmethod
     def generate(an: Annotation):
-        return {"text": an.get_text(), "label": an.annotation, "lemma": an.lemma, "start": an.start, "end": an.end}
+        """A message of shame -- documentation must be completed.
+
+        Args:
+            an: A message of shame -- documentation must be completed.
+
+        Returns:A message of shame -- documentation must be completed.
+
+        """
+        return {"text": an.get_text(), "label": an.annotation,
+                "lemma": an.lemma, "start": an.start, "end": an.end}
 
 
 class ResponsePolem(BaseModel):
+    """A message of shame -- documentation must be completed."""
+
     text: str
     tokens: List[ResponseToken]
     annotations: List[ResponseAnnotation]
 
 
 class Server:
+    """A message of shame -- documentation must be completed."""
+
     app = FastAPI()
-
+
     app.add_middleware(CORSMiddleware, allow_origins=['*'],
                        allow_credentials=True, allow_methods=['*'],
                        allow_headers=['*'])
-
+
     global spacyTokenizer
     spacyTokenizer = load('spacy')
 
     @app.post('/predict', response_model=Prediction)
     async def predict(pred_req: PredictionReq):
+        """A message of shame -- documentation must be completed.
+
+        Returns:A message of shame -- documentation must be completed.
+
+        """
         text = pred_req.text
         sentences = text.split('\n')
         tokens = spacyTokenizer.tokenize(sentences)
@@ -75,28 +112,45 @@ class Server:
 
     @app.post('/polem', response_model=ResponsePolem)
     async def polem(pred_req: PredictionReq):
+        """A message of shame -- documentation must be completed.
+
+        Returns:A message of shame -- documentation must be completed.
+
+        """
         text = pred_req.text
         doc = ner.process_document(text)
         return {"text": text,
                 "tokens": [ResponseToken.generate(t) for t in doc.tokens],
-                "annotations": [ResponseAnnotation.generate(t) for t in doc.annotations]}
+                "annotations": [ResponseAnnotation.generate(t) for t
+                                in doc.annotations]}
 
 
 def parse_args():
+    """A message of shame -- documentation must be completed."""
     parser = argparse.ArgumentParser(
         description='Process a single TSV with a NER model')
-    parser.add_argument('--model', required=True, metavar='PATH', help='path to NER model')
-    parser.add_argument('--device', required=False, default="cpu", metavar='cpu|cuda',
+    parser.add_argument('--model', required=True, metavar='PATH',
+                        help='path to NER model')
+    parser.add_argument('--device', required=False, default="cpu",
+                        metavar='cpu|cuda',
                         help='device type used for processing')
-    parser.add_argument('--max_seq_length', required=False, default=256, metavar='N', type=int,
-                        help='the maximum total input sequence length after WordPiece tokenization.')
-    parser.add_argument('--pretrained_path', required=False, metavar='PATH', default=None,
-                        help='pretrained XLM-Roberta model path with model name as prefix, a.e automodel:allegro/herbert-large-cased')
+    parser.add_argument('--max_seq_length', required=False, default=256,
+                        metavar='N', type=int,
+                        help='the maximum total input sequence length after '
+                             'WordPiece tokenization.')
+    parser.add_argument('--pretrained_path', required=False, metavar='PATH',
+                        default=None,
+                        help='pretrained XLM-Roberta model path with model '
+                             'name as prefix, '
+                             'e.g. automodel:allegro/herbert-large-cased')
     parser.add_argument('--processes', help='number of processes', default=1)
-    parser.add_argument('--tokenization', required=False, default="spacy-ext", choices=names,
+    parser.add_argument('--tokenization', required=False, default="spacy-ext",
+                        choices=names,
                         help='Tokenization method')
-    parser.add_argument('--squeeze', required=False, default=False, action="store_true",
-                        help='try to squeeze multiple examples into one Input Feature')
+    parser.add_argument('--squeeze', required=False, default=False,
+                        action="store_true",
+                        help='try to squeeze multiple examples into one '
+                             'Input Feature')
     parser.add_argument('--host', required=False, default="0.0.0.0")
     parser.add_argument('--port', required=False, default=8001, type=int)
     return parser.parse_args()
@@ -108,11 +162,15 @@ if __name__ == "__main__":
     cliargs = parse_args()
     try:
         global ner
-        ner = PolDeepNer2.load(cliargs.model, pretrained_path=cliargs.pretrained_path, device=cliargs.device,
-                               max_seq_length=cliargs.max_seq_length, squeeze=cliargs.squeeze,
+        ner = PolDeepNer2.load(cliargs.model,
+                               pretrained_path=cliargs.pretrained_path,
+                               device=cliargs.device,
+                               max_seq_length=cliargs.max_seq_length,
+                               squeeze=cliargs.squeeze,
                                tokenizer=TokenizerSpaces())
-
+
         # threaded=True, processes=cliargs.processes
-        uvicorn.run(server.app, host=cliargs.host, port=cliargs.port, log_level="info")
+        uvicorn.run(server.app, host=cliargs.host, port=cliargs.port,
+                    log_level="info")
     except ValueError as er:
         print("[ERROR] %s" % er)
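Once the service is up, the two endpoints above can be exercised with plain HTTP; the request body mirrors `PredictionReq` and the port defaults to 8001 (a sketch; host and sentence are placeholders):

```
curl -X POST http://localhost:8001/predict \
     -H "Content-Type: application/json" \
     -d '{"text": "Marek Nowak mieszka we Wrocławiu.", "tokenization": "spacy"}'

curl -X POST http://localhost:8001/polem \
     -H "Content-Type: application/json" \
     -d '{"text": "Marek Nowak mieszka we Wrocławiu."}'
```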
diff --git a/setup.py b/setup.py
index 5762a74b3b0f102b561f66db5bb69576b4ba06f6..b32d86af9ba5fcdec6ba6d753a6fa060a20032c3 100644
--- a/setup.py
+++ b/setup.py
@@ -20,7 +20,8 @@ setuptools.setup(
     version="0.7.0",
     author="Michał Marcińczuk",
     author_email="michal.marcinczuk@pwr.edu.pl",
-    description="PolDeepNer2 is a tool for sequence labeling tasks based on transformer language models.",
+    description="PolDeepNer2 is a tool for sequence labeling tasks based on "
+                "transformer language models.",
     long_description=long_description,
     long_description_content_type="text/markdown",
     url="https://github.com/CLARIN-PL/PolDeepNer2",
diff --git a/tests/pipeline/test_lemmatization.py b/tests/pipeline/test_lemmatization.py
index 4d0afea3296ac5a10cc09f2b5520169eba358cba..d5191ab45bf16f0fac90ee08e9934cc358d05dea 100644
--- a/tests/pipeline/test_lemmatization.py
+++ b/tests/pipeline/test_lemmatization.py
@@ -1,3 +1,4 @@
+"""A message of shame -- documentation must be completed."""
 import pytest
 
 from poldeepner2.data.token import Token
@@ -7,17 +8,31 @@ from poldeepner2.utils.annotation import Annotation
 
 @pytest.mark.external
 @pytest.mark.parametrize("annotation, lemma", [
-    (Annotation("nam_liv_person", tokens=[Token("Tomka", 0, 5, "Tomek", "", "subst:sg:gen:m1")]), "Tomek"),
-    (Annotation("nam_liv_person", tokens=[Token("Mickiewicza", 0, 11, "Mickiewicz", "", "subst:sg:gen:m1")]),
+    (Annotation("nam_liv_person",
+                tokens=[Token("Tomka", 0, 5, "Tomek", "", "subst:sg:gen:m1")]),
+     "Tomek"),
+    (Annotation("nam_liv_person",
+                tokens=[Token("Mickiewicza", 0, 11, "Mickiewicz", "",
+                              "subst:sg:gen:m1")]),
     "Mickiewicz"),
     (Annotation("nam_org_institution", tokens=[
         Token("Lidze", 0, 5, "liga", " ", "subst:sg:loc:f"),
-        Token("światowej", 7, 16, "światowy", " ", "adj:sg:loc:f:pos")]), "Liga światowa"),
+        Token("światowej", 7, 16, "światowy", " ", "adj:sg:loc:f:pos")]),
+     "Liga światowa"),
     (Annotation("", tokens=[
         Token("Lidze", 0, 5, "liga", " ", "subst:sg:loc:f"),
-        Token("światowej", 7, 16, "światowy", " ", "adj:sg:loc:f:pos")]), "liga światowa")
+        Token("światowej", 7, 16, "światowy", " ", "adj:sg:loc:f:pos")]),
+     "liga światowa")
 ])
-def test_annotation_lemmatizer_polem_single(annotation: Annotation, lemma: str):
+def test_annotation_lemmatizer_polem_single(annotation: Annotation,
+                                            lemma: str):
+    """A message of shame -- documentation must be completed.
+
+    Args:
+        annotation: A message of shame -- documentation must be completed.
+        lemma: A message of shame -- documentation must be completed.
+
+    """
     annotations = [annotation]
     polem = AnnotationLemmatizerPolem()
     polem.process(annotations)
diff --git a/tests/pipeline/test_tokenization.py b/tests/pipeline/test_tokenization.py
index 8cf7ede01825b0168bd9a0324e0584aba9cfb5ae..7ebd484d8bf7ea7e87f9f06dc00addfb579571cb 100644
--- a/tests/pipeline/test_tokenization.py
+++ b/tests/pipeline/test_tokenization.py
@@ -1,3 +1,4 @@
+"""A message of shame -- documentation must be completed."""
 import pytest
 
 from poldeepner2.pipeline.tokenization import TokenizerKrnnt, TokenizerSpaces
@@ -5,16 +6,37 @@ from poldeepner2.pipeline.tokenization import TokenizerKrnnt, TokenizerSpaces
 
 @pytest.fixture(scope='session', autouse=True)
 def tokenizer_krnnt():
+    """A message of shame -- documentation must be completed.
+
+    Returns: TokenizerKrnnt()
+
+    """
     return TokenizerKrnnt()
 
 
 @pytest.mark.external
 @pytest.mark.parametrize("text, orths, lemmas, ws, morphs, starts, ends", [
-    ("Ala ma kota.", ["Ala", "ma", "kota", "."], ["Ala", "mieć", "kot", "."], [" ", " ", "", ""],
-     ["subst:sg:nom:f", "fin:sg:ter:imperf", "subst:sg:acc:m2", "interp"], [0, 4, 7, 11], [3, 6, 11, 12])
+    ("Ala ma kota.", ["Ala", "ma", "kota", "."], ["Ala", "mieć", "kot", "."],
+     [" ", " ", "", ""],
+     ["subst:sg:nom:f", "fin:sg:ter:imperf", "subst:sg:acc:m2", "interp"],
+     [0, 4, 7, 11], [3, 6, 11, 12])
 ])
-def test_tokenizer_krrnt_text(text: str, orths: [str], lemmas: [str], ws: [str], morphs: [str],
+def test_tokenizer_krrnt_text(text: str, orths: [str], lemmas: [str],
+                              ws: [str], morphs: [str],
                               starts: [int], ends: [int], tokenizer_krnnt):
+    """A message of shame -- documentation must be completed.
+
+    Args:
+        text: A message of shame -- documentation must be completed.
+        orths: A message of shame -- documentation must be completed.
+        lemmas: A message of shame -- documentation must be completed.
+        ws: A message of shame -- documentation must be completed.
+        morphs: A message of shame -- documentation must be completed.
+        starts: A message of shame -- documentation must be completed.
+        ends: A message of shame -- documentation must be completed.
+        tokenizer_krnnt: A message of shame -- documentation must be completed.
+
+    """
     sentence = tokenizer_krnnt.tokenize_tokens([text])[0]
 
     assert len(sentence) == len(orths)
@@ -31,10 +53,18 @@ def test_tokenizer_krrnt_text(text: str, orths: [str], lemmas: [str], ws: [str],
         (["Ala ma kota"], [["Ala", "ma", "kota"]]),
         ([" Ala ma kota"], [["Ala", "ma", "kota"]]),
         (["Ala ma kota"], [["Ala", "ma", "kota"]]),
-        (["Ala ma kota", "Kot jest łaciaty"], [["Ala", "ma", "kota"], ["Kot", "jest", "łaciaty"]])
+        (["Ala ma kota", "Kot jest łaciaty"], [["Ala", "ma", "kota"],
+                                               ["Kot", "jest", "łaciaty"]])
     ]
 )
 def test_tokenizer_spaces(texts, tokens):
+    """A message of shame -- documentation must be completed.
+
+    Args:
+        texts: A message of shame -- documentation must be completed.
+        tokens: A message of shame -- documentation must be completed.
+
+    """
     tokenizer = TokenizerSpaces()
     output = tokenizer.tokenize(texts)
-    output == tokens
+    assert output == tokens
diff --git a/tests/unit/utils/test_align_tokens_to_text.py b/tests/unit/utils/test_align_tokens_to_text.py
index 1a402d43e22969814b8368f219455dc7cd1afda5..5ceff1a52c075c7042d388c67f719cdb196df850 100644
--- a/tests/unit/utils/test_align_tokens_to_text.py
+++ b/tests/unit/utils/test_align_tokens_to_text.py
@@ -1,4 +1,6 @@
+"""A message of shame -- documentation must be completed."""
 import pytest
+
 from poldeepner2.utils.data_utils import align_tokens_to_text
 
 
@@ -6,8 +8,16 @@ from poldeepner2.utils.data_utils import align_tokens_to_text
     ("Ala ma kota", [["Ala", "ma", "kota"]], [(0, 3), (4, 6), (7, 11)]),
     ("Ala ma kota", [["Ala", "ma", "kota"]], [(0, 3), (4, 6), (8, 12)]),
     (" Ala ma kota", [["Ala", "ma", "kota"]], [(1, 4), (5, 7), (8, 12)])
-    ]
+]
 )
 def test_align_tokens_to_text(text, tokens, expected_offsets):
+    """A message of shame -- documentation must be completed.
+
+    Args:
+        text: A message of shame -- documentation must be completed.
+        tokens: A message of shame -- documentation must be completed.
+        expected_offsets: A message of shame -- documentation must be completed.
+ + """ offsets = align_tokens_to_text(tokens, text) assert offsets == expected_offsets diff --git a/tests/unit/utils/test_iob2_to_iob.py b/tests/unit/utils/test_iob2_to_iob.py index e2f0ee50a384dd654028cb315a1a7d3f26276d19..1cecd0d4baa4021cfea4ec364e463627e9e6253d 100644 --- a/tests/unit/utils/test_iob2_to_iob.py +++ b/tests/unit/utils/test_iob2_to_iob.py @@ -1,9 +1,9 @@ +"""A message of shame -- documentation must be completed.""" import pytest import sys import pathlib - -sys.path.append(str(pathlib.Path(__file__).absolute().parents[3].resolve())) from poldeepner2.utils.data_utils import iob2_to_iob +sys.path.append(str(pathlib.Path(__file__).absolute().parents[3].resolve())) @pytest.mark.parametrize( @@ -12,25 +12,48 @@ from poldeepner2.utils.data_utils import iob2_to_iob 'Alex I-PER\nis O\ngoing O\nto O\nLos I-LOC\nAngeles I-LOC'), ('Alex B-PER', 'Alex I-PER'), - ('Alex B-PER\nAngeles I-PER\nis O\ngoing O\nto O\nLos B-LOC\nAngeles I-LOC', - 'Alex I-PER\nAngeles I-PER\nis O\ngoing O\nto O\nLos I-LOC\nAngeles I-LOC'), - ('is O\ngoing O\nAlex B-PER\nAngeles I-PER\nis O\ngoing O\nto O\nLos B-LOC\nAngeles I-LOC', - 'is O\ngoing O\nAlex I-PER\nAngeles I-PER\nis O\ngoing O\nto O\nLos I-LOC\nAngeles I-LOC'), - ('Alex B-PER\nis O\ngoing O\nAlex B-PER\nto O\nLos B-LOC\nAngeles I-LOC', - 'Alex I-PER\nis O\ngoing O\nAlex I-PER\nto O\nLos I-LOC\nAngeles I-LOC'), - ('Alex B-PER\nis O\ngoing O\nAlex B-PER\nto O\nLos B-LOC\nAngeles I-LOC\nAlex B-PER', - 'Alex I-PER\nis O\ngoing O\nAlex I-PER\nto O\nLos I-LOC\nAngeles I-LOC\nAlex I-PER'), + ('Alex B-PER\nAngeles I-PER\nis O\ngoing O\nto O\nLos B-LOC\nAngeles ' + 'I-LOC', + 'Alex I-PER\nAngeles I-PER\nis O\ngoing O\nto O\nLos I-LOC\nAngeles ' + 'I-LOC'), + ('is O\ngoing O\nAlex B-PER\nAngeles I-PER\nis O\ngoing O\nto O\nLos ' + 'B-LOC\nAngeles I-LOC', + 'is O\ngoing O\nAlex I-PER\nAngeles I-PER\nis O\ngoing O\nto O\nLos ' + 'I-LOC\nAngeles I-LOC'), + ('Alex B-PER\nis O\ngoing O\nAlex B-PER\nto O\nLos B-LOC\nAngeles ' + 'I-LOC', + 'Alex I-PER\nis O\ngoing O\nAlex I-PER\nto O\nLos I-LOC\nAngeles ' + 'I-LOC'), + ('Alex B-PER\nis O\ngoing O\nAlex B-PER\nto O\nLos B-LOC\nAngeles ' + 'I-LOC\nAlex B-PER', + 'Alex I-PER\nis O\ngoing O\nAlex I-PER\nto O\nLos I-LOC\nAngeles ' + 'I-LOC\nAlex I-PER'), # nested - ('Alex B-PER#B-ORG\nis I-ORG\ngoing O\nAlex B-PER\nto O\nLos B-LOC\nAngeles I-LOC\nAlex B-PER#B-LOC', - 'Alex I-PER#I-ORG\nis I-ORG\ngoing O\nAlex I-PER\nto O\nLos I-LOC\nAngeles I-LOC\nAlex I-PER#B-LOC'), - ('Alex B-PER#B-ORG\nis I-ORG\ngoing O\nAlex B-PER\nto O\nLos B-LOC\nAngeles I-LOC\nAlex B-PER#B-LOC\nAlex B-PER#B-LOC', - 'Alex I-PER#I-ORG\nis I-ORG\ngoing O\nAlex I-PER\nto O\nLos I-LOC\nAngeles I-LOC\nAlex I-PER#B-LOC\nAlex B-PER#I-LOC'), - ('Alex B-PER#B-ORG\nis I-ORG\ngoing O\nAlex B-PER\nto O\nLos B-LOC#B-PER\nAngeles B-LOC#B-PER', - 'Alex I-PER#I-ORG\nis I-ORG\ngoing O\nAlex I-PER\nto O\nLos I-LOC#I-PER\nAngeles B-LOC#B-PER'), - ('Alex B-PER#B-ORG\nis I-ORG\ngoing O\nAlex B-PER#B-NAV\nto I-NAV\nLos B-LOC#B-PER\nAngeles I-LOC#B-PER', - 'Alex I-PER#I-ORG\nis I-ORG\ngoing O\nAlex I-PER#I-NAV\nto I-NAV\nLos I-LOC#I-PER\nAngeles I-LOC#B-PER') + ('Alex B-PER#B-ORG\nis I-ORG\ngoing O\nAlex B-PER\nto O\nLos ' + 'B-LOC\nAngeles I-LOC\nAlex B-PER#B-LOC', + 'Alex I-PER#I-ORG\nis I-ORG\ngoing O\nAlex I-PER\nto O\nLos ' + 'I-LOC\nAngeles I-LOC\nAlex I-PER#B-LOC'), + ('Alex B-PER#B-ORG\nis I-ORG\ngoing O\nAlex B-PER\nto O\nLos ' + 'B-LOC\nAngeles I-LOC\nAlex B-PER#B-LOC\nAlex B-PER#B-LOC', + 'Alex I-PER#I-ORG\nis I-ORG\ngoing O\nAlex I-PER\nto O\nLos ' + 
'I-LOC\nAngeles I-LOC\nAlex I-PER#B-LOC\nAlex B-PER#I-LOC'), + ('Alex B-PER#B-ORG\nis I-ORG\ngoing O\nAlex B-PER\nto O\nLos ' + 'B-LOC#B-PER\nAngeles B-LOC#B-PER', + 'Alex I-PER#I-ORG\nis I-ORG\ngoing O\nAlex I-PER\nto O\nLos ' + 'I-LOC#I-PER\nAngeles B-LOC#B-PER'), + ('Alex B-PER#B-ORG\nis I-ORG\ngoing O\nAlex B-PER#B-NAV\nto ' + 'I-NAV\nLos B-LOC#B-PER\nAngeles I-LOC#B-PER', + 'Alex I-PER#I-ORG\nis I-ORG\ngoing O\nAlex I-PER#I-NAV\nto ' + 'I-NAV\nLos I-LOC#I-PER\nAngeles I-LOC#B-PER') ] ) def test_iob2_to_iob(iob2_input, expected_output): + """A message of shame -- documentation must be completed. + + Args: + iob2_input: A message of shame -- documentation must be completed. + expected_output: A message of shame -- documentation must be completed. + + """ iob1 = iob2_to_iob(iob2_input) assert iob1.split('\n') == expected_output.split('\n') diff --git a/tests/unit/utils/test_poleval_dict.py b/tests/unit/utils/test_poleval_dict.py index 94a7d147bb32ef212abc5e911c45f2abafbeceee..4aa9a65b8ab528c7020c4099cd517085d735276b 100644 --- a/tests/unit/utils/test_poleval_dict.py +++ b/tests/unit/utils/test_poleval_dict.py @@ -1,12 +1,14 @@ +"""A message of shame -- documentation must be completed.""" import pytest -from poldeepner2.utils.data_utils import get_poleval_dict, read_tsv, wrap_annotations +from poldeepner2.utils.data_utils import get_poleval_dict, wrap_annotations @pytest.mark.parametrize( "id, text, tokens, labels, answers", [ ('9 from 1828: PCCwR-1.1-TXT/short/Literatura piękna/2955.txt', 'Pojutrze - wyszeptał niepewnie, a Beny kaszlnął. Raz Benjamin trzy', - [['Pojutrze', '-', 'wyszeptał', 'niepewnie', ',', 'a', 'Beny', 'kaszlnął', '.'], + [['Pojutrze', '-', 'wyszeptał', 'niepewnie', ',', 'a', 'Beny', + 'kaszlnął', '.'], ['Raz', 'Benjamin', 'trzy']], [['O', 'O', 'O', 'O', 'O', 'O', 'B-persName', 'O', 'O'], ['O', 'B-persName', 'O']], @@ -14,7 +16,8 @@ from poldeepner2.utils.data_utils import get_poleval_dict, read_tsv, wrap_annota ('9 from 1828: PCCwR-1.1-TXT/short/Literatura piękna/2955.txt', 'Pojutrze - wyszeptał niepewnie, a Beny kaszlnął. Raz Benjamin trzy', - [['Pojutrze', '-', 'wyszeptał', 'niepewnie', ',', 'a', 'Beny', 'kaszlnął', '.'], + [['Pojutrze', '-', 'wyszeptał', 'niepewnie', ',', 'a', 'Beny', + 'kaszlnął', '.'], ['Raz', 'Benjamin', 'trzy']], [['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O']], @@ -22,30 +25,48 @@ from poldeepner2.utils.data_utils import get_poleval_dict, read_tsv, wrap_annota ('9 from 1828: PCCwR-1.1-TXT/short/Literatura piękna/2955.txt', 'Pojutrze - wyszeptał niepewnie, a Beny kaszlnął. Raz Benjamin trzy', - [['Pojutrze', '-', 'wyszeptał', 'niepewnie', ',', 'a', 'Beny', 'kaszlnął', '.'], + [['Pojutrze', '-', 'wyszeptał', 'niepewnie', ',', 'a', 'Beny', + 'kaszlnął', '.'], ['Raz', 'Benjamin', 'trzy']], - [['O', 'O', 'O', 'O', 'O', 'O', 'B-persName#B-persName-forename', 'O', 'O'], + [['O', 'O', 'O', 'O', 'O', 'O', 'B-persName#B-persName-forename', + 'O', 'O'], ['O', 'B-persName#B-persName-forename', 'O']], - 'persName 34 38\tBeny\npersName_forename 34 38\tBeny\npersName 53 61\tBenjamin\npersName_forename 53 61\tBenjamin'), + 'persName 34 38\tBeny\npersName_forename 34 38\tBeny\npersName 53 ' + '61\tBenjamin\npersName_forename 53 61\tBenjamin'), ('9 from 1828: PCCwR-1.1-TXT/short/Literatura piękna/2955.txt', 'Pojutrze - wyszeptał niepewnie, a Londyn kaszlnął. 
Raz Londyn trzy', - [['Pojutrze', '-', 'wyszeptał', 'niepewnie', ',', 'a', 'Londyn', 'kaszlnął', '.'], + [['Pojutrze', '-', 'wyszeptał', 'niepewnie', ',', 'a', 'Londyn', + 'kaszlnął', '.'], ['Raz', 'Londyn', 'trzy']], [['O', 'O', 'O', 'O', 'O', 'O', 'B-LOC#B-PER', 'O', 'O'], ['O', 'B-LOC#B-PER', 'O']], - 'LOC 34 40\tLondyn\nPER 34 40\tLondyn\nLOC 55 61\tLondyn\nPER 55 61\tLondyn'), + 'LOC 34 40\tLondyn\nPER 34 40\tLondyn\nLOC 55 61\tLondyn\nPER 55 ' + '61\tLondyn'), ('9 from 1828: PCCwR-1.1-TXT/short/Literatura piękna/2955.txt', - 'Pojutrze - wyszeptał niepewnie, a Londyn kaszlnął. Raz Londyn trzy', - [['Pojutrze', '-', 'wyszeptał', 'niepewnie', ',', 'a', 'Londyn', 'kaszlnął', '.'], + 'Pojutrze - wyszeptał niepewnie, a Londyn kaszlnął. Raz Londyn ' + ' trzy', + [['Pojutrze', '-', 'wyszeptał', 'niepewnie', ',', 'a', 'Londyn', + 'kaszlnął', '.'], ['Raz', 'Londyn', 'trzy']], [['O', 'O', 'O', 'O', 'O', 'O', 'B-LOC#B-PER', 'O', 'O'], ['O', 'B-LOC#B-PER', 'O']], - 'LOC 37 43\tLondyn\nPER 37 43\tLondyn\nLOC 60 66\tLondyn\nPER 60 66\tLondyn') + 'LOC 37 43\tLondyn\nPER 37 43\tLondyn\nLOC 60 66\tLondyn\nPER 60 ' + '66\tLondyn') ] ) def test_get_poleval_dict(id, text, tokens, labels, answers): + """A message of shame -- documentation must be completed. + + Args: + id: A message of shame -- documentation must be completed. + text: A message of shame -- documentation must be completed. + tokens: A message of shame -- documentation must be completed. + labels: A message of shame -- documentation must be completed. + answers: A message of shame -- documentation must be completed. + + """ annotations = wrap_annotations(labels) poleval_dict = get_poleval_dict(id, text, tokens, annotations) assert poleval_dict == {'text': text, 'id': id, 'answers': answers} diff --git a/tests/unit/utils/test_read_tsv.py b/tests/unit/utils/test_read_tsv.py index 51be8728ea248d12e051ec4d83b0bfa61dc74b4e..7ebb214717d17c0d7f78b2d062cae7b5437be369 100644 --- a/tests/unit/utils/test_read_tsv.py +++ b/tests/unit/utils/test_read_tsv.py @@ -1,3 +1,4 @@ +"""A message of shame -- documentation must be completed.""" from pathlib import Path import pytest @@ -10,7 +11,8 @@ from poldeepner2.utils.data_utils import read_tsv "path, expected_output", [ ('tsv_test.tsv', [('Ala z Krakowa jeździ Audi'.split(" "), ['O'] * 5), - ('Marek Nowak z Politechniki Wrocławskiej mieszka przy ul . Sądeckiej'.split(" "), ['O'] * 10)]) + ('Marek Nowak z Politechniki Wrocławskiej mieszka przy ul . 
' + 'Sądeckiej'.split(" "), ['O'] * 10)]) ] ) def test_get_read_tsv(path, expected_output): diff --git a/tests/unit/utils/test_sequence_labeling.py b/tests/unit/utils/test_sequence_labeling.py index 8db73de4f717252e69dc57cdb3f3f62c45d9f497..97db2b68b92c11b61feaf9b2a2f261072ec720b8 100644 --- a/tests/unit/utils/test_sequence_labeling.py +++ b/tests/unit/utils/test_sequence_labeling.py @@ -1,3 +1,4 @@ +"""A message of shame -- documentation must be completed.""" import pytest from poldeepner2.utils.sequence_labeling import get_entities @@ -9,12 +10,19 @@ from poldeepner2.utils.sequence_labeling import get_entities (["O", "B-PER", "I-PER"], [("PER", 1, 2)]), (["B-PER", "B-PER"], [("PER", 0, 0), ("PER", 1, 1)]), (["B-LOC", "I-PER", "I-PER"], [("LOC", 0, 0), ("PER", 1, 2)]), - (["B-LOC#B-PER", "I-LOC", "I-LOC#B-PER"], [("LOC", 0, 2), ("PER", 0, 0), ("PER", 2, 2)]), - (["B-LOC#I-PER", "I-LOC", "I-LOC#I-PER"], [("LOC", 0, 2), ("PER", 0, 0), ("PER", 2, 2)]), - (["B-nam_liv_person#B-nam_liv_person_first", "B-nam_liv_person#B-nam_liv_person_last"], - [('nam_liv_person', 0, 0), ('nam_liv_person_first', 0, 0), ('nam_liv_person', 1, 1), ('nam_liv_person_last', 1, 1)]), + (["B-LOC#B-PER", "I-LOC", "I-LOC#B-PER"], [("LOC", 0, 2), + ("PER", 0, 0), + ("PER", 2, 2)]), + (["B-LOC#I-PER", "I-LOC", "I-LOC#I-PER"], [("LOC", 0, 2), + ("PER", 0, 0), + ("PER", 2, 2)]), + (["B-nam_liv_person#B-nam_liv_person_first", + "B-nam_liv_person#B-nam_liv_person_last"], + [('nam_liv_person', 0, 0), ('nam_liv_person_first', 0, 0), + ('nam_liv_person', 1, 1), ('nam_liv_person_last', 1, 1)]), (["B-persName#B-persName-forename", "B-persName"], - [("persName", 0, 0), ("persName-forename", 0, 0), ("persName", 1, 1)]), + [("persName", 0, 0), ("persName-forename", 0, 0), + ("persName", 1, 1)]), (["I-PER", "I-PER"], [("PER", 0, 1)]), (["B-PER", "B-PER"], [("PER", 0, 0), ("PER", 1, 1)]), # (["B-PER", "S-PER"], [("PER", 0, 0), ("PER", 1, 1)]), @@ -23,5 +31,12 @@ from poldeepner2.utils.sequence_labeling import get_entities ] ) def test_get_entities(labels, expected): + """A message of shame -- documentation must be completed. + + Args: + labels: A message of shame -- documentation must be completed. + expected: A message of shame -- documentation must be completed. 
+ + """ entities = get_entities(labels) assert set(entities) == set(expected) diff --git a/tests/unit/utils/test_wrap_annotations.py b/tests/unit/utils/test_wrap_annotations.py index 6836cba37e3d47653a707f694bb57287bf04eb2c..d0452712c2d3adad9d06c5a437612dde0a226092 100644 --- a/tests/unit/utils/test_wrap_annotations.py +++ b/tests/unit/utils/test_wrap_annotations.py @@ -1,10 +1,13 @@ +"""A message of shame -- documentation must be completed.""" import codecs from pathlib import Path -from poldeepner2.utils.data_utils import read_tsv, wrap_annotations, align_tokens_to_text +from poldeepner2.utils.data_utils import read_tsv, wrap_annotations, \ + align_tokens_to_text def test_wrap_and_align_tokens_to_text(): + """A message of shame -- documentation must be completed.""" root = Path(__file__).parents[2].absolute() / "resources" path_iob = str(root / "poleval_0337_iob.tsv") diff --git a/tox.ini b/tox.ini new file mode 100644 index 0000000000000000000000000000000000000000..8f84e117ac1320bd94573cc6ceef3a383c9129a2 --- /dev/null +++ b/tox.ini @@ -0,0 +1,44 @@ +[tox] +envlist = pep8,docstyle +skipsdist = True + +[testenv:pep8] +deps = + flake8 +basepython = python3.8 +commands = + flake8 {posargs} + +[testenv:docstyle] +deps = + pydocstyle +basepython = python3.8 +commands = + pydocstyle --verbose {posargs} + +[flake8] +# W504 skipped because it is overeager and unnecessary +ignore = W504 +show-source = True +exclude = .git,.venv,.tox,dist,doc,*egg,build,venv +import-order-style = pep8 +max-line-length = 80 + + +[pydocstyle] +# D104 Missing docstring in public package +# D203 1 blank line required before class docstring +# D213 Multi-line docstring summary should start at the second line +# D214 Section is over-indented +# D215 Section underline is over-indented +# D401 First line should be in imperative mood; try rephrasing +# D405 Section name should be properly capitalized +# D406 Section name should end with a newline +# D407 Missing dashed underline after section +# D408 Section underline should be in the line following the section’s name +# D409 Section underline should match the length of its name +# D410 Missing blank line after section +# D411 Missing blank line before section +ignore = D104,D203,D213,D214,D215,D401,D405,D406,D407,D408,D409,D410,D411 +match-dir = ^(?!\.tox|venv).* +match = ^(?!setup).*\.py \ No newline at end of file diff --git a/train.py b/train.py index acbc67c28fc498500b703f7d8ebd7f61cf5d30e3..382309cc51d15d46b758eac8e86475e5f1df752b 100644 --- a/train.py +++ b/train.py @@ -1,3 +1,5 @@ +"""A message of shame -- documentation must be completed.""" + from __future__ import absolute_import, division, print_function import argparse @@ -39,19 +41,19 @@ def train_model(args: Namespace): args.output_dir += suffix config = { - "epochs": args.num_train_epochs, - "language_model": args.pretrained_path, - "batch_size": args.train_batch_size, - "data_train": args.data_train, - "data_tune": args.data_tune, - "data_test": args.data_test, - "max_seq_length": args.max_seq_length, - "warmup_proportion": args.warmup_proportion, - "learning_rate": args.learning_rate, - "gradient_accumulation_steps": args.gradient_accumulation_steps, - "squeeze": args.squeeze, - "dropout": args.dropout, - "output_dir": args.output_dir + "epochs": args.num_train_epochs, + "language_model": args.pretrained_path, + "batch_size": args.train_batch_size, + "data_train": args.data_train, + "data_tune": args.data_tune, + "data_test": args.data_test, + "max_seq_length": args.max_seq_length, + "warmup_proportion": 
args.warmup_proportion,
+        "learning_rate": args.learning_rate,
+        "gradient_accumulation_steps": args.gradient_accumulation_steps,
+        "squeeze": args.squeeze,
+        "dropout": args.dropout,
+        "output_dir": args.output_dir
     }
 
     if args.wandb:
@@ -61,7 +63,8 @@ def train_model(args: Namespace):
         wandb.run.save()
 
     if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
-        raise ValueError("Output directory (%s) already exists and is not empty." % args.output_dir)
+        raise ValueError("Output directory (%s) already exists and is not "
+                         "empty." % args.output_dir)
 
     Path(args.output_dir).mkdir(parents=True, exist_ok=True)
 
@@ -70,7 +73,8 @@ def train_model(args: Namespace):
         logger.info(item)
 
     if args.gradient_accumulation_steps < 1:
-        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
+        raise ValueError("Invalid gradient_accumulation_steps parameter: "
+                         "%s, should be >= 1"
                          % args.gradient_accumulation_steps)
 
     args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
 
@@ -92,7 +96,7 @@ def train_model(args: Namespace):
     logger.info("Loading training data...")
     t0 = time.time()
     train_examples = processor.get_examples(args.data_train, "train")
-    logger.info(f"Training data was loaded in {time.time()-t0} second(s)")
+    logger.info(f"Training data was loaded in {time.time() - t0} second(s)")
 
     # preparing model configs
     hidden_size = 1024 if 'large' in args.pretrained_path else \
@@ -116,14 +120,19 @@ def train_model(args: Namespace):
     logger.info(f"Pretrained model was loaded in {time.time()-t0} second(s)")
 
     train_features = convert_examples_to_features(
-        train_examples, label_list, args.max_seq_length, model.encode_word, args.squeeze)
+        train_examples, label_list,
+        args.max_seq_length, model.encode_word,
+        args.squeeze)
 
     if args.training_mix:
         train_features.extend(convert_examples_to_features(
-            train_examples, label_list, args.max_seq_length, model.encode_word, not args.squeeze))
+            train_examples, label_list,
+            args.max_seq_length, model.encode_word,
+            not args.squeeze))
 
     num_train_optimization_steps = int(
-        len(train_features) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
+        len(train_features) / args.train_batch_size /
+        args.gradient_accumulation_steps) * args.num_train_epochs
 
     no_decay = ['bias', 'final_layer_norm.weight']
     params = list(model.named_parameters())
@@ -135,8 +144,10 @@ def train_model(args: Namespace):
     ]
 
     warmup_steps = int(args.warmup_proportion * num_train_optimization_steps)
-    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
-    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=num_train_optimization_steps)
+    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate,
+                      eps=args.adam_epsilon)
+    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps,
+                                     t_total=num_train_optimization_steps)
 
     # freeze model if necessary
     if args.freeze_model:
@@ -153,7 +164,8 @@ def train_model(args: Namespace):
         from apex import amp
     except ImportError:
         raise ImportError(
-            "Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
+            "Please install apex from https://www.github.com/nvidia/apex "
+            "to use fp16 training.")
     model, optimizer = amp.initialize(
         model, optimizer, opt_level=args.fp16_opt_level)
 
@@ -172,16 +184,19 @@ def train_model(args: Namespace):
     if args.data_tune:
         val_examples = processor.get_examples(args.data_tune, "tune")
         val_features = convert_examples_to_features(
-            val_examples, label_list, args.max_seq_length, model.encode_word, args.squeeze)
+            val_examples, label_list,
+            args.max_seq_length, model.encode_word,
+            args.squeeze)
         val_data = create_dataset(val_features)
 
     if args.data_test:
         eval_examples = processor.get_examples(args.data_test, "test")
         eval_features = convert_examples_to_features(
-            eval_examples, label_list, args.max_seq_length, model.encode_word, args.squeeze)
+            eval_examples, label_list, args.max_seq_length,
+            model.encode_word, args.squeeze)
         eval_data = create_dataset(eval_features)
 
-    for epoch_no in range(1, args.num_train_epochs+1):
+    for epoch_no in range(1, args.num_train_epochs + 1):
         epoch_stats = {"epoch": epoch_no}
         logger.info("Epoch %d" % epoch_no)
         tr_loss = 0
@@ -202,10 +217,12 @@ def train_model(args: Namespace):
             if args.fp16:
                 with amp.scale_loss(loss, optimizer) as scaled_loss:
                     scaled_loss.backward()
-                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
+                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
+                                               args.max_grad_norm)
             else:
                 loss.backward()
-                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
+                torch.nn.utils.clip_grad_norm_(model.parameters(),
+                                               args.max_grad_norm)
 
             tr_loss += loss.item()
             nb_tr_examples += input_ids.size(0)
@@ -238,7 +255,8 @@ def train_model(args: Namespace):
 
             if f1 > best_val_f1:
                 best_val_f1 = f1
-                logger.info("\nFound better f1=%.4f on validation set. Saving model\n" % f1)
+                logger.info("\nFound better f1=%.4f on validation set. "
+                            "Saving model\n" % f1)
                 logger.info("%s\n" % report)
                 model.save(args.output_dir)
             else:
@@ -256,7 +274,8 @@ def train_model(args: Namespace):
             logger.info("%s\n" % report)
 
         if args.epoch_save_model:
-            epoch_output_dir = os.path.join(args.output_dir, "e%03d" % epoch_no)
+            epoch_output_dir = os.path.join(args.output_dir,
+                                            "e%03d" % epoch_no)
             os.makedirs(epoch_output_dir)
             model.save(epoch_output_dir)
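The argparse flags of `train.py` mirror the attribute names used above, so a training run might look roughly like this (a sketch; the flag spelling is assumed from the `args.*` attributes, and paths, the pretrained-model prefix and hyperparameters are placeholders):

```
python3.8 train.py --data_train data/train.tsv --data_tune data/tune.tsv \
    --pretrained_path automodel:allegro/herbert-base-cased \
    --output_dir models/my_model --num_train_epochs 20 --squeeze
```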
diff --git a/trainer.py b/trainer.py
new file mode 100644
index 0000000000000000000000000000000000000000..12cbeb43b8b91be55f80b201629bd36f0766e528
--- /dev/null
+++ b/trainer.py
@@ -0,0 +1,365 @@
+"""Script to train new models compatible with the library."""
+import configparser
+import logging
+import os
+import random
+import sys
+import time
+from pathlib import Path
+
+import numpy as np
+import torch
+from pytorch_transformers import AdamW, WarmupLinearSchedule
+from torch.utils.data import DataLoader, RandomSampler
+from tqdm import tqdm
+
+from poldeepner2.utils.data_utils import NerProcessor
+from poldeepner2.utils.data_utils import create_dataset, \
+    convert_examples_to_features, save_params
+from poldeepner2.utils.train_utils import evaluate_model
+
+
+def main():
+
+    config_file = "config.cfg"
+    config = configparser.ConfigParser()
+    config.read(config_file)
+
+    # HYPERPARAMETERS
+    adam_epsilon = config.getfloat('train', 'adam_epsilon')
+    data_test = config['train']['data_test']
+    data_train = config['train']['data_train']
+    data_tune = config['train']['data_tune']
+    device = config['train']['device']
+    dropout = config.getfloat('train', 'dropout')
+    epoch_save_model = config.getboolean('train', 'epoch_save_model')
+    eval_batch_size = config.getint('train', 'eval_batch_size')
+    fp16 = config.getboolean('train', 'fp16')
+    fp16_opt_level = config['train']['fp16_opt_level']
+    freeze_model = config.getboolean('train', 'freeze_model')
+    gradient_accumulation_steps = \
+        config.getint('train', 'gradient_accumulation_steps')
+    hidden_size = config.getint('train', 'hidden_size')
+    learning_rate = 
config.getfloat('train', 'learning_rate') + max_grad_norm = config.getfloat('train', 'max_grad_norm') + max_seq_length = config.getint('train', 'max_seq_length') + num_train_epochs = config.getint('train', 'num_train_epochs') + output_dir = config['train']['output_dir'] + pretrained_path = config['train']['pretrained_path'] + seed = config.getint('train', 'seed') + squeeze = config.getboolean('train', 'squeeze') + train_batch_size = config.getint('train', 'train_batch_size') + training_mix = config.getboolean('train', 'training_mix') + use_transfer = 'transfer' in config['train'] and \ + config['train']['transfer'] != 'None' + if use_transfer: + transfer = config['train']['transfer'] + else: + transfer = None + warmup_proportion = config.getfloat('train', 'warmup_proportion') + weight_decay = config.getfloat('train', 'weight_decay') + + # if wandb: + # import wandb + # wandb.init(project=wandb, config=config) + + if os.path.exists(output_dir) and os.listdir(output_dir): + raise ValueError( + "Output directory (%s) already exists and is not empty." + % output_dir) + + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + logging.basicConfig( + format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', + datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO, + filename=Path(output_dir) / "log.txt") + logging.getLogger().addHandler(logging.StreamHandler(sys.stdout)) + logger = logging.getLogger(__name__) + for item in sorted(config.items()): + logger.info(item) + + if gradient_accumulation_steps < 1: + raise ValueError( + "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" + % gradient_accumulation_steps) + + train_batch_size = train_batch_size // gradient_accumulation_steps + + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + + # Determine set of labels + processor = NerProcessor() + datasets = [data_train] + if data_tune: + datasets.append(data_tune) + if data_test: + datasets.append(data_test) + label_list = \ + processor.get_labels(datasets, config.getint('data', + 'tag_column_index')) + logger.info(f"Labels: {label_list}") + num_labels = len(label_list) + 1 # add one for IGNORE label + logger.info(f"Number of labels: {num_labels}") + + # Load training data + logger.info("Loading training data...") + t0 = time.time() + train_examples = \ + processor.get_examples(data_train, + config.getint('data', + 'tag_column_index'), + "train") + logger.info(f"Training data was loaded in {time.time() - t0} second(s)") + + # preparing model configs + hidden_size = 1024 if 'large' in pretrained_path \ + else (768 if 'base' in pretrained_path else hidden_size) + device = device + + logger.info("Loading pretrained model...") + t0 = time.time() + if pretrained_path.startswith("hf:"): + from poldeepner2.model.hf_for_token_calssification import \ + HfModelForTokenClassification + pretrained_dir = pretrained_path.split(':')[1] + model = HfModelForTokenClassification( + pretrained_path=pretrained_dir, n_labels=num_labels, + hidden_size=hidden_size, dropout_p=dropout, + device=device) + elif pretrained_path.startswith("mt5:"): + from poldeepner2.model.mt5_for_token_calssification import \ + Mt5ModelForTokenClassification + variant = pretrained_path.split(':')[1] + model = Mt5ModelForTokenClassification( + variant=variant, n_labels=num_labels, hidden_size=hidden_size, + dropout_p=dropout, device=device) + else: + from poldeepner2.model.xlmr_for_token_classification \ + import XLMRForTokenClassification + pretrained_dir = pretrained_path + if ":" in pretrained_dir: + 
pretrained_dir = pretrained_dir.split(':')[1] + if not os.path.exists(pretrained_dir): + raise ValueError( + "RoBERTa language model not found on path '%s'" + % pretrained_dir) + model = XLMRForTokenClassification( + pretrained_path=pretrained_dir, n_labels=num_labels, + hidden_size=hidden_size, dropout_p=dropout, + device=device) + logger.info(f"Pretrained model was loaded in {time.time() - t0} second(s)") + + if use_transfer: + if device == "cpu": + state_dict = torch.load( + open(os.path.join(transfer, 'model.pt'), 'rb'), + map_location='cpu') + else: + state_dict = torch.load( + open(os.path.join(transfer, 'model.pt'), 'rb')) + model.load_state_dict(state_dict) + + model.to(device) + # if wandb: + # wandb.watch(model) + + train_features = convert_examples_to_features( + train_examples, label_list, max_seq_length, model.encode_word, + squeeze) + + if training_mix: + train_features.extend(convert_examples_to_features( + train_examples, label_list, max_seq_length, model.encode_word, + not squeeze)) + + num_train_optimization_steps = int( + len(train_features) / train_batch_size / gradient_accumulation_steps) \ + * num_train_epochs + + no_decay = ['bias', 'final_layer_norm.weight'] + + params = list(model.named_parameters()) + + optimizer_grouped_parameters = [ + {'params': [p for n, p in params if not any( + nd in n for nd in no_decay)], 'weight_decay': weight_decay}, + {'params': [p for n, p in params if any( + nd in n for nd in no_decay)], 'weight_decay': 0.0} + ] + + warmup_steps = int(warmup_proportion * num_train_optimization_steps) + optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, + eps=adam_epsilon) + scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, + t_total=num_train_optimization_steps) + + # freeze model if necessary + if freeze_model: + logger.info("Freezing XLM-R model...") + for n, p in model.named_parameters(): + if 'xlmr' in n and p.requires_grad: + logging.info("Parameter %s - freezed" % n) + p.requires_grad = False + else: + logging.info("Parameter %s - unchanged" % n) + + if fp16: + try: + from apex import amp + except ImportError: + raise ImportError( + "Please install apex from https://www.github.com/nvidia/apex " + "to use fp16 training.") + model, optimizer = amp.initialize( + model, optimizer, opt_level=fp16_opt_level) + + # Train the model + logger.info("***** Running training *****") + logger.info(" Num examples = %d", len(train_examples)) + logger.info(" Batch size = %d", train_batch_size) + logger.info(" Num steps = %d", num_train_optimization_steps) + + train_data = create_dataset(train_features) + + train_sampler = RandomSampler(train_data) + + train_dataloader = DataLoader(train_data, sampler=train_sampler, + batch_size=train_batch_size) + + # getting validation samples + best_val_f1 = 0.0 + if data_tune: + val_examples = \ + processor.get_examples(data_tune, + config.getint('data', 'tag_column_index'), + "tune") + val_features = convert_examples_to_features( + val_examples, label_list, max_seq_length, model.encode_word, + squeeze) + val_data = create_dataset(val_features) + + if data_test: + eval_examples = \ + processor.get_examples(data_test, + config.getint('data', 'tag_column_index'), + "test") + eval_features = convert_examples_to_features( + eval_examples, label_list, max_seq_length, model.encode_word, + squeeze) + eval_data = create_dataset(eval_features) + + for epoch_no in range(1, num_train_epochs + 1): + epoch_stats = {"epoch": epoch_no} + logger.info("Epoch %d" % epoch_no) + tr_loss = 0 + nb_tr_examples, 
nb_tr_steps = 0, 0 + + model.train() + steps = len(train_dataloader) + + time_start = time.time() + # ToDo: add parameter for this feature + # for g in optimizer.param_groups: + # g['lr'] = learning_rate - (learning_rate/100 * epoch_no) + # epoch_stats['lr'] = learning_rate - (learning_rate/100 * epoch_no) + + for step, batch in tqdm(enumerate(train_dataloader), total=steps): + batch = tuple(t.to(device) for t in batch) + input_ids, label_ids, l_mask, valid_ids, = batch + loss = model(input_ids, label_ids, l_mask, valid_ids) + if gradient_accumulation_steps > 1: + loss = loss / gradient_accumulation_steps + + if fp16: + with amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() + torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), + max_grad_norm) + else: + loss.backward() + torch.nn.utils.clip_grad_norm_(model.parameters(), + max_grad_norm) + + tr_loss += loss.item() + nb_tr_examples += input_ids.size(0) + nb_tr_steps += 1 + + epoch_stats["loss"] = loss + epoch_stats["learning_rate"] = scheduler.get_last_lr()[0] + + if (step + 1) % gradient_accumulation_steps == 0: + optimizer.step() + scheduler.step() + model.zero_grad() + + epoch_stats["step"] = step + # if wandb: + # wandb.log(epoch_stats) + + # if wandb: + # epoch_stats["epoch_training_time"] = time.time() - time_start + + if data_tune: + logger.info("\nTesting on validation set...") + time_start = time.time() + f1, report = evaluate_model(model, val_data, label_list, + eval_batch_size, device) + time_end = time.time() + epoch_stats["validation_F1"] = f1 + epoch_stats["epoch_validation_time"] = time_end - time_start + + if f1 > best_val_f1: + best_val_f1 = f1 + logger.info( + "\nFound better f1=%.4f on validation set. Saving model\n" + % f1) + logger.info("%s\n" % report) + torch.save(model.state_dict(), + open(os.path.join(output_dir, 'model.pt'), + 'wb')) + save_params(output_dir, dropout, num_labels, + label_list) + + if data_test: + logger.info("\nTesting on test set...") + time_start = time.time() + print(f'len label_list: {len(label_list)}') + print(f'label_list: {label_list}') + + f1_score, report = evaluate_model(model, eval_data, label_list, + eval_batch_size, device) + time_end = time.time() + epoch_stats["test_F1"] = f1_score + epoch_stats["epoch_testing_time"] = time_end - time_start + logger.info("%s\n" % report) + + if epoch_save_model: + epoch_output_dir = os.path.join(output_dir, "e%03d" % epoch_no) + os.makedirs(epoch_output_dir) + torch.save(model.state_dict(), + open(os.path.join(epoch_output_dir, 'model.pt'), 'wb')) + save_params(epoch_output_dir, dropout, num_labels, label_list) + + # if wandb: + # wandb.log(epoch_stats) + + model.to(device) + + if data_test: + eval_data = create_dataset(eval_features) + f1_score, report = evaluate_model(model, eval_data, label_list, + eval_batch_size, device) + logger.info("\n%s", report) + output_eval_file = os.path.join(output_dir, "test_results.txt") + with open(output_eval_file, "w") as writer: + logger.info("***** Writing results to file *****") + writer.write(report) + logger.info("Done.") + + +if __name__ == "__main__": + main()
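A closing usage note on the new trainer: unlike `train.py`, it takes no command-line arguments and reads `config.cfg` from the current working directory (the template lives in `scripts/`), so a run looks roughly like this (a sketch; the empty config fields must be filled in first):

```
cp scripts/config.cfg .
# edit config.cfg: set data_train, output_dir, pretrained_path,
# tag_column_index and the remaining empty values
python3.8 trainer.py
```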