diff --git a/CHANGELOG.md b/CHANGELOG.md
index ee18cacd9ebeabc93b4b0eca46bc576d0cf2bd7d..2769c196a0fd289eb2a5cb8385e3c022c6cef493 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,15 @@
 # PolDeepNer2 Changelog
 
+## 0.7.2-wip
+### Added
+- `conlleval` metrics for model evaluation.
+### Fixed
+- Evaluation: the model no longer re-enters training mode during inference.
+### Changed
+- Code cleanup (removed placeholder docstrings).
+
+
 ## 0.7.1
 ### Added
 - `context-right` sequence generator.
diff --git a/augment_dataset.py b/augment_dataset.py
index 53e06f0a8c9c3d9303b380cebb7225936f88ec5e..e0079f4cb3c27706d58b1febc8a9c471e8b1f3de 100644
--- a/augment_dataset.py
+++ b/augment_dataset.py
@@ -1,5 +1,3 @@
-"""A message of shame -- documentation must be completed."""
-
 from __future__ import absolute_import, division, print_function
 
 import argparse
@@ -11,26 +9,12 @@ from poldeepner2.utils.data_utils import read_tsv
 
 
 def write_sentence(fout: str, tokens: List[str], labels: List[str]):
-    """A message of shame -- documentation must be completed.
-
-    Args:
-        fout: str
-        tokens: List[str]
-        labels: List[str]
-
-    """
     for token, label in zip(tokens, labels):
         fout.write("%s\t%s\n" % (token, label))
     fout.write("\n")
 
 
 def main(args):
-    """A message of shame -- documentation must be completed.
-
-    Args:
-        args:A message of shame -- documentation must be completed.
-
-    """
     sentences_labels = read_tsv(args.input, True)
     with codecs.open(args.output, "w", "utf8") as fout:
         for sentence, labels in sentences_labels:
@@ -50,11 +34,6 @@ def main(args):
 
 
 def parse_args():
-    """A message of shame -- documentation must be completed.
-
-    Returns: parser.parse_args()
-
-    """
     parser = argparse.ArgumentParser(
         description='Process a single TSV with a NER model')
     parser.add_argument('--input', required=True, metavar='PATH',
diff --git a/configs/train-conll03-context.yaml b/configs/train-conll03-context.yaml
index f5bd7b4ad089ca297bf6deff661f6f33aa700293..b0753100faa033681e348de4f652e8e73658648b 100644
--- a/configs/train-conll03-context.yaml
+++ b/configs/train-conll03-context.yaml
@@ -12,11 +12,6 @@ eval_batch_size: 16
 train_batch_size: 16
 dropout: 0.2
 wandb: conll03_en_context
-#freeze_model:
-#sequence_generator: context-right
-#sequence_generator: context-right-mix
-#sequence_generator: single
-#sequence_generator: merged
 sequence_generator: union
 sequence_generator_for_eval: context-window
 output_dir: ../poldeepner2_models/dev/squeeze_research/conll_context/model_{sequence_generator}_{max_seq_length}_{seed}_v
diff --git a/conlleval.py b/conlleval.py
new file mode 100644
index 0000000000000000000000000000000000000000..eccd021f74f716319a01205b48e5bfd3602a1b17
--- /dev/null
+++ b/conlleval.py
@@ -0,0 +1,237 @@
+"""
+Source: https://github.com/sighsmile/conlleval
+
+This script applies to IOB2 or IOBES tagging scheme.
+If you are using a different scheme, please convert to IOB2 or IOBES.
+
+IOB2:
+- B = begin,
+- I = inside but not the first,
+- O = outside
+
+e.g.
+John lives in New York City .
+B-PER O O B-LOC I-LOC I-LOC O
+
+IOBES:
+- B = begin,
+- E = end,
+- S = singleton,
+- I = inside but not the first or the last,
+- O = outside
+
+e.g.
+John lives in New York City .
+S-PER O O B-LOC I-LOC E-LOC O
+
+prefix: IOBES
+chunk_type: PER, LOC, etc.
+"""
+from __future__ import division, print_function, unicode_literals
+
+import sys
+from collections import defaultdict
+
+def split_tag(chunk_tag):
+    """
+    split chunk tag into IOBES prefix and chunk_type
+    e.g.
+    B-PER -> (B, PER)
+    O -> (O, None)
+    """
+    if chunk_tag == 'O':
+        return ('O', None)
+    return chunk_tag.split('-', maxsplit=1)
+
+def is_chunk_end(prev_tag, tag):
+    """
+    check if the previous chunk ended between the previous and current word
+    e.g.
+    (B-PER, I-PER) -> False
+    (B-LOC, O) -> True
+
+    Note: in case of contradicting tags, e.g. (B-PER, I-LOC)
+    this is considered as (B-PER, B-LOC)
+    """
+    prefix1, chunk_type1 = split_tag(prev_tag)
+    prefix2, chunk_type2 = split_tag(tag)
+
+    if prefix1 == 'O':
+        return False
+    if prefix2 == 'O':
+        return prefix1 != 'O'
+
+    if chunk_type1 != chunk_type2:
+        return True
+
+    return prefix2 in ['B', 'S'] or prefix1 in ['E', 'S']
+
+def is_chunk_start(prev_tag, tag):
+    """
+    check if a new chunk started between the previous and current word
+    """
+    prefix1, chunk_type1 = split_tag(prev_tag)
+    prefix2, chunk_type2 = split_tag(tag)
+
+    if prefix2 == 'O':
+        return False
+    if prefix1 == 'O':
+        return prefix2 != 'O'
+
+    if chunk_type1 != chunk_type2:
+        return True
+
+    return prefix2 in ['B', 'S'] or prefix1 in ['E', 'S']
+
+
+def calc_metrics(tp, p, t, percent=True):
+    """
+    compute overall precision, recall and FB1 (default values are 0.0)
+    if percent is True, return 100 * original decimal value
+    """
+    precision = tp / p if p else 0
+    recall = tp / t if t else 0
+    fb1 = 2 * precision * recall / (precision + recall) if precision + recall else 0
+    if percent:
+        return 100 * precision, 100 * recall, 100 * fb1
+    else:
+        return precision, recall, fb1
+
+
+def count_chunks(true_seqs, pred_seqs):
+    """
+    true_seqs: a list of true tags
+    pred_seqs: a list of predicted tags
+
+    return:
+    correct_chunks: a dict (counter),
+                    key = chunk types,
+                    value = number of correctly identified chunks per type
+    true_chunks:    a dict, number of true chunks per type
+    pred_chunks:    a dict, number of identified chunks per type
+
+    correct_counts, true_counts, pred_counts: similar to above, but for tags
+    """
+    correct_chunks = defaultdict(int)
+    true_chunks = defaultdict(int)
+    pred_chunks = defaultdict(int)
+
+    correct_counts = defaultdict(int)
+    true_counts = defaultdict(int)
+    pred_counts = defaultdict(int)
+
+    prev_true_tag, prev_pred_tag = 'O', 'O'
+    correct_chunk = None
+
+    for true_tag, pred_tag in zip(true_seqs, pred_seqs):
+        if true_tag == pred_tag:
+            correct_counts[true_tag] += 1
+        true_counts[true_tag] += 1
+        pred_counts[pred_tag] += 1
+
+        _, true_type = split_tag(true_tag)
+        _, pred_type = split_tag(pred_tag)
+
+        if correct_chunk is not None:
+            true_end = is_chunk_end(prev_true_tag, true_tag)
+            pred_end = is_chunk_end(prev_pred_tag, pred_tag)
+
+            if pred_end and true_end:
+                correct_chunks[correct_chunk] += 1
+                correct_chunk = None
+            elif pred_end != true_end or true_type != pred_type:
+                correct_chunk = None
+
+        true_start = is_chunk_start(prev_true_tag, true_tag)
+        pred_start = is_chunk_start(prev_pred_tag, pred_tag)
+
+        if true_start and pred_start and true_type == pred_type:
+            correct_chunk = true_type
+        if true_start:
+            true_chunks[true_type] += 1
+        if pred_start:
+            pred_chunks[pred_type] += 1
+
+        prev_true_tag, prev_pred_tag = true_tag, pred_tag
+    if correct_chunk is not None:
+        correct_chunks[correct_chunk] += 1
+
+    return (correct_chunks, true_chunks, pred_chunks,
+            correct_counts, true_counts, pred_counts)
+
+def get_result(correct_chunks, true_chunks, pred_chunks,
+               correct_counts, true_counts, pred_counts, verbose=True):
+    """
+    if verbose, print overall performance, as well as performance per chunk type;
+    otherwise, simply return overall prec, rec, f1 scores
+    """
+    # sum counts
+    sum_correct_chunks = sum(correct_chunks.values())
+    sum_true_chunks = sum(true_chunks.values())
+    sum_pred_chunks = sum(pred_chunks.values())
+
+    sum_correct_counts = sum(correct_counts.values())
+    sum_true_counts = sum(true_counts.values())
+
+    nonO_correct_counts = sum(v for k, v in correct_counts.items() if k != 'O')
+    nonO_true_counts = sum(v for k, v in true_counts.items() if k != 'O')
+
+    chunk_types = sorted(list(set(list(true_chunks) + list(pred_chunks))))
+
+    # compute overall precision, recall and FB1 (default values are 0.0)
+    prec, rec, f1 = calc_metrics(sum_correct_chunks, sum_pred_chunks, sum_true_chunks)
+    res = (prec, rec, f1)
+    if not verbose:
+        return res
+
+    # print overall performance, and performance per chunk type
+
+    print("processed %i tokens with %i phrases; " % (sum_true_counts, sum_true_chunks), end='')
+    print("found: %i phrases; correct: %i.\n" % (sum_pred_chunks, sum_correct_chunks), end='')
+
+    print("accuracy: %6.2f%%; (non-O)" % (100*nonO_correct_counts/nonO_true_counts))
+    print("accuracy: %6.2f%%; " % (100*sum_correct_counts/sum_true_counts), end='')
+    print("precision: %6.2f%%; recall: %6.2f%%; FB1: %6.2f" % (prec, rec, f1))
+
+    # for each chunk type, compute precision, recall and FB1 (default values are 0.0)
+    for t in chunk_types:
+        prec, rec, f1 = calc_metrics(correct_chunks[t], pred_chunks[t], true_chunks[t])
+        print("%17s: " % t, end='')
+        print("precision: %6.2f%%; recall: %6.2f%%; FB1: %6.2f" %
+              (prec, rec, f1), end='')
+        print("  %d" % pred_chunks[t])
+
+    return res
+    # you can generate LaTeX output for tables like in
+    # http://cnts.uia.ac.be/conll2003/ner/example.tex
+    # but I'm not implementing this
+
+def evaluate(true_seqs, pred_seqs, verbose=True):
+    (correct_chunks, true_chunks, pred_chunks,
+     correct_counts, true_counts, pred_counts) = count_chunks(true_seqs, pred_seqs)
+    result = get_result(correct_chunks, true_chunks, pred_chunks,
+                        correct_counts, true_counts, pred_counts, verbose=verbose)
+    return result
+
+def evaluate_conll_file(fileIterator):
+    true_seqs, pred_seqs = [], []
+
+    for line in fileIterator:
+        cols = line.strip().split()
+        # each non-empty line must contain >= 3 columns
+        if not cols:
+            true_seqs.append('O')
+            pred_seqs.append('O')
+        elif len(cols) < 3:
+            raise IOError("conlleval: too few columns in line %s\n" % line)
+        else:
+            # extract tags from last 2 columns
+            true_seqs.append(cols[-2])
+            pred_seqs.append(cols[-1])
+    return evaluate(true_seqs, pred_seqs)
+
+if __name__ == '__main__':
+    """
+    usage: conlleval < file
+    """
+    evaluate_conll_file(sys.stdin)
diff --git a/core/poldeepner.py b/core/poldeepner.py
index d2bd150bfafa291b8399010da1f6540ba1c8d1d2..95ce87e9ad31fb07171597c91171d066759c5e80 100644
--- a/core/poldeepner.py
+++ b/core/poldeepner.py
@@ -1,5 +1,3 @@
-"""A message of shame -- documentation must be completed."""
-
 import codecs
 import os
 import torch
@@ -13,24 +11,9 @@ from core.utils.tokenization import TokenizerSpaces
 
 
 class PolDeepNer2:
-    """A message of shame -- documentation must be completed."""
-
     def __init__(self, model_path, pretrained_path, device="cpu",
                  squeeze=False, max_seq_length=256,
                  tokenizer=TokenizerSpaces()):
-        """A message of shame -- documentation must be completed.
-
-        Args:
-            model_path:A message of shame -- documentation must be completed.
-            pretrained_path:A message of shame -- documentation must be
-            completed.
-            device:A message of shame -- documentation must be completed.
-            squeeze:A message of shame -- documentation must be completed.
-            max_seq_length:A message of shame -- documentation must be
-            completed.
-            tokenizer:A message of shame -- documentation must be completed.
-
-        """
         if not os.path.exists(model_path):
             raise ValueError("Model not found on path '%s'" % model_path)
 
@@ -61,21 +44,11 @@ class PolDeepNer2:
 
     @staticmethod
     def load_labels(path):
-        """A message of shame -- documentation must be completed.
-
-        Args:
-            path:A message of shame -- documentation must be completed.
-
-        Returns:A message of shame -- documentation must be completed.
-
-        """
         return [line.strip() for line in codecs.open(
             path, "r", "utf8").readlines() if len(line.strip()) > 0]
 
     def process(self, sentences):
-        """A message of shame -- documentation must be completed.
-
-        @param sentences -- array of array of words,
+        """ @param sentences -- array of array of words,
            [['Jan', 'z', 'Warszawy'], ['IBM', 'i', 'Apple']]
         @param max_seq_length -- the maximum total input sequence length
             after WordPiece tokenization
@@ -132,8 +105,7 @@ class PolDeepNer2:
         return y_pred
 
     def process_text(self, text: str):
-        """A message of shame -- documentation must be completed.
-
+        """
         @texts: Array of sentences. Each sentence is a string.
                "John lives in New York. Mary lives in Chicago"
 
@@ -146,8 +118,7 @@ class PolDeepNer2:
         return align_tokens_with_text(text, sentences, annotations)
 
     def process_tokenized(self, tokens: [[str]], text: str):
-        """A message of shame -- documentation must be completed.
-
+        """
         @tokens: Array of sentences. Each sentence is an array of words.
             [["John", "lives", "in", "New", "York"],
             ["Mary", "lives", "in", "Chicago"]]
diff --git a/evaluate_tsv.py b/evaluate_tsv.py
index 7b7544ef4d77183dee2e5ca4971fea8d0b3e44a7..48ba7e6f5b4d88cbb0bdb01970ddf71bebd0cc1e 100644
--- a/evaluate_tsv.py
+++ b/evaluate_tsv.py
@@ -8,6 +8,7 @@ import time
 from typing import List, Tuple
 
 import poldeepner2
+from conlleval import evaluate
 from poldeepner2.utils.data_utils import read_tsv
 from poldeepner2.utils.seed import setup_seed
 from poldeepner2.utils.sequence_labeling import classification_report
@@ -31,6 +32,7 @@ def evaluate_model(args) -> List[Tuple[str, str]]:
 
     logging.info("Processing ...")
     sentences_labels = read_tsv(os.path.join(args.input), True)
+
     sentences = [sentence[0] for sentence in sentences_labels]
     labels = [sentence[1] for sentence in sentences_labels]
 
@@ -43,6 +45,9 @@ def evaluate_model(args) -> List[Tuple[str, str]]:
     time_processing = time.clock() - t0
 
     report = classification_report(labels, predictions, digits=4)
+
+    evaluate([p for sub in labels for p in sub], [p for sub in predictions for p in sub], verbose=True)
+
     # print(report)
     #
     # print(f"Total time: : {time_processing:>8.4} second(s)")
diff --git a/poldeepner2/data/document.py b/poldeepner2/data/document.py
index 2b0400df34ea086ef98cb6f9de2592389afef2de..a7db91ebf16f5c328a3e7520c243e8234b1774c1 100644
--- a/poldeepner2/data/document.py
+++ b/poldeepner2/data/document.py
@@ -1,25 +1,12 @@
-"""A message of shame -- documentation must be completed."""
-
 from poldeepner2.data.span import Span
 from poldeepner2.data.token import Token
 from poldeepner2.utils.annotation import Annotation
 
 
 class Document:
-    """A message of shame -- documentation must be completed."""
-
     def __init__(self, content: str, tokens: [Token] = [],
                  sentences: [Span] = [], annotations: [Annotation] = []):
-        """A message of shame -- documentation must be completed.
-
-        Args:
-            content:A message of shame -- documentation must be completed.
-            tokens:A message of shame -- documentation must be completed.
-            sentences:A message of shame -- documentation must be completed.
-            annotations:A message of shame -- documentation must be completed.
-
-        """
         self.content = content
         self.tokens = tokens
         self.annotations = annotations
diff --git a/poldeepner2/data/span.py b/poldeepner2/data/span.py
index d13e043d90705db299fd21a1e8e931d77f88a581..e9ce4b6941fbdc2ceb69b526888415259214c11b 100644
--- a/poldeepner2/data/span.py
+++ b/poldeepner2/data/span.py
@@ -1,26 +1,10 @@
-"""A message of shame -- documentation must be completed."""
-
 from dataclasses import dataclass
 
 
 @dataclass
 class Span:
-    """A message of shame -- documentation must be completed.
-
-    Args:
-        orth (str):A message of shame -- documentation must be completed.
-        start (int): Index of the first token.
-        end (int): Index of the last token +1.
-
-    """
-
     start: int
     end: int
 
     def __str__(self):
-        """A message of shame -- documentation must be completed.
-
-        Returns:A message of shame -- documentation must be completed.
-
-        """
         return f"Span(begin={self.begin},end={self.end})"
diff --git a/poldeepner2/data/token.py b/poldeepner2/data/token.py
index c8120a6aea0dd119c54c4b6985b4718cb96e2219..2700218e3556a086dbf9b7204e414187d0b931de 100644
--- a/poldeepner2/data/token.py
+++ b/poldeepner2/data/token.py
@@ -1,12 +1,9 @@
-"""A message of shame -- documentation must be completed."""
-
 from dataclasses import dataclass
 
 
 @dataclass
 class Token:
-    """A message of shame -- documentation must be completed.
-
+    """
     Args:
         orth (str):
         start (int): Index of the first orth character in the original text.
@@ -27,9 +24,4 @@ class Token:
     eos: bool = False
 
     def __str__(self):
-        """A message of shame -- documentation must be completed.
-
-        Returns:A message of shame -- documentation must be completed.
-
-        """
         return f"Token(orth={self.orth},lemma={self.lemma},morph={self.morph})"
diff --git a/poldeepner2/io/debug.py b/poldeepner2/io/debug.py
index 269c9ff7327e167fccd9d711cf141d99e798ca85..3b4c9aa7c6c7fe86aeeb81f03dd34906f664843e 100644
--- a/poldeepner2/io/debug.py
+++ b/poldeepner2/io/debug.py
@@ -1,17 +1,7 @@
-"""A message of shame -- documentation must be completed."""
-
 import logging
 
 
 def debug_tokens_and_labels(tokenized_sentences, predictions):
-    """A message of shame -- documentation must be completed.
-
-    Args:
-        tokenized_sentences:A message of shame -- documentation must be
-        completed.
-        predictions:A message of shame -- documentation must be completed.
-
-    """
     for tokens, labels in zip(tokenized_sentences, predictions):
         for token, label in zip(tokens, labels):
             logging.debug(f"TOKENIZATION: {token}\t{label}")
diff --git a/poldeepner2/model/hf_for_token_calssification.py b/poldeepner2/model/hf_for_token_calssification.py
index 92a428f5d1e8f18a6006756b42fba2ca848da4b7..9cb065ffc4ade9a42d27a335762fd39a77d02e07 100644
--- a/poldeepner2/model/hf_for_token_calssification.py
+++ b/poldeepner2/model/hf_for_token_calssification.py
@@ -61,6 +61,7 @@ class Pdn2TokenClassification(nn.Module):
             self.to(self.config.device)
         else:
             self.model = AutoModel.from_pretrained(path)
+            self.model.train()
 
     def forward(self, inputs_ids, labels, valid_mask):
         """Computes a forward pass through the sequence tagging model.
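# --- Reviewer note (not part of the patch) ---------------------------------
# The hunks around this point move self.model.train() out of forward().
# Calling train() inside forward() silently re-enables dropout on every pass,
# even after a caller has switched the module to evaluation mode. A minimal,
# self-contained illustration with a generic torch module (not the project's
# model):
#
#     import torch
#     import torch.nn as nn
#
#     class TrainInForward(nn.Module):
#         """Toy module reproducing the pattern this diff removes."""
#
#         def __init__(self):
#             super().__init__()
#             self.drop = nn.Dropout(p=0.5)
#
#         def forward(self, x):
#             self.train()  # the call that used to live inside forward()
#             return self.drop(x)
#
#     m = TrainInForward()
#     m.eval()                 # caller requests deterministic inference ...
#     print(m(torch.ones(8)))  # ... yet dropout still zeroes ~half the inputs
# ---------------------------------------------------------------------------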
@@ -78,8 +79,6 @@
 
             loss: Cross Entropy loss between labels and logits
         """
-        self.model.train()
-
         transformer_out = self.model(inputs_ids, return_dict=True)[0]
         out_1 = F.relu(self.linear_1(transformer_out))
         out_1 = self.dropout(out_1)
diff --git a/poldeepner2/models.py b/poldeepner2/models.py
index 832331a36bbded7669a186ad582e07fd45ae5dc3..6788182f01e091521ad349d3cbedf963879669ea 100644
--- a/poldeepner2/models.py
+++ b/poldeepner2/models.py
@@ -1,3 +1,4 @@
+import logging
 import os
 from typing import List
 
@@ -36,7 +37,10 @@ class PolDeepNer2:
         examples = [InputExample(guid=str(idx), tokens=tokens,
                                  labels=["O"] * len(tokens))
                     for idx, tokens in enumerate(sentences)]
-        assert self.model.config.sequence_generator != "union", "In the inference mode the sequence_generator cannot be union"
+        if self.model.config.sequence_generator == "union":
+            logging.warning("The 'union' sequence generator is not supported in inference mode. Falling back to 'single'.")
+            self.model.config.sequence_generator = "single"
+
         gen = FeatureGeneratorFactory.create(self.model.config.sequence_generator,
                                              label_list=self.model.config.labels,
                                              max_seq_length=self.model.config.max_seq_length,
diff --git a/poldeepner2/pipeline/lemmatization.py b/poldeepner2/pipeline/lemmatization.py
index af27e7c08276731255388d95210c471d8fd3b2ad..fd4d97537854b25d6c87351e7272ba315ae5a805 100644
--- a/poldeepner2/pipeline/lemmatization.py
+++ b/poldeepner2/pipeline/lemmatization.py
@@ -1,5 +1,3 @@
-"""A message of shame -- documentation must be completed."""
-
 import logging
 
 import requests
@@ -8,45 +6,22 @@ from poldeepner2.utils.annotation import Annotation
 
 
 class ProcessorAnnotations:
-    """A message of shame -- documentation must be completed."""
 
     def process(self, annotations: [Annotation]):
-        """A message of shame -- documentation must be completed.
-
-        Args:
-            annotations:A message of shame -- documentation must be completed.
-
-        """
         pass
 
 
 class AnnotationLemmatizerPolem (ProcessorAnnotations):
-    """A message of shame -- documentation must be completed."""
 
     def __init__(self):
-        """A message of shame -- documentation must be completed."""
         self.url = 'http://localhost:8000'
         pass
 
     def process(self, annotations: [Annotation]):
-        """A message of shame -- documentation must be completed.
-
-        Args:
-            annotations:A message of shame -- documentation must be completed.
-
-        """
         for an in annotations:
             an.lemma = self.lemmatize(an)
 
     def lemmatize(self, annotation: Annotation):
-        """A message of shame -- documentation must be completed.
-
-        Args:
-            annotation:A message of shame -- documentation must be completed.
-
-        Returns:A message of shame -- documentation must be completed.
-
-        """
         orths = [an.orth for an in annotation.tokens]
         lemmas = [an.lemma for an in annotation.tokens]
         spaces = [str(len(an.ws) > 0) for an in annotation.tokens]
diff --git a/poldeepner2/pipeline/tokenization.py b/poldeepner2/pipeline/tokenization.py
index 3cc7bce7fcd0fa186d67f925deb967d31b4531ee..a14e3e322d08189c637528d84c77909a5905b02c 100644
--- a/poldeepner2/pipeline/tokenization.py
+++ b/poldeepner2/pipeline/tokenization.py
@@ -1,5 +1,3 @@
-"""A message of shame -- documentation must be completed."""
-
 import re
 
 import requests
@@ -10,28 +8,12 @@ from poldeepner2.utils.preprocess import split_hashtags, split_leading_name, \
 
 
 class Tokenizer:
-    """A message of shame -- documentation must be completed."""
 
     def tokenize(self, texts: [str]) -> [[Token]]:
-        """A message of shame -- documentation must be completed.
-
-        Args:
-            texts:A message of shame -- documentation must be completed.
-
-        Returns:A message of shame -- documentation must be completed.
-
-        """
         return []
 
     @staticmethod
     def align_tokens_with_text(text: str, sentences: [[Token]]):
-        """A message of shame -- documentation must be completed.
-
-        Args:
-            text: A message of shame -- documentation must be completed.
-            sentences: A message of shame -- documentation must be completed.
-
-        """
         idx = 0
         for sentence in sentences:
             for token in sentence:
@@ -42,22 +24,12 @@ class Tokenizer:
 
 
 class TokenizerFast(Tokenizer):
-    """A message of shame -- documentation must be completed."""
 
     def __init__(self):
-        """A message of shame -- documentation must be completed."""
        self.pattern_tokens = re.compile(r"(\W)")
        self.abbrev_no_eos = set(["tzw", "np", "m.in", "tj"])
 
     def tokenize(self, texts: [str]) -> [[str]]:
-        """A message of shame -- documentation must be completed.
-
-        Args:
-            texts:A message of shame -- documentation must be completed.
-
-        Returns:A message of shame -- documentation must be completed.
-
-        """
         out = []
         for text in texts:
             text_out = []
@@ -88,58 +60,24 @@ class TokenizerFast(Tokenizer):
         return out
 
     def is_ended_with_abbrev(self, sequence: [str]) -> bool:
-        """A message of shame -- documentation must be completed.
-
-        Args:
-            sequence:A message of shame -- documentation must be completed.
-
-        Returns:A message of shame -- documentation must be completed.
-
-        """
         return len(sequence) > 1 and sequence[-1] == "." \
             and sequence[-2] in self.abbrev_no_eos
 
     def is_ended_with_name_initial(self, sequence: [str]) -> bool:
-        """A message of shame -- documentation must be completed.
-
-        Args:
-            sequence: A message of shame -- documentation must be completed.
-
-        Returns:A message of shame -- documentation must be completed.
-
-        """
         return len(sequence) > 1 and sequence[-1] == "." \
             and len(sequence[-2]) == 1 \
             and sequence[-2].isupper() and sequence[-2].isalpha()
 
 
 class TokenizerSpaces(Tokenizer):
-    """A message of shame -- documentation must be completed."""
 
     def tokenize(self, texts: [str]) -> [[str]]:
-        """A message of shame -- documentation must be completed.
-
-        Args:
-            texts:A message of shame -- documentation must be completed.
-
-        Returns:A message of shame -- documentation must be completed.
-
-        """
         return [re.sub(r"\s+", " ", text.strip()).split(" ")
                 for text in texts]
 
 
 class TokenizerKrnnt(Tokenizer):
-    """A message of shame -- documentation must be completed."""
 
     def tokenize(self, texts: [str]) -> [[str]]:
-        """A message of shame -- documentation must be completed.
-
-        Args:
-            texts:A message of shame -- documentation must be completed.
-
-        Returns:A message of shame -- documentation must be completed.
-
-        """
         out = []
         for text in texts:
             sentences = TokenizerKrnnt.request(text)
@@ -151,14 +89,6 @@ class TokenizerKrnnt(Tokenizer):
         return out
 
     def tokenize_tokens(self, texts: [str]) -> [[Token]]:
-        """A message of shame -- documentation must be completed.
-
-        Args:
-            texts:A message of shame -- documentation must be completed.
-
-        Returns:A message of shame -- documentation must be completed.
-
-        """
         out = []
         for text in texts:
             sentences = TokenizerKrnnt.request(text)
@@ -167,14 +97,6 @@ class TokenizerKrnnt(Tokenizer):
 
     @staticmethod
     def request(text: str):
-        """A message of shame -- documentation must be completed.
-
-        Args:
-            text:A message of shame -- documentation must be completed.
-
-        Returns:A message of shame -- documentation must be completed.
-
-        """
         url = 'http://localhost:9003'
         x = requests.post(url, data=text.encode('utf-8'))
         tokens = TokenizerKrnnt.parse_krnnt_output(x.text)
@@ -183,14 +105,6 @@ class TokenizerKrnnt(Tokenizer):
 
     @staticmethod
     def parse_krnnt_output(output):
-        """A message of shame -- documentation must be completed.
-
-        Args:
-            output:A message of shame -- documentation must be completed.
-
-        Returns:A message of shame -- documentation must be completed.
-
-        """
         sentences = []
         tokens = []
         orth = None
@@ -218,13 +132,6 @@ class TokenizerKrnnt(Tokenizer):
 
 
 def load(tokenizer_type: str) -> Tokenizer:
-    """A message of shame -- documentation must be completed.
-
-    Args:
-        tokenizer_type: strA message of shame -- documentation must be
-        completed.
-
-    """
     if tokenizer_type == "space":
         return TokenizerSpaces()
     elif tokenizer_type == "krnnt":
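# --- Reviewer note (not part of the patch) ---------------------------------
# Quick orientation for the tokenizers touched above; a minimal sketch of the
# whitespace tokenizer's contract (TokenizerKrnnt requires a running KRNNT
# service on localhost:9003, so it is not exercised here):
#
#     from poldeepner2.pipeline.tokenization import TokenizerSpaces
#
#     tokenizer = TokenizerSpaces()
#     print(tokenizer.tokenize(["Jan z Warszawy", "IBM  i Apple"]))
#     # [['Jan', 'z', 'Warszawy'], ['IBM', 'i', 'Apple']]
#     # (runs of whitespace are collapsed before splitting)
# ---------------------------------------------------------------------------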
diff --git a/poldeepner2/utils/annotation.py b/poldeepner2/utils/annotation.py
index a3685f7ac1880a8c0685a7c5ffd756f279aabe9f..99028ea8c5d8e6865a4ff16d340a3410ceaf874e 100644
--- a/poldeepner2/utils/annotation.py
+++ b/poldeepner2/utils/annotation.py
@@ -1,23 +1,11 @@
-"""A message of shame -- documentation must be completed."""
 from dataclasses import dataclass
 
 from poldeepner2.data.token import Token
 
 
 class Annotation:
-    """A message of shame -- documentation must be completed."""
-
     def __init__(self, label, sid: int = None, token_id: int = None,
                  tokens: [Token] = []):
-        """A message of shame -- documentation must be completed.
-
-        Args:
-            label: A message of shame -- documentation must be completed.
-            sid: A message of shame -- documentation must be completed.
-            token_id: A message of shame -- documentation must be completed.
-            tokens: A message of shame -- documentation must be completed.
-
-        """
         self.sentence_id = sid
         self.token_ids = [token_id] if token_id is not None else []
         self.tokens = tokens
@@ -25,87 +13,40 @@ class Annotation:
         self.lemma = ""
 
     def add_id(self, id):
-        """A message of shame -- documentation must be completed.
-
-        Args:
-            id: A message of shame -- documentation must be completed.
-
-        """
         self.token_ids.append(id)
 
     def add_token(self, token: Token):
-        """A message of shame -- documentation must be completed.
-
-        Args:
-            token:A message of shame -- documentation must be completed.
-
-        """
         self.tokens.append(token)
 
     def get_text(self):
-        """A message of shame -- documentation must be completed.
-
-        Returns: A message of shame -- documentation must be completed.
-
-        """
         return "".join([t.orth + t.ws for t in self.tokens]).strip()
 
     def __str__(self):
-        """A message of shame -- documentation must be completed.
-
-        Returns:A message of shame -- documentation must be completed.
-
-        """
         return self.annotation
 
     def __eq__(self, other):
-        """A message of shame -- documentation must be completed.
-
-        Args:
-            other:A message of shame -- documentation must be completed.
-
-        Returns:A message of shame -- documentation must be completed.
-
-        """
         return self.annotation == other.annotation \
             and self.token_ids[0] == other.token_ids[0] \
             and self.token_ids[-1] == other.token_ids[-1] \
            and self.sentence_id == other.sentence_id
 
     def __hash__(self):
-        """A message of shame -- documentation must be completed.
-
-        Returns:A message of shame -- documentation must be completed.
-
-        """
         return hash(self.annotation + str(self.sentence_id)
                     + str(self.token_ids[0]) + str(self.token_ids[-1]))
 
     @property
     def annotation_length(self):
-        """A message of shame -- documentation must be completed.
-
-        Returns:A message of shame -- documentation must be completed.
-
-        """
         return self.token_ids[-1] - self.token_ids[0]
 
 
 @dataclass
 class AnnotationText:
-    """A message of shame -- documentation must be completed."""
-
     start: int
     end: int
     label: str
     text: str
 
     def dict(self):
-        """A message of shame -- documentation must be completed.
-
-        Returns:A message of shame -- documentation must be completed.
-
-        """
         return {
             'begin': self.start,
             'end': self.end,
diff --git a/poldeepner2/utils/data_utils.py b/poldeepner2/utils/data_utils.py
index b3506408aa432960ffccdf18a924d131fe9a690e..7e3a9d1409d3cab57e76c8f44889d8b6b6b0a751 100644
--- a/poldeepner2/utils/data_utils.py
+++ b/poldeepner2/utils/data_utils.py
@@ -1,4 +1,3 @@
-"""A message of shame -- documentation must be completed."""
 import codecs
 import json
 from typing import List
@@ -28,14 +27,6 @@ class NerProcessor:
         return examples
 
     def get_labels(self, paths):
-        """A message of shame -- documentation must be completed.
-
-        Args:
-            paths: A message of shame -- documentation must be completed.
-
-        Returns: A message of shame -- documentation must be completed.
-
-        """
         label_set = set([])
         for path in paths:
             examples = self.get_examples(path)
@@ -43,14 +34,6 @@ class NerProcessor:
         return sorted(list(label_set))
 
     def _read_file(self, filename):
-        """A message of shame -- documentation must be completed.
-
-        Args:
-            filename: A message of shame -- documentation must be completed.
-
-        Returns: A message of shame -- documentation must be completed.
-
-        """
         f = open(filename)
         data = []
         sentence = []
@@ -104,15 +87,6 @@ class NerProcessor:
         return data
 
     def _create_examples(self, lines, set_type):
-        """A message of shame -- documentation must be completed.
-
-        Args:
-            lines: A message of shame -- documentation must be completed.
-            set_type: A message of shame -- documentation must be completed.
-
-        Returns: A message of shame -- documentation must be completed.
-
-        """
         examples = []
         for i, (sentence, label) in enumerate(lines):
             guid = "%s-%s" % (set_type, i)
@@ -124,14 +98,6 @@ class NerProcessor:
 
     @staticmethod
     def _get_labels(sentences):
-        """A message of shame -- documentation must be completed.
-
-        Args:
-            sentences: A message of shame -- documentation must be completed.
-
-        Returns: A message of shame -- documentation must be completed.
-
-        """
         label_set = set([])
         for t in sentences:
             label_set.update(t.labels)
@@ -139,14 +105,6 @@ class NerProcessor:
 
 
 def create_dataset(features) -> TensorDataset:
-    """A message of shame -- documentation must be completed.
-
-    Args:
-        features: A message of shame -- documentation must be completed.
-
-    Returns: A message of shame -- documentation must be completed.
-
-    """
     all_input_ids = torch.tensor([f.input_ids for f in features],
                                  dtype=torch.long)
     all_label_ids = torch.tensor([f.label_id for f in features],
@@ -157,15 +115,7 @@ def create_dataset(features) -> TensorDataset:
     return TensorDataset(all_input_ids, all_label_ids, all_valid_ids)
 
 
-def wrap_annotations(sentences) -> [Annotation]:
-    """A message of shame -- documentation must be completed.
-
-    Args:
-        sentences: A message of shame -- documentation must be completed.
-
-    Returns: A message of shame -- documentation must be completed.
-
-    """
+def wrap_annotations(sentences: List[List[str]]) -> [Annotation]:
     annotations = []
     tid = 0
     for sid, labels in enumerate(sentences):
@@ -189,15 +139,6 @@ def wrap_annotations(sentences):
 
 
 def align_tokens_to_text(sentences: [[str]], text):
-    """A message of shame -- documentation must be completed.
-
-    Args:
-        sentences: A message of shame -- documentation must be completed.
-        text: A message of shame -- documentation must be completed.
-
-    Returns: A message of shame -- documentation must be completed.
-
-    """
     offsets = []
     tid = 0
     text = text.lower()
@@ -216,16 +157,6 @@ def align_tokens_to_text(sentences: [[str]], text):
 
 
 def align_tokens_with_text(text, sentences, annotations) -> [AnnotationText]:
-    """A message of shame -- documentation must be completed.
-
-    Args:
-        text: A message of shame -- documentation must be completed.
-        sentences: A message of shame -- documentation must be completed.
-        annotations: A message of shame -- documentation must be completed.
-
-    Returns: A message of shame -- documentation must be completed.
-
-    """
     offsets = align_tokens_to_text(sentences, text)
     output = []
     for an in annotations:
@@ -239,18 +170,6 @@ def align_tokens_with_text(text, sentences, annotations) -> [AnnotationText]:
 
 
 def get_poleval_dict(doc_id, text, sentences, annotations):
-    """A message of shame -- documentation must be completed.
-
-    Returns PolEval dict
-    {
-        text: A message of shame -- documentation must be completed.
-        id: A message of shame -- documentation must be completed.
-        answers: A message of shame -- documentation must be completed.
-    }
-    Note that arguments it takes is FILE, PATH, FILE as
-    utils.load_data_and_labels opens file itself
-
-    """
     offsets = align_tokens_to_text(sentences, text)
     answers = []
     for an in annotations:
@@ -267,41 +186,17 @@ def get_poleval_dict(doc_id, text, sentences, annotations):
 
 
 def read_params(path):
-    """A message of shame -- documentation must be completed.
-
-    Args:
-        path: A message of shame -- documentation must be completed.
-
-    Returns: A message of shame -- documentation must be completed.
-
-    """
     data = read_params_json(path)
     return data['dropout'], data['num_labels'], data['label_list']
 
 
 def read_params_json(path):
-    """A message of shame -- documentation must be completed.
-
-    Args:
-        path: A message of shame -- documentation must be completed.
-
-    Returns: A message of shame -- documentation must be completed.
-
-    """
     with open(path + '/params.json') as json_file:
         data = json.load(json_file)
     return data
 
 
 def read_json(path):
-    """A message of shame -- documentation must be completed.
-
-    Args:
-        path: A message of shame -- documentation must be completed.
-
-    Returns: A message of shame -- documentation must be completed.
-
-    """
     json_out = {}
     with open(path, encoding='utf-8') as f:
         data = json.load(f)
@@ -311,15 +206,6 @@ def read_json(path):
 
 
 def read_tsv(filename, with_labels=False):
-    """A message of shame -- documentation must be completed.
-
-    Args:
-        filename: A message of shame -- documentation must be completed.
-        with_labels: A message of shame -- documentation must be completed.
-
-    Returns: A message of shame -- documentation must be completed.
-
-    """
     f = open(filename, encoding="utf-8")
     data = []
     sentence = []
@@ -346,14 +232,6 @@ def read_tsv(filename, with_labels=False):
 
 
 def save_tsv(output_path, sentences, predictions):
-    """A message of shame -- documentation must be completed.
-
-    Args:
-        output_path: A message of shame -- documentation must be completed.
-        sentences: A message of shame -- documentation must be completed.
-        predictions: A message of shame -- documentation must be completed.
-
-    """
     with codecs.open(output_path, "w", "utf8") as fout:
         assert len(sentences) == len(predictions)
         for tokens, labels in zip(sentences, predictions):
@@ -363,14 +241,6 @@ def save_tsv(output_path, sentences, predictions):
 
 
 def get_dict_for_record(json_ann):
-    """A message of shame -- documentation must be completed.
-
-    Args:
-        json_ann: A message of shame -- documentation must be completed.
-
-    Returns: A message of shame -- documentation must be completed.
-
-    """
     token_dict = {}
     derives = 0
     for ann in json_ann['data']['brat'].split('\n'):
@@ -390,15 +260,6 @@ def get_dict_for_record(json_ann):
 
 
 def map_json_to_iob(json_ann, iob):
-    """A message of shame -- documentation must be completed.
-
-    Args:
-        json_ann: A message of shame -- documentation must be completed.
-        iob: A message of shame -- documentation must be completed.
-
-    Returns: A message of shame -- documentation must be completed.
-
-    """
     token_dict, derives = get_dict_for_record(json_ann)
     successfully_added = 0
     out_iob = ''
@@ -420,15 +281,6 @@ def map_json_to_iob(json_ann, iob):
 
 
 def has_same_neighbour(annotation, next_annotations):
-    """A message of shame -- documentation must be completed.
-
-    Args:
-        annotation: A message of shame -- documentation must be completed.
-        next_annotations: A message of shame -- documentation must be completed.
-
-    Returns: A message of shame -- documentation must be completed.
-
-    """
     if next_annotations == ['O']:
         return False
     searched_ann = 'B-{0}'.format(annotation)
@@ -436,12 +288,6 @@ def has_same_neighbour(annotation, next_annotations):
 
 
 def iob2_to_iob(iob2_text):
-    """A message of shame -- documentation must be completed.
-
-    Args:
-        iob2_text:A message of shame -- documentation must be completed.
-
-    """
     iob2_list = []
     iob1_list = []
 
diff --git a/poldeepner2/utils/file_utils.py b/poldeepner2/utils/file_utils.py
index ad08a6e5932092dc3240a3b09ae09ad53d162ed6..cccd00caead6ef0a55e65c763c3f5c82c68a7302 100644
--- a/poldeepner2/utils/file_utils.py
+++ b/poldeepner2/utils/file_utils.py
@@ -1,4 +1,3 @@
-"""A message of shame -- documentation must be completed."""
 from urllib.request import urlopen
 import requests
 import os
@@ -8,11 +7,6 @@ from tqdm import tqdm
 
 
 def download_from_url(url, dst):
-    """A message of shame -- documentation must be completed.
-
-    @param: url to download file
-    @param: dst place to put the file
-    """
     file_size = int(urlopen(url).info().get('Content-Length', -1))
     if os.path.exists(dst):
         first_byte = os.path.getsize(dst)
@@ -38,13 +32,6 @@ def download_from_url(url, dst):
 
 
 def unpack_gz(path, output):
-    """A message of shame -- documentation must be completed.
-
-    Args:
-        path: A message of shame -- documentation must be completed.
-        output: A message of shame -- documentation must be completed.
-
-    """
     with tarfile.open(path, 'r') as tar:
         for member in tqdm(iterable=tar.getmembers(),
                            total=len(tar.getmembers())):
@@ -52,28 +39,11 @@ def unpack_gz(path, output):
 
 
 def unpack_zip(path, output):
-    """A message of shame -- documentation must be completed.
-
-    Args:
-        path: A message of shame -- documentation must be completed.
-        output: A message of shame -- documentation must be completed.
-
-    """
     with ZipFile(path, 'r') as zipObj:
         zipObj.extractall(output)
 
 
 def download_file(url, path, compression, extract_to_subfolder=False):
-    """A message of shame -- documentation must be completed.
-
-    Args:
-        url: A message of shame -- documentation must be completed.
-        path: A message of shame -- documentation must be completed.
-        compression: A message of shame -- documentation must be completed.
-        extract_to_subfolder: A message of shame -- documentation must be
-        completed.
-
-    """
     ext = "" if compression is None else '.' + compression
     download_from_url(url, path + ext)
     if compression == 'zip':
diff --git a/poldeepner2/utils/preprocess.py b/poldeepner2/utils/preprocess.py
index d8613c32cdc7ff39cd95041f171ba2788110efeb..9895eabe25d6947add4e85e6b1bc48240c3b5c1e 100644
--- a/poldeepner2/utils/preprocess.py
+++ b/poldeepner2/utils/preprocess.py
@@ -1,16 +1,7 @@
-"""A message of shame -- documentation must be completed."""
 import re
 
 
 def split_hashtags(tokens):
-    """A message of shame -- documentation must be completed.
-
-    Args:
-        tokens: A message of shame -- documentation must be completed.
-
-    Returns: A message of shame -- documentation must be completed.
-
-    """
     output = []
     i = 0
     while i < len(tokens):
@@ -27,14 +18,6 @@ def split_hashtags(tokens):
 
 
 def split_leading_name(tokens):
-    """A message of shame -- documentation must be completed.
-
-    Args:
-        tokens: A message of shame -- documentation must be completed.
-
-    Returns: A message of shame -- documentation must be completed.
-
-    """
     if len(tokens) > 1 and re.fullmatch(r"([A-Z][a-z]+)([A-Z][a-z]+)+",
                                         tokens[0]) and tokens[1] == ":":
         output = []
@@ -47,14 +30,6 @@ def split_leading_name(tokens):
 
 
 def split_underscore(tokens):
-    """A message of shame -- documentation must be completed.
-
-    Args:
-        tokens: A message of shame -- documentation must be completed.
-
-    Returns: A message of shame -- documentation must be completed.
-
-    """
     output = []
     for token in tokens:
         if "_" in token:
diff --git a/poldeepner2/utils/seed.py b/poldeepner2/utils/seed.py
index dd022c54dd7e694f756e3a38f5e0ea129683896a..7a2b4614246b9681c6ea1dddd70eeedd960b5c1d 100644
--- a/poldeepner2/utils/seed.py
+++ b/poldeepner2/utils/seed.py
@@ -1,18 +1,9 @@
-"""A message of shame -- documentation must be completed."""
 import numpy as np
 import torch
 import random
 
 
 def setup_seed(n=101):
-    """A message of shame -- documentation must be completed.
-
-    Args:
-        n: A message of shame -- documentation must be completed.
-
-    Returns: A message of shame -- documentation must be completed.
-
-    """
     random.seed(n)
     np.random.seed(n)
     torch.manual_seed(n)
diff --git a/poldeepner2/utils/sequence_labeling.py b/poldeepner2/utils/sequence_labeling.py
index 9aa83f4d59c6bf5bc8e98312475a0be7ece79aec..c719e91ffc5f8c2ebb48d085265b8f4bf12e14a0 100644
--- a/poldeepner2/utils/sequence_labeling.py
+++ b/poldeepner2/utils/sequence_labeling.py
@@ -18,7 +18,7 @@ def get_entities(seq, suffix=False):
     """Gets entities from sequence.
 
     Args:
-        suffix: A message of shame -- documentation must be completed.
+        suffix:
         seq (list): sequence of labels.
 
     Returns:
@@ -69,15 +69,6 @@ def get_entities(seq, suffix=False):
 
 
 def get_tag_type(suffix, chunk):
-    """A message of shame -- documentation must be completed.
-
-    Args:
-        suffix: A message of shame -- documentation must be completed.
-        chunk: A message of shame -- documentation must be completed.
-
-    Returns: A message of shame -- documentation must be completed.
-
-    """
     if suffix:
         tag = chunk[-1]
         type_ = chunk.split('-')[0]
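# --- Reviewer note (not part of the patch) ---------------------------------
# Context for the sequence_labeling hunks above: get_entities collapses a
# label sequence into typed spans. A short sketch, assuming the seqeval-style
# (type, start, end) return format used by this module:
#
#     from poldeepner2.utils.sequence_labeling import get_entities
#
#     print(get_entities(["B-PER", "I-PER", "O", "B-LOC"]))
#     # [('PER', 0, 1), ('LOC', 3, 3)]
# ---------------------------------------------------------------------------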
diff --git a/poldeepner2/utils/train_utils.py b/poldeepner2/utils/train_utils.py
index 871e0f1294f9b077e26cda5584e13e741a7b3042..568ae06aee75c0fea04298b46a1b245423e07188 100644
--- a/poldeepner2/utils/train_utils.py
+++ b/poldeepner2/utils/train_utils.py
@@ -1,6 +1,4 @@
-"""A message of shame -- documentation must be completed."""
 import logging
-from random import choice
 
 import torch
 from torch.utils.data import SequentialSampler, DataLoader
diff --git a/poleval_ner_test.py b/poleval_ner_test.py
index 0bba6d5c64c8e6e76ee24be6ad7b3662e19efa2f..003854e8777c01f14f52de82701dfe82511101c0 100644
--- a/poleval_ner_test.py
+++ b/poleval_ner_test.py
@@ -1,5 +1,3 @@
-"""A message of shame -- documentation must be completed."""
-
 import getopt
 import json
 import sys
@@ -8,15 +6,6 @@ from dateutil import parser
 
 
 def overlap(offsetsa, offsetsb):
-    """A message of shame -- documentation must be completed.
-
-    Args:
-        offsetsa:A message of shame -- documentation must be completed.
-        offsetsb:A message of shame -- documentation must be completed.
-
-    Returns: not (int(end1) < int(start2) or int(end2) < int(start1))
-
-    """
     try:
         start1, end1 = offsetsa.split('_')
         start2, end2 = offsetsb.split('_')
@@ -27,15 +16,6 @@ def overlap(offsetsa, offsetsb):
 
 
 def exact(offsetsa, offsetsb):
-    """A message of shame -- documentation must be completed.
-
-    Args:
-        offsetsa:A message of shame -- documentation must be completed.
-        offsetsb:A message of shame -- documentation must be completed.
-
-    Returns: (int(start1) == int(start2)) and (int(end1) == int(end2))
-
-    """
     try:
         start1, end1 = offsetsa.split('_')
         start2, end2 = offsetsb.split('_')
@@ -48,27 +28,10 @@ def exact(offsetsa, offsetsb):
 
 # this to ensure we get rid of derived types when loading entities
 # (redundant otherwise)
 def removeDerivs(annots):
-    """A message of shame -- documentation must be completed.
-
-    Args:
-        annots:A message of shame -- documentation must be completed.
-
-    Returns:A message of shame -- documentation must be completed.
-
-    """
     return {(a, c) for a, c in annots if c.find('derivType') < 0}
 
 
 def compareTextsOverlap(eGold, eModel):
-    """A message of shame -- documentation must be completed.
-
-    Args:
-        eGold:A message of shame -- documentation must be completed.
-        eModel:A message of shame -- documentation must be completed.
-
-    Returns: [tp, fp, fn]
-
-    """
     eGold = removeDerivs(eGold)
     eModel = removeDerivs(eModel)
     tp, fp, fn = 0, 0, 0
@@ -83,15 +46,6 @@ def compareTextsOverlap(eGold, eModel):
 
 
 def compareTextsExact(eGold, eModel):
-    """A message of shame -- documentation must be completed.
-
-    Args:
-        eGold:A message of shame -- documentation must be completed.
-        eModel:A message of shame -- documentation must be completed.
-
-    Returns: [tp, fp, fn]
-
-    """
     eGold = removeDerivs(eGold)
     eModel = removeDerivs(eModel)
     tp, fp, fn = 0, 0, 0
@@ -106,16 +60,6 @@ def compareTextsExact(eGold, eModel):
 
 
 def makeAnnsFormat(inputDoc, cols, htype):
-    """A message of shame -- documentation must be completed.
-
-    Args:
-        inputDoc:A message of shame -- documentation must be completed.
-        cols:A message of shame -- documentation must be completed.
-        htype:A message of shame -- documentation must be completed.
-
-    Returns: z_anns
-
-    """
     z_anns = []
     for ben in inputDoc.split('\n'):
         pcs = ben.split('\t')
@@ -143,14 +87,6 @@ def makeAnnsFormat(inputDoc, cols, htype):
 
 # htype parameter reflects two possible strategies for handling fragmented
 # entities ("split" or "merge")
 def computeScores(goldfile, userfile, htype="split"):
-    """A message of shame -- documentation must be completed.
-
-    Args:
-        goldfile:A message of shame -- documentation must be completed.
-        userfile:A message of shame -- documentation must be completed.
-        htype:A message of shame -- documentation must be completed.
-
-    """
     global_tp_ov = 0
     global_fp_ov = 0
     global_fn_ov = 0
@@ -241,12 +177,6 @@ def computeScores(goldfile, userfile, htype="split"):
 
 
 def main(argv):
-    """A message of shame -- documentation must be completed.
-
-    Args:
-        argv:A message of shame -- documentation must be completed.
-
-    """
     goldfile = 'POLEVAL-NER_GOLD.json'
     userfile = ''
     try:
diff --git a/poleval_ner_test_v2.py b/poleval_ner_test_v2.py
index ecf69c1c3c741c0800da957b1b3d385818b67748..d607fe73556a054845d0aa445ba3bb6858859f3b 100644
--- a/poleval_ner_test_v2.py
+++ b/poleval_ner_test_v2.py
@@ -1,5 +1,3 @@
-"""A message of shame -- documentation must be completed."""
-
 import getopt
 import json
 import sys
@@ -15,20 +13,10 @@
 Source: http://poleval.pl/tasks/
 
 @dataclass
 class CategoryNormalizer:
-    """A message of shame -- documentation must be completed."""
-
     lower: bool = False
     only_main: bool = False
 
     def normalize(self, name):
-        """A message of shame -- documentation must be completed.
-
-        Args:
-            name:A message of shame -- documentation must be completed.
-
-        Returns:A message of shame -- documentation must be completed.
-
-        """
         if self.lower:
             name = name.lower()
         if self.only_main:
@@ -37,15 +25,6 @@ class CategoryNormalizer:
 
 
 def overlap(offsetsa, offsetsb):
-    """A message of shame -- documentation must be completed.
-
-    Args:
-        offsetsa:A message of shame -- documentation must be completed.
-        offsetsb:A message of shame -- documentation must be completed.
-
-    Returns:A message of shame -- documentation must be completed.
-
-    """
     try:
         start1, end1 = offsetsa.split('_')
         start2, end2 = offsetsb.split('_')
@@ -56,15 +35,6 @@ def overlap(offsetsa, offsetsb):
 
 
 def exact(offsetsa, offsetsb):
-    """A message of shame -- documentation must be completed.
-
-    Args:
-        offsetsa:A message of shame -- documentation must be completed.
-        offsetsb:A message of shame -- documentation must be completed.
-
-    Returns:A message of shame -- documentation must be completed.
-
-    """
     try:
         start1, end1 = offsetsa.split('_')
         start2, end2 = offsetsb.split('_')
@@ -77,27 +47,10 @@ def exact(offsetsa, offsetsb):
 
 # this to ensure we get rid of derived types when loading entities (
 # redundant otherwise)
 def removeDerivs(annots):
-    """A message of shame -- documentation must be completed.
-
-    Args:
-        annots:A message of shame -- documentation must be completed.
-
-    Returns:A message of shame -- documentation must be completed.
-
-    """
     return {(a, c) for a, c in annots if c.find('derivType') < 0}
 
 
 def getAnnotatonText(content, spans):
-    """A message of shame -- documentation must be completed.
-
-    Args:
-        content:A message of shame -- documentation must be completed.
-        spans:A message of shame -- documentation must be completed.
-
-    Returns:A message of shame -- documentation must be completed.
-
-    """
     texts = []
     for span in spans.split(";"):
         range = span.split("_")
@@ -106,17 +59,6 @@ def getAnnotatonText(content, spans):
 
 
 def compareTextsOverlap(eGold, eModel, content, cn: CategoryNormalizer):
-    """A message of shame -- documentation must be completed.
-
-    Args:
-        eGold:A message of shame -- documentation must be completed.
-        eModel:A message of shame -- documentation must be completed.
-        content:A message of shame -- documentation must be completed.
-        cn:A message of shame -- documentation must be completed.
-
-    Returns:A message of shame -- documentation must be completed.
-
-    """
     eGold = removeDerivs(eGold)
     eModel = removeDerivs(eModel)
     tp, fp, fn = 0, 0, 0
@@ -132,16 +74,6 @@ def compareTextsOverlap(eGold, eModel, content, cn: CategoryNormalizer):
 
 
 def compareTextsExact(eGold, eModel, cn: CategoryNormalizer):
-    """A message of shame -- documentation must be completed.
-
-    Args:
-        eGold:A message of shame -- documentation must be completed.
-        eModel:A message of shame -- documentation must be completed.
-        cn:A message of shame -- documentation must be completed.
-
-    Returns:A message of shame -- documentation must be completed.
-
-    """
     eGold = removeDerivs(eGold)
     eModel = removeDerivs(eModel)
     tp, fp, fn = 0, 0, 0
@@ -157,16 +89,6 @@ def compareTextsExact(eGold, eModel, cn: CategoryNormalizer):
 
 
 def makeAnnsFormat(inputDoc, cols, htype):
-    """A message of shame -- documentation must be completed.
-
-    Args:
-        inputDoc:A message of shame -- documentation must be completed.
-        cols:A message of shame -- documentation must be completed.
-        htype:A message of shame -- documentation must be completed.
-
-    Returns:A message of shame -- documentation must be completed.
-
-    """
     z_anns = []
     for ben in inputDoc.split('\n'):
         pcs = ben.split('\t')
@@ -195,16 +117,6 @@ def makeAnnsFormat(inputDoc, cols, htype):
 
 # or "merge")
 def computeScores(goldfile, userfile, cn: CategoryNormalizer, htype="split",
                   types=None):
-    """A message of shame -- documentation must be completed.
-
-    Args:
-        goldfile:A message of shame -- documentation must be completed.
-        userfile:A message of shame -- documentation must be completed.
-        cn:A message of shame -- documentation must be completed.
-        htype:A message of shame -- documentation must be completed.
-        types:A message of shame -- documentation must be completed.
-
-    """
     global_tp_ov = 0
     global_fp_ov = 0
     global_fn_ov = 0
@@ -302,12 +214,6 @@ def computeScores(goldfile, userfile, cn: CategoryNormalizer, htype="split",
 
 
 def main(argv):
-    """A message of shame -- documentation must be completed.
-
-    Args:
-        argv:A message of shame -- documentation must be completed.
-
-    """
     goldfile = 'POLEVAL-NER_GOLD.json'
     userfile = ''
     lower = False
diff --git a/poleval_to_iob.py b/poleval_to_iob.py
index 0f0f19a5b1109adc83f2919473ed37493f5649ed..6feb2664070f9906f04e3223d975a8307daaa376 100644
--- a/poleval_to_iob.py
+++ b/poleval_to_iob.py
@@ -1,5 +1,3 @@
-"""A message of shame -- documentation must be completed."""
-
 from __future__ import absolute_import, division, print_function
 
 import argparse
@@ -10,28 +8,12 @@ from poldeepner2.utils.data_utils import read_json, map_json_to_iob
 
 
 def get_id(ini_file):
-    """A message of shame -- documentation must be completed.
-
-    Args:
-        ini_file:A message of shame -- documentation must be completed.
-
-    Returns:A message of shame -- documentation must be completed.
-
-    """
     for line in codecs.open(ini_file, "r", "utf8"):
         if 'id = ' in line:
             return line.replace('id = ', '')
 
 
 def main(args):
-    """A message of shame -- documentation must be completed.
-
-    Args:
-        args:A message of shame -- documentation must be completed.
-
-    Returns:A message of shame -- documentation must be completed.
-
-    """
     print("Loading the NER model ...")
     json_ann = read_json(args.json)
     parent = os.path.dirname(args.input)
@@ -63,11 +45,6 @@ def main(args):
 
 
 def parse_args():
-    """A message of shame -- documentation must be completed.
-
-    Returns:A message of shame -- documentation must be completed.
-
-    """
     parser = argparse.ArgumentParser(
         description='Convert set of IOB files into a single json file in '
                     'PolEval 2018 NER format')
diff --git a/process_poleval.py b/process_poleval.py
index 6d327b378c8d9433fb2cb11b9a6f7abe725d729d..2283841eb5f0288a6bd013bad9ccd328741593b5 100644
--- a/process_poleval.py
+++ b/process_poleval.py
@@ -1,5 +1,3 @@
-"""A message of shame -- documentation must be completed."""
-
 from __future__ import absolute_import, division, print_function
 
 import logging
@@ -18,14 +16,6 @@ from poldeepner2.utils.data_utils import get_poleval_dict, wrap_annotations
 
 
 def merge_sentences(sentences: [[str]]):
-    """A message of shame -- documentation must be completed.
-
-    Args:
-        sentences:A message of shame -- documentation must be completed.
-
-    Returns:A message of shame -- documentation must be completed.
-
-    """
     flat_list = []
     for lit in sentences:
         flat_list.extend(lit)
@@ -33,14 +23,6 @@
 
 
 def main(args):
-    """A message of shame -- documentation must be completed.
-
-    Args:
-        args:A message of shame -- documentation must be completed.
-
-    Returns:A message of shame -- documentation must be completed.
-
-    """
     print("Loading the NER model ...")
     t0 = time.time()
     # if args.pretrained_path: #ner = PolDeepNer2(args.model,
@@ -110,11 +92,6 @@ def main(args):
 
 
 def parse_args():
-    """A message of shame -- documentation must be completed.
-
-    Returns:A message of shame -- documentation must be completed.
-
-    """
     parser = argparse.ArgumentParser(
         description='Convert set of IOB files into a single json file in '
                     'PolEval 2018 NER format')
diff --git a/process_poleval_pretokenized.py b/process_poleval_pretokenized.py
index 63ac26be9a9b77416f8057fe7f8777226e5e3f9a..04843571351af9dcb7174bb6b245453705f1e794 100644
--- a/process_poleval_pretokenized.py
+++ b/process_poleval_pretokenized.py
@@ -1,5 +1,3 @@
-"""A message of shame -- documentation must be completed."""
-
 from __future__ import absolute_import, division, print_function
 
 import argparse
@@ -15,28 +13,12 @@ from poldeepner2.utils.preprocess import split_hashtags, split_leading_name
 
 
 def get_id(ini_file):
-    """A message of shame -- documentation must be completed.
-
-    Args:
-        ini_file:A message of shame -- documentation must be completed.
-
-    Returns:A message of shame -- documentation must be completed.
-
-    """
     for line in codecs.open(ini_file, "r", "utf8"):
         if 'id = ' in line:
             return line.replace('id = ', '')
 
 
 def load_document(abs_path):
-    """A message of shame -- documentation must be completed.
-
-    Args:
-        abs_path:A message of shame -- documentation must be completed.
-
-    Returns:A message of shame -- documentation must be completed.
-
-    """
     namext = os.path.basename(abs_path)
     name = os.path.splitext(namext)[0]
     path = os.path.dirname(abs_path)
@@ -48,12 +30,6 @@ def load_document(abs_path):
 
 
 def main(args):
-    """A message of shame -- documentation must be completed.
-
-    Args:
-        args:A message of shame -- documentation must be completed.
-
-    """
     print("Loading the NER model ...")
     ner = PolDeepNer2(args.model, args.pretrained_path, args.device,
                       max_seq_length=args.max_seq_length,
@@ -77,11 +53,6 @@ def main(args):
 
 
 def parse_args():
-    """A message of shame -- documentation must be completed.
-
-    Returns:A message of shame -- documentation must be completed.
-
-    """
     parser = argparse.ArgumentParser(
         description='Convert set of IOB files into a single json file in '
                     'PolEval 2018 NER format')
diff --git a/process_texts.py b/process_texts.py
index f2a69b55fb6bc3803aa012ca51bc93ddc5d1c71f..9ddc215e4c677961554ccf1b35c5e01c2cf4b3ef 100644
--- a/process_texts.py
+++ b/process_texts.py
@@ -1,5 +1,3 @@
-"""A message of shame -- documentation must be completed."""
-
 from __future__ import absolute_import, division, print_function
 
 import codecs
@@ -20,14 +18,6 @@ from poldeepner2.utils.data_utils import wrap_annotations
 
 
 def flatten(list_of_lists):
-    """A message of shame -- documentation must be completed.
-
-    Args:
-        list_of_lists:A message of shame -- documentation must be completed.
-
-    Returns:A message of shame -- documentation must be completed.
-
-    """
     flat_list = []
     for lit in list_of_lists:
         flat_list.extend(lit)
@@ -35,14 +25,6 @@ def flatten(list_of_lists):
 
 
 def read_content_autobom(path: str) -> str:
-    """A message of shame -- documentation must be completed.
-
-    Args:
-        path:A message of shame -- documentation must be completed.
-
-    Returns:A message of shame -- documentation must be completed.
-
-    """
     bytes = min(32, os.path.getsize(path))
     content = open(path, 'rb').read(bytes)
     if content.startswith(codecs.BOM_UTF8):
@@ -53,14 +35,6 @@ def read_content_autobom(path: str) -> str:
 
 
 def main(args):
-    """A message of shame -- documentation must be completed.
-
-    Args:
-        args:A message of shame -- documentation must be completed.
-
-    Returns:A message of shame -- documentation must be completed.
-
-    """
     print("Loading the NER model ...")
     t0 = time.time()
     if args.pretrained_path:
@@ -118,11 +92,6 @@ def main(args):
 
 
 def parse_args():
-    """A message of shame -- documentation must be completed.
-
-    Returns:A message of shame -- documentation must be completed.
-
-    """
     parser = argparse.ArgumentParser(
         description='Process a set of plain text files from given folder. '
diff --git a/process_tsv.py b/process_tsv.py
index 4376ccac9fa68027f68a0d8b5418df0511bf4b43..edc017083d1f2ef62b1eee6dcb3c4fae8a2071db 100644
--- a/process_tsv.py
+++ b/process_tsv.py
@@ -4,6 +4,8 @@ import argparse
 import logging
 import os
 
+import torch
+
 import poldeepner2
 from poldeepner2.utils.data_utils import read_tsv, save_tsv
 from poldeepner2.utils.seed import setup_seed
@@ -31,7 +33,8 @@ def main(args):
     sentences = [sentence[0] for sentence in sentences_labels]
     logging.info(f"Number of sentences to process: {len(sentences)}")
 
-    predictions, stats = ner.process(sentences, args.max_seq_length)
+    with torch.no_grad():
+        predictions, stats = ner.process(sentences, args.max_seq_length)
 
     save_tsv(os.path.join(args.output), sentences, predictions)
     logging.info("done.")
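
The torch.no_grad() guard added above turns off gradient tracking for the forward passes, which lowers memory use and speeds up batch inference without changing the predictions. The same pattern applies to any ad hoc processing script; a short sketch, with the model path as a placeholder and sentences pre-tokenized as in process_tsv.py:

    import torch
    import poldeepner2

    # Placeholder path; any trained PolDeepNer2 model directory works here.
    ner = poldeepner2.load("path/to/model", device="cpu")
    sentences = [["Ala", "ma", "kota"]]
    with torch.no_grad():  # no autograd graph is built for these passes
        predictions, stats = ner.process(sentences, 128)
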
- - """ return {"text": an.get_text(), "label": an.annotation, "lemma": an.lemma, "start": an.start, "end": an.end} class ResponsePolem(BaseModel): - """A message of shame -- documentation must be completed.""" - text: str tokens: List[ResponseToken] annotations: List[ResponseAnnotation] class Server: - """A message of shame -- documentation must be completed.""" - app = FastAPI() app.add_middleware(CORSMiddleware, diff --git a/setup.py b/setup.py index a28ed2e3fb65953827babc28ae2c4026b0a614d7..87bcf50789f33bac7d11f4c22c7cbac7efae163e 100644 --- a/setup.py +++ b/setup.py @@ -17,7 +17,7 @@ install_requires = [ setuptools.setup( name="poldeepner2", - version="0.7.1", + version="0.7.2-wip", author="Michał Marcińczuk", author_email="michal.marcinczuk@pwr.edu.pl", description="PolDeepNer2 is a tool for sequence labeling tasks based on " diff --git a/tests/pipeline/test_lemmatization.py b/tests/pipeline/test_lemmatization.py index d5191ab45bf16f0fac90ee08e9934cc358d05dea..1c62be38b6d69f9406f1d492152c5c519a209f44 100644 --- a/tests/pipeline/test_lemmatization.py +++ b/tests/pipeline/test_lemmatization.py @@ -1,4 +1,3 @@ -"""A message of shame -- documentation must be completed.""" import pytest from poldeepner2.data.token import Token @@ -26,13 +25,6 @@ from poldeepner2.utils.annotation import Annotation ]) def test_annotation_lemmatizer_polem_single(annotation: Annotation, lemma: str): - """A message of shame -- documentation must be completed. - - Args: - annotation: A message of shame -- documentation must be completed. - lemma: A message of shame -- documentation must be completed. - - """ annotations = [annotation] polem = AnnotationLemmatizerPolem() polem.process(annotations) diff --git a/tests/pipeline/test_tokenization.py b/tests/pipeline/test_tokenization.py index 7ebd484d8bf7ea7e87f9f06dc00addfb579571cb..0693da896e83c6e2b2711e8b343892a958138007 100644 --- a/tests/pipeline/test_tokenization.py +++ b/tests/pipeline/test_tokenization.py @@ -1,4 +1,3 @@ -"""A message of shame -- documentation must be completed.""" import pytest from poldeepner2.pipeline.tokenization import TokenizerKrnnt, TokenizerSpaces @@ -6,11 +5,6 @@ from poldeepner2.pipeline.tokenization import TokenizerKrnnt, TokenizerSpaces @pytest.fixture(scope='session', autouse=True) def tokenizer_krnnt(): - """A message of shame -- documentation must be completed. - - Returns: TokenizerKrnnt() - - """ return TokenizerKrnnt() @@ -24,19 +18,6 @@ def tokenizer_krnnt(): def test_tokenizer_krrnt_text(text: str, orths: [str], lemmas: [str], ws: [str], morphs: [str], starts: [int], ends: [int], tokenizer_krnnt): - """A message of shame -- documentation must be completed. - - Args: - text: A message of shame -- documentation must be completed. - orths: A message of shame -- documentation must be completed. - lemmas: A message of shame -- documentation must be completed. - ws: A message of shame -- documentation must be completed. - morphs: A message of shame -- documentation must be completed. - starts: A message of shame -- documentation must be completed. - ends: A message of shame -- documentation must be completed. - tokenizer_krnnt: A message of shame -- documentation must be completed. - - """ sentence = tokenizer_krnnt.tokenize_tokens([text])[0] assert len(sentence) == len(orths) @@ -58,13 +39,6 @@ def test_tokenizer_krrnt_text(text: str, orths: [str], lemmas: [str], ] ) def test_tokenizer_spaces(texts, tokens): - """A message of shame -- documentation must be completed. 
diff --git a/sample_polem.py b/sample_polem.py
index 11a4cedd6b68cd3429a49d997029bc63d0584336..77b065cad8ea080c2ca644d7ce89604c8ccb1b4d 100644
--- a/sample_polem.py
+++ b/sample_polem.py
@@ -1,5 +1,3 @@
-"""A message of shame -- documentation must be completed."""
-
 import time
 
 from poldeepner2.models import PolDeepNer2, ModelFactory
diff --git a/server.py b/server.py
index 0a5a2e0c496c30b4007bf81afa17cffab3866f5b..7f9dcda57a28cbfaa933c124f76be6277861adc3 100644
--- a/server.py
+++ b/server.py
@@ -1,5 +1,3 @@
-"""A message of shame -- documentation must be completed."""
-
 from __future__ import absolute_import, division, print_function
 import uvicorn
 import argparse
@@ -51,8 +49,6 @@ class ResponseToken(BaseModel):
 
 
 class ResponseAnnotation(BaseModel):
-    """A message of shame -- documentation must be completed."""
-
     text: str
     label: str
     lemma: str
@@ -61,29 +57,17 @@ class ResponseAnnotation(BaseModel):
 
     @staticmethod
     def generate(an: Annotation):
-        """A message of shame -- documentation must be completed.
-
-        Args:
-            an: A message of shame -- documentation must be completed.
-
-        Returns:A message of shame -- documentation must be completed.
-
-        """
         return {"text": an.get_text(), "label": an.annotation,
                 "lemma": an.lemma, "start": an.start, "end": an.end}
 
 
 class ResponsePolem(BaseModel):
-    """A message of shame -- documentation must be completed."""
-
     text: str
     tokens: List[ResponseToken]
     annotations: List[ResponseAnnotation]
 
 
 class Server:
-    """A message of shame -- documentation must be completed."""
-
     app = FastAPI()
 
     app.add_middleware(CORSMiddleware,
diff --git a/setup.py b/setup.py
index a28ed2e3fb65953827babc28ae2c4026b0a614d7..87bcf50789f33bac7d11f4c22c7cbac7efae163e 100644
--- a/setup.py
+++ b/setup.py
@@ -17,7 +17,7 @@ install_requires = [
 
 setuptools.setup(
     name="poldeepner2",
-    version="0.7.1",
+    version="0.7.2-wip",
     author="Michał Marcińczuk",
     author_email="michal.marcinczuk@pwr.edu.pl",
     description="PolDeepNer2 is a tool for sequence labeling tasks based on "
diff --git a/tests/pipeline/test_lemmatization.py b/tests/pipeline/test_lemmatization.py
index d5191ab45bf16f0fac90ee08e9934cc358d05dea..1c62be38b6d69f9406f1d492152c5c519a209f44 100644
--- a/tests/pipeline/test_lemmatization.py
+++ b/tests/pipeline/test_lemmatization.py
@@ -1,4 +1,3 @@
-"""A message of shame -- documentation must be completed."""
 import pytest
 
 from poldeepner2.data.token import Token
@@ -26,13 +25,6 @@ from poldeepner2.utils.annotation import Annotation
 ])
 def test_annotation_lemmatizer_polem_single(annotation: Annotation,
                                             lemma: str):
-    """A message of shame -- documentation must be completed.
-
-    Args:
-        annotation: A message of shame -- documentation must be completed.
-        lemma: A message of shame -- documentation must be completed.
-
-    """
     annotations = [annotation]
     polem = AnnotationLemmatizerPolem()
     polem.process(annotations)
diff --git a/tests/pipeline/test_tokenization.py b/tests/pipeline/test_tokenization.py
index 7ebd484d8bf7ea7e87f9f06dc00addfb579571cb..0693da896e83c6e2b2711e8b343892a958138007 100644
--- a/tests/pipeline/test_tokenization.py
+++ b/tests/pipeline/test_tokenization.py
@@ -1,4 +1,3 @@
-"""A message of shame -- documentation must be completed."""
 import pytest
 
 from poldeepner2.pipeline.tokenization import TokenizerKrnnt, TokenizerSpaces
@@ -6,11 +5,6 @@ from poldeepner2.pipeline.tokenization import TokenizerKrnnt, TokenizerSpaces
 
 @pytest.fixture(scope='session', autouse=True)
 def tokenizer_krnnt():
-    """A message of shame -- documentation must be completed.
-
-    Returns: TokenizerKrnnt()
-
-    """
     return TokenizerKrnnt()
 
 
@@ -24,19 +18,6 @@ def tokenizer_krnnt():
 def test_tokenizer_krrnt_text(text: str, orths: [str], lemmas: [str],
                               ws: [str], morphs: [str], starts: [int],
                               ends: [int], tokenizer_krnnt):
-    """A message of shame -- documentation must be completed.
-
-    Args:
-        text: A message of shame -- documentation must be completed.
-        orths: A message of shame -- documentation must be completed.
-        lemmas: A message of shame -- documentation must be completed.
-        ws: A message of shame -- documentation must be completed.
-        morphs: A message of shame -- documentation must be completed.
-        starts: A message of shame -- documentation must be completed.
-        ends: A message of shame -- documentation must be completed.
-        tokenizer_krnnt: A message of shame -- documentation must be completed.
-
-    """
     sentence = tokenizer_krnnt.tokenize_tokens([text])[0]
 
     assert len(sentence) == len(orths)
@@ -58,13 +39,6 @@ def test_tokenizer_krrnt_text(text: str, orths: [str], lemmas: [str],
 ]
 )
 def test_tokenizer_spaces(texts, tokens):
-    """A message of shame -- documentation must be completed.
-
-    Args:
-        texts: A message of shame -- documentation must be completed.
-        tokens: A message of shame -- documentation must be completed.
-
-    """
     tokenizer = TokenizerSpaces()
     output = tokenizer.tokenize(texts)
     assert output == tokens
diff --git a/tests/unit/utils/test_align_tokens_to_text.py b/tests/unit/utils/test_align_tokens_to_text.py
index 5ceff1a52c075c7042d388c67f719cdb196df850..abc0f8ac75a6c46a9e1dc4f8e254a4cfcef95bb9 100644
--- a/tests/unit/utils/test_align_tokens_to_text.py
+++ b/tests/unit/utils/test_align_tokens_to_text.py
@@ -1,4 +1,3 @@
-"""A message of shame -- documentation must be completed."""
 import pytest
 
 from poldeepner2.utils.data_utils import align_tokens_to_text
@@ -11,12 +10,5 @@ from poldeepner2.utils.data_utils import align_tokens_to_text
 ]
 )
 def test_align_tokens_to_text(text, tokens, expected_offsets):
-    """A message of shame -- documentation must be completed.
-
-    Args: text: A message of shame -- documentation must be completed.
-        tokens: A message of shame -- documentation must be completed.
-        expected_offsets: A message of shame -- documentation must be completed.
-
-    """
     offsets = align_tokens_to_text(tokens, text)
     assert offsets == expected_offsets
diff --git a/tests/unit/utils/test_iob2_to_iob.py b/tests/unit/utils/test_iob2_to_iob.py
index 1cecd0d4baa4021cfea4ec364e463627e9e6253d..455c93ed080d2579ae91cf057957937a410f97ef 100644
--- a/tests/unit/utils/test_iob2_to_iob.py
+++ b/tests/unit/utils/test_iob2_to_iob.py
@@ -1,4 +1,3 @@
-"""A message of shame -- documentation must be completed."""
 import pytest
 import sys
 import pathlib
@@ -48,12 +47,5 @@ sys.path.append(str(pathlib.Path(__file__).absolute().parents[3].resolve()))
 ]
 )
 def test_iob2_to_iob(iob2_input, expected_output):
-    """A message of shame -- documentation must be completed.
-
-    Args:
-        iob2_input: A message of shame -- documentation must be completed.
-        expected_output: A message of shame -- documentation must be completed.
-
-    """
     iob1 = iob2_to_iob(iob2_input)
     assert iob1.split('\n') == expected_output.split('\n')
diff --git a/tests/unit/utils/test_poleval_dict.py b/tests/unit/utils/test_poleval_dict.py
index 4aa9a65b8ab528c7020c4099cd517085d735276b..cc11d966c9d3a210c5b0dce47ebe0168695cc6b9 100644
--- a/tests/unit/utils/test_poleval_dict.py
+++ b/tests/unit/utils/test_poleval_dict.py
@@ -1,4 +1,3 @@
-"""A message of shame -- documentation must be completed."""
 import pytest
 
 from poldeepner2.utils.data_utils import get_poleval_dict, wrap_annotations
@@ -57,16 +56,6 @@ from poldeepner2.utils.data_utils import get_poleval_dict, wrap_annotations
 ]
 )
 def test_get_poleval_dict(id, text, tokens, labels, answers):
-    """A message of shame -- documentation must be completed.
-
-    Args:
-        id: A message of shame -- documentation must be completed.
-        text: A message of shame -- documentation must be completed.
-        tokens: A message of shame -- documentation must be completed.
-        labels: A message of shame -- documentation must be completed.
-        answers: A message of shame -- documentation must be completed.
-
-    """
     annotations = wrap_annotations(labels)
     poleval_dict = get_poleval_dict(id, text, tokens, annotations)
     assert poleval_dict == {'text': text, 'id': id, 'answers': answers}
diff --git a/tests/unit/utils/test_read_tsv.py b/tests/unit/utils/test_read_tsv.py
index 7ebb214717d17c0d7f78b2d062cae7b5437be369..13de58dc970db64f659bfcc2c4d068e2a776f207 100644
--- a/tests/unit/utils/test_read_tsv.py
+++ b/tests/unit/utils/test_read_tsv.py
@@ -1,4 +1,3 @@
-"""A message of shame -- documentation must be completed."""
 from pathlib import Path
 
 import pytest
diff --git a/tests/unit/utils/test_sequences.py b/tests/unit/utils/test_sequences.py
index 9673151c767e213123d30f01265c924c12e4c961..783e958763d1b048a44cf5d51adc6254e7f42487 100644
--- a/tests/unit/utils/test_sequences.py
+++ b/tests/unit/utils/test_sequences.py
@@ -16,7 +16,7 @@ def method_encode(text: str) -> List[int]:
         ([("Ala mam kota", "B-nam O O")], 16, [3]),
         ([("Ala mam kota", "B-nam O O")], 6, [1, 1, 1]),
         ([("Ala mam kota", "B-nam O O")], 8, [2, 1]),
-        ([("Tomek przeprowadził się z Krakowa do Warszawy", "B-nam O O O B-loc O B-loc")], 16, [1, 1, 4, 1]),
+        ([("Tomek przeprowadził się z Krakowa do Warszawy", "B-nam O O O B-loc O B-loc")], 16, [3, 3, 1]),
         ([("Adam miał psa", "B-nam O O"), ("Tomek ma Burka", "B-nam O B-nam")], 10, [2, 1, 2, 1]),
     ]
 )
@@ -65,13 +65,13 @@ def test_feature_generator_single_sentences_with_context(examples: List[Tuple[st
 
 @pytest.mark.parametrize(
     "sentences, max_seq_length, max_segment_length, expected_tokens", [
-        (["aaa bbb ccc", "eee fff"], 12, 4, ["aaa bbb ccc", "aaa bbb ccc", "bbb ccc eee", "ccc eee", "eee fff"]),
-        (["aaa bbb ccc", "eee fff"], 13, 4, ["aaa bbb ccc", "aaa bbb ccc", "bbb ccc eee", "ccc eee fff", "eee fff"])
+        (["aaa bbb ccc", "eee fff"], 16, 4,
+         ["aaa bbb ccc eee", "aaa bbb ccc eee", "bbb ccc eee fff", "ccc eee fff", "eee fff"]),
     ]
 )
 def test_feature_generator_window_context(sentences: List[str, ], max_seq_length: int, max_segment_length: int,
                                           expected_tokens: List[str]):
     examples = [InputExample(1, text.split(" ")) for text in sentences]
-    gen = FeatureGeneratorWindowContext(["B-nam", "B-loc", "O"], max_seq_length, method_encode, 4)
+    gen = FeatureGeneratorWindowContext(["B-nam", "B-loc", "O"], max_seq_length, method_encode)
     sequences = gen.generate(examples)
     assert [" ".join(s.tokens) for s in sequences] == expected_tokens
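
The rewritten expectation for FeatureGeneratorWindowContext spells out the windowing scheme: each token gets one sequence holding itself plus neighbouring tokens, up to four tokens per window, and windows may cross sentence boundaries. A toy reimplementation of just the windowing (not the project's generator, which also carries labels and subword encodings) happens to reproduce the expected token sequences in the new test case:

    from typing import List

    def windows(tokens: List[str], size: int) -> List[List[str]]:
        # One window per token: the token plus up to size-1 neighbours,
        # shifted left near the start and truncated by slicing near the end.
        out = []
        for i in range(len(tokens)):
            left = max(0, i - (size - 1) // 2)
            out.append(tokens[left:left + size])
        return out

    print(windows("aaa bbb ccc eee fff".split(), 4))
    # [['aaa', 'bbb', 'ccc', 'eee'], ['aaa', 'bbb', 'ccc', 'eee'],
    #  ['bbb', 'ccc', 'eee', 'fff'], ['ccc', 'eee', 'fff'], ['eee', 'fff']]
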
- - """ annotations = wrap_annotations(labels) poleval_dict = get_poleval_dict(id, text, tokens, annotations) assert poleval_dict == {'text': text, 'id': id, 'answers': answers} diff --git a/tests/unit/utils/test_read_tsv.py b/tests/unit/utils/test_read_tsv.py index 7ebb214717d17c0d7f78b2d062cae7b5437be369..13de58dc970db64f659bfcc2c4d068e2a776f207 100644 --- a/tests/unit/utils/test_read_tsv.py +++ b/tests/unit/utils/test_read_tsv.py @@ -1,4 +1,3 @@ -"""A message of shame -- documentation must be completed.""" from pathlib import Path import pytest diff --git a/tests/unit/utils/test_sequences.py b/tests/unit/utils/test_sequences.py index 9673151c767e213123d30f01265c924c12e4c961..783e958763d1b048a44cf5d51adc6254e7f42487 100644 --- a/tests/unit/utils/test_sequences.py +++ b/tests/unit/utils/test_sequences.py @@ -16,7 +16,7 @@ def method_encode(text: str) -> List[int]: ([("Ala mam kota", "B-nam O O")], 16, [3]), ([("Ala mam kota", "B-nam O O")], 6, [1, 1, 1]), ([("Ala mam kota", "B-nam O O")], 8, [2, 1]), - ([("Tomek przeprowadził się z Krakowa do Warszawy", "B-nam O O O B-loc O B-loc")], 16, [1, 1, 4, 1]), + ([("Tomek przeprowadził się z Krakowa do Warszawy", "B-nam O O O B-loc O B-loc")], 16, [3, 3, 1]), ([("Adam miał psa", "B-nam O O"), ("Tomek ma Burka", "B-nam O B-nam")], 10, [2, 1, 2, 1]), ] ) @@ -65,13 +65,13 @@ def test_feature_generator_single_sentences_with_context(examples: List[Tuple[st @pytest.mark.parametrize( "sentences, max_seq_length, max_segment_length, expected_tokens", [ - (["aaa bbb ccc", "eee fff"], 12, 4, ["aaa bbb ccc", "aaa bbb ccc", "bbb ccc eee", "ccc eee", "eee fff"]), - (["aaa bbb ccc", "eee fff"], 13, 4, ["aaa bbb ccc", "aaa bbb ccc", "bbb ccc eee", "ccc eee fff", "eee fff"]) + (["aaa bbb ccc", "eee fff"], 16, 4, + ["aaa bbb ccc eee", "aaa bbb ccc eee", "bbb ccc eee fff", "ccc eee fff", "eee fff"]), ] ) def test_feature_generator_window_context(sentences: List[str, ], max_seq_length: int, max_segment_length: int, expected_tokens: List[str]): examples = [InputExample(1, text.split(" ")) for text in sentences] - gen = FeatureGeneratorWindowContext(["B-nam", "B-loc", "O"], max_seq_length, method_encode, 4) + gen = FeatureGeneratorWindowContext(["B-nam", "B-loc", "O"], max_seq_length, method_encode) sequences = gen.generate(examples) assert [" ".join(s.tokens) for s in sequences] == expected_tokens diff --git a/tests/unit/utils/test_wrap_annotations.py b/tests/unit/utils/test_wrap_annotations.py index d0452712c2d3adad9d06c5a437612dde0a226092..081025e6f666993bfa03020ef690bdfed457be55 100644 --- a/tests/unit/utils/test_wrap_annotations.py +++ b/tests/unit/utils/test_wrap_annotations.py @@ -1,4 +1,3 @@ -"""A message of shame -- documentation must be completed.""" import codecs from pathlib import Path @@ -7,7 +6,6 @@ from poldeepner2.utils.data_utils import read_tsv, wrap_annotations, \ def test_wrap_and_align_tokens_to_text(): - """A message of shame -- documentation must be completed.""" root = Path(__file__).parents[2].absolute() / "resources" path_iob = str(root / "poleval_0337_iob.tsv") diff --git a/train.py b/train.py index c68e364e4b24f46f18269df99a78e2ea9031edab..94993563f75c92b74b2eaa0e305814d76bf6ca56 100644 --- a/train.py +++ b/train.py @@ -1,5 +1,3 @@ -"""A message of shame -- documentation must be completed.""" - from __future__ import absolute_import, division, print_function import argparse @@ -116,8 +114,10 @@ def train_model(args: Namespace): logger.info(f"Training data was loaded in {time.time() - t0} second(s)") # preparing model configs - hidden_size 
= 1024 if 'large' in args.pretrained_path else \ - (768 if 'base' in args.pretrained_path else args.hidden_size) + hidden_size = 4096 if 'xxlarge' in args.pretrained_path else \ + 2048 if 'xlarge' in args.pretrained_path else \ + 1024 if 'large' in args.pretrained_path else \ + 768 if 'base' in args.pretrained_path else args.hidden_size device = args.device logger.info("Loading pretrained model...")
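
The widened hidden_size chain tests the most specific substring first, which matters because 'xxlarge' contains 'xlarge', which in turn contains 'large'. The same logic as a standalone helper, with a hypothetical name, shown only to make the precedence explicit:

    def infer_hidden_size(pretrained_path: str, default: int) -> int:
        # Most specific marker first: 'xxlarge' would otherwise match 'large'.
        for marker, size in [("xxlarge", 4096), ("xlarge", 2048),
                             ("large", 1024), ("base", 768)]:
            if marker in pretrained_path:
                return size
        return default

    assert infer_hidden_size("xlm-roberta-large", 512) == 1024
    assert infer_hidden_size("herbert-base-cased", 512) == 768
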