From 441d304b1c4a1df4f897af11a2e37b4cb2913e0c Mon Sep 17 00:00:00 2001
From: Jarema Radom <jaremaradom@gmail.com>
Date: Mon, 19 Oct 2020 09:58:15 +0200
Subject: [PATCH 1/2] wandb init

---
 main.py          | 6 ++++--
 requirements.txt | 3 ++-
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/main.py b/main.py
index ce8d8d4..6b23eae 100644
--- a/main.py
+++ b/main.py
@@ -3,6 +3,7 @@ from __future__ import absolute_import, division, print_function
 import argparse
 import codecs
 import logging
+import wandb
 import os
 import random
 import sys
@@ -29,6 +30,7 @@ def main():
     parser = add_xlmr_args(parser)
 
     args = parser.parse_args()
+    wandb.init(config={"epochs": args.num_train_epochs, "batch_size": args.train_batch_size})
 
     if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
         raise ValueError("Output directory (%s) already exists and is not empty." % args.output_dir)
@@ -78,6 +80,7 @@ def main():
                                          dropout_p=args.dropout, device=device)
 
     model.to(device)
+    wandb.watch(model)
     no_decay = ['bias', 'final_layer_norm.weight']
 
     params = list(model.named_parameters())
@@ -148,7 +151,6 @@ def main():
                 batch = tuple(t.to(device) for t in batch)
                 input_ids, label_ids, l_mask, valid_ids, = batch
                 loss = model(input_ids, label_ids, l_mask, valid_ids)
-
                 if args.gradient_accumulation_steps > 1:
                     loss = loss / args.gradient_accumulation_steps
@@ -167,7 +169,7 @@ def main():
                 nb_tr_steps += 1
                 if step % 1000 == 0:
                     logger.info('Step = %d/%d; Loss = %.4f' % (step+1, steps, tr_loss / (step+1)))
-
+                wandb.log({"loss": loss})
                 if (step + 1) % args.gradient_accumulation_steps == 0:
                     optimizer.step()
                     scheduler.step()
diff --git a/requirements.txt b/requirements.txt
index 8f0386e..ba7e5ad 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,5 @@
 fairseq==0.9.0
 pytorch-transformers==1.2.0
 seqeval==0.0.12
-pytest~=6.0.1
\ No newline at end of file
+pytest~=6.0.1
+wandb==0.10.7
\ No newline at end of file
--
GitLab


From 17ab858a819b28d87606c63ab34531096e50ca41 Mon Sep 17 00:00:00 2001
From: Jarema Radom <jaremaradom@gmail.com>
Date: Wed, 28 Oct 2020 09:16:46 +0100
Subject: [PATCH 2/2] Merge fix

---
 core/poldeepner.py       |  9 -------
 core/utils/data_utils.py | 16 ----------
 process_poleval.py       | 57 +++------------------------------------
 process_tsv.py           | 12 ---------
 4 files changed, 3 insertions(+), 91 deletions(-)

diff --git a/core/poldeepner.py b/core/poldeepner.py
index 90566d8..06db767 100644
--- a/core/poldeepner.py
+++ b/core/poldeepner.py
@@ -1,10 +1,7 @@
 import codecs
 import os
 import torch
-<<<<<<< HEAD
-=======
 import tqdm
->>>>>>> 9-models
 from torch.utils.data.dataloader import DataLoader
 
 from core.model.xlmr_for_token_classification import XLMRForTokenClassification
@@ -54,14 +51,8 @@ class PolDeepNer2:
         for idx, tokens in enumerate(sentences):
             guid = str(idx)
             text_a = ' '.join(tokens)
-<<<<<<< HEAD
-            text_b = None
-            label = ["O"] * len(tokens)
-            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
-=======
             label = ["O"] * len(tokens)
             examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
->>>>>>> 9-models
 
         eval_features = convert_examples_to_features(examples, self.label_list, self.max_seq_length,
                                                      self.model.encode_word, self.squeeze)
diff --git a/core/utils/data_utils.py b/core/utils/data_utils.py
index 98b5ae4..3c9df11 100644
--- a/core/utils/data_utils.py
+++ b/core/utils/data_utils.py
@@ -311,7 +311,6 @@ def convert_examples_to_features_nosq(examples, label_list, max_seq_length, enco
         assert len(valid) == max_seq_length
         assert len(label_mask) == max_seq_length
-<<<<<<< HEAD
         if ex_index < 2:
             logging.debug("*** Example ***")
             logging.debug("guid: %s" % example.guid)
@@ -328,14 +327,6 @@ def convert_examples_to_features_nosq(examples, label_list, max_seq_length, enco
                           label_mask=label_mask))
 
-=======
-        features.append(
-            InputFeatures(input_ids=token_ids,
-                          input_mask=input_mask,
-                          label_id=label_ids,
-                          valid_ids=valid,
-                          label_mask=label_mask))
->>>>>>> 4f7642433a515aa3d14550ddb1b0009ffe824c16
     return features
@@ -529,10 +520,3 @@ def map_json_to_iob(json_ann, iob):
         out_iob += line
     failed_to_add = len(token_dict) - successfully_added
     return out_iob, successfully_added, failed_to_add, derives
-<<<<<<< HEAD
-<<<<<<< HEAD:core/utils/data_utils.py
-
-=======
->>>>>>> 9-models:core/utils/data_utils.py
-=======
->>>>>>> 4f7642433a515aa3d14550ddb1b0009ffe824c16
diff --git a/process_poleval.py b/process_poleval.py
index a0c87a4..d739dad 100644
--- a/process_poleval.py
+++ b/process_poleval.py
@@ -8,74 +8,23 @@ import os
 import json
 
 from core.poldeepner import PolDeepNer2
-<<<<<<< HEAD
-from core.utils.data_utils import get_poleval_dict, read_tsv, wrap_annotations
-from core.utils.file_utils import show_download_menu, check_for_data, download_missing
-
-
-def get_id(ini_file):
-    for line in codecs.open(ini_file, "r", "utf8"):
-        if 'id = ' in line:
-            return line.replace('id = ', '')
-
-
-def split_hashtags(tokens):
-    output = []
-    i = 0
-    while i < len(tokens):
-        if tokens[i] == "#" and i+1 < len(tokens) and re.fullmatch(r"([A-Z][a-z]+)([A-Z][a-z]+)+", tokens[i+1]):
-            output.append("#")
-            for m in re.findall(r"([A-Z][a-z]+)", tokens[i+1]):
-                output.append(str(m))
-            i += 2
-        else:
-            output.append(tokens[i])
-            i += 1
-    return output
-
-
-def split_leading_name(tokens):
-    if len(tokens) > 1 and re.fullmatch(r"([A-Z][a-z]+)([A-Z][a-z]+)+", tokens[0]) and tokens[1] == ":":
-        output = []
-        for m in re.findall(r"([A-Z][a-z]+)", tokens[0]):
-            output.append(str(m))
-        output.extend(tokens[1:])
-        return output
-    else:
-        return tokens
-
-
-def load_document(abs_path):
-    namext = os.path.basename(abs_path)
-    name = os.path.splitext(namext)[0]
-    path = os.path.dirname(abs_path)
-    text = codecs.open(os.path.join(path, name + ".txt"), "r", "utf8").read()
-    doc_id = get_id(os.path.join(path, name + ".ini"))
-    sentences_labels = read_tsv(os.path.join(path, name + ".iob"))
-    sentences = [sentence[0] for sentence in sentences_labels]
-    return doc_id, text, sentences
-=======
 from core.utils import tokenization
 from core.utils.data_utils import get_poleval_dict, wrap_annotations
 from core.utils.preprocess import split_hashtags, split_leading_name
->>>>>>> 4f7642433a515aa3d14550ddb1b0009ffe824c16
 
 
 def main(args):
     print("Loading the NER model ...")
-<<<<<<< HEAD
     missing = check_for_data('./config.json')
     if len(missing) > 0:
         if args.pretrained_path:
             missing = list(filter(lambda x: x['name'] not in ['RoBERTa_base', 'RoBERTa_large', 'KPWR_n82_base'], missing))
         show_download_menu(missing)
-    ner = PolDeepNer2(args.model, args.pretrained_path, args.device)
-    parent = os.path.dirname(args.input)
-=======
     ner = PolDeepNer2(args.model, args.pretrained_path, device=args.device, max_seq_length=args.max_seq_length,
-                      squeeze=args.squeeze)
->>>>>>> 4f7642433a515aa3d14550ddb1b0009ffe824c16
+                      squeeze=args.squeeze)
+    parent = os.path.dirname(args.input)
+
     dict_list = []
     tokenizer = tokenization.load(args.tokenization)
diff --git a/process_tsv.py b/process_tsv.py
index 088523a..7b3f864 100644
--- a/process_tsv.py
+++ b/process_tsv.py
@@ -12,7 +12,6 @@ from core.utils.data_utils import read_tsv, save_tsv
 
 
 def main(args):
-<<<<<<< HEAD
     logging.info("Loading the NER model ...")
     missing = check_for_data('./config.json')
     missing = list(filter(lambda x: x['name'] not in ['RoBERTa_base', 'RoBERTa_large'], missing))
@@ -28,17 +27,6 @@ def main(args):
     logging.info(f"Number of sentences to process: {len(sentences)}")
     predictions = ner.process(sentences, args.max_seq_length)
     save_tsv(os.path.join(args.output), sentences, predictions)
-=======
-    try:
-        print("Loading the NER model ...")
-        ner = PolDeepNer2(args.model, args.pretrained_path, args.device, args.squeeze, args.max_seq_length)
-
-        print("Processing ...")
-        sentences_labels = read_tsv(os.path.join(args.input))
-        sentences = [sentence[0] for sentence in sentences_labels]
-        predictions = ner.process(sentences)
-        save_tsv(os.path.join(args.input), os.path.join(args.output), predictions)
->>>>>>> 4f7642433a515aa3d14550ddb1b0009ffe824c16
     logging.info("done.")
--
GitLab
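
For reference, the Weights & Biases workflow that PATCH 1/2 wires into main.py comes down to three calls: wandb.init(config=...) before training, wandb.watch(model) after the model is built, and wandb.log({...}) inside the training loop. The sketch below shows that pattern in isolation on a toy PyTorch model; the project name, model, and hyperparameter values are illustrative placeholders, not taken from the repository, and running it requires a wandb login (or WANDB_MODE=offline).

import torch
import wandb


def train():
    # Record the run configuration up front, mirroring the wandb.init call added to main.py.
    wandb.init(project="wandb-demo", config={"epochs": 2, "batch_size": 4, "lr": 1e-3})
    cfg = wandb.config

    # Stand-in for the project's token-classification model; any torch.nn.Module works with wandb.watch.
    model = torch.nn.Linear(10, 1)
    wandb.watch(model)  # track gradients and parameters, as the patch does after model.to(device)

    optimizer = torch.optim.SGD(model.parameters(), lr=cfg.lr)
    loss_fn = torch.nn.MSELoss()

    for _ in range(cfg.epochs):
        for _ in range(100):
            x = torch.randn(cfg.batch_size, 10)
            y = torch.randn(cfg.batch_size, 1)
            loss = loss_fn(model(x), y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # The patch logs the raw tensor (wandb.log({"loss": loss})); .item() is equivalent here.
            wandb.log({"loss": loss.item()})


if __name__ == "__main__":
    train()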