From 1a0f387da3263f8ab3550adf219cd3830ec10e30 Mon Sep 17 00:00:00 2001
From: pszenny <pszenny@e-science.pl>
Date: Fri, 14 Jan 2022 01:44:52 +0100
Subject: [PATCH] learning rate test implementation

---
 combo/config.graph.template.jsonnet |  26 +++---
 combo/training/scheduler.py         |   2 +-
 combo/training/trainer.py           | 140 ++++++++++++++++------
 3 files changed, 92 insertions(+), 76 deletions(-)

diff --git a/combo/config.graph.template.jsonnet b/combo/config.graph.template.jsonnet
index a472560..f339670 100644
--- a/combo/config.graph.template.jsonnet
+++ b/combo/config.graph.template.jsonnet
@@ -274,25 +274,25 @@ assert pretrained_tokens == null || pretrained_transformer_name == null: "Can't
       local projection_dim = 512,
       cycle_loss_n: cycle_loss_n,
       head_projection_layer: {
-        in_features: hidden_size * 2,
+        in_features: char_dim + 768, #hidden_size * 2,
         out_features: projection_dim,
         activation: "tanh",
       },
       dependency_projection_layer: {
-        in_features: hidden_size * 2,
+        in_features: char_dim + 768, #hidden_size * 2,
         out_features: projection_dim,
         activation: "tanh",
       },
     },
     local projection_dim = 128,
     head_projection_layer: {
-      in_features: hidden_size * 2,
+      in_features: char_dim + 768,#hidden_size * 2,
       out_features: projection_dim,
       dropout_rate: predictors_dropout,
       activation: "tanh"
     },
     dependency_projection_layer: {
-      in_features: hidden_size * 2,
+      in_features: char_dim + 768, #hidden_size * 2,
       out_features: projection_dim,
       dropout_rate: predictors_dropout,
       activation: "tanh"
@@ -305,25 +305,25 @@ assert pretrained_tokens == null || pretrained_transformer_name == null: "Can't
       local projection_dim = 512,
       cycle_loss_n: cycle_loss_n,
       head_projection_layer: {
-        in_features: hidden_size * 2,
+        in_features: char_dim + 768, #hidden_size * 2,
         out_features: projection_dim,
         activation: "tanh",
       },
       dependency_projection_layer: {
-        in_features: hidden_size * 2,
+        in_features: char_dim + 768, #hidden_size * 2,
         out_features: projection_dim,
         activation: "tanh",
       },
     },
     local projection_dim = 128,
     head_projection_layer: {
-      in_features: hidden_size * 2,
+      in_features: char_dim + 768, #hidden_size * 2,
       out_features: projection_dim,
       dropout_rate: predictors_dropout,
       activation: "tanh"
     },
     dependency_projection_layer: {
-      in_features: hidden_size * 2,
+      in_features: char_dim + 768, #hidden_size * 2,
       out_features: projection_dim,
       dropout_rate: predictors_dropout,
       activation: "tanh"
@@ -332,7 +332,7 @@ assert pretrained_tokens == null || pretrained_transformer_name == null: "Can't
     morphological_feat: if in_targets("feats") then {
       type: "combo_morpho_from_vocab",
       vocab_namespace: "feats_labels",
-      input_dim: hidden_size * 2,
+      input_dim: char_dim + 768, #hidden_size * 2,
       hidden_dims: [128],
       activations: ["tanh", "linear"],
       dropout: [predictors_dropout, 0.0],
@@ -344,7 +344,7 @@ assert pretrained_tokens == null || pretrained_transformer_name == null: "Can't
      lemma_vocab_namespace: "lemma_characters",
      embedding_dim: 256,
      input_projection_layer: {
-      in_features: hidden_size * 2,
+      in_features: char_dim + 768, #hidden_size * 2,
       out_features: 32,
       dropout_rate: predictors_dropout,
       activation: "tanh"
@@ -357,7 +357,7 @@ assert pretrained_tokens == null || pretrained_transformer_name == null: "Can't
      activations: ["relu", "relu", "relu", "linear"],
    },
    upos_tagger: if in_targets("upostag") then {
-     input_dim: hidden_size * 2,
+     input_dim: char_dim + 768, #hidden_size * 2,
      hidden_dims: [64],
      activations: ["tanh", "linear"],
      dropout: [predictors_dropout, 0.0],
@@ -365,7 +365,7 @@ assert pretrained_tokens == null || pretrained_transformer_name == null: "Can't
     vocab_namespace: "upostag_labels"
   },
   xpos_tagger: if in_targets("xpostag") then {
-     input_dim: hidden_size * 2,
+     input_dim: char_dim + 768, #hidden_size * 2,
     hidden_dims: [128],
     activations: ["tanh", "linear"],
     dropout: [predictors_dropout, 0.0],
@@ -373,7 +373,7 @@ assert pretrained_tokens == null || pretrained_transformer_name == null: "Can't
     vocab_namespace: "xpostag_labels"
   },
   semantic_relation: if in_targets("semrel") then {
-     input_dim: hidden_size * 2,
+     input_dim: char_dim + 768, #hidden_size * 2,
     hidden_dims: [64],
     activations: ["tanh", "linear"],
     dropout: [predictors_dropout, 0.0],
diff --git a/combo/training/scheduler.py b/combo/training/scheduler.py
index 8752d73..b749a55 100644
--- a/combo/training/scheduler.py
+++ b/combo/training/scheduler.py
@@ -7,7 +7,7 @@ from overrides import overrides
 
 class Scheduler(learning_rate_scheduler._PyTorchLearningRateSchedulerWrapper):
     def __init__(self, optimizer, patience: int = 6, decreases: int = 2, threshold: float = 1e-3):
-        super().__init__(lr_scheduler.LambdaLR(optimizer, lr_lambda=[self._lr_lambda]))
+        super().__init__(lr_scheduler.LambdaLR(optimizer, lr_lambda=[self._lr_lambda,self._lr_lambda]))
         self.threshold = threshold
         self.decreases = decreases
         self.patience = patience
diff --git a/combo/training/trainer.py b/combo/training/trainer.py
index 3bee8fc..ae4c0d0 100644
--- a/combo/training/trainer.py
+++ b/combo/training/trainer.py
@@ -32,11 +32,69 @@ class TransferPatienceEpochCallback(training.EpochCallback):
 
     def __call__(self, trainer: "training.GradientDescentTrainer", metrics: Dict[str, Any], epoch: int,
                  is_master: bool) -> None:
-        if trainer._learning_rate_scheduler and trainer._learning_rate_scheduler.patience is not None:
-            trainer._metric_tracker._patience = trainer._learning_rate_scheduler.patience
-            trainer._metric_tracker._epochs_with_no_improvement = 0
+
+        #LR range test variables
+        path_to_result_file = "/tmp/lustre_shared/lukasz/tmp/LR_range_3.txt"
+        end_lr = 0.001
+        start_lr = 0.0000001
+        num_lr_in_test = 25
+        lr_update_factor = (end_lr / start_lr) ** (1.0 / num_lr_in_test)
+        # # # # # # #
+
+        param_group = trainer.optimizer.param_groups
+
+        if epoch == 0:
+            with open(path_to_result_file, "a") as file:
+                file.write("\n" + str(param_group[0]["lr"]) + ";" + str(param_group[1]["lr"]) + ";")
         else:
-            raise checks.ConfigurationError("Learning rate scheduler isn't properly setup!")
+            with open(path_to_result_file, "a") as file:
+                file.write(str(metrics["training_loss"]) + ";" +
+                           str(metrics["validation_loss"]) + ";" +
+                           str(metrics["best_epoch"]) + ";" +
+                           #training losses
+                           str(metrics["training_partial_loss/upostag_loss"]) + ";" +
+                           str(metrics["training_partial_loss/xpostag_loss"]) + ";" +
+                           str(metrics["training_partial_loss/feats_loss"]) + ";" +
+                           str(metrics["training_partial_loss/lemma_loss"]) + ";" +
+                           str(metrics["training_partial_loss/head_loss"]) + ";" +
+                           str(metrics["training_partial_loss/deprel_loss"]) + ";" +
+                           #training acc
+                           str(metrics["training_UPOS_ACC"]) + ";" +
+                           str(metrics["training_XPOS_ACC"]) + ";" +
+                           str(metrics["training_SEMREL_ACC"]) + ";" +
+                           str(metrics["training_LEMMA_ACC"]) + ";" +
+                           str(metrics["training_FEATS_ACC"]) + ";" +
+                           str(metrics["training_LAS"]) + ";" +
+                           #validation losses
+                           str(metrics["validation_partial_loss/upostag_loss"]) + ";" +
+                           str(metrics["validation_partial_loss/xpostag_loss"]) + ";" +
+                           str(metrics["validation_partial_loss/feats_loss"]) + ";" +
+                           str(metrics["validation_partial_loss/lemma_loss"]) + ";" +
str(metrics["validation_partial_loss/head_loss"]) + ";" + + str(metrics["validation_partial_loss/deprel_loss"]) + ";" + + # validation acc + str(metrics["validation_UPOS_ACC"]) + ";" + + str(metrics["validation_XPOS_ACC"]) + ";" + + str(metrics["validation_SEMREL_ACC"]) + ";" + + str(metrics["validation_LEMMA_ACC"]) + ";" + + str(metrics["validation_FEATS_ACC"]) + ";" + + str(metrics["validation_LAS"]) + ";" + + # best loss + str(metrics["best_validation_loss"]) + ";" + ) + + # END CONDITIONS + if param_group[1]["lr"] >= end_lr and param_group[0]["lr"] >= end_lr: + raise Exception('End of LR test') + + param_group[0]["lr"] = param_group[0]["lr"] * lr_update_factor + if param_group[0]["lr"] >= end_lr: + param_group[0]["lr"] = start_lr + param_group[1]["lr"] = param_group[1]["lr"] * lr_update_factor + + file.write("\n" + str(param_group[0]["lr"]) + ";" + str(param_group[1]["lr"]) + ";") + + trainer.optimizer.param_groups = param_group @training.Trainer.register("gradient_descent_validate_n", constructor="from_partial_objects") @@ -65,7 +123,7 @@ class GradientDescentTrainer(training.GradientDescentTrainer): batch_callbacks, epoch_callbacks, end_callbacks, trainer_callbacks, distributed, local_rank, world_size, num_gradient_accumulation_steps, use_amp) # TODO extract param to constructor (+ constructor method?) - self.validate_every_n = 5 + self.validate_every_n = 1 @overrides def _try_train(self) -> Dict[str, Any]: @@ -93,26 +151,16 @@ class GradientDescentTrainer(training.GradientDescentTrainer): for key, value in self._metric_tracker.best_epoch_metrics.items(): metrics["best_validation_" + key] = value - for callback in self._epoch_callbacks: - callback(self, metrics={}, epoch=-1, is_master=self._master) - for epoch in range(epoch_counter, self._num_epochs): - epoch_start_time = time.time() - train_metrics = self._train_epoch(epoch) - if self._master and self._checkpointer is not None: - self._checkpointer.save_checkpoint(epoch, self, save_model_only=True) + epochs_to_change_lr = 15 - # Wait for the master to finish saving the model checkpoint - if self._distributed: - dist.barrier() + # every epochs_to_change_lr epoch loads weights after 1 epoch + if (epoch - 1) % epochs_to_change_lr == 0 and epoch > 1: + self.model.load_state_dict(torch.load(os.path.join(self._serialization_dir, "initial.th"))) - # get peak of memory usage - for key, value in train_metrics.items(): - if key.startswith("gpu_") and key.endswith("_memory_MB"): - metrics["peak_" + key] = max(metrics.get("peak_" + key, 0), value) - elif key.startswith("worker_") and key.endswith("_memory_MB"): - metrics["peak_" + key] = max(metrics.get("peak_" + key, 0), value) + epoch_start_time = time.time() + train_metrics = self._train_epoch(epoch) if self._validation_data_loader is not None: val_metrics = {} @@ -141,24 +189,6 @@ class GradientDescentTrainer(training.GradientDescentTrainer): # Check validation metric for early stopping this_epoch_val_metric = val_metrics[self._validation_metric] - # self._metric_tracker.add_metric(this_epoch_val_metric) - - train_metrics["patience"] = self._metric_tracker._patience - if self._metric_tracker.should_stop_early(): - logger.info("Ran out of patience. 
Stopping training.") - break - - if self._master: - self._tensorboard.log_metrics( - train_metrics, val_metrics=val_metrics, log_to_console=True, epoch=epoch + 1 - ) # +1 because tensorboard doesn't like 0 - - # Create overall metrics dict - training_elapsed_time = time.time() - training_start_time - metrics["training_duration"] = str(datetime.timedelta(seconds=training_elapsed_time)) - metrics["training_start_epoch"] = epoch_counter - metrics["training_epochs"] = epochs_trained - metrics["epoch"] = epoch for key, value in train_metrics.items(): metrics["training_" + key] = value @@ -174,10 +204,9 @@ class GradientDescentTrainer(training.GradientDescentTrainer): self._metric_tracker.best_epoch_metrics = val_metrics - if self._serialization_dir and self._master: - common_util.dump_metrics( - os.path.join(self._serialization_dir, f"metrics_epoch_{epoch}.json"), metrics - ) + for callback in self._epoch_callbacks: + if ((epoch-1) % epochs_to_change_lr == 0 and epoch > 1 ) or epoch == 0: + callback(self, metrics=metrics, epoch=epoch, is_master=self._master) # The Scheduler API is agnostic to whether your schedule requires a validation metric - # if it doesn't, the validation metric passed here is ignored. @@ -186,18 +215,6 @@ class GradientDescentTrainer(training.GradientDescentTrainer): if self._momentum_scheduler: self._momentum_scheduler.step(this_epoch_val_metric) - if self._master and self._checkpointer is not None: - self._checkpointer.save_checkpoint( - epoch, self, is_best_so_far=self._metric_tracker.is_best_so_far() - ) - - # Wait for the master to finish saving the checkpoint - if self._distributed: - dist.barrier() - - for callback in self._epoch_callbacks: - callback(self, metrics=metrics, epoch=epoch, is_master=self._master) - epoch_elapsed_time = time.time() - epoch_start_time logger.info("Epoch duration: %s", datetime.timedelta(seconds=epoch_elapsed_time)) @@ -211,16 +228,15 @@ class GradientDescentTrainer(training.GradientDescentTrainer): epochs_trained += 1 + if epoch == 0: + torch.save(self.model.state_dict(), os.path.join(self._serialization_dir, "initial.th")) + + if (epoch-1) % epochs_to_change_lr == 0 and epoch > 1: + self._metric_tracker.best_epoch_metrics = val_metrics + for callback in self._end_callbacks: callback(self, metrics=metrics, epoch=epoch, is_master=self._master) - # Load the best model state before returning - best_model_state = ( - None if self._checkpointer is None else self._checkpointer.best_model_state() - ) - if best_model_state: - self.model.load_state_dict(best_model_state) - return metrics @classmethod -- GitLab