diff --git a/README.md b/README.md
index 0ef57069216a10ae8e4392c4f201ac2cc7883910..680673cc9dffd60d8e23c2a80fcddcad96d03ef5 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,9 @@
 ## Installation
 
+### HERBERTA notes:
+
+Install the herberta transformers package **before** running the command below.
+
 Clone this repository and run:
 ```bash
 python setup.py develop
diff --git a/combo/training/trainer.py b/combo/training/trainer.py
index 2096d489a2e4963c1e05bc361a120747760b0143..234bdd7992afad8f1c5e48c1baae793f30ccc2d8 100644
--- a/combo/training/trainer.py
+++ b/combo/training/trainer.py
@@ -54,12 +54,12 @@ class GradientDescentTrainer(training.GradientDescentTrainer):
                  batch_callbacks: List[training.BatchCallback] = None,
                  epoch_callbacks: List[training.EpochCallback] = None,
                  distributed: bool = False,
                  local_rank: int = 0,
                  world_size: int = 1,
                  num_gradient_accumulation_steps: int = 1,
-                 opt_level: Optional[str] = None) -> None:
+                 use_amp: bool = False) -> None:
         super().__init__(model, optimizer, data_loader, patience, validation_metric, validation_data_loader,
                          num_epochs, serialization_dir, checkpointer, cuda_device, grad_norm, grad_clipping,
                          learning_rate_scheduler, momentum_scheduler, tensorboard_writer, moving_average,
                          batch_callbacks, epoch_callbacks, distributed, local_rank, world_size,
-                         num_gradient_accumulation_steps, opt_level)
+                         num_gradient_accumulation_steps, use_amp)
         # TODO extract param to constructor (+ constructor method?)
         self.validate_every_n = 5
@@ -125,7 +125,8 @@ class GradientDescentTrainer(training.GradientDescentTrainer):
                     self.model,
                     val_loss,
                     val_reg_loss,
-                    num_batches,
+                    num_batches=num_batches,
+                    batch_loss=None,
                     reset=True,
                     world_size=self._world_size,
                     cuda_device=self.cuda_device,
@@ -231,7 +232,7 @@ class GradientDescentTrainer(training.GradientDescentTrainer):
                              world_size: int = 1,
                              num_gradient_accumulation_steps: int = 1,
                              opt_level: Optional[str] = None,
-                             no_grad: List[str] = None,
+                             use_amp: bool = False,
                              optimizer: common.Lazy[optimizers.Optimizer] = None,
                              learning_rate_scheduler: common.Lazy[learning_rate_schedulers.LearningRateScheduler] = None,
                              momentum_scheduler: common.Lazy[momentum_schedulers.MomentumScheduler] = None,
@@ -258,8 +259,7 @@ class GradientDescentTrainer(training.GradientDescentTrainer):
             distributed=distributed,
             world_size=world_size,
             num_gradient_accumulation_steps=num_gradient_accumulation_steps,
-            opt_level=opt_level,
-            no_grad=no_grad,
+            use_amp=use_amp,
             optimizer=optimizer,
             learning_rate_scheduler=learning_rate_scheduler,
             momentum_scheduler=momentum_scheduler,
diff --git a/setup.py b/setup.py
index d3aac1aba248e7bbb76c19e84ae89532c1aa9cf7..228e02562e45b7b329f3b221b76ba895b1cf2c0d 100644
--- a/setup.py
+++ b/setup.py
@@ -3,17 +3,17 @@ from setuptools import find_packages, setup
 
 REQUIREMENTS = [
     'absl-py==0.9.0',
-    'allennlp==1.0.0',
+    'allennlp==1.1.0',
     'conllu==2.3.2',
     'dataclasses-json==0.5.2',
     'joblib==0.14.1',
     'jsonnet==0.15.0',
     'requests==2.23.0',
-    'overrides==3.0.0',
+    'overrides==3.1.0',
     'tensorboard==2.1.0',
-    'torch>=1.5.0,<1.6.0',
+    'torch==1.6.0',
     'tqdm==4.43.0',
-    'transformers==2.9.1',
+    'transformers>=3.0.0,<3.1.0',
     'urllib3==1.24.2',
 ]
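
The trainer change above replaces apex's `opt_level` string with the `use_amp` flag that AllenNLP 1.1 exposes, which in turn is built on the native `torch.cuda.amp` API shipped with the torch 1.6.0 pin in `setup.py`. The snippet below is a minimal, standalone sketch of that underlying mechanism (autocast plus GradScaler); it is an illustration only, not code from the patched trainer, and the toy model, optimizer, and data are made up for the example.

```python
import torch

# AMP only applies on CUDA; fall back to a plain fp32 loop on CPU so the sketch still runs.
use_amp = torch.cuda.is_available()
device = "cuda" if use_amp else "cpu"

# Toy model and optimizer standing in for the real COMBO model (illustration only).
model = torch.nn.Linear(16, 4).to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

# GradScaler scales the loss so fp16 gradients do not underflow.
scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

for _ in range(3):
    x = torch.randn(8, 16, device=device)
    y = torch.randint(0, 4, (8,), device=device)
    optimizer.zero_grad()
    # autocast runs the forward pass in mixed precision when enabled.
    with torch.cuda.amp.autocast(enabled=use_amp):
        loss = torch.nn.functional.cross_entropy(model(x), y)
    scaler.scale(loss).backward()  # backward pass on the scaled loss
    scaler.step(optimizer)         # unscales gradients, then calls optimizer.step()
    scaler.update()                # adjusts the scale factor for the next step
```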