diff --git a/README.md b/README.md
index 0ef57069216a10ae8e4392c4f201ac2cc7883910..680673cc9dffd60d8e23c2a80fcddcad96d03ef5 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,17 @@
 ## Installation
 
+### HERBERTA notes
+
+Install the herberta transformers package **before** running the `python setup.py develop` command below.
+
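+A minimal sketch, assuming `herberta` lives in a local checkout (where the
+package actually comes from depends on your setup):
+
+```bash
+# illustrative path: point pip at wherever your herberta checkout lives
+pip install /path/to/herberta
+```
+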
 Clone this repository and run:
 ```bash
 python setup.py develop
diff --git a/combo/training/trainer.py b/combo/training/trainer.py
index 2096d489a2e4963c1e05bc361a120747760b0143..234bdd7992afad8f1c5e48c1baae793f30ccc2d8 100644
--- a/combo/training/trainer.py
+++ b/combo/training/trainer.py
@@ -54,12 +54,14 @@ class GradientDescentTrainer(training.GradientDescentTrainer):
                  batch_callbacks: List[training.BatchCallback] = None,
                  epoch_callbacks: List[training.EpochCallback] = None, distributed: bool = False, local_rank: int = 0,
                  world_size: int = 1, num_gradient_accumulation_steps: int = 1,
-                 opt_level: Optional[str] = None) -> None:
+                 use_amp: bool = False) -> None:
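+        # allennlp 1.1 dropped apex's `opt_level` in favor of native torch.cuda.amp,
+        # toggled by `use_amp` (hence the torch==1.6.0 requirement in setup.py)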
         super().__init__(model, optimizer, data_loader, patience, validation_metric, validation_data_loader, num_epochs,
                          serialization_dir, checkpointer, cuda_device, grad_norm, grad_clipping,
                          learning_rate_scheduler, momentum_scheduler, tensorboard_writer, moving_average,
                          batch_callbacks, epoch_callbacks, distributed, local_rank, world_size,
-                         num_gradient_accumulation_steps, opt_level)
+                         num_gradient_accumulation_steps, use_amp)
         # TODO extract param to constructor (+ constructor method?)
         self.validate_every_n = 5
 
@@ -125,7 +125,9 @@ class GradientDescentTrainer(training.GradientDescentTrainer):
                             self.model,
                             val_loss,
                             val_reg_loss,
-                            num_batches,
+                            num_batches=num_batches,
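+                            # allennlp 1.1 added a batch_loss argument to get_metrics;
+                            # None here, since no per-batch validation loss is reported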
+                            batch_loss=None,
                             reset=True,
                             world_size=self._world_size,
                             cuda_device=self.cuda_device,
@@ -231,7 +232,7 @@ class GradientDescentTrainer(training.GradientDescentTrainer):
             world_size: int = 1,
             num_gradient_accumulation_steps: int = 1,
             opt_level: Optional[str] = None,
-            no_grad: List[str] = None,
+            use_amp: bool = False,
             optimizer: common.Lazy[optimizers.Optimizer] = None,
             learning_rate_scheduler: common.Lazy[learning_rate_schedulers.LearningRateScheduler] = None,
             momentum_scheduler: common.Lazy[momentum_schedulers.MomentumScheduler] = None,
@@ -258,8 +259,7 @@ class GradientDescentTrainer(training.GradientDescentTrainer):
             distributed=distributed,
             world_size=world_size,
             num_gradient_accumulation_steps=num_gradient_accumulation_steps,
-            opt_level=opt_level,
-            no_grad=no_grad,
+            use_amp=use_amp,
             optimizer=optimizer,
             learning_rate_scheduler=learning_rate_scheduler,
             momentum_scheduler=momentum_scheduler,
diff --git a/setup.py b/setup.py
index d3aac1aba248e7bbb76c19e84ae89532c1aa9cf7..228e02562e45b7b329f3b221b76ba895b1cf2c0d 100644
--- a/setup.py
+++ b/setup.py
@@ -3,17 +3,17 @@ from setuptools import find_packages, setup
 
 REQUIREMENTS = [
     'absl-py==0.9.0',
-    'allennlp==1.0.0',
+    'allennlp==1.1.0',
     'conllu==2.3.2',
     'dataclasses-json==0.5.2',
     'joblib==0.14.1',
     'jsonnet==0.15.0',
     'requests==2.23.0',
-    'overrides==3.0.0',
+    'overrides==3.1.0',
     'tensorboard==2.1.0',
-    'torch>=1.5.0,<1.6.0',
+    'torch==1.6.0',
     'tqdm==4.43.0',
-    'transformers==2.9.1',
+    'transformers>=3.0.0,<3.1.0',
     'urllib3==1.24.2',
 ]