diff --git a/mammoth/utils/optimizers.py b/mammoth/utils/optimizers.py index c446c036..ef4e88c7 100644 --- a/mammoth/utils/optimizers.py +++ b/mammoth/utils/optimizers.py @@ -91,18 +91,6 @@ def build_torch_optimizer(model, opts, task_queue_manager): ) elif opts.optim == 'fusedadam': raise NotImplementedError() - # # we use here a FusedAdam() copy of an old Apex repo - # optimizer = FusedAdam(params, lr=opts.learning_rate, betas=betas) - # if opts.model_dtype == 'fp16': - # import apex - - # # In this case use the old FusedAdam with FP16_optimizer wrapper - # static_loss_scale = opts.loss_scale - # dynamic_loss_scale = opts.loss_scale == 0 - # base_optimizer = functools.partial( - # apex.contrib.optimizers.FP16_Optimizer - # optimizer, static_loss_scale=static_loss_scale, dynamic_loss_scale=dynamic_loss_scale - # ) else: raise ValueError('Invalid optimizer type: ' + opts.optim) @@ -326,13 +314,10 @@ def from_opts(cls, model, opts, task_queue_manager, checkpoint=None): ) if opts.model_dtype == "fp16": - if opts.optim == "fusedadam": - optimizer._fp16 = "legacy" - else: - optimizer._fp16 = "amp" - from torch.cuda.amp import GradScaler + optimizer._fp16 = "amp" + from torch.cuda.amp import GradScaler - optimizer._scaler = GradScaler() + optimizer._scaler = GradScaler() if optim_state_dict: optimizer.load_state_dict(optim_state_dict) @@ -379,11 +364,6 @@ def backward(self, loss): backward pass.""" if self.amp: self._scaler.scale(loss).backward() - elif self._fp16 == "legacy": - kwargs = {} - if "update_master_grads" in fn_args(self._optimizer.backward): - kwargs["update_master_grads"] = True - self._optimizer.backward(loss, **kwargs) else: loss.backward() @@ -398,15 +378,10 @@ def externally_managed_step(self, *args, **kwargs): if self.amp: for suboptimizer in self._optimizer.optimizers.values(): self._scaler.unscale_(suboptimizer) - elif self._fp16 == "legacy": - if hasattr(self._optimizer, "update_master_grads"): - self._optimizer.update_master_grads() - if hasattr(self._optimizer, "clip_master_grads") and self._max_grad_norm > 0: - self._optimizer.clip_master_grads(self._max_grad_norm) for group in self._optimizer.param_groups: group['lr'] = learning_rate - if self._max_grad_norm > 0 and self._fp16 != "legacy": + if self._max_grad_norm > 0: clip_grad_norm_(group['params'], self._max_grad_norm) if self.amp: @@ -595,168 +570,6 @@ def step(self, closure=None): """ -class FusedAdam(torch.optim.Optimizer): - - """Implements Adam algorithm. Currently GPU-only. - Requires Apex to be installed via - ``python setup.py install --cuda_ext --cpp_ext``. - It has been proposed in `Adam: A Method for Stochastic Optimization`_. - Arguments: - params (iterable): iterable of parameters to optimize or dicts defining - parameter groups. - lr (float, optional): learning rate. (default: 1e-3) - betas (Tuple[float, float], optional): coefficients used for computing - running averages of gradient and its square. - (default: (0.9, 0.999)) - eps (float, optional): term added to the denominator to improve - numerical stability. (default: 1e-8) - weight_decay (float, optional): weight decay (L2 penalty) (default: 0) - amsgrad (boolean, optional): whether to use the AMSGrad variant of this - algorithm from the paper `On the Convergence of Adam and Beyond`_ - (default: False) NOT SUPPORTED in FusedAdam! - eps_inside_sqrt (boolean, optional): in the 'update parameters' step, - adds eps to the bias-corrected second moment estimate before - evaluating square root instead of adding it to the square root of - second moment estimate as in the original paper. (default: False) - .. _Adam: A Method for Stochastic Optimization: - https://arxiv.org/abs/1412.6980 - .. _On the Convergence of Adam and Beyond: - https://openreview.net/forum?id=ryQu7f-RZ - """ - - def __init__( - self, - params, - lr=1e-3, - bias_correction=True, - betas=(0.9, 0.999), - eps=1e-8, - eps_inside_sqrt=False, - weight_decay=0.0, - max_grad_norm=0.0, - amsgrad=False, - ): - global fused_adam_cuda - fused_adam_cuda = importlib.import_module("fused_adam_cuda") - - if amsgrad: - raise RuntimeError('AMSGrad variant not supported.') - defaults = dict( - lr=lr, - bias_correction=bias_correction, - betas=betas, - eps=eps, - weight_decay=weight_decay, - max_grad_norm=max_grad_norm, - ) - super(FusedAdam, self).__init__(params, defaults) - self.eps_mode = 0 if eps_inside_sqrt else 1 - - def step(self, closure=None, grads=None, output_params=None, scale=1.0, grad_norms=None): - """Performs a single optimization step. - Arguments: - closure (callable, optional): A closure that reevaluates the model - and returns the loss. - grads (list of tensors, optional): weight gradient to use for the - optimizer update. If gradients have type torch.half, parameters - are expected to be in type torch.float. (default: None) - output params (list of tensors, optional): A reduced precision copy - of the updated weights written out in addition to the regular - updated weights. Have to be of same type as gradients. - (default: None) - scale (float, optional): factor to divide gradient tensor values - by before applying to weights. (default: 1) - """ - loss = None - if closure is not None: - loss = closure() - - if grads is None: - grads_group = [None] * len(self.param_groups) - # backward compatibility - # assuming a list/generator of parameter means single group - elif isinstance(grads, types.GeneratorType): - grads_group = [grads] - elif not isinstance(grads[0], list): - grads_group = [grads] - else: - grads_group = grads - - if output_params is None: - output_params_group = [None] * len(self.param_groups) - elif isinstance(output_params, types.GeneratorType): - output_params_group = [output_params] - elif not isinstance(output_params[0], list): - output_params_group = [output_params] - else: - output_params_group = output_params - - if grad_norms is None: - grad_norms = [None] * len(self.param_groups) - - for group, grads_this_group, output_params_this_group, grad_norm in zip( - self.param_groups, grads_group, output_params_group, grad_norms - ): - if grads_this_group is None: - grads_this_group = [None] * len(group['params']) - if output_params_this_group is None: - output_params_this_group = [None] * len(group['params']) - - # compute combined scale factor for this group - combined_scale = scale - if group['max_grad_norm'] > 0: - # norm is in fact norm*scale - clip = ((grad_norm / scale) + 1e-6) / group['max_grad_norm'] - if clip > 1: - combined_scale = clip * scale - - bias_correction = 1 if group['bias_correction'] else 0 - - for p, grad, output_param in zip(group['params'], grads_this_group, output_params_this_group): - # note: p.grad should not ever be set for correct operation of - # mixed precision optimizer that sometimes sends None gradients - if p.grad is None and grad is None: - continue - if grad is None: - grad = p.grad.data - if grad.is_sparse: - raise RuntimeError('sparse gradient not supported') - - state = self.state[p] - - # State initialization - if len(state) == 0: - state['step'] = 0 - # Exponential moving average of gradient values - state['exp_avg'] = torch.zeros_like(p.data) - # Exponential moving average of squared gradient values - state['exp_avg_sq'] = torch.zeros_like(p.data) - - exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] - beta1, beta2 = group['betas'] - - state['step'] += 1 - - out_p = torch.tensor([], dtype=torch.float) if output_param is None else output_param - fused_adam_cuda.adam( - p.data, - out_p, - exp_avg, - exp_avg_sq, - grad, - group['lr'], - beta1, - beta2, - group['eps'], - combined_scale, - state['step'], - self.eps_mode, - bias_correction, - group['weight_decay'], - ) - return loss - - class AdaFactorFairSeq(torch.optim.Optimizer): """Implements Adafactor algorithm.