Loading megatron/optimizer/__init__.py +3 −1 Original line number Diff line number Diff line Loading @@ -100,10 +100,12 @@ def get_megatron_optimizer(model): args.clip_grad, args.log_num_zeros_in_grad, params_have_main_grad, args.use_contiguous_buffers_in_ddp, args.bf16, grad_scaler) # FP32. return FP32Optimizer(optimizer, args.clip_grad, args.log_num_zeros_in_grad, params_have_main_grad) params_have_main_grad, args.use_contiguous_buffers_in_ddp) megatron/optimizer/optimizer.py +29 −5 Original line number Diff line number Diff line Loading @@ -68,7 +68,9 @@ class MegatronOptimizer(ABC): def __init__(self, optimizer, clip_grad, log_num_zeros_in_grad, params_have_main_grad): params_have_main_grad, use_contiguous_buffers_in_ddp): """Input optimizer is the base optimizer for example Adam.""" self.optimizer = optimizer assert self.optimizer, 'no optimizer is provided.' Loading @@ -76,6 +78,7 @@ class MegatronOptimizer(ABC): self.clip_grad = clip_grad self.log_num_zeros_in_grad = log_num_zeros_in_grad self.params_have_main_grad = params_have_main_grad self.use_contiguous_buffers_in_ddp = use_contiguous_buffers_in_ddp def get_parameters(self): Loading Loading @@ -187,11 +190,12 @@ class Float16OptimizerWithFloat16Params(MegatronOptimizer): """ def __init__(self, optimizer, clip_grad, log_num_zeros_in_grad, params_have_main_grad, bf16, grad_scaler): params_have_main_grad, use_contiguous_buffers_in_ddp, bf16, grad_scaler): super(Float16OptimizerWithFloat16Params, self).__init__( optimizer, clip_grad, log_num_zeros_in_grad, params_have_main_grad) params_have_main_grad, use_contiguous_buffers_in_ddp) self.bf16 = bf16 self.grad_scaler = grad_scaler Loading Loading @@ -305,12 +309,25 @@ class Float16OptimizerWithFloat16Params(MegatronOptimizer): else: if model_param.grad is not None: main_param.grad = model_param.grad.float() # Safe to deallocate model's grad/main_grad after copying. # (If using contiguous buffers, main_grad's memory should # persist and therefore should not be deallocated.) model_param.grad = None if not self.use_contiguous_buffers_in_ddp: model_param.main_grad = None # For fp32 grads, we need to reset the grads to main grad. if self.params_have_main_grad: for model_group in self.fp32_from_fp32_groups: for model_param in model_group: model_param.grad = model_param.main_grad # Safe to de-reference model's main_grad after copying. # (If using contiguous buffers, main_grad's memory should # persist and therefore should not be deallocated.) if not self.use_contiguous_buffers_in_ddp: model_param.main_grad = None def _unscale_main_grads_and_check_for_nan(self): main_grads = [] Loading Loading @@ -464,11 +481,12 @@ class FP32Optimizer(MegatronOptimizer): def __init__(self, optimizer, clip_grad, log_num_zeros_in_grad, params_have_main_grad): params_have_main_grad, use_contiguous_buffers_in_ddp): super(FP32Optimizer, self).__init__( optimizer, clip_grad, log_num_zeros_in_grad, params_have_main_grad) params_have_main_grad, use_contiguous_buffers_in_ddp) self._scale = torch.cuda.FloatTensor([1.0]) Loading @@ -495,6 +513,12 @@ class FP32Optimizer(MegatronOptimizer): for param in param_group['params']: param.grad = param.main_grad # Safe to de-reference model's main_grad after copying. # (If using contiguous buffers, main_grad's memory should # persist and therefore should not be deallocated.) if not self.use_contiguous_buffers_in_ddp: param.main_grad = None # Clip gradients. grad_norm = None if self.clip_grad > 0.0: Loading Loading
megatron/optimizer/__init__.py +3 −1 Original line number Diff line number Diff line Loading @@ -100,10 +100,12 @@ def get_megatron_optimizer(model): args.clip_grad, args.log_num_zeros_in_grad, params_have_main_grad, args.use_contiguous_buffers_in_ddp, args.bf16, grad_scaler) # FP32. return FP32Optimizer(optimizer, args.clip_grad, args.log_num_zeros_in_grad, params_have_main_grad) params_have_main_grad, args.use_contiguous_buffers_in_ddp)
megatron/optimizer/optimizer.py +29 −5 Original line number Diff line number Diff line Loading @@ -68,7 +68,9 @@ class MegatronOptimizer(ABC): def __init__(self, optimizer, clip_grad, log_num_zeros_in_grad, params_have_main_grad): params_have_main_grad, use_contiguous_buffers_in_ddp): """Input optimizer is the base optimizer for example Adam.""" self.optimizer = optimizer assert self.optimizer, 'no optimizer is provided.' Loading @@ -76,6 +78,7 @@ class MegatronOptimizer(ABC): self.clip_grad = clip_grad self.log_num_zeros_in_grad = log_num_zeros_in_grad self.params_have_main_grad = params_have_main_grad self.use_contiguous_buffers_in_ddp = use_contiguous_buffers_in_ddp def get_parameters(self): Loading Loading @@ -187,11 +190,12 @@ class Float16OptimizerWithFloat16Params(MegatronOptimizer): """ def __init__(self, optimizer, clip_grad, log_num_zeros_in_grad, params_have_main_grad, bf16, grad_scaler): params_have_main_grad, use_contiguous_buffers_in_ddp, bf16, grad_scaler): super(Float16OptimizerWithFloat16Params, self).__init__( optimizer, clip_grad, log_num_zeros_in_grad, params_have_main_grad) params_have_main_grad, use_contiguous_buffers_in_ddp) self.bf16 = bf16 self.grad_scaler = grad_scaler Loading Loading @@ -305,12 +309,25 @@ class Float16OptimizerWithFloat16Params(MegatronOptimizer): else: if model_param.grad is not None: main_param.grad = model_param.grad.float() # Safe to deallocate model's grad/main_grad after copying. # (If using contiguous buffers, main_grad's memory should # persist and therefore should not be deallocated.) model_param.grad = None if not self.use_contiguous_buffers_in_ddp: model_param.main_grad = None # For fp32 grads, we need to reset the grads to main grad. if self.params_have_main_grad: for model_group in self.fp32_from_fp32_groups: for model_param in model_group: model_param.grad = model_param.main_grad # Safe to de-reference model's main_grad after copying. # (If using contiguous buffers, main_grad's memory should # persist and therefore should not be deallocated.) if not self.use_contiguous_buffers_in_ddp: model_param.main_grad = None def _unscale_main_grads_and_check_for_nan(self): main_grads = [] Loading Loading @@ -464,11 +481,12 @@ class FP32Optimizer(MegatronOptimizer): def __init__(self, optimizer, clip_grad, log_num_zeros_in_grad, params_have_main_grad): params_have_main_grad, use_contiguous_buffers_in_ddp): super(FP32Optimizer, self).__init__( optimizer, clip_grad, log_num_zeros_in_grad, params_have_main_grad) params_have_main_grad, use_contiguous_buffers_in_ddp) self._scale = torch.cuda.FloatTensor([1.0]) Loading @@ -495,6 +513,12 @@ class FP32Optimizer(MegatronOptimizer): for param in param_group['params']: param.grad = param.main_grad # Safe to de-reference model's main_grad after copying. # (If using contiguous buffers, main_grad's memory should # persist and therefore should not be deallocated.) if not self.use_contiguous_buffers_in_ddp: param.main_grad = None # Clip gradients. grad_norm = None if self.clip_grad > 0.0: Loading