megatron/optimizer/grad_scaler.py  (+1 −1)

@@ -103,7 +103,7 @@ class DynamicGradScaler(MegatronGradScaler):
         if found_inf:
             self._growth_tracker = 0
             self._hysteresis_tracker -= 1
-            # Now if we are our of hysteresis count, scale down the loss.
+            # Now if we are out of hysteresis count, scale down the loss.
             if self._hysteresis_tracker <= 0:
                 self._scale = torch.max(self._scale * self.backoff_factor,
                                         self.min_scale)
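For context, the comment being corrected sits inside the dynamic loss-scaling update. Below is a minimal, self-contained sketch of that hysteresis logic, not the Megatron implementation itself: the class name and the constructor defaults are assumptions, but the tracker/backoff interplay matches the hunk above (consecutive inf/nan steps consume hysteresis credits before the scale is backed off; clean steps eventually grow it).

import torch

class DynamicGradScalerSketch:
    """Sketch of dynamic loss scaling with hysteresis (names/defaults assumed)."""

    def __init__(self, initial_scale=2.**16, min_scale=1.0,
                 growth_factor=2.0, backoff_factor=0.5,
                 growth_interval=1000, hysteresis=2):
        self._scale = torch.tensor([initial_scale])
        self.min_scale = torch.tensor([min_scale])
        self.growth_factor = growth_factor
        self.backoff_factor = backoff_factor
        self.growth_interval = growth_interval
        self.hysteresis = hysteresis
        self._growth_tracker = 0
        self._hysteresis_tracker = hysteresis

    def update(self, found_inf):
        if found_inf:
            # An inf/nan resets growth and burns one hysteresis credit.
            self._growth_tracker = 0
            self._hysteresis_tracker -= 1
            # Now if we are out of hysteresis count, scale down the loss.
            if self._hysteresis_tracker <= 0:
                self._scale = torch.max(self._scale * self.backoff_factor,
                                        self.min_scale)
        else:
            # Count consecutive clean iterations; after growth_interval of
            # them, restore hysteresis credits and grow the scale.
            self._growth_tracker += 1
            if self._growth_tracker == self.growth_interval:
                self._growth_tracker = 0
                self._hysteresis_tracker = self.hysteresis
                self._scale = self._scale * self.growth_factor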
megatron/optimizer/optimizer.py  (+1 −1)

@@ -375,7 +375,7 @@ class FP32Optimizer(MegatronOptimizer):
     @torch.no_grad()
     def step(self):
         """Clip gradients (if needed) and step the base optimizer.
-        Always return auccessful since there is no overflow."""
+        Always return successful since there is no overflow."""
 
         # Clip gradients.
         if self.clip_grad > 0.0:
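The corrected docstring states the whole contract of FP32Optimizer.step: FP32 gradients cannot overflow, so the step always reports success. Here is a minimal sketch of that contract, using torch.nn.utils.clip_grad_norm_ as a stand-in for Megatron's model-parallel-aware clipping; the class and attribute names below are assumptions, not the library's API.

import torch

class FP32OptimizerSketch:
    """Sketch: FP32 params never overflow, so step() always reports success."""

    def __init__(self, optimizer, clip_grad=0.0):
        self.optimizer = optimizer
        self.clip_grad = clip_grad

    @torch.no_grad()
    def step(self):
        # Clip gradients (if needed) and step the base optimizer.
        if self.clip_grad > 0.0:
            params = [p for g in self.optimizer.param_groups for p in g['params']]
            torch.nn.utils.clip_grad_norm_(params, self.clip_grad)
        self.optimizer.step()
        # Always return successful since there is no overflow in FP32.
        return True

# Usage sketch
model = torch.nn.Linear(8, 8)
opt = FP32OptimizerSketch(torch.optim.SGD(model.parameters(), lr=0.01), clip_grad=1.0)
model(torch.randn(2, 8)).sum().backward()
assert opt.step() is True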
megatron/training.py  (+3 −0)

@@ -183,6 +183,9 @@ def get_model(model_provider_func):
     model = model_provider_func()
 
     # Set tensor model parallel attributes if not set.
+    # Only parameters that are already tensor model parallel have these
+    # attributes set for them. We should make sure the default attributes
+    # are set for all params so the optimizer can use them.
     for param in model.parameters():
         mpu.set_defaults_if_not_set_tensor_model_parallel_attributes(param)
 
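The new comment explains why every parameter, not only the tensor-parallel ones, needs default tensor-model-parallel attributes: the optimizer later reads them uniformly (for example when deciding which parameters to count for gradient clipping). Below is a rough sketch of what such a defaulting helper does; the attribute names and default values are assumptions for illustration, and the helper name is hypothetical (the actual call is mpu.set_defaults_if_not_set_tensor_model_parallel_attributes).

import torch

# Assumed default tensor-model-parallel attributes; only missing ones are set,
# so attributes already assigned by tensor-parallel layers are left untouched.
_TP_ATTRIBUTE_DEFAULTS = {'tensor_model_parallel': False,
                          'partition_dim': -1,
                          'partition_stride': 1}

def set_defaults_if_not_set_tp_attributes(param):
    """Give a parameter the TP attributes the optimizer later reads."""
    for name, value in _TP_ATTRIBUTE_DEFAULTS.items():
        if not hasattr(param, name):
            setattr(param, name, value)

# Usage sketch: after building the model, tag all parameters so later code
# can check `param.tensor_model_parallel` without hasattr() guards.
model = torch.nn.Linear(4, 4)
for param in model.parameters():
    set_defaults_if_not_set_tp_attributes(param)
assert all(hasattr(p, 'tensor_model_parallel') for p in model.parameters())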