Commit 43529f78 authored by mohammad

Addressed Deepak's comments

parent 242770dd
+1 −1
@@ -103,7 +103,7 @@ class DynamicGradScaler(MegatronGradScaler):
         if found_inf:
             self._growth_tracker = 0
             self._hysteresis_tracker -= 1
-            # Now if we are our of hysteresis count, scale down the loss.
+            # Now if we are out of hysteresis count, scale down the loss.
             if self._hysteresis_tracker <= 0:
                 self._scale = torch.max(self._scale * self.backoff_factor,
                                         self.min_scale)
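For context, here is a minimal runnable sketch of the hysteresis scheme this hunk's comment describes: an overflow resets the growth tracker and spends one hysteresis credit, and only when the credits are exhausted is the scale backed off. The growth branch and all constructor defaults below are assumptions modeled on common dynamic loss scalers, not Megatron's exact `DynamicGradScaler`.

```python
import torch

class SimpleDynamicScaler:
    """Illustrative stand-in for a hysteresis-based dynamic grad scaler."""

    def __init__(self, initial_scale=2.0 ** 16, min_scale=1.0,
                 backoff_factor=0.5, growth_factor=2.0,
                 growth_interval=1000, hysteresis=2):
        # All defaults here are assumed, typical values.
        self._scale = torch.tensor([initial_scale])
        self.min_scale = torch.tensor([min_scale])
        self.backoff_factor = backoff_factor
        self.growth_factor = growth_factor
        self.growth_interval = growth_interval
        self.hysteresis = hysteresis
        self._growth_tracker = 0
        self._hysteresis_tracker = hysteresis

    def update(self, found_inf):
        if found_inf:
            # An overflow resets growth progress and spends one credit.
            self._growth_tracker = 0
            self._hysteresis_tracker -= 1
            # Now if we are out of hysteresis count, scale down the loss.
            if self._hysteresis_tracker <= 0:
                self._scale = torch.max(self._scale * self.backoff_factor,
                                        self.min_scale)
        else:
            # A clean step counts toward growth; after enough clean steps,
            # grow the scale and restore the hysteresis credits.
            self._growth_tracker += 1
            if self._growth_tracker == self.growth_interval:
                self._growth_tracker = 0
                self._hysteresis_tracker = self.hysteresis
                self._scale = self._scale * self.growth_factor
```

The hysteresis credits keep a single stray overflow from immediately halving the scale; only repeated overflows trigger the backoff.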
+1 −1
@@ -375,7 +375,7 @@ class FP32Optimizer(MegatronOptimizer):
     @torch.no_grad()
     def step(self):
         """Clip gradients (if needed) and step the base optimizer.
-        Always return auccessful since there is no overflow."""
+        Always return successful since there is no overflow."""
 
         # Clip gradients.
         if self.clip_grad > 0.0:
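The docstring being fixed states the FP32 step contract: clip gradients if requested, step the wrapped optimizer, and always report success because pure-FP32 training has no loss-scale overflow to detect. A hedged sketch of that pattern, with `optimizer`, `parameters`, and `clip_grad` as stand-ins for the class's own state:

```python
import torch
from torch.nn.utils import clip_grad_norm_

@torch.no_grad()
def fp32_step(optimizer, parameters, clip_grad):
    # Clip gradients (if needed) and step the base optimizer.
    if clip_grad > 0.0:
        clip_grad_norm_(parameters, clip_grad)
    optimizer.step()
    # Always report success: there is no overflow in FP32.
    return True
```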
+3 −0
@@ -183,6 +183,9 @@ def get_model(model_provider_func):
     model = model_provider_func()
 
     # Set tensor model parallel attributes if not set.
+    # Only parameters that are already tensor model parallel have these
+    # attributes set for them. We should make sure the default attributes
+    # are set for all params so the optimizer can use them.
     for param in model.parameters():
         mpu.set_defaults_if_not_set_tensor_model_parallel_attributes(param)
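The added comment explains the motivation: only parameters created by tensor-model-parallel layers carry these attributes, yet the optimizer reads them from every parameter, so defaults must be filled in for the rest. A hedged sketch of what such a defaulting helper might do; the attribute names and default values follow Megatron's mpu conventions but are stated here as assumptions rather than the library's exact code:

```python
import torch

def set_defaults_if_not_set(param):
    # Assumed attribute names/defaults; replicated params are tagged as
    # not tensor-parallel, with no sharded dimension and unit stride.
    if not hasattr(param, 'tensor_model_parallel'):
        param.tensor_model_parallel = False
    if not hasattr(param, 'partition_dim'):
        param.partition_dim = -1
    if not hasattr(param, 'partition_stride'):
        param.partition_stride = 1

# Usage: tag every parameter of an ordinary (non-parallel) layer.
layer = torch.nn.Linear(4, 4)
for p in layer.parameters():
    set_defaults_if_not_set(p)
assert layer.weight.tensor_model_parallel is False
```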