megatron/optimizer/grad_scaler.py  (+1 −1)

@@ -103,7 +103,7 @@ class DynamicGradScaler(MegatronGradScaler):
         if found_inf:
             self._growth_tracker = 0
             self._hysteresis_tracker -= 1
-            # Now if we are our of hysteresis count, scale down the loss.
+            # Now if we are out of hysteresis count, scale down the loss.
             if self._hysteresis_tracker <= 0:
                 self._scale = torch.max(self._scale * self.backoff_factor,
                                         self.min_scale)
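For context, the comment being corrected sits inside the dynamic loss-scaling update. Below is a minimal, self-contained sketch of that hysteresis logic, not the Megatron implementation itself: the class name and the constructor defaults are assumptions, but the tracker/backoff interplay matches the hunk above (consecutive inf/nan steps consume hysteresis credits before the scale is backed off; clean steps eventually grow it).

import torch

class DynamicGradScalerSketch:
    """Sketch of dynamic loss scaling with hysteresis (names/defaults assumed)."""

    def __init__(self, initial_scale=2.**16, min_scale=1.0,
                 growth_factor=2.0, backoff_factor=0.5,
                 growth_interval=1000, hysteresis=2):
        self._scale = torch.tensor([initial_scale])
        self.min_scale = torch.tensor([min_scale])
        self.growth_factor = growth_factor
        self.backoff_factor = backoff_factor
        self.growth_interval = growth_interval
        self.hysteresis = hysteresis
        self._growth_tracker = 0
        self._hysteresis_tracker = hysteresis

    def update(self, found_inf):
        if found_inf:
            # An inf/nan resets growth and burns one hysteresis credit.
            self._growth_tracker = 0
            self._hysteresis_tracker -= 1
            # Now if we are out of hysteresis count, scale down the loss.
            if self._hysteresis_tracker <= 0:
                self._scale = torch.max(self._scale * self.backoff_factor,
                                        self.min_scale)
        else:
            # Count consecutive clean iterations; after growth_interval of
            # them, restore hysteresis credits and grow the scale.
            self._growth_tracker += 1
            if self._growth_tracker == self.growth_interval:
                self._growth_tracker = 0
                self._hysteresis_tracker = self.hysteresis
                self._scale = self._scale * self.growth_factor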
megatron/optimizer/optimizer.py  (+1 −1)

@@ -375,7 +375,7 @@ class FP32Optimizer(MegatronOptimizer):
     @torch.no_grad()
     def step(self):
         """Clip gradients (if needed) and step the base optimizer.
-        Always return auccessful since there is no overflow."""
+        Always return successful since there is no overflow."""
 
         # Clip gradients.
         if self.clip_grad > 0.0:
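The corrected docstring states the whole contract of FP32Optimizer.step: FP32 gradients cannot overflow, so the step always reports success. Here is a minimal sketch of that contract, using torch.nn.utils.clip_grad_norm_ as a stand-in for Megatron's model-parallel-aware clipping; the class and attribute names below are assumptions, not the library's API.

import torch

class FP32OptimizerSketch:
    """Sketch: FP32 params never overflow, so step() always reports success."""

    def __init__(self, optimizer, clip_grad=0.0):
        self.optimizer = optimizer
        self.clip_grad = clip_grad

    @torch.no_grad()
    def step(self):
        # Clip gradients (if needed) and step the base optimizer.
        if self.clip_grad > 0.0:
            params = [p for g in self.optimizer.param_groups for p in g['params']]
            torch.nn.utils.clip_grad_norm_(params, self.clip_grad)
        self.optimizer.step()
        # Always return successful since there is no overflow in FP32.
        return True

# Usage sketch
model = torch.nn.Linear(8, 8)
opt = FP32OptimizerSketch(torch.optim.SGD(model.parameters(), lr=0.01), clip_grad=1.0)
model(torch.randn(2, 8)).sum().backward()
assert opt.step() is True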
megatron/training.py  (+3 −0)

@@ -183,6 +183,9 @@ def get_model(model_provider_func):
     model = model_provider_func()
 
     # Set tensor model parallel attributes if not set.
+    # Only parameters that are already tensor model parallel have these
+    # attributes set for them. We should make sure the default attributes
+    # are set for all params so the optimizer can use them.
     for param in model.parameters():
         mpu.set_defaults_if_not_set_tensor_model_parallel_attributes(param)
 
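The new comment explains why every parameter, not only the tensor-parallel ones, needs default tensor-model-parallel attributes: the optimizer later reads them uniformly (for example when deciding which parameters to count for gradient clipping). Below is a rough sketch of what such a defaulting helper does; the attribute names and default values are assumptions for illustration, and the helper name is hypothetical (the actual call is mpu.set_defaults_if_not_set_tensor_model_parallel_attributes).

import torch

# Assumed default tensor-model-parallel attributes; only missing ones are set,
# so attributes already assigned by tensor-parallel layers are left untouched.
_TP_ATTRIBUTE_DEFAULTS = {'tensor_model_parallel': False,
                          'partition_dim': -1,
                          'partition_stride': 1}

def set_defaults_if_not_set_tp_attributes(param):
    """Give a parameter the TP attributes the optimizer later reads."""
    for name, value in _TP_ATTRIBUTE_DEFAULTS.items():
        if not hasattr(param, name):
            setattr(param, name, value)

# Usage sketch: after building the model, tag all parameters so later code
# can check `param.tensor_model_parallel` without hasattr() guards.
model = torch.nn.Linear(4, 4)
for param in model.parameters():
    set_defaults_if_not_set_tp_attributes(param)
assert all(hasattr(p, 'tensor_model_parallel') for p in model.parameters())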