Loading megatron/optimizer/__init__.py +2 −4 Original line number Diff line number Diff line Loading @@ -13,11 +13,9 @@ # See the License for the specific language governing permissions and # limitations under the License. #from apex.optimizers import FusedAdam as Adam from apex.optimizers import FusedAdam as Adam from apex.optimizers import FusedSGD as SGD #from apex.optimizers import FusedLAMB as Lamb from deepspeed.ops.adam import FusedAdam as Adam from deepspeed.ops.lamb import FusedLamb as Lamb from apex.optimizers import FusedLAMB as Lamb from megatron import get_args from megatron.model import LayerNorm Loading megatron/optimizer/clip_grads.py +4 −7 Original line number Diff line number Diff line Loading @@ -18,12 +18,9 @@ import torch from torch._six import inf #from apex.multi_tensor_apply import multi_tensor_applier from deepspeed.ops.adam.multi_tensor_apply import MultiTensorApply multi_tensor_applier=MultiTensorApply(2048*32) from apex.multi_tensor_apply import multi_tensor_applier #import amp_C from deepspeed.ops.multi_tensor import multi_tensor_l2norm, multi_tensor_scale import amp_C from megatron import mpu from megatron.model.module import param_is_not_shared Loading Loading @@ -93,7 +90,7 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2): # Multi-tensor applier takes a function and a list of list # and performs the operation on that list all in one kernel. grad_norm, _ = multi_tensor_applier( multi_tensor_l2norm, amp_C.multi_tensor_l2norm, dummy_overflow_buf, [grads_for_norm], False # no per-parameter norm Loading @@ -117,7 +114,7 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2): clip_coeff = max_norm / (total_norm + 1.0e-6) if clip_coeff < 1.0: dummy_overflow_buf = torch.cuda.IntTensor([0]) multi_tensor_applier(multi_tensor_scale, multi_tensor_applier(amp_C.multi_tensor_scale, dummy_overflow_buf, [grads, grads], clip_coeff) Loading megatron/optimizer/optimizer.py +3 −6 Original line number Diff line number Diff line Loading @@ -20,12 +20,9 @@ from abc import abstractmethod import torch #from apex.multi_tensor_apply import multi_tensor_applier from deepspeed.ops.adam.multi_tensor_apply import MultiTensorApply multi_tensor_applier=MultiTensorApply(2048*32) from apex.multi_tensor_apply import multi_tensor_applier #import amp_C from deepspeed.ops.multi_tensor import multi_tensor_l2norm, multi_tensor_scale import amp_C from megatron import get_timers from megatron import mpu Loading Loading @@ -57,7 +54,7 @@ def _multi_tensor_copy_this_to_that(this, that, overflow_buf=None): if overflow_buf: overflow_buf.fill_(0) # Scaling with factor `1.0` is equivalent to copy. multi_tensor_applier(multi_tensor_scale, multi_tensor_applier(amp_C.multi_tensor_scale, overflow_buf, [this, that], 1.0) Loading megatron/utils.py +3 −5 Original line number Diff line number Diff line Loading @@ -20,10 +20,8 @@ import sys import torch from torch.nn.parallel import DistributedDataParallel as torchDDP #from apex.multi_tensor_apply import multi_tensor_applier from deepspeed.ops.adam.multi_tensor_apply import MultiTensorApply multi_tensor_applier=MultiTensorApply(2048*32) #import amp_C from apex.multi_tensor_apply import multi_tensor_applier import amp_C from megatron import get_args from megatron import print_rank_0 Loading Loading @@ -67,7 +65,7 @@ def calc_params_l2_norm(model): # Calculate norm dummy_overflow_buf = torch.cuda.IntTensor([0]) norm, _ = multi_tensor_applier( multi_tensor_l2norm, amp_C.multi_tensor_l2norm, dummy_overflow_buf, [params_data], False # no per-parameter norm Loading Loading
megatron/optimizer/__init__.py +2 −4 Original line number Diff line number Diff line Loading @@ -13,11 +13,9 @@ # See the License for the specific language governing permissions and # limitations under the License. #from apex.optimizers import FusedAdam as Adam from apex.optimizers import FusedAdam as Adam from apex.optimizers import FusedSGD as SGD #from apex.optimizers import FusedLAMB as Lamb from deepspeed.ops.adam import FusedAdam as Adam from deepspeed.ops.lamb import FusedLamb as Lamb from apex.optimizers import FusedLAMB as Lamb from megatron import get_args from megatron.model import LayerNorm Loading
megatron/optimizer/clip_grads.py +4 −7 Original line number Diff line number Diff line Loading @@ -18,12 +18,9 @@ import torch from torch._six import inf #from apex.multi_tensor_apply import multi_tensor_applier from deepspeed.ops.adam.multi_tensor_apply import MultiTensorApply multi_tensor_applier=MultiTensorApply(2048*32) from apex.multi_tensor_apply import multi_tensor_applier #import amp_C from deepspeed.ops.multi_tensor import multi_tensor_l2norm, multi_tensor_scale import amp_C from megatron import mpu from megatron.model.module import param_is_not_shared Loading Loading @@ -93,7 +90,7 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2): # Multi-tensor applier takes a function and a list of list # and performs the operation on that list all in one kernel. grad_norm, _ = multi_tensor_applier( multi_tensor_l2norm, amp_C.multi_tensor_l2norm, dummy_overflow_buf, [grads_for_norm], False # no per-parameter norm Loading @@ -117,7 +114,7 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2): clip_coeff = max_norm / (total_norm + 1.0e-6) if clip_coeff < 1.0: dummy_overflow_buf = torch.cuda.IntTensor([0]) multi_tensor_applier(multi_tensor_scale, multi_tensor_applier(amp_C.multi_tensor_scale, dummy_overflow_buf, [grads, grads], clip_coeff) Loading
megatron/optimizer/optimizer.py +3 −6 Original line number Diff line number Diff line Loading @@ -20,12 +20,9 @@ from abc import abstractmethod import torch #from apex.multi_tensor_apply import multi_tensor_applier from deepspeed.ops.adam.multi_tensor_apply import MultiTensorApply multi_tensor_applier=MultiTensorApply(2048*32) from apex.multi_tensor_apply import multi_tensor_applier #import amp_C from deepspeed.ops.multi_tensor import multi_tensor_l2norm, multi_tensor_scale import amp_C from megatron import get_timers from megatron import mpu Loading Loading @@ -57,7 +54,7 @@ def _multi_tensor_copy_this_to_that(this, that, overflow_buf=None): if overflow_buf: overflow_buf.fill_(0) # Scaling with factor `1.0` is equivalent to copy. multi_tensor_applier(multi_tensor_scale, multi_tensor_applier(amp_C.multi_tensor_scale, overflow_buf, [this, that], 1.0) Loading
megatron/utils.py +3 −5 Original line number Diff line number Diff line Loading @@ -20,10 +20,8 @@ import sys import torch from torch.nn.parallel import DistributedDataParallel as torchDDP #from apex.multi_tensor_apply import multi_tensor_applier from deepspeed.ops.adam.multi_tensor_apply import MultiTensorApply multi_tensor_applier=MultiTensorApply(2048*32) #import amp_C from apex.multi_tensor_apply import multi_tensor_applier import amp_C from megatron import get_args from megatron import print_rank_0 Loading Loading @@ -67,7 +65,7 @@ def calc_params_l2_norm(model): # Calculate norm dummy_overflow_buf = torch.cuda.IntTensor([0]) norm, _ = multi_tensor_applier( multi_tensor_l2norm, amp_C.multi_tensor_l2norm, dummy_overflow_buf, [params_data], False # no per-parameter norm Loading