megatron/optimizer/__init__.py  +10 −1

@@ -13,8 +13,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from apex.optimizers import FusedAdam as Adam
+#from apex.optimizers import FusedAdam as Adam
 from apex.optimizers import FusedSGD as SGD
+#from apex.optimizers import FusedLAMB as Lamb
+from deepspeed.ops.adam import FusedAdam as Adam
+from deepspeed.ops.lamb import FusedLamb as Lamb
 
 from megatron import get_args
 from megatron.model import LayerNorm
@@ -63,6 +66,12 @@ def get_megatron_optimizer(model):
                         lr=args.lr,
                         weight_decay=args.weight_decay,
                         momentum=args.sgd_momentum)
+    elif args.optimizer == 'lamb':
+        optimizer = Lamb(param_groups,
+                         lr=args.lr,
+                         weight_decay=args.weight_decay,
+                         betas=(args.adam_beta1, args.adam_beta2),
+                         eps=args.adam_eps)
     else:
         raise Exception('{} optimizer is not supported.'.format(
             args.optimizer))
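For context, the hunk above extends Megatron's optimizer dispatch with a 'lamb' branch backed by DeepSpeed's FusedLamb. Below is a minimal, self-contained sketch of that dispatch outside Megatron's argument parsing: build_optimizer is a hypothetical helper, plain torch.optim.SGD stands in for apex's FusedSGD to avoid the apex dependency, and the DeepSpeed import paths are the ones used in this patch.

# Minimal sketch of the optimizer selection added above (hypothetical helper,
# not Megatron's get_megatron_optimizer). Assumes DeepSpeed is installed and
# its fused Adam/Lamb ops can be built on this machine.
import torch
from deepspeed.ops.adam import FusedAdam as Adam
from deepspeed.ops.lamb import FusedLamb as Lamb


def build_optimizer(param_groups, name='adam', lr=1.0e-4, weight_decay=0.01,
                    betas=(0.9, 0.999), eps=1.0e-8, momentum=0.9):
    if name == 'adam':
        return Adam(param_groups, lr=lr, weight_decay=weight_decay,
                    betas=betas, eps=eps)
    elif name == 'sgd':
        # torch.optim.SGD used here instead of apex FusedSGD to keep the
        # sketch free of the apex dependency.
        return torch.optim.SGD(param_groups, lr=lr,
                               weight_decay=weight_decay, momentum=momentum)
    elif name == 'lamb':
        return Lamb(param_groups, lr=lr, weight_decay=weight_decay,
                    betas=betas, eps=eps)
    raise Exception('{} optimizer is not supported.'.format(name))


# Example usage:
#   model = torch.nn.Linear(16, 16).cuda()
#   opt = build_optimizer(model.parameters(), name='lamb', lr=2.0e-4)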
megatron/optimizer/clip_grads.py  +8 −4

@@ -18,8 +18,12 @@
 import torch
 from torch._six import inf
 
-from apex.multi_tensor_apply import multi_tensor_applier
-import amp_C
+#from apex.multi_tensor_apply import multi_tensor_applier
+from deepspeed.ops.adam.multi_tensor_apply import MultiTensorApply
+multi_tensor_applier = MultiTensorApply(2048 * 32)
+#import amp_C
+from deepspeed.ops.multi_tensor import multi_tensor_l2norm, multi_tensor_scale
 
 from megatron import mpu
 from megatron.model.module import param_is_not_shared
@@ -89,7 +93,7 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
         # Multi-tensor applier takes a function and a list of list
         # and performs the operation on that list all in one kernel.
         grad_norm, _ = multi_tensor_applier(
-            amp_C.multi_tensor_l2norm,
+            multi_tensor_l2norm,
             dummy_overflow_buf,
             [grads_for_norm],
             False # no per-parameter norm
@@ -113,7 +117,7 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
     clip_coeff = max_norm / (total_norm + 1.0e-6)
     if clip_coeff < 1.0:
         dummy_overflow_buf = torch.cuda.IntTensor([0])
-        multi_tensor_applier(amp_C.multi_tensor_scale,
+        multi_tensor_applier(multi_tensor_scale,
                              dummy_overflow_buf,
                              [grads, grads],
                              clip_coeff)
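The two changed call sites above only swap the kernel handles; the computation is unchanged. As a reference for what the fused calls do, here is an unfused pure-PyTorch sketch of the same norm-and-clip step (hypothetical helper name, no multi-tensor kernel launch), following the clip_coeff formula visible in the last hunk.

import torch


def clip_grad_norm_unfused(grads, max_norm):
    """Unfused reference for the fused l2norm + scale path above."""
    # What multi_tensor_l2norm computes here: one global L2 norm over all gradients.
    total_norm = torch.norm(
        torch.stack([torch.norm(g.detach(), 2.0) for g in grads]), 2.0)
    # What multi_tensor_scale is used for here: scale every gradient in place
    # by clip_coeff = max_norm / (total_norm + 1.0e-6) when clipping is needed.
    clip_coeff = max_norm / (total_norm + 1.0e-6)
    if clip_coeff < 1.0:
        for g in grads:
            g.detach().mul_(clip_coeff)
    return total_norm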
megatron/optimizer/optimizer.py  +7 −3

@@ -20,8 +20,12 @@
 from abc import abstractmethod
 
 import torch
-from apex.multi_tensor_apply import multi_tensor_applier
-import amp_C
+#from apex.multi_tensor_apply import multi_tensor_applier
+from deepspeed.ops.adam.multi_tensor_apply import MultiTensorApply
+multi_tensor_applier = MultiTensorApply(2048 * 32)
+#import amp_C
+from deepspeed.ops.multi_tensor import multi_tensor_l2norm, multi_tensor_scale
 
 from megatron import get_timers
 from megatron import mpu
@@ -53,7 +57,7 @@ def _multi_tensor_copy_this_to_that(this, that, overflow_buf=None):
     if overflow_buf:
         overflow_buf.fill_(0)
         # Scaling with factor `1.0` is equivalent to copy.
-        multi_tensor_applier(amp_C.multi_tensor_scale,
+        multi_tensor_applier(multi_tensor_scale,
                              overflow_buf,
                              [this, that],
                              1.0)
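In this call site multi_tensor_scale is applied with a factor of 1.0, so the fused call amounts to a bulk copy of the tensors in `this` into `that`. A small unfused sketch of that behavior (hypothetical helper name):

import torch


def copy_this_to_that_unfused(this, that):
    # Unfused equivalent of multi_tensor_applier(multi_tensor_scale, ..., 1.0):
    # copy each tensor in `this` element-wise into the matching tensor in `that`.
    for src, dst in zip(this, that):
        dst.copy_(src)


# Example usage:
#   a = [torch.randn(4), torch.randn(3)]
#   b = [torch.empty(4), torch.empty(3)]
#   copy_this_to_that_unfused(a, b)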
megatron/utils.py  +6 −3

@@ -20,8 +20,11 @@
 import sys
 
 import torch
 from torch.nn.parallel import DistributedDataParallel as torchDDP
-from apex.multi_tensor_apply import multi_tensor_applier
-import amp_C
+#from apex.multi_tensor_apply import multi_tensor_applier
+from deepspeed.ops.adam.multi_tensor_apply import MultiTensorApply
+multi_tensor_applier = MultiTensorApply(2048 * 32)
+#import amp_C
+from deepspeed.ops.multi_tensor import multi_tensor_l2norm
 
 from megatron import get_args
 from megatron import print_rank_0
@@ -65,7 +68,7 @@ def calc_params_l2_norm(model):
     # Calculate norm
     dummy_overflow_buf = torch.cuda.IntTensor([0])
     norm, _ = multi_tensor_applier(
-        amp_C.multi_tensor_l2norm,
+        multi_tensor_l2norm,
        dummy_overflow_buf,
        [params_data],
        False # no per-parameter norm
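As in clip_grads.py, the fused call in calc_params_l2_norm reduces a list of tensors to a single global L2 norm, here over parameter data rather than gradients. An unfused sketch (hypothetical helper name; the fp32 cast is an assumption, reflecting the intent of a numerically stable norm over possibly fp16 parameters):

import torch


def params_l2_norm_unfused(params):
    # Unfused reference for the multi_tensor_l2norm call above: the L2 norm of
    # the concatenation of all parameter tensors, accumulated in fp32.
    return torch.norm(
        torch.stack([torch.norm(p.detach().float(), 2.0) for p in params]), 2.0)


# Example usage:
#   model = torch.nn.Linear(8, 8)
#   print(params_l2_norm_unfused(list(model.parameters())))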