Commit ef777805 authored by Isaac

Adding changes for integrating deepspeed. Not working as of now. Use master or one commit behind this
parent 0aa86dc1
+10 −1
@@ -13,8 +13,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.

-from apex.optimizers import FusedAdam as Adam
+#from apex.optimizers import FusedAdam as Adam
from apex.optimizers import FusedSGD as SGD
+#from apex.optimizers import FusedLAMB as Lamb
+from deepspeed.ops.adam import FusedAdam as Adam
+from deepspeed.ops.lamb import FusedLamb as Lamb

from megatron import get_args
from megatron.model import LayerNorm
@@ -63,6 +66,12 @@ def get_megatron_optimizer(model):
                        lr=args.lr,
                        weight_decay=args.weight_decay,
                        momentum=args.sgd_momentum)
+    elif args.optimizer == 'lamb':
+        optimizer = Lamb(param_groups,
+                         lr=args.lr,
+                         weight_decay=args.weight_decay,
+                         betas=(args.adam_beta1, args.adam_beta2),
+                         eps=args.adam_eps)
    else:
        raise Exception('{} optimizer is not supported.'.format(
            args.optimizer))
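
For context, a minimal standalone sketch of constructing the DeepSpeed fused optimizers imported above; the toy model and hyperparameter values are placeholders rather than values from this commit, and a CUDA-capable environment with a working DeepSpeed install is assumed:

# Sketch: DeepSpeed's fused optimizers as drop-in replacements for the apex ones.
# Placeholder model and hyperparameters; requires a GPU and DeepSpeed's op builders.
import torch
from deepspeed.ops.adam import FusedAdam as Adam
from deepspeed.ops.lamb import FusedLamb as Lamb

model = torch.nn.Linear(1024, 1024).cuda()
param_groups = [{'params': model.parameters()}]

optimizer = Adam(param_groups, lr=1e-4, weight_decay=0.01,
                 betas=(0.9, 0.999), eps=1e-8)
# The new 'lamb' branch above builds FusedLamb with the same argument names:
# optimizer = Lamb(param_groups, lr=1e-4, weight_decay=0.01,
#                  betas=(0.9, 0.999), eps=1e-8)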
+8 −4
@@ -18,8 +18,12 @@
import torch
from torch._six import inf

-from apex.multi_tensor_apply import multi_tensor_applier
-import amp_C
+#from apex.multi_tensor_apply import multi_tensor_applier
+from deepspeed.ops.adam.multi_tensor_apply import MultiTensorApply
+multi_tensor_applier=MultiTensorApply(2048*32)
+
+#import amp_C
+from deepspeed.ops.multi_tensor import multi_tensor_l2norm, multi_tensor_scale

from megatron import mpu
from megatron.model.module import param_is_not_shared
@@ -89,7 +93,7 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
            # Multi-tensor applier takes a function and a list of list
            # and performs the operation on that list all in one kernel.
            grad_norm, _ = multi_tensor_applier(
-                amp_C.multi_tensor_l2norm,
+                multi_tensor_l2norm,
                dummy_overflow_buf,
                [grads_for_norm],
                False # no per-parameter norm
@@ -113,7 +117,7 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
    clip_coeff = max_norm / (total_norm + 1.0e-6)
    if clip_coeff < 1.0:
        dummy_overflow_buf = torch.cuda.IntTensor([0])
-        multi_tensor_applier(amp_C.multi_tensor_scale,
+        multi_tensor_applier(multi_tensor_scale,
                             dummy_overflow_buf,
                             [grads, grads],
                             clip_coeff)
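
In plain PyTorch, the two fused calls touched in this file amount to the following; this is a single-device sketch of the clipping math only, and the real clip_grad_norm_fp32 also reduces the norm across model-parallel ranks and supports other norm types:

# Sketch of what the fused l2norm + scale calls compute, without the fused kernels.
import torch

def clip_grad_norm_fp32_sketch(grads, max_norm):
    # One global L2 norm over the whole list of gradient tensors
    # (what multi_tensor_applier(multi_tensor_l2norm, ...) returns).
    total_norm = torch.norm(
        torch.stack([torch.norm(g.detach(), 2) for g in grads]), 2)
    # Scale every gradient in place when the norm exceeds the threshold
    # (what multi_tensor_applier(multi_tensor_scale, ...) does).
    clip_coeff = max_norm / (total_norm + 1.0e-6)
    if clip_coeff < 1.0:
        for g in grads:
            g.detach().mul_(clip_coeff)
    return total_norm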
+7 −3
@@ -20,8 +20,12 @@ from abc import abstractmethod

import torch

-from apex.multi_tensor_apply import multi_tensor_applier
-import amp_C
+#from apex.multi_tensor_apply import multi_tensor_applier
+from deepspeed.ops.adam.multi_tensor_apply import MultiTensorApply
+multi_tensor_applier=MultiTensorApply(2048*32)
+
+#import amp_C
+from deepspeed.ops.multi_tensor import multi_tensor_l2norm, multi_tensor_scale

from megatron import get_timers
from megatron import mpu
@@ -53,7 +57,7 @@ def _multi_tensor_copy_this_to_that(this, that, overflow_buf=None):
    if overflow_buf:
        overflow_buf.fill_(0)
        # Scaling with factor `1.0` is equivalent to copy.
-        multi_tensor_applier(amp_C.multi_tensor_scale,
+        multi_tensor_applier(multi_tensor_scale,
                             overflow_buf,
                             [this, that],
                             1.0)
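
The helper touched here copies one list of tensors onto another, and the fused path does so by scaling with factor 1.0, as the in-line comment says. A plain-PyTorch stand-in for that behaviour (function name is illustrative, not from the commit):

# Sketch: copy a list of tensors element-wise, which is the result the fused
# multi_tensor_scale call produces with a scale factor of 1.0.
import torch

def copy_this_to_that_sketch(this, that):
    for src, dst in zip(this, that):
        dst.copy_(src)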
+5 −3
@@ -20,8 +20,10 @@ import sys
import torch
from torch.nn.parallel import DistributedDataParallel as torchDDP

-from apex.multi_tensor_apply import multi_tensor_applier
-import amp_C
+#from apex.multi_tensor_apply import multi_tensor_applier
+from deepspeed.ops.adam.multi_tensor_apply import MultiTensorApply
+multi_tensor_applier=MultiTensorApply(2048*32)
+#import amp_C

from megatron import get_args
from megatron import print_rank_0
@@ -65,7 +67,7 @@ def calc_params_l2_norm(model):
    # Calculate norm
    dummy_overflow_buf = torch.cuda.IntTensor([0])
    norm, _ = multi_tensor_applier(
-        amp_C.multi_tensor_l2norm,
+        multi_tensor_l2norm,
        dummy_overflow_buf,
        [params_data],
        False # no per-parameter norm
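
This hunk now calls multi_tensor_l2norm, which the rewritten import block for this file does not bring in, in line with the commit message's "not working" caveat. What the call is meant to compute is the L2 norm over the gathered parameter tensors; a plain-PyTorch sketch (assuming params_data is a flat list of parameter tensors):

# Sketch of the parameter norm the fused call computes; the surrounding
# Megatron code then combines this across model-parallel ranks.
import torch

def params_l2_norm_sketch(params_data):
    return torch.norm(
        torch.stack([torch.norm(p.detach(), 2) for p in params_data]), 2)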