Commit 9415e318 authored by Isaac

Removing deepspeed ops. Using the ROCmSoftwarePlatform apex build for these functionalities on AMD

parent ef777805
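Note on the change: the hand-built deepspeed multi-tensor shims are dropped in favor of the stock apex ones, on the assumption that the ROCmSoftwarePlatform apex fork builds amp_C and the fused optimizers on AMD the same way upstream apex does. A minimal sketch of the two call-compatible setups (the commented lines show the removed deepspeed path):

# Sketch of the swap this commit makes, assuming the ROCmSoftwarePlatform
# apex fork mirrors upstream apex's layout.
#
# Removed deepspeed path, built by hand with an explicit chunk size:
#   from deepspeed.ops.adam.multi_tensor_apply import MultiTensorApply
#   multi_tensor_applier = MultiTensorApply(2048 * 32)
#
# Restored apex path: apex ships multi_tensor_applier as a ready-made
# MultiTensorApply(2048 * 32) instance, and amp_C provides the fused
# kernels, so call sites of the form
# multi_tensor_applier(op, overflow_buf, tensor_lists, *args) are unchanged.
from apex.multi_tensor_apply import multi_tensor_applier
import amp_C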
Loading
Loading
Loading
Loading
+2 −4
@@ -13,11 +13,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-#from apex.optimizers import FusedAdam as Adam
+from apex.optimizers import FusedAdam as Adam
 from apex.optimizers import FusedSGD as SGD
-#from apex.optimizers import FusedLAMB as Lamb
-from deepspeed.ops.adam import FusedAdam as Adam
-from deepspeed.ops.lamb import FusedLamb as Lamb
+from apex.optimizers import FusedLAMB as Lamb
 
 from megatron import get_args
 from megatron.model import LayerNorm
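For reference, a minimal usage sketch of the restored fused optimizers; the model and hyperparameters below are illustrative, not taken from the repo:

import torch
from apex.optimizers import FusedAdam as Adam
from apex.optimizers import FusedSGD as SGD
from apex.optimizers import FusedLAMB as Lamb

# Any CUDA/ROCm module works; a single Linear layer keeps the sketch small.
model = torch.nn.Linear(1024, 1024).cuda()

# FusedAdam is a drop-in replacement for torch.optim.Adam that fuses the
# elementwise update into multi-tensor kernels; FusedSGD and FusedLAMB
# are constructed the same way.
optimizer = Adam(model.parameters(), lr=1e-4, betas=(0.9, 0.999),
                 eps=1e-8, weight_decay=0.01)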
+4 −7
@@ -18,12 +18,9 @@
 import torch
 from torch._six import inf
 
-#from apex.multi_tensor_apply import multi_tensor_applier
-from deepspeed.ops.adam.multi_tensor_apply import MultiTensorApply
-multi_tensor_applier=MultiTensorApply(2048*32)
+from apex.multi_tensor_apply import multi_tensor_applier
 
-#import amp_C
-from deepspeed.ops.multi_tensor import multi_tensor_l2norm, multi_tensor_scale
+import amp_C
 
 from megatron import mpu
 from megatron.model.module import param_is_not_shared
@@ -93,7 +90,7 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
             # Multi-tensor applier takes a function and a list of list
             # and performs the operation on that list all in one kernel.
             grad_norm, _ = multi_tensor_applier(
-                multi_tensor_l2norm,
+                amp_C.multi_tensor_l2norm,
                 dummy_overflow_buf,
                 [grads_for_norm],
                 False # no per-parameter norm
@@ -117,7 +114,7 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
     clip_coeff = max_norm / (total_norm + 1.0e-6)
     if clip_coeff < 1.0:
         dummy_overflow_buf = torch.cuda.IntTensor([0])
-        multi_tensor_applier(multi_tensor_scale,
+        multi_tensor_applier(amp_C.multi_tensor_scale,
                              dummy_overflow_buf,
                              [grads, grads],
                              clip_coeff)
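Taken together, the two hunks above restore apex's fused path for gradient clipping. A single-process sketch of that flow (it skips the norm-type handling and the model-parallel all-reduce that the real clip_grad_norm_fp32 performs):

import torch
import amp_C
from apex.multi_tensor_apply import multi_tensor_applier

def clip_grads_sketch(grads, max_norm):
    # amp_C kernels use this flag to signal inf/nan; zero means no overflow.
    dummy_overflow_buf = torch.cuda.IntTensor([0])
    # One fused kernel computes the L2 norm over the whole list of grads.
    grad_norm, _ = multi_tensor_applier(
        amp_C.multi_tensor_l2norm,
        dummy_overflow_buf,
        [grads],
        False)  # no per-parameter norm
    clip_coeff = max_norm / (grad_norm.item() + 1.0e-6)
    if clip_coeff < 1.0:
        # Passing [grads, grads] scales every gradient in place.
        multi_tensor_applier(amp_C.multi_tensor_scale,
                             dummy_overflow_buf,
                             [grads, grads],
                             clip_coeff)
    return grad_norm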
+3 −6
@@ -20,12 +20,9 @@ from abc import abstractmethod
 
 import torch
 
-#from apex.multi_tensor_apply import multi_tensor_applier
-from deepspeed.ops.adam.multi_tensor_apply import MultiTensorApply
-multi_tensor_applier=MultiTensorApply(2048*32)
+from apex.multi_tensor_apply import multi_tensor_applier
 
-#import amp_C
-from deepspeed.ops.multi_tensor import multi_tensor_l2norm, multi_tensor_scale
+import amp_C
 
 from megatron import get_timers
 from megatron import mpu
@@ -57,7 +54,7 @@ def _multi_tensor_copy_this_to_that(this, that, overflow_buf=None):
     if overflow_buf:
         overflow_buf.fill_(0)
         # Scaling with factor `1.0` is equivalent to copy.
-        multi_tensor_applier(multi_tensor_scale,
+        multi_tensor_applier(amp_C.multi_tensor_scale,
                              overflow_buf,
                              [this, that],
                              1.0)
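The hunk above keeps the trick its comment describes: amp_C.multi_tensor_scale with a factor of 1.0 is a fused copy. A minimal sketch of the idiom (the function name here is illustrative):

import torch
import amp_C
from apex.multi_tensor_apply import multi_tensor_applier

def copy_this_to_that_sketch(this, that, overflow_buf):
    # tensor_lists is [sources, destinations]; scaling by 1.0 copies each
    # tensor in `this` into the matching tensor in `that` in fused kernels.
    overflow_buf.fill_(0)
    multi_tensor_applier(amp_C.multi_tensor_scale,
                         overflow_buf,
                         [this, that],
                         1.0)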
+3 −5
@@ -20,10 +20,8 @@ import sys
 import torch
 from torch.nn.parallel import DistributedDataParallel as torchDDP
 
-#from apex.multi_tensor_apply import multi_tensor_applier
-from deepspeed.ops.adam.multi_tensor_apply import MultiTensorApply
-multi_tensor_applier=MultiTensorApply(2048*32)
-#import amp_C
+from apex.multi_tensor_apply import multi_tensor_applier
+import amp_C
 
 from megatron import get_args
 from megatron import print_rank_0
@@ -67,7 +65,7 @@ def calc_params_l2_norm(model):
     # Calculate norm
     dummy_overflow_buf = torch.cuda.IntTensor([0])
     norm, _ = multi_tensor_applier(
-        multi_tensor_l2norm,
+        amp_C.multi_tensor_l2norm,
        dummy_overflow_buf,
         [params_data],
         False # no per-parameter norm
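For completeness, a minimal sketch of the apex path this last hunk restores in calc_params_l2_norm; gathering parameter data into a flat list follows the shown code, while the helper name and the filtering comment are illustrative:

import torch
import amp_C
from apex.multi_tensor_apply import multi_tensor_applier

def params_l2_norm_sketch(model):
    # Collect raw parameter tensors; the real function also filters out
    # shared and tensor-parallel-duplicated parameters first.
    params_data = [param.data for param in model.parameters()]
    dummy_overflow_buf = torch.cuda.IntTensor([0])
    norm, _ = multi_tensor_applier(
        amp_C.multi_tensor_l2norm,
        dummy_overflow_buf,
        [params_data],
        False)  # no per-parameter norm
    return norm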