Commit 74c83693 authored by Isaac

Working Crusher build with ROCm/4.5.0

parent 9415e318

export_DDP_vars.sh

new file mode 100644
+6 −0
export RANK=$SLURM_PROCID
export LOCAL_RANK=$SLURM_LOCALID
export WORLD_SIZE=$SLURM_NTASKS
export MASTER_ADDR=$SLURM_SUBMIT_HOST
export MASTER_PORT=29500
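
These exports map SLURM's per-task environment onto the variables that torch.distributed's env:// rendezvous reads. A minimal sketch of the consuming side (assumed usage, not part of this commit; the script would be sourced in each srun task before Python starts):

# Hedged sketch: initialize a process group from the variables exported
# by export_DDP_vars.sh. Not part of this commit.
import os
import torch.distributed as dist

dist.init_process_group(
    backend="nccl",        # on ROCm builds of PyTorch, RCCL is reached via the nccl backend
    init_method="env://",  # reads MASTER_ADDR, MASTER_PORT, RANK, WORLD_SIZE from the env
)
local_rank = int(os.environ["LOCAL_RANK"])  # selects this process's GPU on the node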
+4 −0
@@ -348,6 +348,10 @@ def _add_network_size_args(parser):
                       help='Disable BERT binary head.',
                       dest='bert_binary_head')
    
    group.add_argument('--HIP', type=int, default=None,
                       help='Whether to compile the fused kernels with HIP')

    return parser
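
The commit does not show where the new --HIP flag is consumed. A plausible (hypothetical) call site would gate the fused-kernel build path on it, assuming load_hip and load live in the fused-kernels module edited below:

# Hypothetical call site, not part of this diff.
if args.HIP:
    fused_kernels.load_hip(args)  # JIT-build the kernels through hipcc
else:
    fused_kernels.load(args)      # original CUDA path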


+79 −0
@@ -17,7 +17,9 @@ import os
import pathlib
import subprocess

import torch
from torch.utils import cpp_extension
from torch.utils.cpp_extension import BuildExtension, CppExtension, CUDAExtension, CUDA_HOME

# Setting this param to a list has a problem of generating different
# compilation commands (with different order of architectures) and
@@ -27,6 +29,83 @@ from torch.utils import cpp_extension
os.environ["TORCH_CUDA_ARCH_LIST"] = ""


def load_hip(args):

    this_dir = os.path.dirname(os.path.abspath(__file__))

    TORCH_MAJOR = int(torch.__version__.split('.')[0])
    TORCH_MINOR = int(torch.__version__.split('.')[1])
    version_ge_1_1 = []
    if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 0):
        version_ge_1_1 = ['-DVERSION_GE_1_1']
    version_ge_1_3 = []
    if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 2):
        version_ge_1_3 = ['-DVERSION_GE_1_3']
    version_ge_1_5 = []
    if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 4):
        version_ge_1_5 = ['-DVERSION_GE_1_5']
    version_dependent_macros = version_ge_1_1 + version_ge_1_3 + version_ge_1_5

    cc_flag = []

    srcpath = pathlib.Path(__file__).parent.absolute()
    buildpath = srcpath / 'build'
    _create_build_dir(buildpath)
    
    hipcc_args = ['-O3'] + version_dependent_macros
    def _cpp_extention_load_helper(name, sources, extra_hip_flags):
        return cpp_extension.load(
            name=name,
            sources=sources,
            build_directory=buildpath,
            extra_include_paths=[this_dir],
            extra_cflags=['-O3', '-D__HIP_PLATFORM_AMD__=1', '-DCMAKE_CXX_COMPILER=g++', '-DCMAKE_C_COMPILER=gcc'],
            extra_cuda_cflags=['-O3', '-D__HIP_PLATFORM_AMD__=1'] + extra_hip_flags + cc_flag,
            verbose=(args.rank == 0)
        )
        # Alternative: build the extension via a setup file instead of JIT loading.
        #return CUDAExtension(
        #    name=name,
        #    sources=sources,
        #    include_dirs=[this_dir],
        #    extra_compile_args={'cxx': ['-O3',] + version_dependent_macros,
        #                        'nvcc': hipcc_args})

    # ==============
    # Fused softmax.
    # ==============

    if args.masked_softmax_fusion:
        extra_hip_flags = []
        extra_cuda_flags = ['-U__CUDA_NO_HALF_OPERATORS__',
                            '-U__CUDA_NO_HALF_CONVERSIONS__',
                            '--expt-relaxed-constexpr',
                            '--expt-extended-lambda']
        
        # Upper triangular softmax.
        sources=[srcpath / 'scaled_upper_triang_masked_softmax.cpp',
                 srcpath / 'scaled_upper_triang_masked_softmax_cuda.cu']
        scaled_upper_triang_masked_softmax_cuda = _cpp_extention_load_helper(
            "scaled_upper_triang_masked_softmax_cuda",
            sources, extra_hip_flags)

        # Masked softmax.
        sources=[srcpath / 'scaled_masked_softmax.cpp',
                 srcpath / 'scaled_masked_softmax_cuda.cu']
        scaled_masked_softmax_cuda = _cpp_extention_load_helper(
            "scaled_masked_softmax_cuda", sources, extra_hip_flags)

    # =================================
    # Mixed precision fused layer norm.
    # =================================

    extra_hip_flags = []
    extra_cuda_flags = ['-maxrregcount=50']
    sources=[srcpath / 'layer_norm_cuda.cpp',
             srcpath / 'layer_norm_cuda_kernel.cu']
    fused_mix_prec_layer_norm_cuda = _cpp_extention_load_helper(
        "fused_mix_prec_layer_norm_cuda", sources, extra_hip_flags)

def load(args):

    # Check if cuda 11 is installed for compute capability 8.0
+8 −3
@@ -250,9 +250,14 @@ void cuWelfordMuSigma2(
template<typename U> U rsqrt(U v) {
  return U(1) / sqrt(v);
}
template<> float rsqrt(float v) {
#if defined __HIP_PLATFORM_HCC__
template<>
#else
template<> __device__
#endif
float rsqrt(float v) {
  return rsqrtf(v);
}

template<> double rsqrt(double v) {
  return rsqrt(v);
}
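
A likely motivation for the guard above (inferred from the diff, not stated in the commit): hipcc and nvcc disagree about the __device__ qualifier on explicit template specializations, so the HIP branch drops it while the CUDA branch keeps the original template<> __device__ form.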
@@ -304,7 +309,7 @@ void cuApplyLayerNorm(
  // 1) blockDim.x == warpSize
  // 2) Tensors are contiguous
  //
  for (auto i1=blockIdx.y; i1 < n1; i1 += gridDim.y) {
  for (int i1=blockIdx.y; i1 < n1; i1 += gridDim.y) {
    SharedMemory<U> shared;
    U* buf = shared.getPointer();
    U mu,sigma2;
@@ -543,7 +548,7 @@ void cuComputeGradInput(
    const V* gamma,
    T* grad_input)
{
  for (auto i1=blockIdx.y; i1 < n1; i1 += gridDim.y) {
  for (int i1=blockIdx.y; i1 < n1; i1 += gridDim.y) {
    U sum_loss1 = U(0);
    U sum_loss2 = U(0);
    const U c_mean = mean[i1];
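
The two auto-to-int changes above look cosmetic but appear to matter on ROCm: under HIP, blockIdx.y is produced through a coordinate-helper type rather than a plain unsigned int, so auto can deduce a type that does not behave like an integer loop index; pinning i1 to int keeps the increment and the i1 < n1 comparison well defined on both platforms.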
+1 −1
@@ -14,7 +14,7 @@
 * limitations under the License.
 */

#include <cuda_fp16.h>
#include <hip/hip_fp16.h>
#include <torch/extension.h>
#include <vector>
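
This is the standard hipify mapping: the fp16 intrinsics from cuda_fp16.h are provided by hip/hip_fp16.h on ROCm.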
