Loading export_DDP_vars.sh 0 → 100644 +6 −0 Original line number Diff line number Diff line export RANK=$SLURM_PROCID export LOCAL_RANK=$SLURM_LOCALID export WORLD_SIZE=$SLURM_NTASKS export MASTER_ADDR=$SLURM_SUBMIT_HOST export MASTER_PORT=29500 megatron/arguments.py +4 −0 Original line number Diff line number Diff line Loading @@ -348,6 +348,10 @@ def _add_network_size_args(parser): help='Disable BERT binary head.', dest='bert_binary_head') group.add_argument('--HIP', type=int, default=None, help='Use HIP to compile or not') return parser Loading megatron/fused_kernels/__init__.py +79 −0 Original line number Diff line number Diff line Loading @@ -17,7 +17,9 @@ import os import pathlib import subprocess import torch from torch.utils import cpp_extension from torch.utils.cpp_extension import BuildExtension, CppExtension, CUDAExtension, CUDA_HOME # Setting this param to a list has a problem of generating different # compilation commands (with diferent order of architectures) and Loading @@ -27,6 +29,83 @@ from torch.utils import cpp_extension os.environ["TORCH_CUDA_ARCH_LIST"] = "" def load_hip(args): this_dir = os.path.dirname(os.path.abspath(__file__)) TORCH_MAJOR = int(torch.__version__.split('.')[0]) TORCH_MINOR = int(torch.__version__.split('.')[1]) version_ge_1_1 = [] if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 0): version_ge_1_1 = ['-DVERSION_GE_1_1'] version_ge_1_3 = [] if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 2): version_ge_1_3 = ['-DVERSION_GE_1_3'] version_ge_1_5 = [] if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 4): version_ge_1_5 = ['-DVERSION_GE_1_5'] version_dependent_macros = version_ge_1_1 + version_ge_1_3 + version_ge_1_5 cc_flag = [] srcpath = pathlib.Path(__file__).parent.absolute() buildpath = srcpath / 'build' _create_build_dir(buildpath) hipcc_args = ['-O3'] + version_dependent_macros def _cpp_extention_load_helper(name, sources, extra_hip_flags): return cpp_extension.load( name=name, sources=sources, build_directory=buildpath, extra_include_paths=[this_dir], extra_cflags=['-O3', '-D__HIP_PLATFORM_AMD__=1', '-DCMAKE_CXX_COMPILER=g++', '-DCMAKE_C_COMPILER=gcc'], extra_cuda_cflags=['-O3', '-D__HIP_PLATFORM_AMD__=1'] + extra_hip_flags + cc_flag, verbose=(args.rank == 0) ) #Build Extension in a setup file #return CUDAExtension( # name=name, # sources=sources, # include_dirs=[this_dir], # extra_compile_args={'cxx': ['-O3',] + version_dependent_macros, # 'nvcc': hipcc_args}) # ============== # Fused softmax. # ============== if args.masked_softmax_fusion: extra_hip_flags = [] extra_cuda_flags = ['-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__', '--expt-relaxed-constexpr', '--expt-extended-lambda'] # Upper triangular softmax. sources=[srcpath / 'scaled_upper_triang_masked_softmax.cpp', srcpath / 'scaled_upper_triang_masked_softmax_cuda.cu'] scaled_upper_triang_masked_softmax_cuda = _cpp_extention_load_helper( "scaled_upper_triang_masked_softmax_cuda", sources, extra_hip_flags) # Masked softmax. sources=[srcpath / 'scaled_masked_softmax.cpp', srcpath / 'scaled_masked_softmax_cuda.cu'] scaled_masked_softmax_cuda = _cpp_extention_load_helper( "scaled_masked_softmax_cuda", sources, extra_hip_flags) # ================================= # Mixed precision fused layer norm. # ================================= extra_hip_flags = [] extra_cuda_flags = ['-maxrregcount=50'] sources=[srcpath / 'layer_norm_cuda.cpp', srcpath / 'layer_norm_cuda_kernel.cu'] fused_mix_prec_layer_norm_cuda = _cpp_extention_load_helper( "fused_mix_prec_layer_norm_cuda", sources, extra_hip_flags) def load(args): # Check if cuda 11 is installed for compute capability 8.0 Loading megatron/fused_kernels/layer_norm_cuda_kernel.cu +8 −3 Original line number Diff line number Diff line Loading @@ -250,9 +250,14 @@ void cuWelfordMuSigma2( template<typename U> U rsqrt(U v) { return U(1) / sqrt(v); } template<> float rsqrt(float v) { #if defined __HIP_PLATFORM_HCC__ template<> #else __device__ float rsqrt(float v) { return rsqrtf(v); } #endif template<> double rsqrt(double v) { return rsqrt(v); } Loading Loading @@ -304,7 +309,7 @@ void cuApplyLayerNorm( // 1) blockDim.x == warpSize // 2) Tensors are contiguous // for (auto i1=blockIdx.y; i1 < n1; i1 += gridDim.y) { for (int i1=blockIdx.y; i1 < n1; i1 += gridDim.y) { SharedMemory<U> shared; U* buf = shared.getPointer(); U mu,sigma2; Loading Loading @@ -543,7 +548,7 @@ void cuComputeGradInput( const V* gamma, T* grad_input) { for (auto i1=blockIdx.y; i1 < n1; i1 += gridDim.y) { for (int i1=blockIdx.y; i1 < n1; i1 += gridDim.y) { U sum_loss1 = U(0); U sum_loss2 = U(0); const U c_mean = mean[i1]; Loading megatron/fused_kernels/scaled_masked_softmax.cpp +1 −1 Original line number Diff line number Diff line Loading @@ -14,7 +14,7 @@ * limitations under the License. */ #include <cuda_fp16.h> #include <hip/hip_fp16.h> #include <torch/extension.h> #include <vector> Loading Loading
export_DDP_vars.sh 0 → 100644 +6 −0 Original line number Diff line number Diff line export RANK=$SLURM_PROCID export LOCAL_RANK=$SLURM_LOCALID export WORLD_SIZE=$SLURM_NTASKS export MASTER_ADDR=$SLURM_SUBMIT_HOST export MASTER_PORT=29500
megatron/arguments.py +4 −0 Original line number Diff line number Diff line Loading @@ -348,6 +348,10 @@ def _add_network_size_args(parser): help='Disable BERT binary head.', dest='bert_binary_head') group.add_argument('--HIP', type=int, default=None, help='Use HIP to compile or not') return parser Loading
megatron/fused_kernels/__init__.py +79 −0 Original line number Diff line number Diff line Loading @@ -17,7 +17,9 @@ import os import pathlib import subprocess import torch from torch.utils import cpp_extension from torch.utils.cpp_extension import BuildExtension, CppExtension, CUDAExtension, CUDA_HOME # Setting this param to a list has a problem of generating different # compilation commands (with diferent order of architectures) and Loading @@ -27,6 +29,83 @@ from torch.utils import cpp_extension os.environ["TORCH_CUDA_ARCH_LIST"] = "" def load_hip(args): this_dir = os.path.dirname(os.path.abspath(__file__)) TORCH_MAJOR = int(torch.__version__.split('.')[0]) TORCH_MINOR = int(torch.__version__.split('.')[1]) version_ge_1_1 = [] if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 0): version_ge_1_1 = ['-DVERSION_GE_1_1'] version_ge_1_3 = [] if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 2): version_ge_1_3 = ['-DVERSION_GE_1_3'] version_ge_1_5 = [] if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 4): version_ge_1_5 = ['-DVERSION_GE_1_5'] version_dependent_macros = version_ge_1_1 + version_ge_1_3 + version_ge_1_5 cc_flag = [] srcpath = pathlib.Path(__file__).parent.absolute() buildpath = srcpath / 'build' _create_build_dir(buildpath) hipcc_args = ['-O3'] + version_dependent_macros def _cpp_extention_load_helper(name, sources, extra_hip_flags): return cpp_extension.load( name=name, sources=sources, build_directory=buildpath, extra_include_paths=[this_dir], extra_cflags=['-O3', '-D__HIP_PLATFORM_AMD__=1', '-DCMAKE_CXX_COMPILER=g++', '-DCMAKE_C_COMPILER=gcc'], extra_cuda_cflags=['-O3', '-D__HIP_PLATFORM_AMD__=1'] + extra_hip_flags + cc_flag, verbose=(args.rank == 0) ) #Build Extension in a setup file #return CUDAExtension( # name=name, # sources=sources, # include_dirs=[this_dir], # extra_compile_args={'cxx': ['-O3',] + version_dependent_macros, # 'nvcc': hipcc_args}) # ============== # Fused softmax. # ============== if args.masked_softmax_fusion: extra_hip_flags = [] extra_cuda_flags = ['-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__', '--expt-relaxed-constexpr', '--expt-extended-lambda'] # Upper triangular softmax. sources=[srcpath / 'scaled_upper_triang_masked_softmax.cpp', srcpath / 'scaled_upper_triang_masked_softmax_cuda.cu'] scaled_upper_triang_masked_softmax_cuda = _cpp_extention_load_helper( "scaled_upper_triang_masked_softmax_cuda", sources, extra_hip_flags) # Masked softmax. sources=[srcpath / 'scaled_masked_softmax.cpp', srcpath / 'scaled_masked_softmax_cuda.cu'] scaled_masked_softmax_cuda = _cpp_extention_load_helper( "scaled_masked_softmax_cuda", sources, extra_hip_flags) # ================================= # Mixed precision fused layer norm. # ================================= extra_hip_flags = [] extra_cuda_flags = ['-maxrregcount=50'] sources=[srcpath / 'layer_norm_cuda.cpp', srcpath / 'layer_norm_cuda_kernel.cu'] fused_mix_prec_layer_norm_cuda = _cpp_extention_load_helper( "fused_mix_prec_layer_norm_cuda", sources, extra_hip_flags) def load(args): # Check if cuda 11 is installed for compute capability 8.0 Loading
megatron/fused_kernels/layer_norm_cuda_kernel.cu +8 −3 Original line number Diff line number Diff line Loading @@ -250,9 +250,14 @@ void cuWelfordMuSigma2( template<typename U> U rsqrt(U v) { return U(1) / sqrt(v); } template<> float rsqrt(float v) { #if defined __HIP_PLATFORM_HCC__ template<> #else __device__ float rsqrt(float v) { return rsqrtf(v); } #endif template<> double rsqrt(double v) { return rsqrt(v); } Loading Loading @@ -304,7 +309,7 @@ void cuApplyLayerNorm( // 1) blockDim.x == warpSize // 2) Tensors are contiguous // for (auto i1=blockIdx.y; i1 < n1; i1 += gridDim.y) { for (int i1=blockIdx.y; i1 < n1; i1 += gridDim.y) { SharedMemory<U> shared; U* buf = shared.getPointer(); U mu,sigma2; Loading Loading @@ -543,7 +548,7 @@ void cuComputeGradInput( const V* gamma, T* grad_input) { for (auto i1=blockIdx.y; i1 < n1; i1 += gridDim.y) { for (int i1=blockIdx.y; i1 < n1; i1 += gridDim.y) { U sum_loss1 = U(0); U sum_loss2 = U(0); const U c_mean = mean[i1]; Loading
megatron/fused_kernels/scaled_masked_softmax.cpp +1 −1 Original line number Diff line number Diff line Loading @@ -14,7 +14,7 @@ * limitations under the License. */ #include <cuda_fp16.h> #include <hip/hip_fp16.h> #include <torch/extension.h> #include <vector> Loading