Working crusher build with ROCm/4.5.0 (74c83693) · Commits · candle / Megatron-LM

export_DDP_vars.sh

0 → 100644

+6 −0

Original line number	Diff line number	Diff line
		export RANK=$SLURM_PROCID
		export LOCAL_RANK=$SLURM_LOCALID
		export WORLD_SIZE=$SLURM_NTASKS
		export MASTER_ADDR=$SLURM_SUBMIT_HOST
		export MASTER_PORT=29500

megatron/arguments.py

+4 −0

Original line number	Diff line number	Diff line
		@@ -348,6 +348,10 @@ def _add_network_size_args(parser):
		help='Disable BERT binary head.',
		dest='bert_binary_head')

		group.add_argument('--HIP', type=int, default=None,
		help='Use HIP to compile or not')


		return parser

megatron/fused_kernels/init.py

+79 −0

Original line number	Diff line number	Diff line
		@@ -17,7 +17,9 @@ import os
		import pathlib
		import subprocess

		import torch
		from torch.utils import cpp_extension
		from torch.utils.cpp_extension import BuildExtension, CppExtension, CUDAExtension, CUDA_HOME

		# Setting this param to a list has a problem of generating different
		# compilation commands (with diferent order of architectures) and
		@@ -27,6 +29,83 @@ from torch.utils import cpp_extension
		os.environ["TORCH_CUDA_ARCH_LIST"] = ""


		def load_hip(args):

		this_dir = os.path.dirname(os.path.abspath(__file__))

		TORCH_MAJOR = int(torch.__version__.split('.')[0])
		TORCH_MINOR = int(torch.__version__.split('.')[1])
		version_ge_1_1 = []
		if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 0):
		version_ge_1_1 = ['-DVERSION_GE_1_1']
		version_ge_1_3 = []
		if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 2):
		version_ge_1_3 = ['-DVERSION_GE_1_3']
		version_ge_1_5 = []
		if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 4):
		version_ge_1_5 = ['-DVERSION_GE_1_5']
		version_dependent_macros = version_ge_1_1 + version_ge_1_3 + version_ge_1_5

		cc_flag = []

		srcpath = pathlib.Path(__file__).parent.absolute()
		buildpath = srcpath / 'build'
		_create_build_dir(buildpath)

		hipcc_args = ['-O3'] + version_dependent_macros
		def _cpp_extention_load_helper(name, sources, extra_hip_flags):
		return cpp_extension.load(
		name=name,
		sources=sources,
		build_directory=buildpath,
		extra_include_paths=[this_dir],
		extra_cflags=['-O3', '-D__HIP_PLATFORM_AMD__=1', '-DCMAKE_CXX_COMPILER=g++', '-DCMAKE_C_COMPILER=gcc'],
		extra_cuda_cflags=['-O3', '-D__HIP_PLATFORM_AMD__=1'] + extra_hip_flags + cc_flag,
		verbose=(args.rank == 0)
		)
		#Build Extension in a setup file
		#return CUDAExtension(
		# name=name,
		# sources=sources,
		# include_dirs=[this_dir],
		# extra_compile_args={'cxx': ['-O3',] + version_dependent_macros,
		# 'nvcc': hipcc_args})

		# ==============
		# Fused softmax.
		# ==============

		if args.masked_softmax_fusion:
		extra_hip_flags = []
		extra_cuda_flags = ['-U__CUDA_NO_HALF_OPERATORS__',
		'-U__CUDA_NO_HALF_CONVERSIONS__',
		'--expt-relaxed-constexpr',
		'--expt-extended-lambda']

		# Upper triangular softmax.
		sources=[srcpath / 'scaled_upper_triang_masked_softmax.cpp',
		srcpath / 'scaled_upper_triang_masked_softmax_cuda.cu']
		scaled_upper_triang_masked_softmax_cuda = _cpp_extention_load_helper(
		"scaled_upper_triang_masked_softmax_cuda",
		sources, extra_hip_flags)

		# Masked softmax.
		sources=[srcpath / 'scaled_masked_softmax.cpp',
		srcpath / 'scaled_masked_softmax_cuda.cu']
		scaled_masked_softmax_cuda = _cpp_extention_load_helper(
		"scaled_masked_softmax_cuda", sources, extra_hip_flags)

		# =================================
		# Mixed precision fused layer norm.
		# =================================

		extra_hip_flags = []
		extra_cuda_flags = ['-maxrregcount=50']
		sources=[srcpath / 'layer_norm_cuda.cpp',
		srcpath / 'layer_norm_cuda_kernel.cu']
		fused_mix_prec_layer_norm_cuda = _cpp_extention_load_helper(
		"fused_mix_prec_layer_norm_cuda", sources, extra_hip_flags)

		def load(args):

		# Check if cuda 11 is installed for compute capability 8.0

megatron/fused_kernels/layer_norm_cuda_kernel.cu

+8 −3

Original line number	Diff line number	Diff line
		@@ -250,9 +250,14 @@ void cuWelfordMuSigma2(
		template<typename U> U rsqrt(U v) {
		return U(1) / sqrt(v);
		}
		template<> float rsqrt(float v) {
		#if defined __HIP_PLATFORM_HCC__
		template<>
		#else
		__device__ float rsqrt(float v) {
		return rsqrtf(v);
		}
		#endif

		template<> double rsqrt(double v) {
		return rsqrt(v);
		}
		@@ -304,7 +309,7 @@ void cuApplyLayerNorm(
		// 1) blockDim.x == warpSize
		// 2) Tensors are contiguous
		//
		for (auto i1=blockIdx.y; i1 < n1; i1 += gridDim.y) {
		for (int i1=blockIdx.y; i1 < n1; i1 += gridDim.y) {
		SharedMemory<U> shared;
		U* buf = shared.getPointer();
		U mu,sigma2;
		@@ -543,7 +548,7 @@ void cuComputeGradInput(
		const V* gamma,
		T* grad_input)
		{
		for (auto i1=blockIdx.y; i1 < n1; i1 += gridDim.y) {
		for (int i1=blockIdx.y; i1 < n1; i1 += gridDim.y) {
		U sum_loss1 = U(0);
		U sum_loss2 = U(0);
		const U c_mean = mean[i1];

megatron/fused_kernels/scaled_masked_softmax.cpp

+1 −1

Original line number	Diff line number	Diff line
		@@ -14,7 +14,7 @@
		* limitations under the License.
		*/

		#include <cuda_fp16.h>
		#include <hip/hip_fp16.h>
		#include <torch/extension.h>
		#include <vector>