export_DDP_vars.sh (new file, mode 100644) +6 −0

export RANK=$SLURM_PROCID
export LOCAL_RANK=$SLURM_LOCALID
export WORLD_SIZE=$SLURM_NTASKS
export MASTER_ADDR=$SLURM_SUBMIT_HOST
export MASTER_PORT=29500
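Note: the script above only exports environment variables; a minimal sketch of how they are consumed on the Python side, assuming the default env:// rendezvous of torch.distributed (the function name init_from_env is illustrative and not part of this diff):

import os
import torch.distributed as dist

def init_from_env():
    # init_process_group with the default init_method="env://" reads RANK,
    # WORLD_SIZE, MASTER_ADDR and MASTER_PORT from the environment set above.
    dist.init_process_group(backend="nccl", init_method="env://")
    # LOCAL_RANK is not read by PyTorch itself; it is typically used to bind
    # this process to one GPU on its node.
    local_rank = int(os.environ["LOCAL_RANK"])
    return dist.get_rank(), dist.get_world_size(), local_rank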
finetune_hoc.lsf +13 −11

 #!/bin/bash
 #BSUB -nnodes 2
-#BSUB -W 2:00
-#BSUB -P med106
+#BSUB -W 0:45
+#BSUB -P med107
 #BSUB -alloc_flags "smt4 nvme"
 #BSUB -J hoc_FULL
 #BSUB -o hoc_FULL.%J
@@ -13,10 +13,11 @@
 set +x
 #module load open-ce/1.4.0-py38-0
 module load open-ce
 conda deactivate
-conda activate /gpfs/alpine/med106/world-shared/irl1/rhel8/mytorch
+module list
+conda activate /gpfs/alpine/med106/world-shared/irl1/rhel8/myt_py1.11
 export OMP_NUM_THREADS=1
 ulimit -n 65536
 rm -f `find -name *lock`
 #export PYTHONPATH=$PYTHONPATH:/gpfs/alpine/med106/world-shared/irl1/rhel8/fork-megatron/megatron/fused_kernels
 #export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/gpfs/alpine/med106/world-shared/irl1/rhel8/mytorch/lib/python3.8/site-packages/torch/lib
 #export PATH=$PATH:/gpfs/alpine/med106/world-shared/irl1/rhel8/mytorch/lib/python3.8/site-packages/torch/include
@@ -35,16 +36,16 @@
 VALID_DATA="/gpfs/alpine/med106/world-shared/irl1/rhel8/fork-megatron/hocdata"
 #export VALID_DATA=picodata/dev.tsv
 export VOCAB_FILE=/gpfs/alpine/world-shared/med106/g8o/pubmed_bert-vocab.txt
-export CHECKPOINT_PATH=/gpfs/alpine/med106/world-shared/irl1/rhel8/fork-megatron/finetune-HOC_BIG
-export PRETRAINED_CHECKPOINT=/gpfs/alpine/med106/world-shared/irl1/rhel8/fork-megatron/chkptt
+export CHECKPOINT_PATH=/gpfs/alpine/med106/world-shared/irl1/rhel8/fork-megatron/finetune-pubmed_bert_1x1_b8
+export PRETRAINED_CHECKPOINT=/gpfs/alpine/med106/world-shared/irl1/rhel8/fork-megatron/pubmed_bert_1x1_b8_chckpttt
 jsrun --smpiargs="-disable_gpu_hooks" -n $nnodes -r 1 -g 6 -a 6 -c 42 python tasks/main.py \
        --task HOC \
-       --tensor-model-parallel-size 2 \
-       --pipeline-model-parallel-size 2 \
-       --num-layers 24 \
-       --hidden-size 1024 \
-       --num-attention-heads 16 \
+       --tensor-model-parallel-size 1 \
+       --pipeline-model-parallel-size 1 \
+       --num-layers 12 \
+       --hidden-size 768 \
+       --num-attention-heads 12 \
        --seq-length 512 \
        --max-position-embeddings 512 \
        --fp16 \
@@ -64,6 +65,7 @@
        --micro-batch-size 4 \
        --lr 0.0001 \
        --lr-warmup-fraction 0.06 \
+       --num-workers 0 \
        --distributed-backend nccl
        #--DDP-impl torch \
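Note: the model-size change (24 layers / hidden 1024 / 16 heads with 2-way tensor and pipeline parallelism down to 12 / 768 / 12 with no model parallelism) switches the fine-tuning job from a BERT-large-sized configuration to a BERT-base-sized one that no longer needs model parallelism. For reference, the resource math implied by the jsrun line is sketched below; the variable names are illustrative and assume one resource set per node with six tasks and six GPUs each, as the flags request.

# Assumption: '#BSUB -nnodes 2' combined with 'jsrun -n $nnodes -r 1 -a 6 -g 6 -c 42',
# i.e. one resource set per node, six tasks and six GPUs per set.
nnodes = 2
ranks_per_node = 6                    # '-a 6'
world_size = nnodes * ranks_per_node  # 12 ranks, one per GPU
print(world_size)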
megatron/arguments.py +5 −1

@@ -348,6 +348,10 @@ def _add_network_size_args(parser):
                        help='Disable BERT binary head.',
                        dest='bert_binary_head')
+    group.add_argument('--HIP', type=int, default=None,
+                       help='Use HIP to compile or not')
     return parser

@@ -656,7 +660,7 @@ def _add_distributed_args(parser):
     group.add_argument('--num-layers-per-virtual-pipeline-stage', type=int, default=None,
                        help='Number of layers per virtual pipeline stage')
     group.add_argument('--distributed-backend', default='nccl',
-                       choices=['nccl', 'gloo'],
+                       choices=['nccl', 'gloo', 'mpi'],
                        help='Which backend to use for distributed training.')
     group.add_argument('--DDP-impl', default='local',
                        choices=['local', 'torch'],
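Note: a small self-contained sketch of how the two argument changes behave, using a standalone parser rather than the full Megatron argument setup:

import argparse

parser = argparse.ArgumentParser()
# Mirrors the new flag added to _add_network_size_args.
parser.add_argument('--HIP', type=int, default=None,
                    help='Use HIP to compile or not')
# Mirrors the widened backend choices in _add_distributed_args.
parser.add_argument('--distributed-backend', default='nccl',
                    choices=['nccl', 'gloo', 'mpi'],
                    help='Which backend to use for distributed training.')

args = parser.parse_args(['--HIP', '1', '--distributed-backend', 'mpi'])
assert args.HIP == 1 and args.distributed_backend == 'mpi'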
megatron/fused_kernels/__init__.py +79 −0

@@ -17,7 +17,9 @@
 import os
 import pathlib
 import subprocess
+import torch

 from torch.utils import cpp_extension
+from torch.utils.cpp_extension import BuildExtension, CppExtension, CUDAExtension, CUDA_HOME

 # Setting this param to a list has a problem of generating different
 # compilation commands (with diferent order of architectures) and
@@ -27,6 +29,83 @@
 os.environ["TORCH_CUDA_ARCH_LIST"] = ""

+def load_hip(args):
+    this_dir = os.path.dirname(os.path.abspath(__file__))
+
+    TORCH_MAJOR = int(torch.__version__.split('.')[0])
+    TORCH_MINOR = int(torch.__version__.split('.')[1])
+
+    version_ge_1_1 = []
+    if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 0):
+        version_ge_1_1 = ['-DVERSION_GE_1_1']
+    version_ge_1_3 = []
+    if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 2):
+        version_ge_1_3 = ['-DVERSION_GE_1_3']
+    version_ge_1_5 = []
+    if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 4):
+        version_ge_1_5 = ['-DVERSION_GE_1_5']
+    version_dependent_macros = version_ge_1_1 + version_ge_1_3 + version_ge_1_5
+
+    cc_flag = []
+
+    srcpath = pathlib.Path(__file__).parent.absolute()
+    buildpath = srcpath / 'build'
+    _create_build_dir(buildpath)
+
+    hipcc_args = ['-O3'] + version_dependent_macros
+
+    def _cpp_extention_load_helper(name, sources, extra_hip_flags):
+        return cpp_extension.load(
+            name=name,
+            sources=sources,
+            build_directory=buildpath,
+            extra_include_paths=[this_dir],
+            extra_cflags=['-O3', '-D__HIP_PLATFORM_AMD__=1',
+                          '-DCMAKE_CXX_COMPILER=g++', '-DCMAKE_C_COMPILER=gcc'],
+            extra_cuda_cflags=['-O3', '-D__HIP_PLATFORM_AMD__=1'] + extra_hip_flags + cc_flag,
+            verbose=(args.rank == 0)
+        )
+
+    # Build Extension in a setup file
+    # return CUDAExtension(
+    #     name=name,
+    #     sources=sources,
+    #     include_dirs=[this_dir],
+    #     extra_compile_args={'cxx': ['-O3',] + version_dependent_macros,
+    #                         'nvcc': hipcc_args})
+
+    # ==============
+    # Fused softmax.
+    # ==============
+
+    if args.masked_softmax_fusion:
+        extra_hip_flags = []
+        extra_cuda_flags = ['-U__CUDA_NO_HALF_OPERATORS__',
+                            '-U__CUDA_NO_HALF_CONVERSIONS__',
+                            '--expt-relaxed-constexpr',
+                            '--expt-extended-lambda']
+
+        # Upper triangular softmax.
+        sources = [srcpath / 'scaled_upper_triang_masked_softmax.cpp',
+                   srcpath / 'scaled_upper_triang_masked_softmax_cuda.cu']
+        scaled_upper_triang_masked_softmax_cuda = _cpp_extention_load_helper(
+            "scaled_upper_triang_masked_softmax_cuda", sources, extra_hip_flags)
+
+        # Masked softmax.
+        sources = [srcpath / 'scaled_masked_softmax.cpp',
+                   srcpath / 'scaled_masked_softmax_cuda.cu']
+        scaled_masked_softmax_cuda = _cpp_extention_load_helper(
+            "scaled_masked_softmax_cuda", sources, extra_hip_flags)
+
+    # =================================
+    # Mixed precision fused layer norm.
+    # =================================
+
+    extra_hip_flags = []
+    extra_cuda_flags = ['-maxrregcount=50']
+    sources = [srcpath / 'layer_norm_cuda.cpp',
+               srcpath / 'layer_norm_cuda_kernel.cu']
+    fused_mix_prec_layer_norm_cuda = _cpp_extention_load_helper(
+        "fused_mix_prec_layer_norm_cuda", sources, extra_hip_flags)
+
+
 def load(args):

     # Check if cuda 11 is installed for compute capability 8.0
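Note: this diff does not show where load_hip is selected at runtime. A plausible dispatch based on the new --HIP flag might look like the sketch below; the helper name compile_fused_kernels is hypothetical, and it assumes args carries the fields the loaders already rely on (rank, masked_softmax_fusion).

from megatron import fused_kernels

def compile_fused_kernels(args):
    # Assumption: args.HIP comes from the new '--HIP' command-line flag.
    if getattr(args, 'HIP', None):
        fused_kernels.load_hip(args)   # JIT-build the extensions with HIP flags
    else:
        fused_kernels.load(args)       # original CUDA build path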
megatron/fused_kernels/layer_norm_cuda_kernel.cu +10 −2

@@ -250,9 +250,17 @@ void cuWelfordMuSigma2(
 template<typename U> U rsqrt(U v) {
   return U(1) / sqrt(v);
 }
+#if defined __HIP_PLATFORM_AMD__
+__device__ float rsqrt(float v) {
+  return rsqrtf(v);
+}
+#else
 template<> float rsqrt(float v) {
   return rsqrtf(v);
 }
+#endif
 template<> double rsqrt(double v) {
   return rsqrt(v);
 }

@@ -304,7 +312,7 @@ void cuApplyLayerNorm(
   // 1) blockDim.x == warpSize
   // 2) Tensors are contiguous
   //
-  for (auto i1=blockIdx.y; i1 < n1; i1 += gridDim.y) {
+  for (int i1=blockIdx.y; i1 < n1; i1 += gridDim.y) {
     SharedMemory<U> shared;
     U* buf = shared.getPointer();
     U mu,sigma2;

@@ -543,7 +551,7 @@ void cuComputeGradInput(
     const V* gamma,
     T* grad_input)
 {
-  for (auto i1=blockIdx.y; i1 < n1; i1 += gridDim.y) {
+  for (int i1=blockIdx.y; i1 < n1; i1 += gridDim.y) {
     U sum_loss1 = U(0);
     U sum_loss2 = U(0);
     const U c_mean = mean[i1];