export_DDP_vars.sh (new file, mode 100644) +6 −0

export RANK=$SLURM_PROCID
export LOCAL_RANK=$SLURM_LOCALID
export WORLD_SIZE=$SLURM_NTASKS
export MASTER_ADDR=$SLURM_SUBMIT_HOST
export MASTER_PORT=29500
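Note: the script above only exports environment variables; a minimal sketch of how they are consumed on the Python side, assuming the default env:// rendezvous of torch.distributed (the function name init_from_env is illustrative and not part of this diff):

import os
import torch.distributed as dist

def init_from_env():
    # init_process_group with the default init_method="env://" reads RANK,
    # WORLD_SIZE, MASTER_ADDR and MASTER_PORT from the environment set above.
    dist.init_process_group(backend="nccl", init_method="env://")
    # LOCAL_RANK is not read by PyTorch itself; it is typically used to bind
    # this process to one GPU on its node.
    local_rank = int(os.environ["LOCAL_RANK"])
    return dist.get_rank(), dist.get_world_size(), local_rank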
finetune_hoc.lsf +13 −11

 #!/bin/bash
 #BSUB -nnodes 2
-#BSUB -W 2:00
-#BSUB -P med106
+#BSUB -W 0:45
+#BSUB -P med107
 #BSUB -alloc_flags "smt4 nvme"
 #BSUB -J hoc_FULL
 #BSUB -o hoc_FULL.%J
@@ -13,10 +13,11 @@
 set +x
 #module load open-ce/1.4.0-py38-0
 module load open-ce
 conda deactivate
-conda activate /gpfs/alpine/med106/world-shared/irl1/rhel8/mytorch
+module list
+conda activate /gpfs/alpine/med106/world-shared/irl1/rhel8/myt_py1.11
 export OMP_NUM_THREADS=1
 ulimit -n 65536
 rm -f `find -name *lock`
 #export PYTHONPATH=$PYTHONPATH:/gpfs/alpine/med106/world-shared/irl1/rhel8/fork-megatron/megatron/fused_kernels
 #export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/gpfs/alpine/med106/world-shared/irl1/rhel8/mytorch/lib/python3.8/site-packages/torch/lib
 #export PATH=$PATH:/gpfs/alpine/med106/world-shared/irl1/rhel8/mytorch/lib/python3.8/site-packages/torch/include
@@ -35,16 +36,16 @@
 VALID_DATA="/gpfs/alpine/med106/world-shared/irl1/rhel8/fork-megatron/hocdata"
 #export VALID_DATA=picodata/dev.tsv
 export VOCAB_FILE=/gpfs/alpine/world-shared/med106/g8o/pubmed_bert-vocab.txt
-export CHECKPOINT_PATH=/gpfs/alpine/med106/world-shared/irl1/rhel8/fork-megatron/finetune-HOC_BIG
-export PRETRAINED_CHECKPOINT=/gpfs/alpine/med106/world-shared/irl1/rhel8/fork-megatron/chkptt
+export CHECKPOINT_PATH=/gpfs/alpine/med106/world-shared/irl1/rhel8/fork-megatron/finetune-pubmed_bert_1x1_b8
+export PRETRAINED_CHECKPOINT=/gpfs/alpine/med106/world-shared/irl1/rhel8/fork-megatron/pubmed_bert_1x1_b8_chckpttt
 jsrun --smpiargs="-disable_gpu_hooks" -n $nnodes -r 1 -g 6 -a 6 -c 42 python tasks/main.py \
        --task HOC \
-       --tensor-model-parallel-size 2 \
-       --pipeline-model-parallel-size 2 \
-       --num-layers 24 \
-       --hidden-size 1024 \
-       --num-attention-heads 16 \
+       --tensor-model-parallel-size 1 \
+       --pipeline-model-parallel-size 1 \
+       --num-layers 12 \
+       --hidden-size 768 \
+       --num-attention-heads 12 \
        --seq-length 512 \
        --max-position-embeddings 512 \
        --fp16 \
@@ -64,6 +65,7 @@
        --micro-batch-size 4 \
        --lr 0.0001 \
        --lr-warmup-fraction 0.06 \
+       --num-workers 0 \
        --distributed-backend nccl
        #--DDP-impl torch \
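Note: the model-size change (24 layers / hidden 1024 / 16 heads with 2-way tensor and pipeline parallelism down to 12 / 768 / 12 with no model parallelism) switches the fine-tuning job from a BERT-large-sized configuration to a BERT-base-sized one that no longer needs model parallelism. For reference, the resource math implied by the jsrun line is sketched below; the variable names are illustrative and assume one resource set per node with six tasks and six GPUs each, as the flags request.

# Assumption: '#BSUB -nnodes 2' combined with 'jsrun -n $nnodes -r 1 -a 6 -g 6 -c 42',
# i.e. one resource set per node, six tasks and six GPUs per set.
nnodes = 2
ranks_per_node = 6                    # '-a 6'
world_size = nnodes * ranks_per_node  # 12 ranks, one per GPU
print(world_size)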
megatron/arguments.py +5 −1

@@ -348,6 +348,10 @@ def _add_network_size_args(parser):
                        help='Disable BERT binary head.',
                        dest='bert_binary_head')
+    group.add_argument('--HIP', type=int, default=None,
+                       help='Use HIP to compile or not')
     return parser

@@ -656,7 +660,7 @@ def _add_distributed_args(parser):
     group.add_argument('--num-layers-per-virtual-pipeline-stage', type=int, default=None,
                        help='Number of layers per virtual pipeline stage')
     group.add_argument('--distributed-backend', default='nccl',
-                       choices=['nccl', 'gloo'],
+                       choices=['nccl', 'gloo', 'mpi'],
                        help='Which backend to use for distributed training.')
     group.add_argument('--DDP-impl', default='local',
                        choices=['local', 'torch'],
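Note: a small self-contained sketch of how the two argument changes behave, using a standalone parser rather than the full Megatron argument setup:

import argparse

parser = argparse.ArgumentParser()
# Mirrors the new flag added to _add_network_size_args.
parser.add_argument('--HIP', type=int, default=None,
                    help='Use HIP to compile or not')
# Mirrors the widened backend choices in _add_distributed_args.
parser.add_argument('--distributed-backend', default='nccl',
                    choices=['nccl', 'gloo', 'mpi'],
                    help='Which backend to use for distributed training.')

args = parser.parse_args(['--HIP', '1', '--distributed-backend', 'mpi'])
assert args.HIP == 1 and args.distributed_backend == 'mpi'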
megatron/fused_kernels/__init__.py +79 −0

@@ -17,7 +17,9 @@
 import os
 import pathlib
 import subprocess
+import torch

 from torch.utils import cpp_extension
+from torch.utils.cpp_extension import BuildExtension, CppExtension, CUDAExtension, CUDA_HOME

 # Setting this param to a list has a problem of generating different
 # compilation commands (with diferent order of architectures) and
@@ -27,6 +29,83 @@
 os.environ["TORCH_CUDA_ARCH_LIST"] = ""

+def load_hip(args):
+    this_dir = os.path.dirname(os.path.abspath(__file__))
+
+    TORCH_MAJOR = int(torch.__version__.split('.')[0])
+    TORCH_MINOR = int(torch.__version__.split('.')[1])
+
+    version_ge_1_1 = []
+    if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 0):
+        version_ge_1_1 = ['-DVERSION_GE_1_1']
+    version_ge_1_3 = []
+    if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 2):
+        version_ge_1_3 = ['-DVERSION_GE_1_3']
+    version_ge_1_5 = []
+    if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 4):
+        version_ge_1_5 = ['-DVERSION_GE_1_5']
+    version_dependent_macros = version_ge_1_1 + version_ge_1_3 + version_ge_1_5
+
+    cc_flag = []
+
+    srcpath = pathlib.Path(__file__).parent.absolute()
+    buildpath = srcpath / 'build'
+    _create_build_dir(buildpath)
+
+    hipcc_args = ['-O3'] + version_dependent_macros
+
+    def _cpp_extention_load_helper(name, sources, extra_hip_flags):
+        return cpp_extension.load(
+            name=name,
+            sources=sources,
+            build_directory=buildpath,
+            extra_include_paths=[this_dir],
+            extra_cflags=['-O3', '-D__HIP_PLATFORM_AMD__=1',
+                          '-DCMAKE_CXX_COMPILER=g++', '-DCMAKE_C_COMPILER=gcc'],
+            extra_cuda_cflags=['-O3', '-D__HIP_PLATFORM_AMD__=1'] + extra_hip_flags + cc_flag,
+            verbose=(args.rank == 0)
+        )
+
+    # Build Extension in a setup file
+    # return CUDAExtension(
+    #     name=name,
+    #     sources=sources,
+    #     include_dirs=[this_dir],
+    #     extra_compile_args={'cxx': ['-O3',] + version_dependent_macros,
+    #                         'nvcc': hipcc_args})
+
+    # ==============
+    # Fused softmax.
+    # ==============
+
+    if args.masked_softmax_fusion:
+        extra_hip_flags = []
+        extra_cuda_flags = ['-U__CUDA_NO_HALF_OPERATORS__',
+                            '-U__CUDA_NO_HALF_CONVERSIONS__',
+                            '--expt-relaxed-constexpr',
+                            '--expt-extended-lambda']
+
+        # Upper triangular softmax.
+        sources = [srcpath / 'scaled_upper_triang_masked_softmax.cpp',
+                   srcpath / 'scaled_upper_triang_masked_softmax_cuda.cu']
+        scaled_upper_triang_masked_softmax_cuda = _cpp_extention_load_helper(
+            "scaled_upper_triang_masked_softmax_cuda", sources, extra_hip_flags)
+
+        # Masked softmax.
+        sources = [srcpath / 'scaled_masked_softmax.cpp',
+                   srcpath / 'scaled_masked_softmax_cuda.cu']
+        scaled_masked_softmax_cuda = _cpp_extention_load_helper(
+            "scaled_masked_softmax_cuda", sources, extra_hip_flags)
+
+    # =================================
+    # Mixed precision fused layer norm.
+    # =================================
+
+    extra_hip_flags = []
+    extra_cuda_flags = ['-maxrregcount=50']
+    sources = [srcpath / 'layer_norm_cuda.cpp',
+               srcpath / 'layer_norm_cuda_kernel.cu']
+    fused_mix_prec_layer_norm_cuda = _cpp_extention_load_helper(
+        "fused_mix_prec_layer_norm_cuda", sources, extra_hip_flags)
+
+
 def load(args):

     # Check if cuda 11 is installed for compute capability 8.0
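Note: this diff does not show where load_hip is selected at runtime. A plausible dispatch based on the new --HIP flag might look like the sketch below; the helper name compile_fused_kernels is hypothetical, and it assumes args carries the fields the loaders already rely on (rank, masked_softmax_fusion).

from megatron import fused_kernels

def compile_fused_kernels(args):
    # Assumption: args.HIP comes from the new '--HIP' command-line flag.
    if getattr(args, 'HIP', None):
        fused_kernels.load_hip(args)   # JIT-build the extensions with HIP flags
    else:
        fused_kernels.load(args)       # original CUDA build path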
megatron/fused_kernels/layer_norm_cuda_kernel.cu +10 −2

@@ -250,9 +250,17 @@ void cuWelfordMuSigma2(
 template<typename U> U rsqrt(U v) {
   return U(1) / sqrt(v);
 }
+#if defined __HIP_PLATFORM_AMD__
+__device__ float rsqrt(float v) {
+  return rsqrtf(v);
+}
+#else
 template<> float rsqrt(float v) {
   return rsqrtf(v);
 }
+#endif
 template<> double rsqrt(double v) {
   return rsqrt(v);
 }

@@ -304,7 +312,7 @@ void cuApplyLayerNorm(
   // 1) blockDim.x == warpSize
   // 2) Tensors are contiguous
   //
-  for (auto i1=blockIdx.y; i1 < n1; i1 += gridDim.y) {
+  for (int i1=blockIdx.y; i1 < n1; i1 += gridDim.y) {
     SharedMemory<U> shared;
     U* buf = shared.getPointer();
     U mu,sigma2;

@@ -543,7 +551,7 @@ void cuComputeGradInput(
     const V* gamma,
     T* grad_input)
 {
-  for (auto i1=blockIdx.y; i1 < n1; i1 += gridDim.y) {
+  for (int i1=blockIdx.y; i1 < n1; i1 += gridDim.y) {
     U sum_loss1 = U(0);
     U sum_loss2 = U(0);
     const U c_mean = mean[i1];