Commit c31a066f authored by Isaac

Merge remote-tracking branch 'origin/F1STAT' into main

parents 7eb70084 f0d181e3

export_DDP_vars.sh

0 → 100644
+6 −0
export RANK=$SLURM_PROCID
export LOCAL_RANK=$SLURM_LOCALID
export WORLD_SIZE=$SLURM_NTASKS
export MASTER_ADDR=$SLURM_SUBMIT_HOST
export MASTER_PORT=29500
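
export_DDP_vars.sh maps SLURM's per-task variables onto the environment variables that PyTorch's env:// rendezvous reads. A minimal sketch of how a rank would consume them, assuming the script has been sourced by every SLURM task (the helper name below is illustrative, not part of this commit):

import os
import torch

def init_distributed():
    # LOCAL_RANK (from $SLURM_LOCALID) selects the GPU for this rank.
    local_rank = int(os.environ["LOCAL_RANK"])
    torch.cuda.set_device(local_rank)
    # init_method="env://" reads RANK, WORLD_SIZE, MASTER_ADDR and MASTER_PORT set above.
    torch.distributed.init_process_group(backend="nccl", init_method="env://")
    return local_rank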
+13 −11
#!/bin/bash

#BSUB -nnodes 2
#BSUB -W 2:00
#BSUB -P med106
#BSUB -W 0:45
#BSUB -P med107
#BSUB -alloc_flags "smt4 nvme"
#BSUB -J hoc_FULL
#BSUB -o hoc_FULL.%J
@@ -13,10 +13,11 @@ set +x
#module load open-ce/1.4.0-py38-0
module load open-ce
conda deactivate
conda activate /gpfs/alpine/med106/world-shared/irl1/rhel8/mytorch
module list
conda activate /gpfs/alpine/med106/world-shared/irl1/rhel8/myt_py1.11

export OMP_NUM_THREADS=1
ulimit -n 65536
rm -f `find -name *lock`
#export PYTHONPATH=$PYTHONPATH:/gpfs/alpine/med106/world-shared/irl1/rhel8/fork-megatron/megatron/fused_kernels
#export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/gpfs/alpine/med106/world-shared/irl1/rhel8/mytorch/lib/python3.8/site-packages/torch/lib
#export PATH=$PATH:/gpfs/alpine/med106/world-shared/irl1/rhel8/mytorch/lib/python3.8/site-packages/torch/include
@@ -35,16 +36,16 @@ VALID_DATA="/gpfs/alpine/med106/world-shared/irl1/rhel8/fork-megatron/hocdata"
#export VALID_DATA=picodata/dev.tsv

export VOCAB_FILE=/gpfs/alpine/world-shared/med106/g8o/pubmed_bert-vocab.txt
export CHECKPOINT_PATH=/gpfs/alpine/med106/world-shared/irl1/rhel8/fork-megatron/finetune-HOC_BIG
export PRETRAINED_CHECKPOINT=/gpfs/alpine/med106/world-shared/irl1/rhel8/fork-megatron/chkptt
export CHECKPOINT_PATH=/gpfs/alpine/med106/world-shared/irl1/rhel8/fork-megatron/finetune-pubmed_bert_1x1_b8
export PRETRAINED_CHECKPOINT=/gpfs/alpine/med106/world-shared/irl1/rhel8/fork-megatron/pubmed_bert_1x1_b8_chckpttt

jsrun --smpiargs="-disable_gpu_hooks" -n $nnodes -r 1 -g 6 -a 6 -c 42 python tasks/main.py \
       --task HOC \
       --tensor-model-parallel-size 2 \
       --pipeline-model-parallel-size 2 \
       --num-layers 24 \
       --hidden-size 1024 \
       --num-attention-heads 16 \
       --tensor-model-parallel-size 1 \
       --pipeline-model-parallel-size 1 \
       --num-layers 12 \
       --hidden-size 768 \
       --num-attention-heads 12 \
       --seq-length 512 \
       --max-position-embeddings 512 \
       --fp16 \
@@ -64,6 +65,7 @@ jsrun --smpiargs="-disable_gpu_hooks" -n $nnodes -r 1 -g 6 -a 6 -c 42 python tas
       --micro-batch-size 4 \
       --lr 0.0001 \
       --lr-warmup-fraction 0.06 \
       --num-workers 0 \
       --distributed-backend nccl
       #--DDP-impl torch \
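
The launch line and model arguments above shrink the run from a 2x2 tensor/pipeline-parallel 24-layer model to a 1x1 12-layer one. A back-of-envelope sketch (not part of the commit) of what that means for parallelism and batch size, assuming the two nodes from "#BSUB -nnodes 2" each run six ranks (jsrun -r 1 -a 6 -g 6) and using Megatron's convention data_parallel_size = world_size / (tensor_mp * pipeline_mp):

nodes, ranks_per_node = 2, 6          # assumed from #BSUB -nnodes 2 and jsrun -r 1 -a 6 -g 6
world_size = nodes * ranks_per_node   # 12
tensor_mp, pipeline_mp = 1, 1         # new values in this script
data_parallel_size = world_size // (tensor_mp * pipeline_mp)   # 12
samples_per_step = 4 * data_parallel_size   # --micro-batch-size 4 -> 48, ignoring gradient accumulation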
+5 −1
@@ -348,6 +348,10 @@ def _add_network_size_args(parser):
                       help='Disable BERT binary head.',
                       dest='bert_binary_head')
    
    group.add_argument('--HIP', type=int, default=None, 
                       help='Use HIP to compile or not')
                                                

    return parser


@@ -656,7 +660,7 @@ def _add_distributed_args(parser):
    group.add_argument('--num-layers-per-virtual-pipeline-stage', type=int, default=None,
                       help='Number of layers per virtual pipeline stage')
    group.add_argument('--distributed-backend', default='nccl',
                       choices=['nccl', 'gloo'],
                       choices=['nccl', 'gloo', 'mpi'],
                       help='Which backend to use for distributed training.')
    group.add_argument('--DDP-impl', default='local',
                       choices=['local', 'torch'],
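
The argument changes add an integer --HIP flag and allow 'mpi' as a distributed backend. A standalone sketch of just these two options, to show how they parse (Megatron's real parser wires them into larger argument groups):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--HIP', type=int, default=None,
                    help='Use HIP to compile or not')
parser.add_argument('--distributed-backend', default='nccl',
                    choices=['nccl', 'gloo', 'mpi'],
                    help='Which backend to use for distributed training.')

args = parser.parse_args(['--HIP', '1', '--distributed-backend', 'mpi'])
assert args.HIP == 1 and args.distributed_backend == 'mpi'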
+79 −0
@@ -17,7 +17,9 @@ import os
import pathlib
import subprocess

import torch
from torch.utils import cpp_extension
from torch.utils.cpp_extension import BuildExtension, CppExtension, CUDAExtension, CUDA_HOME

# Setting this param to a list has a problem of generating different
# compilation commands (with diferent order of architectures) and
@@ -27,6 +29,83 @@ from torch.utils import cpp_extension
os.environ["TORCH_CUDA_ARCH_LIST"] = ""


def load_hip(args):

    this_dir = os.path.dirname(os.path.abspath(__file__))

    TORCH_MAJOR = int(torch.__version__.split('.')[0])
    TORCH_MINOR = int(torch.__version__.split('.')[1])
    version_ge_1_1 = []
    if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 0):
        version_ge_1_1 = ['-DVERSION_GE_1_1']
    version_ge_1_3 = []
    if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 2):
        version_ge_1_3 = ['-DVERSION_GE_1_3']
    version_ge_1_5 = []
    if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 4):
        version_ge_1_5 = ['-DVERSION_GE_1_5']
    version_dependent_macros = version_ge_1_1 + version_ge_1_3 + version_ge_1_5

    cc_flag = []

    srcpath = pathlib.Path(__file__).parent.absolute()
    buildpath = srcpath / 'build'
    _create_build_dir(buildpath)
    
    hipcc_args = ['-O3'] + version_dependent_macros
    def _cpp_extention_load_helper(name, sources, extra_hip_flags):
        return cpp_extension.load(
            name=name,
            sources=sources,
            build_directory=buildpath,
            extra_include_paths=[this_dir],
            extra_cflags=['-O3', '-D__HIP_PLATFORM_AMD__=1', '-DCMAKE_CXX_COMPILER=g++', '-DCMAKE_C_COMPILER=gcc'],
            extra_cuda_cflags=['-O3', '-D__HIP_PLATFORM_AMD__=1'] + extra_hip_flags + cc_flag,
            verbose=(args.rank == 0)
        )
        #Build Extension in a setup file
        #return CUDAExtension(
        #    name=name,
        #    sources=sources,
        #    include_dirs=[this_dir],
        #    extra_compile_args={'cxx': ['-O3',] + version_dependent_macros,
        #                        'nvcc': hipcc_args})

    # ==============
    # Fused softmax.
    # ==============

    if args.masked_softmax_fusion:
        extra_hip_flags = []
        extra_cuda_flags = ['-U__CUDA_NO_HALF_OPERATORS__',
                            '-U__CUDA_NO_HALF_CONVERSIONS__',
                            '--expt-relaxed-constexpr',
                            '--expt-extended-lambda']
        
        # Upper triangular softmax.
        sources=[srcpath / 'scaled_upper_triang_masked_softmax.cpp',
                 srcpath / 'scaled_upper_triang_masked_softmax_cuda.cu']
        scaled_upper_triang_masked_softmax_cuda = _cpp_extention_load_helper(
            "scaled_upper_triang_masked_softmax_cuda",
            sources, extra_hip_flags)

        # Masked softmax.
        sources=[srcpath / 'scaled_masked_softmax.cpp',
                 srcpath / 'scaled_masked_softmax_cuda.cu']
        scaled_masked_softmax_cuda = _cpp_extention_load_helper(
            "scaled_masked_softmax_cuda", sources, extra_hip_flags)

    # =================================
    # Mixed precision fused layer norm.
    # =================================

    extra_hip_flags = []
    extra_cuda_flags = ['-maxrregcount=50']
    sources=[srcpath / 'layer_norm_cuda.cpp',
             srcpath / 'layer_norm_cuda_kernel.cu']
    fused_mix_prec_layer_norm_cuda = _cpp_extention_load_helper(
        "fused_mix_prec_layer_norm_cuda", sources, extra_hip_flags)

def load(args):

    # Check if cuda 11 is installed for compute capability 8.0
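
The new load_hip builds the same fused softmax and layer-norm sources as load, but with HIP/ROCm-friendly flags (-D__HIP_PLATFORM_AMD__=1) instead of nvcc arch flags. The call site is not part of this diff; a hypothetical dispatch based on the new --HIP argument might look like:

from megatron import fused_kernels

def compile_fused_kernels(args):
    # Hypothetical: route to the HIP build path when --HIP is set,
    # otherwise keep the existing CUDA path.
    if getattr(args, 'HIP', None):
        fused_kernels.load_hip(args)
    else:
        fused_kernels.load(args)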
+10 −2
@@ -250,9 +250,17 @@ void cuWelfordMuSigma2(
template<typename U> U rsqrt(U v) {
  return U(1) / sqrt(v);
}

#if defined __HIP_PLATFORM_AMD__
__device__ float rsqrt(float v) {
  return rsqrtf(v);
}
#else
template<> float rsqrt(float v) {
  return rsqrtf(v);
}
#endif

template<> double rsqrt(double v) {
  return rsqrt(v);
}
@@ -304,7 +312,7 @@ void cuApplyLayerNorm(
  // 1) blockDim.x == warpSize
  // 2) Tensors are contiguous
  //
  for (auto i1=blockIdx.y; i1 < n1; i1 += gridDim.y) {
  for (int i1=blockIdx.y; i1 < n1; i1 += gridDim.y) {
    SharedMemory<U> shared;
    U* buf = shared.getPointer();
    U mu,sigma2;
@@ -543,7 +551,7 @@ void cuComputeGradInput(
    const V* gamma,
    T* grad_input)
{
  for (auto i1=blockIdx.y; i1 < n1; i1 += gridDim.y) {
  for (int i1=blockIdx.y; i1 < n1; i1 += gridDim.y) {
    U sum_loss1 = U(0);
    U sum_loss2 = U(0);
    const U c_mean = mean[i1];