#!/bin/bash
# pretrain_crusher_mol_lamb.slurm
#
# SLURM batch script: Megatron-LM BERT pretraining on SMILES molecular data
# with the LAMB optimizer, on OLCF Crusher (AMD GPUs / ROCm, Cray MPICH,
# Slingshot/CXI fabric).
#
# NOTE(review): this file was recovered from a pasted GitHub diff view that
# was duplicated and had its formatting collapsed. The diff view collapsed
# several regions ("Loading ..."), so content referenced below — module
# loads, and the DATASET / VOCAB_FILE / CHECKPOINT_PATH definitions — is
# missing from this paste and must be restored from the repository before
# this script is run.

#SBATCH -A MED106_crusher
#SBATCH -N 16
#SBATCH -t 00:25:00
#SBATCH -J crusher
#SBATCH -o %x-%j.out
#SBATCH -p batch

export CXX=g++

# Process geometry: 8 ranks per node, one GPU (GCD) per rank.
ranks_per_node=8
gpus_per_rank=$((8 / ranks_per_node))
ranks_total=$((ranks_per_node * SLURM_JOB_NUM_NODES))  # SLURM_JOB_NUM_NODES is set by SLURM at job start

# Earlier preload configuration (Cray libstdc++ + rocm_smi + MPICH GTL),
# kept for reference:
# export LD_PRELOAD="/opt/cray/pe/gcc/11.2.0/snos/lib64/libstdc++.so.6 /gpfs/alpine/world-shared/bip214/rocm_smi_lib/build/rocm_smi/librocm_smi64.so"
# export LD_PRELOAD="/opt/cray/pe/gcc/11.2.0/snos/lib64/libstdc++.so.6.0.29 /gpfs/alpine/world-shared/bip214/rocm_smi_lib/build/rocm_smi/librocm_smi64.so"
# export LD_PRELOAD="${LD_PRELOAD} ${CRAY_MPICH_ROOTDIR}/gtl/lib/libmpi_gtl_hsa.so"

# Active configuration: custom RCCL build plus the aws-ofi-rccl plugin over
# libfabric/CXI; GPU-direct RDMA enabled, CXI address-translation disabled.
export LD_PRELOAD="$LD_PRELOAD /lib64/libtinfo.so.6 /lib64/libncurses.so.6 /gpfs/alpine/world-shared/bif136/rccl/build/librccl.so"
export NCCL_NET_GDR_LEVEL=4
export FI_CXI_ATS=0
export LD_LIBRARY_PATH=/gpfs/alpine/proj-shared/bif136/aws-ofi-rccl/src/.libs:/opt/cray/libfabric/1.15.0.0/lib64:$LD_LIBRARY_PATH
export NCCL_DEBUG=info

# Alternative library / debug settings, kept for reference:
# export NCCL_DEBUG=INFO
# export FI_CXI_ATS=0
# export LD_LIBRARY_PATH=/opt/rocm-5.2.0/rccl:/gpfs/alpine/med106/world-shared/gounley1/crusher2/aws-ofi-rccl/src/.libs:/gpfs/alpine/world-shared/med106/gounley1/crusher2/aws-ofi-rccl-build:/opt/cray/libfabric/1.15.0.0/lib64/:/opt/rocm-5.2.0/lib:$LD_LIBRARY_PATH
# export FI_LOG_LEVEL=info
# export NCCL_NET_GDR_LEVEL=3

DATASET_0="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000000.json_smiles_document"
DATASET_1="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000001.json_smiles_document"

# NOTE(review): collapsed diff region here — the DATASET, VOCAB_FILE, and
# CHECKPOINT_PATH assignments (the hunk context shows
# "export CHECKPOINT_PATH=/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatr...",
# truncated) were not captured by this paste; restore them from the repository.

# ---------------------------------------------------------------------------
# Alternative (commented-out) model configurations, kept for reference.
# Each uses the same launch pattern as the active run at the bottom.
# ---------------------------------------------------------------------------

# 12 layers / hidden 768 / 12 heads:
# srun -u -n $ranks_total -c 8 --gpus-per-task=$gpus_per_rank --gpu-bind=closest bash -c "
# source export_DDP_vars.sh
# python pretrain_bert.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 1 --num-layers 12 --hidden-size 768 --num-attention-heads 12 --optimizer lamb --micro-batch-size 256 --global-batch-size 6144 --seq-length 64 --max-position-embeddings 512 --train-iters 6000 --save $CHECKPOINT_PATH --data-path $DATASET --vocab-file $VOCAB_FILE --data-impl mmap --split 949,50,1 --distributed-backend nccl --lr 0.001 --min-lr 0.00001 --lr-decay-style linear --lr-warmup-fraction .01 --weight-decay 1e-2 --clip-grad 1.0 --log-interval 100 --save-interval 10000 --eval-interval 100 --eval-iters 10 --bert-no-binary-head --num-workers 2 --HIP 1 --fp16"
# python pretrain_bert.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 1 --num-layers 12 --hidden-size 768 --num-attention-heads 12 --optimizer lamb --micro-batch-size 256 --global-batch-size 12288 --seq-length 64 --max-position-embeddings 512 --train-iters 300 --save $CHECKPOINT_PATH --data-path $DATASET --vocab-file $VOCAB_FILE --data-impl mmap --split 949,50,1 --distributed-backend nccl --lr 0.001 --min-lr 0.00001 --lr-decay-style linear --lr-warmup-fraction .01 --weight-decay 1e-2 --clip-grad 1.0 --log-interval 100 --save-interval 10000 --eval-interval 100 --eval-iters 10 --bert-no-binary-head --num-workers 2 --HIP 1 --fp16"

# 24 layers / hidden 1024 / 16 heads:
# srun -u -n $ranks_total -c 8 --gpus-per-task=$gpus_per_rank --gpu-bind=closest bash -c "
# source export_DDP_vars.sh
# python pretrain_bert.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 1 --num-layers 24 --hidden-size 1024 --num-attention-heads 16 --optimizer lamb --micro-batch-size 96 --global-batch-size 4608 --seq-length 64 --max-position-embeddings 512 --train-iters 6000 --save $CHECKPOINT_PATH --data-path $DATASET --vocab-file $VOCAB_FILE --data-impl mmap --split 949,50,1 --distributed-backend nccl --lr 0.001 --min-lr 0.00001 --lr-decay-style linear --lr-warmup-fraction .01 --weight-decay 1e-2 --clip-grad 1.0 --log-interval 100 --save-interval 10000 --eval-interval 100 --eval-iters 10 --bert-no-binary-head --num-workers 2 --HIP 1 --fp16"

# 24 layers / hidden 2048 / 32 heads:
# srun -u -n $ranks_total -c 8 --gpus-per-task=$gpus_per_rank --gpu-bind=closest bash -c "
# source export_DDP_vars.sh
# python pretrain_bert.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 1 --num-layers 24 --hidden-size 2048 --num-attention-heads 32 --optimizer lamb --micro-batch-size 48 --global-batch-size 4608 --seq-length 64 --max-position-embeddings 512 --train-iters 6000 --save $CHECKPOINT_PATH --data-path $DATASET --vocab-file $VOCAB_FILE --data-impl mmap --split 949,50,1 --distributed-backend nccl --lr 0.001 --min-lr 0.00001 --lr-decay-style linear --lr-warmup-fraction .01 --weight-decay 1e-2 --clip-grad 1.0 --log-interval 100 --save-interval 10000 --eval-interval 100 --eval-iters 10 --bert-no-binary-head --num-workers 2 --HIP 1 --fp16"

# 48 layers / hidden 2560 / 40 heads, 2-stage pipeline:
# srun -u -n $ranks_total -c 8 --gpus-per-task=$gpus_per_rank --gpu-bind=closest bash -c "
# source export_DDP_vars.sh
# python pretrain_bert.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --num-layers 48 --hidden-size 2560 --num-attention-heads 40 --optimizer lamb --micro-batch-size 64 --global-batch-size 24576 --seq-length 64 --max-position-embeddings 512 --train-iters 1200 --save $CHECKPOINT_PATH --data-path $DATASET --vocab-file $VOCAB_FILE --data-impl mmap --split 949,50,1 --distributed-backend nccl --lr 0.001 --min-lr 0.00001 --lr-decay-style linear --lr-warmup-fraction .01 --weight-decay 1e-2 --clip-grad 1.0 --log-interval 100 --save-interval 10000 --eval-interval 100 --eval-iters 10 --bert-no-binary-head --num-workers 2 --HIP 1 --fp16"

# 13.17B — 64 layers / hidden 4096 / 64 heads, 4-stage pipeline:
# srun -u -n $ranks_total -c 8 --gpus-per-task=$gpus_per_rank --gpu-bind=closest bash -c "
# source export_DDP_vars.sh
# python pretrain_bert.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 4 --num-layers 64 --hidden-size 4096 --num-attention-heads 64 --optimizer lamb --micro-batch-size 16 --global-batch-size 128 --seq-length 64 --max-position-embeddings 512 --train-iters 1200 --save $CHECKPOINT_PATH --data-path $DATASET --vocab-file $VOCAB_FILE --data-impl mmap --split 949,50,1 --distributed-backend nccl --lr 0.001 --min-lr 0.00001 --lr-decay-style linear --lr-warmup-fraction .01 --weight-decay 1e-2 --clip-grad 1.0 --log-interval 20 --save-interval 10000 --eval-interval 20 --eval-iters 10 --bert-no-binary-head --num-workers 2 --HIP 1 --fp16"

# ---------------------------------------------------------------------------
# Active run: 25.5B — 80 layers / hidden 5120 / 80 heads,
# tensor-parallel 2 x pipeline-parallel 4.
# NOTE(review): reconstructed from the "new" side of the diff — the previously
# active 24-layer (global-batch 2304) launch appeared to be replaced by this
# one; confirm against the repository version.
# ---------------------------------------------------------------------------
srun -u -n $ranks_total -c 8 --gpus-per-task=$gpus_per_rank --gpu-bind=closest bash -c "
source export_DDP_vars.sh
python pretrain_bert.py --tensor-model-parallel-size 2 --pipeline-model-parallel-size 4 --num-layers 80 --hidden-size 5120 --num-attention-heads 80 --optimizer lamb --micro-batch-size 16 --global-batch-size 4096 --seq-length 64 --max-position-embeddings 512 --train-iters 5000 --save $CHECKPOINT_PATH --data-path $DATASET --vocab-file $VOCAB_FILE --data-impl mmap --split 949,50,1 --distributed-backend nccl --lr 0.001 --min-lr 0.00001 --lr-decay-style linear --lr-warmup-fraction .01 --weight-decay 1e-2 --clip-grad 1.0 --log-interval 20 --save-interval 10000 --eval-interval 50 --eval-iters 20 --bert-no-binary-head --num-workers 2 --HIP 1 --fp16"