Loading pretrain_crusher_mol_adam.slurm +41 −2 Original line number Diff line number Diff line Loading @@ -41,9 +41,48 @@ DATASET_6="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data DATASET_7="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000007_smiles_document" DATASET_8="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000008_smiles_document" DATASET_9="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000009_smiles_document" DATASET_10="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000010_smiles_document" DATASET_11="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000011_smiles_document" DATASET_12="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000012_smiles_document" DATASET_13="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000013_smiles_document" DATASET_14="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000014_smiles_document" DATASET_15="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000015_smiles_document" DATASET_16="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000016_smiles_document" DATASET_17="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000017_smiles_document" DATASET_18="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000018_smiles_document" DATASET_19="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000019_smiles_document" DATASET_20="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000020_smiles_document" DATASET_21="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000021_smiles_document" DATASET_22="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000022_smiles_document" DATASET_23="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000023_smiles_document" DATASET_24="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000024_smiles_document" DATASET_25="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000025_smiles_document" DATASET_26="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000026_smiles_document" DATASET_27="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000027_smiles_document" DATASET_28="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000028_smiles_document" DATASET_29="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000029_smiles_document" DATASET_30="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000030_smiles_document" DATASET_31="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000031_smiles_document" DATASET_32="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000032_smiles_document" DATASET_33="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000033_smiles_document" DATASET_34="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000034_smiles_document" DATASET_35="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000035_smiles_document" DATASET_36="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000036_smiles_document" DATASET_37="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000037_smiles_document" DATASET_38="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000038_smiles_document" DATASET_39="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000039_smiles_document" DATASET_40="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000040_smiles_document" DATASET_41="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000041_smiles_document" DATASET_42="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000042_smiles_document" DATASET_43="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000043_smiles_document" DATASET_44="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000044_smiles_document" DATASET_45="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000045_smiles_document" DATASET_46="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000046_smiles_document" DATASET_47="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000047_smiles_document" DATASET_48="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000048_smiles_document" DATASET_49="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000049_smiles_document" DATASET="1.0 ${DATASET_0} 1.0 ${DATASET_1} 1.0 ${DATASET_2} 1.0 ${DATASET_3} 1.0 ${DATASET_4} 1.0 ${DATASET_5} 1.0 ${DATASET_6} 1.0 ${DATASET_7} 1.0 ${DATASET_8} 1.0 ${DATASET_9}" DATASET="1.0 ${DATASET_0} 1.0 ${DATASET_1} 1.0 ${DATASET_2} 1.0 ${DATASET_3} 1.0 ${DATASET_4} 1.0 ${DATASET_5} 1.0 ${DATASET_6} 1.0 ${DATASET_7} 1.0 ${DATASET_8} 1.0 ${DATASET_9} 1.0 ${DATASET_10} 1.0 ${DATASET_11} 1.0 ${DATASET_12} 1.0 ${DATASET_13} 1.0 ${DATASET_14} 1.0 ${DATASET_15} 1.0 ${DATASET_16} 1.0 ${DATASET_17} 1.0 ${DATASET_18} 1.0 ${DATASET_19} 1.0 ${DATASET_20} 1.0 ${DATASET_21} 1.0 ${DATASET_22} 1.0 ${DATASET_23} 1.0 ${DATASET_24} 1.0 ${DATASET_25} 1.0 ${DATASET_26} 1.0 ${DATASET_27} 1.0 ${DATASET_28} 1.0 ${DATASET_29} 1.0 ${DATASET_30} 1.0 ${DATASET_31} 1.0 ${DATASET_32} 1.0 ${DATASET_33} 1.0 ${DATASET_34} 1.0 ${DATASET_35} 1.0 ${DATASET_36} 1.0 ${DATASET_37} 1.0 ${DATASET_38} 1.0 ${DATASET_39} 1.0 ${DATASET_40} 1.0 ${DATASET_41} 1.0 ${DATASET_42} 1.0 ${DATASET_43} 1.0 ${DATASET_44} 1.0 ${DATASET_45} 1.0 ${DATASET_46} 1.0 ${DATASET_47} 1.0 ${DATASET_48} 1.0 ${DATASET_49}" export VOCAB_FILE=/gpfs/alpine/world-shared/med106/blnchrd/models/bert_metrics_brackets/tokenizer/vocab.txt export CHECKPOINT_PATH=/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/temp2 Loading pretrain_crusher_mol_lamb.slurm +64 −14 Original line number Diff line number Diff line #!/bin/bash #SBATCH -A MED106_crusher #SBATCH -N 1 #SBATCH -t 02:15:00 #SBATCH -N 2 #SBATCH -t 06:00:00 #SBATCH -J crusher #SBATCH -o %x-%j.out #SBATCH -p batch Loading Loading @@ -31,18 +31,64 @@ ranks_total=$(($ranks_per_node*$SLURM_JOB_NUM_NODES)) export LD_PRELOAD="/opt/cray/pe/gcc/11.2.0/snos/lib64/libstdc++.so.6.0.29 /gpfs/alpine/world-shared/bip214/rocm_smi_lib/build/rocm_smi/librocm_smi64.so" export LD_PRELOAD="${LD_PRELOAD} ${CRAY_MPICH_ROOTDIR}/gtl/lib/libmpi_gtl_hsa.so" DATASET_0="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000000_smiles_document" DATASET_1="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000001_smiles_document" DATASET_2="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000002_smiles_document" DATASET_3="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000003_smiles_document" DATASET_4="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000004_smiles_document" DATASET_5="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000005_smiles_document" DATASET_6="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000006_smiles_document" DATASET_7="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000007_smiles_document" DATASET_8="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000008_smiles_document" DATASET_9="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000009_smiles_document" export LD_PRELOAD="$LD_PRELOAD /lib64/libtinfo.so.6 /lib64/libncurses.so.6 /gpfs/alpine/world-shared/bif136/rccl/build/librccl.so" export NCCL_NET_GDR_LEVEL=4 export FI_CXI_ATS=0 export LD_LIBRARY_PATH=/gpfs/alpine/proj-shared/bif136/aws-ofi-rccl/src/.libs:/opt/cray/libfabric/1.15.0.0/lib64:$LD_LIBRARY_PATH export NCCL_DEBUG=info # DATASET="1.0 ${DATASET_1} 1.0 ${DATASET_2} 1.0 ${DATASET_3} 1.0 ${DATASET_4} 1.0 ${DATASET_5}" DATASET_0="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000000.json_smiles_document" DATASET_1="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000001.json_smiles_document" DATASET_2="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000002.json_smiles_document" DATASET_3="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000003.json_smiles_document" DATASET_4="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000004.json_smiles_document" DATASET_5="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000005.json_smiles_document" DATASET_6="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000006.json_smiles_document" DATASET_7="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000007.json_smiles_document" DATASET_8="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000008.json_smiles_document" DATASET_9="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000009.json_smiles_document" DATASET_10="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000010.json_smiles_document" DATASET_11="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000011.json_smiles_document" DATASET_12="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000012.json_smiles_document" DATASET_13="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000013.json_smiles_document" DATASET_14="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000014.json_smiles_document" DATASET_15="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000015.json_smiles_document" DATASET_16="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000016.json_smiles_document" DATASET_17="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000017.json_smiles_document" DATASET_18="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000018.json_smiles_document" DATASET_19="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000019.json_smiles_document" DATASET_20="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000020.json_smiles_document" DATASET_21="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000021.json_smiles_document" DATASET_22="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000022.json_smiles_document" DATASET_23="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000023.json_smiles_document" DATASET_24="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000024.json_smiles_document" DATASET_25="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000025.json_smiles_document" DATASET_26="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000026.json_smiles_document" DATASET_27="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000027.json_smiles_document" DATASET_28="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000028.json_smiles_document" DATASET_29="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000029.json_smiles_document" DATASET_30="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000030.json_smiles_document" DATASET_31="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000031.json_smiles_document" DATASET_32="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000032.json_smiles_document" DATASET_33="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000033.json_smiles_document" DATASET_34="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000034.json_smiles_document" DATASET_35="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000035.json_smiles_document" DATASET_36="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000036.json_smiles_document" DATASET_37="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000037.json_smiles_document" DATASET_38="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000038.json_smiles_document" DATASET_39="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000039.json_smiles_document" DATASET_40="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000040.json_smiles_document" DATASET_41="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000041.json_smiles_document" DATASET_42="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000042.json_smiles_document" DATASET_43="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000043.json_smiles_document" DATASET_44="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000044.json_smiles_document" DATASET_45="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000045.json_smiles_document" DATASET_46="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000046.json_smiles_document" DATASET_47="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000047.json_smiles_document" DATASET_48="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000048.json_smiles_document" DATASET_49="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000049.json_smiles_document" # DATASET="1.0 ${DATASET_0} 1.0 ${DATASET_1} 1.0 ${DATASET_2} 1.0 ${DATASET_3} 1.0 ${DATASET_4} 1.0 ${DATASET_5} 1.0 ${DATASET_6} 1.0 ${DATASET_7} 1.0 ${DATASET_8} 1.0 ${DATASET_9} 1.0 ${DATASET_10} 1.0 ${DATASET_11} 1.0 ${DATASET_12} 1.0 ${DATASET_13} 1.0 ${DATASET_14} 1.0 ${DATASET_15} 1.0 ${DATASET_16} 1.0 ${DATASET_17} 1.0 ${DATASET_18} 1.0 ${DATASET_19} 1.0 ${DATASET_20} 1.0 ${DATASET_21} 1.0 ${DATASET_22} 1.0 ${DATASET_23} 1.0 ${DATASET_24} 1.0 ${DATASET_25} 1.0 ${DATASET_26} 1.0 ${DATASET_27} 1.0 ${DATASET_28} 1.0 ${DATASET_29} 1.0 ${DATASET_30} 1.0 ${DATASET_31} 1.0 ${DATASET_32} 1.0 ${DATASET_33} 1.0 ${DATASET_34} 1.0 ${DATASET_35} 1.0 ${DATASET_36} 1.0 ${DATASET_37} 1.0 ${DATASET_38} 1.0 ${DATASET_39} 1.0 ${DATASET_40} 1.0 ${DATASET_41} 1.0 ${DATASET_42} 1.0 ${DATASET_43} 1.0 ${DATASET_44} 1.0 ${DATASET_45} 1.0 ${DATASET_46} 1.0 ${DATASET_47} 1.0 ${DATASET_48} 1.0 ${DATASET_49}" DATASET="1.0 ${DATASET_0} 1.0 ${DATASET_1} 1.0 ${DATASET_2} 1.0 ${DATASET_3} 1.0 ${DATASET_4} 1.0 ${DATASET_5} 1.0 ${DATASET_6} 1.0 ${DATASET_7} 1.0 ${DATASET_8} 1.0 ${DATASET_9}" Loading @@ -53,7 +99,11 @@ export CHECKPOINT_PATH=/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatr # source export_DDP_vars.sh # python pretrain_bert.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 1 --num-layers 12 --hidden-size 768 --num-attention-heads 12 --optimizer lamb --micro-batch-size 256 --global-batch-size 6144 --seq-length 64 --max-position-embeddings 512 --train-iters 6000 --save $CHECKPOINT_PATH --data-path $DATASET --vocab-file $VOCAB_FILE --data-impl mmap --split 949,50,1 --distributed-backend nccl --lr 0.001 --min-lr 0.00001 --lr-decay-style linear --lr-warmup-fraction .01 --weight-decay 1e-2 --clip-grad 1.0 --log-interval 100 --save-interval 10000 --eval-interval 100 --eval-iters 10 --bert-no-binary-head --num-workers 2 --HIP 1 --fp16" # srun -u -n $ranks_total -c 8 --gpus-per-task=$gpus_per_rank --gpu-bind=closest bash -c " # source export_DDP_vars.sh # python pretrain_bert.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 1 --num-layers 24 --hidden-size 1024 --num-attention-heads 16 --optimizer lamb --micro-batch-size 96 --global-batch-size 4608 --seq-length 64 --max-position-embeddings 512 --train-iters 6000 --save $CHECKPOINT_PATH --data-path $DATASET --vocab-file $VOCAB_FILE --data-impl mmap --split 949,50,1 --distributed-backend nccl --lr 0.001 --min-lr 0.00001 --lr-decay-style linear --lr-warmup-fraction .01 --weight-decay 1e-2 --clip-grad 1.0 --log-interval 100 --save-interval 10000 --eval-interval 100 --eval-iters 10 --bert-no-binary-head --num-workers 2 --HIP 1 --fp16" srun -u -n $ranks_total -c 8 --gpus-per-task=$gpus_per_rank --gpu-bind=closest bash -c " source export_DDP_vars.sh python pretrain_bert.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 1 --num-layers 24 --hidden-size 1024 --num-attention-heads 16 --optimizer lamb --micro-batch-size 96 --global-batch-size 2304 --seq-length 64 --max-position-embeddings 512 --train-iters 6000 --save $CHECKPOINT_PATH --data-path $DATASET --vocab-file $VOCAB_FILE --data-impl mmap --split 949,50,1 --distributed-backend nccl --lr 0.001 --min-lr 0.00001 --lr-decay-style linear --lr-warmup-fraction .01 --weight-decay 1e-2 --clip-grad 1.0 --log-interval 100 --save-interval 10000 --eval-interval 100 --eval-iters 10 --bert-no-binary-head --num-workers 2 --HIP 1 --fp16" python pretrain_bert.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 1 --num-layers 24 --hidden-size 2048 --num-attention-heads 32 --optimizer lamb --micro-batch-size 48 --global-batch-size 2304 --seq-length 64 --max-position-embeddings 512 --train-iters 6000 --save $CHECKPOINT_PATH --data-path $DATASET --vocab-file $VOCAB_FILE --data-impl mmap --split 949,50,1 --distributed-backend nccl --lr 0.001 --min-lr 0.00001 --lr-decay-style linear --lr-warmup-fraction .01 --weight-decay 1e-2 --clip-grad 1.0 --log-interval 100 --save-interval 10000 --eval-interval 100 --eval-iters 10 --bert-no-binary-head --num-workers 2 --HIP 1 --fp16" Loading
pretrain_crusher_mol_adam.slurm +41 −2 Original line number Diff line number Diff line Loading @@ -41,9 +41,48 @@ DATASET_6="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data DATASET_7="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000007_smiles_document" DATASET_8="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000008_smiles_document" DATASET_9="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000009_smiles_document" DATASET_10="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000010_smiles_document" DATASET_11="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000011_smiles_document" DATASET_12="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000012_smiles_document" DATASET_13="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000013_smiles_document" DATASET_14="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000014_smiles_document" DATASET_15="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000015_smiles_document" DATASET_16="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000016_smiles_document" DATASET_17="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000017_smiles_document" DATASET_18="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000018_smiles_document" DATASET_19="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000019_smiles_document" DATASET_20="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000020_smiles_document" DATASET_21="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000021_smiles_document" DATASET_22="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000022_smiles_document" DATASET_23="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000023_smiles_document" DATASET_24="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000024_smiles_document" DATASET_25="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000025_smiles_document" DATASET_26="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000026_smiles_document" DATASET_27="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000027_smiles_document" DATASET_28="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000028_smiles_document" DATASET_29="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000029_smiles_document" DATASET_30="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000030_smiles_document" DATASET_31="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000031_smiles_document" DATASET_32="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000032_smiles_document" DATASET_33="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000033_smiles_document" DATASET_34="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000034_smiles_document" DATASET_35="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000035_smiles_document" DATASET_36="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000036_smiles_document" DATASET_37="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000037_smiles_document" DATASET_38="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000038_smiles_document" DATASET_39="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000039_smiles_document" DATASET_40="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000040_smiles_document" DATASET_41="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000041_smiles_document" DATASET_42="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000042_smiles_document" DATASET_43="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000043_smiles_document" DATASET_44="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000044_smiles_document" DATASET_45="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000045_smiles_document" DATASET_46="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000046_smiles_document" DATASET_47="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000047_smiles_document" DATASET_48="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000048_smiles_document" DATASET_49="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000049_smiles_document" DATASET="1.0 ${DATASET_0} 1.0 ${DATASET_1} 1.0 ${DATASET_2} 1.0 ${DATASET_3} 1.0 ${DATASET_4} 1.0 ${DATASET_5} 1.0 ${DATASET_6} 1.0 ${DATASET_7} 1.0 ${DATASET_8} 1.0 ${DATASET_9}" DATASET="1.0 ${DATASET_0} 1.0 ${DATASET_1} 1.0 ${DATASET_2} 1.0 ${DATASET_3} 1.0 ${DATASET_4} 1.0 ${DATASET_5} 1.0 ${DATASET_6} 1.0 ${DATASET_7} 1.0 ${DATASET_8} 1.0 ${DATASET_9} 1.0 ${DATASET_10} 1.0 ${DATASET_11} 1.0 ${DATASET_12} 1.0 ${DATASET_13} 1.0 ${DATASET_14} 1.0 ${DATASET_15} 1.0 ${DATASET_16} 1.0 ${DATASET_17} 1.0 ${DATASET_18} 1.0 ${DATASET_19} 1.0 ${DATASET_20} 1.0 ${DATASET_21} 1.0 ${DATASET_22} 1.0 ${DATASET_23} 1.0 ${DATASET_24} 1.0 ${DATASET_25} 1.0 ${DATASET_26} 1.0 ${DATASET_27} 1.0 ${DATASET_28} 1.0 ${DATASET_29} 1.0 ${DATASET_30} 1.0 ${DATASET_31} 1.0 ${DATASET_32} 1.0 ${DATASET_33} 1.0 ${DATASET_34} 1.0 ${DATASET_35} 1.0 ${DATASET_36} 1.0 ${DATASET_37} 1.0 ${DATASET_38} 1.0 ${DATASET_39} 1.0 ${DATASET_40} 1.0 ${DATASET_41} 1.0 ${DATASET_42} 1.0 ${DATASET_43} 1.0 ${DATASET_44} 1.0 ${DATASET_45} 1.0 ${DATASET_46} 1.0 ${DATASET_47} 1.0 ${DATASET_48} 1.0 ${DATASET_49}" export VOCAB_FILE=/gpfs/alpine/world-shared/med106/blnchrd/models/bert_metrics_brackets/tokenizer/vocab.txt export CHECKPOINT_PATH=/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/temp2 Loading
pretrain_crusher_mol_lamb.slurm +64 −14 Original line number Diff line number Diff line #!/bin/bash #SBATCH -A MED106_crusher #SBATCH -N 1 #SBATCH -t 02:15:00 #SBATCH -N 2 #SBATCH -t 06:00:00 #SBATCH -J crusher #SBATCH -o %x-%j.out #SBATCH -p batch Loading Loading @@ -31,18 +31,64 @@ ranks_total=$(($ranks_per_node*$SLURM_JOB_NUM_NODES)) export LD_PRELOAD="/opt/cray/pe/gcc/11.2.0/snos/lib64/libstdc++.so.6.0.29 /gpfs/alpine/world-shared/bip214/rocm_smi_lib/build/rocm_smi/librocm_smi64.so" export LD_PRELOAD="${LD_PRELOAD} ${CRAY_MPICH_ROOTDIR}/gtl/lib/libmpi_gtl_hsa.so" DATASET_0="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000000_smiles_document" DATASET_1="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000001_smiles_document" DATASET_2="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000002_smiles_document" DATASET_3="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000003_smiles_document" DATASET_4="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000004_smiles_document" DATASET_5="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000005_smiles_document" DATASET_6="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000006_smiles_document" DATASET_7="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000007_smiles_document" DATASET_8="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000008_smiles_document" DATASET_9="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000009_smiles_document" export LD_PRELOAD="$LD_PRELOAD /lib64/libtinfo.so.6 /lib64/libncurses.so.6 /gpfs/alpine/world-shared/bif136/rccl/build/librccl.so" export NCCL_NET_GDR_LEVEL=4 export FI_CXI_ATS=0 export LD_LIBRARY_PATH=/gpfs/alpine/proj-shared/bif136/aws-ofi-rccl/src/.libs:/opt/cray/libfabric/1.15.0.0/lib64:$LD_LIBRARY_PATH export NCCL_DEBUG=info # DATASET="1.0 ${DATASET_1} 1.0 ${DATASET_2} 1.0 ${DATASET_3} 1.0 ${DATASET_4} 1.0 ${DATASET_5}" DATASET_0="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000000.json_smiles_document" DATASET_1="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000001.json_smiles_document" DATASET_2="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000002.json_smiles_document" DATASET_3="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000003.json_smiles_document" DATASET_4="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000004.json_smiles_document" DATASET_5="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000005.json_smiles_document" DATASET_6="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000006.json_smiles_document" DATASET_7="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000007.json_smiles_document" DATASET_8="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000008.json_smiles_document" DATASET_9="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000009.json_smiles_document" DATASET_10="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000010.json_smiles_document" DATASET_11="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000011.json_smiles_document" DATASET_12="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000012.json_smiles_document" DATASET_13="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000013.json_smiles_document" DATASET_14="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000014.json_smiles_document" DATASET_15="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000015.json_smiles_document" DATASET_16="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000016.json_smiles_document" DATASET_17="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000017.json_smiles_document" DATASET_18="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000018.json_smiles_document" DATASET_19="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000019.json_smiles_document" DATASET_20="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000020.json_smiles_document" DATASET_21="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000021.json_smiles_document" DATASET_22="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000022.json_smiles_document" DATASET_23="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000023.json_smiles_document" DATASET_24="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000024.json_smiles_document" DATASET_25="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000025.json_smiles_document" DATASET_26="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000026.json_smiles_document" DATASET_27="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000027.json_smiles_document" DATASET_28="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000028.json_smiles_document" DATASET_29="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000029.json_smiles_document" DATASET_30="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000030.json_smiles_document" DATASET_31="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000031.json_smiles_document" DATASET_32="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000032.json_smiles_document" DATASET_33="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000033.json_smiles_document" DATASET_34="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000034.json_smiles_document" DATASET_35="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000035.json_smiles_document" DATASET_36="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000036.json_smiles_document" DATASET_37="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000037.json_smiles_document" DATASET_38="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000038.json_smiles_document" DATASET_39="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000039.json_smiles_document" DATASET_40="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000040.json_smiles_document" DATASET_41="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000041.json_smiles_document" DATASET_42="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000042.json_smiles_document" DATASET_43="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000043.json_smiles_document" DATASET_44="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000044.json_smiles_document" DATASET_45="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000045.json_smiles_document" DATASET_46="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000046.json_smiles_document" DATASET_47="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000047.json_smiles_document" DATASET_48="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000048.json_smiles_document" DATASET_49="/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/preprocess_data/part-000049.json_smiles_document" # DATASET="1.0 ${DATASET_0} 1.0 ${DATASET_1} 1.0 ${DATASET_2} 1.0 ${DATASET_3} 1.0 ${DATASET_4} 1.0 ${DATASET_5} 1.0 ${DATASET_6} 1.0 ${DATASET_7} 1.0 ${DATASET_8} 1.0 ${DATASET_9} 1.0 ${DATASET_10} 1.0 ${DATASET_11} 1.0 ${DATASET_12} 1.0 ${DATASET_13} 1.0 ${DATASET_14} 1.0 ${DATASET_15} 1.0 ${DATASET_16} 1.0 ${DATASET_17} 1.0 ${DATASET_18} 1.0 ${DATASET_19} 1.0 ${DATASET_20} 1.0 ${DATASET_21} 1.0 ${DATASET_22} 1.0 ${DATASET_23} 1.0 ${DATASET_24} 1.0 ${DATASET_25} 1.0 ${DATASET_26} 1.0 ${DATASET_27} 1.0 ${DATASET_28} 1.0 ${DATASET_29} 1.0 ${DATASET_30} 1.0 ${DATASET_31} 1.0 ${DATASET_32} 1.0 ${DATASET_33} 1.0 ${DATASET_34} 1.0 ${DATASET_35} 1.0 ${DATASET_36} 1.0 ${DATASET_37} 1.0 ${DATASET_38} 1.0 ${DATASET_39} 1.0 ${DATASET_40} 1.0 ${DATASET_41} 1.0 ${DATASET_42} 1.0 ${DATASET_43} 1.0 ${DATASET_44} 1.0 ${DATASET_45} 1.0 ${DATASET_46} 1.0 ${DATASET_47} 1.0 ${DATASET_48} 1.0 ${DATASET_49}" DATASET="1.0 ${DATASET_0} 1.0 ${DATASET_1} 1.0 ${DATASET_2} 1.0 ${DATASET_3} 1.0 ${DATASET_4} 1.0 ${DATASET_5} 1.0 ${DATASET_6} 1.0 ${DATASET_7} 1.0 ${DATASET_8} 1.0 ${DATASET_9}" Loading @@ -53,7 +99,11 @@ export CHECKPOINT_PATH=/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatr # source export_DDP_vars.sh # python pretrain_bert.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 1 --num-layers 12 --hidden-size 768 --num-attention-heads 12 --optimizer lamb --micro-batch-size 256 --global-batch-size 6144 --seq-length 64 --max-position-embeddings 512 --train-iters 6000 --save $CHECKPOINT_PATH --data-path $DATASET --vocab-file $VOCAB_FILE --data-impl mmap --split 949,50,1 --distributed-backend nccl --lr 0.001 --min-lr 0.00001 --lr-decay-style linear --lr-warmup-fraction .01 --weight-decay 1e-2 --clip-grad 1.0 --log-interval 100 --save-interval 10000 --eval-interval 100 --eval-iters 10 --bert-no-binary-head --num-workers 2 --HIP 1 --fp16" # srun -u -n $ranks_total -c 8 --gpus-per-task=$gpus_per_rank --gpu-bind=closest bash -c " # source export_DDP_vars.sh # python pretrain_bert.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 1 --num-layers 24 --hidden-size 1024 --num-attention-heads 16 --optimizer lamb --micro-batch-size 96 --global-batch-size 4608 --seq-length 64 --max-position-embeddings 512 --train-iters 6000 --save $CHECKPOINT_PATH --data-path $DATASET --vocab-file $VOCAB_FILE --data-impl mmap --split 949,50,1 --distributed-backend nccl --lr 0.001 --min-lr 0.00001 --lr-decay-style linear --lr-warmup-fraction .01 --weight-decay 1e-2 --clip-grad 1.0 --log-interval 100 --save-interval 10000 --eval-interval 100 --eval-iters 10 --bert-no-binary-head --num-workers 2 --HIP 1 --fp16" srun -u -n $ranks_total -c 8 --gpus-per-task=$gpus_per_rank --gpu-bind=closest bash -c " source export_DDP_vars.sh python pretrain_bert.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 1 --num-layers 24 --hidden-size 1024 --num-attention-heads 16 --optimizer lamb --micro-batch-size 96 --global-batch-size 2304 --seq-length 64 --max-position-embeddings 512 --train-iters 6000 --save $CHECKPOINT_PATH --data-path $DATASET --vocab-file $VOCAB_FILE --data-impl mmap --split 949,50,1 --distributed-backend nccl --lr 0.001 --min-lr 0.00001 --lr-decay-style linear --lr-warmup-fraction .01 --weight-decay 1e-2 --clip-grad 1.0 --log-interval 100 --save-interval 10000 --eval-interval 100 --eval-iters 10 --bert-no-binary-head --num-workers 2 --HIP 1 --fp16" python pretrain_bert.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 1 --num-layers 24 --hidden-size 2048 --num-attention-heads 32 --optimizer lamb --micro-batch-size 48 --global-batch-size 2304 --seq-length 64 --max-position-embeddings 512 --train-iters 6000 --save $CHECKPOINT_PATH --data-path $DATASET --vocab-file $VOCAB_FILE --data-impl mmap --split 949,50,1 --distributed-backend nccl --lr 0.001 --min-lr 0.00001 --lr-decay-style linear --lr-warmup-fraction .01 --weight-decay 1e-2 --clip-grad 1.0 --log-interval 100 --save-interval 10000 --eval-interval 100 --eval-iters 10 --bert-no-binary-head --num-workers 2 --HIP 1 --fp16"