Loading pretrain_crusher_mol_adam.slurm +12 −9 Original line number Diff line number Diff line Loading @@ -2,7 +2,7 @@ #SBATCH -A MED106_crusher #SBATCH -N 1 #SBATCH -t 01:30:00 #SBATCH -t 03:00:00 #SBATCH -J crusher #SBATCH -o %x-%j.out #SBATCH -p batch Loading Loading @@ -31,25 +31,28 @@ ranks_total=$(($ranks_per_node*$SLURM_JOB_NUM_NODES)) export LD_PRELOAD="/opt/cray/pe/gcc/11.2.0/snos/lib64/libstdc++.so.6.0.29 /gpfs/alpine/world-shared/bip214/rocm_smi_lib/build/rocm_smi/librocm_smi64.so" export LD_PRELOAD="${LD_PRELOAD} ${CRAY_MPICH_ROOTDIR}/gtl/lib/libmpi_gtl_hsa.so" # DATASET_0="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000000_smiles_document" DATASET_0="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000000_smiles_document" DATASET_1="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000001_smiles_document" DATASET_2="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000002_smiles_document" DATASET_3="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000003_smiles_document" DATASET_4="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000004_smiles_document" DATASET_5="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000005_smiles_document" # DATASET_6="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part3-000006_smiles_document" # DATASET_7="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part3-000007_smiles_document" # DATASET_8="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part3-000008_smiles_document" # DATASET_9="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part3-000009_smiles_document" DATASET_6="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000006_smiles_document" DATASET_7="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000007_smiles_document" DATASET_8="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000008_smiles_document" DATASET_9="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000009_smiles_document" DATASET="1.0 ${DATASET_1} 1.0 ${DATASET_2} 1.0 ${DATASET_3} 1.0 ${DATASET_4} 1.0 ${DATASET_5}" # DATASET="1.0 ${DATASET_0} 1.0 ${DATASET_1} 1.0 ${DATASET_2} 1.0 ${DATASET_3} 1.0 ${DATASET_4} 1.0 ${DATASET_5} 1.0 ${DATASET_6} 1.0 ${DATASET_7} 1.0 ${DATASET_8} 1.0 ${DATASET_9}" DATASET="1.0 ${DATASET_0} 1.0 ${DATASET_1} 1.0 ${DATASET_2} 1.0 ${DATASET_3} 1.0 ${DATASET_4} 1.0 ${DATASET_5} 1.0 ${DATASET_6} 1.0 ${DATASET_7} 1.0 ${DATASET_8} 1.0 ${DATASET_9}" export VOCAB_FILE=/gpfs/alpine/world-shared/med106/blnchrd/models/bert_metrics_brackets/tokenizer/vocab.txt export CHECKPOINT_PATH=/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/temp2 # srun -u -n $ranks_total -c 8 --gpus-per-task=$gpus_per_rank --gpu-bind=closest bash -c " # source export_DDP_vars.sh # python pretrain_bert.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 1 --num-layers 12 --hidden-size 768 --num-attention-heads 12 --micro-batch-size 256 --global-batch-size 6144 --seq-length 64 --max-position-embeddings 512 --train-iters 6000 --save $CHECKPOINT_PATH --data-path $DATASET --vocab-file $VOCAB_FILE --data-impl mmap --split 949,50,1 --distributed-backend nccl --lr 0.0001 --min-lr 0.00001 --lr-decay-style linear --lr-warmup-fraction .01 --weight-decay 1e-2 --clip-grad 1.0 --log-interval 100 --save-interval 10000 --eval-interval 100 --eval-iters 10 --bert-no-binary-head --num-workers 2 --HIP 1 --fp16" srun -u -n $ranks_total -c 8 --gpus-per-task=$gpus_per_rank --gpu-bind=closest bash -c " source export_DDP_vars.sh python pretrain_bert.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 1 --num-layers 12 --hidden-size 768 --num-attention-heads 12 --micro-batch-size 256 --global-batch-size 6144 --seq-length 64 --max-position-embeddings 512 --train-iters 6000 --save $CHECKPOINT_PATH --data-path $DATASET --vocab-file $VOCAB_FILE --data-impl mmap --split 949,50,1 --distributed-backend nccl --lr 0.0001 --min-lr 0.00001 --lr-decay-style linear --lr-warmup-fraction .01 --weight-decay 1e-2 --clip-grad 1.0 --log-interval 100 --save-interval 10000 --eval-interval 100 --eval-iters 10 --bert-no-binary-head --num-workers 2 --HIP 1 --fp16" python pretrain_bert.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 1 --num-layers 24 --hidden-size 1024 --num-attention-heads 16 --micro-batch-size 96 --global-batch-size 2304 --seq-length 64 --max-position-embeddings 512 --train-iters 6000 --save $CHECKPOINT_PATH --data-path $DATASET --vocab-file $VOCAB_FILE --data-impl mmap --split 949,50,1 --distributed-backend nccl --lr 0.0001 --min-lr 0.00001 --lr-decay-style linear --lr-warmup-fraction .01 --weight-decay 1e-2 --clip-grad 1.0 --log-interval 100 --save-interval 10000 --eval-interval 100 --eval-iters 100 --bert-no-binary-head --num-workers 2 --HIP 1 --fp16" pretrain_crusher_mol_lamb.slurm +13 −9 Original line number Diff line number Diff line Loading @@ -2,7 +2,7 @@ #SBATCH -A MED106_crusher #SBATCH -N 1 #SBATCH -t 01:30:00 #SBATCH -t 02:15:00 #SBATCH -J crusher #SBATCH -o %x-%j.out #SBATCH -p batch Loading Loading @@ -31,25 +31,29 @@ ranks_total=$(($ranks_per_node*$SLURM_JOB_NUM_NODES)) export LD_PRELOAD="/opt/cray/pe/gcc/11.2.0/snos/lib64/libstdc++.so.6.0.29 /gpfs/alpine/world-shared/bip214/rocm_smi_lib/build/rocm_smi/librocm_smi64.so" export LD_PRELOAD="${LD_PRELOAD} ${CRAY_MPICH_ROOTDIR}/gtl/lib/libmpi_gtl_hsa.so" # DATASET_0="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000000_smiles_document" DATASET_0="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000000_smiles_document" DATASET_1="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000001_smiles_document" DATASET_2="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000002_smiles_document" DATASET_3="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000003_smiles_document" DATASET_4="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000004_smiles_document" DATASET_5="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000005_smiles_document" # DATASET_6="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part3-000006_smiles_document" # DATASET_7="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part3-000007_smiles_document" # DATASET_8="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part3-000008_smiles_document" # DATASET_9="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part3-000009_smiles_document" DATASET_6="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000006_smiles_document" DATASET_7="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000007_smiles_document" DATASET_8="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000008_smiles_document" DATASET_9="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000009_smiles_document" DATASET="1.0 ${DATASET_1} 1.0 ${DATASET_2} 1.0 ${DATASET_3} 1.0 ${DATASET_4} 1.0 ${DATASET_5}" # DATASET="1.0 ${DATASET_1} 1.0 ${DATASET_2} 1.0 ${DATASET_3} 1.0 ${DATASET_4} 1.0 ${DATASET_5}" # DATASET="1.0 ${DATASET_0} 1.0 ${DATASET_1} 1.0 ${DATASET_2} 1.0 ${DATASET_3} 1.0 ${DATASET_4} 1.0 ${DATASET_5} 1.0 ${DATASET_6} 1.0 ${DATASET_7} 1.0 ${DATASET_8} 1.0 ${DATASET_9}" DATASET="1.0 ${DATASET_0} 1.0 ${DATASET_1} 1.0 ${DATASET_2} 1.0 ${DATASET_3} 1.0 ${DATASET_4} 1.0 ${DATASET_5} 1.0 ${DATASET_6} 1.0 ${DATASET_7} 1.0 ${DATASET_8} 1.0 ${DATASET_9}" export VOCAB_FILE=/gpfs/alpine/world-shared/med106/blnchrd/models/bert_metrics_brackets/tokenizer/vocab.txt export CHECKPOINT_PATH=/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/temp2 # srun -u -n $ranks_total -c 8 --gpus-per-task=$gpus_per_rank --gpu-bind=closest bash -c " # source export_DDP_vars.sh # python pretrain_bert.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 1 --num-layers 12 --hidden-size 768 --num-attention-heads 12 --optimizer lamb --micro-batch-size 256 --global-batch-size 6144 --seq-length 64 --max-position-embeddings 512 --train-iters 6000 --save $CHECKPOINT_PATH --data-path $DATASET --vocab-file $VOCAB_FILE --data-impl mmap --split 949,50,1 --distributed-backend nccl --lr 0.001 --min-lr 0.00001 --lr-decay-style linear --lr-warmup-fraction .01 --weight-decay 1e-2 --clip-grad 1.0 --log-interval 100 --save-interval 10000 --eval-interval 100 --eval-iters 10 --bert-no-binary-head --num-workers 2 --HIP 1 --fp16" srun -u -n $ranks_total -c 8 --gpus-per-task=$gpus_per_rank --gpu-bind=closest bash -c " source export_DDP_vars.sh python pretrain_bert.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 1 --num-layers 12 --hidden-size 768 --num-attention-heads 12 --optimizer lamb --micro-batch-size 256 --global-batch-size 6144 --seq-length 64 --max-position-embeddings 512 --train-iters 6000 --save $CHECKPOINT_PATH --data-path $DATASET --vocab-file $VOCAB_FILE --data-impl mmap --split 949,50,1 --distributed-backend nccl --lr 0.001 --min-lr 0.00001 --lr-decay-style linear --lr-warmup-fraction .01 --weight-decay 1e-2 --clip-grad 1.0 --log-interval 100 --save-interval 10000 --eval-interval 100 --eval-iters 10 --bert-no-binary-head --num-workers 2 --HIP 1 --fp16" python pretrain_bert.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 1 --num-layers 24 --hidden-size 1024 --num-attention-heads 16 --optimizer lamb --micro-batch-size 96 --global-batch-size 2304 --seq-length 64 --max-position-embeddings 512 --train-iters 6000 --save $CHECKPOINT_PATH --data-path $DATASET --vocab-file $VOCAB_FILE --data-impl mmap --split 949,50,1 --distributed-backend nccl --lr 0.001 --min-lr 0.00001 --lr-decay-style linear --lr-warmup-fraction .01 --weight-decay 1e-2 --clip-grad 1.0 --log-interval 100 --save-interval 10000 --eval-interval 100 --eval-iters 10 --bert-no-binary-head --num-workers 2 --HIP 1 --fp16" Loading
pretrain_crusher_mol_adam.slurm +12 −9 Original line number Diff line number Diff line Loading @@ -2,7 +2,7 @@ #SBATCH -A MED106_crusher #SBATCH -N 1 #SBATCH -t 01:30:00 #SBATCH -t 03:00:00 #SBATCH -J crusher #SBATCH -o %x-%j.out #SBATCH -p batch Loading Loading @@ -31,25 +31,28 @@ ranks_total=$(($ranks_per_node*$SLURM_JOB_NUM_NODES)) export LD_PRELOAD="/opt/cray/pe/gcc/11.2.0/snos/lib64/libstdc++.so.6.0.29 /gpfs/alpine/world-shared/bip214/rocm_smi_lib/build/rocm_smi/librocm_smi64.so" export LD_PRELOAD="${LD_PRELOAD} ${CRAY_MPICH_ROOTDIR}/gtl/lib/libmpi_gtl_hsa.so" # DATASET_0="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000000_smiles_document" DATASET_0="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000000_smiles_document" DATASET_1="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000001_smiles_document" DATASET_2="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000002_smiles_document" DATASET_3="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000003_smiles_document" DATASET_4="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000004_smiles_document" DATASET_5="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000005_smiles_document" # DATASET_6="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part3-000006_smiles_document" # DATASET_7="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part3-000007_smiles_document" # DATASET_8="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part3-000008_smiles_document" # DATASET_9="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part3-000009_smiles_document" DATASET_6="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000006_smiles_document" DATASET_7="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000007_smiles_document" DATASET_8="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000008_smiles_document" DATASET_9="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000009_smiles_document" DATASET="1.0 ${DATASET_1} 1.0 ${DATASET_2} 1.0 ${DATASET_3} 1.0 ${DATASET_4} 1.0 ${DATASET_5}" # DATASET="1.0 ${DATASET_0} 1.0 ${DATASET_1} 1.0 ${DATASET_2} 1.0 ${DATASET_3} 1.0 ${DATASET_4} 1.0 ${DATASET_5} 1.0 ${DATASET_6} 1.0 ${DATASET_7} 1.0 ${DATASET_8} 1.0 ${DATASET_9}" DATASET="1.0 ${DATASET_0} 1.0 ${DATASET_1} 1.0 ${DATASET_2} 1.0 ${DATASET_3} 1.0 ${DATASET_4} 1.0 ${DATASET_5} 1.0 ${DATASET_6} 1.0 ${DATASET_7} 1.0 ${DATASET_8} 1.0 ${DATASET_9}" export VOCAB_FILE=/gpfs/alpine/world-shared/med106/blnchrd/models/bert_metrics_brackets/tokenizer/vocab.txt export CHECKPOINT_PATH=/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/temp2 # srun -u -n $ranks_total -c 8 --gpus-per-task=$gpus_per_rank --gpu-bind=closest bash -c " # source export_DDP_vars.sh # python pretrain_bert.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 1 --num-layers 12 --hidden-size 768 --num-attention-heads 12 --micro-batch-size 256 --global-batch-size 6144 --seq-length 64 --max-position-embeddings 512 --train-iters 6000 --save $CHECKPOINT_PATH --data-path $DATASET --vocab-file $VOCAB_FILE --data-impl mmap --split 949,50,1 --distributed-backend nccl --lr 0.0001 --min-lr 0.00001 --lr-decay-style linear --lr-warmup-fraction .01 --weight-decay 1e-2 --clip-grad 1.0 --log-interval 100 --save-interval 10000 --eval-interval 100 --eval-iters 10 --bert-no-binary-head --num-workers 2 --HIP 1 --fp16" srun -u -n $ranks_total -c 8 --gpus-per-task=$gpus_per_rank --gpu-bind=closest bash -c " source export_DDP_vars.sh python pretrain_bert.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 1 --num-layers 12 --hidden-size 768 --num-attention-heads 12 --micro-batch-size 256 --global-batch-size 6144 --seq-length 64 --max-position-embeddings 512 --train-iters 6000 --save $CHECKPOINT_PATH --data-path $DATASET --vocab-file $VOCAB_FILE --data-impl mmap --split 949,50,1 --distributed-backend nccl --lr 0.0001 --min-lr 0.00001 --lr-decay-style linear --lr-warmup-fraction .01 --weight-decay 1e-2 --clip-grad 1.0 --log-interval 100 --save-interval 10000 --eval-interval 100 --eval-iters 10 --bert-no-binary-head --num-workers 2 --HIP 1 --fp16" python pretrain_bert.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 1 --num-layers 24 --hidden-size 1024 --num-attention-heads 16 --micro-batch-size 96 --global-batch-size 2304 --seq-length 64 --max-position-embeddings 512 --train-iters 6000 --save $CHECKPOINT_PATH --data-path $DATASET --vocab-file $VOCAB_FILE --data-impl mmap --split 949,50,1 --distributed-backend nccl --lr 0.0001 --min-lr 0.00001 --lr-decay-style linear --lr-warmup-fraction .01 --weight-decay 1e-2 --clip-grad 1.0 --log-interval 100 --save-interval 10000 --eval-interval 100 --eval-iters 100 --bert-no-binary-head --num-workers 2 --HIP 1 --fp16"
pretrain_crusher_mol_lamb.slurm +13 −9 Original line number Diff line number Diff line Loading @@ -2,7 +2,7 @@ #SBATCH -A MED106_crusher #SBATCH -N 1 #SBATCH -t 01:30:00 #SBATCH -t 02:15:00 #SBATCH -J crusher #SBATCH -o %x-%j.out #SBATCH -p batch Loading Loading @@ -31,25 +31,29 @@ ranks_total=$(($ranks_per_node*$SLURM_JOB_NUM_NODES)) export LD_PRELOAD="/opt/cray/pe/gcc/11.2.0/snos/lib64/libstdc++.so.6.0.29 /gpfs/alpine/world-shared/bip214/rocm_smi_lib/build/rocm_smi/librocm_smi64.so" export LD_PRELOAD="${LD_PRELOAD} ${CRAY_MPICH_ROOTDIR}/gtl/lib/libmpi_gtl_hsa.so" # DATASET_0="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000000_smiles_document" DATASET_0="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000000_smiles_document" DATASET_1="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000001_smiles_document" DATASET_2="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000002_smiles_document" DATASET_3="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000003_smiles_document" DATASET_4="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000004_smiles_document" DATASET_5="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000005_smiles_document" # DATASET_6="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part3-000006_smiles_document" # DATASET_7="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part3-000007_smiles_document" # DATASET_8="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part3-000008_smiles_document" # DATASET_9="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part3-000009_smiles_document" DATASET_6="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000006_smiles_document" DATASET_7="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000007_smiles_document" DATASET_8="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000008_smiles_document" DATASET_9="/gpfs/alpine/world-shared/med106/gounley1/Megatron-LM/preprocess_data_molecules/part-000009_smiles_document" DATASET="1.0 ${DATASET_1} 1.0 ${DATASET_2} 1.0 ${DATASET_3} 1.0 ${DATASET_4} 1.0 ${DATASET_5}" # DATASET="1.0 ${DATASET_1} 1.0 ${DATASET_2} 1.0 ${DATASET_3} 1.0 ${DATASET_4} 1.0 ${DATASET_5}" # DATASET="1.0 ${DATASET_0} 1.0 ${DATASET_1} 1.0 ${DATASET_2} 1.0 ${DATASET_3} 1.0 ${DATASET_4} 1.0 ${DATASET_5} 1.0 ${DATASET_6} 1.0 ${DATASET_7} 1.0 ${DATASET_8} 1.0 ${DATASET_9}" DATASET="1.0 ${DATASET_0} 1.0 ${DATASET_1} 1.0 ${DATASET_2} 1.0 ${DATASET_3} 1.0 ${DATASET_4} 1.0 ${DATASET_5} 1.0 ${DATASET_6} 1.0 ${DATASET_7} 1.0 ${DATASET_8} 1.0 ${DATASET_9}" export VOCAB_FILE=/gpfs/alpine/world-shared/med106/blnchrd/models/bert_metrics_brackets/tokenizer/vocab.txt export CHECKPOINT_PATH=/gpfs/alpine/world-shared/med106/gounley1/crusher2/Megatron-LM/temp2 # srun -u -n $ranks_total -c 8 --gpus-per-task=$gpus_per_rank --gpu-bind=closest bash -c " # source export_DDP_vars.sh # python pretrain_bert.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 1 --num-layers 12 --hidden-size 768 --num-attention-heads 12 --optimizer lamb --micro-batch-size 256 --global-batch-size 6144 --seq-length 64 --max-position-embeddings 512 --train-iters 6000 --save $CHECKPOINT_PATH --data-path $DATASET --vocab-file $VOCAB_FILE --data-impl mmap --split 949,50,1 --distributed-backend nccl --lr 0.001 --min-lr 0.00001 --lr-decay-style linear --lr-warmup-fraction .01 --weight-decay 1e-2 --clip-grad 1.0 --log-interval 100 --save-interval 10000 --eval-interval 100 --eval-iters 10 --bert-no-binary-head --num-workers 2 --HIP 1 --fp16" srun -u -n $ranks_total -c 8 --gpus-per-task=$gpus_per_rank --gpu-bind=closest bash -c " source export_DDP_vars.sh python pretrain_bert.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 1 --num-layers 12 --hidden-size 768 --num-attention-heads 12 --optimizer lamb --micro-batch-size 256 --global-batch-size 6144 --seq-length 64 --max-position-embeddings 512 --train-iters 6000 --save $CHECKPOINT_PATH --data-path $DATASET --vocab-file $VOCAB_FILE --data-impl mmap --split 949,50,1 --distributed-backend nccl --lr 0.001 --min-lr 0.00001 --lr-decay-style linear --lr-warmup-fraction .01 --weight-decay 1e-2 --clip-grad 1.0 --log-interval 100 --save-interval 10000 --eval-interval 100 --eval-iters 10 --bert-no-binary-head --num-workers 2 --HIP 1 --fp16" python pretrain_bert.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 1 --num-layers 24 --hidden-size 1024 --num-attention-heads 16 --optimizer lamb --micro-batch-size 96 --global-batch-size 2304 --seq-length 64 --max-position-embeddings 512 --train-iters 6000 --save $CHECKPOINT_PATH --data-path $DATASET --vocab-file $VOCAB_FILE --data-impl mmap --split 949,50,1 --distributed-backend nccl --lr 0.001 --min-lr 0.00001 --lr-decay-style linear --lr-warmup-fraction .01 --weight-decay 1e-2 --clip-grad 1.0 --log-interval 100 --save-interval 10000 --eval-interval 100 --eval-iters 10 --bert-no-binary-head --num-workers 2 --HIP 1 --fp16"