job.sb  (+13 −9)

@@ -1,8 +1,8 @@
 #!/bin/bash -l
 #SBATCH -J tft
-#SBATCH -t 0:30:00
-#SBATCH -A stf019
-#SBATCH -N 9
+#SBATCH -t 2:00:00
+#SBATCH -A stf053
+#SBATCH -N 18
 #SBATCH -C nvme
 #SBATCH --exclusive
 ##SBATCH --ntasks-per-node=8

@@ -18,13 +18,17 @@
 export TRANSFORMERS_CACHE=$(pwd)/.cache

 CMD="python -u train.py --config ./topaz_config.yaml"

-HOME=/tmp time srun --nodes=${SLURM_NNODES} \
+REPLICAS=2
+NODES_PER_EXEC=$((SLURM_NNODES / REPLICAS))
+
+for i in {1..4}
+do
+HOME=/tmp time srun --nodes=${NODES_PER_EXEC} \
     --ntasks=72 \
     --ntasks-per-gpu=1 --gpus-per-node=8 --gpu-bind=closest \
     -c7 \
     bash -c "rocm-smi &> /dev/null; source setup_ddp_slurm.sh; $CMD"
+done

 python eval/reconstruct_results_peak.py
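The change splits the enlarged 18-node allocation into two 9-node replicas (NODES_PER_EXEC = SLURM_NNODES / REPLICAS), which keeps --ntasks=72 per srun (9 nodes × 8 GPUs, one task per GPU), and wraps the launch in a loop of four runs; HOME=/tmp presumably points anything that writes under $HOME at node-local storage rather than the shared filesystem. Each task sources setup_ddp_slurm.sh before starting train.py. That script is not part of this diff; below is a minimal sketch of what it plausibly does, assuming it maps Slurm's per-task environment onto the rendezvous variables torch.distributed expects. The port number and the exact variable choices are assumptions, not taken from the repository.

# Hypothetical sketch of setup_ddp_slurm.sh -- not shown in this diff.
# It is sourced inside each srun task (hence no shebang) and only
# exports the rendezvous variables torch.distributed typically reads.

# First host of this job step acts as the rendezvous master.
export MASTER_ADDR=$(scontrol show hostnames "${SLURM_STEP_NODELIST:-$SLURM_NODELIST}" | head -n 1)
export MASTER_PORT=29500            # assumed free port, not from the diff

# One task per GPU (--ntasks-per-gpu=1, --gpus-per-node=8), so with
# NODES_PER_EXEC=9 this yields WORLD_SIZE=72, matching --ntasks=72.
export WORLD_SIZE=${SLURM_NTASKS}
export RANK=${SLURM_PROCID}         # global rank of this task
export LOCAL_RANK=${SLURM_LOCALID}  # per-node rank, usable as GPU index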