Commit ced91132 authored by Ferreira Da Silva, Rafael
Browse files

migrating to ROCm 6.0.0

parent 79987307
Loading
Loading
Loading
Loading
+13 −9
Original line number Diff line number Diff line
#!/bin/bash -l
# Slurm batch header for the job named "tft".
# NOTE(review): -t, -A and -N each appear twice below with different values
# (0:30:00 / 2:00:00, stf019 / stf053, 9 / 18). This text comes from a diff
# view, so presumably the first of each pair is the removed pre-change value
# and the second is its replacement -- confirm, and keep only one of each.
#SBATCH -J tft
#SBATCH -t 0:30:00
#SBATCH -A stf019
#SBATCH -N 9
#SBATCH -t 2:00:00
#SBATCH -A stf053
#SBATCH -N 18
#SBATCH -C nvme
#SBATCH --exclusive
# NOTE(review): the next line starts with "##" rather than "#SBATCH", so it is
# presumably a deliberately disabled directive -- confirm.
##SBATCH --ntasks-per-node=8
@@ -18,13 +18,17 @@ export TRANSFORMERS_CACHE=$(pwd)/.cache

# Training command. It is expanded by this shell when the double-quoted
# `bash -c` string further down is built, so every task runs this exact text.
CMD="python -u train.py --config ./topaz_config.yaml"

# NOTE(review): the line below ends in a backslash continuation but is
# followed by the REPLICAS assignment rather than srun options. It looks like
# the pre-change launch line left over from the diff view -- confirm and
# remove it before executing this text as a script.
# REPLICAS / NODES_PER_EXEC: each srun is given SLURM_NNODES / REPLICAS nodes
# (shell integer division, so any remainder is dropped).
HOME=/tmp time srun --nodes=${SLURM_NNODES} \
REPLICAS=2
NODES_PER_EXEC=$((SLURM_NNODES / REPLICAS))

# Launch the training command four times. The loop variable `i` is not
# referenced in the body, and srun is not backgrounded, so the four launches
# run one after another, each on NODES_PER_EXEC nodes.
for i in {1..4}
do
    # HOME is overridden to /tmp for this command's environment only.
    # --ntasks=72 is a fixed value; it equals NODES_PER_EXEC * 8 GPUs per node
    # only when SLURM_NNODES is 18 and REPLICAS is 2, and does not follow a
    # change in node count.
    # rocm-smi is run with its output discarded before the DDP setup script is
    # sourced -- presumably to initialise the GPUs; TODO confirm.
    # setup_ddp_slurm.sh is not visible here; its effects are not documented.
    HOME=/tmp time srun --nodes=${NODES_PER_EXEC} \
                   --ntasks=72 \
                   --ntasks-per-gpu=1 --gpus-per-node=8 --gpu-bind=closest\
                   -c7  \
                   bash -c "rocm-smi &> /dev/null; source setup_ddp_slurm.sh; $CMD"

done

# Runs once after the loop, directly in the batch shell (not through srun).
python eval/reconstruct_results_peak.py