Commit 92ecb4e6 authored by Yin, Junqi

Minor modification to the job script

parent ac2ce8b2
#!/bin/bash
# Batch job header: each "#BSUB" line below passes one submission option to the
# batch scheduler. Bash itself treats them as ordinary comments.
# NOTE(review): option meanings below are presumed from their names; confirm
# against the scheduler's bsub documentation.
#   -P  project/account to charge (stf011)
#   -W  run-time limit (1:00)
#   -J  job name (chocosgd)
#   -o  output file; %J is presumably replaced by the job ID
#BSUB -P stf011
#BSUB -W 1:00
# NOTE(review): two -nnodes values (1 and 4) appear back to back. This looks
# like the removed and added lines of a diff rendered without +/- markers;
# confirm which value is intended and keep only that one.
#BSUB -nnodes 1
#BSUB -nnodes 4
#BSUB -alloc_flags gpumps
#BSUB -J chocosgd
#BSUB -o logs/chocosgd.o%J
......@@ -11,5 +11,15 @@
# Count the distinct host names listed in the file named by LSB_DJOB_HOSTFILE,
# ignoring any entry that contains "login" or "batch".
NNODES=$(cat ${LSB_DJOB_HOSTFILE} | sort | uniq | grep -v login | grep -v batch | wc -l)
# Load the definitions from choco_env.sh into this shell (contents not visible here).
source choco_env.sh
# NOTE(review): this launch runs ./run.sh BEFORE the template substitution below
# regenerates it, and the same launch is repeated at the end with extra flags.
# It looks like the pre-change line of a diff shown without +/- markers;
# confirm whether it should still be present.
jsrun -n$((NNODES*6)) -a1 -g1 -c7 -r6 --smpiargs "-gpu" ./run.sh
# Value substituted for TODO_TOPOLOGY in run.sh. Options listed by the author:
# centralized: complete; decentralized: ring, torus, expander, margulis_expander, social
TOPOLOGY=ring
# Six per counted node; used as the jsrun -n value and substituted for TODO_NRANK.
NRANK=$((NNODES*6))
# Comma-separated string of NRANK zeros (e.g. "0,0,0,0,0,0" when NRANK is 6),
# substituted for TODO_GPURANKS.
WORLD=$(python -c "print(','.join(['0']*$NRANK))")
# Regenerate run.sh from its template, then replace the first occurrence of each
# TODO_* placeholder on every line, editing run.sh in place.
cp run.sh.tplt run.sh
sed -i "s/TODO_NRANK/$NRANK/" run.sh
sed -i "s/TODO_GPURANKS/$WORLD/" run.sh
sed -i "s/TODO_TOPOLOGY/$TOPOLOGY/" run.sh
# Launch the generated run.sh through jsrun with NRANK as the -n value.
# NOTE(review): the -a/-g/-c/-r, --bind and --launch_distribution settings are
# not explained anywhere in this file; confirm them against the jsrun docs.
jsrun -n${NRANK} -a1 -g1 -c7 -r6 --smpiargs "-gpu" --bind=proportional-packed:7 --launch_distribution=packed ./run.sh
......@@ -28,11 +28,11 @@ python main.py \
--lr_onecycle_extra_low 0.0015 \
--lr_onecycle_num_epoch 46 \
--lr_schedule_scheme custom_one_cycle \
--optimizer parallel_choco \
--optimizer sgd \
--adam_beta_1 0.9 \
--adam_beta_2 0.999 \
--adam_eps 1e-08 \
--graph_topology ring \
--graph_topology TODO_TOPOLOGY \
--comm_op sign \
--compress_ratio 0.9 \
--compress_warmup_values 0.75,0.9375,0.984375,0.996,0.999 \
......@@ -74,12 +74,12 @@ python main.py \
--save_all_models False \
--user lin \
--project distributed_adam_type_algorithm \
--experiment demo \
--experiment test \
--backend mpi \
--use_ipc False \
--num_workers 0 \
--n_mpi_process 6 \
--n_sub_process 1 \
--world 0,0,0,0,0,0 \
--n_mpi_process TODO_NRANK \
--n_sub_process 6 \
--world TODO_GPURANKS \
--on_cuda True \
--comm_device cuda
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment