Commit 3d4e6ac7 authored by Yin, Junqi

minor fixes

parent ae56ca02
#!/bin/bash
#BSUB -P stf011
#BSUB -W 1:00
#BSUB -W 12:00
#BSUB -nnodes 8
#BSUB -alloc_flags "nvme gpumps"
#BSUB -J chocosgd
#BSUB -o logs/chocosgd.o%J
#BSUB -e logs/chocosgd.e%J
#BSUB -q batch-hm
##BSUB -q killable
NNODES=$(cat ${LSB_DJOB_HOSTFILE} | sort | uniq | grep -v login | grep -v batch | wc -l)
......@@ -16,9 +17,9 @@ single_rank_per_node=false
EXPERIMENT=resnet50
# centralized: complete; decentralized: ring, torus, expander, margulis_expander, social
TOPOLOGY=complete
TOPOLOGY=ring
# ddp only supports complete topology
DDP=True
DDP=False
# shuffle graph
SHUFFLE_GRAPH=True
# hybrid mode
......@@ -45,8 +46,8 @@ sed -i "s/TODO_GPURANKS/$WORLD/" run.sh
sed -i "s/TODO_TOPOLOGY/$TOPOLOGY/" run.sh
sed -i "s/TODO_TIMESTAMP/$NOW/" run.sh
sed -i "s/TODO_DDP/$DDP/" run.sh
sed -i "s/TODO_DDP/$SHUFFLE_GRAPH/" run.sh
sed -i "s/TODO_DDP/$HYBRID/" run.sh
sed -i "s/TODO_SHUFFLE_GRAPH/$SHUFFLE_GRAPH/" run.sh
sed -i "s/TODO_HYBRID/$HYBRID/" run.sh
if [ "$single_rank_per_node" = true ]; then
jsrun -n${NNODES} -a1 -g6 -c42 -r1 --smpiargs "-gpu" --bind=rs --launch_distribution=packed ./run.sh
......
......@@ -79,4 +79,6 @@ python -u main.py \
--world TODO_GPURANKS \
--on_cuda True \
--comm_device cuda \
--ddp TODO_DDP
--ddp TODO_DDP \
--shuffle_graph_per_epoch TODO_SHUFFLE_GRAPH \
--hybrid TODO_HYBRID
......@@ -49,4 +49,7 @@ python -u main.py \
--n_sub_process TODO_NSUB \
--world TODO_GPURANKS \
--on_cuda True \
--comm_device cuda
--comm_device cuda \
--ddp TODO_DDP \
--shuffle_graph_per_epoch TODO_SHUFFLE_GRAPH \
--hybrid TODO_HYBRID
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment