Commit a325efbc authored by Yin, Junqi

add resume option

parent ea797552
......@@ -12,6 +12,9 @@
# Count the distinct lines of the host file named by LSB_DJOB_HOSTFILE,
# leaving out any line that contains "login" or "batch".
# The path is quoted so it stays one argument even with whitespace or glob
# characters, and an unset/empty value now yields an error from sort instead
# of silently reading stdin. wc -l stays the final stage so the pipeline's
# exit status is the same as before.
NNODES=$(sort -u -- "${LSB_DJOB_HOSTFILE}" | grep -v -e login -e batch | wc -l)
# Load the settings defined in choco_env.sh into this shell.
source choco_env.sh
# Checkpoint directory that the --resume option should point at.
CHECKPOINT_DIR="data/checkpoint/imagenet/resnet50/test/16-nodes_ring_DDP-False_SHUFFLE_GRAPH-False_FREQ-10_HYBRID-False_FREQ-5_620280"
# Backslash-escape every '/' and '&' in the path so it can be dropped into
# the replacement half of a sed "s/.../.../" command without ending the
# expression early or being read as "the matched text".
RESUME_DIR="$(
  printf '%s\n' "${CHECKPOINT_DIR}" |
    sed -e 's/[\/&]/\\&/g'
)"
# "true" selects the single-rank-per-node jsrun launch further down in this
# script; any other value skips that branch.
single_rank_per_node="false"
# Experiment to run. Choices noted by the author:
#   resnet20, lstm, resnet50, densenet100
EXPERIMENT="resnet50"
......@@ -54,6 +57,7 @@ sed -i "s/TODO_HYBRID/$HYBRID/" run.sh
# Rewrite run.sh in place, swapping each TODO_* placeholder for the matching
# setting. Each command replaces the first occurrence on every line.
sed -i -e "s/TODO_FREQ_HYBRID/${FREQ_HYBRID}/" run.sh
sed -i -e "s/TODO_FREQ_SHUFFLE/${FREQ_SHUFFLE}/" run.sh
sed -i -e "s/TODO_PRINT_GRAD/${PRINT_GRAD}/" run.sh
# RESUME_DIR is the sed-escaped form of CHECKPOINT_DIR, so the slashes in the
# path do not terminate the s/// expression.
sed -i -e "s/TODO_RESUME_DIR/${RESUME_DIR}/" run.sh
if [ "$single_rank_per_node" = true ]; then
jsrun -n${NNODES} -a1 -g6 -c42 -r1 --smpiargs "-gpu" --bind=rs --launch_distribution=packed ./run.sh
......
......@@ -54,4 +54,5 @@ python -u main.py \
--shuffle_graph_freq TODO_FREQ_SHUFFLE \
--hybrid TODO_HYBRID \
--hybrid_freq TODO_FREQ_HYBRID \
--print_grad TODO_PRINT_GRAD
--print_grad TODO_PRINT_GRAD \
--resume TODO_RESUME_DIR
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment