Commit a936e13f authored by Yin, Junqi

add setup for lstm example

parent 9a5a5444
......@@ -11,17 +11,31 @@
# Launcher fragment: copies a per-experiment run template to run.sh, fills in its
# TODO_* placeholders with sed, and starts run.sh through jsrun.
# NOTE(review): this listing looks like a rendered diff in which removed and added
# lines are interleaved without +/- markers — confirm against the real file
# before executing it as written.

# Number of distinct hosts listed in the file named by LSB_DJOB_HOSTFILE,
# ignoring lines that contain "login" or "batch".
NNODES=$(cat ${LSB_DJOB_HOSTFILE} | sort | uniq | grep -v login | grep -v batch | wc -l)
source choco_env.sh
# true  -> one jsrun task per node (TODO_NSUB is set to 6);
# false -> six jsrun tasks per node (TODO_NSUB is set to 1).
single_rank_per_node=false
#resnet20, lstm, resnet50
EXPERIMENT=lstm
# centralized: complete; decentralized: ring, torus, expander, margulis_expander, social
TOPOLOGY=ring
# Start from the template that matches the selected experiment.
cp run-${EXPERIMENT}.sh.tplt run.sh
# Total rank count: six per node.
NRANK=$((NNODES*6))
# NOTE(review): identical to the assignment in the "true" branch below and
# overwritten in either branch — presumably the pre-change line of the diff; confirm.
WORLD=$(python -c "print(','.join(['0,1,2,3,4,5']*$NNODES))")
if [ "$single_rank_per_node" = true ]; then
# WORLD becomes "0,1,2,3,4,5" repeated NNODES times, comma-joined.
WORLD=$(python -c "print(','.join(['0,1,2,3,4,5']*$NNODES))")
sed -i "s/TODO_NRANK/$NNODES/" run.sh
sed -i "s/TODO_NSUB/6/" run.sh
else
# WORLD becomes "0" repeated NRANK times, comma-joined.
WORLD=$(python -c "print(','.join(['0']*$NRANK))")
sed -i "s/TODO_NRANK/$NRANK/" run.sh
sed -i "s/TODO_NSUB/1/" run.sh
fi
# Timestamp in YYYYmmddHHMMSS form, substituted for TODO_TIMESTAMP below.
NOW=$(date '+%Y%m%d%H%M%S')
# NOTE(review): as listed, this copy replaces run.sh and discards the TODO_NRANK /
# TODO_NSUB substitutions made above; the following sed then sets TODO_NRANK to
# NNODES regardless of single_rank_per_node. Presumably both are removed lines
# of the diff — confirm.
cp run.sh.tplt run.sh
sed -i "s/TODO_NRANK/$NNODES/" run.sh
# Fill in the remaining placeholders.
sed -i "s/TODO_GPURANKS/$WORLD/" run.sh
sed -i "s/TODO_TOPOLOGY/$TOPOLOGY/" run.sh
sed -i "s/TODO_TIMESTAMP/$NOW/" run.sh
# NOTE(review): unconditional launch followed by a conditional launch below; as
# listed, run.sh would be started twice. Presumably the pre-change line — confirm.
jsrun -n${NNODES} -a1 -g6 -c42 -r1 --smpiargs "-gpu" --bind=rs --launch_distribution=packed ./run.sh
if [ "$single_rank_per_node" = true ]; then
# NNODES resource sets: 1 task, 6 GPUs and 42 cores each, 1 resource set per host.
jsrun -n${NNODES} -a1 -g6 -c42 -r1 --smpiargs "-gpu" --bind=rs --launch_distribution=packed ./run.sh
else
# NRANK resource sets: 1 task, 1 GPU and 7 cores each, 6 resource sets per host.
jsrun -n${NRANK} -a1 -g1 -c7 -r6 --smpiargs "-gpu" --bind=proportional-packed:7 --launch_distribution=packed ./run.sh
fi
#!/bin/bash
# Run template for the LSTM language-model example (rnn_lm on wikitext2).
#
# This is a template, not a ready-to-run script: the TODO_TOPOLOGY,
# TODO_TIMESTAMP, TODO_NRANK, TODO_NSUB and TODO_GPURANKS tokens must be
# replaced with concrete values (e.g. via sed) before execution.
#
# The working directory is passed as --work_dir; it is quoted so that a path
# containing spaces or glob characters reaches main.py as a single argument.
python main.py \
--work_dir "$(pwd)" \
--remote_exec False \
--data wikitext2 \
--use_lmdb_data False \
--partition_data random \
--pin_memory True \
--arch rnn_lm \
--train_fast False \
--stop_criteria epoch \
--num_epochs 300 \
--num_iterations 32000 \
--avg_model True \
--reshuffle_per_epoch False \
--batch_size 32 \
--base_batch_size 24 \
--lr 2.5 \
--lr_scaleup True \
--lr_scaleup_type linear \
--lr_scaleup_factor graph \
--lr_warmup True \
--lr_warmup_epochs 5 \
--lr_decay 10 \
--lr_schedule_scheme custom_multistep \
--lr_change_epochs 150,225 \
--optimizer sgd \
--adam_beta_1 0.9 \
--adam_beta_2 0.999 \
--adam_eps 1e-08 \
--graph_topology TODO_TOPOLOGY \
--comm_op sign \
--compress_ratio 0.9 \
--compress_warmup_values 0.75,0.9375,0.984375,0.996,0.999 \
--compress_warmup_epochs 0 \
--quantize_level 16 \
--is_biased True \
--majority_vote False \
--consensus_stepsize 0.5 \
--evaluate_consensus False \
--mask_momentum False \
--clip_grad False \
--local_step 1 \
--turn_on_local_step_from 0 \
--momentum_factor 0 \
--use_nesterov False \
--weight_decay 0 \
--drop_rate 0.40 \
--densenet_growth_rate 12 \
--densenet_bc_mode False \
--densenet_compression 0.5 \
--wideresnet_widen_factor 4 \
--rnn_n_hidden 650 \
--rnn_n_layers 3 \
--rnn_bptt_len 30 \
--rnn_clip 0.4 \
--rnn_use_pretrained_emb False \
--rnn_tie_weights True \
--rnn_weight_norm False \
--manual_seed 6 \
--evaluate False \
--eval_freq 1 \
--summary_freq 100 \
--timestamp TODO_TIMESTAMP \
--track_time True \
--track_detailed_time False \
--display_tracked_time True \
--evaluate_avg False \
--checkpoint ./data/checkpoint \
--save_all_models False \
--user lin \
--project distributed_adam_type_algorithm \
--experiment test \
--backend mpi \
--use_ipc False \
--num_workers 0 \
--n_mpi_process TODO_NRANK \
--n_sub_process TODO_NSUB \
--world TODO_GPURANKS \
--on_cuda True \
--comm_device cuda
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment