Commit 61bededb authored by Yin, Junqi

update imagenet example

parent 993da23e
#!/bin/bash
#BSUB -P stf011
#BSUB -W 0:30
#BSUB -nnodes 3
#BSUB -alloc_flags gpumps
#BSUB -W 1:00
#BSUB -nnodes 8
#BSUB -alloc_flags "nvme gpumps"
#BSUB -J chocosgd
#BSUB -o logs/chocosgd.o%J
#BSUB -e logs/chocosgd.e%J
......@@ -12,12 +12,17 @@ NNODES=$(cat ${LSB_DJOB_HOSTFILE} | sort | uniq | grep -v login | grep -v batch
source choco_env.sh
single_rank_per_node=false
#resnet20, lstm, resnet50
EXPERIMENT=resnet20
#resnet20, lstm, resnet50, vgg19
EXPERIMENT=resnet50
# centralized: complete; decentralized: ring, torus, expander, margulis_expander, social
TOPOLOGY=ring
# Stage data to node-local NVMe before training.
# Only the resnet50 experiment is staged here: the ImageNet tarball is
# extracted into /mnt/bb/$USER via jsrun with ${NNODES} resource sets.
# NOTE(review): -a1 -r1 presumably means one task per node, so the archive is
# unpacked once on every allocated node; confirm against the jsrun docs.
# All expansions are quoted so that paths containing whitespace or glob
# characters are passed to jsrun/tar as single arguments.
if [[ "$EXPERIMENT" == "resnet50" ]]; then
  jsrun -n"${NNODES}" -a1 -c42 -r1 tar -xf "${WORLDWORK}/stf011/junqi/choco_env/dl_code/data/ILSVRC/imagenet-data.tar" -C "/mnt/bb/${USER}"
fi
cp run-${EXPERIMENT}.sh.tplt run.sh
NRANK=$((NNODES*6))
if [ "$single_rank_per_node" = true ]; then
......
......@@ -113,7 +113,7 @@ def _get_imagenet(conf, name, datasets_path, split):
)
else:
root = os.path.join(
root, "val{}".format("" if not conf.use_lmdb_data else ".lmdb")
root, "validation{}".format("" if not conf.use_lmdb_data else ".lmdb")
)
return define_imagenet_folder(
name=name, root=root, flag=conf.use_lmdb_data, cuda=conf.graph.on_cuda
......
......@@ -62,7 +62,7 @@ def train_and_validate(
display_training_stat(conf, scheduler, tracker_tr, n_bits_to_transmit)
# finish one epoch training and to decide if we want to val our model.
if scheduler.epoch_ % 1 == 0:
if scheduler.epoch_ % conf.eval_freq == 0:
if tracker_tr.stat["loss"].avg > 1e3 or np.isnan(
tracker_tr.stat["loss"].avg
):
......@@ -80,7 +80,7 @@ def train_and_validate(
# evaluate (and only inference) on the whole training loader.
if (
conf.evaluate_consensus or scheduler.is_stop()
) and not conf.train_fast:
) and not conf.train_fast and conf.data != "imagenet":
# prepare the dataloader for the consensus evaluation.
_data_loader = {
"val_loader": _define_cv_dataset(
......
#!/bin/bash
python -u main.py \
python -u main.py \
--work_dir $(pwd) \
--remote_exec False \
--data imagenet \
--data_dir /gpfs/alpine/world-shared/stf011/junqi/choco_env/dl_code/data/ILSVRC \
--data_dir /mnt/bb/$USER/data \
--use_lmdb_data False \
--partition_data random \
--pin_memory True \
......@@ -11,7 +11,6 @@ python -u main.py \
--train_fast False \
--stop_criteria epoch \
--num_epochs 90 \
--num_iterations 32000 \
--avg_model True \
--reshuffle_per_epoch True \
--batch_size 128 \
......@@ -22,58 +21,26 @@ python -u main.py \
--lr_scaleup_factor graph \
--lr_warmup True \
--lr_warmup_epochs 5 \
--lr_decay 0.1 \
--lr_onecycle_low 0.15 \
--lr_onecycle_high 3 \
--lr_onecycle_extra_low 0.0015 \
--lr_onecycle_num_epoch 46 \
--lr_schedule_scheme custom_one_cycle \
--lr_schedule_scheme custom_multistep \
--lr_change_epochs 30,60,80 \
--optimizer sgd \
--adam_beta_1 0.9 \
--adam_beta_2 0.999 \
--adam_eps 1e-08 \
--graph_topology TODO_TOPOLOGY \
--comm_op quantize_qsgd \
--compress_ratio 0.9 \
--compress_warmup_values 0.75,0.9375,0.984375,0.996,0.999 \
--compress_warmup_epochs 0 \
--quantize_level 32 \
--is_biased True \
--majority_vote False \
--consensus_stepsize 0.4 \
--evaluate_consensus False \
--mask_momentum False \
--clip_grad False \
--local_step 1 \
--turn_on_local_step_from 0 \
--momentum_factor 0.9 \
--use_nesterov True \
--weight_decay 0.0001 \
--drop_rate 0.0 \
--densenet_growth_rate 12 \
--densenet_bc_mode False \
--densenet_compression 0.5 \
--wideresnet_widen_factor 4 \
--rnn_n_hidden 200 \
--rnn_n_layers 2 \
--rnn_bptt_len 35 \
--rnn_clip 0.25 \
--rnn_use_pretrained_emb True \
--rnn_tie_weights True \
--rnn_weight_norm False \
--manual_seed 6 \
--evaluate False \
--eval_freq 1 \
--eval_freq 9 \
--summary_freq 100 \
--timestamp TODO_TIMESTAMP \
--track_time True \
--track_detailed_time True \
--track_detailed_time False \
--display_tracked_time True \
--evaluate_avg False \
--checkpoint ./data/checkpoint \
--save_all_models False \
--user lin \
--project distributed_adam_type_algorithm \
--experiment test \
--backend mpi \
--use_ipc False \
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment