Commit 4e2829f2 authored by Yin, Junqi

add resnet50 example and neighbors info

parent a936e13f
......@@ -13,7 +13,7 @@ source choco_env.sh
single_rank_per_node=false
#resnet20, lstm, resnet50
EXPERIMENT=lstm
EXPERIMENT=resnet20
# centralized: complete; decentralized: ring, torus, expander, margulis_expander, social
TOPOLOGY=ring
......
......@@ -21,7 +21,17 @@ import pcode.utils.op_paths as op_paths
import pcode.utils.stat_tracker as stat_tracker
import pcode.utils.logging as logging
from pcode.utils.timer import Timer
import platform
def print_neighbors(conf):
    """Print this rank's host name and the ranks of its graph neighbors.

    Reads the neighborhood from ``conf.graph.get_neighborhood()``, drops the
    entry equal to ``conf.graph.rank`` (the rank itself), and writes a single
    ``NEIGHBOR_INFO`` line to stdout.

    Args:
        conf: configuration object exposing ``conf.graph.rank`` and
            ``conf.graph.get_neighborhood()``.
            NOTE(review): the neighborhood is assumed to be a mapping keyed
            by rank -- confirm against the graph implementation.
    """
    neighbors_info = conf.graph.get_neighborhood()
    # The neighborhood may contain the local rank; report only the others.
    neighbor_ranks = [
        neighbor_rank
        for neighbor_rank in neighbors_info
        if neighbor_rank != conf.graph.rank
    ]
    print(
        "NEIGHBOR_INFO: rank %d is on node %s with neighbors: %s"
        % (conf.graph.rank, platform.node(), tuple(neighbor_ranks))
    )
def setup_ddp(backend):
""""Initialize Apex DDP"""
......@@ -206,7 +216,9 @@ def init_config(conf):
# display the arguments' info.
logging.display_args(conf)
# display neighbors
print_neighbors(conf)
if __name__ == "__main__":
conf = get_args()
......
#!/bin/bash
python main.py \
python -u main.py \
--work_dir $(pwd) \
--remote_exec False \
--data wikitext2 \
......
#!/bin/bash
python main.py \
python -u main.py \
--work_dir $(pwd) \
--remote_exec False \
--data cifar10 \
......@@ -79,7 +79,7 @@ python main.py \
--use_ipc False \
--num_workers 0 \
--n_mpi_process TODO_NRANK \
--n_sub_process 6 \
--n_sub_process TODO_NSUB \
--world TODO_GPURANKS \
--on_cuda True \
--comm_device cuda
#!/bin/bash
# Launch main.py for a ResNet-50 / ImageNet run over the MPI backend.
#
# * `python -u` keeps stdout/stderr unbuffered so per-rank output is not
#   held back in buffers.
# * The TODO_* tokens (TODO_TOPOLOGY, TODO_TIMESTAMP, TODO_NRANK, TODO_NSUB,
#   TODO_GPURANKS) are placeholders; presumably a launcher script substitutes
#   them before this file is executed -- confirm against the caller.
# * --data_dir is a hard-coded absolute path; adjust it for your site.
# * "$(pwd)" is quoted so a working directory containing spaces or glob
#   characters is passed to --work_dir as a single argument.
python -u main.py \
    --work_dir "$(pwd)" \
    --remote_exec False \
    --data imagenet \
    --data_dir /gpfs/alpine/world-shared/stf011/junqi/choco_env/dl_code/data/ILSVRC \
    --use_lmdb_data False \
    --partition_data random \
    --pin_memory True \
    --arch resnet50 \
    --train_fast False \
    --stop_criteria epoch \
    --num_epochs 90 \
    --num_iterations 32000 \
    --avg_model True \
    --reshuffle_per_epoch True \
    --batch_size 128 \
    --base_batch_size 256 \
    --lr 0.1 \
    --lr_scaleup True \
    --lr_scaleup_type linear \
    --lr_scaleup_factor graph \
    --lr_warmup True \
    --lr_warmup_epochs 5 \
    --lr_decay 0.1 \
    --lr_onecycle_low 0.15 \
    --lr_onecycle_high 3 \
    --lr_onecycle_extra_low 0.0015 \
    --lr_onecycle_num_epoch 46 \
    --lr_schedule_scheme custom_one_cycle \
    --optimizer sgd \
    --adam_beta_1 0.9 \
    --adam_beta_2 0.999 \
    --adam_eps 1e-08 \
    --graph_topology TODO_TOPOLOGY \
    --comm_op quantize_qsgd \
    --compress_ratio 0.9 \
    --compress_warmup_values 0.75,0.9375,0.984375,0.996,0.999 \
    --compress_warmup_epochs 0 \
    --quantize_level 32 \
    --is_biased True \
    --majority_vote False \
    --consensus_stepsize 0.4 \
    --evaluate_consensus False \
    --mask_momentum False \
    --clip_grad False \
    --local_step 1 \
    --turn_on_local_step_from 0 \
    --momentum_factor 0.9 \
    --use_nesterov True \
    --weight_decay 0.0001 \
    --drop_rate 0.0 \
    --densenet_growth_rate 12 \
    --densenet_bc_mode False \
    --densenet_compression 0.5 \
    --wideresnet_widen_factor 4 \
    --rnn_n_hidden 200 \
    --rnn_n_layers 2 \
    --rnn_bptt_len 35 \
    --rnn_clip 0.25 \
    --rnn_use_pretrained_emb True \
    --rnn_tie_weights True \
    --rnn_weight_norm False \
    --manual_seed 6 \
    --evaluate False \
    --eval_freq 1 \
    --summary_freq 100 \
    --timestamp TODO_TIMESTAMP \
    --track_time True \
    --track_detailed_time True \
    --display_tracked_time True \
    --evaluate_avg False \
    --checkpoint ./data/checkpoint \
    --save_all_models False \
    --user lin \
    --project distributed_adam_type_algorithm \
    --experiment test \
    --backend mpi \
    --use_ipc False \
    --num_workers 0 \
    --n_mpi_process TODO_NRANK \
    --n_sub_process TODO_NSUB \
    --world TODO_GPURANKS \
    --on_cuda True \
    --comm_device cuda
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment