Commit ac2ce8b2 authored by Yin, Junqi's avatar Yin, Junqi

add Summit env and port

parent 3cb976a5
# Environment setup for Summit (OLCF): toolchain + IBM Watson ML CE stack.
module load gcc/7.4.0
module load ibm-wml-ce/1.7.0-3
# Activate the project's conda environment on the World Work filesystem.
conda activate $WORLDWORK/stf011/junqi/choco_env
# NOTE(review): the next line looks like a separate MPI hostfile (32 slots on
# the local node) concatenated into this view by the diff — confirm in repo.
localhost slots=32
\ No newline at end of file
#!/bin/bash
# LSF batch script for the chocosgd job on Summit (OLCF).
#BSUB -P stf011
#BSUB -W 1:00
#BSUB -nnodes 1
#BSUB -alloc_flags gpumps
#BSUB -J chocosgd
#BSUB -o logs/chocosgd.o%J
#BSUB -e logs/chocosgd.e%J
##BSUB -q killable
# Count compute nodes: unique hosts in the LSF hostfile, excluding the
# login and batch (launch) nodes.
NNODES=$(cat ${LSB_DJOB_HOSTFILE} | sort | uniq | grep -v login | grep -v batch | wc -l)
# Load modules and activate the conda environment.
source choco_env.sh
# 6 resource sets per node: 1 task, 1 GPU, 7 cores each; "-gpu" enables
# CUDA-aware Spectrum MPI.
jsrun -n$((NNODES*6)) -a1 -g1 -c7 -r6 --smpiargs "-gpu" ./run.sh
......@@ -23,6 +23,25 @@ import pcode.utils.logging as logging
from pcode.utils.timer import Timer
def setup_ddp(backend):
    """Initialize torch.distributed for an LSF/jsrun (Summit) job.

    Derives the rendezvous address from the LSF hostfile and the process
    rank/size from the OpenMPI environment variables, exports the
    MASTER_ADDR / MASTER_PORT / WORLD_SIZE / RANK variables expected by
    torch.distributed, then calls ``dist.init_process_group``.

    Args:
        backend: torch.distributed backend name (e.g. "nccl", "gloo", "mpi").

    Exits the process with status 1 when the required LSF/OpenMPI
    environment variables are absent (i.e. not running inside a job).
    """
    import subprocess

    try:
        # The master is the first non-batch, non-login host in the hostfile.
        get_master = "echo $(cat {} | sort | uniq | grep -v batch | grep -v login | head -1)".format(
            os.environ['LSB_DJOB_HOSTFILE']
        )
        # FIX: decode the subprocess output properly instead of slicing the
        # repr of a bytes object (str(...)[2:-3]); result is identical for
        # ordinary hostnames but robust to escaping.
        master_addr = subprocess.check_output(get_master, shell=True).decode().strip()
        master_port = "29500"
        world_size = os.environ['OMPI_COMM_WORLD_SIZE']
        world_rank = os.environ['OMPI_COMM_WORLD_RANK']
    except KeyError:
        print("DDP has to be initialized within a job")
        sys.exit(1)

    os.environ['MASTER_ADDR'] = master_addr
    os.environ['MASTER_PORT'] = master_port
    os.environ['WORLD_SIZE'] = world_size
    os.environ['RANK'] = world_rank
    # FIX: original referenced the undefined name `conf` (NameError at
    # runtime); use this function's `backend` parameter instead.
    dist.init_process_group(backend=backend, rank=int(world_rank), world_size=int(world_size))
def init_distributed_world(conf, backend):
    # Generic initializer for torch.distributed; only the "mpi" branch is
    # visible in this diff hunk — the function continues past this fragment.
    if backend == "mpi":
        dist.init_process_group("mpi")
......@@ -46,7 +65,8 @@ def init_distributed_world(conf, backend):
def main(conf):
    try:
        # NOTE(review): this scraped diff shows both the removed and the
        # added lines without +/- markers; the next call is the pre-change
        # line that this commit replaces with setup_ddp() — confirm against
        # the actual repository state.
        init_distributed_world(conf, backend=conf.backend)
        # Summit port: use the LSF/jsrun-aware initializer.
        setup_ddp(backend=conf.backend)
        #init_distributed_world(conf, backend=conf.backend)
        # Distributed mode only when more than one MPI process is running.
        conf.distributed = True and conf.n_mpi_process > 1
    except AttributeError as e:
        print(f"failed to init the distributed world: {e}.")
......@@ -190,7 +210,6 @@ def init_config(conf):
if __name__ == "__main__":
    conf = get_args()
    if conf.optimizer == "parallel_choco":
        # NOTE(review): forkserver presumably chosen over spawn/fork for
        # safety with CUDA/MPI state in worker processes — confirm.
        mp.set_start_method("forkserver", force=True)
        # mp.set_start_method("spawn", force=True)
......
#!/bin/bash
# Launcher invoked once per MPI rank (by jsrun): trains resnet20 on cifar10
# with the parallel_choco optimizer (sign compression, ring topology) over
# the MPI backend, using 6 MPI processes with CUDA communication.
# All hyper-parameters are passed explicitly on the command line; no
# comments can be interleaved below because of the line continuations.
python main.py \
    --work_dir $(pwd) \
    --remote_exec False \
    --data cifar10 \
    --data_dir ./data/ \
    --use_lmdb_data False \
    --partition_data random \
    --pin_memory True \
    --arch resnet20 \
    --train_fast False \
    --stop_criteria epoch \
    --num_epochs 300 \
    --num_iterations 32000 \
    --avg_model True \
    --reshuffle_per_epoch True \
    --batch_size 128 \
    --base_batch_size 64 \
    --lr 0.1 \
    --lr_scaleup True \
    --lr_scaleup_type linear \
    --lr_scaleup_factor graph \
    --lr_warmup True \
    --lr_warmup_epochs 5 \
    --lr_decay 0.1 \
    --lr_onecycle_low 0.15 \
    --lr_onecycle_high 3 \
    --lr_onecycle_extra_low 0.0015 \
    --lr_onecycle_num_epoch 46 \
    --lr_schedule_scheme custom_one_cycle \
    --optimizer parallel_choco \
    --adam_beta_1 0.9 \
    --adam_beta_2 0.999 \
    --adam_eps 1e-08 \
    --graph_topology ring \
    --comm_op sign \
    --compress_ratio 0.9 \
    --compress_warmup_values 0.75,0.9375,0.984375,0.996,0.999 \
    --compress_warmup_epochs 0 \
    --quantize_level 16 \
    --is_biased True \
    --majority_vote False \
    --consensus_stepsize 0.5 \
    --evaluate_consensus False \
    --mask_momentum False \
    --clip_grad False \
    --local_step 1 \
    --turn_on_local_step_from 0 \
    --momentum_factor 0.9 \
    --use_nesterov True \
    --weight_decay 0.0001 \
    --drop_rate 0.0 \
    --densenet_growth_rate 12 \
    --densenet_bc_mode False \
    --densenet_compression 0.5 \
    --wideresnet_widen_factor 4 \
    --rnn_n_hidden 200 \
    --rnn_n_layers 2 \
    --rnn_bptt_len 35 \
    --rnn_clip 0.25 \
    --rnn_use_pretrained_emb True \
    --rnn_tie_weights True \
    --rnn_weight_norm False \
    --manual_seed 6 \
    --evaluate False \
    --eval_freq 1 \
    --summary_freq 100 \
    --timestamp 1599681078_l2-0.0001_lr-0.1_epochs-300_batchsize-128_basebatchsize-64_num_mpi_process_6_n_sub_process-1_topology-ring_optim-parallel_choco-stepsize-0.5_comm_info-sign_ \
    --track_time True \
    --track_detailed_time False \
    --display_tracked_time True \
    --evaluate_avg False \
    --checkpoint ./data/checkpoint \
    --save_all_models False \
    --user lin \
    --project distributed_adam_type_algorithm \
    --experiment demo \
    --backend mpi \
    --use_ipc False \
    --num_workers 0 \
    --n_mpi_process 6 \
    --n_sub_process 1 \
    --world 0,0,0,0,0,0 \
    --on_cuda True \
    --comm_device cuda
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment