Commit e08e4a5c authored by IamTao

update the code in the public repo (check the exp scripts).

parent b5fd08f1
@@ -3,11 +3,11 @@ $HOME/conda/envs/pytorch-py3.6/bin/python run.py \
--rnn_clip 0.4 --rnn_use_pretrained_emb False --rnn_tie_weights True --drop_rate 0.40 \
--optimizer sgd --avg_model True --experiment test \
--data wikitext2 --pin_memory True \
- --batch_size 32 --base_batch_size 24 --num_workers 2 \
+ --batch_size 32 --base_batch_size 24 --num_workers 2 --eval_freq 1 \
--num_epochs 300 --partition_data random --reshuffle_per_epoch False --stop_criteria epoch \
--n_mpi_process 32 --n_sub_process 1 --world 0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1 --on_cuda True --use_ipc False --comm_device cuda \
--lr 2.5 --lr_scaleup True --lr_scaleup_factor graph --lr_warmup True --lr_warmup_epochs 5 \
- --lr_scheduler MultiStepLR --lr_decay 0.1 --lr_milestones 150,225 \
+ --lr_schedule_scheme custom_multistep --lr_change_epochs 150,225 --lr_decay 10 \
--weight_decay 0 --use_nesterov False --momentum_factor 0 \
- --hostfile hostfile --graph_topology social --track_time True --display_tracked_time True \
- --python_path $HOME/conda/envs/pytorch-py3.6/bin/python --mpi_path $HOME/.openmpi/
+ --hostfile iccluster/hostfile2 --graph_topology social --track_time True --display_tracked_time True \
+ --python_path $HOME/conda/envs/pytorch-py3.6/bin/python --mpi_path $HOME/.openmpi/ --evaluate_avg True
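Note: the change above swaps the MultiStepLR flags for the new custom_multistep scheme: warm the rate up over --lr_warmup_epochs 5, scale the peak by a graph-dependent factor, then divide by --lr_decay 10 at epochs 150 and 225. A hedged sketch only (the graph scaleup factor is taken as a plain number here; the repo's scheduler code may differ):

# Sketch of the schedule implied by the flags above; not the repo's code.
def lr_at_epoch(epoch, base_lr=2.5, scaleup=1.0, warmup_epochs=5,
                change_epochs=(150, 225), decay=10.0):
    target_lr = base_lr * scaleup          # peak lr after graph scaleup
    if epoch < warmup_epochs:              # linear warmup to the peak
        return base_lr + (target_lr - base_lr) * epoch / warmup_epochs
    n_decays = sum(epoch >= e for e in change_epochs)
    return target_lr / decay ** n_decays   # /10 at epoch 150, /100 at 225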
@@ -3,85 +3,85 @@ $HOME/conda/envs/pytorch-py3.6/bin/python run.py \
--arch resnet20 --optimizer sgd \
--avg_model True --experiment test \
--data cifar10 --pin_memory True \
- --batch_size 128 --base_batch_size 64 --num_workers 0 \
+ --batch_size 128 --base_batch_size 64 --num_workers 0 --eval_freq 1 \
--num_epochs 300 --partition_data random --reshuffle_per_epoch True --stop_criteria epoch \
--n_mpi_process 8 --n_sub_process 1 --world 0,0,0,0,0,0,0,0 --on_cuda True --use_ipc False --comm_device cuda \
--lr 0.1 --lr_scaleup True --lr_scaleup_factor graph --lr_warmup True --lr_warmup_epochs 5 \
- --lr_scheduler MultiStepLR --lr_decay 0.1 --lr_milestones 150,225 \
+ --lr_schedule_scheme custom_multistep --lr_change_epochs 150,225 \
--weight_decay 1e-4 --use_nesterov True --momentum_factor 0.9 \
- --hostfile hostfile --graph_topology complete --track_time True --display_tracked_time True \
- --python_path $HOME/conda/envs/pytorch-py3.6/bin/python --mpi_path $HOME/.openmpi/
+ --hostfile iccluster/hostfile --graph_topology complete --track_time True --display_tracked_time True \
+ --python_path $HOME/conda/envs/pytorch-py3.6/bin/python --mpi_path $HOME/.openmpi/ --evaluate_avg True
# decentralized sgd with ring topology.
$HOME/conda/envs/pytorch-py3.6/bin/python run.py \
--arch resnet20 --optimizer sgd \
--avg_model True --experiment test \
--data cifar10 --pin_memory True \
- --batch_size 128 --base_batch_size 64 --num_workers 0 \
+ --batch_size 128 --base_batch_size 64 --num_workers 0 --eval_freq 1 \
--num_epochs 300 --partition_data random --reshuffle_per_epoch True --stop_criteria epoch \
--n_mpi_process 8 --n_sub_process 1 --world 0,0,0,0,0,0,0,0 --on_cuda True --use_ipc False --comm_device cuda \
--lr 0.1 --lr_scaleup True --lr_scaleup_factor graph --lr_warmup True --lr_warmup_epochs 5 \
- --lr_scheduler MultiStepLR --lr_decay 0.1 --lr_milestones 150,225 \
+ --lr_schedule_scheme custom_multistep --lr_change_epochs 150,225 \
--weight_decay 1e-4 --use_nesterov True --momentum_factor 0.9 \
- --hostfile hostfile --graph_topology ring --track_time True --display_tracked_time True \
- --python_path $HOME/conda/envs/pytorch-py3.6/bin/python --mpi_path $HOME/.openmpi/
+ --hostfile iccluster/hostfile --graph_topology ring --track_time True --display_tracked_time True \
+ --python_path $HOME/conda/envs/pytorch-py3.6/bin/python --mpi_path $HOME/.openmpi/ --evaluate_avg True
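For orientation: the ring run above (and the compressed runs that follow) average parameters only with graph neighbours instead of all-reducing them. A toy numpy sketch of one gossip round on a ring (the uniform 1/3 neighbour weights are an assumption; the repo builds its own mixing matrix):

import numpy as np

def ring_mixing_matrix(n):
    # doubly stochastic mixing matrix for a ring of n >= 3 workers:
    # each node weights itself and its two neighbours by 1/3
    W = np.zeros((n, n))
    for i in range(n):
        for j in (i - 1, i, i + 1):
            W[i, j % n] = 1.0 / 3.0
    return W

def gossip_step(params):
    # params: (n_nodes, dim) stack of per-node model copies; x <- W @ x
    return ring_mixing_matrix(len(params)) @ params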
# parallel_choco with sign + norm for ring topology
$HOME/conda/envs/pytorch-py3.6/bin/python run.py \
--arch resnet20 --optimizer parallel_choco \
--avg_model True --experiment test \
--data cifar10 --pin_memory True \
- --batch_size 128 --base_batch_size 64 --num_workers 0 \
+ --batch_size 128 --base_batch_size 64 --num_workers 0 --eval_freq 1 \
--num_epochs 300 --partition_data random --reshuffle_per_epoch True --stop_criteria epoch \
--n_mpi_process 8 --n_sub_process 1 --world 0,0,0,0,0,0,0,0 --on_cuda True --use_ipc False --comm_device cuda \
--lr 0.1 --lr_scaleup True --lr_scaleup_factor graph --lr_warmup True --lr_warmup_epochs 5 \
- --lr_scheduler MultiStepLR --lr_decay 0.1 --lr_milestones 150,225 \
+ --lr_schedule_scheme custom_multistep --lr_change_epochs 150,225 \
--weight_decay 1e-4 --use_nesterov True --momentum_factor 0.9 \
- --comm_op sign --consensus_stepsize 0.4 --compress_ratio 0.9 --quantize_level 16 --is_biased True \
- --hostfile hostfile --graph_topology ring --track_time True --display_tracked_time True \
- --python_path $HOME/conda/envs/pytorch-py3.6/bin/python --mpi_path $HOME/.openmpi/
+ --comm_op sign --choco_consenus_stepsize 0.4 --compress_ratio 0.9 --quantize_level 16 --is_biased True \
+ --hostfile iccluster/hostfile --graph_topology ring --track_time True --display_tracked_time True \
+ --python_path $HOME/conda/envs/pytorch-py3.6/bin/python --mpi_path $HOME/.openmpi/ --evaluate_avg True
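The parallel_choco runs gossip compressed differences instead of full models: each node keeps a public copy x_hat of the parameters, broadcasts a compressed correction, and then takes a consensus step with the renamed --choco_consenus_stepsize. A hedged numpy sketch of that update, reading "sign + norm" as a sign pattern rescaled by the mean magnitude (the repo's operator and bookkeeping may differ):

import numpy as np

def sign_compress(v):
    # scaled sign compression: keep the sign pattern, rescale by mean |v|
    return np.sign(v) * np.abs(v).mean()

def choco_step(x, x_hat, W, gamma=0.4):
    # x, x_hat: (n_nodes, dim). Nodes exchange q_i = Q(x_i - x_hat_i),
    # update the shared copies, then move toward the neighbour average
    # with consensus stepsize gamma (--choco_consenus_stepsize 0.4).
    q = np.stack([sign_compress(x[i] - x_hat[i]) for i in range(len(x))])
    x_hat = x_hat + q
    x = x + gamma * (W @ x_hat - x_hat)  # rows of W sum to 1
    return x, x_hat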
# parallel_choco with sign + norm for social topology
$HOME/conda/envs/pytorch-py3.6/bin/python run.py \
--arch resnet20 --optimizer parallel_choco \
--avg_model True --experiment test \
--data cifar10 --pin_memory True \
- --batch_size 128 --base_batch_size 64 --num_workers 0 \
+ --batch_size 128 --base_batch_size 64 --num_workers 0 --eval_freq 1 \
--num_epochs 300 --partition_data random --reshuffle_per_epoch True --stop_criteria epoch \
--n_mpi_process 32 --n_sub_process 1 --world 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 --on_cuda True --use_ipc False --comm_device cuda \
--lr 0.1 --lr_scaleup True --lr_scaleup_factor graph --lr_warmup True --lr_warmup_epochs 5 \
- --lr_scheduler MultiStepLR --lr_decay 0.1 --lr_milestones 150,225 \
+ --lr_schedule_scheme custom_multistep --lr_change_epochs 150,225 \
--weight_decay 1e-4 --use_nesterov True --momentum_factor 0.9 \
- --comm_op sign --consensus_stepsize 0.4 --compress_ratio 0.9 --quantize_level 16 --is_biased True \
- --hostfile hostfile --graph_topology social --track_time True --display_tracked_time True \
- --python_path $HOME/conda/envs/pytorch-py3.6/bin/python --mpi_path $HOME/.openmpi/
+ --comm_op sign --choco_consenus_stepsize 0.4 --compress_ratio 0.9 --quantize_level 16 --is_biased True \
+ --hostfile iccluster/hostfile --graph_topology social --track_time True --display_tracked_time True \
+ --python_path $HOME/conda/envs/pytorch-py3.6/bin/python --mpi_path $HOME/.openmpi/ --evaluate_avg True
# dcd_psgd with quantize_qsgd for ring topology
$HOME/conda/envs/pytorch-py3.6/bin/python run.py \
--arch resnet20 --optimizer dcd_psgd \
--avg_model True --experiment test \
--data cifar10 --pin_memory True \
- --batch_size 128 --base_batch_size 64 --num_workers 0 \
+ --batch_size 128 --base_batch_size 64 --num_workers 0 --eval_freq 1 \
--num_epochs 300 --partition_data random --reshuffle_per_epoch True --stop_criteria epoch \
--n_mpi_process 8 --n_sub_process 1 --world 0,0,0,0,0,0,0,0 --on_cuda True --use_ipc False --comm_device cuda \
--lr 0.1 --lr_scaleup True --lr_scaleup_factor graph --lr_warmup True --lr_warmup_epochs 5 \
- --lr_scheduler MultiStepLR --lr_decay 0.1 --lr_milestones 150,225 \
+ --lr_schedule_scheme custom_multistep --lr_change_epochs 150,225 \
--weight_decay 1e-4 --use_nesterov True --momentum_factor 0.9 \
--comm_op quantize_qsgd --compress_ratio 0.9 --quantize_level 16 --is_biased True \
- --hostfile hostfile --graph_topology ring --track_time True --display_tracked_time True \
- --python_path $HOME/conda/envs/pytorch-py3.6/bin/python --mpi_path $HOME/.openmpi/
+ --hostfile iccluster/hostfile --graph_topology ring --track_time True --display_tracked_time True \
+ --python_path $HOME/conda/envs/pytorch-py3.6/bin/python --mpi_path $HOME/.openmpi/ --evaluate_avg True
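dcd_psgd exchanges stochastically quantized updates. A hedged sketch of QSGD-style quantization for --comm_op quantize_qsgd with --quantize_level 16 (whether the level count is used directly or as a bit width is an assumption):

import numpy as np

def qsgd_quantize(v, levels=16, rng=np.random.default_rng(0)):
    # map |v_i| / ||v|| onto a uniform grid of `levels` steps and round
    # stochastically; the stochastic rounding keeps the quantizer unbiased
    norm = np.linalg.norm(v)
    if norm == 0.0:
        return v
    scaled = np.abs(v) / norm * levels
    lower = np.floor(scaled)
    rounded = lower + (rng.random(v.shape) < (scaled - lower))
    return np.sign(v) * norm * rounded / levels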
# ecd_psgd with compress_top_k for ring topology
$HOME/conda/envs/pytorch-py3.6/bin/python run.py \
--arch resnet20 --optimizer ecd_psgd \
--avg_model True --experiment test \
--data cifar10 --pin_memory True \
- --batch_size 128 --base_batch_size 64 --num_workers 0 \
+ --batch_size 128 --base_batch_size 64 --num_workers 0 --eval_freq 1 \
--num_epochs 300 --partition_data random --reshuffle_per_epoch True --stop_criteria epoch \
--n_mpi_process 8 --n_sub_process 1 --world 0,0,0,0,0,0,0,0 --on_cuda True --use_ipc False --comm_device cuda \
--lr 0.1 --lr_scaleup True --lr_scaleup_factor graph --lr_warmup True --lr_warmup_epochs 5 \
- --lr_scheduler MultiStepLR --lr_decay 0.1 --lr_milestones 150,225 \
+ --lr_schedule_scheme custom_multistep --lr_change_epochs 150,225 \
--weight_decay 1e-4 --use_nesterov True --momentum_factor 0.9 \
--comm_op compress_top_k --compress_ratio 0.9 --quantize_level 16 --is_biased True \
- --hostfile hostfile --graph_topology ring --track_time True --display_tracked_time True \
- --python_path $HOME/conda/envs/pytorch-py3.6/bin/python --mpi_path $HOME/.openmpi/
+ --hostfile iccluster/hostfile --graph_topology ring --track_time True --display_tracked_time True \
+ --python_path $HOME/conda/envs/pytorch-py3.6/bin/python --mpi_path $HOME/.openmpi/ --evaluate_avg True
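ecd_psgd sparsifies instead: only the largest-magnitude coordinates are transmitted, which is a biased compressor (hence --is_biased True). A sketch assuming --compress_ratio 0.9 means 90% of the entries are dropped; the repo may define the ratio as the kept fraction instead:

import numpy as np

def top_k_compress(v, compress_ratio=0.9):
    # keep the k largest-|v_i| entries, zero out the rest
    k = max(1, int(v.size * (1.0 - compress_ratio)))
    out = np.zeros_like(v)
    idx = np.argpartition(np.abs(v), -k)[-k:]
    out[idx] = v[idx]
    return out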
/home/lin/conda/envs/pytorch-py3.6/bin/python run.py \
--arch resnet50 --optimizer sgd \
--avg_model True --experiment plain_decentralized_timing \
--data imagenet --use_lmdb_data True --data_dir /mlodata1/ILSVRC/ --pin_memory True \
--batch_size 128 --base_batch_size 256 --num_workers 16 --eval_freq 1 \
--num_epochs 90 --partition_data random --reshuffle_per_epoch True --stop_criteria epoch \
--on_cuda True --n_mpi_process 8 --n_sub_process 4 --world 0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3 \
--lr 0.1 --lr_scaleup True --lr_scaleup_factor graph --lr_warmup True --lr_warmup_epochs 5 \
--lr_schedule_scheme custom_multistep --lr_change_epochs 30,60,80 \
--weight_decay 1e-4 --use_nesterov True --momentum_factor 0.9 \
--comm_op quantize_qsgd --consensus_stepsize 0.4 --compress_ratio 0.9 --quantize_level 32 --is_biased True \
--hostfile gcloud/hostfile --graph_topology ring --track_time True --track_detailed_time True --display_tracked_time True \
--python_path /home/lin/conda/envs/pytorch-py3.6/bin/python --mpi_path /home/lin/.openmpi/ --evaluate_avg True --summary_freq 100 \
--backend mpi --work_dir /mlodata1/choco_decentralized_code --remote_exec False --clean_python False --mpi_env LD_LIBRARY_PATH=/home/lin/.openmpi/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
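The ImageNet runs shard each MPI process across several local GPUs: with --n_mpi_process 8 and --n_sub_process 4, the 32-entry --world list plausibly hands out GPU ids in chunks of n_sub_process per rank. That reading is an assumption about the repo's convention, sketched as:

def gpus_for_rank(world, rank, n_sub_process=4):
    # world: comma-separated GPU ids, consumed n_sub_process at a time,
    # so rank r would train on ids[r*n : (r+1)*n] (assumed convention)
    ids = [int(x) for x in world.split(",")]
    return ids[rank * n_sub_process : (rank + 1) * n_sub_process]

# e.g. gpus_for_rank(",".join(["0,1,2,3"] * 8), rank=2) -> [0, 1, 2, 3]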
@@ -2,13 +2,13 @@
--arch resnet50 --optimizer parallel_choco \
--avg_model True --experiment test \
--data imagenet --use_lmdb_data True --data_dir /mlodata1/tlin/dataset/ILSVRC/ --pin_memory True \
- --batch_size 512 --base_batch_size 256 --num_workers 2 \
+ --batch_size 512 --base_batch_size 256 --num_workers 2 --eval_freq 1 \
--num_epochs 90 --partition_data random --reshuffle_per_epoch True --stop_criteria epoch \
--on_cuda True --n_mpi_process 8 --n_sub_process 4 --world 0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3 \
--lr 0.1 --lr_scaleup True --lr_scaleup_factor graph --lr_warmup True --lr_warmup_epochs 5 \
- --lr_scheduler MultiStepLR --lr_decay 0.1 --lr_milestones 30,60,80 \
+ --lr_schedule_scheme custom_multistep --lr_change_epochs 30,60,80 \
--weight_decay 1e-4 --use_nesterov True --momentum_factor 0.9 \
- --comm_op sign --consensus_stepsize 0.5 \
- --hostfile hostfile --graph_topology ring --track_time True --track_detailed_time True --display_tracked_time True \
- --python_path /home/lin/conda/envs/pytorch-py3.6/bin/python --mpi_path /home/lin/.openmpi/ --summary_freq 100 \
+ --comm_op sign --choco_consenus_stepsize 0.5 \
+ --hostfile gcloud/hostfile --graph_topology data_center --track_time True --track_detailed_time True --display_tracked_time True \
+ --python_path /home/lin/conda/envs/pytorch-py3.6/bin/python --mpi_path /home/lin/.openmpi/ --evaluate_avg True --summary_freq 100 \
--backend mpi --work_dir /mlodata1/tlin/decentralized_code --remote_exec False --clean_python False --mpi_env LD_LIBRARY_PATH=/home/lin/.openmpi/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64