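# example.sh
# Example invocations of run.py; each command below is preceded by a comment
# naming the optimizer and graph topology it passes.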
# centralized sgd with complete topology.
# Launches run.py with --optimizer sgd and --graph_topology complete, using
# 8 MPI processes (--n_mpi_process 8, one --world entry per process).
# The whole invocation is a single command: every line but the last must end
# with a backslash and nothing may be placed between the continued lines,
# otherwise the shell ends the command early and runs the rest separately.
"$HOME/conda/envs/pytorch-py3.6/bin/python" run.py \
    --arch resnet20 --optimizer sgd \
    --avg_model True --experiment test \
    --data cifar10 --pin_memory True \
    --batch_size 128 --base_batch_size 64 --num_workers 0 --eval_freq 1 \
    --num_epochs 300 --partition_data random --reshuffle_per_epoch True --stop_criteria epoch \
    --n_mpi_process 8 --n_sub_process 1 --world 0,0,0,0,0,0,0,0 --on_cuda True --use_ipc False --comm_device cuda \
    --lr 0.1 --lr_scaleup True --lr_scaleup_factor graph --lr_warmup True --lr_warmup_epochs 5 \
    --lr_schedule_scheme custom_multistep --lr_change_epochs 150,225 \
    --weight_decay 1e-4 --use_nesterov True --momentum_factor 0.9 \
    --hostfile iccluster/hostfile --graph_topology complete --track_time True --display_tracked_time True \
    --python_path "$HOME/conda/envs/pytorch-py3.6/bin/python" --mpi_path "$HOME/.openmpi/" --evaluate_avg True

# decentralized sgd with ring topology.
# Launches run.py with --optimizer sgd and --graph_topology ring, using
# 8 MPI processes (--n_mpi_process 8, one --world entry per process).
# The whole invocation is a single command: every line but the last must end
# with a backslash and nothing may be placed between the continued lines,
# otherwise the shell ends the command early and runs the rest separately.
"$HOME/conda/envs/pytorch-py3.6/bin/python" run.py \
    --arch resnet20 --optimizer sgd \
    --avg_model True --experiment test \
    --data cifar10 --pin_memory True \
    --batch_size 128 --base_batch_size 64 --num_workers 0 --eval_freq 1 \
    --num_epochs 300 --partition_data random --reshuffle_per_epoch True --stop_criteria epoch \
    --n_mpi_process 8 --n_sub_process 1 --world 0,0,0,0,0,0,0,0 --on_cuda True --use_ipc False --comm_device cuda \
    --lr 0.1 --lr_scaleup True --lr_scaleup_factor graph --lr_warmup True --lr_warmup_epochs 5 \
    --lr_schedule_scheme custom_multistep --lr_change_epochs 150,225 \
    --weight_decay 1e-4 --use_nesterov True --momentum_factor 0.9 \
    --hostfile iccluster/hostfile --graph_topology ring --track_time True --display_tracked_time True \
    --python_path "$HOME/conda/envs/pytorch-py3.6/bin/python" --mpi_path "$HOME/.openmpi/" --evaluate_avg True

# parallel_choco with sign + norm for ring topology
# Launches run.py with --optimizer parallel_choco, --comm_op sign and
# --graph_topology ring, using 8 MPI processes (--n_mpi_process 8).
# NOTE(review): the flag is spelled --choco_consenus_stepsize here; it is kept
# verbatim because run.py's accepted spelling is not visible from this file.
# The whole invocation is a single command: every line but the last must end
# with a backslash and nothing may be placed between the continued lines,
# otherwise the shell ends the command early and runs the rest separately.
"$HOME/conda/envs/pytorch-py3.6/bin/python" run.py \
    --arch resnet20 --optimizer parallel_choco \
    --avg_model True --experiment test \
    --data cifar10 --pin_memory True \
    --batch_size 128 --base_batch_size 64 --num_workers 0 --eval_freq 1 \
    --num_epochs 300 --partition_data random --reshuffle_per_epoch True --stop_criteria epoch \
    --n_mpi_process 8 --n_sub_process 1 --world 0,0,0,0,0,0,0,0 --on_cuda True --use_ipc False --comm_device cuda \
    --lr 0.1 --lr_scaleup True --lr_scaleup_factor graph --lr_warmup True --lr_warmup_epochs 5 \
    --lr_schedule_scheme custom_multistep --lr_change_epochs 150,225 \
    --weight_decay 1e-4 --use_nesterov True --momentum_factor 0.9 \
    --comm_op sign --choco_consenus_stepsize 0.4 --compress_ratio 0.9 --quantize_level 16 --is_biased True \
    --hostfile iccluster/hostfile --graph_topology ring --track_time True --display_tracked_time True \
    --python_path "$HOME/conda/envs/pytorch-py3.6/bin/python" --mpi_path "$HOME/.openmpi/" --evaluate_avg True

# parallel_choco with sign + norm for social topology
# Launches run.py with --optimizer parallel_choco, --comm_op sign and
# --graph_topology social, using --n_mpi_process 32 (the --world list is
# correspondingly longer than in the 8-process examples).
# NOTE(review): the flag is spelled --choco_consenus_stepsize here; it is kept
# verbatim because run.py's accepted spelling is not visible from this file.
# The whole invocation is a single command: every line but the last must end
# with a backslash and nothing may be placed between the continued lines,
# otherwise the shell ends the command early and runs the rest separately.
"$HOME/conda/envs/pytorch-py3.6/bin/python" run.py \
    --arch resnet20 --optimizer parallel_choco \
    --avg_model True --experiment test \
    --data cifar10 --pin_memory True \
    --batch_size 128 --base_batch_size 64 --num_workers 0 --eval_freq 1 \
    --num_epochs 300 --partition_data random --reshuffle_per_epoch True --stop_criteria epoch \
    --n_mpi_process 32 --n_sub_process 1 --world 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 --on_cuda True --use_ipc False --comm_device cuda \
    --lr 0.1 --lr_scaleup True --lr_scaleup_factor graph --lr_warmup True --lr_warmup_epochs 5 \
    --lr_schedule_scheme custom_multistep --lr_change_epochs 150,225 \
    --weight_decay 1e-4 --use_nesterov True --momentum_factor 0.9 \
    --comm_op sign --choco_consenus_stepsize 0.4 --compress_ratio 0.9 --quantize_level 16 --is_biased True \
    --hostfile iccluster/hostfile --graph_topology social --track_time True --display_tracked_time True \
    --python_path "$HOME/conda/envs/pytorch-py3.6/bin/python" --mpi_path "$HOME/.openmpi/" --evaluate_avg True

# dcd_psgd with quantize_qsgd for ring topology
# Launches run.py with --optimizer dcd_psgd, --comm_op quantize_qsgd and
# --graph_topology ring, using 8 MPI processes (--n_mpi_process 8).
# The whole invocation is a single command: every line but the last must end
# with a backslash and nothing may be placed between the continued lines,
# otherwise the shell ends the command early and runs the rest separately.
"$HOME/conda/envs/pytorch-py3.6/bin/python" run.py \
    --arch resnet20 --optimizer dcd_psgd \
    --avg_model True --experiment test \
    --data cifar10 --pin_memory True \
    --batch_size 128 --base_batch_size 64 --num_workers 0 --eval_freq 1 \
    --num_epochs 300 --partition_data random --reshuffle_per_epoch True --stop_criteria epoch \
    --n_mpi_process 8 --n_sub_process 1 --world 0,0,0,0,0,0,0,0 --on_cuda True --use_ipc False --comm_device cuda \
    --lr 0.1 --lr_scaleup True --lr_scaleup_factor graph --lr_warmup True --lr_warmup_epochs 5 \
    --lr_schedule_scheme custom_multistep --lr_change_epochs 150,225 \
    --weight_decay 1e-4 --use_nesterov True --momentum_factor 0.9 \
    --comm_op quantize_qsgd --compress_ratio 0.9 --quantize_level 16 --is_biased True \
    --hostfile iccluster/hostfile --graph_topology ring --track_time True --display_tracked_time True \
    --python_path "$HOME/conda/envs/pytorch-py3.6/bin/python" --mpi_path "$HOME/.openmpi/" --evaluate_avg True

# ecd_psgd with compress_top_k for ring topology
# Launches run.py with --optimizer ecd_psgd, --comm_op compress_top_k and
# --graph_topology ring, using 8 MPI processes (--n_mpi_process 8).
# The whole invocation is a single command: every line but the last must end
# with a backslash and nothing may be placed between the continued lines,
# otherwise the shell ends the command early and runs the rest separately.
"$HOME/conda/envs/pytorch-py3.6/bin/python" run.py \
    --arch resnet20 --optimizer ecd_psgd \
    --avg_model True --experiment test \
    --data cifar10 --pin_memory True \
    --batch_size 128 --base_batch_size 64 --num_workers 0 --eval_freq 1 \
    --num_epochs 300 --partition_data random --reshuffle_per_epoch True --stop_criteria epoch \
    --n_mpi_process 8 --n_sub_process 1 --world 0,0,0,0,0,0,0,0 --on_cuda True --use_ipc False --comm_device cuda \
    --lr 0.1 --lr_scaleup True --lr_scaleup_factor graph --lr_warmup True --lr_warmup_epochs 5 \
    --lr_schedule_scheme custom_multistep --lr_change_epochs 150,225 \
    --weight_decay 1e-4 --use_nesterov True --momentum_factor 0.9 \
    --comm_op compress_top_k --compress_ratio 0.9 --quantize_level 16 --is_biased True \
    --hostfile iccluster/hostfile --graph_topology ring --track_time True --display_tracked_time True \
    --python_path "$HOME/conda/envs/pytorch-py3.6/bin/python" --mpi_path "$HOME/.openmpi/" --evaluate_avg True