Commit 6aff9d3d authored by Aristeidis Tsaris
Browse files

adding some more scripts

parent 001e4f69
---
# Benchy configuration whose json_prefix is 'benchy_imagenet_0_noddp'.
# NOTE(review): section nesting restored. With every key at column 0 the
# run_benchmark/nbatches/ntrials/nwarmup groups were duplicate top-level keys
# (last one wins) and the section headers parsed as null. Confirm the layout
# against the benchy documentation.
global:
  report_freq: 20
  exit_after_tests: true
  profiler_mode: 'single'
  json_prefix: 'benchy_imagenet_0_noddp'
  output_dir: 'ascent_logs'

# One group of identical knobs per section: IO, synthetic, full.
IO:
  run_benchmark: true
  nbatches: 40
  ntrials: 2
  nwarmup: 1

synthetic:
  run_benchmark: true
  nbatches: 40
  ntrials: 2
  nwarmup: 1

full:
  run_benchmark: true
  nbatches: 40
  ntrials: 2
  nwarmup: 1
---
# Benchy configuration whose json_prefix is 'benchy_imagenet_0'.
# NOTE(review): section nesting restored. With every key at column 0 the
# run_benchmark/nbatches/ntrials/nwarmup groups were duplicate top-level keys
# (last one wins) and the section headers parsed as null. Confirm the layout
# against the benchy documentation.
global:
  report_freq: 20
  exit_after_tests: true
  profiler_mode: 'single'
  json_prefix: 'benchy_imagenet_0'
  output_dir: 'ascent_logs'

# One group of identical knobs per section: IO, synthetic, full.
IO:
  run_benchmark: true
  nbatches: 40
  ntrials: 2
  nwarmup: 1

synthetic:
  run_benchmark: true
  nbatches: 40
  ntrials: 2
  nwarmup: 1

full:
  run_benchmark: true
  nbatches: 40
  ntrials: 2
  nwarmup: 1
---
# Benchy configuration whose json_prefix is 'benchy_imagenet_1_noddp'.
# NOTE(review): section nesting restored. With every key at column 0 the
# run_benchmark/nbatches/ntrials/nwarmup groups were duplicate top-level keys
# (last one wins) and the section headers parsed as null. Confirm the layout
# against the benchy documentation.
global:
  report_freq: 20
  exit_after_tests: true
  profiler_mode: 'single'
  json_prefix: 'benchy_imagenet_1_noddp'
  output_dir: 'ascent_logs'

# One group of identical knobs per section: IO, synthetic, full.
IO:
  run_benchmark: true
  nbatches: 40
  ntrials: 2
  nwarmup: 1

synthetic:
  run_benchmark: true
  nbatches: 40
  ntrials: 2
  nwarmup: 1

full:
  run_benchmark: true
  nbatches: 40
  ntrials: 2
  nwarmup: 1
---
# Benchy configuration whose json_prefix is 'benchy_imagenet_1'.
# NOTE(review): section nesting restored. With every key at column 0 the
# run_benchmark/nbatches/ntrials/nwarmup groups were duplicate top-level keys
# (last one wins) and the section headers parsed as null. Confirm the layout
# against the benchy documentation.
global:
  report_freq: 20
  exit_after_tests: true
  profiler_mode: 'single'
  json_prefix: 'benchy_imagenet_1'
  output_dir: 'ascent_logs'

# One group of identical knobs per section: IO, synthetic, full.
IO:
  run_benchmark: true
  nbatches: 40
  ntrials: 2
  nwarmup: 1

synthetic:
  run_benchmark: true
  nbatches: 40
  ntrials: 2
  nwarmup: 1

full:
  run_benchmark: true
  nbatches: 40
  ntrials: 2
  nwarmup: 1
---
# Benchy configuration whose json_prefix is 'benchy_imagenet_2_noddp'.
# NOTE(review): section nesting restored. With every key at column 0 the
# run_benchmark/nbatches/ntrials/nwarmup groups were duplicate top-level keys
# (last one wins) and the section headers parsed as null. Confirm the layout
# against the benchy documentation.
global:
  report_freq: 20
  exit_after_tests: true
  profiler_mode: 'single'
  json_prefix: 'benchy_imagenet_2_noddp'
  output_dir: 'ascent_logs'

# One group of identical knobs per section: IO, synthetic, full.
IO:
  run_benchmark: true
  nbatches: 40
  ntrials: 2
  nwarmup: 1

synthetic:
  run_benchmark: true
  nbatches: 40
  ntrials: 2
  nwarmup: 1

full:
  run_benchmark: true
  nbatches: 40
  ntrials: 2
  nwarmup: 1
---
# Benchy configuration whose json_prefix is 'benchy_imagenet_2'.
# NOTE(review): section nesting restored. With every key at column 0 the
# run_benchmark/nbatches/ntrials/nwarmup groups were duplicate top-level keys
# (last one wins) and the section headers parsed as null. Confirm the layout
# against the benchy documentation.
global:
  report_freq: 20
  exit_after_tests: true
  profiler_mode: 'single'
  json_prefix: 'benchy_imagenet_2'
  output_dir: 'ascent_logs'

# One group of identical knobs per section: IO, synthetic, full.
IO:
  run_benchmark: true
  nbatches: 40
  ntrials: 2
  nwarmup: 1

synthetic:
  run_benchmark: true
  nbatches: 40
  ntrials: 2
  nwarmup: 1

full:
  run_benchmark: true
  nbatches: 40
  ntrials: 2
  nwarmup: 1
---
# Benchy configuration whose json_prefix is 'benchy_imagenet_3_noddp'.
# NOTE(review): section nesting restored. With every key at column 0 the
# run_benchmark/nbatches/ntrials/nwarmup groups were duplicate top-level keys
# (last one wins) and the section headers parsed as null. Confirm the layout
# against the benchy documentation.
global:
  report_freq: 20
  exit_after_tests: true
  profiler_mode: 'single'
  json_prefix: 'benchy_imagenet_3_noddp'
  output_dir: 'ascent_logs'

# One group of identical knobs per section: IO, synthetic, full.
IO:
  run_benchmark: true
  nbatches: 40
  ntrials: 2
  nwarmup: 1

synthetic:
  run_benchmark: true
  nbatches: 40
  ntrials: 2
  nwarmup: 1

full:
  run_benchmark: true
  nbatches: 40
  ntrials: 2
  nwarmup: 1
---
# Benchy configuration whose json_prefix is 'benchy_imagenet_3'.
# NOTE(review): section nesting restored. With every key at column 0 the
# run_benchmark/nbatches/ntrials/nwarmup groups were duplicate top-level keys
# (last one wins) and the section headers parsed as null. Confirm the layout
# against the benchy documentation.
global:
  report_freq: 20
  exit_after_tests: true
  profiler_mode: 'single'
  json_prefix: 'benchy_imagenet_3'
  output_dir: 'ascent_logs'

# One group of identical knobs per section: IO, synthetic, full.
IO:
  run_benchmark: true
  nbatches: 40
  ntrials: 2
  nwarmup: 1

synthetic:
  run_benchmark: true
  nbatches: 40
  ntrials: 2
  nwarmup: 1

full:
  run_benchmark: true
  nbatches: 40
  ntrials: 2
  nwarmup: 1
---
# Benchy configuration whose json_prefix is 'benchy_imagenet_4_noddp'.
# NOTE(review): section nesting restored. With every key at column 0 the
# run_benchmark/nbatches/ntrials/nwarmup groups were duplicate top-level keys
# (last one wins) and the section headers parsed as null. Confirm the layout
# against the benchy documentation.
global:
  report_freq: 20
  exit_after_tests: true
  profiler_mode: 'single'
  json_prefix: 'benchy_imagenet_4_noddp'
  output_dir: 'ascent_logs'

# One group of identical knobs per section: IO, synthetic, full.
IO:
  run_benchmark: true
  nbatches: 40
  ntrials: 2
  nwarmup: 1

synthetic:
  run_benchmark: true
  nbatches: 40
  ntrials: 2
  nwarmup: 1

full:
  run_benchmark: true
  nbatches: 40
  ntrials: 2
  nwarmup: 1
---
# Benchy configuration whose json_prefix is 'benchy_imagenet_4'.
# NOTE(review): section nesting restored. With every key at column 0 the
# run_benchmark/nbatches/ntrials/nwarmup groups were duplicate top-level keys
# (last one wins) and the section headers parsed as null. Confirm the layout
# against the benchy documentation.
global:
  report_freq: 20
  exit_after_tests: true
  profiler_mode: 'single'
  json_prefix: 'benchy_imagenet_4'
  output_dir: 'ascent_logs'

# One group of identical knobs per section: IO, synthetic, full.
IO:
  run_benchmark: true
  nbatches: 40
  ntrials: 2
  nwarmup: 1

synthetic:
  run_benchmark: true
  nbatches: 40
  ntrials: 2
  nwarmup: 1

full:
  run_benchmark: true
  nbatches: 40
  ntrials: 2
  nwarmup: 1
......@@ -11,107 +11,75 @@
nnodes=$(cat ${LSB_DJOB_HOSTFILE} | sort | uniq | grep -v login | grep -v batch | wc -l)
DATA_DIR=/gpfs/wolf/gen166/proj-shared/atsaris/imagenet/data/
LOG_DIR=/gpfs/wolf/gen166/proj-shared/atsaris/pytorch_tutorial/ascent/ascent_logs/
LOG_DIR=ascent_logs/
source /gpfs/wolf/gen166/proj-shared/atsaris/env/activate.sh
echo "Starting bency"
export BENCHY_CONFIG_FILE=benchy_configs/benchy-conf-0.yaml
jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a1 -c42 -g1 -r1 \
--bind=proportional-packed:7 --launch_distribution=packed \
bash -c "\
source export_DDP_envvars.sh && \
python -u ../imagenet/main.py \
--arch resnet50 \
-j 28 \
-j 8 \
-p 10 \
-b 128\
--training-only \
--raport-file ${LOG_DIR}/benchmark.sythetic.json \
--epochs 1 \
--prof 100 \
--raport-file ${LOG_DIR}/benchmark.dataload.opt.0N.json \
--epochs 100 \
--no-checkpoints \
--data-backend syntetic \
${DATA_DIR}
"
jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a1 -c42 -g1 -r1 \
--bind=proportional-packed:7 --launch_distribution=packed \
bash -c "\
source export_DDP_envvars.sh && \
python -u ../imagenet/main.py \
--arch resnet50 \
-j 28 \
-p 10 \
-b 128\
--training-only \
--raport-file ${LOG_DIR}/benchmark.sythetic.opt.json \
--epochs 1 \
--prof 100 \
--no-checkpoints \
--data-backend syntetic \
--data-backend pytorch \
--amp \
--memory-format nhwc \
--use-benchy \
${DATA_DIR}
"
echo "Starting bency no DDP"
export BENCHY_CONFIG_FILE=benchy_configs/benchy-conf-0-noddp.yaml
jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a1 -c42 -g1 -r1 \
--bind=proportional-packed:7 --launch_distribution=packed \
bash -c "\
source export_DDP_envvars.sh && \
python -u ../imagenet/main.py \
--arch resnet50 \
-j 28 \
-j 8 \
-p 10 \
-b 128\
--training-only \
--raport-file ${LOG_DIR}/benchmark.data.json \
--epochs 1 \
--prof 100 \
--no-checkpoints \
--data-backend pytorch \
${DATA_DIR}
"
jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a1 -c42 -g1 -r1 \
--bind=proportional-packed:7 --launch_distribution=packed \
bash -c "\
source export_DDP_envvars.sh && \
python -u ../imagenet/main.py \
--arch resnet50 \
-j 28 \
-p 10 \
-b 128\
--training-only \
--raport-file ${LOG_DIR}/benchmark.data.opt.json \
--epochs 1 \
--prof 100 \
--raport-file ${LOG_DIR}/benchmark.dataload.opt.0N.json \
--epochs 100 \
--no-checkpoints \
--data-backend pytorch \
--amp \
--memory-format nhwc \
--use-benchy \
--noDDP \
${DATA_DIR}
"
exit
# DALI doesn't work on ascent,
# 6th GPU doesn't work on ascent
echo "Starting bency dali"
export BENCHY_CONFIG_FILE=benchy_configs/benchy-conf-0-dali.yaml
jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a1 -c42 -g1 -r1 \
--bind=proportional-packed:7 --launch_distribution=packed \
bash -c "\
source export_DDP_envvars.sh && \
python -u ../imagenet/main.py \
--arch resnet50 \
-j 28 \
-j 8 \
-p 10 \
-b 128\
--training-only \
--raport-file ${LOG_DIR}/benchmark.data.opt.dali.json \
--epochs 1 \
--prof 100 \
--raport-file ${LOG_DIR}/benchmark.dataload.opt.0N.json \
--epochs 100 \
--no-checkpoints \
--data-backend dali-cpu \
--amp \
--memory-format nhwc \
--use-benchy \
${DATA_DIR}
"
......@@ -11,11 +11,12 @@
nnodes=$(cat ${LSB_DJOB_HOSTFILE} | sort | uniq | grep -v login | grep -v batch | wc -l)
DATA_DIR=/gpfs/wolf/gen166/proj-shared/atsaris/imagenet/data/
LOG_DIR=/gpfs/wolf/gen166/proj-shared/atsaris/pytorch_tutorial/ascent/ascent_logs/
LOG_DIR=ascent_logs/
source /gpfs/wolf/gen166/proj-shared/atsaris/env/activate.sh
export BENCHY_CONFIG_FILE=/gpfs/wolf/gen166/proj-shared/atsaris/pytorch_tutorial/ascent/benchy-conf.yaml
echo "Starting bency"
export BENCHY_CONFIG_FILE=benchy_configs/benchy-conf-1.yaml
jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a5 -c42 -g5 -r1 \
--bind=proportional-packed:7 --launch_distribution=packed \
bash -c "\
......@@ -33,69 +34,52 @@ jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a5 -c42 -g5 -r1 \
--amp \
--memory-format nhwc \
--use-benchy \
--noDDP \
${DATA_DIR}
"
exit
echo "Starting bency no DDP"
export BENCHY_CONFIG_FILE=benchy_configs/benchy-conf-1-noddp.yaml
jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a5 -c42 -g5 -r1 \
--bind=proportional-packed:7 --launch_distribution=packed \
bash -c "\
source export_DDP_envvars.sh && \
python -u ../imagenet/main.py \
--arch resnet50 \
-j 28 \
-j 8 \
-p 10 \
-b 128\
--training-only \
--raport-file ${LOG_DIR}/benchmark.dataload.opt.1N.json \
--epochs 1 \
--prof 100 \
--epochs 100 \
--no-checkpoints \
--data-backend pytorch \
--amp \
--memory-format nhwc \
--dtLdTime \
--use-benchy \
--noDDP \
${DATA_DIR}
"
exit
echo "Starting bency dali"
export BENCHY_CONFIG_FILE=benchy_configs/benchy-conf-1-dali.yaml
jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a5 -c42 -g5 -r1 \
--bind=proportional-packed:7 --launch_distribution=packed \
bash -c "\
source export_DDP_envvars.sh && \
python -u ../imagenet/main.py \
--arch resnet50 \
-j 28 \
-j 8 \
-p 10 \
-b 128\
--training-only \
--raport-file ${LOG_DIR}/benchmark.syntetic.opt.1N.json \
--epochs 1 \
--prof 100 \
--raport-file ${LOG_DIR}/benchmark.dataload.opt.1N.json \
--epochs 100 \
--no-checkpoints \
--data-backend syntetic \
--data-backend dali-cpu \
--amp \
--memory-format nhwc \
--use-benchy \
${DATA_DIR}
"
jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a5 -c42 -g5 -r1 \
--bind=proportional-packed:7 --launch_distribution=packed \
bash -c "\
source export_DDP_envvars.sh && \
python -u ../imagenet/main.py \
--arch resnet50 \
-j 28 \
-p 10 \
-b 128\
--training-only \
--raport-file ${LOG_DIR}/benchmark.syntetic.noddp.opt.1N.json \
--epochs 1 \
--prof 100 \
--no-checkpoints \
--data-backend syntetic \
--amp \
--memory-format nhwc \
--noDDP \
${DATA_DIR}
"
......@@ -11,11 +11,12 @@
nnodes=$(cat ${LSB_DJOB_HOSTFILE} | sort | uniq | grep -v login | grep -v batch | wc -l)
DATA_DIR=/gpfs/wolf/gen166/proj-shared/atsaris/imagenet/data/
LOG_DIR=/gpfs/wolf/gen166/proj-shared/atsaris/pytorch_tutorial/ascent/ascent_logs/
LOG_DIR=ascent_logs/
source /gpfs/wolf/gen166/proj-shared/atsaris/env/activate.sh
export BENCHY_CONFIG_FILE=/gpfs/wolf/gen166/proj-shared/atsaris/pytorch_tutorial/ascent/benchy-conf-2.yaml
echo "Starting bency"
export BENCHY_CONFIG_FILE=benchy_configs/benchy-conf-2.yaml
jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a5 -c42 -g5 -r1 \
--bind=proportional-packed:7 --launch_distribution=packed \
bash -c "\
......@@ -33,71 +34,52 @@ jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a5 -c42 -g5 -r1 \
--amp \
--memory-format nhwc \
--use-benchy \
--noDDP \
${DATA_DIR}
"
exit
echo "Starting bency no DDP"
export BENCHY_CONFIG_FILE=benchy_configs/benchy-conf-2-noddp.yaml
jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a5 -c42 -g5 -r1 \
--bind=proportional-packed:7 --launch_distribution=packed \
nsys profile -t cuda,nvtx -o ./baseline_1_%q{OMPI_COMM_WORLD_RANK} \
bash -c "\
source export_DDP_envvars.sh && \
python -u ../imagenet/main.py \
--arch resnet50 \
-j 28 \
-j 8 \
-p 10 \
-b 128\
--training-only \
--raport-file ${LOG_DIR}/benchmark.dataload.opt.2N.json \
--epochs 1 \
--prof 100 \
--epochs 100 \
--no-checkpoints \
--data-backend pytorch \
--amp \
--memory-format nhwc \
--dtLdTime \
--use-benchy \
--noDDP \
${DATA_DIR}
"
exit
echo "Starting bency dali"
export BENCHY_CONFIG_FILE=benchy_configs/benchy-conf-2-dali.yaml
jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a5 -c42 -g5 -r1 \
--bind=proportional-packed:7 --launch_distribution=packed \
bash -c "\
source export_DDP_envvars.sh && \
python -u ../imagenet/main.py \
--arch resnet50 \
-j 28 \
-j 8 \
-p 10 \
-b 128\
--training-only \
--raport-file ${LOG_DIR}/benchmark.syntetic.opt.2N.json \
--epochs 1 \
--prof 100 \
--raport-file ${LOG_DIR}/benchmark.dataload.opt.2N.json \
--epochs 100 \
--no-checkpoints \
--data-backend syntetic \
--data-backend dali-cpu \
--amp \
--memory-format nhwc \
--use-benchy \
${DATA_DIR}
"
jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a5 -c42 -g5 -r1 \
--bind=proportional-packed:7 --launch_distribution=packed \
bash -c "\
source export_DDP_envvars.sh && \
python -u ../imagenet/main.py \
--arch resnet50 \
-j 28 \
-p 10 \
-b 128\
--training-only \
--raport-file ${LOG_DIR}/benchmark.syntetic.noddp.opt.2N.json \
--epochs 1 \
--prof 100 \
--no-checkpoints \
--data-backend syntetic \
--amp \
--memory-format nhwc \
--noDDP \
${DATA_DIR}
"
......@@ -11,11 +11,12 @@
nnodes=$(cat ${LSB_DJOB_HOSTFILE} | sort | uniq | grep -v login | grep -v batch | wc -l)
DATA_DIR=/gpfs/wolf/gen166/proj-shared/atsaris/imagenet/data/
LOG_DIR=/gpfs/wolf/gen166/proj-shared/atsaris/pytorch_tutorial/ascent/ascent_logs/
LOG_DIR=ascent_logs/
source /gpfs/wolf/gen166/proj-shared/atsaris/env/activate.sh
export BENCHY_CONFIG_FILE=/gpfs/wolf/gen166/proj-shared/atsaris/pytorch_tutorial/ascent/benchy-conf-3.yaml
echo "Starting bency"
export BENCHY_CONFIG_FILE=benchy_configs/benchy-conf-3.yaml
jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a5 -c42 -g5 -r1 \
--bind=proportional-packed:7 --launch_distribution=packed \
bash -c "\
......@@ -33,69 +34,52 @@ jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a5 -c42 -g5 -r1 \
--amp \
--memory-format nhwc \
--use-benchy \
--noDDP \
${DATA_DIR}
"
exit
echo "Starting bency no DDP"
export BENCHY_CONFIG_FILE=benchy_configs/benchy-conf-3-noddp.yaml
jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a5 -c42 -g5 -r1 \
--bind=proportional-packed:7 --launch_distribution=packed \
bash -c "\
source export_DDP_envvars.sh && \
python -u ../imagenet/main.py \
--arch resnet50 \
-j 28 \
-j 8 \
-p 10 \
-b 128\
--training-only \
--raport-file ${LOG_DIR}/benchmark.dataload.opt.3N.json \
--epochs 1 \
--prof 100 \
--epochs 100 \