Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
Tsaris, Aristeidis (aris)
pytorch_tutorial
Commits
6aff9d3d
Commit
6aff9d3d
authored
Oct 08, 2021
by
Aristeidis Tsaris
Browse files
adding some more scripts
parent
001e4f69
Changes
19
Hide whitespace changes
Inline
Side-by-side
ascent/benchy_configs/benchy-conf-0-noddp.yaml
0 → 100644
View file @
6aff9d3d
global
:
report_freq
:
20
exit_after_tests
:
True
profiler_mode
:
'
single'
json_prefix
:
'
benchy_imagenet_0_noddp'
output_dir
:
'
ascent_logs'
IO
:
run_benchmark
:
True
nbatches
:
40
ntrials
:
2
nwarmup
:
1
synthetic
:
run_benchmark
:
True
nbatches
:
40
ntrials
:
2
nwarmup
:
1
full
:
run_benchmark
:
True
nbatches
:
40
ntrials
:
2
nwarmup
:
1
ascent/benchy_configs/benchy-conf-0.yaml
0 → 100644
View file @
6aff9d3d
global
:
report_freq
:
20
exit_after_tests
:
True
profiler_mode
:
'
single'
json_prefix
:
'
benchy_imagenet_0'
output_dir
:
'
ascent_logs'
IO
:
run_benchmark
:
True
nbatches
:
40
ntrials
:
2
nwarmup
:
1
synthetic
:
run_benchmark
:
True
nbatches
:
40
ntrials
:
2
nwarmup
:
1
full
:
run_benchmark
:
True
nbatches
:
40
ntrials
:
2
nwarmup
:
1
ascent/benchy_configs/benchy-conf-1-noddp.yaml
0 → 100644
View file @
6aff9d3d
global
:
report_freq
:
20
exit_after_tests
:
True
profiler_mode
:
'
single'
json_prefix
:
'
benchy_imagenet_1_noddp'
output_dir
:
'
ascent_logs'
IO
:
run_benchmark
:
True
nbatches
:
40
ntrials
:
2
nwarmup
:
1
synthetic
:
run_benchmark
:
True
nbatches
:
40
ntrials
:
2
nwarmup
:
1
full
:
run_benchmark
:
True
nbatches
:
40
ntrials
:
2
nwarmup
:
1
ascent/benchy_configs/benchy-conf-1.yaml
0 → 100644
View file @
6aff9d3d
global
:
report_freq
:
20
exit_after_tests
:
True
profiler_mode
:
'
single'
json_prefix
:
'
benchy_imagenet_1'
output_dir
:
'
ascent_logs'
IO
:
run_benchmark
:
True
nbatches
:
40
ntrials
:
2
nwarmup
:
1
synthetic
:
run_benchmark
:
True
nbatches
:
40
ntrials
:
2
nwarmup
:
1
full
:
run_benchmark
:
True
nbatches
:
40
ntrials
:
2
nwarmup
:
1
ascent/benchy_configs/benchy-conf-2-noddp.yaml
0 → 100644
View file @
6aff9d3d
global
:
report_freq
:
20
exit_after_tests
:
True
profiler_mode
:
'
single'
json_prefix
:
'
benchy_imagenet_2_noddp'
output_dir
:
'
ascent_logs'
IO
:
run_benchmark
:
True
nbatches
:
40
ntrials
:
2
nwarmup
:
1
synthetic
:
run_benchmark
:
True
nbatches
:
40
ntrials
:
2
nwarmup
:
1
full
:
run_benchmark
:
True
nbatches
:
40
ntrials
:
2
nwarmup
:
1
ascent/benchy_configs/benchy-conf-2.yaml
0 → 100644
View file @
6aff9d3d
global
:
report_freq
:
20
exit_after_tests
:
True
profiler_mode
:
'
single'
json_prefix
:
'
benchy_imagenet_2'
output_dir
:
'
ascent_logs'
IO
:
run_benchmark
:
True
nbatches
:
40
ntrials
:
2
nwarmup
:
1
synthetic
:
run_benchmark
:
True
nbatches
:
40
ntrials
:
2
nwarmup
:
1
full
:
run_benchmark
:
True
nbatches
:
40
ntrials
:
2
nwarmup
:
1
ascent/benchy_configs/benchy-conf-3-noddp.yaml
0 → 100644
View file @
6aff9d3d
global
:
report_freq
:
20
exit_after_tests
:
True
profiler_mode
:
'
single'
json_prefix
:
'
benchy_imagenet_3_noddp'
output_dir
:
'
ascent_logs'
IO
:
run_benchmark
:
True
nbatches
:
40
ntrials
:
2
nwarmup
:
1
synthetic
:
run_benchmark
:
True
nbatches
:
40
ntrials
:
2
nwarmup
:
1
full
:
run_benchmark
:
True
nbatches
:
40
ntrials
:
2
nwarmup
:
1
ascent/benchy_configs/benchy-conf-3.yaml
0 → 100644
View file @
6aff9d3d
global
:
report_freq
:
20
exit_after_tests
:
True
profiler_mode
:
'
single'
json_prefix
:
'
benchy_imagenet_3'
output_dir
:
'
ascent_logs'
IO
:
run_benchmark
:
True
nbatches
:
40
ntrials
:
2
nwarmup
:
1
synthetic
:
run_benchmark
:
True
nbatches
:
40
ntrials
:
2
nwarmup
:
1
full
:
run_benchmark
:
True
nbatches
:
40
ntrials
:
2
nwarmup
:
1
ascent/benchy_configs/benchy-conf-4-noddp.yaml
0 → 100644
View file @
6aff9d3d
global
:
report_freq
:
20
exit_after_tests
:
True
profiler_mode
:
'
single'
json_prefix
:
'
benchy_imagenet_4_noddp'
output_dir
:
'
ascent_logs'
IO
:
run_benchmark
:
True
nbatches
:
40
ntrials
:
2
nwarmup
:
1
synthetic
:
run_benchmark
:
True
nbatches
:
40
ntrials
:
2
nwarmup
:
1
full
:
run_benchmark
:
True
nbatches
:
40
ntrials
:
2
nwarmup
:
1
ascent/benchy_configs/benchy-conf-4.yaml
0 → 100644
View file @
6aff9d3d
global
:
report_freq
:
20
exit_after_tests
:
True
profiler_mode
:
'
single'
json_prefix
:
'
benchy_imagenet_4'
output_dir
:
'
ascent_logs'
IO
:
run_benchmark
:
True
nbatches
:
40
ntrials
:
2
nwarmup
:
1
synthetic
:
run_benchmark
:
True
nbatches
:
40
ntrials
:
2
nwarmup
:
1
full
:
run_benchmark
:
True
nbatches
:
40
ntrials
:
2
nwarmup
:
1
ascent/sub_
1GPU
.lsf
→
ascent/sub_
ips_0N
.lsf
View file @
6aff9d3d
...
...
@@ -11,107 +11,75 @@
nnodes
=
$(
cat
${
LSB_DJOB_HOSTFILE
}
|
sort
|
uniq
|
grep
-v
login |
grep
-v
batch |
wc
-l
)
DATA_DIR
=
/gpfs/wolf/gen166/proj-shared/atsaris/imagenet/data/
LOG_DIR
=
/gpfs/wolf/gen166/proj-shared/atsaris/pytorch_tutorial/ascent/
ascent_logs/
LOG_DIR
=
ascent_logs/
source
/gpfs/wolf/gen166/proj-shared/atsaris/env/activate.sh
echo
"Starting bency"
export
BENCHY_CONFIG_FILE
=
benchy_configs/benchy-conf-0.yaml
jsrun
--smpiargs
=
"-disable_gpu_hooks"
-n
${
nnodes
}
-a1
-c42
-g1
-r1
\
--bind
=
proportional-packed:7
--launch_distribution
=
packed
\
bash
-c
"
\
source export_DDP_envvars.sh &&
\
python -u ../imagenet/main.py
\
--arch resnet50
\
-j
2
8
\
-j 8
\
-p 10
\
-b 128
\
--training-only
\
--raport-file
${
LOG_DIR
}
/benchmark.sythetic.json
\
--epochs 1
\
--prof 100
\
--raport-file
${
LOG_DIR
}
/benchmark.dataload.opt.0N.json
\
--epochs 100
\
--no-checkpoints
\
--data-backend syntetic
\
${
DATA_DIR
}
"
jsrun
--smpiargs
=
"-disable_gpu_hooks"
-n
${
nnodes
}
-a1
-c42
-g1
-r1
\
--bind
=
proportional-packed:7
--launch_distribution
=
packed
\
bash
-c
"
\
source export_DDP_envvars.sh &&
\
python -u ../imagenet/main.py
\
--arch resnet50
\
-j 28
\
-p 10
\
-b 128
\
--training-only
\
--raport-file
${
LOG_DIR
}
/benchmark.sythetic.opt.json
\
--epochs 1
\
--prof 100
\
--no-checkpoints
\
--data-backend syntetic
\
--data-backend pytorch
\
--amp
\
--memory-format nhwc
\
--use-benchy
\
${
DATA_DIR
}
"
echo
"Starting bency no DDP"
export
BENCHY_CONFIG_FILE
=
benchy_configs/benchy-conf-0-noddp.yaml
jsrun
--smpiargs
=
"-disable_gpu_hooks"
-n
${
nnodes
}
-a1
-c42
-g1
-r1
\
--bind
=
proportional-packed:7
--launch_distribution
=
packed
\
bash
-c
"
\
source export_DDP_envvars.sh &&
\
python -u ../imagenet/main.py
\
--arch resnet50
\
-j
2
8
\
-j 8
\
-p 10
\
-b 128
\
--training-only
\
--raport-file
${
LOG_DIR
}
/benchmark.data.json
\
--epochs 1
\
--prof 100
\
--no-checkpoints
\
--data-backend pytorch
\
${
DATA_DIR
}
"
jsrun
--smpiargs
=
"-disable_gpu_hooks"
-n
${
nnodes
}
-a1
-c42
-g1
-r1
\
--bind
=
proportional-packed:7
--launch_distribution
=
packed
\
bash
-c
"
\
source export_DDP_envvars.sh &&
\
python -u ../imagenet/main.py
\
--arch resnet50
\
-j 28
\
-p 10
\
-b 128
\
--training-only
\
--raport-file
${
LOG_DIR
}
/benchmark.data.opt.json
\
--epochs 1
\
--prof 100
\
--raport-file
${
LOG_DIR
}
/benchmark.dataload.opt.0N.json
\
--epochs 100
\
--no-checkpoints
\
--data-backend pytorch
\
--amp
\
--memory-format nhwc
\
--use-benchy
\
--noDDP
\
${
DATA_DIR
}
"
exit
# DALI doesn't work on ascent,
# 6th GPU doesn't work on ascent
echo
"Starting bency dali"
export
BENCHY_CONFIG_FILE
=
benchy_configs/benchy-conf-0-dali.yaml
jsrun
--smpiargs
=
"-disable_gpu_hooks"
-n
${
nnodes
}
-a1
-c42
-g1
-r1
\
--bind
=
proportional-packed:7
--launch_distribution
=
packed
\
bash
-c
"
\
source export_DDP_envvars.sh &&
\
python -u ../imagenet/main.py
\
--arch resnet50
\
-j
2
8
\
-j 8
\
-p 10
\
-b 128
\
--training-only
\
--raport-file
${
LOG_DIR
}
/benchmark.data.opt.dali.json
\
--epochs 1
\
--prof 100
\
--raport-file
${
LOG_DIR
}
/benchmark.dataload.opt.0N.json
\
--epochs 100
\
--no-checkpoints
\
--data-backend dali-cpu
\
--amp
\
--memory-format nhwc
\
--use-benchy
\
${
DATA_DIR
}
"
ascent/sub_
test
_1N.lsf
→
ascent/sub_
ips
_1N.lsf
View file @
6aff9d3d
...
...
@@ -11,11 +11,12 @@
nnodes
=
$(
cat
${
LSB_DJOB_HOSTFILE
}
|
sort
|
uniq
|
grep
-v
login |
grep
-v
batch |
wc
-l
)
DATA_DIR
=
/gpfs/wolf/gen166/proj-shared/atsaris/imagenet/data/
LOG_DIR
=
/gpfs/wolf/gen166/proj-shared/atsaris/pytorch_tutorial/ascent/
ascent_logs/
LOG_DIR
=
ascent_logs/
source
/gpfs/wolf/gen166/proj-shared/atsaris/env/activate.sh
export
BENCHY_CONFIG_FILE
=
/gpfs/wolf/gen166/proj-shared/atsaris/pytorch_tutorial/ascent/benchy-conf.yaml
echo
"Starting bency"
export
BENCHY_CONFIG_FILE
=
benchy_configs/benchy-conf-1.yaml
jsrun
--smpiargs
=
"-disable_gpu_hooks"
-n
${
nnodes
}
-a5
-c42
-g5
-r1
\
--bind
=
proportional-packed:7
--launch_distribution
=
packed
\
bash
-c
"
\
...
...
@@ -33,69 +34,52 @@ jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a5 -c42 -g5 -r1 \
--amp
\
--memory-format nhwc
\
--use-benchy
\
--noDDP
\
${
DATA_DIR
}
"
exit
echo
"Starting bency no DDP"
export
BENCHY_CONFIG_FILE
=
benchy_configs/benchy-conf-1-noddp.yaml
jsrun
--smpiargs
=
"-disable_gpu_hooks"
-n
${
nnodes
}
-a5
-c42
-g5
-r1
\
--bind
=
proportional-packed:7
--launch_distribution
=
packed
\
bash
-c
"
\
source export_DDP_envvars.sh &&
\
python -u ../imagenet/main.py
\
--arch resnet50
\
-j
2
8
\
-j 8
\
-p 10
\
-b 128
\
--training-only
\
--raport-file
${
LOG_DIR
}
/benchmark.dataload.opt.1N.json
\
--epochs 1
\
--prof 100
\
--epochs 100
\
--no-checkpoints
\
--data-backend pytorch
\
--amp
\
--memory-format nhwc
\
--dtLdTime
\
--use-benchy
\
--noDDP
\
${
DATA_DIR
}
"
exit
echo
"Starting bency dali"
export
BENCHY_CONFIG_FILE
=
benchy_configs/benchy-conf-1-dali.yaml
jsrun
--smpiargs
=
"-disable_gpu_hooks"
-n
${
nnodes
}
-a5
-c42
-g5
-r1
\
--bind
=
proportional-packed:7
--launch_distribution
=
packed
\
bash
-c
"
\
source export_DDP_envvars.sh &&
\
python -u ../imagenet/main.py
\
--arch resnet50
\
-j
2
8
\
-j 8
\
-p 10
\
-b 128
\
--training-only
\
--raport-file
${
LOG_DIR
}
/benchmark.syntetic.opt.1N.json
\
--epochs 1
\
--prof 100
\
--raport-file
${
LOG_DIR
}
/benchmark.dataload.opt.1N.json
\
--epochs 100
\
--no-checkpoints
\
--data-backend
syntetic
\
--data-backend
dali-cpu
\
--amp
\
--memory-format nhwc
\
--use-benchy
\
${
DATA_DIR
}
"
jsrun
--smpiargs
=
"-disable_gpu_hooks"
-n
${
nnodes
}
-a5
-c42
-g5
-r1
\
--bind
=
proportional-packed:7
--launch_distribution
=
packed
\
bash
-c
"
\
source export_DDP_envvars.sh &&
\
python -u ../imagenet/main.py
\
--arch resnet50
\
-j 28
\
-p 10
\
-b 128
\
--training-only
\
--raport-file
${
LOG_DIR
}
/benchmark.syntetic.noddp.opt.1N.json
\
--epochs 1
\
--prof 100
\
--no-checkpoints
\
--data-backend syntetic
\
--amp
\
--memory-format nhwc
\
--noDDP
\
${
DATA_DIR
}
"
ascent/sub_
test
_2N.lsf
→
ascent/sub_
ips
_2N.lsf
View file @
6aff9d3d
...
...
@@ -11,11 +11,12 @@
nnodes
=
$(
cat
${
LSB_DJOB_HOSTFILE
}
|
sort
|
uniq
|
grep
-v
login |
grep
-v
batch |
wc
-l
)
DATA_DIR
=
/gpfs/wolf/gen166/proj-shared/atsaris/imagenet/data/
LOG_DIR
=
/gpfs/wolf/gen166/proj-shared/atsaris/pytorch_tutorial/ascent/
ascent_logs/
LOG_DIR
=
ascent_logs/
source
/gpfs/wolf/gen166/proj-shared/atsaris/env/activate.sh
export
BENCHY_CONFIG_FILE
=
/gpfs/wolf/gen166/proj-shared/atsaris/pytorch_tutorial/ascent/benchy-conf-2.yaml
echo
"Starting bency"
export
BENCHY_CONFIG_FILE
=
benchy_configs/benchy-conf-2.yaml
jsrun
--smpiargs
=
"-disable_gpu_hooks"
-n
${
nnodes
}
-a5
-c42
-g5
-r1
\
--bind
=
proportional-packed:7
--launch_distribution
=
packed
\
bash
-c
"
\
...
...
@@ -33,71 +34,52 @@ jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a5 -c42 -g5 -r1 \
--amp
\
--memory-format nhwc
\
--use-benchy
\
--noDDP
\
${
DATA_DIR
}
"
exit
echo
"Starting bency no DDP"
export
BENCHY_CONFIG_FILE
=
benchy_configs/benchy-conf-2-noddp.yaml
jsrun
--smpiargs
=
"-disable_gpu_hooks"
-n
${
nnodes
}
-a5
-c42
-g5
-r1
\
--bind
=
proportional-packed:7
--launch_distribution
=
packed
\
nsys profile
-t
cuda,nvtx
-o
./baseline_1_%q
{
OMPI_COMM_WORLD_RANK
}
\
bash
-c
"
\
source export_DDP_envvars.sh &&
\
python -u ../imagenet/main.py
\
--arch resnet50
\
-j
2
8
\
-j 8
\
-p 10
\
-b 128
\
--training-only
\
--raport-file
${
LOG_DIR
}
/benchmark.dataload.opt.2N.json
\
--epochs 1
\
--prof 100
\
--epochs 100
\
--no-checkpoints
\
--data-backend pytorch
\
--amp
\
--memory-format nhwc
\
--dtLdTime
\
--use-benchy
\
--noDDP
\
${
DATA_DIR
}
"
exit
echo
"Starting bency dali"
export
BENCHY_CONFIG_FILE
=
benchy_configs/benchy-conf-2-dali.yaml
jsrun
--smpiargs
=
"-disable_gpu_hooks"
-n
${
nnodes
}
-a5
-c42
-g5
-r1
\
--bind
=
proportional-packed:7
--launch_distribution
=
packed
\
bash
-c
"
\
source export_DDP_envvars.sh &&
\
python -u ../imagenet/main.py
\
--arch resnet50
\
-j
2
8
\
-j 8
\
-p 10
\
-b 128
\
--training-only
\
--raport-file
${
LOG_DIR
}
/benchmark.syntetic.opt.2N.json
\
--epochs 1
\
--prof 100
\
--raport-file
${
LOG_DIR
}
/benchmark.dataload.opt.2N.json
\
--epochs 100
\
--no-checkpoints
\
--data-backend
syntetic
\
--data-backend
dali-cpu
\
--amp
\
--memory-format nhwc
\
--use-benchy
\
${
DATA_DIR
}
"
jsrun
--smpiargs
=
"-disable_gpu_hooks"
-n
${
nnodes
}
-a5
-c42
-g5
-r1
\
--bind
=
proportional-packed:7
--launch_distribution
=
packed
\
bash
-c
"
\
source export_DDP_envvars.sh &&
\
python -u ../imagenet/main.py
\
--arch resnet50
\
-j 28
\
-p 10
\
-b 128
\
--training-only
\
--raport-file
${
LOG_DIR
}
/benchmark.syntetic.noddp.opt.2N.json
\
--epochs 1
\
--prof 100
\
--no-checkpoints
\
--data-backend syntetic
\
--amp
\
--memory-format nhwc
\
--noDDP
\
${
DATA_DIR
}
"
ascent/sub_
test
_3N.lsf
→
ascent/sub_
ips
_3N.lsf
View file @
6aff9d3d
...
...
@@ -11,11 +11,12 @@
nnodes
=
$(
cat
${
LSB_DJOB_HOSTFILE
}
|
sort
|
uniq
|
grep
-v
login |
grep
-v
batch |
wc
-l
)
DATA_DIR
=
/gpfs/wolf/gen166/proj-shared/atsaris/imagenet/data/
LOG_DIR
=
/gpfs/wolf/gen166/proj-shared/atsaris/pytorch_tutorial/ascent/
ascent_logs/
LOG_DIR
=
ascent_logs/
source
/gpfs/wolf/gen166/proj-shared/atsaris/env/activate.sh
export
BENCHY_CONFIG_FILE
=
/gpfs/wolf/gen166/proj-shared/atsaris/pytorch_tutorial/ascent/benchy-conf-3.yaml
echo
"Starting bency"
export
BENCHY_CONFIG_FILE
=
benchy_configs/benchy-conf-3.yaml
jsrun
--smpiargs
=
"-disable_gpu_hooks"
-n
${
nnodes
}
-a5
-c42
-g5
-r1
\
--bind
=
proportional-packed:7
--launch_distribution
=
packed
\
bash
-c
"
\
...
...
@@ -33,69 +34,52 @@ jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a5 -c42 -g5 -r1 \
--amp
\
--memory-format nhwc
\
--use-benchy
\
--noDDP
\
${
DATA_DIR
}
"
exit
echo
"Starting bency no DDP"
export
BENCHY_CONFIG_FILE
=
benchy_configs/benchy-conf-3-noddp.yaml
jsrun
--smpiargs
=
"-disable_gpu_hooks"
-n
${
nnodes
}
-a5
-c42
-g5
-r1
\
--bind
=
proportional-packed:7
--launch_distribution
=
packed
\
bash
-c
"
\
source export_DDP_envvars.sh &&
\
python -u ../imagenet/main.py
\
--arch resnet50
\
-j
2
8
\
-j 8
\
-p 10
\
-b 128
\
--training-only
\
--raport-file
${
LOG_DIR
}
/benchmark.dataload.opt.3N.json
\
--epochs 1
\
--prof 100
\
--epochs 100
\