Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
Tsaris, Aristeidis (aris)
pytorch_tutorial
Commits
001e4f69
Commit
001e4f69
authored
Oct 07, 2021
by
Aristeidis Tsaris
Browse files
adding another notebook
parent
5a104e99
Changes
7
Hide whitespace changes
Inline
Side-by-side
ascent/ascent_logs/parse_json_MultiGPU.ipynb
View file @
001e4f69
This source diff could not be displayed because it is too large. You can
view the blob
instead.
ascent/sub_test_1N.lsf
View file @
001e4f69
...
...
@@ -14,7 +14,30 @@ DATA_DIR=/gpfs/wolf/gen166/proj-shared/atsaris/imagenet/data/
LOG_DIR
=
/gpfs/wolf/gen166/proj-shared/atsaris/pytorch_tutorial/ascent/ascent_logs/
source
/gpfs/wolf/gen166/proj-shared/atsaris/env/activate.sh
export
BENCHY_CONFIG_FILE
=
/gpfs/wolf/gen166/proj-shared/atsaris/pytorch_tutorial/ascent/benchy-conf.yaml
jsrun
--smpiargs
=
"-disable_gpu_hooks"
-n
${
nnodes
}
-a5
-c42
-g5
-r1
\
--bind
=
proportional-packed:7
--launch_distribution
=
packed
\
bash
-c
"
\
source export_DDP_envvars.sh &&
\
python -u ../imagenet/main.py
\
--arch resnet50
\
-j 8
\
-p 10
\
-b 128
\
--training-only
\
--raport-file
${
LOG_DIR
}
/benchmark.dataload.opt.1N.json
\
--epochs 100
\
--no-checkpoints
\
--data-backend pytorch
\
--amp
\
--memory-format nhwc
\
--use-benchy
\
--noDDP
\
${
DATA_DIR
}
"
exit
jsrun
--smpiargs
=
"-disable_gpu_hooks"
-n
${
nnodes
}
-a5
-c42
-g5
-r1
\
--bind
=
proportional-packed:7
--launch_distribution
=
packed
\
bash
-c
"
\
...
...
ascent/sub_test_2N.lsf
View file @
001e4f69
...
...
@@ -14,6 +14,7 @@ DATA_DIR=/gpfs/wolf/gen166/proj-shared/atsaris/imagenet/data/
LOG_DIR
=
/gpfs/wolf/gen166/proj-shared/atsaris/pytorch_tutorial/ascent/ascent_logs/
source
/gpfs/wolf/gen166/proj-shared/atsaris/env/activate.sh
export
BENCHY_CONFIG_FILE
=
/gpfs/wolf/gen166/proj-shared/atsaris/pytorch_tutorial/ascent/benchy-conf-2.yaml
jsrun
--smpiargs
=
"-disable_gpu_hooks"
-n
${
nnodes
}
-a5
-c42
-g5
-r1
\
--bind
=
proportional-packed:7
--launch_distribution
=
packed
\
...
...
@@ -21,6 +22,29 @@ jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a5 -c42 -g5 -r1 \
source export_DDP_envvars.sh &&
\
python -u ../imagenet/main.py
\
--arch resnet50
\
-j 8
\
-p 10
\
-b 128
\
--training-only
\
--raport-file
${
LOG_DIR
}
/benchmark.dataload.opt.2N.json
\
--epochs 100
\
--no-checkpoints
\
--data-backend pytorch
\
--amp
\
--memory-format nhwc
\
--use-benchy
\
--noDDP
\
${
DATA_DIR
}
"
exit
jsrun
--smpiargs
=
"-disable_gpu_hooks"
-n
${
nnodes
}
-a5
-c42
-g5
-r1
\
--bind
=
proportional-packed:7
--launch_distribution
=
packed
\
nsys profile
-t
cuda,nvtx
-o
./baseline_1_%q
{
OMPI_COMM_WORLD_RANK
}
\
bash
-c
"
\
source export_DDP_envvars.sh &&
\
python -u ../imagenet/main.py
\
--arch resnet50
\
-j 28
\
-p 10
\
-b 128
\
...
...
@@ -36,6 +60,7 @@ jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a5 -c42 -g5 -r1 \
${
DATA_DIR
}
"
exit
jsrun
--smpiargs
=
"-disable_gpu_hooks"
-n
${
nnodes
}
-a5
-c42
-g5
-r1
\
--bind
=
proportional-packed:7
--launch_distribution
=
packed
\
bash
-c
"
\
...
...
ascent/sub_test_3N.lsf
View file @
001e4f69
...
...
@@ -14,7 +14,30 @@ DATA_DIR=/gpfs/wolf/gen166/proj-shared/atsaris/imagenet/data/
LOG_DIR
=
/gpfs/wolf/gen166/proj-shared/atsaris/pytorch_tutorial/ascent/ascent_logs/
source
/gpfs/wolf/gen166/proj-shared/atsaris/env/activate.sh
export
BENCHY_CONFIG_FILE
=
/gpfs/wolf/gen166/proj-shared/atsaris/pytorch_tutorial/ascent/benchy-conf-3.yaml
jsrun
--smpiargs
=
"-disable_gpu_hooks"
-n
${
nnodes
}
-a5
-c42
-g5
-r1
\
--bind
=
proportional-packed:7
--launch_distribution
=
packed
\
bash
-c
"
\
source export_DDP_envvars.sh &&
\
python -u ../imagenet/main.py
\
--arch resnet50
\
-j 8
\
-p 10
\
-b 128
\
--training-only
\
--raport-file
${
LOG_DIR
}
/benchmark.dataload.opt.3N.json
\
--epochs 100
\
--no-checkpoints
\
--data-backend pytorch
\
--amp
\
--memory-format nhwc
\
--use-benchy
\
--noDDP
\
${
DATA_DIR
}
"
exit
jsrun
--smpiargs
=
"-disable_gpu_hooks"
-n
${
nnodes
}
-a5
-c42
-g5
-r1
\
--bind
=
proportional-packed:7
--launch_distribution
=
packed
\
bash
-c
"
\
...
...
ascent/sub_test_4N.lsf
View file @
001e4f69
...
...
@@ -14,7 +14,30 @@ DATA_DIR=/gpfs/wolf/gen166/proj-shared/atsaris/imagenet/data/
LOG_DIR
=
/gpfs/wolf/gen166/proj-shared/atsaris/pytorch_tutorial/ascent/ascent_logs/
source
/gpfs/wolf/gen166/proj-shared/atsaris/env/activate.sh
export
BENCHY_CONFIG_FILE
=
/gpfs/wolf/gen166/proj-shared/atsaris/pytorch_tutorial/ascent/benchy-conf-4.yaml
jsrun
--smpiargs
=
"-disable_gpu_hooks"
-n
${
nnodes
}
-a5
-c42
-g5
-r1
\
--bind
=
proportional-packed:7
--launch_distribution
=
packed
\
bash
-c
"
\
source export_DDP_envvars.sh &&
\
python -u ../imagenet/main.py
\
--arch resnet50
\
-j 8
\
-p 10
\
-b 128
\
--training-only
\
--raport-file
${
LOG_DIR
}
/benchmark.dataload.opt.4N.json
\
--epochs 100
\
--no-checkpoints
\
--data-backend pytorch
\
--amp
\
--memory-format nhwc
\
--use-benchy
\
--noDDP
\
${
DATA_DIR
}
"
exit
jsrun
--smpiargs
=
"-disable_gpu_hooks"
-n
${
nnodes
}
-a5
-c42
-g5
-r1
\
--bind
=
proportional-packed:7
--launch_distribution
=
packed
\
bash
-c
"
\
...
...
@@ -76,3 +99,23 @@ jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a5 -c42 -g5 -r1 \
--noDDP
\
${
DATA_DIR
}
"
jsrun
--smpiargs
=
"-disable_gpu_hooks"
-n
${
nnodes
}
-a5
-c42
-g5
-r1
\
--bind
=
proportional-packed:7
--launch_distribution
=
packed
\
bash
-c
"
\
source export_DDP_envvars.sh &&
\
python -u ../imagenet/main.py
\
--arch resnet50
\
-j 28
\
-p 10
\
-b 128
\
--training-only
\
--raport-file
${
LOG_DIR
}
/benchmark.data.opt.4N.json
\
--epochs 1
\
--prof 100
\
--no-checkpoints
\
--data-backend pytorch
\
--amp
\
--memory-format nhwc
\
${
DATA_DIR
}
"
imagenet/image_classification/dataloaders.py
View file @
001e4f69
...
...
@@ -444,7 +444,7 @@ def get_pytorch_train_loader(
pin_memory
=
True
,
collate_fn
=
partial
(
fast_collate
,
memory_format
),
drop_last
=
True
,
persistent_workers
=
True
,
persistent_workers
=
False
,
# see https://github.com/pytorch/pytorch/issues/48370
)
return
(
...
...
imagenet/main.py
View file @
001e4f69
...
...
@@ -331,7 +331,7 @@ def add_parser_arguments(parser, skip_arch=False):
required
=
False
,
help
=
"number of classes"
)
parser
.
add_argument
(
"--use-benchy"
,
action
=
"store_true"
,
help
=
"enable benchy"
)
def
prepare_for_training
(
args
,
model_args
,
model_arch
):
...
...
@@ -562,6 +562,14 @@ def main(args, model_args, model_arch):
model_and_loss
,
optimizer
,
lr_policy
,
scaler
,
train_loader
,
val_loader
,
logger
,
ema
,
model_ema
,
train_loader_len
,
\
batch_size_multiplier
,
start_epoch
=
prepare_for_training
(
args
,
model_args
,
model_arch
)
if
args
.
use_benchy
:
try
:
from
benchy.torch
import
BenchmarkGenericIteratorWrapper
train_loader
=
BenchmarkGenericIteratorWrapper
(
train_loader
,
args
.
batch_size
)
except
:
print
(
"Requested to use benchy but could not find library. Ignoring..."
)
if
(
args
.
dtLdTime
):
dtLdTime
(
train_loader
,
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment