Commit 001e4f69 authored by Aristeidis Tsaris

adding another notebook

parent 5a104e99
[Diff for the added notebook is too large to display inline; view the blob instead.]
@@ -14,7 +14,30 @@ DATA_DIR=/gpfs/wolf/gen166/proj-shared/atsaris/imagenet/data/
LOG_DIR=/gpfs/wolf/gen166/proj-shared/atsaris/pytorch_tutorial/ascent/ascent_logs/
source /gpfs/wolf/gen166/proj-shared/atsaris/env/activate.sh
export BENCHY_CONFIG_FILE=/gpfs/wolf/gen166/proj-shared/atsaris/pytorch_tutorial/ascent/benchy-conf.yaml
jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a5 -c42 -g5 -r1 \
--bind=proportional-packed:7 --launch_distribution=packed \
bash -c "\
source export_DDP_envvars.sh && \
python -u ../imagenet/main.py \
--arch resnet50 \
-j 8 \
-p 10 \
-b 128 \
--training-only \
--raport-file ${LOG_DIR}/benchmark.dataload.opt.1N.json \
--epochs 100 \
--no-checkpoints \
--data-backend pytorch \
--amp \
--memory-format nhwc \
--use-benchy \
--noDDP \
${DATA_DIR}
"
exit  # nothing below this line runs; the remaining jsrun variants are kept for reference
jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a5 -c42 -g5 -r1 \
--bind=proportional-packed:7 --launch_distribution=packed \
bash -c "\
......
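Each launch above uses one resource set per node (`-r1`) with five tasks and five GPUs (`-a5 -g5`), and every task sources `export_DDP_envvars.sh`, which is not included in this diff. Presumably that script maps the per-process variables that jsrun/Spectrum MPI export (`OMPI_COMM_WORLD_*`) onto the environment variables `torch.distributed` reads. A minimal Python sketch of that assumed mapping (the rendezvous address and port are placeholders; a real script would derive them from the LSF allocation):

```python
import os
import torch.distributed as dist

# Assumed mapping only -- export_DDP_envvars.sh itself is not shown in this diff.
# jsrun / Spectrum MPI export OMPI_COMM_WORLD_* for each launched process.
os.environ.setdefault("RANK", os.environ.get("OMPI_COMM_WORLD_RANK", "0"))
os.environ.setdefault("WORLD_SIZE", os.environ.get("OMPI_COMM_WORLD_SIZE", "1"))
os.environ.setdefault("LOCAL_RANK", os.environ.get("OMPI_COMM_WORLD_LOCAL_RANK", "0"))
os.environ.setdefault("MASTER_ADDR", "first-host-in-allocation")  # placeholder
os.environ.setdefault("MASTER_PORT", "29500")                     # placeholder

# With init_method="env://", torch.distributed reads the variables set above.
dist.init_process_group(backend="nccl", init_method="env://")
```

Note that the benchy runs here pass `--noDDP`, which suggests they skip DDP wrapping entirely; the env-var plumbing matters for the DDP variants elsewhere in these scripts.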
@@ -14,6 +14,7 @@ DATA_DIR=/gpfs/wolf/gen166/proj-shared/atsaris/imagenet/data/
LOG_DIR=/gpfs/wolf/gen166/proj-shared/atsaris/pytorch_tutorial/ascent/ascent_logs/
source /gpfs/wolf/gen166/proj-shared/atsaris/env/activate.sh
export BENCHY_CONFIG_FILE=/gpfs/wolf/gen166/proj-shared/atsaris/pytorch_tutorial/ascent/benchy-conf-2.yaml
jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a5 -c42 -g5 -r1 \
--bind=proportional-packed:7 --launch_distribution=packed \
@@ -21,6 +22,29 @@ jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a5 -c42 -g5 -r1 \
source export_DDP_envvars.sh && \
python -u ../imagenet/main.py \
--arch resnet50 \
-j 8 \
-p 10 \
-b 128 \
--training-only \
--raport-file ${LOG_DIR}/benchmark.dataload.opt.2N.json \
--epochs 100 \
--no-checkpoints \
--data-backend pytorch \
--amp \
--memory-format nhwc \
--use-benchy \
--noDDP \
${DATA_DIR}
"
exit
jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a5 -c42 -g5 -r1 \
--bind=proportional-packed:7 --launch_distribution=packed \
nsys profile -t cuda,nvtx -o ./baseline_1_%q{OMPI_COMM_WORLD_RANK} \
bash -c "\
source export_DDP_envvars.sh && \
python -u ../imagenet/main.py \
--arch resnet50 \
-j 28 \
-p 10 \
-b 128 \
@@ -36,6 +60,7 @@ jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a5 -c42 -g5 -r1 \
${DATA_DIR}
"
exit
jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a5 -c42 -g5 -r1 \
--bind=proportional-packed:7 --launch_distribution=packed \
bash -c "\
......
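The profiling variant above (placed after the `exit`, so not executed by default) wraps the training command in `nsys profile -t cuda,nvtx`, writing one report per rank via nsys's `%q{OMPI_COMM_WORLD_RANK}` environment-variable substitution. NVTX spans only appear on that timeline where the code emits them; a hedged sketch of annotating a training loop with `torch.cuda.nvtx` (the loop is illustrative, not the one in `main.py`):

```python
import torch

def profile_one_epoch(model, criterion, optimizer, loader):
    """Illustrative loop: NVTX ranges emitted here show up as named spans
    in the Nsight Systems timeline captured with -t cuda,nvtx."""
    for i, (images, target) in enumerate(loader):
        torch.cuda.nvtx.range_push(f"iter_{i}")

        torch.cuda.nvtx.range_push("h2d_copy")
        images = images.cuda(non_blocking=True)
        target = target.cuda(non_blocking=True)
        torch.cuda.nvtx.range_pop()

        torch.cuda.nvtx.range_push("forward_backward")
        loss = criterion(model(images), target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        torch.cuda.nvtx.range_pop()

        torch.cuda.nvtx.range_pop()  # iter
```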
@@ -14,7 +14,30 @@ DATA_DIR=/gpfs/wolf/gen166/proj-shared/atsaris/imagenet/data/
LOG_DIR=/gpfs/wolf/gen166/proj-shared/atsaris/pytorch_tutorial/ascent/ascent_logs/
source /gpfs/wolf/gen166/proj-shared/atsaris/env/activate.sh
export BENCHY_CONFIG_FILE=/gpfs/wolf/gen166/proj-shared/atsaris/pytorch_tutorial/ascent/benchy-conf-3.yaml
jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a5 -c42 -g5 -r1 \
--bind=proportional-packed:7 --launch_distribution=packed \
bash -c "\
source export_DDP_envvars.sh && \
python -u ../imagenet/main.py \
--arch resnet50 \
-j 8 \
-p 10 \
-b 128 \
--training-only \
--raport-file ${LOG_DIR}/benchmark.dataload.opt.3N.json \
--epochs 100 \
--no-checkpoints \
--data-backend pytorch \
--amp \
--memory-format nhwc \
--use-benchy \
--noDDP \
${DATA_DIR}
"
exit
jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a5 -c42 -g5 -r1 \
--bind=proportional-packed:7 --launch_distribution=packed \
bash -c "\
......
@@ -14,7 +14,30 @@ DATA_DIR=/gpfs/wolf/gen166/proj-shared/atsaris/imagenet/data/
LOG_DIR=/gpfs/wolf/gen166/proj-shared/atsaris/pytorch_tutorial/ascent/ascent_logs/
source /gpfs/wolf/gen166/proj-shared/atsaris/env/activate.sh
export BENCHY_CONFIG_FILE=/gpfs/wolf/gen166/proj-shared/atsaris/pytorch_tutorial/ascent/benchy-conf-4.yaml
jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a5 -c42 -g5 -r1 \
--bind=proportional-packed:7 --launch_distribution=packed \
bash -c "\
source export_DDP_envvars.sh && \
python -u ../imagenet/main.py \
--arch resnet50 \
-j 8 \
-p 10 \
-b 128 \
--training-only \
--raport-file ${LOG_DIR}/benchmark.dataload.opt.4N.json \
--epochs 100 \
--no-checkpoints \
--data-backend pytorch \
--amp \
--memory-format nhwc \
--use-benchy \
--noDDP \
${DATA_DIR}
"
exit
jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a5 -c42 -g5 -r1 \
--bind=proportional-packed:7 --launch_distribution=packed \
bash -c "\
@@ -76,3 +99,23 @@ jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a5 -c42 -g5 -r1 \
--noDDP \
${DATA_DIR}
"
jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a5 -c42 -g5 -r1 \
--bind=proportional-packed:7 --launch_distribution=packed \
bash -c "\
source export_DDP_envvars.sh && \
python -u ../imagenet/main.py \
--arch resnet50 \
-j 28 \
-p 10 \
-b 128 \
--training-only \
--raport-file ${LOG_DIR}/benchmark.data.opt.4N.json \
--epochs 1 \
--prof 100 \
--no-checkpoints \
--data-backend pytorch \
--amp \
--memory-format nhwc \
${DATA_DIR}
"
@@ -444,7 +444,7 @@ def get_pytorch_train_loader(
        pin_memory=True,
        collate_fn=partial(fast_collate, memory_format),
        drop_last=True,
-       persistent_workers=True,
+       persistent_workers=False,  # see https://github.com/pytorch/pytorch/issues/48370
    )
return (
......
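The one-line dataloader change flips `persistent_workers` from `True` to `False`, citing the upstream issue linked in the comment. For context, this is the standard `torch.utils.data.DataLoader` option controlling whether worker processes survive across epochs; a self-contained sketch of the relevant call (synthetic data, illustrative sizes):

```python
import torch
from torch.utils.data import DataLoader, TensorDataset

# Synthetic stand-in dataset; shapes and sizes are illustrative only.
dataset = TensorDataset(torch.randn(512, 3, 32, 32), torch.randint(0, 1000, (512,)))

# persistent_workers=True keeps worker processes alive between epochs, saving
# re-fork cost; this commit switches to False, re-creating workers each epoch,
# per the upstream issue referenced in the diff.
loader = DataLoader(
    dataset,
    batch_size=128,
    num_workers=4,
    pin_memory=True,
    drop_last=True,
    persistent_workers=False,
)
```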
@@ -331,7 +331,7 @@ def add_parser_arguments(parser, skip_arch=False):
        required=False,
        help="number of classes",
    )
    parser.add_argument("--use-benchy", action="store_true", help="enable benchy")
def prepare_for_training(args, model_args, model_arch):
@@ -562,6 +562,14 @@ def main(args, model_args, model_arch):
    model_and_loss, optimizer, lr_policy, scaler, train_loader, val_loader, logger, ema, model_ema, train_loader_len, \
        batch_size_multiplier, start_epoch = prepare_for_training(args, model_args, model_arch)
    if args.use_benchy:
        try:
            from benchy.torch import BenchmarkGenericIteratorWrapper
            train_loader = BenchmarkGenericIteratorWrapper(train_loader, args.batch_size)
        except ImportError:
            print("Requested to use benchy but could not find the library. Ignoring...")
    if args.dtLdTime:
        dtLdTime(
            train_loader,
......
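`BenchmarkGenericIteratorWrapper` comes from the external benchy library, and its internals are not part of this diff; the `try`/`except` above just makes the dependency optional. Conceptually, such a wrapper intercepts iteration over the loader and records how long each batch takes to arrive. A hypothetical stand-in showing the wrapping pattern (not benchy's actual implementation):

```python
import time

class TimedIteratorWrapper:
    """Hypothetical stand-in for benchy's BenchmarkGenericIteratorWrapper:
    wraps any iterable and records the wait time for each batch."""

    def __init__(self, loader, batch_size):
        self.loader = loader
        self.batch_size = batch_size
        self.wait_times = []

    def __iter__(self):
        it = iter(self.loader)
        while True:
            start = time.perf_counter()
            try:
                batch = next(it)
            except StopIteration:
                return
            self.wait_times.append(time.perf_counter() - start)
            yield batch

    def __len__(self):
        return len(self.loader)
```

Because `main.py` only swaps `train_loader` for the wrapper, the training loop stays untouched; any drop-in replacement therefore has to preserve the loader's iteration protocol (and probably `__len__`).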