Commit 001e4f69 authored by Aristeidis Tsaris

adding another notebook

parent 5a104e99
@@ -14,7 +14,30 @@ DATA_DIR=/gpfs/wolf/gen166/proj-shared/atsaris/imagenet/data/
LOG_DIR=/gpfs/wolf/gen166/proj-shared/atsaris/pytorch_tutorial/ascent/ascent_logs/
source /gpfs/wolf/gen166/proj-shared/atsaris/env/activate.sh
export BENCHY_CONFIG_FILE=/gpfs/wolf/gen166/proj-shared/atsaris/pytorch_tutorial/ascent/benchy-conf.yaml
jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a5 -c42 -g5 -r1 \
--bind=proportional-packed:7 --launch_distribution=packed \
bash -c "\
source export_DDP_envvars.sh && \
python -u ../imagenet/main.py \
--arch resnet50 \
-j 8 \
-p 10 \
-b 128 \
--training-only \
--raport-file ${LOG_DIR}/benchmark.dataload.opt.1N.json \
--epochs 100 \
--no-checkpoints \
--data-backend pytorch \
--amp \
--memory-format nhwc \
--use-benchy \
--noDDP \
${DATA_DIR}
"
exit
jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a5 -c42 -g5 -r1 \
--bind=proportional-packed:7 --launch_distribution=packed \
bash -c "\
......
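For context, the launch scripts above rely on export_DDP_envvars.sh (not shown in this diff) to translate the jsrun/Open MPI environment into the variables PyTorch's env:// initialization expects. The sketch below is only an assumption about how those variables are typically consumed on the Python side; the helper name init_distributed and the exact variable names are illustrative, not code from main.py.

import os
import torch
import torch.distributed as dist

def init_distributed():
    # Assumes export_DDP_envvars.sh exported RANK, WORLD_SIZE and LOCAL_RANK
    # (e.g. derived from OMPI_COMM_WORLD_RANK and friends) before Python started.
    rank = int(os.environ.get("RANK", "0"))
    world_size = int(os.environ.get("WORLD_SIZE", "1"))
    local_rank = int(os.environ.get("LOCAL_RANK", "0"))
    torch.cuda.set_device(local_rank)
    if world_size > 1:
        # NCCL backend over the env:// rendezvous; MASTER_ADDR/MASTER_PORT are
        # also expected to be set by the launch script.
        dist.init_process_group(backend="nccl", init_method="env://",
                                rank=rank, world_size=world_size)
    return rank, world_size, local_rank

The benchy runs above pass --noDDP, so they skip this path; only the launches that leave DDP enabled would exercise it.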
@@ -14,6 +14,7 @@ DATA_DIR=/gpfs/wolf/gen166/proj-shared/atsaris/imagenet/data/
LOG_DIR=/gpfs/wolf/gen166/proj-shared/atsaris/pytorch_tutorial/ascent/ascent_logs/
source /gpfs/wolf/gen166/proj-shared/atsaris/env/activate.sh
export BENCHY_CONFIG_FILE=/gpfs/wolf/gen166/proj-shared/atsaris/pytorch_tutorial/ascent/benchy-conf-2.yaml
jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a5 -c42 -g5 -r1 \
--bind=proportional-packed:7 --launch_distribution=packed \
@@ -21,6 +22,29 @@ jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a5 -c42 -g5 -r1 \
source export_DDP_envvars.sh && \
python -u ../imagenet/main.py \
--arch resnet50 \
-j 8 \
-p 10 \
-b 128 \
--training-only \
--raport-file ${LOG_DIR}/benchmark.dataload.opt.2N.json \
--epochs 100 \
--no-checkpoints \
--data-backend pytorch \
--amp \
--memory-format nhwc \
--use-benchy \
--noDDP \
${DATA_DIR}
"
exit
jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a5 -c42 -g5 -r1 \
--bind=proportional-packed:7 --launch_distribution=packed \
nsys profile -t cuda,nvtx -o ./baseline_1_%q{OMPI_COMM_WORLD_RANK} \
bash -c "\
source export_DDP_envvars.sh && \
python -u ../imagenet/main.py \
--arch resnet50 \
-j 28 \
-p 10 \
-b 128 \
@@ -36,6 +60,7 @@ jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a5 -c42 -g5 -r1 \
${DATA_DIR}
"
exit
jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a5 -c42 -g5 -r1 \
--bind=proportional-packed:7 --launch_distribution=packed \
bash -c "\
......
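The nsys launch above wraps the training command in Nsight Systems (nsys profile -t cuda,nvtx), writing one report per rank via the %q{OMPI_COMM_WORLD_RANK} substitution. Below is a minimal sketch of the kind of NVTX annotation such a trace picks up; the range names and the training_step helper are illustrative, not taken from main.py.

import torch

def training_step(model, criterion, optimizer, images, target):
    # NVTX push/pop pairs appear as named regions on the Nsight Systems timeline,
    # making it easy to separate forward from backward time.
    torch.cuda.nvtx.range_push("forward")
    output = model(images)
    loss = criterion(output, target)
    torch.cuda.nvtx.range_pop()

    torch.cuda.nvtx.range_push("backward")
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    torch.cuda.nvtx.range_pop()
    return loss.item()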
@@ -14,7 +14,30 @@ DATA_DIR=/gpfs/wolf/gen166/proj-shared/atsaris/imagenet/data/
LOG_DIR=/gpfs/wolf/gen166/proj-shared/atsaris/pytorch_tutorial/ascent/ascent_logs/
source /gpfs/wolf/gen166/proj-shared/atsaris/env/activate.sh
export BENCHY_CONFIG_FILE=/gpfs/wolf/gen166/proj-shared/atsaris/pytorch_tutorial/ascent/benchy-conf-3.yaml
jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a5 -c42 -g5 -r1 \
--bind=proportional-packed:7 --launch_distribution=packed \
bash -c "\
source export_DDP_envvars.sh && \
python -u ../imagenet/main.py \
--arch resnet50 \
-j 8 \
-p 10 \
-b 128 \
--training-only \
--raport-file ${LOG_DIR}/benchmark.dataload.opt.3N.json \
--epochs 100 \
--no-checkpoints \
--data-backend pytorch \
--amp \
--memory-format nhwc \
--use-benchy \
--noDDP \
${DATA_DIR}
"
exit
jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a5 -c42 -g5 -r1 \
--bind=proportional-packed:7 --launch_distribution=packed \
bash -c "\
......
@@ -14,7 +14,30 @@ DATA_DIR=/gpfs/wolf/gen166/proj-shared/atsaris/imagenet/data/
LOG_DIR=/gpfs/wolf/gen166/proj-shared/atsaris/pytorch_tutorial/ascent/ascent_logs/
source /gpfs/wolf/gen166/proj-shared/atsaris/env/activate.sh
export BENCHY_CONFIG_FILE=/gpfs/wolf/gen166/proj-shared/atsaris/pytorch_tutorial/ascent/benchy-conf-4.yaml
jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a5 -c42 -g5 -r1 \
--bind=proportional-packed:7 --launch_distribution=packed \
bash -c "\
source export_DDP_envvars.sh && \
python -u ../imagenet/main.py \
--arch resnet50 \
-j 8 \
-p 10 \
-b 128 \
--training-only \
--raport-file ${LOG_DIR}/benchmark.dataload.opt.4N.json \
--epochs 100 \
--no-checkpoints \
--data-backend pytorch \
--amp \
--memory-format nhwc \
--use-benchy \
--noDDP \
${DATA_DIR}
"
exit
jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a5 -c42 -g5 -r1 \
--bind=proportional-packed:7 --launch_distribution=packed \
bash -c "\
@@ -76,3 +99,23 @@ jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a5 -c42 -g5 -r1 \
--noDDP \
${DATA_DIR}
"
jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a5 -c42 -g5 -r1 \
--bind=proportional-packed:7 --launch_distribution=packed \
bash -c "\
source export_DDP_envvars.sh && \
python -u ../imagenet/main.py \
--arch resnet50 \
-j 28 \
-p 10 \
-b 128 \
--training-only \
--raport-file ${LOG_DIR}/benchmark.data.opt.4N.json \
--epochs 1 \
--prof 100 \
--no-checkpoints \
--data-backend pytorch \
--amp \
--memory-format nhwc \
${DATA_DIR}
"
@@ -444,7 +444,7 @@ def get_pytorch_train_loader(
        pin_memory=True,
        collate_fn=partial(fast_collate, memory_format),
        drop_last=True,
        persistent_workers=True,
        persistent_workers=False,  # see https://github.com/pytorch/pytorch/issues/48370
    )
    return (
......
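For reference, here is a self-contained sketch of the loader construction that the hunk above modifies; only the keyword arguments mirror the diff, while the make_train_loader helper and its parameters are stand-ins for the surrounding code in get_pytorch_train_loader. The change disables persistent_workers to work around the linked PyTorch issue while keeping pinned memory and drop_last.

from torch.utils.data import DataLoader

def make_train_loader(dataset, sampler, batch_size, workers, collate_fn):
    # collate_fn corresponds to partial(fast_collate, memory_format) in the repo.
    return DataLoader(
        dataset,
        batch_size=batch_size,
        sampler=sampler,
        num_workers=workers,
        pin_memory=True,
        collate_fn=collate_fn,
        drop_last=True,
        # Workers are torn down between epochs; see
        # https://github.com/pytorch/pytorch/issues/48370 for why persistent
        # workers are avoided here.
        persistent_workers=False,
    )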
@@ -331,7 +331,7 @@ def add_parser_arguments(parser, skip_arch=False):
        required=False,
        help="number of classes"
    )
    parser.add_argument("--use-benchy", action="store_true", help="enable benchy")


def prepare_for_training(args, model_args, model_arch):
@@ -562,6 +562,14 @@ def main(args, model_args, model_arch):
    model_and_loss, optimizer, lr_policy, scaler, train_loader, val_loader, logger, ema, model_ema, train_loader_len, \
        batch_size_multiplier, start_epoch = prepare_for_training(args, model_args, model_arch)

    if args.use_benchy:
        try:
            from benchy.torch import BenchmarkGenericIteratorWrapper
            train_loader = BenchmarkGenericIteratorWrapper(train_loader, args.batch_size)
        except ImportError:
            print("Requested to use benchy but could not find the library. Ignoring...")

    if args.dtLdTime:
        dtLdTime(
            train_loader,
......
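To summarize how the new --use-benchy flag and the guarded import above fit together, here is a compact sketch; it assumes BenchmarkGenericIteratorWrapper is a drop-in iterable yielding the same batches as the wrapped loader, and the maybe_wrap_with_benchy helper is illustrative rather than code from main.py.

def maybe_wrap_with_benchy(train_loader, batch_size, use_benchy):
    # Mirrors the guarded import in the hunk above: fall back to the plain
    # loader when the benchy package is not installed.
    if not use_benchy:
        return train_loader
    try:
        from benchy.torch import BenchmarkGenericIteratorWrapper
    except ImportError:
        print("Requested to use benchy but could not find the library. Ignoring...")
        return train_loader
    return BenchmarkGenericIteratorWrapper(train_loader, batch_size)

# The training loop then iterates over the (possibly wrapped) loader exactly as
# it would over the original DataLoader.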