Commit efe645d1 authored by Tsaris, Aristeidis

summit simple code

parent 4143513c
#!/bin/bash
# Begin LSF directives
#BSUB -P gen166
#BSUB -J sc21
#BSUB -o logs/sc21.o%J
#BSUB -W 1:00
#BSUB -nnodes 1
#BSUB -alloc_flags "nvme smt4"
####BSUB -N
# End LSF directives and begin shell commands
nnodes=$(cat ${LSB_DJOB_HOSTFILE} | sort | uniq | grep -v login | grep -v batch | wc -l)
DATA_DIR=/gpfs/wolf/gen166/proj-shared/atsaris/imagenet/data
CODE_DIR=/gpfs/wolf/gen166/proj-shared/atsaris/pytorch_tutorial/imagenet_simple
source /gpfs/wolf/gen166/proj-shared/atsaris/env/activate.sh
echo "Starting bency"
export BENCHY_CONFIG_FILE=benchy_configs/benchy-conf-0-8w.yaml
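# jsrun layout: one resource set per node (-r1) with 1 MPI task (-a1), 42 cores (-c42), and 1 GPU (-g1)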
jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a1 -c42 -g1 -r1 \
--bind=proportional-packed:7 --launch_distribution=packed \
bash -c "\
python -u ${CODE_DIR}/example1.py \
--train-dir ${DATA_DIR}/train \
--epochs 100 \
--batch-size 32 \
--use-benchy
"
echo "Starting bency no DDP"
export BENCHY_CONFIG_FILE=benchy_configs/benchy-conf-0-8w-noddp.yaml
jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a1 -c42 -g1 -r1 \
--bind=proportional-packed:7 --launch_distribution=packed \
bash -c "\
python -u ${CODE_DIR}/example1.py \
--train-dir ${DATA_DIR}/train \
--epochs 100 \
--batch-size 32 \
--use-benchy \
--noddp
"
@@ -6,6 +6,7 @@ import sys
 import math
 import argparse
 import subprocess
+import yaml
 
 # Torch
 import torch
@@ -53,6 +54,14 @@ parser.add_argument('--wd', type=float, default=0.00005,
                     help='weight decay')
 parser.add_argument("--use-benchy", action="store_true", help="enable benchy")
 parser.add_argument("--noddp", action="store_true", help="enable noddp")
+parser.add_argument("--workers", default=5, type=int, metavar="N",
+                    help="number of data loading workers (default: 5)")
+parser.add_argument("--bucketS", default=25, type=int, metavar="N",
+                    help="bucket_cap_mb for DDP (default: 25 MB)")
+parser.add_argument("--benchy-ext", default=None, type=str,
+                    help="extension appended to the benchy output file name")
+parser.add_argument("--benchy-log", default=None, type=str,
+                    help="benchy log output directory")
 
 world_size = int(os.environ['OMPI_COMM_WORLD_SIZE'])
 world_rank = int(os.environ['OMPI_COMM_WORLD_RANK'])
@@ -99,22 +108,29 @@ def train(epoch):
         step+=1
 
+def new_yaml_file(args):
+    with open("./benchy-conf.yaml") as f:
+        y=yaml.safe_load(f)
+    y["global"]["json_prefix"] = "benchy_output_" + args.benchy_ext
+    y["global"]["output_dir"] = args.benchy_log
+    str_file = "/tmp/benchy-conf_%s.yaml"%(args.benchy_ext)
+    with open(str_file, 'w') as f2:
+        yaml.dump(y, f2, default_flow_style=False)
+    os.environ['BENCHY_CONFIG_FILE'] = str_file
+
 if __name__=="__main__":
     args = parser.parse_args()
 
+    if(world_rank==0):
+        new_yaml_file(args)
+
     torch.cuda.manual_seed(42)
     cudnn.benchmark = True
 
     dist.init_process_group('nccl',
                             rank=world_rank, world_size=world_size)
 
-    kwargs = {'num_workers': 8, 'pin_memory': True}
-    # When supported, use 'forkserver' to spawn dataloader workers instead of 'fork' to prevent
-    # issues with Infiniband implementations that are not fork-safe
-    if (kwargs.get('num_workers', 0) > 0 and hasattr(mp, '_supports_context') and
-            mp._supports_context and 'forkserver' in mp.get_all_start_methods()):
-        kwargs['multiprocessing_context'] = 'forkserver'
-
     train_dataset = \
         datasets.ImageFolder(args.train_dir,
                              transform=transforms.Compose([
@@ -129,7 +145,7 @@ if __name__=="__main__":
         train_dataset, num_replicas=world_size, rank=world_rank)
     train_loader = torch.utils.data.DataLoader(
         train_dataset, batch_size=args.batch_size,
-        sampler=train_sampler, **kwargs)
+        sampler=train_sampler, num_workers=args.workers)
 
     if args.use_benchy:
         train_loader = BenchmarkGenericIteratorWrapper(train_loader, args.batch_size)
@@ -140,7 +156,7 @@ if __name__=="__main__":
     if args.noddp:
         model = model_r
     else:
-        model = DDP(model_r, device_ids=[local_rank])#, bucket_cap_mb=1)
+        model = DDP(model_r, device_ids=[local_rank], bucket_cap_mb=args.bucketS)
 
     optimizer = optim.SGD(model.parameters(),
                           lr=(args.base_lr * world_size),
...
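# benchy configuration (field meanings inferred from the names and from new_yaml_file above):
# each enabled benchmark mode (IO-only, synthetic-data, full training) runs ntrials trials of
# nbatches batches after a warmup, writing JSON results named with json_prefix into output_dir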
global:
report_freq: 10
exit_after_tests: True
profiler_mode: 'single'
json_prefix: 'benchy_output'
output_dir: '/output_dir'
use_distributed_barrier: False
IO:
run_benchmark: True
nbatches: 50
ntrials: 3
nwarmup: 1
synthetic:
run_benchmark: True
nbatches: 50
ntrials: 3
nwarmup: 1
full:
run_benchmark: True
nbatches: 50
ntrials: 3
nwarmup: 1
\ No newline at end of file
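# Map the OpenMPI/LSF environment to the variable names used by torch.distributed (env:// initialization)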
export RANK=$OMPI_COMM_WORLD_RANK
export LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
export WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
export MASTER_ADDR=$(cat $LSB_DJOB_HOSTFILE | sort | uniq | grep -v batch | grep -v login | head -1)
export MASTER_PORT=29500 # default from torch launcher
#!/bin/bash
# Begin LSF directives
#BSUB -P stf011
#BSUB -J sc21
#BSUB -o logs/sc21.o%J
#BSUB -W 1:00
#BSUB -nnodes 16
#BSUB -alloc_flags "nvme smt4"
####BSUB -N
# End LSF directives and begin shell commands
nnodes=$(cat ${LSB_DJOB_HOSTFILE} | sort | uniq | grep -v login | grep -v batch | wc -l)
DATA_DIR=/gpfs/alpine/world-shared/stf011/junqi/choco_env/dl_code/data/ILSVRC
CODE_DIR=/gpfs/alpine/stf011/proj-shared/atsaris/sc21/pytorch_tutorial/imagenet_simple
LOG_DIR=/gpfs/alpine/stf011/proj-shared/atsaris/sc21/pytorch_tutorial/summit_simple/summit_logs
source /gpfs/alpine/world-shared/stf011/atsaris/summit_env/monai/setup.sh
echo "Starting bency"
BENCHY_EXT="base.16N.0w"
jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a6 -c42 -g6 -r1 \
--bind=proportional-packed:7 --launch_distribution=packed \
bash -c "\
python -u ${CODE_DIR}/example1.py \
--train-dir ${DATA_DIR}/train \
--epochs 100 \
--batch-size 128 \
--workers 0 \
--use-benchy \
--benchy-ext ${BENCHY_EXT} \
--benchy-log ${LOG_DIR}
"
echo "Starting bency noddp"
BENCHY_EXT="noddp.16N.0w"
jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a6 -c42 -g6 -r1 \
--bind=proportional-packed:7 --launch_distribution=packed \
bash -c "\
python -u ${CODE_DIR}/example1.py \
--train-dir ${DATA_DIR}/train \
--epochs 100 \
--batch-size 128 \
--workers 0 \
--use-benchy \
--benchy-ext ${BENCHY_EXT} \
--benchy-log ${LOG_DIR} \
--noddp
"
echo "Starting bency 100MB"
BENCHY_EXT="100MB.16N.0w"
jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a6 -c42 -g6 -r1 \
--bind=proportional-packed:7 --launch_distribution=packed \
bash -c "\
python -u ${CODE_DIR}/example1.py \
--train-dir ${DATA_DIR}/train \
--epochs 100 \
--batch-size 128 \
--workers 0 \
--use-benchy \
--benchy-ext ${BENCHY_EXT} \
--benchy-log ${LOG_DIR} \
--bucketS 100
"
echo "Starting bency 1MB"
BENCHY_EXT="1MB.16N.0w"
jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a6 -c42 -g6 -r1 \
--bind=proportional-packed:7 --launch_distribution=packed \
bash -c "\
python -u ${CODE_DIR}/example1.py \
--train-dir ${DATA_DIR}/train \
--epochs 100 \
--batch-size 128 \
--workers 0 \
--use-benchy \
--benchy-ext ${BENCHY_EXT} \
--benchy-log ${LOG_DIR} \
--bucketS 1
"
echo "Starting bency bs8"
BENCHY_EXT="base.16N.bs8.0w"
jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a6 -c42 -g6 -r1 \
--bind=proportional-packed:7 --launch_distribution=packed \
bash -c "\
python -u ${CODE_DIR}/example1.py \
--train-dir ${DATA_DIR}/train \
--epochs 100 \
--batch-size 8 \
--workers 0 \
--use-benchy \
--benchy-ext ${BENCHY_EXT} \
--benchy-log ${LOG_DIR}
"
echo "Starting bency noddp bs8"
BENCHY_EXT="noddp.16N.bs8.0w"
jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a6 -c42 -g6 -r1 \
--bind=proportional-packed:7 --launch_distribution=packed \
bash -c "\
python -u ${CODE_DIR}/example1.py \
--train-dir ${DATA_DIR}/train \
--epochs 100 \
--batch-size 8 \
--workers 0 \
--use-benchy \
--benchy-ext ${BENCHY_EXT} \
--benchy-log ${LOG_DIR} \
--noddp
"
#!/bin/bash
# Begin LSF directives
#BSUB -P stf011
#BSUB -J sc21
#BSUB -o logs/sc21.o%J
#BSUB -W 2:00
#BSUB -nnodes 1
#BSUB -alloc_flags "nvme smt4"
####BSUB -N
# End LSF directives and begin shell commands
nnodes=$(cat ${LSB_DJOB_HOSTFILE} | sort | uniq | grep -v login | grep -v batch | wc -l)
DATA_DIR=/gpfs/alpine/world-shared/stf011/junqi/choco_env/dl_code/data/ILSVRC
CODE_DIR=/gpfs/alpine/stf011/proj-shared/atsaris/sc21/pytorch_tutorial/imagenet_simple
LOG_DIR=/gpfs/alpine/stf011/proj-shared/atsaris/sc21/pytorch_tutorial/summit_simple/summit_logs
source /gpfs/alpine/world-shared/stf011/atsaris/summit_env/monai/setup.sh
echo "Starting bency"
BENCHY_EXT="base.1N.0w"
jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a6 -c42 -g6 -r1 \
--bind=proportional-packed:7 --launch_distribution=packed \
bash -c "\
python -u ${CODE_DIR}/example1.py \
--train-dir ${DATA_DIR}/train \
--epochs 100 \
--batch-size 128 \
--workers 0 \
--use-benchy \
--benchy-ext ${BENCHY_EXT} \
--benchy-log ${LOG_DIR}
"
echo "Starting bency noddp"
BENCHY_EXT="noddp.1N.0w"
jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a6 -c42 -g6 -r1 \
--bind=proportional-packed:7 --launch_distribution=packed \
bash -c "\
python -u ${CODE_DIR}/example1.py \
--train-dir ${DATA_DIR}/train \
--epochs 100 \
--batch-size 128 \
--workers 0 \
--use-benchy \
--benchy-ext ${BENCHY_EXT} \
--benchy-log ${LOG_DIR} \
--noddp
"
echo "Starting bency 100MB"
BENCHY_EXT="100MB.1N.0w"
jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a6 -c42 -g6 -r1 \
--bind=proportional-packed:7 --launch_distribution=packed \
bash -c "\
python -u ${CODE_DIR}/example1.py \
--train-dir ${DATA_DIR}/train \
--epochs 100 \
--batch-size 128 \
--workers 0 \
--use-benchy \
--benchy-ext ${BENCHY_EXT} \
--benchy-log ${LOG_DIR} \
--bucketS 100
"
echo "Starting bency 1MB"
BENCHY_EXT="1MB.1N.0w"
jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a6 -c42 -g6 -r1 \
--bind=proportional-packed:7 --launch_distribution=packed \
bash -c "\
python -u ${CODE_DIR}/example1.py \
--train-dir ${DATA_DIR}/train \
--epochs 100 \
--batch-size 128 \
--workers 0 \
--use-benchy \
--benchy-ext ${BENCHY_EXT} \
--benchy-log ${LOG_DIR} \
--bucketS 1
"
#!/bin/bash
# Begin LSF directives
#BSUB -P stf011
#BSUB -J sc21
#BSUB -o logs/sc21.o%J
#BSUB -W 1:00
#BSUB -nnodes 2
#BSUB -alloc_flags "nvme smt4"
####BSUB -N
# End LSF directives and begin shell commands
nnodes=$(cat ${LSB_DJOB_HOSTFILE} | sort | uniq | grep -v login | grep -v batch | wc -l)
DATA_DIR=/gpfs/alpine/world-shared/stf011/junqi/choco_env/dl_code/data/ILSVRC
CODE_DIR=/gpfs/alpine/stf011/proj-shared/atsaris/sc21/pytorch_tutorial/imagenet_simple
LOG_DIR=/gpfs/alpine/stf011/proj-shared/atsaris/sc21/pytorch_tutorial/summit_simple/summit_logs
source /gpfs/alpine/world-shared/stf011/atsaris/summit_env/monai/setup.sh
echo "Starting bency"
BENCHY_EXT="base.2N.0w"
jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a6 -c42 -g6 -r1 \
--bind=proportional-packed:7 --launch_distribution=packed \
bash -c "\
python -u ${CODE_DIR}/example1.py \
--train-dir ${DATA_DIR}/train \
--epochs 100 \
--batch-size 128 \
--workers 0 \
--use-benchy \
--benchy-ext ${BENCHY_EXT} \
--benchy-log ${LOG_DIR}
"
echo "Starting bency noddp"
BENCHY_EXT="noddp.2N.0w"
jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a6 -c42 -g6 -r1 \
--bind=proportional-packed:7 --launch_distribution=packed \
bash -c "\
python -u ${CODE_DIR}/example1.py \
--train-dir ${DATA_DIR}/train \
--epochs 100 \
--batch-size 128 \
--workers 0 \
--use-benchy \
--benchy-ext ${BENCHY_EXT} \
--benchy-log ${LOG_DIR} \
--noddp
"
echo "Starting bency 100MB"
BENCHY_EXT="100MB.2N.0w"
jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a6 -c42 -g6 -r1 \
--bind=proportional-packed:7 --launch_distribution=packed \
bash -c "\
python -u ${CODE_DIR}/example1.py \
--train-dir ${DATA_DIR}/train \
--epochs 100 \
--batch-size 128 \
--workers 0 \
--use-benchy \
--benchy-ext ${BENCHY_EXT} \
--benchy-log ${LOG_DIR} \
--bucketS 100
"
echo "Starting bency 1MB"
BENCHY_EXT="1MB.2N.0w"
jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a6 -c42 -g6 -r1 \
--bind=proportional-packed:7 --launch_distribution=packed \
bash -c "\
python -u ${CODE_DIR}/example1.py \
--train-dir ${DATA_DIR}/train \
--epochs 100 \
--batch-size 128 \
--workers 0 \
--use-benchy \
--benchy-ext ${BENCHY_EXT} \
--benchy-log ${LOG_DIR} \
--bucketS 1
"
echo "Starting bency bs64"
BENCHY_EXT="base.2N.bs64.0w"
jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a6 -c42 -g6 -r1 \
--bind=proportional-packed:7 --launch_distribution=packed \
bash -c "\
python -u ${CODE_DIR}/example1.py \
--train-dir ${DATA_DIR}/train \
--epochs 100 \
--batch-size 64 \
--workers 0 \
--use-benchy \
--benchy-ext ${BENCHY_EXT} \
--benchy-log ${LOG_DIR}
"
echo "Starting bency noddp bs64"
BENCHY_EXT="noddp.2N.bs64.0w"
jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a6 -c42 -g6 -r1 \
--bind=proportional-packed:7 --launch_distribution=packed \
bash -c "\
python -u ${CODE_DIR}/example1.py \
--train-dir ${DATA_DIR}/train \
--epochs 100 \
--batch-size 64 \
--workers 0 \
--use-benchy \
--benchy-ext ${BENCHY_EXT} \
--benchy-log ${LOG_DIR} \
--noddp
"
#!/bin/bash
# Begin LSF directives
#BSUB -P stf011
#BSUB -J sc21
#BSUB -o logs/sc21.o%J
#BSUB -W 1:00
#BSUB -nnodes 32
#BSUB -alloc_flags "nvme smt4"
####BSUB -N
# End LSF directives and begin shell commands
nnodes=$(cat ${LSB_DJOB_HOSTFILE} | sort | uniq | grep -v login | grep -v batch | wc -l)
DATA_DIR=/gpfs/alpine/world-shared/stf011/junqi/choco_env/dl_code/data/ILSVRC
CODE_DIR=/gpfs/alpine/stf011/proj-shared/atsaris/sc21/pytorch_tutorial/imagenet_simple
LOG_DIR=/gpfs/alpine/stf011/proj-shared/atsaris/sc21/pytorch_tutorial/summit_simple/summit_logs
source /gpfs/alpine/world-shared/stf011/atsaris/summit_env/monai/setup.sh
echo "Starting bency"
BENCHY_EXT="base.32N.0w"
jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a6 -c42 -g6 -r1 \
--bind=proportional-packed:7 --launch_distribution=packed \
bash -c "\
python -u ${CODE_DIR}/example1.py \
--train-dir ${DATA_DIR}/train \
--epochs 100 \
--batch-size 128 \
--workers 0 \
--use-benchy \
--benchy-ext ${BENCHY_EXT} \
--benchy-log ${LOG_DIR}
"
echo "Starting bency noddp"
BENCHY_EXT="noddp.32N.0w"
jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a6 -c42 -g6 -r1 \
--bind=proportional-packed:7 --launch_distribution=packed \
bash -c "\
python -u ${CODE_DIR}/example1.py \
--train-dir ${DATA_DIR}/train \
--epochs 100 \
--batch-size 128 \
--workers 0 \
--use-benchy \
--benchy-ext ${BENCHY_EXT} \
--benchy-log ${LOG_DIR} \
--noddp
"
echo "Starting bency 100MB"
BENCHY_EXT="100MB.32N.0w"
jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a6 -c42 -g6 -r1 \
--bind=proportional-packed:7 --launch_distribution=packed \
bash -c "\
python -u ${CODE_DIR}/example1.py \
--train-dir ${DATA_DIR}/train \
--epochs 100 \
--batch-size 128 \
--workers 0 \
--use-benchy \
--benchy-ext ${BENCHY_EXT} \
--benchy-log ${LOG_DIR} \
--bucketS 100
"
echo "Starting bency 1MB"
BENCHY_EXT="1MB.32N.0w"
jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a6 -c42 -g6 -r1 \
--bind=proportional-packed:7 --launch_distribution=packed \
bash -c "\
python -u ${CODE_DIR}/example1.py \
--train-dir ${DATA_DIR}/train \
--epochs 100 \
--batch-size 128 \
--workers 0 \
--use-benchy \
--benchy-ext ${BENCHY_EXT} \
--benchy-log ${LOG_DIR} \
--bucketS 1
"
echo "Starting bency bs4"
BENCHY_EXT="base.32N.bs4.0w"
jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a6 -c42 -g6 -r1 \
--bind=proportional-packed:7 --launch_distribution=packed \
bash -c "\
python -u ${CODE_DIR}/example1.py \
--train-dir ${DATA_DIR}/train \
--epochs 100 \
--batch-size 4 \
--workers 0 \
--use-benchy \
--benchy-ext ${BENCHY_EXT} \
--benchy-log ${LOG_DIR}
"
echo "Starting bency noddp bs4"
BENCHY_EXT="noddp.32N.bs4.0w"
jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a6 -c42 -g6 -r1 \
--bind=proportional-packed:7 --launch_distribution=packed \
bash -c "\
python -u ${CODE_DIR}/example1.py \
--train-dir ${DATA_DIR}/train \
--epochs 100 \
--batch-size 4 \
--workers 0 \
--use-benchy \
--benchy-ext ${BENCHY_EXT} \
--benchy-log ${LOG_DIR} \
--noddp
"
#!/bin/bash
# Begin LSF directives
#BSUB -P stf011
#BSUB -J sc21
#BSUB -o logs/sc21.o%J
#BSUB -W 1:00
#BSUB -nnodes 4
#BSUB -alloc_flags "nvme smt4"
####BSUB -N
# End LSF directives and begin shell commands
nnodes=$(cat ${LSB_DJOB_HOSTFILE} | sort | uniq | grep -v login | grep -v batch | wc -l)
DATA_DIR=/gpfs/alpine/world-shared/stf011/junqi/choco_env/dl_code/data/ILSVRC
CODE_DIR=/gpfs/alpine/stf011/proj-shared/atsaris/sc21/pytorch_tutorial/imagenet_simple