Commit 5ff714ad authored by Tsaris, Aristeidis
Browse files

adding baseline code

parent 3454fd8a
......@@ -75,7 +75,7 @@ def load_jpeg_from_file(path, cuda=True):
return input
"""
class HybridTrainPipe(Pipeline):
def __init__(
self,
......@@ -220,7 +220,7 @@ class DALIWrapper(object):
return DALIWrapper.gen_wrapper(
self.dalipipeline, self.num_classes, self.one_hot, self.memory_format
)
"""
def get_dali_train_loader(dali_cpu=False):
def gdtl(
......@@ -444,7 +444,7 @@ def get_pytorch_train_loader(
pin_memory=True,
collate_fn=partial(fast_collate, memory_format),
drop_last=True,
persistent_workers=False, # see https://github.com/pytorch/pytorch/issues/48370
persistent_workers=False,
)
return (
......
......@@ -99,7 +99,7 @@ class ModelAndLoss(nn.Module):
if self.noDDP:
self.model = self.model
else:
self.model = DDP(self.model, device_ids=[gpu_id], output_device=gpu_id, bucket_cap_mb=1)
self.model = DDP(self.model, device_ids=[gpu_id])
def load_model_state(self, state):
if not state is None:
......
......@@ -106,7 +106,7 @@ def add_parser_arguments(parser, skip_arch=False):
parser.add_argument(
"-j",
"--workers",
default=5,
default=1,
type=int,
metavar="N",
help="number of data loading workers (default: 5)",
......
......@@ -12,16 +12,15 @@
nnodes=$(cat ${LSB_DJOB_HOSTFILE} | sort | uniq | grep -v login | grep -v batch | wc -l)
DATA_DIR=/gpfs/alpine/world-shared/stf011/junqi/choco_env/dl_code/data/ILSVRC
LOG_DIR=/gpfs/alpine/world-shared/stf011/atsaris/tmp/imagenet
LOG_DIR=summit_logs/
source /gpfs/alpine/world-shared/stf011/atsaris/summit_env/monai/setup.sh
source /gpfs/alpine/stf011/world-shared/atsaris/summit_env/pytorch_1_9_new/activate.sh
jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a6 -c42 -g6 -r1 --bind=proportional-packed:7 --launch_distribution=packed \
jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a1 -c42 -g1 -r1 --bind=proportional-packed:7 --launch_distribution=packed \
bash -c "\
source export_DDP_envvars.sh && \
python -u main.py \
python -u ../imagenet/main.py \
--arch resnet50 \
-j 28 \
-p 10 \
-b 128\
--training-only \
......@@ -29,6 +28,6 @@ jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a6 -c42 -g6 -r1 --bind=propor
--epochs 1 \
--prof 100 \
--no-checkpoints \
--data-backend syntetic \
--data-backend pytorch \
${DATA_DIR}
"
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment