Commit 4143513c authored by Aristeidis Tsaris's avatar Aristeidis Tsaris

adding simpler example

parent 558ffac0
#!/bin/bash
# Begin LSF directives
#BSUB -P gen166
#BSUB -J sc21
#BSUB -o logs/sc21.o%J
#BSUB -W 1:00
#BSUB -nnodes 1
#BSUB -alloc_flags "nvme smt4"
####BSUB -N
# End LSF directives and begin shell commands
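# This script runs example1.py on every allocated compute node (one rank, one GPU
# per node), twice: first with DistributedDataParallel enabled, then with --noddp,
# both wrapped with benchy to measure training throughput.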
nnodes=$(cat ${LSB_DJOB_HOSTFILE} | sort | uniq | grep -v login | grep -v batch | wc -l)
DATA_DIR=/gpfs/wolf/gen166/proj-shared/atsaris/imagenet/data
CODE_DIR=/gpfs/wolf/gen166/proj-shared/atsaris/pytorch_tutorial/imagenet_simple
source /gpfs/wolf/gen166/proj-shared/atsaris/env/activate.sh
echo "Starting bency"
export BENCHY_CONFIG_FILE=benchy_configs/benchy-conf-0-8w.yaml
jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a1 -c42 -g1 -r1 \
--bind=proportional-packed:7 --launch_distribution=packed \
bash -c "\
python -u ${CODE_DIR}/example1.py \
--train-dir ${DATA_DIR}/train \
--epochs 100 \
--batch-size 32 \
--use-benchy
"
echo "Starting bency no DDP"
export BENCHY_CONFIG_FILE=benchy_configs/benchy-conf-0-8w-noddp.yaml
jsrun --smpiargs="-disable_gpu_hooks" -n${nnodes} -a1 -c42 -g1 -r1 \
--bind=proportional-packed:7 --launch_distribution=packed \
bash -c "\
python -u ${CODE_DIR}/example1.py \
--train-dir ${DATA_DIR}/train \
--epochs 100 \
--batch-size 32 \
--use-benchy \
--noddp
"
from __future__ import print_function
# Python
import os
import sys
import math
import argparse
import subprocess
# Torch
import torch
import torch.backends.cudnn as cudnn
import torch.nn.functional as F
import torch.optim as optim
import torch.nn as nn
from torchvision import datasets, transforms, models
# Torch Dist
import torch.multiprocessing as mp
import torch.distributed as dist
import torch.utils.data.distributed
from torch.nn.parallel import DistributedDataParallel as DDP
# Local
import utils.utils as utils
from benchy.torch import BenchmarkGenericIteratorWrapper
# Training settings
parser = argparse.ArgumentParser(description='PyTorch ImageNet Example',
                                 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--train-dir', default=os.path.expanduser('~/imagenet/train'),
                    help='path to training data')
parser.add_argument('--batch-size', type=int, default=32,
                    help='input batch size for training')
parser.add_argument('--val-batch-size', type=int, default=32,
                    help='input batch size for validation')
parser.add_argument('--epochs', type=int, default=90,
                    help='number of epochs to train')
parser.add_argument("--print-freq", "-p", default=10,
                    type=int, metavar="N",
                    help="print frequency (default: 10)")
parser.add_argument("--prof", type=int, default=-1, metavar="N",
                    help="Run only N iterations")
parser.add_argument('--base-lr', type=float, default=0.0125,
                    help='learning rate for a single GPU')
parser.add_argument('--warmup-epochs', type=float, default=5,
                    help='number of warmup epochs')
parser.add_argument('--momentum', type=float, default=0.9,
                    help='SGD momentum')
parser.add_argument('--wd', type=float, default=0.00005,
                    help='weight decay')
parser.add_argument("--use-benchy", action="store_true", help="enable benchy")
parser.add_argument("--noddp", action="store_true", help="run without DistributedDataParallel")
world_size = int(os.environ['OMPI_COMM_WORLD_SIZE'])
world_rank = int(os.environ['OMPI_COMM_WORLD_RANK'])
local_rank = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK'])
torch.cuda.set_device(local_rank)
ACC_METADATA = {"unit": "%", "format": ":.2f"}
IPS_METADATA = {"unit": "img/s", "format": ":.2f"}
TIME_METADATA = {"unit": "s", "format": ":.5f"}
LOSS_METADATA = {"format": ":.5f"}
get_master = "echo $(cat {} | sort | uniq | grep -v batch | grep -v login | head -1)".format(os.environ['LSB_DJOB_HOSTFILE'])
os.environ['MASTER_ADDR'] = str(subprocess.check_output(get_master, shell=True))[2:-3]
os.environ['MASTER_PORT'] = "23456"
os.environ['WORLD_SIZE'] = os.environ['OMPI_COMM_WORLD_SIZE']
os.environ['RANK'] = os.environ['OMPI_COMM_WORLD_RANK']
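# torch.distributed's default env:// rendezvous reads MASTER_ADDR and MASTER_PORT;
# here the master address is the first compute host in the LSF hostfile, and
# WORLD_SIZE/RANK are mirrored from the OMPI_COMM_WORLD_* variables that jsrun
# exports (rank and world_size are also passed explicitly to init_process_group).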
def train(epoch):
    model.train()
    # Reshuffle the distributed sampler so every epoch uses a different ordering
    train_sampler.set_epoch(epoch)
    data_iter = iter(train_loader)
    if args.prof > 0:
        data_iter = utils.first_n(args.prof, data_iter)
    step = 0
    while True:
        try:
            data, target = next(data_iter)
            data, target = data.cuda(), target.cuda()
        except StopIteration:
            break
        optimizer.zero_grad()
        output = model(data)
        loss = F.cross_entropy(output, target)
        loss.backward()
        optimizer.step()
        if torch.distributed.get_rank() == 0:
            print("train", epoch, ":", step, "loss", utils.to_python_float(loss))
        step += 1
if __name__=="__main__":
args = parser.parse_args()
torch.cuda.manual_seed(42)
cudnn.benchmark = True
dist.init_process_group('nccl',
rank=world_rank, world_size=world_size)
kwargs = {'num_workers': 8, 'pin_memory': True}
# When supported, use 'forkserver' to spawn dataloader workers instead of 'fork' to prevent
# issues with Infiniband implementations that are not fork-safe
if (kwargs.get('num_workers', 0) > 0 and hasattr(mp, '_supports_context') and
mp._supports_context and 'forkserver' in mp.get_all_start_methods()):
kwargs['multiprocessing_context'] = 'forkserver'
train_dataset = \
datasets.ImageFolder(args.train_dir,
transform=transforms.Compose([
transforms.RandomResizedCrop(224),
transforms.RandomHorizontalFlip(),
transforms.ToTensor(),
transforms.Normalize(mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225])
]))
train_sampler = torch.utils.data.distributed.DistributedSampler(
train_dataset, num_replicas=world_size, rank=world_rank)
train_loader = torch.utils.data.DataLoader(
train_dataset, batch_size=args.batch_size,
sampler=train_sampler, **kwargs)
if args.use_benchy:
train_loader = BenchmarkGenericIteratorWrapper(train_loader, args.batch_size)
model_r = models.resnet50().cuda()
if args.noddp:
model = model_r
else:
model = DDP(model_r, device_ids=[local_rank])#, bucket_cap_mb=1)
optimizer = optim.SGD(model.parameters(),
lr=(args.base_lr * world_size),
momentum=args.momentum, weight_decay=args.wd)
for epoch in range(0, args.epochs):
train(epoch)
# Copyright (c) 2018-2019, NVIDIA CORPORATION
# Copyright (c) 2017- Facebook, Inc
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# * Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import os
import time
import numpy as np
import torch
import shutil
import torch.distributed as dist
def should_backup_checkpoint(args):
    def _sbc(epoch):
        return args.gather_checkpoints and (epoch < 10 or epoch % 10 == 0)
    return _sbc

def save_checkpoint(
    state,
    is_best,
    filename="checkpoint.pth.tar",
    checkpoint_dir="./",
    backup_filename=None,
):
    if (not torch.distributed.is_initialized()) or torch.distributed.get_rank() == 0:
        filename = os.path.join(checkpoint_dir, filename)
        print("SAVING {}".format(filename))
        torch.save(state, filename)
        if is_best:
            shutil.copyfile(
                filename, os.path.join(checkpoint_dir, "model_best.pth.tar")
            )
        if backup_filename is not None:
            shutil.copyfile(filename, os.path.join(checkpoint_dir, backup_filename))

def timed_generator(gen):
    start = time.time()
    for g in gen:
        end = time.time()
        t = end - start
        yield g, t
        start = time.time()

def timed_function(f):
    def _timed_function(*args, **kwargs):
        start = time.time()
        ret = f(*args, **kwargs)
        return ret, time.time() - start
    return _timed_function
def accuracy(output, target, topk=(1,)):
    """Computes the precision@k for the specified values of k"""
    maxk = max(topk)
    batch_size = target.size(0)
    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))
    res = []
    for k in topk:
        # reshape() rather than view(): the sliced tensor is not contiguous
        correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res
def reduce_tensor(tensor):
    rt = tensor.clone()
    dist.all_reduce(rt, op=dist.ReduceOp.SUM)
    rt /= (
        torch.distributed.get_world_size() if torch.distributed.is_initialized() else 1
    )
    return rt

def first_n(n, generator):
    for i, d in zip(range(n), generator):
        yield d

def to_python_float(t):
    if hasattr(t, 'item'):
        return t.item()
    else:
        return t[0]

def calc_ips(batch_size, time):
    world_size = (
        torch.distributed.get_world_size() if torch.distributed.is_initialized() else 1
    )
    tbs = world_size * batch_size
    return tbs / time

def nvtx_range_push(name, enabled):
    if enabled:
        torch.cuda.synchronize()
        torch.cuda.nvtx.range_push(name)

def nvtx_range_pop(enabled):
    if enabled:
        torch.cuda.synchronize()
        torch.cuda.nvtx.range_pop()
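
# Usage sketch (not part of the original utils): timing batches from a DataLoader
# and reporting aggregate throughput with the helpers above, assuming a loader
# named `train_loader` exists:
#
#   for (data, target), t in timed_generator(train_loader):
#       print("{:.2f} img/s".format(calc_ips(data.size(0), t)))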