Tsaris, Aristeidis (aris) / pytorch_tutorial · Commits

Commit 4143513c
authored Oct 11, 2021 by Aristeidis Tsaris

adding simpler example

parent 558ffac0
Changes: 4 files
ascent_simple/sub_ips_0N.lsf (new file, mode 100755)
#!/bin/bash
# Begin LSF directives
#BSUB -P gen166
#BSUB -J sc21
#BSUB -o logs/sc21.o%J
#BSUB -W 1:00
#BSUB -nnodes 1
#BSUB -alloc_flags "nvme smt4"
####BSUB -N
# End LSF directives and begin shell commands

nnodes=$(cat ${LSB_DJOB_HOSTFILE} | sort | uniq | grep -v login | grep -v batch | wc -l)
DATA_DIR=/gpfs/wolf/gen166/proj-shared/atsaris/imagenet/data
CODE_DIR=/gpfs/wolf/gen166/proj-shared/atsaris/pytorch_tutorial/imagenet_simple

source /gpfs/wolf/gen166/proj-shared/atsaris/env/activate.sh

echo "Starting benchy"
export BENCHY_CONFIG_FILE=benchy_configs/benchy-conf-0-8w.yaml
jsrun --smpiargs="-disable_gpu_hooks" -n ${nnodes} -a1 -c42 -g1 -r1 \
    --bind=proportional-packed:7 --launch_distribution=packed \
    bash -c "\
    python -u ${CODE_DIR}/example1.py \
        --train-dir ${DATA_DIR}/train \
        --epochs 100 \
        --batch-size 32 \
        --use-benchy"

echo "Starting benchy no DDP"
export BENCHY_CONFIG_FILE=benchy_configs/benchy-conf-0-8w-noddp.yaml
jsrun --smpiargs="-disable_gpu_hooks" -n ${nnodes} -a1 -c42 -g1 -r1 \
    --bind=proportional-packed:7 --launch_distribution=packed \
    bash -c "\
    python -u ${CODE_DIR}/example1.py \
        --train-dir ${DATA_DIR}/train \
        --epochs 100 \
        --batch-size 32 \
        --use-benchy \
        --noddp"
imagenet_simple/example1.py (new file, mode 100644)
from __future__ import print_function

# Python
import os
import sys
import math
import argparse
import subprocess

# Torch
import torch
import torch.backends.cudnn as cudnn
import torch.nn.functional as F
import torch.optim as optim
import torch.nn as nn
from torchvision import datasets, transforms, models

# Torch Dist
import torch.multiprocessing as mp
import torch.distributed as dist
import torch.utils.data.distributed
from torch.nn.parallel import DistributedDataParallel as DDP

# Local
import utils.utils as utils
from benchy.torch import BenchmarkGenericIteratorWrapper

# Training settings
parser = argparse.ArgumentParser(description='PyTorch ImageNet Example',
                                 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--train-dir', default=os.path.expanduser('~/imagenet/train'),
                    help='path to training data')
parser.add_argument('--batch-size', type=int, default=32,
                    help='input batch size for training')
parser.add_argument('--val-batch-size', type=int, default=32,
                    help='input batch size for validation')
parser.add_argument('--epochs', type=int, default=90,
                    help='number of epochs to train')
parser.add_argument("--print-freq", "-p", default=10, type=int, metavar="N",
                    help="print frequency (default: 10)")
parser.add_argument("--prof", type=int, default=-1, metavar="N",
                    help="Run only N iterations")
parser.add_argument('--base-lr', type=float, default=0.0125,
                    help='learning rate for a single GPU')
parser.add_argument('--warmup-epochs', type=float, default=5,
                    help='number of warmup epochs')
parser.add_argument('--momentum', type=float, default=0.9,
                    help='SGD momentum')
parser.add_argument('--wd', type=float, default=0.00005,
                    help='weight decay')
parser.add_argument("--use-benchy", action="store_true",
                    help="enable benchy")
parser.add_argument("--noddp", action="store_true",
                    help="enable noddp")

world_size = int(os.environ['OMPI_COMM_WORLD_SIZE'])
world_rank = int(os.environ['OMPI_COMM_WORLD_RANK'])
local_rank = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK'])
torch.cuda.set_device(local_rank)

ACC_METADATA = {"unit": "%", "format": ":.2f"}
IPS_METADATA = {"unit": "img/s", "format": ":.2f"}
TIME_METADATA = {"unit": "s", "format": ":.5f"}
LOSS_METADATA = {"format": ":.5f"}

get_master = "echo $(cat {} | sort | uniq | grep -v batch | grep -v login | head -1)".format(
    os.environ['LSB_DJOB_HOSTFILE'])
os.environ['MASTER_ADDR'] = str(subprocess.check_output(get_master, shell=True))[2:-3]
os.environ['MASTER_PORT'] = "23456"
os.environ['WORLD_SIZE'] = os.environ['OMPI_COMM_WORLD_SIZE']
os.environ['RANK'] = os.environ['OMPI_COMM_WORLD_RANK']


def train(epoch):
    model.train()
    train_sampler.set_epoch(epoch)

    data_iter = iter(train_loader)
    if args.prof > 0:
        data_iter = utils.first_n(args.prof, data_iter)

    step = 0
    while (data_iter):
        try:
            data, target = next(data_iter)
            data, target = data.cuda(), target.cuda()
        except StopIteration:
            break

        optimizer.zero_grad()
        output = model(data)
        loss = F.cross_entropy(output, target)
        loss.backward()
        optimizer.step()

        if (torch.distributed.get_rank() == 0):
            print("train", epoch, ":", step, "loss", utils.to_python_float(loss))
        step += 1


if __name__ == "__main__":
    args = parser.parse_args()

    torch.cuda.manual_seed(42)
    cudnn.benchmark = True

    dist.init_process_group('nccl', rank=world_rank, world_size=world_size)

    kwargs = {'num_workers': 8, 'pin_memory': True}
    # When supported, use 'forkserver' to spawn dataloader workers instead of 'fork' to prevent
    # issues with Infiniband implementations that are not fork-safe
    if (kwargs.get('num_workers', 0) > 0 and hasattr(mp, '_supports_context') and
            mp._supports_context and 'forkserver' in mp.get_all_start_methods()):
        kwargs['multiprocessing_context'] = 'forkserver'

    train_dataset = \
        datasets.ImageFolder(args.train_dir,
                             transform=transforms.Compose([
                                 transforms.RandomResizedCrop(224),
                                 transforms.RandomHorizontalFlip(),
                                 transforms.ToTensor(),
                                 transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                                      std=[0.229, 0.224, 0.225])
                             ]))
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset, num_replicas=world_size, rank=world_rank)
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, sampler=train_sampler, **kwargs)

    if args.use_benchy:
        train_loader = BenchmarkGenericIteratorWrapper(train_loader, args.batch_size)

    model_r = models.resnet50().cuda()
    if args.noddp:
        model = model_r
    else:
        model = DDP(model_r, device_ids=[local_rank])  #, bucket_cap_mb=1)

    optimizer = optim.SGD(model.parameters(),
                          lr=(args.base_lr * world_size),
                          momentum=args.momentum,
                          weight_decay=args.wd)

    for epoch in range(0, args.epochs):
        train(epoch)
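example1.py does its own rendezvous bookkeeping: it picks the first compute host from the LSF hostfile as MASTER_ADDR, fixes MASTER_PORT, and mirrors the Open MPI rank variables into WORLD_SIZE and RANK before init_process_group. A minimal, self-contained sketch of that hostfile filtering and of what the [2:-3] slice strips (the host names and temporary file below are invented for illustration):

import os
import subprocess
import tempfile

# An LSF hostfile lists one host per allocated slot, including the batch and
# login service nodes that the pipeline below filters out.
hostfile = tempfile.NamedTemporaryFile("w", suffix=".hosts", delete=False)
hostfile.write("batch1\nlogin2\nh41n04\nh41n04\nh41n05\n")
hostfile.close()

# Same pipeline as example1.py: sort, deduplicate, drop service nodes, keep the first host.
get_master = "echo $(cat {} | sort | uniq | grep -v batch | grep -v login | head -1)".format(hostfile.name)

# check_output returns b'h41n04\n'; str() renders it as "b'h41n04\\n'", and the
# [2:-3] slice removes the b'...' wrapper and the trailing newline escape.
master_addr = str(subprocess.check_output(get_master, shell=True))[2:-3]
print(master_addr)  # h41n04
os.unlink(hostfile.name)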
imagenet_simple/utils/__pycache__/utils.cpython-38.pyc (new file, mode 100644)
File added
imagenet_simple/utils/utils.py (new file, mode 100644)
# Copyright (c) 2018-2019, NVIDIA CORPORATION
# Copyright (c) 2017- Facebook, Inc
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# * Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import os
import time
import numpy as np
import torch
import shutil
import torch.distributed as dist


def should_backup_checkpoint(args):
    def _sbc(epoch):
        return args.gather_checkpoints and (epoch < 10 or epoch % 10 == 0)
    return _sbc


def save_checkpoint(
    state,
    is_best,
    filename="checkpoint.pth.tar",
    checkpoint_dir="./",
    backup_filename=None,
):
    if (not torch.distributed.is_initialized()) or torch.distributed.get_rank() == 0:
        filename = os.path.join(checkpoint_dir, filename)
        print("SAVING {}".format(filename))
        torch.save(state, filename)
        if is_best:
            shutil.copyfile(filename, os.path.join(checkpoint_dir, "model_best.pth.tar"))
        if backup_filename is not None:
            shutil.copyfile(filename, os.path.join(checkpoint_dir, backup_filename))


def timed_generator(gen):
    start = time.time()
    for g in gen:
        end = time.time()
        t = end - start
        yield g, t
        start = time.time()


def timed_function(f):
    def _timed_function(*args, **kwargs):
        start = time.time()
        ret = f(*args, **kwargs)
        return ret, time.time() - start
    return _timed_function


def accuracy(output, target, topk=(1,)):
    """Computes the precision@k for the specified values of k"""
    maxk = max(topk)
    batch_size = target.size(0)

    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))

    res = []
    for k in topk:
        correct_k = correct[:k].view(-1).float().sum(0, keepdim=True)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res


def reduce_tensor(tensor):
    rt = tensor.clone()
    dist.all_reduce(rt, op=dist.ReduceOp.SUM)
    rt /= (
        torch.distributed.get_world_size() if torch.distributed.is_initialized() else 1
    )
    return rt


def first_n(n, generator):
    for i, d in zip(range(n), generator):
        yield d


def to_python_float(t):
    if hasattr(t, 'item'):
        return t.item()
    else:
        return t[0]


def calc_ips(batch_size, time):
    world_size = (
        torch.distributed.get_world_size() if torch.distributed.is_initialized() else 1
    )
    tbs = world_size * batch_size
    return tbs / time


def nvtx_range_push(name, enabled):
    if enabled:
        torch.cuda.synchronize()
        torch.cuda.nvtx.range_push(name)


def nvtx_range_pop(enabled):
    if enabled:
        torch.cuda.synchronize()
        torch.cuda.nvtx.range_pop()
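utils.py also ships timing helpers that example1.py does not use yet (it only calls first_n and to_python_float). A small CPU-only sketch of how timed_generator and calc_ips could report per-step throughput, assuming it is run from imagenet_simple/ so the utils package resolves; the fake loader below stands in for the real DataLoader:

import torch
import utils.utils as utils

# Stand-in for a DataLoader: four batches of random "images" and labels.
fake_loader = [(torch.randn(8, 3, 224, 224), torch.randint(0, 1000, (8,)))
               for _ in range(4)]

# timed_generator yields (batch, seconds elapsed since the previous batch was consumed);
# calc_ips scales batch_size / time by the distributed world size (1 here, since no
# process group has been initialized).
for (data, target), step_time in utils.timed_generator(iter(fake_loader)):
    print("images/sec: {:.1f}".format(utils.calc_ips(8, step_time)))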