Tsaris, Aristeidis (aris) / pytorch_tutorial / Commits

Commit efe645d1, authored Oct 12, 2021 by Tsaris, Aristeidis
Parent: 4143513c

    summit simple code

Changes: 12 files
ascent_simple/sub_ips_0N.lsf  (deleted, 100755 → 0)

#!/bin/bash
# Begin LSF directives
#BSUB -P gen166
#BSUB -J sc21
#BSUB -o logs/sc21.o%J
#BSUB -W 1:00
#BSUB -nnodes 1
#BSUB -alloc_flags "nvme smt4"
####BSUB -N
# End LSF directives and begin shell commands
nnodes=$(cat ${LSB_DJOB_HOSTFILE} | sort | uniq | grep -v login | grep -v batch | wc -l)
DATA_DIR=/gpfs/wolf/gen166/proj-shared/atsaris/imagenet/data
CODE_DIR=/gpfs/wolf/gen166/proj-shared/atsaris/pytorch_tutorial/imagenet_simple
source /gpfs/wolf/gen166/proj-shared/atsaris/env/activate.sh

echo "Starting bency"
export BENCHY_CONFIG_FILE=benchy_configs/benchy-conf-0-8w.yaml
jsrun --smpiargs="-disable_gpu_hooks" -n ${nnodes} -a1 -c42 -g1 -r1 \
    --bind=proportional-packed:7 --launch_distribution=packed \
    bash -c "\
    python -u ${CODE_DIR}/example1.py --train-dir ${DATA_DIR}/train \
        --epochs 100 --batch-size 32 --use-benchy"

echo "Starting bency no DDP"
export BENCHY_CONFIG_FILE=benchy_configs/benchy-conf-0-8w-noddp.yaml
jsrun --smpiargs="-disable_gpu_hooks" -n ${nnodes} -a1 -c42 -g1 -r1 \
    --bind=proportional-packed:7 --launch_distribution=packed \
    bash -c "\
    python -u ${CODE_DIR}/example1.py --train-dir ${DATA_DIR}/train \
        --epochs 100 --batch-size 32 --use-benchy --noddp"
imagenet_simple/example1.py

@@ -6,6 +6,7 @@ import sys
import math
import argparse
import subprocess
import yaml
# Torch
import torch

@@ -53,6 +54,14 @@ parser.add_argument('--wd', type=float, default=0.00005,
                    help='weight decay')
parser.add_argument("--use-benchy", action="store_true",
                    help="enable benchy")
parser.add_argument("--noddp", action="store_true",
                    help="enable noddp")
parser.add_argument("--workers", default=5, type=int, metavar="N",
                    help="number of data loading workers (default: 5)",)
parser.add_argument("--bucketS", default=25, type=int, metavar="N",
                    help="bucket_cap_mb for DDP (default: 25MB)",)
parser.add_argument("--benchy-ext", default=None, type=str,
                    help="extention to benchy file",)
parser.add_argument("--benchy-log", default=None, type=str,
                    help="benchy log location",)

world_size = int(os.environ['OMPI_COMM_WORLD_SIZE'])
world_rank = int(os.environ['OMPI_COMM_WORLD_RANK'])

@@ -99,22 +108,29 @@ def train(epoch):
            step += 1

def new_yaml_file(args):
    with open("./benchy-conf.yaml") as f:
        y = yaml.safe_load(f)
    y["global"]["json_prefix"] = "benchy_output_" + args.benchy_ext
    y["global"]["output_dir"] = args.benchy_log
    str_file = "/tmp/benchy-conf_%s.yaml" % (args.benchy_ext)
    with open(str_file, 'w') as f2:
        yaml.dump(y, f2, default_flow_style=False)
    os.environ['BENCHY_CONFIG_FILE'] = str_file

if __name__=="__main__":
    args = parser.parse_args()
    if (world_rank == 0):
        new_yaml_file(args)
    torch.cuda.manual_seed(42)
    cudnn.benchmark = True
    dist.init_process_group('nccl', rank=world_rank, world_size=world_size)
    kwargs = {'num_workers': 8, 'pin_memory': True}
    # When supported, use 'forkserver' to spawn dataloader workers instead of 'fork' to prevent
    # issues with Infiniband implementations that are not fork-safe
    if (kwargs.get('num_workers', 0) > 0 and hasattr(mp, '_supports_context') and
            mp._supports_context and 'forkserver' in mp.get_all_start_methods()):
        kwargs['multiprocessing_context'] = 'forkserver'
    train_dataset = \
        datasets.ImageFolder(args.train_dir,
                             transform=transforms.Compose([

@@ -129,7 +145,7 @@ if __name__=="__main__":
         train_dataset, num_replicas=world_size, rank=world_rank)
     train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size,
-                                               sampler=train_sampler, **kwargs)
+                                               sampler=train_sampler, num_workers=args.workers)
     if args.use_benchy:
         train_loader = BenchmarkGenericIteratorWrapper(train_loader, args.batch_size)

@@ -140,7 +156,7 @@ if __name__=="__main__":
     if args.noddp:
         model = model_r
     else:
-        model = DDP(model_r, device_ids=[local_rank]) #, bucket_cap_mb=1)
+        model = DDP(model_r, device_ids=[local_rank], bucket_cap_mb=args.bucketS)
     optimizer = optim.SGD(model.parameters(), lr=(args.base_lr * world_size),
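For context on the --bucketS knob exercised by the launch scripts below: DistributedDataParallel packs gradients into buckets of roughly bucket_cap_mb megabytes and issues one allreduce per bucket, overlapped with the backward pass, so the 1 MB / 25 MB (default) / 100 MB runs sweep a trade-off between message count and message size. A minimal, hypothetical sketch of the wrapping this diff toggles (model_r, local_rank, and an already-initialized process group are assumed, as they are in example1.py):

from torch.nn.parallel import DistributedDataParallel as DDP

def wrap_model(model_r, local_rank, noddp=False, bucket_mb=25):
    # --noddp: return the bare model, so no gradient synchronization happens.
    if noddp:
        return model_r
    # --bucketS: gradient allreduce bucket size in MB (DDP's default is 25).
    return DDP(model_r, device_ids=[local_rank], bucket_cap_mb=bucket_mb)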
imagenet_simple/utils/__pycache__/utils.cpython-38.pyc  (no preview for this file type)
summit_simple/benchy-conf.yaml  (new file, 0 → 100644)

global:
  report_freq: 10
  exit_after_tests: True
  profiler_mode: 'single'
  json_prefix: 'benchy_output'
  output_dir: '/output_dir'
  use_distributed_barrier: False
IO:
  run_benchmark: True
  nbatches: 50
  ntrials: 3
  nwarmup: 1
synthetic:
  run_benchmark: True
  nbatches: 50
  ntrials: 3
  nwarmup: 1
full:
  run_benchmark: True
  nbatches: 50
  ntrials: 3
  nwarmup: 1
\ No newline at end of file
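This file is the template that the new new_yaml_file() helper in example1.py rewrites once per run: --benchy-ext and --benchy-log override json_prefix and output_dir, and the per-run copy is advertised via BENCHY_CONFIG_FILE. A minimal standalone sketch of that pattern (the extension matches one of the BENCHY_EXT values used below; the log directory is illustrative, not taken from the commit):

import os
import yaml

# Load the committed template.
with open("./benchy-conf.yaml") as f:
    cfg = yaml.safe_load(f)

# Give this run a unique output prefix and location, as new_yaml_file() does.
cfg["global"]["json_prefix"] = "benchy_output_" + "base.1N.0w"
cfg["global"]["output_dir"] = "/tmp/benchy_logs"

per_run = "/tmp/benchy-conf_base.1N.0w.yaml"
with open(per_run, "w") as out:
    yaml.dump(cfg, out, default_flow_style=False)

# benchy reads its configuration from this environment variable.
os.environ["BENCHY_CONFIG_FILE"] = per_run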
summit_simple/export_DDP_envvars.sh  (new file, 0 → 100644)

export RANK=$OMPI_COMM_WORLD_RANK
export LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
export WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
export MASTER_ADDR=$(cat $LSB_DJOB_HOSTFILE | sort | uniq | grep -v batch | grep -v login | head -1)
export MASTER_PORT=29500 # default from torch launcher
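RANK, LOCAL_RANK, WORLD_SIZE, MASTER_ADDR, and MASTER_PORT are exactly the variables torch.distributed consumes when the process group is initialized with init_method='env://'. example1.py in this commit initializes from the OMPI_* variables directly, so this helper appears to be for scripts that take the env:// route instead; a minimal, hypothetical consumer sketch (assumes the script above has been sourced on every rank and a GPU is visible):

import os
import torch
import torch.distributed as dist

# Reads RANK, WORLD_SIZE, MASTER_ADDR, and MASTER_PORT from the environment,
# i.e. the values exported by summit_simple/export_DDP_envvars.sh.
dist.init_process_group(backend="nccl", init_method="env://")

# LOCAL_RANK selects which GPU this rank uses on its node.
local_rank = int(os.environ["LOCAL_RANK"])
torch.cuda.set_device(local_rank)

print(f"rank {dist.get_rank()} of {dist.get_world_size()} using GPU {local_rank}")
dist.destroy_process_group()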
summit_simple/sub_ips_16N.lsf  (new file, 0 → 100755)

#!/bin/bash
# Begin LSF directives
#BSUB -P stf011
#BSUB -J sc21
#BSUB -o logs/sc21.o%J
#BSUB -W 1:00
#BSUB -nnodes 16
#BSUB -alloc_flags "nvme smt4"
####BSUB -N
# End LSF directives and begin shell commands
nnodes=$(cat ${LSB_DJOB_HOSTFILE} | sort | uniq | grep -v login | grep -v batch | wc -l)
DATA_DIR=/gpfs/alpine/world-shared/stf011/junqi/choco_env/dl_code/data/ILSVRC
CODE_DIR=/gpfs/alpine/stf011/proj-shared/atsaris/sc21/pytorch_tutorial/imagenet_simple
LOG_DIR=/gpfs/alpine/stf011/proj-shared/atsaris/sc21/pytorch_tutorial/summit_simple/summit_logs
source /gpfs/alpine/world-shared/stf011/atsaris/summit_env/monai/setup.sh

echo "Starting bency"
BENCHY_EXT="base.16N.0w"
jsrun --smpiargs="-disable_gpu_hooks" -n ${nnodes} -a6 -c42 -g6 -r1 \
    --bind=proportional-packed:7 --launch_distribution=packed \
    bash -c "\
    python -u ${CODE_DIR}/example1.py --train-dir ${DATA_DIR}/train \
        --epochs 100 --batch-size 128 --workers 0 --use-benchy \
        --benchy-ext ${BENCHY_EXT} --benchy-log ${LOG_DIR}"

echo "Starting bency noddp"
BENCHY_EXT="noddp.16N.0w"
jsrun --smpiargs="-disable_gpu_hooks" -n ${nnodes} -a6 -c42 -g6 -r1 \
    --bind=proportional-packed:7 --launch_distribution=packed \
    bash -c "\
    python -u ${CODE_DIR}/example1.py --train-dir ${DATA_DIR}/train \
        --epochs 100 --batch-size 128 --workers 0 --use-benchy \
        --benchy-ext ${BENCHY_EXT} --benchy-log ${LOG_DIR} --noddp"

echo "Starting bency 100MB"
BENCHY_EXT="100MB.16N.0w"
jsrun --smpiargs="-disable_gpu_hooks" -n ${nnodes} -a6 -c42 -g6 -r1 \
    --bind=proportional-packed:7 --launch_distribution=packed \
    bash -c "\
    python -u ${CODE_DIR}/example1.py --train-dir ${DATA_DIR}/train \
        --epochs 100 --batch-size 128 --workers 0 --use-benchy \
        --benchy-ext ${BENCHY_EXT} --benchy-log ${LOG_DIR} --bucketS 100"

echo "Starting bency 1MB"
BENCHY_EXT="1MB.16N.0w"
jsrun --smpiargs="-disable_gpu_hooks" -n ${nnodes} -a6 -c42 -g6 -r1 \
    --bind=proportional-packed:7 --launch_distribution=packed \
    bash -c "\
    python -u ${CODE_DIR}/example1.py --train-dir ${DATA_DIR}/train \
        --epochs 100 --batch-size 128 --workers 0 --use-benchy \
        --benchy-ext ${BENCHY_EXT} --benchy-log ${LOG_DIR} --bucketS 1"

echo "Starting bency bs8"
BENCHY_EXT="base.16N.bs8.0w"
jsrun --smpiargs="-disable_gpu_hooks" -n ${nnodes} -a6 -c42 -g6 -r1 \
    --bind=proportional-packed:7 --launch_distribution=packed \
    bash -c "\
    python -u ${CODE_DIR}/example1.py --train-dir ${DATA_DIR}/train \
        --epochs 100 --batch-size 8 --workers 0 --use-benchy \
        --benchy-ext ${BENCHY_EXT} --benchy-log ${LOG_DIR}"

echo "Starting bency noddp bs8"
BENCHY_EXT="noddp.16N.bs8.0w"
jsrun --smpiargs="-disable_gpu_hooks" -n ${nnodes} -a6 -c42 -g6 -r1 \
    --bind=proportional-packed:7 --launch_distribution=packed \
    bash -c "\
    python -u ${CODE_DIR}/example1.py --train-dir ${DATA_DIR}/train \
        --epochs 100 --batch-size 8 --workers 0 --use-benchy \
        --benchy-ext ${BENCHY_EXT} --benchy-log ${LOG_DIR} --noddp"
summit_simple/sub_ips_1N.lsf  (new file, 0 → 100755)

#!/bin/bash
# Begin LSF directives
#BSUB -P stf011
#BSUB -J sc21
#BSUB -o logs/sc21.o%J
#BSUB -W 2:00
#BSUB -nnodes 1
#BSUB -alloc_flags "nvme smt4"
####BSUB -N
# End LSF directives and begin shell commands
nnodes=$(cat ${LSB_DJOB_HOSTFILE} | sort | uniq | grep -v login | grep -v batch | wc -l)
DATA_DIR=/gpfs/alpine/world-shared/stf011/junqi/choco_env/dl_code/data/ILSVRC
CODE_DIR=/gpfs/alpine/stf011/proj-shared/atsaris/sc21/pytorch_tutorial/imagenet_simple
LOG_DIR=/gpfs/alpine/stf011/proj-shared/atsaris/sc21/pytorch_tutorial/summit_simple/summit_logs
source /gpfs/alpine/world-shared/stf011/atsaris/summit_env/monai/setup.sh

echo "Starting bency"
BENCHY_EXT="base.1N.0w"
jsrun --smpiargs="-disable_gpu_hooks" -n ${nnodes} -a6 -c42 -g6 -r1 \
    --bind=proportional-packed:7 --launch_distribution=packed \
    bash -c "\
    python -u ${CODE_DIR}/example1.py --train-dir ${DATA_DIR}/train \
        --epochs 100 --batch-size 128 --workers 0 --use-benchy \
        --benchy-ext ${BENCHY_EXT} --benchy-log ${LOG_DIR}"

echo "Starting bency noddp"
BENCHY_EXT="noddp.1N.0w"
jsrun --smpiargs="-disable_gpu_hooks" -n ${nnodes} -a6 -c42 -g6 -r1 \
    --bind=proportional-packed:7 --launch_distribution=packed \
    bash -c "\
    python -u ${CODE_DIR}/example1.py --train-dir ${DATA_DIR}/train \
        --epochs 100 --batch-size 128 --workers 0 --use-benchy \
        --benchy-ext ${BENCHY_EXT} --benchy-log ${LOG_DIR} --noddp"

echo "Starting bency 100MB"
BENCHY_EXT="100MB.1N.0w"
jsrun --smpiargs="-disable_gpu_hooks" -n ${nnodes} -a6 -c42 -g6 -r1 \
    --bind=proportional-packed:7 --launch_distribution=packed \
    bash -c "\
    python -u ${CODE_DIR}/example1.py --train-dir ${DATA_DIR}/train \
        --epochs 100 --batch-size 128 --workers 0 --use-benchy \
        --benchy-ext ${BENCHY_EXT} --benchy-log ${LOG_DIR} --bucketS 100"

echo "Starting bency 1MB"
BENCHY_EXT="1MB.1N.0w"
jsrun --smpiargs="-disable_gpu_hooks" -n ${nnodes} -a6 -c42 -g6 -r1 \
    --bind=proportional-packed:7 --launch_distribution=packed \
    bash -c "\
    python -u ${CODE_DIR}/example1.py --train-dir ${DATA_DIR}/train \
        --epochs 100 --batch-size 128 --workers 0 --use-benchy \
        --benchy-ext ${BENCHY_EXT} --benchy-log ${LOG_DIR} --bucketS 1"
summit_simple/sub_ips_2N.lsf  (new file, 0 → 100755)

#!/bin/bash
# Begin LSF directives
#BSUB -P stf011
#BSUB -J sc21
#BSUB -o logs/sc21.o%J
#BSUB -W 1:00
#BSUB -nnodes 2
#BSUB -alloc_flags "nvme smt4"
####BSUB -N
# End LSF directives and begin shell commands
nnodes=$(cat ${LSB_DJOB_HOSTFILE} | sort | uniq | grep -v login | grep -v batch | wc -l)
DATA_DIR=/gpfs/alpine/world-shared/stf011/junqi/choco_env/dl_code/data/ILSVRC
CODE_DIR=/gpfs/alpine/stf011/proj-shared/atsaris/sc21/pytorch_tutorial/imagenet_simple
LOG_DIR=/gpfs/alpine/stf011/proj-shared/atsaris/sc21/pytorch_tutorial/summit_simple/summit_logs
source /gpfs/alpine/world-shared/stf011/atsaris/summit_env/monai/setup.sh

echo "Starting bency"
BENCHY_EXT="base.2N.0w"
jsrun --smpiargs="-disable_gpu_hooks" -n ${nnodes} -a6 -c42 -g6 -r1 \
    --bind=proportional-packed:7 --launch_distribution=packed \
    bash -c "\
    python -u ${CODE_DIR}/example1.py --train-dir ${DATA_DIR}/train \
        --epochs 100 --batch-size 128 --workers 0 --use-benchy \
        --benchy-ext ${BENCHY_EXT} --benchy-log ${LOG_DIR}"

echo "Starting bency noddp"
BENCHY_EXT="noddp.2N.0w"
jsrun --smpiargs="-disable_gpu_hooks" -n ${nnodes} -a6 -c42 -g6 -r1 \
    --bind=proportional-packed:7 --launch_distribution=packed \
    bash -c "\
    python -u ${CODE_DIR}/example1.py --train-dir ${DATA_DIR}/train \
        --epochs 100 --batch-size 128 --workers 0 --use-benchy \
        --benchy-ext ${BENCHY_EXT} --benchy-log ${LOG_DIR} --noddp"

echo "Starting bency 100MB"
BENCHY_EXT="100MB.2N.0w"
jsrun --smpiargs="-disable_gpu_hooks" -n ${nnodes} -a6 -c42 -g6 -r1 \
    --bind=proportional-packed:7 --launch_distribution=packed \
    bash -c "\
    python -u ${CODE_DIR}/example1.py --train-dir ${DATA_DIR}/train \
        --epochs 100 --batch-size 128 --workers 0 --use-benchy \
        --benchy-ext ${BENCHY_EXT} --benchy-log ${LOG_DIR} --bucketS 100"

echo "Starting bency 1MB"
BENCHY_EXT="1MB.2N.0w"
jsrun --smpiargs="-disable_gpu_hooks" -n ${nnodes} -a6 -c42 -g6 -r1 \
    --bind=proportional-packed:7 --launch_distribution=packed \
    bash -c "\
    python -u ${CODE_DIR}/example1.py --train-dir ${DATA_DIR}/train \
        --epochs 100 --batch-size 128 --workers 0 --use-benchy \
        --benchy-ext ${BENCHY_EXT} --benchy-log ${LOG_DIR} --bucketS 1"

echo "Starting bency bs64"
BENCHY_EXT="base.2N.bs64.0w"
jsrun --smpiargs="-disable_gpu_hooks" -n ${nnodes} -a6 -c42 -g6 -r1 \
    --bind=proportional-packed:7 --launch_distribution=packed \
    bash -c "\
    python -u ${CODE_DIR}/example1.py --train-dir ${DATA_DIR}/train \
        --epochs 100 --batch-size 64 --workers 0 --use-benchy \
        --benchy-ext ${BENCHY_EXT} --benchy-log ${LOG_DIR}"

echo "Starting bency noddp bs64"
BENCHY_EXT="noddp.2N.bs64.0w"
jsrun --smpiargs="-disable_gpu_hooks" -n ${nnodes} -a6 -c42 -g6 -r1 \
    --bind=proportional-packed:7 --launch_distribution=packed \
    bash -c "\
    python -u ${CODE_DIR}/example1.py --train-dir ${DATA_DIR}/train \
        --epochs 100 --batch-size 64 --workers 0 --use-benchy \
        --benchy-ext ${BENCHY_EXT} --benchy-log ${LOG_DIR} --noddp"
summit_simple/sub_ips_32N.lsf  (new file, 0 → 100755)

#!/bin/bash
# Begin LSF directives
#BSUB -P stf011
#BSUB -J sc21
#BSUB -o logs/sc21.o%J
#BSUB -W 1:00
#BSUB -nnodes 32
#BSUB -alloc_flags "nvme smt4"
####BSUB -N
# End LSF directives and begin shell commands
nnodes=$(cat ${LSB_DJOB_HOSTFILE} | sort | uniq | grep -v login | grep -v batch | wc -l)
DATA_DIR=/gpfs/alpine/world-shared/stf011/junqi/choco_env/dl_code/data/ILSVRC
CODE_DIR=/gpfs/alpine/stf011/proj-shared/atsaris/sc21/pytorch_tutorial/imagenet_simple
LOG_DIR=/gpfs/alpine/stf011/proj-shared/atsaris/sc21/pytorch_tutorial/summit_simple/summit_logs
source /gpfs/alpine/world-shared/stf011/atsaris/summit_env/monai/setup.sh

echo "Starting bency"
BENCHY_EXT="base.32N.0w"
jsrun --smpiargs="-disable_gpu_hooks" -n ${nnodes} -a6 -c42 -g6 -r1 \
    --bind=proportional-packed:7 --launch_distribution=packed \
    bash -c "\
    python -u ${CODE_DIR}/example1.py --train-dir ${DATA_DIR}/train \
        --epochs 100 --batch-size 128 --workers 0 --use-benchy \
        --benchy-ext ${BENCHY_EXT} --benchy-log ${LOG_DIR}"

echo "Starting bency noddp"
BENCHY_EXT="noddp.32N.0w"
jsrun --smpiargs="-disable_gpu_hooks" -n ${nnodes} -a6 -c42 -g6 -r1 \
    --bind=proportional-packed:7 --launch_distribution=packed \
    bash -c "\
    python -u ${CODE_DIR}/example1.py --train-dir ${DATA_DIR}/train \
        --epochs 100 --batch-size 128 --workers 0 --use-benchy \
        --benchy-ext ${BENCHY_EXT} --benchy-log ${LOG_DIR} --noddp"

echo "Starting bency 100MB"
BENCHY_EXT="100MB.32N.0w"
jsrun --smpiargs="-disable_gpu_hooks" -n ${nnodes} -a6 -c42 -g6 -r1 \
    --bind=proportional-packed:7 --launch_distribution=packed \
    bash -c "\
    python -u ${CODE_DIR}/example1.py --train-dir ${DATA_DIR}/train \
        --epochs 100 --batch-size 128 --workers 0 --use-benchy \
        --benchy-ext ${BENCHY_EXT} --benchy-log ${LOG_DIR} --bucketS 100"

echo "Starting bency 1MB"
BENCHY_EXT="1MB.32N.0w"
jsrun --smpiargs="-disable_gpu_hooks" -n ${nnodes} -a6 -c42 -g6 -r1 \
    --bind=proportional-packed:7 --launch_distribution=packed \
    bash -c "\
    python -u ${CODE_DIR}/example1.py --train-dir ${DATA_DIR}/train \
        --epochs 100 --batch-size 128 --workers 0 --use-benchy \
        --benchy-ext ${BENCHY_EXT} --benchy-log ${LOG_DIR} --bucketS 1"

echo "Starting bency bs4"
BENCHY_EXT="base.32N.bs4.0w"
jsrun --smpiargs="-disable_gpu_hooks" -n ${nnodes} -a6 -c42 -g6 -r1 \
    --bind=proportional-packed:7 --launch_distribution=packed \
    bash -c "\
    python -u ${CODE_DIR}/example1.py --train-dir ${DATA_DIR}/train \
        --epochs 100 --batch-size 4 --workers 0 --use-benchy \
        --benchy-ext ${BENCHY_EXT} --benchy-log ${LOG_DIR}"

echo "Starting bency noddp bs4"
BENCHY_EXT="noddp.32N.bs4.0w"
jsrun --smpiargs="-disable_gpu_hooks" -n ${nnodes} -a6 -c42 -g6 -r1 \
    --bind=proportional-packed:7 --launch_distribution=packed \
    bash -c "\
    python -u ${CODE_DIR}/example1.py --train-dir ${DATA_DIR}/train \
        --epochs 100 --batch-size 4 --workers 0 --use-benchy \
        --benchy-ext ${BENCHY_EXT} --benchy-log ${LOG_DIR} --noddp"
summit_simple/sub_ips_4N.lsf  (new file, 0 → 100755)

#!/bin/bash
# Begin LSF directives
#BSUB -P stf011
#BSUB -J sc21
#BSUB -o logs/sc21.o%J
#BSUB -W 1:00
#BSUB -nnodes 4
#BSUB -alloc_flags "nvme smt4"
####BSUB -N
# End LSF directives and begin shell commands
nnodes=$(cat ${LSB_DJOB_HOSTFILE} | sort | uniq | grep -v login | grep -v batch | wc -l)
DATA_DIR=/gpfs/alpine/world-shared/stf011/junqi/choco_env/dl_code/data/ILSVRC
CODE_DIR=/gpfs/alpine/stf011/proj-shared/atsaris/sc21/pytorch_tutorial/imagenet_simple
LOG_DIR=/gpfs/alpine/stf011/proj-shared/atsaris/sc21/pytorch_tutorial/summit_simple/summit_logs
source /gpfs/alpine/world-shared/stf011/atsaris/summit_env/monai/setup.sh

echo "Starting bency"
BENCHY_EXT="base.4N.0w"
jsrun --smpiargs="-disable_gpu_hooks" -n ${nnodes} -a6 -c42 -g6 -r1 \
    --bind=proportional-packed:7 --launch_distribution=packed \
    bash -c "\
    python -u ${CODE_DIR}/example1.py --train-dir ${DATA_DIR}/train \
        --epochs 100 --batch-size 128 --workers 0 --use-benchy \
        --benchy-ext ${BENCHY_EXT} --benchy-log ${LOG_DIR}"

echo "Starting bency noddp"
BENCHY_EXT="noddp.4N.0w"
jsrun --smpiargs="-disable_gpu_hooks" -n ${nnodes} -a6 -c42 -g6 -r1 \
    --bind=proportional-packed:7 --launch_distribution=packed \
    bash -c "\
    python -u ${CODE_DIR}/example1.py --train-dir ${DATA_DIR}/train \
        --epochs 100 \