sub_ips_4N.lsf 3.17 KB
Newer Older
Tsaris, Aristeidis's avatar
Tsaris, Aristeidis committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
#!/bin/bash
# Begin LSF directives
#BSUB -P stf011
#BSUB -J sc21
#BSUB -o logs/sc21.o%J
#BSUB -W 1:00
#BSUB -nnodes 4
#BSUB -alloc_flags "nvme smt4"
####BSUB -N
# End LSF directives and begin shell commands
#
# Benchy benchmarking sweep on 4 Summit nodes: runs example1.py under six
# configurations (baseline, no-DDP, 100MB/1MB DDP buckets, batch-size 32
# with and without DDP) and writes benchy logs to LOG_DIR.
#
# NOTE: no `set -e` on purpose — if one jsrun fails we still want the
# remaining configurations to run, matching the original behavior.
set -u   # fail fast on unset vars (e.g. LSB_DJOB_HOSTFILE outside a job)

# Count compute nodes in the LSF hostfile, excluding login/batch hosts.
nnodes=$(sort -u "${LSB_DJOB_HOSTFILE}" | grep -cvE 'login|batch')

readonly DATA_DIR=/gpfs/alpine/world-shared/stf011/junqi/choco_env/dl_code/data/ILSVRC
readonly CODE_DIR=/gpfs/alpine/stf011/proj-shared/atsaris/sc21/pytorch_tutorial/imagenet_simple
readonly LOG_DIR=/gpfs/alpine/stf011/proj-shared/atsaris/sc21/pytorch_tutorial/summit_simple/summit_logs

source /gpfs/alpine/world-shared/stf011/atsaris/summit_env/monai/setup.sh

#######################################
# Launch one benchy run under jsrun.
# Globals:   nnodes, CODE_DIR, DATA_DIR, LOG_DIR (read)
# Arguments: $1 - label for the progress message
#            $2 - benchy extension tag (--benchy-ext)
#            $3 - per-rank batch size
#            $@ - any extra flags for example1.py (e.g. --noddp)
# Layout:    1 resource set per node, 6 ranks / 6 GPUs / 42 cores per set.
#######################################
run_benchy() {
    local label=$1 benchy_ext=$2 batch_size=$3
    shift 3
    echo "Starting benchy ${label}"
    jsrun --smpiargs="-disable_gpu_hooks" -n"${nnodes}" -a6 -c42 -g6 -r1 \
        --bind=proportional-packed:7 --launch_distribution=packed \
        python -u "${CODE_DIR}/example1.py" \
            --train-dir "${DATA_DIR}/train" \
            --epochs 100 \
            --batch-size "${batch_size}" \
            --workers 0 \
            --use-benchy \
            --benchy-ext "${benchy_ext}" \
            --benchy-log "${LOG_DIR}" \
            "$@"
}

run_benchy "base"       "base.4N.0w"       128
run_benchy "noddp"      "noddp.4N.0w"      128 --noddp
run_benchy "100MB"      "100MB.4N.0w"      128 --bucketS 100
run_benchy "1MB"        "1MB.4N.0w"        128 --bucketS 1
run_benchy "bs32"       "base.4N.bs32.0w"  32
run_benchy "noddp bs32" "noddp.4N.bs32.0w" 32 --noddp