Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
Rogers, David
launchad
Commits
d239f1b3
Commit
d239f1b3
authored
Jun 10, 2020
by
David M. Rogers
Browse files
Fixes for next trial.
parent
7eaa8bc5
Changes
7
Hide whitespace changes
Inline
Side-by-side
.gitignore
View file @
d239f1b3
*.err
*.out
*.logs
ChangeLog
0 → 100644
View file @
d239f1b3
* v. 0.3
loadem.py: - moved logs/rank0000.log into logs/jobid/rank0000.log
- saved shards-in-progress to new redis db key
run_ad.sh: - added trap for copy-out errors
TODO
0 → 100644
View file @
d239f1b3
1. Fix run_docking.lsf to trap errors and shut down the DB on the time-up signal.
loadem.py
View file @
d239f1b3
...
...
@@ -27,11 +27,19 @@ def run_redis(host, fn):
return
u
def get_shard(host):
    """Claim one shard name from the redis work queue.

    A member is popped from the 'shards' set and, when one was available,
    recorded in the 'doing' set so that shards currently being processed
    remain visible in the database.

    The shard is popped exactly once, inside ``enqueue``.  A second,
    untracked ``spop`` ahead of it would remove a shard from the queue
    without ever adding it to 'doing' or returning it to the caller.

    Args:
        host: redis host, passed through unchanged to ``run_redis``.

    Returns:
        The shard name decoded as UTF-8, or None when ``run_redis`` yields
        None (presumably because the 'shards' set is empty -- verify
        against ``run_redis``).
    """
    def enqueue(r):
        # NOTE(review): spop and sadd are issued as two separate commands;
        # a failure between them would leave the shard in neither set.
        # Confirm this is acceptable or wrap both in a transaction.
        shard = r.spop('shards')
        if shard is not None:
            r.sadd('doing', shard)
        return shard

    shard = run_redis(host, enqueue)
    if shard is None:
        return None
    # assumes the redis client hands back bytes -- TODO confirm
    return shard.decode('utf8')
out_pre
=
'/gpfs/alpine/world-shared/bif128/docked'
def
main
(
argv
):
global
conn_retries
assert
len
(
argv
)
==
2
,
"Usage: %s <redis host>"
...
...
@@ -39,7 +47,10 @@ def main(argv):
host
=
argv
[
1
]
rank
=
int
(
os
.
environ
[
'OMPI_COMM_WORLD_RANK'
])
username
=
os
.
environ
[
'USER'
]
ofile
=
open
(
'/gpfs/alpine/world-shared/bif128/docked/logs/rank%04x.log'
%
rank
,
"w"
)
jobid
=
os
.
environ
[
'LSB_JOBID'
]
ret
=
subprocess
.
call
(
"mkdir -p %s/logs/%s"
%
(
out_pre
,
jobid
),
shell
=
True
)
ofile
=
open
(
'%s/logs/%s/rank%04x.log'
%
(
out_pre
,
jobid
,
rank
),
"w"
)
time
.
sleep
(
rank
*
0.0001
)
# 10k connections per second at startup
n
=
0
...
...
@@ -54,10 +65,12 @@ def main(argv):
cmd
[
2
]
=
"p"
+
cmd
[
2
]
ret
=
subprocess
.
call
(
cmd
)
if
ret
:
ofile
.
write
(
"%s ERR"
%
shard
)
ofile
.
write
(
"%s ERR
\n
"
%
shard
)
run_redis
(
host
,
lambda
r
:
r
.
sadd
(
'errors'
,
shard
))
else
:
ofile
.
write
(
"%s OK
\n
"
%
shard
)
run_redis
(
host
,
lambda
r
:
r
.
srem
(
'doing'
,
shard
))
n
+=
1
if
n
%
10
==
0
:
# 13k of these messages.
ofile
.
flush
()
...
...
parse_log.py
0 → 100644
View file @
d239f1b3
"""Report per-step wall-clock times from a timestamped log file.

Usage: parse_log.py <logfile>

Every line of the log must begin with a 26-character timestamp of the form
``YYYY-MM-DD HH:MM:SS.ffffff``.  For each line after the first, the time
elapsed since the previous line is printed in front of the line.  The
elapsed times of lines containing "completed docking" are collected and
their min, max, mean and (population) standard deviation are printed last.
"""
import datetime as DT
import sys

TIMESTAMP_FORMAT = "%Y-%m-%d %H:%M:%S.%f"
TIMESTAMP_WIDTH = 26  # number of leading characters holding the timestamp


def step_times(lines):
    """Yield ``(seconds, line)`` for every line after the first.

    Args:
        lines: iterable of log lines, each starting with a timestamp in
            TIMESTAMP_FORMAT.

    Yields:
        Tuples of the seconds elapsed since the previous line (float) and
        the unmodified current line.

    Raises:
        ValueError: if a line does not start with a valid timestamp.
    """
    previous = None
    for line in lines:
        current = DT.datetime.strptime(line[:TIMESTAMP_WIDTH],
                                       TIMESTAMP_FORMAT)
        # The first line has no predecessor, so it only seeds the clock.
        if previous is not None:
            yield (current - previous).total_seconds(), line
        previous = current


def summarize(sec):
    """Return ``(min, max, mean, stddev)`` of *sec*, or None when empty.

    The standard deviation is the population form (divides by ``len(sec)``).
    """
    if not sec:
        # Nothing to average; avoids dividing by zero and min()/max() of [].
        return None
    mean = sum(sec) / len(sec)
    variance = sum((s - mean) ** 2 for s in sec) / len(sec)
    return min(sec), max(sec), mean, variance ** 0.5


def main(argv):
    """Parse the log named by ``argv[1]`` and print the timing report.

    Returns:
        0 on success, 1 on a usage error or when the log holds no
        "completed docking" interval to summarize.
    """
    if len(argv) != 2:
        print("Usage: %s <logfile>" % argv[0], file=sys.stderr)
        return 1

    sec = []
    with open(argv[1]) as f:
        for dt, line in step_times(f):
            # `line` keeps its trailing newline, exactly as read.
            print("%.3f %s" % (dt, line))
            if "completed docking" in line:
                sec.append(dt)

    stats = summarize(sec)
    if stats is None:
        print("No 'completed docking' intervals found in %s" % argv[1],
              file=sys.stderr)
        return 1
    print(*stats)
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
run_ad.sh
View file @
d239f1b3
...
...
@@ -9,7 +9,7 @@
export
OMP_NUM_THREADS
=
7
set
-e
version
=
"run_ad.sh v0.
2
"
version
=
"run_ad.sh v0.
3
"
if
[
$#
-ne
2
]
;
then
echo
"Usage:
$0
shard_name shard_segment"
...
...
@@ -78,7 +78,8 @@ log completed segment file list ${start} to ${end}
log completed docking
# copy-out function
#cd $WORK_DIR
tar
czf
$shard_name
.
$seg
.tgz
`
awk
'{printf("%s.xml\n%s.dlg\n",$0,$0);}'
lignames.
$seg
`
tar
czf
$shard_name
.
$seg
.tgz
`
awk
'{printf("%s.xml\n%s.dlg\n",$0,$0);}'
lignames.
$seg
`
\
||
echo
"Error tarring some files."
cp
$shard_name
.
$seg
.tgz
$OUT_DIR
#rm -fr $WORK_DIR
log completed copyout
...
...
run_docking.lsf
View file @
d239f1b3
#BSUB -nnodes
5
#BSUB -W
3
0
#BSUB -q
debug
#BSUB -nnodes
100
#BSUB -W
6
0
#BSUB -q
batch
#BSUB -P BIF128
#BSUB -J ADv
1
#BSUB -J ADv
0.3
#BSUB -o %J.out
#BSUB -alloc_flags "NVME"
...
...
@@ -10,7 +10,7 @@ source /ccs/proj/bif128/venvs/env.sh
PROJ=/gpfs/alpine/bif128/proj-shared/redis
gpus=$(( (LSB_MAX_NUM_PROCESSORS-1)/7 ))
echo "Starting $((gpus/6)) node run of v0.
2
at " `date`
echo "Starting $((gpus/6)) node run of v0.
3
at " `date`
[ -s $PROJ/shards.rdb ]
REMAKE=$?
...
...
@@ -24,8 +24,8 @@ fi
for((i=0;i<120;i++)); do
memb=$(query scard shards)
sleep 1
[ $? -eq 0 ] && break
sleep 1
done
echo "$memb initial members at " `date`
...
...
@@ -35,13 +35,20 @@ jsrun -X 0 \
-n $gpus -r6 -a1 -g1 -c7 -d cyclic -b packed:7 \
python loadem.py `hostname`
# Print a nice little summary:
memb=$(query scard shards)
echo "$memb members remain at " `date`
echo
memb=$(query scard errors)
echo "$memb errors"
if [ $memb -gt 0 ]; then
query smembers errors
fi
echo
memb=$(query scard doing)
echo "$memb in-progress [sic]"
if [ $memb -gt 0 ]; then
query smembers doing
fi
kill %
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment