Commit d239f1b3 authored by David M. Rogers's avatar David M. Rogers
Browse files

Fixes for next trial.

parent 7eaa8bc5
* v. 0.3
loadem.py: - moved logs/rank0000.log into logs/jobid/rank0000.log
- saved shards-in-progress to new redis db key
run_ad.sh: - added trap for copy-out errors
1. fix run_docking.lsf to trap errors and shut down the DB on the time-up signal
......@@ -27,11 +27,19 @@ def run_redis(host, fn):
return u
def get_shard(host):
    """Claim one shard of work from the redis 'shards' set.

    Pops a shard name and, on the same connection, records it in the
    'doing' set so the summary at job end can report work that was
    claimed but never finished.  NOTE(review): SPOP and SADD are two
    separate commands, not a transaction — a crash between them could
    still drop a shard; confirm whether that window matters here.

    Returns the shard name decoded as str, or None when no shards remain.
    """
    def enqueue(r):
        # r is the live redis client handed to us by run_redis.
        shard = r.spop('shards')
        if shard is not None:
            r.sadd('doing', shard)  # mark in-progress before releasing it
        return shard

    shard = run_redis(host, enqueue)
    if shard is None:
        return None
    # redis returns bytes; callers expect a str shard name.
    return shard.decode('utf8')
out_pre = '/gpfs/alpine/world-shared/bif128/docked'
def main(argv):
global conn_retries
assert len(argv) == 2, "Usage: %s <redis host>"
......@@ -39,7 +47,10 @@ def main(argv):
host = argv[1]
rank = int(os.environ['OMPI_COMM_WORLD_RANK'])
username = os.environ['USER']
ofile = open('/gpfs/alpine/world-shared/bif128/docked/logs/rank%04x.log'%rank, "w")
jobid = os.environ['LSB_JOBID']
ret = subprocess.call("mkdir -p %s/logs/%s"%(out_pre,jobid), shell=True)
ofile = open('%s/logs/%s/rank%04x.log'%(out_pre,jobid,rank), "w")
time.sleep(rank*0.0001) # 10k connections per second at startup
n = 0
......@@ -54,10 +65,12 @@ def main(argv):
cmd[2] = "p" + cmd[2]
ret = subprocess.call(cmd)
if ret:
ofile.write("%s ERR"%shard)
ofile.write("%s ERR\n"%shard)
run_redis(host, lambda r: r.sadd('errors', shard))
else:
ofile.write("%s OK\n"%shard)
run_redis(host, lambda r: r.srem('doing', shard))
n += 1
if n%10 == 0: # 13k of these messages.
ofile.flush()
......
import datetime as DT
import sys


def log_intervals(path):
    """Print the elapsed time between consecutive lines of a rank log.

    Every line must start with a 26-character timestamp in the form
    "YYYY-mm-dd HH:MM:SS.ffffff".  For each line after the first, prints
    "<delta-seconds> <line>" and, when the line contains the phrase
    "completed docking", records the delta as one docking duration.

    Returns the list of docking durations in seconds (floats).
    """
    durations = []
    prev = None
    with open(path) as f:  # original leaked the file handle
        for line in f:
            t = DT.datetime.strptime(line[:26], "%Y-%m-%d %H:%M:%S.%f")
            if prev is not None:
                dt = (t - prev).total_seconds()
                print("%.3f %s" % (dt, line))
                if "completed docking" in line:
                    durations.append(dt)
            prev = t
    return durations


def summarize(sec):
    """Return (min, max, mean, population std-dev) of a non-empty list."""
    m = sum(sec) / len(sec)
    v = sum((s - m) ** 2 for s in sec) / len(sec)
    return min(sec), max(sec), m, v ** 0.5


def main(argv):
    sec = log_intervals(argv[1])
    # Original crashed (ZeroDivisionError / min of empty) when the log
    # had no "completed docking" lines, e.g. a rank that only errored.
    if not sec:
        print("no 'completed docking' events found")
        return
    lo, hi, m, sd = summarize(sec)
    print(lo, hi, m, sd)


if __name__ == "__main__":
    main(sys.argv)
......@@ -9,7 +9,7 @@
export OMP_NUM_THREADS=7
set -e
version="run_ad.sh v0.2"
version="run_ad.sh v0.3"
if [ $# -ne 2 ]; then
echo "Usage: $0 shard_name shard_segment"
......@@ -78,7 +78,8 @@ log completed segment file list ${start} to ${end}
log completed docking
# copy-out function
#cd $WORK_DIR
tar czf $shard_name.$seg.tgz `awk '{printf("%s.xml\n%s.dlg\n",$0,$0);}' lignames.$seg`
tar czf $shard_name.$seg.tgz `awk '{printf("%s.xml\n%s.dlg\n",$0,$0);}' lignames.$seg` \
|| echo "Error tarring some files."
cp $shard_name.$seg.tgz $OUT_DIR
#rm -fr $WORK_DIR
log completed copyout
......
#BSUB -nnodes 5
#BSUB -W 30
#BSUB -q debug
#BSUB -nnodes 100
#BSUB -W 60
#BSUB -q batch
#BSUB -P BIF128
#BSUB -J ADv1
#BSUB -J ADv0.3
#BSUB -o %J.out
#BSUB -alloc_flags "NVME"
......@@ -10,7 +10,7 @@ source /ccs/proj/bif128/venvs/env.sh
PROJ=/gpfs/alpine/bif128/proj-shared/redis
gpus=$(( (LSB_MAX_NUM_PROCESSORS-1)/7 ))
echo "Starting $((gpus/6)) node run of v0.2 at " `date`
echo "Starting $((gpus/6)) node run of v0.3 at " `date`
[ -s $PROJ/shards.rdb ]
REMAKE=$?
......@@ -24,8 +24,8 @@ fi
for((i=0;i<120;i++)); do
memb=$(query scard shards)
sleep 1
[ $? -eq 0 ] && break
sleep 1
done
echo "$memb initial members at " `date`
......@@ -35,13 +35,20 @@ jsrun -X 0 \
-n $gpus -r6 -a1 -g1 -c7 -d cyclic -b packed:7 \
python loadem.py `hostname`
# Print a nice little summary:
memb=$(query scard shards)
echo "$memb members remain at " `date`
echo
memb=$(query scard errors)
echo "$memb errors"
if [ $memb -gt 0 ]; then
query smembers errors
fi
echo
memb=$(query scard doing)
echo "$memb in-progress [sic]"
if [ $memb -gt 0 ]; then
query smembers doing
fi
kill %
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment