Commit 4125d09c authored by David M. Rogers
Browse files

Ready to test docking steps.

parent 57f1e9a6
......@@ -3,13 +3,13 @@
import os, subprocess
import redis, time, random
test = True
test = False
testone = True
conn_retries = 0
def get_shard(host):
def run_redis(fn):
global conn_retries
for i in range(120):
try:
r = redis.StrictRedis(host=host, port=6379, password="Z1908840168_2_T1", db=0)
......@@ -19,11 +19,15 @@ def get_shard(host):
time.sleep(random.random()*0.2)
else:
raise redis.exceptions.ConnectionError
u = fn(r)
shard = r.spop('shards')
r.connection_pool.disconnect()
del r
return u
def get_shard(host):
    """Pop one member of the 'shards' set and return it as text.

    The pop is performed through run_redis(); the raw reply is decoded
    as UTF-8.  Returns None when spop yields None.

    NOTE(review): `host` is not referenced in this body -- run_redis()
    presumably obtains the server host some other way; confirm.
    """
    raw = run_redis(lambda conn: conn.spop('shards'))
    return None if raw is None else raw.decode('utf8')
......@@ -33,9 +37,10 @@ def main(argv):
assert len(argv) == 2, "Usage: %s <redis host>"
host = argv[1]
me = int(os.environ['OMPI_COMM_WORLD_RANK'])
ofile = open('/gpfs/alpine/world-shared/bif128/docked/logs/rank%04x.log'%me, "w")
time.sleep(me*0.0001) # 10k connections per second at startup
rank = int(os.environ['OMPI_COMM_WORLD_RANK'])
username = os.environ['USER']
ofile = open('/gpfs/alpine/world-shared/bif128/docked/logs/rank%04x.log'%rank, "w")
time.sleep(rank*0.0001) # 10k connections per second at startup
n = 0
while True:
......@@ -49,18 +54,21 @@ def main(argv):
ret = subprocess.call(cmd)
if ret:
ofile.write("%s ERR"%shard)
#r.sadd('errors', shard) # FIXME - make this better.
run_redis(lambda r: r.sadd('errors', shard))
else:
ofile.write("%s OK\n"%shard)
n += 1
if n%10 == 0: # 13k of these messages.
ofile.flush()
print("Host %04x processed %d shards."%(me,n))
print("Host %04x processed %d decishards."%(rank,n))
if testone:
break
ofile.close()
print("Host %04x completed (%d shards processed)."%(me,n))
print("%d connection retries"%conn_retries)
print("Host %04x completed (%d decishards processed)."%(rank,n))
print("Host %04x %d connection retries"%(rank,conn_retries))
ret = subprocess.call("rm -fr /mnt/bb/%s/%d"%(user,rank), shell=True)
if __name__=="__main__":
import sys
......
#BSUB -nnodes 60
#BSUB -W 30
#BSUB -q debug
#BSUB -P BIF128
#BSUB -J ADv1
#BSUB -o %J.out
#
# Batch script: start a redis-server in the background, populate it with
# setdb.py when no non-empty shards.rdb exists, run loadem.py under jsrun,
# then report the sizes of the 'shards' and 'errors' sets and stop the server.
#
# NOTE(review): `query` is not defined in this script; presumably it is
# provided by env.sh (sourced below) -- confirm.

source /ccs/proj/bif128/venvs/env.sh

PROJ=/gpfs/alpine/bif128/proj-shared/redis
gpus=$(( (LSB_MAX_NUM_PROCESSORS-1)/7 ))
echo "Starting $((gpus/6)) node run at " $(date)

# REMAKE == 0 if file exists and has nonzero size
[ -s "$PROJ/shards.rdb" ]
REMAKE=$?

# Redirect stdout to the log first, then point stderr at it; the reverse
# order (2>&1 >file) would leave stderr out of the log file.
redis-server shards.conf >"$PROJ/shard_$LSB_JOBID.log" 2>&1 &

if [ "$REMAKE" -eq 1 ]; then
    python setdb.py "$(hostname)" /gpfs/alpine/bif128/world-shared/ligand_shards.txt
fi

# Poll (up to 120 tries, 1 s apart) until the query itself succeeds.
# The exit status of the assignment is that of `query`, so the loop only
# stops early once the server has actually answered.
memb=""
for ((i=0; i<120; i++)); do
    memb=$(query scard shards) && break
    sleep 1
done
echo "$memb initial members at " $(date)

jsrun -X 0 \
    -n "$gpus" -r6 -a1 -g1 -c7 -d cyclic -b packed:7 \
    python loadem.py "$(hostname)"

memb=$(query scard shards)
echo "$memb members remain at " $(date)

memb=$(query scard errors)
echo "$memb errors"
# Default to 0 so an empty reply does not make the numeric test fail.
if [ "${memb:-0}" -gt 0 ]; then
    query smembers errors
fi

# Stop the background redis-server (the current job).
kill %
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment