Commit 91f9dc30 authored by David M. Rogers
Browse files

Version used for gigadock.

parent 1ca85933
......@@ -6,12 +6,12 @@ from datetime import datetime
test = False
testone = False
hopper = True
hopper = False
conn_retries = 0
def stamp():
return datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f") + " v1.1"
return datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f") + " v1.2"
def run_redis(host, fn):
global conn_retries
......@@ -68,6 +68,8 @@ def main(argv):
# redis DB contains 4 sets of shard-IDs
n = 0
errors = 0
consecutive_errors = 0
while True:
shard = get_shard(host)
if shard is None: # graceful shutdown
......@@ -83,8 +85,16 @@ def main(argv):
if ret:
ofile.write("%s %s ERR\n"%(stamp(),shard))
newset = 'errors'
consecutive_errors += 1
errors += 1
if consecutive_errors >= 10:
print("%s Host %04x quitting due to %d consecutive errors."%(stamp(),rank,consecutive_errors))
break
if consecutive_errors >= 2:
time.sleep(60)
else:
ofile.write("%s %s OK\n"%(stamp(),shard))
consecutive_errors = 0
run_redis(host, lambda r: r.smove('doing', newset, shard))
n += 1
......@@ -96,7 +106,7 @@ def main(argv):
ofile.close()
print("%s Host %04x completed (%d decishards processed, %d conn retries)."%(stamp(),rank,n,conn_retries))
print("%s Host %04x completed (%d decishards processed, %d errors, %d conn retries)."%(stamp(),rank,n,errors,conn_retries))
ret = subprocess.call("rm -fr /mnt/bb/%s/%d"%(username, rank), shell=True)
if __name__=="__main__":
......
......@@ -9,7 +9,7 @@
export OMP_NUM_THREADS=7
set -e
version="run_ad.sh v1.1"
version="run_ad.sh v1.2"
if [ $# -ne 2 ]; then
echo "Usage: $0 shard_name shard_segment"
......
#BSUB -nnodes 10
#BSUB -W 1:20
#BSUB -nnodes 4600
#BSUB -W 24:00
#BSUB -q batch
#BSUB -P BIF128
#BSUB -J ADv1.1
#BSUB -J ADv1.0
#BSUB -o %J.out
#BSUB -alloc_flags "NVME"
......@@ -10,7 +10,7 @@ source /ccs/proj/bif128/venvs/env.sh
PROJ=/gpfs/alpine/bif128/proj-shared/redis
gpus=$(( (LSB_MAX_NUM_PROCESSORS-1)/7 ))
echo "Starting $((gpus/6)) node run of ADv1.1 at " `date`
echo "Starting $((gpus/6)) node run of ADv1.0 at " `date`
[ -s $PROJ/shards.rdb ]
REMAKE=$?
......@@ -27,7 +27,6 @@ for((i=0;i<120;i++)); do
[ $? -eq 0 ] && break
sleep 1
done
query set hopper $((gpus*2))
echo "$memb initial members at " `date`
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment