Commit 91f9dc30 authored by David M. Rogers
Browse files

Version used for gigadock.

parent 1ca85933
...@@ -6,12 +6,12 @@ from datetime import datetime ...@@ -6,12 +6,12 @@ from datetime import datetime
test = False test = False
testone = False testone = False
hopper = True hopper = False
conn_retries = 0 conn_retries = 0
def stamp(): def stamp():
return datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f") + " v1.1" return datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f") + " v1.2"
def run_redis(host, fn): def run_redis(host, fn):
global conn_retries global conn_retries
...@@ -68,6 +68,8 @@ def main(argv): ...@@ -68,6 +68,8 @@ def main(argv):
# redis DB contains 4 sets of shard-IDs # redis DB contains 4 sets of shard-IDs
n = 0 n = 0
errors = 0
consecutive_errors = 0
while True: while True:
shard = get_shard(host) shard = get_shard(host)
if shard is None: # graceful shutdown if shard is None: # graceful shutdown
...@@ -83,8 +85,16 @@ def main(argv): ...@@ -83,8 +85,16 @@ def main(argv):
if ret: if ret:
ofile.write("%s %s ERR\n"%(stamp(),shard)) ofile.write("%s %s ERR\n"%(stamp(),shard))
newset = 'errors' newset = 'errors'
consecutive_errors += 1
errors += 1
if consecutive_errors >= 10:
print("%s Host %04x quitting due to %d consecutive errors."%(stamp(),rank,consecutive_errors))
break
if consecutive_errors >= 2:
time.sleep(60)
else: else:
ofile.write("%s %s OK\n"%(stamp(),shard)) ofile.write("%s %s OK\n"%(stamp(),shard))
consecutive_errors = 0
run_redis(host, lambda r: r.smove('doing', newset, shard)) run_redis(host, lambda r: r.smove('doing', newset, shard))
n += 1 n += 1
...@@ -96,7 +106,7 @@ def main(argv): ...@@ -96,7 +106,7 @@ def main(argv):
ofile.close() ofile.close()
print("%s Host %04x completed (%d decishards processed, %d conn retries)."%(stamp(),rank,n,conn_retries)) print("%s Host %04x completed (%d decishards processed, %d errors, %d conn retries)."%(stamp(),rank,n,errors,conn_retries))
ret = subprocess.call("rm -fr /mnt/bb/%s/%d"%(username, rank), shell=True) ret = subprocess.call("rm -fr /mnt/bb/%s/%d"%(username, rank), shell=True)
if __name__=="__main__": if __name__=="__main__":
......
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
export OMP_NUM_THREADS=7 export OMP_NUM_THREADS=7
set -e set -e
version="run_ad.sh v1.1" version="run_ad.sh v1.2"
if [ $# -ne 2 ]; then if [ $# -ne 2 ]; then
echo "Usage: $0 shard_name shard_segment" echo "Usage: $0 shard_name shard_segment"
......
#BSUB -nnodes 10 #BSUB -nnodes 4600
#BSUB -W 1:20 #BSUB -W 24:00
#BSUB -q batch #BSUB -q batch
#BSUB -P BIF128 #BSUB -P BIF128
#BSUB -J ADv1.1 #BSUB -J ADv1.0
#BSUB -o %J.out #BSUB -o %J.out
#BSUB -alloc_flags "NVME" #BSUB -alloc_flags "NVME"
...@@ -10,7 +10,7 @@ source /ccs/proj/bif128/venvs/env.sh ...@@ -10,7 +10,7 @@ source /ccs/proj/bif128/venvs/env.sh
PROJ=/gpfs/alpine/bif128/proj-shared/redis PROJ=/gpfs/alpine/bif128/proj-shared/redis
gpus=$(( (LSB_MAX_NUM_PROCESSORS-1)/7 )) gpus=$(( (LSB_MAX_NUM_PROCESSORS-1)/7 ))
echo "Starting $((gpus/6)) node run of ADv1.1 at " `date` echo "Starting $((gpus/6)) node run of ADv1.0 at " `date`
[ -s $PROJ/shards.rdb ] [ -s $PROJ/shards.rdb ]
REMAKE=$? REMAKE=$?
...@@ -27,7 +27,6 @@ for((i=0;i<120;i++)); do ...@@ -27,7 +27,6 @@ for((i=0;i<120;i++)); do
[ $? -eq 0 ] && break [ $? -eq 0 ] && break
sleep 1 sleep 1
done done
query set hopper $((gpus*2))
echo "$memb initial members at " `date` echo "$memb initial members at " `date`
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment