Commit e775f660 authored by David M. Rogers
Browse files

Jun 10 midnight test.

parents
*.err
*.out
#!/usr/bin/env python3
import os, subprocess
import redis, time, random
# When true, main() skips launching the docking subprocess and records every
# shard it pops as OK (dry run of the queue machinery only).
test = True
# Running count of failed connection attempts made by get_shard(); printed at
# the end of main().
conn_retries = 0


def get_shard(host):
    """Pop one work item from the 'shards' set on the redis server at *host*.

    Creating the client is retried up to 120 times, sleeping a random
    0-0.2 s between attempts; each failure increments the module-level
    ``conn_retries`` counter.

    Returns:
        The popped member decoded as UTF-8, or ``None`` when SPOP returns
        nothing (the set is empty), which callers treat as "no more work".

    Raises:
        redis.exceptions.ConnectionError: if every attempt to create the
            client fails.
    """
    global conn_retries
    for _ in range(120):
        try:
            r = redis.StrictRedis(host=host, port=6379, password="Z1908840168_2_T1",
                                  db=0, single_connection_client=True)
            break
        except redis.exceptions.ConnectionError:
            conn_retries += 1
            # Random back-off so many ranks do not retry in lock-step.
            time.sleep(random.random()*0.2)
    else:
        raise redis.exceptions.ConnectionError
    try:
        shard = r.spop('shards')
    finally:
        # Always release the connection, even if SPOP itself failed.
        r.connection_pool.disconnect()
    if shard is None:
        # Empty set: report it to the caller instead of crashing on
        # None.decode().
        return None
    return shard.decode('utf8')
def main(argv):
    """Worker loop: pull shards from redis and process them until none remain.

    Args:
        argv: command line as ``[program, redis_host]``.

    The MPI rank is read from ``OMPI_COMM_WORLD_RANK``.  One line per shard
    ("<shard> OK" or "<shard> ERR") is written to a per-rank log file.  When
    the module-level ``test`` flag is true the docking script is not run and
    every shard is recorded as OK.  The loop ends when ``get_shard`` returns
    ``None``.
    """
    global conn_retries
    assert len(argv) == 2, "Usage: %s <redis host>" % (
        argv[0] if argv else "loadem.py")
    host = argv[1]
    me = int(os.environ['OMPI_COMM_WORLD_RANK'])
    n = 0
    # The context manager guarantees the log is flushed and closed even if
    # get_shard() or the subprocess call raises.
    with open('/gpfs/alpine/world-shared/bif128/docked/logs/rank%04x.log' % me,
              "w") as ofile:
        time.sleep(me*0.0001)  # 10k connections per second at startup
        while True:
            shard = get_shard(host)
            if shard is None:  # graceful shutdown
                break
            ret = False
            if not test:
                cmd = ["bash", "/ccs/proj/bif128/analysis/reduce/run_ad.sh"]
                # A shard is "<name> <segment>"; both become script arguments.
                cmd.extend(shard.split())
                ret = subprocess.call(cmd)
            if ret:
                # Newline added so ERR records do not run into the next entry.
                ofile.write("%s ERR\n" % shard)
                #r.sadd('errors', shard) # FIXME - make this better.
            else:
                ofile.write("%s OK\n" % shard)
            n += 1
            if n % 10 == 0:  # 13k of these messages.
                ofile.flush()
                print("Host %04x processed %d shards." % (me, n))
    print("Host %04x completed (%d shards processed)." % (me, n))
    print("%d connection retries" % conn_retries)


if __name__ == "__main__":
    import sys
    main(sys.argv)
#BSUB -nnodes 8
#BSUB -W 60
#BSUB -q debug
#BSUB -P BIF128
#BSUB -J gpu_dock
#BSUB -o %J.out
#BSUB -e %J.err
#BSUB -alloc_flags "NVME"
# LSF batch script: launches run_ad.sh under jsrun, one resource set per GPU
# (1 task, 1 GPU and 7 cores each, 6 resource sets per node).
# NOTE(review): the (N-1)/7 formula presumably discounts one launch-node slot
# from LSB_MAX_NUM_PROCESSORS before dividing by 7 cores per GPU -- confirm.
gpus=$(( (LSB_MAX_NUM_PROCESSORS-1)/7 ))
export OMP_NUM_THREADS=7
# NOTE(review): run_ad.sh is started here without arguments, but its usage
# check requires "shard_name shard_segment" -- confirm this launch line is
# still the intended entry point.
# -X 0 will keep launchers running if some finish early
jsrun -X 0 -e individual \
-n $gpus -r6 -a1 -g1 -c7 -l gpu-cpu \
-b packed:7 \
sg bif128 /ccs/proj/bif128/analysis/reduce/run_ad.sh
#!/bin/bash
# run_ad.sh: dock one segment (one tenth) of a ligand shard with AutoDock-GPU.
#
# Usage: run_ad.sh shard_name shard_segment
#   shard_name    - matched as "<shard_name>.tar.gz" in ligand_shards.txt
#   shard_segment - index of the tenth of the shard's ligand list to dock
#                   (presumably 0..9; the range is not checked here)
#
# Progress is appended to $OUT_DIR/$OMPI_COMM_WORLD_RANK.status and the
# resulting .xml/.dlg files are copied to $OUT_DIR as a .tgz archive.
#
# modules: cuda gcc
# workflow copied from:
# /ccs/proj/bif128/fireworks/docking/docking_launcher/dynamic/docking.py
# example shard: /gpfs/alpine/bif128/world-shared/ligand_shards/A_C_F_N_P_Si_HD_Cl_NA_S_OA_Br_SA_13types_output_p9512.tar.gz
# - shard_name: p9512
# - shard_segment: 0
export OMP_NUM_THREADS=7
set -e
version="run_ad.sh v0.2"
if [ $# -ne 2 ]; then
    echo "Usage: $0 shard_name shard_segment"
    exit 1
fi
shard_name=$1
seg=$2

# Look up the shard tarball's path.  -F treats the dots literally, and
# `|| true` stops `set -e` from aborting silently when grep finds nothing,
# so the "Missing" diagnostic below is actually printed.
sfile=$(grep -F -- "$shard_name.tar.gz" /gpfs/alpine/bif128/world-shared/ligand_shards.txt || true)
if [ ! -s "$sfile" ]; then
    echo "Missing $sfile"
    exit 1
fi
pfile=/gpfs/alpine/world-shared/bif128/final_Mpro_pdbqt_maps_for_1B.tgz
if [ ! -s "$pfile" ]; then
    echo "Missing $pfile"
    exit 1
fi

HOST=$(hostname)
OUT_DIR=/gpfs/alpine/world-shared/bif128/docked/$LSB_JOBID/$HOST
mkdir -p "$OUT_DIR"

# Append a timestamped, versioned message to this rank's status file.
log() {
    echo "$(date +"%F %H:%M:%S.%N") ($version) $*" >>"$OUT_DIR/$OMPI_COMM_WORLD_RANK.status"
}
# Fixed: the original logged the undefined $shard instead of $shard_name.
log started $shard_name $seg

# prep directories
WORK_DIR=/mnt/bb/$USER/$OMPI_COMM_WORLD_RANK/$shard_name
mkdir -p "$WORK_DIR"
cd "$WORK_DIR"

# Copy-in runs only once per work dir: skipped when the shard tarball is
# already present (e.g. an earlier segment of the same shard ran here).
if [ ! -s "$(basename "$sfile")" ]; then
    # copy-in function
    cp "$sfile" "$pfile" .
    tar -xzf "$(basename "$sfile")" # populates ligands/ dir.
    tar -xzf "$(basename "$pfile")" # unzips *fld
    # ls
    # A_C_F_N_P_Si_HD_Cl_NA_S_OA_Br_SA_13types_output_p9512.tar.gz
    # final_Mpro_pdbqt_maps_for_1B.tgz
    # final_MPro_pdbqt_maps_for_1B
    # ligands
    mv final_MPro_pdbqt_maps_for_1B/* .
    ls ligands/* >liglist
    ls -1 ligands | cut -f 1 -d '.' >lignames
    log completed copyin
fi

# build the file list: the *fld map file(s) first, then alternating
# "<ligand name>" / "<ligand path>" lines for this segment.
ls *fld >filelist.$seg
n=$(wc -l <lignames)
# 1-based inclusive line range of this segment within the ligand lists.
start=$(( n*seg/10 + 1 ))
end=$(( n*(seg+1)/10 ))
sed -n -e "${start},${end}p" lignames >lignames.$seg
sed -n -e "${start},${end}p" liglist >liglist.$seg
paste -d '\n' lignames.$seg liglist.$seg >>filelist.$seg
log completed segment file list ${start} to ${end}

# run AD
/ccs/proj/bif128/docking/autodock_gpu_64wi -filelist filelist.$seg -nrun 20 -autostop 1 -nev 3000000 >${seg}.log
log completed docking

# copy-out function: archive the .xml and .dlg file of every ligand name in
# this segment and copy the archive to the shared output directory.
#cd $WORK_DIR
tar czf "$shard_name.$seg.tgz" $(awk '{printf("%s.xml\n%s.dlg\n",$0,$0);}' lignames.$seg)
cp "$shard_name.$seg.tgz" "$OUT_DIR"
#rm -fr $WORK_DIR
log completed copyout
#!/usr/bin/env python3
# WARNING: this actually sets 10 shards per .tar.gz "shard" file
# labeled with index pair: "n m", n = shard_id, m \in {0, 1, ..., 9}
import re
def get_rdb(host):
    """Return a redis.Redis client for database 0 on *host*, port 6379."""
    # Imported lazily, as in the original, so redis is only needed when a
    # client is actually requested.
    from redis import Redis
    settings = dict(host=host, port=6379, password="Z1908840168_2_T1", db=0)
    return Redis(**settings)
# Captures the numeric shard id from file names such as
# "..._13types_output_p9512.tar.gz".  The dots are escaped so that only a
# literal ".tar.gz" suffix matches, and at least one digit is required so an
# empty shard id can never be produced.
expr = re.compile(r"_p([0-9]+)\.tar\.gz")
def main(argv):
    """Load the redis 'shards' set from a shard list file.

    Args:
        argv: command line as ``[program, server_name, shard_list_file]``.

    Every line of the list file that matches the module-level ``expr``
    pattern contributes ten set members, "<id> 0" through "<id> 9".
    Non-matching lines are skipped.  Prints how many lines matched out of
    the total number of lines read.
    """
    assert len(argv) == 3, "Usage: %s <server name> <shard list file>" % (
        argv[0] if argv else "setdb.py")
    r = get_rdb(argv[1])
    k = 0
    lines = 0
    # Context manager so the list file is closed deterministically.
    with open(argv[2]) as shard_list:
        for line in shard_list:
            lines += 1
            m = expr.search(line)
            if m is None:
                continue
            shards = ["%s %d" % (m[1], i) for i in range(10)]
            r.sadd('shards', *shards)
            k += 1
    print("%d/%d shards added" % (k, lines))


if __name__ == "__main__":
    import sys
    main(sys.argv)
This diff is collapsed.
#BSUB -nnodes 60
#BSUB -W 30
#BSUB -q debug
#BSUB -P BIF128
#BSUB -J db_load
#BSUB -o %J.out
#BSUB -e %J.err
# LSF batch script: start a redis server on the launch node, fill its
# 'shards' set with setdb.py, run the loadem.py workers under jsrun, then
# report what is left in the 'shards' and 'errors' sets and stop the server.
source /ccs/proj/bif128/venvs/env.sh
PROJ=/gpfs/alpine/bif128/proj-shared/redis
cpus=$(( LSB_MAX_NUM_PROCESSORS-1 ))

# create the database afresh (DON'T DO THIS AFTER ACTUAL DOCKING IS DONE)
rm -f $PROJ/shards.rdb
# Redirect stdout to the log first, then point stderr at it; the original
# order (2>&1 >file) left stderr out of the log file.
redis-server shards.conf >"$PROJ/redis_$LSB_JOBID.log" 2>&1 &

echo "Starting $((cpus/42)) node run at " `date`
# NOTE(review): setdb.py runs right after the server is put in the
# background; if it can race server start-up, the wait loop below may need
# to move in front of it -- confirm.
python setdb.py `hostname` /gpfs/alpine/bif128/world-shared/ligand_shards.txt

# Poll up to 120 times, one second apart, until redis-cli succeeds.  The
# original tested $? after `sleep 1`, so it always stopped on the first pass
# whether or not redis-cli had worked.
for((i=0;i<120;i++)); do
    if memb=$(redis-cli --pass Z1908840168_2_T1 scard shards); then
        break
    fi
    sleep 1
done
echo "$memb initial members at " `date`

# note: using the following flag and -a460 failed to launch any actual jobs
#-e individual \
jsrun -X 0 \
    -n $((cpus/42)) -r1 -a460 -g6 -c42 -d cyclic -b packed:smt:1 \
    python loadem.py `hostname`

memb=$(redis-cli --pass Z1908840168_2_T1 scard shards)
echo "$memb members remain at " `date`
memb=$(redis-cli --pass Z1908840168_2_T1 scard errors)
echo "$memb errors"
# Default to 0 so an empty reply does not turn the test into a syntax error.
if [ "${memb:-0}" -gt 0 ]; then
    redis-cli --pass Z1908840168_2_T1 smembers errors
fi
# Stop the background redis-server (the current job of this shell).
kill %
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment