Commit 81b842ab authored by David M. Rogers
Browse files

Mark rescored outputs with batch.

parent e03974f1
......@@ -10,7 +10,7 @@ rules = yaml.safe_load(open(base / 'rules.yaml'))
bucket = 'gs://ccddc'
test = False
hopper = True
hopper = False
conn_retries = 0
......
......@@ -27,9 +27,9 @@ def gsutil(cmd):
def process_inp(r, name):
n = ihash( int(name, 16) )
inp = [ "%x.pq" % fhash(n+i) for i in range(batch_sz) ]
inp = [ (n+i, "%x.pq" % fhash(n+i)) for i in range(batch_sz) ]
inp2 = [ "gs://ccddc/%s_docked/"%r + i for i in inp ]
inp2 = [ "gs://ccddc/%s_docked/%s"%(r, i[1]) for i in inp ]
gsutil(['cp'] + inp2 + ['./'])
end = Event()
......@@ -121,14 +121,15 @@ class LoadMol(Worker):
dt = time.time() - t0
print("LoadMol setup done in %f seconds"%dt)
def fn(self, inp):
def fn(self, i):
n, inp = i
try:
df = pd.read_parquet(inp)
os.remove(inp)
except FileNotFoundError:
print("Error: Input file %s is missing!"%inp)
return pd.DataFrame()
df['batch'] = n
v2 = self.v2
v3 = self.v3
for x in ['', '2', '3']:
......@@ -175,6 +176,8 @@ class Scorer(Worker):
df.loc[df[c].notna(), c] = self.score(v)
return df
# process control, using advice from:
# https://www.cloudcity.io/blog/2019/02/27/things-i-wish-they-told-me-about-multiprocessing-in-python/
def stop_procs(procs):
end_time = time.time() + 200 # seconds (be sure they're done)
num_terminated = 0
......
......@@ -4,7 +4,7 @@
#SBATCH -c 64
#SBATCH -J rescore
#SBATCH -o %x.%A_%a.%j.out
#SBATCH --array=1-4
#SBATCH --array=1-13
echo "Starting $SLURM_JOB_NAME-$SLURM_ARRAY_TASK_ID at" `date`
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment