Commit 5f593c56 authored by David M. Rogers

Updates for large array with slurm.

parent c0f8e598
#!/bin/bash
#SBATCH -p dock
#SBATCH --nodes 1
#SBATCH --nodes 2
#SBATCH --cpus-per-task 2
#SBATCH --gres gpu:1
#SBATCH -J dock
#SBATCH -o %x.%A_%a.out
#SBATCH --array=1-1000
#SBATCH --array=1-866
# TODO: add date/time to output filename
echo "Starting $SLURM_JOB_NAME-$SLURM_ARRAY_TASK_ID at" `date`
source /apps/dock_env/env.sh
@@ -13,7 +15,7 @@ source /apps/dock_env/env.sh
export OMP_NUM_THREADS=1
DIR=/apps/launchad
cd /dev/shm
srun -n1 -N1 --gres=gpu:1 --cpus-per-task=2 --exclusive \
srun -n2 -N2 --gres=gpu:1 --cpus-per-task=2 --exclusive \
$DIR/loadem.py ccddc-controller $SLURM_JOB_NAME
echo "Completed $SLURM_JOB_NAME-$SLURM_ARRAY_TASK_ID at" `date`
@@ -110,7 +110,8 @@ def requeue(assigned, host, db):
        item = r.spop(assigned)
        if item is None:
            break
        r.smove(assigned, 'ready', item)
        r.sadd('ready', item) # spop already removed item from `assigned`, so add it to 'ready' directly
        #r.smove(assigned, 'ready', item)
        print("%s %s re-queued %s." % (stamp(), assigned, item))
    else:
        raise IndexError("More than 10 items assigned to %s!" % assigned)
......
#!/usr/bin/env python3
from helpers import *
import os
import os, concurrent.futures, subprocess  # concurrent.futures must be imported explicitly
import pandas as pd
import numpy as np
import oddt
from oddt.scoring import descriptors
from oddt.scoring.functions import RFScore
from oddt.scoring.models.regressors import randomforest
# 48271 is the MINSTD LCG multiplier; both hashes work modulo the Mersenne
# prime 2**31 - 1, and 1899818559 is the modular inverse of 48271, so
# ihash undoes fhash.
def fhash(x):
    return (48271*x) % 2147483647

def ihash(y):
    return (1899818559*y) % 2147483647

threads = 1   # parallelism for gsutil and the rescoring executor
batch_sz = 16 # shards per batch (used by the disabled hashed-name generator below)
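# Editor's sketch (not in the original commit): a quick self-check of the
# inverse property; 48271 * 1899818559 == 1 (mod 2**31 - 1), so the round
# trip is the identity for any x in 1..2**31-2.
assert all(ihash(fhash(x)) == x for x in (1, 42, 2**30))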
def gsutil(cmd):
    # shell out to gsutil with bounded parallelism: one process, `threads` threads
    args = ["gsutil", "-o", "GSUtil:parallel_process_count=1"
           , "-o", "GSUtil:parallel_thread_count=%d" % threads
           , "-o", "GSUtil:state_dir=gsutil"
           , "-m"
           ] + cmd
    return subprocess.call(args)
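# Example call (editor's sketch; the receptor id is hypothetical, the bucket
# layout follows process_inp below):
#   gsutil(['cp', 'gs://ccddc/5JZI_docked/10344a.pq', './'])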
def process_inp(r, name):
    n = ihash(int(name, 16))
    # fixed shard list for now; the hashed-name generation below is disabled
    inp = "10344a.pq 11ad68.pq 132686.pq 16d551.pq d420e.pq 10f0d9.pq 1269f7.pq 1618c2.pq 1791e0.pq dfe9d.pq".split()
    #inp = [ "%x.pq" % fhash(n+i) for i in range(batch_sz) ]
    inp2 = [ "gs://ccddc/%s_docked/"%r + i for i in inp ]
    gsutil(['cp'] + inp2 + ['./'])
    #with concurrent.futures.ProcessPoolExecutor() as executor:
    with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
        ans = executor.map(rescore, inp)
    return pd.concat(ans)
receptor = None

def main(argv):
    assert len(argv) == 3, "Usage: %s <receptor.pdbqt> <ligs.pq>" % argv[0]
    global receptor
    assert len(argv) == 3, "Usage: %s <receptor id> <lig id>" % argv[0]
    # set up descriptors
    receptor = next(oddt.toolkit.readfile('pdbqt', argv[1]))
    df = pd.read_parquet(argv[2])
    os.remove(argv[2]) # signal job start
    receptor = next(oddt.toolkit.readfile('pdbqt', argv[1]+'.pdbqt'))
    result = process_inp(argv[1], argv[2])
    result.to_parquet(argv[2]+'.pq',
                      compression='snappy', engine='pyarrow')
def get_descriptors(receptor, confs):
    cutoff = 12
    ligand_atomic_nums = [6, 7, 8, 9, 15, 16, 17, 35, 53]
    protein_atomic_nums = [6, 7, 8, 16]
@@ -29,41 +68,69 @@ def main(argv):
        'vina_hydrogen',
        'vina_num_rotors']
    vina = oddt.scoring.descriptors.oddt_vina_descriptor(receptor, vina_scores=vina_scores)
    descriptors_v1 = cc
    descriptors_v2 = oddt.scoring.descriptors.close_contacts_descriptor(
        receptor,
        cutoff=np.array([0, 2, 4, 6, 8, 10, 12]),
        protein_types=protein_atomic_nums,
        ligand_types=ligand_atomic_nums)
    #descriptors_v1 = cc
    #descriptors_v2 = oddt.scoring.descriptors.close_contacts_descriptor(
    #    receptor,
    #    cutoff=np.array([0, 2, 4, 6, 8, 10, 12]),
    #    protein_types=protein_atomic_nums,
    #    ligand_types=ligand_atomic_nums)
    descriptors_v3 = oddt.scoring.ensemble_descriptor((vina, cc))
    # calculate descriptors individually
    desc_rfscore1 = []
    desc_rfscore2 = []
    desc_rfscore3 = []
    for x in df['conf']:
        if pd.isnull(x):
            desc_rfscore1.append(None)
            desc_rfscore2.append(None)
            desc_rfscore3.append(None)
            continue
        try:
            c = oddt.toolkit.readstring('pdbqt', x)
            desc_rfscore1.append(descriptors_v1.build(c))
            desc_rfscore2.append(descriptors_v2.build(c))
            desc_rfscore3.append(descriptors_v3.build(c))
        except Exception:
            desc_rfscore1.append(None)
            desc_rfscore2.append(None)
            desc_rfscore3.append(None)
    result = pd.DataFrame({'name': df['name'].astype(str),
                           'desc_rfscore1': desc_rfscore1,
                           'desc_rfscore2': desc_rfscore2,
                           'desc_rfscore3': desc_rfscore3,
                           })
    result = result.set_index('name')
    result.to_parquet(argv[2], compression='snappy', engine='pyarrow')
    return [ descriptors_v3.build( oddt.toolkit.readstring('pdbqt', x) ).reshape(-1)
             for x in confs ]
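# Editor's note: descriptors_v3 combines the Vina terms with the close-contact
# counts, and .build(...).reshape(-1) flattens each conformer into a single
# feature vector that the random forests below can consume.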
# load models
models = [
      ('rf3',   '/apps/data/RFScore_v3_pdbbind2016.pickle')
    , ('dude3', '/apps/data/RFScoreVS_v3_dude.pickle')
    , ('dock3', '/apps/data/RFScoreVS_v3_dock.pickle')
    , ('vina3', '/apps/data/RFScoreVS_v3_vina.pickle')
    ]
# parallel load all these pickles
# (threads rather than processes: the lambda below cannot be pickled
# for a ProcessPoolExecutor)
#with concurrent.futures.ProcessPoolExecutor(max_workers=threads) as executor:
with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
    models = dict(executor.map(lambda m: (m[0], RFScore.rfscore.load(m[1], version=3)),
                               models))
#models = dict(
# rf3 = RFScore.rfscore.load(
# '/apps/data/RFScore_v3_pdbbind2016.pickle', version=3)
## , vs_dude_v1 = RFScore.rfscore.load(
## 'RFScoreVS_v1_dude.pickle',version=1)
## , vs_dude_v2 = RFScore.rfscore.load(
## 'RFScoreVS_v2_dude.pickle',version=2)
# , vs_dude_v3 = RFScore.rfscore.load(
# '/apps/data/RFScoreVS_v3_dude.pickle',version=3)
## , vs_dock_v1 = RFScore.rfscore.load(
## 'RFScoreVS_v1_dock.pickle',version=1)
## , vs_dock_v2 = RFScore.rfscore.load(
## 'RFScoreVS_v2_dock.pickle',version=2)
# , vs_dock_v3 = RFScore.rfscore.load(
# '/apps/data/RFScoreVS_v3_dock.pickle',version=3)
## , vs_vina_v1 = RFScore.rfscore.load(
## 'RFScoreVS_v1_vina.pickle',version=1)
## , vs_vina_v2 = RFScore.rfscore.load(
## 'RFScoreVS_v2_vina.pickle',version=2)
# , vs_vina_v3 = RFScore.rfscore.load(
# '/apps/data/RFScoreVS_v3_vina.pickle',version=3)
#)
def rescore(inp):
    print("Rescoring %s" % inp)
    df = pd.read_parquet(inp)
    os.remove(inp)
    columns = [ 'rf3'
              , 'dude3'
              , 'dock3'
              , 'vina3'
              ]
    dvs = get_descriptors(receptor, df['conf'].values)
    data = df[['score']].copy() # keep a DataFrame so model scores can be added as columns
    for c in columns:
        data[c] = models[c].model.predict(dvs)
    return data
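# End-to-end sketch (editor's illustration; the receptor and ligand ids are
# hypothetical):
#   main(['rescore.py', '5JZI', '1a2b3c'])
# fetches the listed shards from gs://ccddc/5JZI_docked/, rescores every
# conformer with the four forests above, and writes the result to 1a2b3c.pq.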
if __name__ == "__main__":
    import sys
......
#!/bin/bash
#SBATCH -p rescore
#SBATCH --nodes 1
#SBATCH -n64
#SBATCH -J rescore
#SBATCH -o %x.%A_%a.out
#SBATCH --array=1-1
echo "Starting $SLURM_JOB_NAME-$SLURM_ARRAY_TASK_ID at" `date`
source /apps/dock_env/env.sh
export OMP_NUM_THREADS=1
eval "$(/apps/anaconda3/bin/conda shell.bash hook)"
conda activate rescore
DIR=/apps/launchad
cd /dev/shm
srun -n8 --cpus-per-task=8 -N1 $DIR/loadem.py ccddc-controller $SLURM_JOB_NAME
echo "Completed $SLURM_JOB_NAME-$SLURM_ARRAY_TASK_ID at" `date`
@@ -21,7 +21,20 @@ dock:
    rm -f *.xml
    rm -f *.dlg

# uses output of rescore
# 100k ligands per file
# reduces the number of files to 10k
combine: # combine 10:1 again ~
  []

# Re-score ligand/receptor conf.
# uses output of combine
# ? rescore all 3 conf?
#  - remove "far" ligands
#  - combine "close" ligands
# Note: this re-combines files 10:1,
# creating output files that span a sequence.
# 10k ligands per file, 100k files
rescore:
  queue: rescore
  db: 1
@@ -29,9 +42,9 @@ rescore:
out: [ "{r}_scored/{n}.pq" ]
inp:
- targets/{r}.tgz # note: untarring is automatic
- "{r}_docked/{n}.pq"
#- "{r}_docked/{n}.pq" # ~1050 ligands,
script: |
/apps/launchad/rescore.py *{r}*.pdbqt {n}.pq
/apps/launchad/rescore.py {r}.pdbqt {n}
breakup:
queue: rescore
......@@ -44,22 +57,3 @@ breakup:
mkdir -p ligs
/apps/launchad/breakup.py -n 512 $((1+{n}*512)) docked.{n}.parquet ligs/%s.pq
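  # indexing sketch (editor's note): array task {n} handles 512 ligand blocks
  # starting at 1 + {n}*512, so n=0 covers 1..512, n=1 covers 513..1024, etc.;
  # ligs/%s.pq is breakup.py's output name template.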
dock_test:
  queue: dock
  db: 4
  params: [r, n]
  out: [ "{r}_docked/{n}.pq" ]
  inp:
    - targets/{r}.tgz # note: untarring is automatic
    - ligs/{n}.pq
  script: |
    export OMP_NUM_THREADS=2
    ls {r}.maps.fld >filelist
    /apps/launchad/create_inp.py {n}.pq >>filelist
    rm {n}.pq
    autodock_gpu_64wi -filelist filelist \
        -nrun 20 -autostop 1 -nev 3000000 >/dev/null
    /apps/launchad/package_out.py filelist {n}.pq
    rm -f *_*.pdbqt
    rm -f *.xml
    rm -f *.dlg