Commit baccadc8 authored by Baker, Matthew's avatar Baker, Matthew
Browse files

Working version of scripting to upload ligand data into a mongodb over signac

parents
Loading
Loading
Loading
Loading
+106 −0
Original line number Diff line number Diff line
#!/usr/bin/env python3
# Parse files like ligands/PV-001952970482_1_T2.pdbqt
# present in input files like /gpfs/alpine/bif128/world-shared/runs/N108/A_C_B_Cl_I_NA_F_OA_N_S_Br_SA_HD_13types_output_p35245.tar.gz
# or files like ligands/Z1149212469_1_T1.pdbqt
# present in input files like /gpfs/alpine/bif128/world-shared/ligand_shards/A_C_F_NA_Cl_OA_N_S_Br_SA_HD_11types_output_p61791.tar.gz

import sys, re
import logging

from helpers import *
import signac

from mpi4py import MPI
from mpi4py.futures import MPICommExecutor

project = signac.get_project()

def insert_signac(lname, lshard, latoms, ltors):
    #job = project.open_job(statepoint={'real_id':name})
    #ljob = project.add_many( {'real_id':name} for name in lname )
    ljob = [ project.open_job(statepoint={'real_id':name}) for name in lname ]
    added_jobs = project.add_many(ljob)

    for job, shard, atoms, tors in zip(ljob, lshard, latoms, ltors):
        job.doc['atoms'] = atoms
        job.doc['tors']  = tors
        job.doc['shard'] = shard

    return True

def insert_loop(tname):
      with signac.pymongo_buffered(project):
        try:
            shard = shard_id(tname)
        except ValueError:
            print("Bad name encountered, {}".format(tname))
            return ("","","")
        fname = None
        k = 0
        lname, lshard, latoms, ltors = [], [], [], []
        for fname, name, f in tar_iter(tname, "pdbqt"):
            try:
                atoms, tors = get_pdbqt_info(f)
                if atoms == 0 or tors is None:
                    print("Bad molecule %s %s %d %s"%(fname,name,atoms,str(tors)))
                    if tors is None:
                        tors = 0
            except:
                print("Bad file: {}".format(fname))
                continue
            lname.append(name)
            lshard.append(shard)
            latoms.append(atoms)
            ltors.append(tors)
            k += 1
            if k%100 == 0:
                insert_signac(lname, lshard, latoms, ltors)
                lname, lshard, latoms, ltors = [], [], [], []
        if fname is None:
            print("No pdbqt files found.")
            return ("", "", "")
        if len(lname) > 0:
            insert_signac(lname, lshard, latoms, ltors)

        return (shard, fname, k)

        # raw mongodb commands:
        #try:
        #    cpds.insert_many(info)
        #except Duplicate:
        #    pass
        #try:
        #    atypes.insert_one({'_id':fname, 'fullname':name})
        #except Duplicate:
        #    pass

def main(shards):
    logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
    with MPICommExecutor(MPI.COMM_WORLD, root=0) as executor:
      if executor is not None:
        return list(executor.map(insert_loop, shards))
    #for tname in mpi_tname:
    #  insert_loop(tname)

def get_pdbqt_info(f):
    atom = 0
    tors = None
    for line in f:
        try:
            if line[:6] == b"ATOM  " or line[:6] == b"HETATM":
                atom = max(atom, int(line[6:11]))
            elif line[:7] == b"TORSDOF":
                tors = int(line[7:])
        except ValueError:
            print("Error getting info. `{}`".format(line))
            raise
    return atom, tors

if __name__ == "__main__":
    print("Reading file file")
    with open("/lustre/or-hydra/cades-bsd/world-shared/bzf-ligand/mongo/ligand_shards.txt", "r") as shard_file:
        shard_data = shard_file.read()
    print("Processing")
    res = main(shard_data.strip().split("\n")[int(sys.argv[1]):])
    #for x in res:
    #    print(x)

helpers.py

0 → 100644
+109 −0
Original line number Diff line number Diff line
import pymongo
import tarfile as tf
from pathlib import Path
Duplicate = pymongo.errors.DuplicateKeyError

def get_mdb():
    mclient = pymongo.MongoClient("mongodb://signac:covid19-scalable@mongodb-signac-bif128.apps.marble.ccs.ornl.gov", port=30594)
    return mclient["signac"]

# FIXME: parse sys.env['SLURM_NODELIST'] = "rhea[473,509]"
def get_rdb(name=None):
    if name is None:
        host = 'localhost'
    else:
        host = open("/gpfs/alpine/proj-shared/bif128/redis/%s.server"%name).read().strip()
    import redis
    r = redis.Redis(host=host, port=6379, password="Z1908840168_2_T1", db=0)
    return r

def shard_id(fname):
    s = fname.rindex("output_")
    return fname[s+7:].split('.')[0]

# iterator over tar-file
def tar_iter(name, ext=None):
    inp = Path(name)
    fname = inp.name.split('.', 1)[0]
    with tf.open(inp, 'r|*') as tar:
        data = None
        for tarinfo in tar:
            #if tarinfo.isreg():
            #    print(tarinfo.name, "is", tarinfo.size, "bytes in size")
            if ext is not None and tarinfo.name[-len(ext)-1:] != '.'+ext:
                continue
            name = tarinfo.name[:-len(ext)-1] # get a good name from the file
            #if name[:2] == './':
            #    name = name[2:]
            name = name.rsplit('/', 1)[-1]

            with tar.extractfile(tarinfo) as f:
                yield fname, name, f

# parse the first cluster from each included xml file.
# example cluster syntax is:
#   <cluster cluster_rank="1" lowest_binding_energy="-4.09" run="9" mean_binding_energy="-4.09" num_in_clus="1" />
import re
cl = re.compile(b'\s*<cluster cluster_rank="1" lowest_binding_energy="([^"]*)" run="([^"]*)" mean_binding_energy="([^"]*)" num_in_clus="([^"]*)" />')
def xml_to_energy(f):
    for line in f:
        m = cl.match(line)
        if m:
            try:
                return float(m[1]),int(m[2]),float(m[3]),int(m[4])
            except ValueError:
                print("error extracting energy from xml. {}".format(line))
                raise
    return None

def grep_all(f, *keys):
    out = dict((k,[]) for k in keys)
    for line in f:
        line = line.decode('utf8')
        for k in keys:
            if k in line:
                out[k].append(line)
    return out

def parse_dlg(f):
    #dlg = grep_all(f, "DOCKED:", "Estimated", "Number of atoms", "Number of rotat")
    dlg = grep_all(f, "DOCKED:")
    confs = dlg_to_confs(dlg['DOCKED:'])
    # truncate to max 5
    #confs = confs[:min(5,len(confs))]
    return confs[0] # en, pdbqt

# inputstrings are all containing "DOCKED:"
def dlg_to_confs(lines):
    confs = []
    conf = ""
    en = None
    for s in lines:
        conf += s[8:] + "\n"
        # DOCKED: USER    Estimated Free Energy of Binding    =  -7.85 kcal/mol
        if "Estimated Free Energy of Binding" in s:
            tok = s.replace('=','').split()
            en = float(tok[7])
        elif s[8:14] == "ENDMDL":
            confs.append( (en, conf) )
            assert en is not None
            conf = ""
            en = None
    confs.sort()
    return confs

# Basically, autodock will spit out -nruns number of poses, which I chose to output to a .pdb file so I could load them in typical molecular visualization tools. This also means that you'd need to get the energies for all the models, and sort them. This is what I was doing on a much smaller set:

# lines that contain "Estimated"
def dlg_to_energy(lines):
    en = []
    for line in lines[:-1]:
        en.append( line.replace('=','').split()[7] )
    return en

# You can also get ligand data from the dlg if you like.
def dlg_liganddata(atline, rotline):
    atomcount = int(atline.split()[-1])
    torsioncount = int(rotline.split()[-1])
    return (atomcount, torsioncount)

shard_mpi.batch

0 → 100644
+44 −0
Original line number Diff line number Diff line
#!/bin/bash -l
#SBATCH -J ligand_indexing
#SBATCH -N 10
#SBATCH -n 360
#SBATCH -c 1
#SBATCH --ntasks-per-node=36
#SBATCH --mem=0
#SBATCH -p covid19
#SBATCH -A covid19
#SBATCH -t 24:00:00


threads=360
chunk=1

set -e
######### computed values #########
# starting shard line no.
#start=1
start=120000
# big step size
step=$((threads*chunk))
# get number of shards
#n=133121
n=133118

source /home/bzf/anaconda3/etc/profile.d/conda.sh
conda activate sig
module purge
module load PE-gnu

add_n() {
    m=$1
    time mpirun -mca btl ^openib python /lustre/or-hydra/cades-bsd/world-shared/bzf-ligand/mongo/add_pdbqt_signac_mpi2.py $m
}


#for((i=$start; i<=$n; i+=$threads)); do
#    for((j=0;j<$threads;j++)); do
        add_n $((i))
#    done
#    wait
#    echo $((i+$threads)) >current.log
#done