Commit 1204b5ab authored by Gao, Shang's avatar Gao, Shang
Browse files

added molecular dynamics scripts

parent 19af511d
Loading
Loading
Loading
Loading
+6 −0
Original line number Diff line number Diff line
@@ -76,8 +76,14 @@ class crossbowBase(object):
          - tags: list (optional)
            list of tag dictionaries to add to the package
        '''
        #check if package already exists
        if package in self.ckan.action.package_list():
            raise Exception('package already exists')
        
        #check format
        if " " in package or any(x.isupper() for x in package):
            raise Exception('package must be lowercase and without spaces')
        
        self.ckan.action.package_create(
                name=package,owner_org=owner_org,title=title,author=author,
                author_email=author_email,maintainer=maintainer,
+1 −0
Original line number Diff line number Diff line
@@ -85,6 +85,7 @@ class crossbowGlobus(crossbowBase):
        
        self.client_id = '18270179-b498-4032-9187-ca646d6e29af'
        self.cadesdtn = 'e6bcd4fa-ac1b-11e6-9aee-22000a1e3b52'
        self.crossbowdtn = '586dc9b6-189a-11e7-bb9f-22000b9a448b'

        try:
            # if we already have tokens, load and use them
+376 −0
Original line number Diff line number Diff line
'''
Convolutional variational autoencoder in Keras.

Based off dense variational autoencoder from
https://github.com/fchollet/keras/blob/master/examples/variational_autoencoder.py
'''

import numpy as np
from keras.layers import Input, Dense, Lambda, Flatten, Reshape, Dropout
from keras.layers import Convolution2D, Conv2DTranspose
from keras.models import Model
from keras.optimizers import SGD, Adam, RMSprop, Adadelta
from keras.callbacks import Callback, ModelCheckpoint
from keras import backend as K
from keras import objectives
import warnings

class conv_variational_autoencoder(object):
    '''
    variational autoencoder class
    
    parameters:
      - image_size: tuple
        height and width of images
      - channels: int
        number of channels in input images
      - conv_layers: int
        number of encoding/decoding convolutional layers
      - feature_maps: list of ints
        number of output feature maps for each convolutional layer
      - filter_shapes: list of tuples
        convolutional filter shape for each convolutional layer
      - strides: list of tuples
        convolutional stride for each convolutional layer
      - dense_layers: int
        number of encoding/decoding dense layers
      - dense_neurons: list of ints
        number of neurons for each dense layer
      - dense_dropouts: list of float
        fraction of neurons to drop in each dense layer (between 0 and 1)
      - latent_dim: int
        number of dimensions for latent embedding
      - activation: string (default='relu')
        activation function to use for layers
      - eps_mean: float (default = 0.0)
        mean to use for epsilon (target distribution for embedding)
      - eps_std: float (default = 1.0)
        standard dev to use for epsilon (target distribution for embedding)
       
    methods:
      - train(data,batch_size,epochs=1,history=False,checkpoint=False,
              filepath=None)
        train network on given data
      - save(filepath)
        save the model weights to a file
      - load(filepath)
        load model weights from a file
      - return_embeddings(data)
        return the embeddings for given data
      - generate(embedding)
        return a generated output given a latent embedding
    '''

    def __init__(self,image_size,channels,conv_layers,feature_maps,filter_shapes,
                 strides,dense_layers,dense_neurons,dense_dropouts,latent_dim,
                 activation='relu',eps_mean=0.0,eps_std=1.0):
        
        #check that per-layer argument lists match the declared layer counts
        if len(filter_shapes)!=conv_layers:
            raise Exception("number of convolutional layers must equal length of \
                             filter_shapes list")
        if len(strides)!=conv_layers:
            raise Exception("number of convolutional layers must equal length of \
                             strides list")
        if len(feature_maps)!=conv_layers:
            raise Exception("number of convolutional layers must equal length of \
                             feature_maps list")
        if len(dense_neurons)!=dense_layers:
            raise Exception("number of dense layers must equal length of \
                             dense_neurons list")
        if len(dense_dropouts)!=dense_layers:
            raise Exception("number of dense layers must equal length of \
                             dense_dropouts list")
        
        #even shaped filters may cause problems in theano backend
        even_filters = [f for pair in filter_shapes for f in pair if f % 2 == 0]
        if K.image_dim_ordering() == 'th' and len(even_filters) > 0:
            warnings.warn('Even shaped filters may cause problems in Theano backend')
        
        self.eps_mean = eps_mean
        self.eps_std = eps_std
        self.image_size = image_size
        
        #define input layer (channel axis position depends on the backend)
        if K.image_dim_ordering() == 'th':
            self.input = Input(shape=(channels,image_size[0],image_size[1]))
        else:
            self.input = Input(shape=(image_size[0],image_size[1],channels))
                    
        #define convolutional encoding layers
        self.encode_conv = []
        layer = Convolution2D(feature_maps[0],filter_shapes[0],padding='same',
                              activation=activation,strides=strides[0])(self.input)
        self.encode_conv.append(layer)
        for i in range(1,conv_layers):
            layer = Convolution2D(feature_maps[i],filter_shapes[i],
                                  padding='same',activation=activation,
                                  strides=strides[i])(self.encode_conv[i-1])
            self.encode_conv.append(layer)
        
        #define dense encoding layers (dropout applied before each dense layer)
        self.flat = Flatten()(self.encode_conv[-1])
        self.encode_dense = []
        layer = Dense(dense_neurons[0],activation=activation)\
                (Dropout(dense_dropouts[0])(self.flat))
        self.encode_dense.append(layer)
        for i in range(1,dense_layers):
            layer = Dense(dense_neurons[i],activation=activation)\
                    (Dropout(dense_dropouts[i])(self.encode_dense[i-1]))
            self.encode_dense.append(layer)
        
        #define embedding layer: mean and log-variance heads plus the
        #reparameterized sample z
        self.z_mean = Dense(latent_dim)(self.encode_dense[-1])
        self.z_log_var = Dense(latent_dim)(self.encode_dense[-1]) 
        self.z = Lambda(self._sampling, output_shape=(latent_dim,))\
                 ([self.z_mean, self.z_log_var]) 
                
        #save all decoding layers so they can be reused by the generation model
        self.all_decoding=[]
                
        #define dense decoding layers (mirror of the dense encoder)
        self.decode_dense = []
        layer = Dense(dense_neurons[-1], activation=activation)
        self.all_decoding.append(layer)
        self.decode_dense.append(layer(self.z))
        for i in range(1,dense_layers):
            layer = Dense(dense_neurons[-i-1],activation=activation)
            self.all_decoding.append(layer)
            self.decode_dense.append(layer(self.decode_dense[i-1]))
        
        #dummy model to get image size after encoding convolutions
        self.decode_conv = []
        if K.image_dim_ordering() == 'th':
            dummy_input = np.ones((1,channels,image_size[0],image_size[1]))
        else:
            dummy_input = np.ones((1,image_size[0],image_size[1],channels))
        dummy = Model(self.input, self.encode_conv[-1])
        conv_size = dummy.predict(dummy_input).shape
        layer = Dense(conv_size[1]*conv_size[2]*conv_size[3],activation=activation)
        self.all_decoding.append(layer)
        self.decode_dense.append(layer(self.decode_dense[-1]))
        reshape = Reshape(conv_size[1:])
        self.all_decoding.append(reshape)
        self.decode_conv.append(reshape(self.decode_dense[-1]))
        
        #define deconvolutional decoding layers (mirror of the conv encoder);
        #each intermediate shape is probed with a dummy forward pass
        for i in range(1,conv_layers):
            if K.image_dim_ordering() == 'th':
                dummy_input = np.ones((1,channels,image_size[0],image_size[1]))
            else:
                dummy_input = np.ones((1,image_size[0],image_size[1],channels))
            dummy = Model(self.input, self.encode_conv[-i-1])
            conv_size = list(dummy.predict(dummy_input).shape)
            
            if K.image_dim_ordering() == 'th':
                conv_size[1] = feature_maps[-i]
            else:
                conv_size[3] = feature_maps[-i]
            
            layer = Conv2DTranspose(feature_maps[-i-1],filter_shapes[-i],
                                    padding='same',activation=activation,
                                    strides=strides[-i])
            self.all_decoding.append(layer)
            self.decode_conv.append(layer(self.decode_conv[i-1]))
        
        #final deconvolution back to the input channel count; sigmoid keeps
        #outputs in [0,1] to match the binary cross-entropy loss
        layer = Conv2DTranspose(channels,filter_shapes[0],padding='same',
                                activation='sigmoid',strides=strides[0])
        self.all_decoding.append(layer)
        self.output=layer(self.decode_conv[-1])

        #build model
        self.model = Model(self.input, self.output)
        self.optimizer = RMSprop(lr=0.001, rho=0.9, epsilon=1e-08, decay=0.0)
        self.model.compile(optimizer=self.optimizer, loss=self._vae_loss)
        print("model summary:")
        self.model.summary()
        
        #model for embeddings
        self.embedder = Model(self.input, self.z_mean)
        
        #model for generation: replay the saved decoding layers on a latent input
        self.decoder_input = Input(shape=(latent_dim,))
        self.generation = []
        self.generation.append(self.all_decoding[0](self.decoder_input))
        for i in range(1, len(self.all_decoding)):
            self.generation.append(self.all_decoding[i](self.generation[i-1]))
        self.generator = Model(self.decoder_input, self.generation[-1])
        
    def _sampling(self,args):
        '''
        sampling function for embedding layer (reparameterization trick:
        z = mean + sigma*epsilon)
        '''
        z_mean,z_log_var = args
        epsilon = K.random_normal(shape=K.shape(z_mean),mean=self.eps_mean,
                                  stddev=self.eps_std)
        #bug fix: z_log_var is treated as log(sigma^2) by the KL term in
        #_vae_loss, so the standard deviation is exp(0.5*z_log_var), not
        #exp(z_log_var) (matches the reference Keras VAE example)
        return z_mean + K.exp(0.5 * z_log_var) * epsilon
        
    def _vae_loss(self,input,output):
        '''
        loss function for variational autoencoder: per-pixel reconstruction
        cross-entropy plus KL divergence of the embedding from N(0,1)
        '''
        input_flat = K.flatten(input)
        output_flat = K.flatten(output)
        xent_loss = self.image_size[0] * self.image_size[1] \
                    * objectives.binary_crossentropy(input_flat,output_flat)
        kl_loss = - 0.5 * K.mean(1 + self.z_log_var - K.square(self.z_mean) 
                  - K.exp(self.z_log_var), axis=-1)
        return xent_loss + kl_loss
        
    class _LossHistory(Callback):
        '''
        loss history for training variational autoencoder
        '''
        def on_train_begin(self, logs={}):
            self.losses = []
            self.val_losses = []

        def on_epoch_end(self, batch, logs={}):
            self.losses.append(logs.get('loss'))
            self.val_losses.append(logs.get('val_loss'))
        
    def train(self,data,batch_size,epochs=1,validation_data=None,history=False,
              checkpoint=False,filepath=None):
        '''
        train network on given data
        
        parameters:
          - data: numpy array
            input data
          - batch_size: int
            number of records per batch
          - epochs: int (default: 1)
            number of epochs to train for
          - validation_data: tuple (optional)
            tuple of numpy arrays (X,y) representing validation data;
            if omitted, the training data is used for validation
          - history: boolean (default: False)
            whether or not to record training history
          - checkpoint: boolean (default: False)
            whether or not to save model after each epoch
          - filepath: string (optional)
            path to save model if checkpoint is set to True
        
        outputs:
            None
        '''
        #checkpointing requires somewhere to write the weights
        if checkpoint and filepath is None:
            raise Exception("Please enter a path to save the network")
        
        callbacks = []
        if history:
            self.history = self._LossHistory()
            callbacks.append(self.history)
        if checkpoint:
            callbacks.append(ModelCheckpoint(filepath))
        
        #bug fix: honor the validation_data argument; previously it was
        #accepted but ignored and the training data was always used
        if validation_data is None:
            validation_data = (data,data)
        self.model.fit(data,data,batch_size,epochs=epochs,shuffle=True,
                       validation_data=validation_data,callbacks=callbacks)
    
    def save(self,filepath):
        '''
        save the model weights to a file
        
        parameters:
          - filepath: string
            path to save model weights
        
        outputs:
            None
        '''
        self.model.save_weights(filepath)
        
    def load(self,filepath):
        '''
        load model weights from a file
        
        parameters:
          - filepath: string
            path from which to load model weights
        
        outputs:
            None
        '''
        self.model.load_weights(filepath)
    
    def return_embeddings(self,data):
        '''
        return the embeddings for given data
        
        parameters:
          - data: numpy array
            input data
        
        outputs:
            numpy array of embeddings for input data
        '''
        return self.embedder.predict(data)

    def generate(self,embedding):
        '''
        return a generated output given a latent embedding
        
        parameters:
          - embedding: numpy array
            latent embedding
        
        outputs:
            numpy array of generated output
        '''
        return self.generator.predict(embedding)

if __name__ == "__main__":

    import sys
    import os

    # command-line argument: path to the contact matrix numpy array
    args = sys.argv
    if len(args) != 2:
        raise Exception("Usage: python conv_vae.py <path to contact matrix numpy array>")

    # network/training hyperparameters
    channels = 1
    batch_size = 1000
    conv_layers = 4
    feature_maps = [256,256,256,256]
    filter_shapes = [(1,3),(1,3),(3,3),(3,3)]
    strides = [(1,1),(1,3),(1,1),(1,1)]
    dense_layers = 1
    dense_neurons = [128]
    dense_dropouts = [0]
    latent_dim = 3
    epochs = 1
    train_size = 0.9
    
    # load data
    print("loading data")
    X = np.load(args[1])
    
    # reshape to 4d tensors; channel axis position depends on the backend
    image_size = X.shape[-2:]
    if K.image_dim_ordering() == 'th':
        tensor_shape = (1,image_size[0],image_size[1])
    else:
        tensor_shape = (image_size[0],image_size[1],1)
    X = X.reshape((X.shape[0],) + tensor_shape)
    
    # train/test split
    idx = int(X.shape[0]*train_size)
    X_train = X[:idx]
    X_test = X[idx:]
    
    # build autoencoder
    print("building variational autoencoder")
    autoencoder = conv_variational_autoencoder(
                  image_size,channels,conv_layers,feature_maps,filter_shapes,
                  strides,dense_layers,dense_neurons,dense_dropouts,latent_dim)

    # load saved weights if they exist, otherwise train and checkpoint
    if os.path.isfile("./savedweights.dat"):
        autoencoder.load("./savedweights.dat")
    else:
        autoencoder.train(X_train,batch_size,epochs=epochs,
                          validation_data=(X_test,X_test),
                          checkpoint=True,filepath="./savedweights.dat")
+100 −0
Original line number Diff line number Diff line
'''
Extract per-frame CA contact matrices and native-contact labels from a
directory of dcd trajectory files, and save them as numpy arrays.

Usage: python feature_extraction.py <path to dcd files>
'''
import gzip
import os
import sys
import glob
import numpy as np
import MDAnalysis as mdanal
from MDAnalysis.analysis import contacts
import re  # NOTE(review): unused in this script -- confirm before removing
import shutil

# get directory to dcd files
args = sys.argv
if len(args) != 2:
    raise Exception("Usage: python feature_extraction.py <path to dcd files>")
dcd_path = args[1]
if dcd_path[-1] != '/':
    dcd_path += '/'

# directory for intermediate per-frame results
if not os.path.exists('./results'):
    os.makedirs('./results')

# get dataset info from the first matching trajectory file
files = glob.glob(dcd_path+'*-protein-*.dcd')
n = len(files)
dataset_name = os.path.basename(files[0]).split('-protein-')[0]
# NOTE(review): [:-7] assumes filenames end in a 7-char suffix like '-0X.dcd';
# the '%s%s-0%i.dcd' pattern below breaks for file indices >= 10 -- confirm
file_basename = os.path.basename(files[0])[:-7]

# process dcd files; k numbers output frames consecutively across all files
k = 0
for i in range(n):
    # \r + flush updates the progress line in place (a print statement would
    # append a newline every iteration, defeating the carriage return)
    sys.stdout.write('processing dcd file %i of %i    \r' % (i+1,n))
    sys.stdout.flush()
    # structure (pdb) and trajectory (dcd) for this file index
    u0 = mdanal.Universe('%s%s.pdb' % (dcd_path,file_basename), 
                         '%s%s-0%i.dcd' % (dcd_path,file_basename,i))
    # number of frames in file
    f = len(u0.trajectory)

    # crude definition of salt bridges as contacts between CA atoms
    CA = "(name CA and resid 1:21)"
    # reference group (first frame of the trajectory, but you could also use a
    # separate PDB, eg crystal structure)
    CA0 = u0.select_atoms(CA)

    # run the native-contact analysis one frame at a time so each frame gets
    # its own .dat/.array output; contacts within 8 A count as "native"
    for j in range(f):
        ca = contacts.ContactAnalysis1(u0, selection=(CA, CA),
                        refgroup=(CA0, CA0), radius=8.0, outfile='results/test%i.dat' % k)
        ca.run(store=True, start=j, stop=j+1, step=1)
        # ContactAnalysis1 gzips its array output; decompress it in place
        with gzip.GzipFile("results/test%i.array.gz" % k, 'rb') as inF_array:
            with open("results/test%i.array" % k, 'wb') as outF_array:
                shutil.copyfileobj(inF_array, outF_array)
        # removing zipped array file
        os.remove("results/test%i.array.gz" % k)
        # next output frame number
        k += 1

    print("progress: %i frames" % k)

# concatenate per-frame contact matrices into a single array
print("assembling frames into single array")
n = len(glob.glob('results/test*.array'))
final_array = []
for i in range(n):
    sys.stdout.write('processing frame %i of %i        \r' % (i+1,n))
    sys.stdout.flush()
    with open('results/test%i.array' % i, "r") as f:
        frame = [[float(x) for x in line.split()] for line in f]
    final_array.append(frame)
final_array = np.array(final_array)
np.save('%s_contact_matrix' % dataset_name,final_array)
print("")

# concatenate per-frame labels into a single array
print("assembling labels into single array")
n = len(glob.glob('results/test*.dat'))
final_array = []
for i in range(n):
    sys.stdout.write('processing frame %i of %i        \r' % (i+1,n))
    sys.stdout.flush()
    final_array.append(np.loadtxt('results/test%i.dat' % i))
final_array = np.array(final_array)
np.save('%s_labels' % dataset_name,final_array)
print("")

# cleanup intermediate results
print('cleaning up')
shutil.rmtree('./results')