Source code for ample.util.spicker

#!/usr/bin/env python

# python imports
import glob
import logging
import os
import re
import shutil
import sys

# our imports
from ample.util import ample_util
from ample.ensembler._ensembler import Cluster
    
logger = logging.getLogger(__name__)

class Spickerer(object):

    def __init__(self, spicker_exe=None, run_dir=None):
        """Initialise the Spickerer, locating a spicker executable if none is given."""
        if not spicker_exe:
            if 'CCP4' in os.environ:
                spicker_exe = os.path.join(os.environ['CCP4'], 'bin', 'spicker')
            else:
                raise RuntimeError("Cannot find a spicker executable!")
        if not (os.path.exists(spicker_exe) and os.access(spicker_exe, os.X_OK)):
            raise RuntimeError("Cannot find a valid spicker executable: {0}".format(spicker_exe))
        self.spicker_exe = spicker_exe
        self.run_dir = run_dir
        self.results = None
        self.cluster_method = 'spicker'
        self.score_type = 'rmsd'
    def get_length(self, pdb):
        """Count the CA atoms in a PDB file, returning the count as a string."""
        counter = 0
        with open(pdb) as f:
            for line in f:
                # Columns 14-16 hold the atom name in PDB format
                if line.startswith('ATOM') and 'CA' in line[13:16]:
                    counter += 1
        return str(counter)
    def create_input_files(self, models, score_type='rmsd', score_matrix=None):
        """Create the input files required to run spicker.

        (See the notes in the spicker.f FORTRAN file for a description of the
        required files.)
        """
        if not len(models):
            msg = "no models provided!"
            logger.critical(msg)
            raise RuntimeError(msg)

        if score_type:
            score_type = score_type.lower()
        logger.debug("Using score_type: {0}".format(score_type))

        if score_type == 'read_matrix':
            if not (score_matrix and os.path.isfile(score_matrix)):
                msg = 'Cannot find score_matrix: {0}'.format(score_matrix)
                logger.critical(msg)
                raise RuntimeError(msg)
            logger.debug("Using score_matrix: {0}".format(score_matrix))
            shutil.copy(score_matrix, os.path.join(self.run_dir, 'score.matrix'))
        # elif score_type == 'tm':
        #     # Create file so spicker knows to calculate TM scores
        #     with open('TM.score', 'w') as f:
        #         f.write('\n')
        # elif score_type == 'rmsd':
        #     pass
        # else:
        #     raise RuntimeError("Unknown score_type: {0}".format(score_type))

        # rep1.tra1 - input file for spicker with the coordinates of the CA atoms for
        # each of the PDB structures
        #
        # file_list - a list of the full paths of all PDBs - used so we can loop
        # through it and copy the selected ones to the relevant directory after we
        # have run spicker - the order of these must match the order of the
        # structures in the rep1.tra1 file
        list_string = ''
        counter = 0
        pattern = re.compile(r'^ATOM\s*(\d*)\s*(\w*)\s*(\w*)\s*(\w)\s*(\d*)\s*(.\d*.\d*)\s*(.\d*.\d*)\s*(.\d*.\d*)\s*(.\d*.\d*)')
        with open('rep1.tra1', "w") as read_out, open('file_list', "w") as file_list:
            for infile in models:
                pdbname = os.path.basename(infile)
                file_list.write(infile + '\n')
                list_string = list_string + pdbname + '\n'
                counter += 1
                length = self.get_length(infile)
                # 1st field is length, 2nd energy, 3rd & 4th don't seem to be used for anything
                read_out.write('\t' + length + '\t926.917 ' + str(counter) + ' ' + str(counter) + '\n')
                with open(infile) as read:
                    # Write out the coordinates of the CA atoms
                    for line in read:
                        result = re.match(pattern, line)
                        if result:
                            split = re.split(pattern, line)
                            if split[2] == 'CA':
                                read_out.write(' ' + split[6] + ' ' + split[7] + ' ' + split[8] + '\n')

        # from spicker.f
        # * 'rmsinp'---Mandatory, length of protein & piece for RMSD calculation;
        # NB: this uses the length of the last model processed above - all decoys
        # are assumed to have the same length
        with open('rmsinp', "w") as rmsinp:
            rmsinp.write('1 ' + length + '\n\n')
            rmsinp.write(length + '\n')

        # make tra.in
        # from spicker.f
        # * 'tra.in'---Mandatory, list of trajectory names used for clustering.
        # * In the first line of 'tra.in', there are 3 parameters:
        # *   par1: number of decoy files
        # *   par2: 1, default cutoff, best for decoys from template-based modeling;
        # *         -1, cutoff based on variation, best for decoys from ab initio modeling;
        # *         -2, use TM scores for clustering
        # *   par3: 1, closc from all decoys; -1, closc from clustered decoys
        # * From the second line on are the file names which contain coordinates
        # * of the 3D structure decoys. All these files are mandatory.
        par2 = '-2' if score_type == 'tm' else '-1'
        with open('tra.in', "w") as tra:
            tra.write('1 {0} 1 \nrep1.tra1\n'.format(par2))

        # Create the file with the sequence of the PDB structures
        # from spicker.f
        # * 'seq.dat'--Mandatory, sequence file, for output of PDB models.
        seq_pattern = re.compile(r'^ATOM\s*(\d*)\s*(\w*)\s*(\w*)\s*(\w)\s*(\d*)\s*(\d*)\s')
        with open('seq.dat', "w") as seq, open(models[0], 'r') as a_pdb:
            for line in a_pdb:
                result = re.match(seq_pattern, line)
                if result:
                    split = re.split(seq_pattern, line)
                    if split[2] == 'CA':
                        seq.write('\t' + split[5] + '\t' + split[3] + '\n')
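    # For orientation, the input files written above look roughly like this for a
    # 97-residue target clustered by RMSD (values are illustrative only, inferred
    # from the write calls in create_input_files, not taken from a real run):
    #
    #   tra.in:     1 -1 1            <- one decoy file, variation-based cutoff, closc from all decoys
    #               rep1.tra1
    #
    #   rmsinp:     1 97              <- residue range for the RMSD calculation
    #
    #               97                <- protein length
    #
    #   rep1.tra1:  97  926.917 1 1   <- length, placeholder energy, decoy index (twice)
    #               x y z             <- one CA coordinate line per residue
    #               ...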
    def cluster(self, models, num_clusters=10, max_cluster_size=200,
                run_dir=None, score_type='rmsd', score_matrix=None, nproc=1):
        """Cluster decoys using spicker

        Parameters
        ----------
        models : list
           A list containing structure decoys
        num_clusters : int
           The number of clusters to produce
        max_cluster_size : int
           The maximum number of decoys per cluster
        run_dir : str, optional
           The directory to run spicker in and store the cluster data
        score_type : str
           The scoring metric for clustering
        score_matrix : str, optional
           The path to the score matrix to be used
        nproc : int
           The number of processors to use

        Returns
        -------
        list
           A list containing the clusters

        Raises
        ------
        RuntimeError
           If no clusters are returned by SPICKER
        """
        self._cluster(models, run_dir=run_dir, score_type=score_type, score_matrix=score_matrix, nproc=nproc)
        ns_clusters = len(self.results)
        if ns_clusters == 0:
            raise RuntimeError('No clusters returned by SPICKER')
        if ns_clusters < int(num_clusters):
            logger.critical('Requested {0} clusters but SPICKER only found {1} so using {1} clusters'.format(num_clusters, ns_clusters))
            num_clusters = ns_clusters
        clusters = []
        for i in range(num_clusters):
            cluster = self.results[i]
            # We truncate the list of models to max_cluster_size. This probably needs
            # to be redone: as the models are ordered by their similarity to the
            # cluster centroid, we automatically select the max_cluster_size models
            # most similar to the centroid. However, if the cluster is large and the
            # models similar, then when theseus calculates the variances they will be
            # representative of the truncated set, but might not show how the models
            # vary throughout the whole cluster, which could provide better
            # information for truncating the models.
            #
            # Note - hlfsimko - 24.02.16: Maybe we can keep the full-length clusters
            # and slice the list after truncation?
            cluster.num_clusters = ns_clusters
            cluster.models = cluster.models[0:max_cluster_size]
            clusters.append(cluster)
        return clusters
    def _cluster(self, models, run_dir=None, score_type='rmsd', score_matrix=None, nproc=1):
        """Run spicker to cluster the models"""
        owd = os.getcwd()
        if run_dir:
            self.run_dir = os.path.abspath(run_dir)
        if not self.run_dir:
            self.run_dir = os.path.join(owd, 'spicker')
        if not os.path.isdir(self.run_dir):
            os.mkdir(self.run_dir)
        os.chdir(self.run_dir)
        logger.debug("Running spicker with score_type {0} in directory: {1}".format(score_type, self.run_dir))
        logger.debug("Using executable: {0} on {1} processors".format(self.spicker_exe, nproc))

        self.score_type = score_type
        self.create_input_files(models, score_type=score_type, score_matrix=score_matrix)

        # We need to take special care if we are running with TM scores, as we will be
        # using the OpenMP version of spicker, which requires increasing the stack size
        # on Linux and setting the OMP_NUM_THREADS environment variable on all platforms.
        # The stack size on 64-bit Linux seems to be 15Mb, so asking for 50Mb seems
        # reasonable. The limit is specified in bytes, so 50Mb -> 50000000.
        preexec_fn = None
        env = None
        if score_type == 'tm':
            env = {'OMP_NUM_THREADS': str(nproc)}
            if sys.platform.lower().startswith('linux'):
                def set_stack():
                    import resource
                    stack_bytes = 50000000  # 50Mb
                    # resource.setrlimit(resource.RLIMIT_STACK, (resource.RLIM_INFINITY, resource.RLIM_INFINITY))
                    resource.setrlimit(resource.RLIMIT_STACK, (stack_bytes, stack_bytes))
                preexec_fn = set_stack

        logfile = os.path.abspath("spicker.log")
        rtn = ample_util.run_command([self.spicker_exe], logfile=logfile, env=env, preexec_fn=preexec_fn)
        if rtn != 0:
            raise RuntimeError("Error running spicker, check logfile: {0}".format(logfile))

        # Read the log and generate the results
        self.results = self.process_log()

        # Always go back to where we started
        os.chdir(owd)
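    # For reference, preexec_fn is presumably forwarded by ample_util.run_command to
    # subprocess.Popen, where it runs in the child process just before exec - which
    # is why the raised stack limit applies to spicker rather than to the parent
    # Python process. A minimal standalone sketch of the same technique (the
    # 'spicker' command here is a placeholder):
    #
    #   import resource
    #   import subprocess
    #
    #   def set_stack():
    #       # Raise the stack limit to 50Mb in the child process only
    #       resource.setrlimit(resource.RLIMIT_STACK, (50000000, 50000000))
    #
    #   subprocess.check_call(['spicker'], preexec_fn=set_stack,
    #                         env={'OMP_NUM_THREADS': '4'})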
    def process_log(self, logfile=None):
        """Read the spicker str.txt file and return a list of Cluster objects, one per cluster.

        The R_cen value (distance from the cluster centroid) is used to order the
        files in each cluster.
        """
        if not logfile:
            logfile = os.path.join(self.run_dir, 'str.txt')
        # File with the spicker results for each cluster
        logger.debug("Processing spicker output file: {0}".format(logfile))

        clusterCounts = []
        index2rcens = []
        f = open(logfile, 'r')
        line = f.readline()
        while line:
            line = line.strip()
            if line.startswith("#Cluster"):
                # Skip 2 lines to Nstr
                f.readline()
                f.readline()
                line = f.readline().strip()
                if not line.startswith("Nstr="):
                    raise RuntimeError("Problem reading file: {0}".format(logfile))
                ccount = int(line.split()[1])
                clusterCounts.append(ccount)

                # Loop through this cluster
                i2rcen = []
                line = f.readline().strip()
                while not line.startswith("------"):
                    fields = line.split()
                    # i_cl i_str R_nat R_cen E #str traj
                    # tuple of: (index in file, distance from centroid)
                    i2rcen.append((int(fields[5]), float(fields[3])))
                    line = f.readline().strip()
                index2rcens.append(i2rcen)
            line = f.readline()
        f.close()

        # Sort the decoys in each cluster by R_cen - the distance from the cluster centroid
        for i, l in enumerate(index2rcens):
            # Sort by the distance from the centroid, so the first entry is the centroid
            index2rcens[i] = sorted(l, key=lambda tup: tup[1])

        # Now map the indices to their files
        # Get the ordered list of the pdb files
        flist = os.path.join(self.run_dir, 'file_list')
        pdb_list = [line.strip() for line in open(flist, 'r')]

        # Create the results
        results = []
        num_clusters = len(clusterCounts)
        for cluster in range(num_clusters):
            result = Cluster()
            result.cluster_method = self.cluster_method
            result.cluster_score_type = self.score_type
            result.index = cluster + 1
            result.num_clusters = num_clusters
            pdb_file = os.path.join(self.run_dir, "spicker_cluster_{0}.list".format(cluster + 1))
            with open(pdb_file, "w") as f:
                for i, (idx, rcen) in enumerate(index2rcens[cluster]):
                    pdb = pdb_list[idx - 1]
                    # if i == 0: result.cluster_centroid = pdb
                    result.models.append(pdb)
                    result.r_cen.append(rcen)
                    f.write(pdb + "\n")
            results.append(result)
        return results
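    # A sketch of the str.txt layout that process_log expects, inferred from the
    # parsing logic above rather than from the spicker documentation (field values
    # are illustrative):
    #
    #   #Cluster 1
    #   <line skipped by the parser>
    #   <line skipped by the parser>
    #   Nstr=   25
    #   1   14   3.10   0.82   -120.0   14   1    <- i_cl i_str R_nat R_cen E #str traj
    #   ...one row per decoy; only R_cen (column 4) and #str (column 6) are read...
    #   ------------------------------------------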
    def results_summary(self):
        """Summarise the spicker results"""
        if not self.results:
            raise RuntimeError("Could not find any results!")
        rstr = "---- Spicker Results ----\n\n"
        for i, r in enumerate(self.results):
            rstr += "Cluster: {0}\n".format(i + 1)
            rstr += "* number of models: {0}\n".format(r.size)
            # rstr += "* files are listed in file: {0}\n".format(r.pdb_file)
            rstr += "* centroid model is: {0}\n".format(r.centroid)
            rstr += "\n"
        return rstr
if __name__ == "__main__":
    import argparse

    # Run Spicker on a directory of PDB files
    parser = argparse.ArgumentParser()
    parser.add_argument('-e', '--executable', help="spicker executable to use")
    parser.add_argument('-m', '--models', required=True, help="Models to cluster")
    parser.add_argument('-s', '--score_type', default='rmsd', help="Score type to cluster with (e.g. rmsd or tm)")
    args = parser.parse_args()

    models = args.models
    if not os.path.exists(models):
        raise RuntimeError("Cannot find models: {0}".format(models))
    if os.path.isdir(models):
        models = [os.path.abspath(m) for m in glob.glob(os.path.join(models, "*.pdb"))]
    elif os.path.isfile(models):
        with open(models) as f:
            models = [l.strip() for l in f.readlines() if l.strip()]
    if not len(models):
        print("Cannot find any pdbs in: {0}".format(args.models))
        sys.exit(1)

    spicker_exe = os.path.abspath(args.executable) if args.executable else None

    logging.basicConfig()
    logging.getLogger().setLevel(logging.DEBUG)

    spicker = Spickerer(spicker_exe=spicker_exe)
    spicker.cluster(models, score_type=args.score_type)
    print(spicker.results_summary())
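# A minimal programmatic usage sketch (the decoy path is hypothetical; this assumes
# spicker can be found under $CCP4/bin so no explicit executable needs to be given):
#
#   import glob
#   from ample.util.spicker import Spickerer
#
#   decoys = glob.glob('/path/to/decoys/*.pdb')  # hypothetical decoy directory
#   spickerer = Spickerer()
#   clusters = spickerer.cluster(decoys, num_clusters=3, score_type='rmsd')
#   for c in clusters:
#       print(c.centroid, c.size)
#   print(spickerer.results_summary())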