#!/usr/bin/env python
# python imports
import glob
import logging
import os
import re
import shutil
import sys
# our imports
from ample.util import ample_util
from ample.ensembler._ensembler import Cluster
logger = logging.getLogger(__name__)
[docs]class Spickerer(object):
def __init__(self, spicker_exe=None, run_dir=None):
"""Initialise from a dictionary of options"""
if not spicker_exe:
if 'CCP4' in os.environ: spicker_exe = os.path.join(os.environ['CCP4'], 'bin', 'spicker')
else: raise RuntimeError, "Cannot find a spicker executable!"
if not (os.path.exists(spicker_exe) and os.access(spicker_exe, os.X_OK)):
raise RuntimeError, "Cannot find a valid spicker executable: {0}".format(spicker_exe)
self.spicker_exe = spicker_exe
self.run_dir = run_dir
self.results = None
self.cluster_method = 'spicker'
self.score_type = 'rmsd'
return
[docs] def get_length(self, pdb):
pdb = open(pdb)
counter = 0
for line in pdb:
pdb_pattern = re.compile('^ATOM')
pdb_result = pdb_pattern.match(line)
if pdb_result:
atom = line[13:16]
if re.search('CA', atom):
counter += 1
return str(counter)
[docs] def cluster(self, models, num_clusters=10, max_cluster_size=200, run_dir=None, score_type='rmsd', score_matrix=None, nproc=1):
"""Cluster decoys using spicker
Parameters
----------
models : list
A list containing structure decoys
cluster_dir : str
The directory to store the cluster data in
cluster_method_type : str
The method to be used to cluster the decoys
score_type : str
The scoring metric for clustering
num_clusters : int
The number of clusters to produce
max_cluster_size : int
The maximum number of decoys per cluster
cluster_exe : str
The path to the spicker executable
nproc : int
The number of processors to use
score_matrix : str, optional
The path to the score matrix to be used
Returns
-------
list
A list containing the clusters
Raises
------
RuntimeError
No clusters returned by SPICKER
"""
self._cluster(models, run_dir=run_dir, score_type=score_type, score_matrix=score_matrix, nproc=nproc)
ns_clusters = len(self.results)
if ns_clusters == 0: raise RuntimeError('No clusters returned by SPICKER')
if ns_clusters < int(num_clusters):
logger.critical('Requested {0} clusters but SPICKER only found {1} so using {1} clusters'.format(num_clusters, ns_clusters))
num_clusters = ns_clusters
clusters = []
for i in range(num_clusters):
cluster = self.results[i]
# We truncate the list of models to max_cluster_size. This probably
# needs to be redone, because as the models are ordered by their similarity
# to the cluster centroid, we automatically select the 200 most similar
# to the centroid. However if the cluster is large and the models similar
# then when theseus calculates the variances, the variances will be
# representative of the 200, but might not show how the models vary throughout
# the whole cluster, which could provide better information for truncating the models.
#
# Note - hlfsimko - 24.02.16: Maybe we can keep the full length clusters
# and slice the list after truncation?
cluster.num_clusters = ns_clusters
cluster.models = cluster.models[0:max_cluster_size]
clusters.append(cluster)
return clusters
def _cluster(self, models, run_dir=None, score_type='rmsd', score_matrix=None, nproc=1):
"""
Run spicker to cluster the models
"""
owd = os.getcwd()
if run_dir: self.run_dir = os.path.abspath(run_dir)
if not self.run_dir: self.run_dir = os.path.join(owd, 'spicker')
if not os.path.isdir(self.run_dir): os.mkdir(self.run_dir)
os.chdir(self.run_dir)
logger.debug("Running spicker with score_type {0} in directory: {1}".format(score_type, self.run_dir))
logger.debug("Using executable: {0} on {1} processors".format(self.spicker_exe, nproc))
self.score_type = score_type
self.create_input_files(models, score_type=score_type, score_matrix=score_matrix)
# We need special care if we are running with tm scores as we will be using the OPENMP
# version of spicker which requires increasing the stack size on linux and setting the
# OMP_NUM_THREADS environment variable on all platforms
# The stack size on 64-bit linux seems to be 15Mb, so I guess asking for 50 seems reasonable
# I'm assuming that the limit is in bytes and specified by an integer so 50Mb -> 50000000
preexec_fn=None
env = None
if score_type == 'tm':
env = { 'OMP_NUM_THREADS' : str(nproc)}
if sys.platform.lower().startswith('linux'):
def set_stack():
import resource
stack_bytes = 50000000 # 50Mb
#resource.setrlimit(resource.RLIMIT_STACK, (resource.RLIM_INFINITY,resource.RLIM_INFINITY))
resource.setrlimit(resource.RLIMIT_STACK, (stack_bytes,stack_bytes))
preexec_fn=set_stack
logfile = os.path.abspath("spicker.log")
rtn = ample_util.run_command([self.spicker_exe], logfile=logfile, env=env, preexec_fn=preexec_fn)
if not rtn == 0:
raise RuntimeError,"Error running spicker, check logfile: {0}".format(logfile)
# Read the log and generate the results
self.results = self.process_log()
# Always go back to where we started
os.chdir(owd)
return
[docs] def process_log(self, logfile=None):
"""Read the spicker str.txt file and return a list of SpickerResults for each cluster.
We use the R_nat value to order the files in the cluster
"""
if not logfile: logfile = os.path.join(self.run_dir, 'str.txt')
clusterCounts = []
index2rcens = []
# File with the spicker results for each cluster
logger.debug("Processing spicker output file: {0}".format(logfile))
f = open(logfile, 'r')
line = f.readline()
while line:
line = line.strip()
if line.startswith("#Cluster"):
# skip 2 lines to Nstr
f.readline()
f.readline()
line = f.readline().strip()
if not line.startswith("Nstr="):
raise RuntimeError, "Problem reading file: {0}".format(logfile)
ccount = int(line.split()[1])
clusterCounts.append(ccount)
# Loop through this cluster
i2rcen = []
line = f.readline().strip()
while not line.startswith("------"):
fields = line.split()
# i_cl i_str R_nat R_cen E #str traj
# tuple of: ( index in file , distance from centroid )
i2rcen.append((int(fields[5]), float(fields[3])))
line = f.readline().strip()
index2rcens.append(i2rcen)
line = f.readline()
# Sort clusters by the R_cen - distance from cluster centroid
for i, l in enumerate(index2rcens):
# Sort by the distance form the centroid, so first becomes centroid
sorted_by_rcen = sorted(l, key=lambda tup: tup[1])
index2rcens[i] = sorted_by_rcen
# Now map the indices to their files
# Get ordered list of the pdb files
flist = os.path.join(self.run_dir, 'file_list')
pdb_list = [ line.strip() for line in open(flist , 'r') ]
results = []
# create results
num_clusters = len(clusterCounts)
for cluster in range(num_clusters):
result = Cluster()
result.cluster_method = self.cluster_method
result.cluster_score_type = self.score_type
result.index = cluster + 1
result.num_clusters = num_clusters
pdb_file = os.path.join(self.run_dir, "spicker_cluster_{0}.list".format(cluster + 1))
with open(pdb_file, "w") as f:
for i, (idx, rcen) in enumerate(index2rcens[ cluster ]):
pdb = pdb_list[idx - 1]
#if i == 0: result.cluster_centroid = pdb
result.models.append(pdb)
result.r_cen.append(rcen)
f.write(pdb + "\n")
results.append(result)
return results
[docs] def results_summary(self):
"""Summarise the spicker results"""
if not self.results: raise RuntimeError, "Could not find any results!"
rstr = "---- Spicker Results ----\n\n"
for i, r in enumerate(self.results):
rstr += "Cluster: {0}\n".format(i + 1)
rstr += "* number of models: {0}\n".format(r.size)
#rstr += "* files are listed in file: {0}\n".format(r.pdb_file)
rstr += "* centroid model is: {0}\n".format(r.centroid)
rstr += "\n"
return rstr
if __name__ == "__main__":
import argparse
#
# Run Spicker on a directory of PDB files
#
parser = argparse.ArgumentParser()
parser.add_argument('-e', '--executable',
help="spicker executable to use")
parser.add_argument('-m', '--models', required=True,
help="Models to cluster")
parser.add_argument('-s', '--score_type', default='rmsd',
help="Use TM score")
args = parser.parse_args()
models = args.models
if not os.path.exists(models): raise RuntimeError("Cannot find models: {0}".format(models))
if os.path.isdir(models):
models = [ os.path.abspath(m) for m in glob.glob(os.path.join(models, "*.pdb")) ]
elif os.path.isfile(models):
with open(models) as f:
models = [ l.strip() for l in f.readlines() if l.strip() ]
if not len(models):
print "Cannot find any pdbs in: {0}".format(models)
sys.exit(1)
spicker_exe = os.path.abspath(args.executable) if args.executable else None
logging.basicConfig()
logging.getLogger().setLevel(logging.DEBUG)
spicker = Spickerer(spicker_exe=spicker_exe)
spicker.cluster(models, score_type=args.score_type)
print spicker.results_summary()