# Source code for ample.ensembler.subcluster

"""Subcluster core module"""

__author__ = "Jens Thomas, and Felix Simkovic"
__date__ = "01 Oct 2016"
__version__ = "1.0"

from collections import namedtuple
import itertools
import logging
import mmtbx.superpose
import numpy
import re
import os
import shutil

from ample.util import ample_util
from ample.util import pdb_edit

logger = logging.getLogger()

# Default output file names used by the clusterers in this module
SCORE_MATRIX_NAME = 'score.matrix'
FILE_LIST_NAME = 'files.list'


class SubClusterer(object):
    """Base class for clustering pdbs by distance.

    Sub-classes just need to provide a ``generate_distance_matrix`` method
    that sets ``self.distance_matrix`` (square matrix of pairwise distances)
    and ``self.index2pdb`` (sequence mapping a matrix index to its pdb).
    """

    def __init__(self, executable=None, nproc=1):
        """Store the settings shared by all sub-clusterers.

        Parameters
        ----------
        executable : str, optional
            Path to the external clustering program.
        nproc : int
            Number of processors/threads handed to the external program.

        Raises
        ------
        RuntimeError
            If ``executable`` is given but is not an existing, executable file.
        """
        # The executable must both exist AND be executable; the negation has
        # to cover the whole conjunction, otherwise the check can never fire.
        if executable and not (os.path.exists(executable) and os.access(executable, os.X_OK)):
            msg = "Cannot find subclusterer executable: {0}".format(executable)
            raise RuntimeError(msg)
        self.executable = executable
        self.nproc = nproc
        self.distance_matrix = None
        self.index2pdb = []
        self.cluster_score = None

    def generate_distance_matrix(self, *args, **kwargs):
        """Populate ``distance_matrix`` and ``index2pdb``; provided by sub-classes."""
        raise NotImplementedError("Sub-classes must implement generate_distance_matrix")

    def cluster_by_radius(self, radius):
        """Return a list of pdbs clustered by the given radius.

        The score of the cluster is stored in ``self.cluster_score``.

        Returns
        -------
        list or None
            The pdbs of the largest cluster, or None if no model has a
            neighbour within ``radius``.

        Raises
        ------
        RuntimeError
            If no distance matrix has been generated yet.
        """
        if self.distance_matrix is None:
            raise RuntimeError("Need to call generate_distance_matrix before cluster_by_radius!")
        cluster_indices, cluster_score = self._cluster_indices(radius)
        self.cluster_score = cluster_score
        if cluster_indices:
            return [self.index2pdb[i] for i in cluster_indices]
        return None

    def _cluster_indices(self, thresh):
        """Return the indices of the largest cluster that have distances <= thresh.

        For each row (pdb) of the distance matrix we count how many other pdbs
        are within ``thresh`` of it; the first row with the highest count
        defines the cluster.

        Returns
        -------
        tuple
            ``(sorted indices, cluster score)`` or ``(None, None)`` when the
            best row has no neighbours.
        """
        thresh = float(thresh)
        matrix = numpy.asarray(self.distance_matrix)
        if matrix.size == 0:
            return None, None
        # Mask of all elements within the threshold. We exclude 0.0 so that a
        # row never selects itself (the diagonal). This also excludes models
        # with an rmsd of exactly zero to the row model, but as these are
        # likely to be identical (and the occurrence rare) this should be ok.
        condition = numpy.logical_and(matrix <= thresh, matrix != 0.0)
        # Number of neighbours per ROW (axis=1), so that the count matches the
        # row we subsequently read the members from.
        neighbours = condition.sum(axis=1)
        # argmax returns the first row holding the maximum count
        row_index = int(numpy.argmax(neighbours))
        # Members of that row with the row itself inserted as the first entry
        max_cluster = numpy.insert(numpy.where(condition[row_index])[0], 0, row_index)
        if len(max_cluster) == 1:
            return None, None
        cluster_score = self.calculate_score(max_cluster)
        # NOTE(review): sorting discards the "row model first" ordering built
        # above; kept because callers may rely on the sorted order.
        return sorted(max_cluster), cluster_score

    def calculate_score(self, cluster, all_by_all=True):
        """Given a list of indices of a cluster, calculate the rmsd we want to give to phaser.

        Parameters
        ----------
        cluster : sequence of int
            Matrix indices of the cluster members; the first is the centroid.
        all_by_all : bool
            If True (default) consider the distances of all members with each
            other, otherwise only the distances of the members to the first
            entry (assumes the centroid approximates the native).

        Returns
        -------
        The largest of the distances considered.
        """
        matrix = numpy.asarray(self.distance_matrix)
        if all_by_all:
            rmsds = [matrix[pair] for pair in itertools.combinations(cluster, 2)]
        else:
            centroid = cluster[0]
            rmsds = [matrix[centroid][j] for j in cluster[1:]]
        return max(rmsds)

    def dump_raw_matrix(self, file_name):
        """Write the distance matrix to ``file_name`` as comma-separated rows."""
        with open(file_name, 'w') as f:
            for row in self.distance_matrix:
                f.write(",".join(map(str, row)) + "\n")
            f.write("\n")
        return

    def dump_pdb_matrix(self, file_name=SCORE_MATRIX_NAME, offset=0):
        """Write the upper triangle (including diagonal) as "i j distance" lines.

        Parameters
        ----------
        file_name : str
            File to write to.
        offset : int
            Value added to the matrix indices in the output.

        Returns
        -------
        str
            Absolute path of the file written.
        """
        with open(file_name, 'w') as f:
            end = len(self.distance_matrix) + offset
            for i in range(offset, end):
                for j in range(i, end):
                    f.write("{0: > 4d} {1: > 4d} {2: > 8.3F}\n".format(
                        i, j, self.distance_matrix[i - offset][j - offset]))
            f.write("\n")
        return os.path.abspath(file_name)
class CctbxClusterer(SubClusterer):
    """Class to cluster files with cctbx (mmtbx.superpose)"""

    def generate_distance_matrix(self, pdb_list):
        """Run cctbx to generate the distance matrix.

        Every pair of models is superposed with ``mmtbx.superpose`` (preset
        'ca') and the resulting rmsd is stored symmetrically in
        ``self.distance_matrix``.

        Parameters
        ----------
        pdb_list : list of str
            The pdb files to compare.

        Raises
        ------
        RuntimeError
            If ``pdb_list`` is empty.
        """
        num_models = len(pdb_list)
        if not num_models:
            msg = "generate_distance_matrix got empty pdb_list!"
            logging.critical(msg)
            raise RuntimeError(msg)

        # Index is just the order of the pdb in the list
        self.index2pdb = pdb_list

        # Create a square matrix storing the rmsd distances between models
        self.distance_matrix = numpy.zeros([num_models, num_models])

        # Carry the positions along with the models instead of looking them up
        # with list.index(), which is O(n) per lookup and returns the wrong
        # position if the same path occurs more than once.
        for (i, m1), (j, m2) in itertools.combinations(enumerate(pdb_list), 2):
            fixed = mmtbx.superpose.SuperposePDB(m1, preset='ca', log=None, quiet=True)
            moving = mmtbx.superpose.SuperposePDB(m2, preset='ca', log=None, quiet=True)
            rmsd, _ = moving.superpose(fixed)
            self.distance_matrix[i, j] = self.distance_matrix[j, i] = float(rmsd)
        return
class FpcClusterer(SubClusterer):
    """Class to cluster files with fast_protein_clusterer"""

    def generate_distance_matrix(self, pdb_list):
        """Run fast_protein_cluster and read its text matrix into ``self.distance_matrix``.

        Parameters
        ----------
        pdb_list : list of str
            The pdb files to compare.

        Raises
        ------
        RuntimeError
            If fast_protein_cluster exits with a non-zero return code.
        """
        # Create list of pdb files
        fname = os.path.join(os.getcwd(), FILE_LIST_NAME)
        with open(fname, 'w') as f:
            f.write("\n".join(pdb_list) + "\n")

        # Index is just the order of the pdb in the file
        self.index2pdb = pdb_list

        # Run fast_protein_cluster - this is just to generate the distance matrix, but there
        # doesn't seem to be a way to stop it clustering as well - not a problem as it just
        # generates more files
        log_name = os.path.abspath("fast_protein_cluster.log")
        matrix_file = "fpc.matrix"
        cmd = [self.executable, "--cluster_write_text_matrix", matrix_file, "-i", fname]
        retcode = ample_util.run_command(cmd, logfile=log_name)
        if retcode != 0:
            msg = "non-zero return code for fast_protein_cluster in generate_distance_matrix!\nCheck logfile:{0}".format(log_name)
            logging.critical(msg)
            raise RuntimeError(msg)

        # Each line is read as "<index1> <index2> <distance>"
        mlen = 0
        data = []
        with open(matrix_file) as f:
            for line in f:
                fields = line.split()
                if not fields:
                    continue  # tolerate blank lines
                x = int(fields[0])
                y = int(fields[1])
                d = float(fields[2])
                # +1 as we want the length; consider both indices so the matrix
                # is large enough even if only one triangle is listed
                mlen = max(mlen, x + 1, y + 1)
                data.append((x, y, d))

        # Use a zero-filled square matrix to make indexing easier as we're
        # unlikely to be very big
        m = numpy.zeros([mlen, mlen])

        # Fill in all values (upper triangle)
        for i, j, d in data:
            if i > j:
                m[j][i] = d
            else:
                m[i][j] = d

        # Mirror the upper triangle into the lower one
        lower = numpy.tril_indices(mlen, -1)
        m[lower] = m.T[lower]

        self.distance_matrix = m
        return
class GesamtClusterer(SubClusterer):
    """Class to cluster files with Gesamt"""

    def generate_distance_matrix(self, pdb_list, purge=False):
        """Generate the distance matrix from gesamt's all-by-all pairwise rmsds.

        Parameters
        ----------
        pdb_list : list of str
            The pdb files to compare.
        purge : bool
            Remove the gesamt input list and log file afterwards.
        """
        # The archive-based alternative is _generate_distance_matrix_generic
        self._generate_pairwise_rmsd_matrix(pdb_list, purge=purge)
        return

    def _generate_pairwise_rmsd_matrix(self, models, purge=False):
        """Use gesamt to generate an all-by-all pairwise rmsd matrix of a list of pdb models.

        Notes
        -----
        gesamt -input-list inp_list.dat -sheaf-x

        where inp_list.dat contains::

            1ADZ.pdb -s /1/A
            1ADZ.pdb -s /2/A
            1ADZ.pdb -s /3/A

        Raises
        ------
        RuntimeError
            If gesamt returns a non-zero code or the log cannot be parsed.
        """
        # Index is just the order of the pdb in the file
        self.index2pdb = models

        # Create file with list of pdbs and model/chain
        glist = 'gesamt_models.dat'
        with open(glist, 'w') as w:
            for m in models:
                w.write("{0} -s /1/A \n".format(m))
            w.write('\n')

        cmd = [self.executable, '-input-list', glist, '-sheaf-x', '-nthreads={0}'.format(self.nproc)]
        logfile = os.path.abspath('gesamt_archive.log')
        rtn = ample_util.run_command(cmd, logfile)
        if rtn != 0:
            raise RuntimeError("Error running gesamt - check logfile: {0}".format(logfile))

        # Create a square distance_matrix num_models in size filled with zeros
        num_models = len(models)
        self.distance_matrix = numpy.zeros([num_models, num_models])

        # Read in the rmsds calculated
        self._parse_gesamt_rmsd_log(logfile, num_models)

        if purge:
            os.unlink(glist)
            os.unlink(logfile)
        return

    def _parse_gesamt_rmsd_log(self, logfile, num_models):
        """Copy the CROSS-RMSDs table of a gesamt log into ``self.distance_matrix``.

        Table rows are split on '|': the first field is read as the 1-based
        model number and the third as the whitespace-separated rmsds.

        Raises
        ------
        RuntimeError
            If the last model number read is not ``num_models``.
        """
        # -1: searching for the table, 0: skipping the line after the title,
        #  1: reading table rows
        reading = -1
        nmodel = 0
        with open(logfile) as f:
            for line in f:
                if line.startswith(' ===== CROSS-RMSDs') or reading == 0:
                    # find start of RMSDS and skip blank line
                    reading += 1
                    continue
                if reading == 1:
                    fields = line.strip().split('|')
                    nmodel = int(fields[0])
                    row = nmodel - 1  # model numbers are 1-based, the matrix is 0-based
                    rmsds = [float(r) for r in fields[2].split()]
                    # poke into distance matrix
                    for j, rmsd in enumerate(rmsds):
                        if j == row:
                            continue  # leave the diagonal at zero
                        self.distance_matrix[row][j] = rmsd
                    if nmodel == num_models:
                        reading = -1
        if nmodel != num_models:
            raise RuntimeError("Could not generate distance matrix with gesamt")
        return

    def _generate_distance_matrix_generic(self, models, purge=True, purge_all=False, metric='qscore'):
        """Build the matrix by querying each model against a gesamt archive of all models.

        Parameters
        ----------
        models : list of str
            The pdb files to compare; all must be in the same directory.
        purge : bool
            Remove the per-model log files and the gesamt archive.
        purge_all : bool
            Additionally remove the archive log and per-model gesamt output.
        metric : str
            'rmsd' or 'qscore' - which gesamt value to store in the matrix.

        Raises
        ------
        RuntimeError
            For an empty model list, models in different directories, an
            unknown metric, a failing gesamt run or unexpected gesamt output.
        """
        if not models:
            raise RuntimeError("No models supplied!")

        # Make sure all the files are in the same directory otherwise we won't work
        mdir = os.path.dirname(models[0])
        if not all([os.path.dirname(p) == mdir for p in models]):
            raise RuntimeError("All pdb files are not in the same directory!")

        # Create list of pdb files
        fname = os.path.join(os.getcwd(), FILE_LIST_NAME)
        with open(fname, 'w') as f:
            f.write("\n".join(models) + "\n")

        # Index is just the order of the pdb in the file
        self.index2pdb = models
        nmodels = len(models)

        # Make the archive
        logger.debug("Generating gesamt archive from models in directory %s", mdir)
        garchive = 'gesamt.archive'
        if not os.path.isdir(garchive):
            os.mkdir(garchive)
        logfile = os.path.abspath('gesamt_archive.log')
        cmd = [self.executable, '--make-archive', garchive, '-pdb', mdir]
        cmd += ['-nthreads={0}'.format(self.nproc)]
        # HACK FOR DYLD!!!!
        env = None
        #env = {'DYLD_LIBRARY_PATH' : '/opt/ccp4-devtools/install/lib'}
        rtn = ample_util.run_command(cmd, logfile, env=env)
        if rtn != 0:
            raise RuntimeError("Error running gesamt - check logfile: {0}".format(logfile))

        if purge_all:
            os.unlink(logfile)

        # Value of a model compared with itself
        if metric == 'rmsd':
            parity = 0.0
        elif metric == 'qscore':
            parity = 1.0
        else:
            raise RuntimeError("Unrecognised metric: {0}".format(metric))

        # Force a float matrix: an integer fill value would give an integer
        # array that truncates the scores assigned below.
        m = numpy.full([nmodels, nmodels], parity, dtype=float)

        # Now loop through each file creating the matrix
        for i, model in enumerate(models):
            mname = os.path.basename(model)
            gesamt_out = '{0}_gesamt.out'.format(mname)
            logfile = '{0}_gesamt.log'.format(mname)
            cmd = [self.executable, model, '-archive', garchive, '-o', gesamt_out]
            cmd += ['-nthreads={0}'.format(self.nproc)]
            rtn = ample_util.run_command(cmd, logfile)
            if rtn != 0:
                raise RuntimeError("Error running gesamt!")
            if purge:
                os.unlink(logfile)

            gdata = self._parse_gesamt_out(gesamt_out)
            # The first hit is expected to be the query model itself
            if gdata[0].file_name != mname:
                raise RuntimeError(gdata[0].file_name + " " + mname)
            score_dict = {g.file_name: (g.rmsd, g.q_score) for g in gdata}

            for j in range(i + 1, nmodels):
                # Try and get the rmsd and qscore for this model. If it's missing we assume the model was
                # too divergent for gesamt to find it and we set the rmsd and qscore to fixed values
                model2 = os.path.basename(models[j])
                try:
                    rmsd, qscore = score_dict[model2]
                except KeyError:
                    # NOTE(review): RMSD_MAX and QSCORE_MIN are not defined in
                    # the visible part of this module - confirm they exist at
                    # module level, otherwise this raises NameError.
                    rmsd = RMSD_MAX
                    qscore = QSCORE_MIN
                m[i][j] = rmsd if metric == 'rmsd' else qscore

            if purge_all:
                os.unlink(gesamt_out)

        # Copy upper half of matrix to lower
        lower = numpy.tril_indices(nmodels, -1)
        m[lower] = m.T[lower]

        self.distance_matrix = m

        # Remove the gesamt archive
        if purge:
            shutil.rmtree(garchive)

        # Write out the matrix in a form spicker can use
        self.dump_pdb_matrix(SCORE_MATRIX_NAME)
        return

    def _parse_gesamt_out(self, out_file):
        """Parse a gesamt archive-search output file into a list of GesamtData tuples.

        The first two lines are treated as headers and blank lines are
        ignored. The file_name field is reduced to its basename.

        Raises
        ------
        RuntimeError
            If no data lines could be read.
        """
        # Assumption is there are no pdb_codes
        GesamtData = namedtuple('GesamtData', ['count', 'chain_id', 'q_score', 'rmsd', 'seq_id',
                                               'nalign', 'nres', 'file_name'])
        data = []
        with open(out_file) as f:
            for i, line in enumerate(f):
                if i < 2:
                    continue  # First 2 lines are headers
                if not line.strip():
                    continue  # ignore blanks
                try:
                    tmp = GesamtData(*line.split())
                    # Convert from strings to correct types
                    data.append(GesamtData(int(tmp.count), tmp.chain_id, float(tmp.q_score),
                                           float(tmp.rmsd), tmp.seq_id, int(tmp.nalign),
                                           int(tmp.nres), os.path.basename(tmp.file_name)))
                except Exception as e:
                    # Format the exception itself: e.message does not exist on Python 3
                    msg = 'Error parsing line {0}: {1}\n{2}'.format(i, line, e)
                    logging.critical(msg)
                    raise

        if not data:
            raise RuntimeError("Failed to read any data!")
        return data
class LsqkabClusterer(SubClusterer):
    """Class to cluster files with Lsqkab"""

    def calc_rmsd(self, model1, model2, nresidues=None, logfile='lsqkab.out', purge=False):
        """Run lsqkab on two models and return the rmsd parsed from its log.

        Parameters
        ----------
        model1, model2 : str
            The pdb files passed as XYZINM and XYZINF respectively.
        nresidues : int, optional
            Number of residues to fit; determined from ``model1`` if not given.
        logfile : str
            File the lsqkab output is written to.
        purge : bool
            Remove ``logfile`` and the 'RMSTAB' file afterwards.
        """
        if not nresidues:
            _, nresidues = pdb_edit.num_atoms_and_residues(model1, first=True)

        # lsqkab reads keyworded input, one command per line
        stdin = "\n".join([
            "FIT RESIDUE CA 1 TO {0} CHAIN {1}",
            "MATCH 1 to {0} CHAIN {1}",
            "output RMS",
            "end",
        ]).format(nresidues, 'A')
        cmd = ['lsqkab', 'XYZINM', model1, 'XYZINF', model2]
        ample_util.run_command(cmd, logfile=logfile, stdin=stdin)
        rmsd = self.parse_lsqkab_output(logfile)

        # cleanup
        if purge:
            os.unlink(logfile)
            os.unlink('RMSTAB')
        return rmsd

    def generate_distance_matrix(self, models, purge=True, metric='qscore'):
        """Generate the all-by-all rmsd matrix by running lsqkab on every pair of models.

        Parameters
        ----------
        models : list of str
            The pdb files to compare.
        purge : bool
            Remove the lsqkab output files when done.
        metric : str
            Unused; kept for interface compatibility.
        """
        # Index is just the order of the pdb in the file
        self.index2pdb = models
        num_models = len(models)

        # Assume all models are the same size and only have a single chain
        # We also assume that the chain is called 'A' (not relevant here)
        _, nresidues = pdb_edit.num_atoms_and_residues(models[0], first=True)

        # Create a square distance_matrix num_models in size; the zero
        # diagonal is the distance of a model to itself
        self.distance_matrix = numpy.zeros([num_models, num_models])
        logfile = 'lsqkab.out'
        for i, fixed in enumerate(models):
            for j in range(i + 1, num_models):
                self.distance_matrix[i][j] = self.calc_rmsd(fixed, models[j], nresidues=nresidues,
                                                            logfile=logfile)

        # Clean up output files from lsqkab. With fewer than two models lsqkab
        # never ran, so only remove what is actually there.
        if purge:
            for output in (logfile, 'RMSTAB'):
                if os.path.exists(output):
                    os.unlink(output)

        # Copy in other half of matrix - we use a full matrix as it's easier to scan for clusters
        lower = numpy.tril_indices(num_models, -1)
        self.distance_matrix[lower] = self.distance_matrix.T[lower]
        return

    def parse_lsqkab_output(self, output_file):
        """Return the rmsd given on the "RMS XYZ DISPLACEMENT" line of an lsqkab log.

        Raises
        ------
        RuntimeError
            If the file contains no such line.
        """
        with open(output_file) as f:
            for line in f:
                if line.startswith(" RMS XYZ DISPLACEMENT ="):
                    return float(line.split()[4])
        # An explicit exception rather than an assert, which is stripped under -O
        raise RuntimeError("Could not find RMS XYZ DISPLACEMENT in lsqkab output: {0}".format(output_file))
class MaxClusterer(SubClusterer):
    """Class to cluster files with maxcluster"""

    def generate_distance_matrix(self, pdb_list):
        """Run maxcluster to generate the distance matrix.

        ``self.index2pdb`` is filled from the model paths found in the
        maxcluster log.

        Parameters
        ----------
        pdb_list : list of str
            The pdb files to compare.

        Raises
        ------
        RuntimeError
            If ``pdb_list`` is empty, maxcluster returns a non-zero code or a
            model comparison line of the log cannot be parsed.
        """
        num_models = len(pdb_list)
        if not num_models:
            msg = "generate_distance_matrix got empty pdb_list!"
            logging.critical(msg)
            raise RuntimeError(msg)

        self.index2pdb = [0] * num_models

        # Maxcluster arguments
        # -l [file]   File containing a list of PDB model fragments
        # -L [n]      Log level (default is 4 for single MaxSub, 1 for lists)
        # -d [f]      The distance cut-off for search (default auto-calibrate)
        # -bb         Perform RMSD fit using backbone atoms
        # -C [n]      Cluster method: 0 - No clustering
        # -rmsd ???

        # Create the list of files for maxcluster
        fname = os.path.join(os.getcwd(), FILE_LIST_NAME)
        with open(fname, 'w') as f:
            f.write("\n".join(pdb_list) + "\n")

        log_name = os.path.abspath("maxcluster.log")
        cmd = [self.executable, "-l", fname, "-L", "4", "-rmsd", "-d", "1000", "-bb", "-C0"]
        retcode = ample_util.run_command(cmd, logfile=log_name)
        if retcode != 0:
            msg = "non-zero return code for maxcluster in generate_distance_matrix!\nSee logfile: {0}".format(log_name)
            logging.critical(msg)
            raise RuntimeError(msg)

        # Create a square distance_matrix num_models in size filled with zeros
        parity = 0.0
        self.distance_matrix = numpy.full([num_models, num_models], parity)

        # jmht Save output for parsing - might make more sense to use one of the
        # dedicated maxcluster output formats
        # Raw strings: same patterns as before, without invalid string escapes
        line_start = re.compile(r'INFO \: Model')
        # Groups: model 1 index, path to model 1 without .pdb suffix,
        #         model 2 index, path to model 2 without .pdb suffix, distance
        pair = re.compile(r'INFO \: Model\s*(\d*)\s*(.*)\.pdb\s*vs\. Model\s*(\d*)\s*(.*)\.pdb\s*=\s*(\d*\.\d*)')
        with open(log_name, 'r') as max_log:
            for line in max_log:
                if not line_start.match(line):
                    continue
                found = pair.search(line)
                if found is None:
                    raise RuntimeError("Cannot parse maxcluster log line: {0}".format(line))
                idx1, path1, idx2, path2, distance = found.groups()
                i, j = int(idx1) - 1, int(idx2) - 1  # maxcluster indices are 1-based
                self.distance_matrix[i][j] = float(distance)

                pdb1 = path1 + '.pdb'
                if pdb1 not in self.index2pdb:
                    self.index2pdb[i] = pdb1
                pdb2 = path2 + '.pdb'
                if pdb2 not in self.index2pdb:
                    self.index2pdb[j] = pdb2

        # Copy in other half of matrix - we use a full matrix as it's easier to scan for clusters
        lower = numpy.tril_indices(num_models, -1)
        self.distance_matrix[lower] = self.distance_matrix.T[lower]
        return