Source code for ample.ensembler.abinitio

"""Ensembler module for ab initio decoys"""

__author__ = "Jens Thomas, and Felix Simkovic"
__date__ = "17 Feb 2016"
__version__ = "1.0"

import logging
import os
import shutil

import _ensembler
import cluster_util
import subcluster
import subcluster_util
import truncation_util
from constants import SIDE_CHAIN_TREATMENTS, SUBCLUSTER_RADIUS_THRESHOLDS

from ample.util import fast_protein_cluster
from ample.util import scwrl_util
from ample.util import spicker

logger = logging.getLogger(__name__)


[docs]class AbinitioEnsembler(_ensembler.Ensembler): """Ensemble creator using on multiple models with identical sequences most likely created using Rosetta or Quark ab initio modelling """ def __init__(self, **kwargs): # Inherit all functions from Parent Ensembler super(AbinitioEnsembler, self).__init__(**kwargs) # self.subcluster_method='FLOATING_RADII' self.cluster_score_matrix = None self.subcluster_method = 'ORIGINAL' self.subcluster_program = "maxcluster" self.subclustering_method = "radius" self.subcluster_radius_thresholds = SUBCLUSTER_RADIUS_THRESHOLDS # we save the truncator so that we can query it for data later self.truncator = None return
[docs] def cluster_models(self, models=None, cluster_method='spicker', num_clusters=1, cluster_dir=None, max_cluster_size=200, ): """Wrapper function to run clustering of models dependent on the method """ # Cluster our protein structures logger.info('Generating %d clusters using method: %s', num_clusters, cluster_method) if cluster_method != 'import' and not len(models): raise RuntimeError, "Cannot find any models for ensembling!" # Get the cluster_method_type and cluster_score_type from the cluster_method cluster_method_type, cluster_score_type, cluster_exe = self.parse_cluster_method(cluster_method) # Set directory if cluster_method_type not in ['skip']: pass elif cluster_method_type in ['import', 'random']: if not os.path.isdir(cluster_dir): raise RuntimeError('Cannot find cluster directory: {0}'.format(cluster_dir)) else: cluster_dir = os.path.join(self.work_dir, 'clustering') if cluster_method_type == 'fast_protein_cluster': SCORE_TYPE = 'rmsd' CLUSTER_METHOD = 'kmeans' logger.info('Running fast_protein_cluster with: score_type: %s and cluster_method: %s', SCORE_TYPE, CLUSTER_METHOD) clusters = fast_protein_cluster.FPC().fpc.cluster(cluster_method=CLUSTER_METHOD, fpc_exe=cluster_exe, max_cluster_size=max_cluster_size, models=models, num_clusters=num_clusters, nproc=self.nproc, score_type=SCORE_TYPE, work_dir=cluster_dir) elif cluster_method_type == 'import': clusters = cluster_util.import_cluster(models) elif cluster_method_type == 'random': clusters = cluster_util.random_cluster(cluster_method_type, max_cluster_size, models, num_clusters) elif cluster_method_type == 'spicker': logger.info('* Running SPICKER to cluster models *') spickerer = spicker.Spickerer(spicker_exe=cluster_exe) clusters = spickerer.cluster(models, num_clusters=num_clusters, max_cluster_size=max_cluster_size, score_type=cluster_score_type, run_dir=cluster_dir, score_matrix=None, nproc=self.nproc) logger.debug(spickerer.results_summary()) else: msg = 'Unrecognised clustering method: {0}'.format(cluster_method_type) raise RuntimeError(msg) return clusters
[docs] def ensemble_from_subcluster(self, cluster_files, radius, truncation, cluster_score=None): subcluster_dir = os.path.join(truncation.directory, 'subcluster_{0}'.format(radius)) os.mkdir(subcluster_dir) os.chdir(subcluster_dir) cluster_num = truncation.cluster.index truncation_level = truncation.level basename = 'c{0}_t{1}_r{2}'.format(cluster_num, truncation_level, radius) # List of files for reference with open(os.path.join(subcluster_dir, "{0}.list".format(basename)), 'w') as f: for m in cluster_files: f.write(m + "\n") f.write("\n") cluster_file = self.superpose_models(cluster_files, work_dir=subcluster_dir) if not cluster_file: msg = "Error running theseus on ensemble {0} in directory: {1}\nSkipping subcluster: {0}".format(basename, subcluster_dir) logger.critical(msg) return None ensemble_pdb = os.path.join(subcluster_dir, basename + '.pdb') shutil.move(cluster_file, ensemble_pdb) ensemble = _ensembler.Ensemble() # First add the data from the cluster ensemble.cluster_method = truncation.cluster.cluster_method ensemble.cluster_score_type = truncation.cluster.score_type ensemble.num_clusters = truncation.cluster.num_clusters ensemble.cluster_num = truncation.cluster.index ensemble.cluster_centroid = truncation.cluster.centroid ensemble.cluster_num_models = len(truncation.cluster) # Then the truncation information ensemble.num_residues = truncation.num_residues ensemble.truncation_dir = truncation.directory ensemble.truncation_level = truncation.level ensemble.truncation_method = truncation.method ensemble.truncation_percent = truncation.percent ensemble.truncation_residues = truncation.residues ensemble.truncation_variance = truncation.variances # Now the subcluster info # The data we've collected is the same for all pdbs in this level so just keep using the first ensemble.subcluster_num_models = len(cluster_files) # Get the centroid model name from the list of files given to theseus - we can't parse # the pdb file as theseus truncates the filename ensemble.subcluster_centroid_model = os.path.abspath(cluster_files[0]) ensemble.subcluster_radius_threshold = radius ensemble.subcluster_score = cluster_score ensemble.pdb = ensemble_pdb return ensemble
[docs] def generate_ensembles(self, models, cluster_dir=None, cluster_method=None, ensembles_directory=None, ensemble_max_models=None, num_clusters=None, percent_truncation=None, side_chain_treatments=SIDE_CHAIN_TREATMENTS, subcluster_radius_thresholds=SUBCLUSTER_RADIUS_THRESHOLDS, subcluster_program=None, truncation_method=None, truncation_pruning=None, use_scwrl=False): if not num_clusters: num_clusters = self.num_clusters if not percent_truncation: percent_truncation = self.percent_truncation if not truncation_method: truncation_method = self.truncation_method if not truncation_pruning: truncation_pruning = self.truncation_pruning logger.info('Ensembling models in directory: %s', self.work_dir) if not all([os.path.isfile(m) for m in models]): msg = "Problem reading models given to Ensembler: {0}".format(models) raise RuntimeError(msg) self.ensembles = [] for cluster in self.cluster_models(models=models, cluster_method=cluster_method, num_clusters=num_clusters, cluster_dir=cluster_dir): if len(cluster) < 2: logger.info("Cannot truncate cluster %d as < 2 models!", cluster.index) continue logger.info('Processing cluster: %d', cluster.index) truncate_dir = os.path.join(self.work_dir, "cluster_{0}".format(cluster.index)) if not os.path.isdir(truncate_dir): os.mkdir(truncate_dir) os.chdir(truncate_dir) # Add sidechains using SCWRL here so we only add them to the models we actually use if use_scwrl: cluster.models = self.scwrl_models(cluster.models, truncate_dir, self.scwrl_exe) self.truncator = truncation_util.Truncator(work_dir=truncate_dir) self.truncator.theseus_exe = self.theseus_exe for truncation in self.truncator.truncate_models(models=cluster.models, truncation_method=truncation_method, percent_truncation=percent_truncation, truncation_pruning=truncation_pruning): # Add cluster information truncation.cluster = cluster for ensemble in self.subcluster_models(truncation, subcluster_program=subcluster_program, ensemble_max_models=self.ensemble_max_models, radius_thresholds=subcluster_radius_thresholds): # Now add the side chains for ensemble in self.edit_side_chains(ensemble, side_chain_treatments): self.ensembles.append(ensemble) return self.ensembles
[docs] def generate_ensembles_from_amoptd(self, models, amoptd): """Generate ensembles from data in supplied ample data dictionary.""" kwargs = { 'cluster_dir' : amoptd['cluster_dir'], 'cluster_method' : amoptd['cluster_method'], 'num_clusters' : amoptd['num_clusters'], 'percent_truncation' : amoptd['percent'], 'side_chain_treatments' : amoptd['side_chain_treatments'], 'subcluster_program' : amoptd['subcluster_program'], 'subcluster_radius_thresholds' : amoptd['subcluster_radius_thresholds'], 'truncation_method' : amoptd['truncation_method'], 'truncation_pruning' : amoptd['truncation_pruning'], 'use_scwrl' : amoptd['use_scwrl'] } # strip out any that are None kwargs = { k : v for k, v in kwargs.iteritems() if v is not None } ensembles = self.generate_ensembles(models, **kwargs) # We need to save these data to amopt as it's impossible to reconstruct otherwise amoptd['truncation_levels'] = self.truncator.truncation_levels amoptd['truncation_variances'] = self.truncator.truncation_variances amoptd['truncation_nresidues'] = self.truncator.truncation_nresidues return ensembles
[docs] def parse_cluster_method(self, cluster_method): """Return the cluster_method_type, cluster_score_type, cluster_exe from a generic cluster_method""" cluster_score_type = 'rmsd' if cluster_method == 'fast_protein_cluster': cluster_method_type = 'fast_protein_cluster' cluster_exe = self.fast_protein_cluster_exe elif cluster_method in ['spicker', 'spicker_qscore', 'spicker_tm']: cluster_method_type = 'spicker' cluster_exe = self.spicker_exe if cluster_method == 'spicker_qscore': cluster_score_type = 'read_matrix' elif cluster_method == 'spicker_tm': cluster_score_type = 'tm' elif cluster_method in ['import', 'random', 'skip']: cluster_method_type = cluster_method cluster_exe = None else: msg = "Unrecognised cluster_method: {0}".format(cluster_method) raise RuntimeError(msg) logger.debug('cluster_method_type: %s cluster_score_type: %s cluster_exe %s', cluster_method_type, cluster_score_type, cluster_exe) return cluster_method_type, cluster_score_type, cluster_exe
[docs] def scwrl_models(self, models, work_dir, scwrl_exe): """Add side chains to the models with Scwrl""" scwrl_directory = os.path.join(work_dir, "scrwl") if not os.path.isdir(scwrl_directory): os.mkdir(scwrl_directory) scwrled_models = scwrl_util.Scwrl(scwrl_exe=scwrl_exe).process_models(models, scwrl_directory, strip_oxt=True) return scwrled_models
[docs] def subclusterer_factory(self, subcluster_program): """Return an instantiated subclusterer based on the given program""" if subcluster_program == 'gesamt': clusterer = subcluster.GesamtClusterer(self.gesamt_exe, nproc=self.nproc) elif subcluster_program == 'maxcluster': clusterer = subcluster.MaxClusterer(self.maxcluster_exe) elif subcluster_program == 'lsqkab': clusterer = subcluster.LsqkabClusterer(self.lsqkab_exe) else: raise RuntimeError("Unrecognised subcluster_program: {0}".format(subcluster_program)) return clusterer
[docs] def subcluster_models(self, truncation, subcluster_program=None, radius_thresholds=None, ensemble_max_models=None): if self.subcluster_method == "ORIGINAL": return self.subcluster_models_fixed_radii(truncation, subcluster_program=subcluster_program, ensemble_max_models=ensemble_max_models, radius_thresholds=radius_thresholds) elif self.subcluster_method == "FLOATING_RADII": return self.subcluster_models_floating_radii(truncation, subcluster_program=subcluster_program, ensemble_max_models=ensemble_max_models) else: assert False
[docs] def subcluster_models_fixed_radii(self, truncation, subcluster_program=None, ensemble_max_models=None, radius_thresholds=None): # Theseus only works with > 3 residues if truncation.num_residues <= 2: return [] if not radius_thresholds: radius_thresholds = self.subcluster_radius_thresholds # Make sure everyting happens in the truncation directory owd = os.getcwd() os.chdir(truncation.directory) # Generate the distance matrix clusterer = self.subclusterer_factory(subcluster_program) clusterer.generate_distance_matrix(truncation.models) # clusterer.dump_matrix(os.path.join(truncation_dir,"subcluster_distance.matrix")) # for debugging # Loop through the radius thresholds ensembles = [] previous_clusters = [] for radius in radius_thresholds: logger.debug("subclustering models under radius: %.2f", radius) # Get list of pdbs clustered according to radius threshold cluster_files = clusterer.cluster_by_radius(radius) if not cluster_files: logger.debug("Skipping radius %.2f as no files clustered in directory %s", radius, truncation.directory) continue logger.debug("Clustered %d files", len(cluster_files)) cluster_files = subcluster_util.slice_subcluster(cluster_files, previous_clusters, ensemble_max_models, radius, radius_thresholds) if not cluster_files: logger.debug('Could not create different cluster for radius %.2f in directory: %s', radius, truncation.directory) continue # Remember this cluster so we don't create duplicate clusters previous_clusters.append(cluster_files) ensemble = self.ensemble_from_subcluster(cluster_files, radius, truncation, cluster_score=clusterer.cluster_score) if ensemble is None: continue ensembles.append(ensemble) # back to where we started os.chdir(owd) return ensembles
[docs] def subcluster_models_floating_radii(self, truncation, subcluster_program=None, ensemble_max_models=None): logger.info("subclustering with floating radii") clusterer = self.subclusterer_factory(subcluster_program) clusterer.generate_distance_matrix(truncation.models) # clusterer.dump_matrix(os.path.join(truncation_dir,"subcluster_distance.matrix")) # for debugging subclusters = [] clusters = [] radii = [] len_truncated_models = len(truncation.models) for i in range(len(self.subcluster_radius_thresholds)): radius = None nmodels = None if i > 0 and radii[i - 1] > self.subcluster_radius_thresholds[i]: radius = radii[i - 1] nmodels = len(clusters[i - 1]) cluster_files, radius = subcluster_util.subcluster_nmodels(nmodels, radius, clusterer, direction='up', increment=1) else: radius = self.subcluster_radius_thresholds[i] cluster_files = clusterer.cluster_by_radius(radius) if cluster_files: cluster_files = tuple(sorted(cluster_files)) # Need to sort so that we can check if we've had this lot before cluster_size = len(cluster_files) else: cluster_files = [] cluster_size = 0 if radius in radii or cluster_size == 0: # Increase radius till we have one more than the last one if cluster_size == 0: nmodels = 2 else: radius = radii[i - 1] nmodels = len(clusters[i - 1]) + 1 cluster_files, radius = subcluster_util.subcluster_nmodels(nmodels, radius, clusterer, direction='up', increment=1) cluster_files = sorted(cluster_files) elif cluster_size >= ensemble_max_models or cluster_files in clusters: # Randomly pick ensemble_max_models cluster_files = subcluster_util.pick_nmodels(cluster_files, clusters, ensemble_max_models) if not cluster_files: logger.debug('Could not cluster files under radius: %.2f - could not find different models', radius) break # Need to check in case we couldn't cluster under this radius if cluster_size == 0 or radius in radii: logger.debug('Could not cluster files under radius: %.2f - got %d files', radius, len(cluster_files)) break logger.debug('Subclustering %d files under radius %.2f', cluster_size, radius) cluster_ensemble = self.ensemble_from_subcluster(list(cluster_files), radius, truncation, cluster_score=clusterer.cluster_score) if cluster_ensemble is None: logger.debug('Could not cluster files under radius: %.2f', radius) break subclusters.append(cluster_ensemble) clusters.append(tuple(cluster_files)) # append as tuple so it is hashable radii.append(radius) if cluster_size == len_truncated_models: break return subclusters