Source code for ample.ensembler.abinitio

"""Ensembler module for ab initio decoys"""

__author__ = "Jens Thomas, and Felix Simkovic"
__date__ = "17 Feb 2016"
__version__ = "1.0"

import logging
import os
import shutil

import _ensembler
import cluster_util
import subcluster
import subcluster_util
import truncation_util
from constants import SIDE_CHAIN_TREATMENTS, SUBCLUSTER_RADIUS_THRESHOLDS

from ample.util import fast_protein_cluster
from ample.util import scwrl_util
from ample.util import spicker

logger = logging.getLogger(__name__)


[docs]class AbinitioEnsembler(_ensembler.Ensembler):
    """Ensemble creator using on multiple models with identical sequences most
       likely created using Rosetta or Quark ab initio modelling
    """
    
    def __init__(self, **kwargs):
        
        # Inherit all functions from Parent Ensembler
        super(AbinitioEnsembler, self).__init__(**kwargs)

        # self.subcluster_method='FLOATING_RADII'
        self.cluster_score_matrix = None
        self.subcluster_method = 'ORIGINAL'
        self.subcluster_program = "maxcluster"
        self.subclustering_method = "radius"
        self.subcluster_radius_thresholds = SUBCLUSTER_RADIUS_THRESHOLDS
        
        # we save the truncator so that we can query it for data later
        self.truncator = None
        
        return

[docs]    def cluster_models(self,
                       models=None,
                       cluster_method='spicker',
                       num_clusters=1,
                       cluster_dir=None,
                       max_cluster_size=200,
                       ):
        """Wrapper function to run clustering of models dependent on the method
        """
        
        # Cluster our protein structures
        logger.info('Generating %d clusters using method: %s', num_clusters, cluster_method)

        if cluster_method != 'import' and not len(models):
            raise RuntimeError, "Cannot find any models for ensembling!" 
        
        # Get the cluster_method_type and cluster_score_type from the cluster_method
        cluster_method_type, cluster_score_type, cluster_exe = self.parse_cluster_method(cluster_method)
        
        # Set directory
        if cluster_method_type not in ['skip']:
            pass
        elif cluster_method_type in ['import', 'random']:
            if not os.path.isdir(cluster_dir): 
                raise RuntimeError('Cannot find cluster directory: {0}'.format(cluster_dir))
        else:
            cluster_dir = os.path.join(self.work_dir, 'clustering')
        
        if cluster_method_type == 'fast_protein_cluster':
            SCORE_TYPE = 'rmsd'
            CLUSTER_METHOD = 'kmeans'
            logger.info('Running fast_protein_cluster with: score_type: %s and cluster_method: %s', SCORE_TYPE, CLUSTER_METHOD)
            clusters = fast_protein_cluster.FPC().fpc.cluster(cluster_method=CLUSTER_METHOD,
                                                              fpc_exe=cluster_exe,
                                                              max_cluster_size=max_cluster_size,
                                                              models=models,
                                                              num_clusters=num_clusters,
                                                              nproc=self.nproc,
                                                              score_type=SCORE_TYPE,
                                                              work_dir=cluster_dir)
        elif cluster_method_type == 'import':
            clusters = cluster_util.import_cluster(models)
        elif cluster_method_type == 'random':
            clusters = cluster_util.random_cluster(cluster_method_type,
                                                   max_cluster_size,
                                                   models,
                                                   num_clusters)
        elif cluster_method_type == 'spicker':
            logger.info('* Running SPICKER to cluster models *')
            spickerer = spicker.Spickerer(spicker_exe=cluster_exe)
            clusters = spickerer.cluster(models,
                                         num_clusters=num_clusters,
                                         max_cluster_size=max_cluster_size,
                                         score_type=cluster_score_type,
                                         run_dir=cluster_dir,
                                         score_matrix=None,
                                         nproc=self.nproc)
            logger.debug(spickerer.results_summary())
        else:
            msg = 'Unrecognised clustering method: {0}'.format(cluster_method_type)
            raise RuntimeError(msg)
                
        return clusters
    
[docs]    def ensemble_from_subcluster(self, cluster_files, radius, truncation, cluster_score=None):
        subcluster_dir = os.path.join(truncation.directory, 'subcluster_{0}'.format(radius))
        os.mkdir(subcluster_dir)
        os.chdir(subcluster_dir)

        cluster_num = truncation.cluster.index
        truncation_level = truncation.level
        basename = 'c{0}_t{1}_r{2}'.format(cluster_num, truncation_level, radius)
        
        # List of files for reference
        with open(os.path.join(subcluster_dir, "{0}.list".format(basename)), 'w') as f:
            for m in cluster_files: f.write(m + "\n")
            f.write("\n")
        
        cluster_file = self.superpose_models(cluster_files, work_dir=subcluster_dir)
        if not cluster_file:
            msg = "Error running theseus on ensemble {0} in directory: {1}\nSkipping subcluster: {0}".format(basename,
                                                                                                             subcluster_dir)
            logger.critical(msg)
            return None
         
        ensemble_pdb = os.path.join(subcluster_dir, basename + '.pdb')
        shutil.move(cluster_file, ensemble_pdb)
        
        ensemble = _ensembler.Ensemble()
        
        # First add the data from the cluster
        ensemble.cluster_method = truncation.cluster.cluster_method
        ensemble.cluster_score_type = truncation.cluster.score_type
        ensemble.num_clusters = truncation.cluster.num_clusters
        ensemble.cluster_num = truncation.cluster.index
        ensemble.cluster_centroid = truncation.cluster.centroid
        ensemble.cluster_num_models = len(truncation.cluster)
        
        # Then the truncation information
        ensemble.num_residues = truncation.num_residues
        ensemble.truncation_dir = truncation.directory
        ensemble.truncation_level = truncation.level
        ensemble.truncation_method = truncation.method
        ensemble.truncation_percent = truncation.percent
        ensemble.truncation_residues = truncation.residues
        ensemble.truncation_variance = truncation.variances

        # Now the subcluster info
        # The data we've collected is the same for all pdbs in this level so just keep using the first
        ensemble.subcluster_num_models = len(cluster_files)
        # Get the centroid model name from the list of files given to theseus - we can't parse
        # the pdb file as theseus truncates the filename
        ensemble.subcluster_centroid_model =  os.path.abspath(cluster_files[0])
        ensemble.subcluster_radius_threshold = radius
        ensemble.subcluster_score = cluster_score
        
        ensemble.pdb = ensemble_pdb
        
        return ensemble

[docs]    def generate_ensembles(self,
                           models,
                           cluster_dir=None,
                           cluster_method=None,
                           ensembles_directory=None,
                           ensemble_max_models=None,
                           num_clusters=None,
                           percent_truncation=None,
                           side_chain_treatments=SIDE_CHAIN_TREATMENTS,
                           subcluster_radius_thresholds=SUBCLUSTER_RADIUS_THRESHOLDS,
                           subcluster_program=None,
                           truncation_method=None,
                           truncation_pruning=None,
                           use_scwrl=False):

        if not num_clusters:
            num_clusters = self.num_clusters
        if not percent_truncation:
            percent_truncation = self.percent_truncation
        if not truncation_method:
            truncation_method = self.truncation_method
        if not truncation_pruning:
            truncation_pruning = self.truncation_pruning
        
        logger.info('Ensembling models in directory: %s', self.work_dir)
        if not all([os.path.isfile(m) for m in models]):
            msg = "Problem reading models given to Ensembler: {0}".format(models)
            raise RuntimeError(msg)
        
        self.ensembles = []
        for cluster in self.cluster_models(models=models,
                                           cluster_method=cluster_method,
                                           num_clusters=num_clusters,
                                           cluster_dir=cluster_dir):
            if len(cluster) < 2:
                logger.info("Cannot truncate cluster %d as < 2 models!", cluster.index)
                continue
            logger.info('Processing cluster: %d', cluster.index)
            
            truncate_dir = os.path.join(self.work_dir, "cluster_{0}".format(cluster.index))
            if not os.path.isdir(truncate_dir): os.mkdir(truncate_dir)
            os.chdir(truncate_dir)
            
            # Add sidechains using SCWRL here so we only add them to the models we actually use
            if use_scwrl:
                cluster.models = self.scwrl_models(cluster.models, truncate_dir, self.scwrl_exe)
                 
            self.truncator = truncation_util.Truncator(work_dir=truncate_dir)
            self.truncator.theseus_exe = self.theseus_exe
            for truncation in self.truncator.truncate_models(models=cluster.models,
                                                             truncation_method=truncation_method,
                                                             percent_truncation=percent_truncation,
                                                             truncation_pruning=truncation_pruning):
                # Add cluster information
                truncation.cluster = cluster
                for ensemble in self.subcluster_models(truncation,
                                                       subcluster_program=subcluster_program,
                                                       ensemble_max_models=self.ensemble_max_models,
                                                       radius_thresholds=subcluster_radius_thresholds):
                    # Now add the side chains
                    for ensemble in self.edit_side_chains(ensemble, side_chain_treatments):
                        self.ensembles.append(ensemble)
        return self.ensembles

[docs]    def generate_ensembles_from_amoptd(self, models, amoptd):
        """Generate ensembles from data in supplied ample data dictionary."""
        kwargs = {
                  'cluster_dir' : amoptd['cluster_dir'],
                  'cluster_method' : amoptd['cluster_method'],
                  'num_clusters' : amoptd['num_clusters'],
                  'percent_truncation' : amoptd['percent'],
                  'side_chain_treatments' : amoptd['side_chain_treatments'],
                  'subcluster_program' : amoptd['subcluster_program'],
                  'subcluster_radius_thresholds' : amoptd['subcluster_radius_thresholds'],
                  'truncation_method' : amoptd['truncation_method'],
                  'truncation_pruning' : amoptd['truncation_pruning'],
                  'use_scwrl' : amoptd['use_scwrl']
                  }

        # strip out any that are None
        kwargs = { k : v for k, v in kwargs.iteritems() if v is not None }
        
        ensembles = self.generate_ensembles(models, **kwargs)
        
        # We need to save these data to amopt as it's impossible to reconstruct otherwise
        amoptd['truncation_levels'] = self.truncator.truncation_levels
        amoptd['truncation_variances'] = self.truncator.truncation_variances
        amoptd['truncation_nresidues'] =  self.truncator.truncation_nresidues
        
        return ensembles

[docs]    def parse_cluster_method(self, cluster_method):
        """Return the cluster_method_type, cluster_score_type, cluster_exe from a generic cluster_method"""
        cluster_score_type = 'rmsd'
        if cluster_method == 'fast_protein_cluster':
            cluster_method_type = 'fast_protein_cluster'
            cluster_exe = self.fast_protein_cluster_exe
        elif cluster_method in ['spicker', 'spicker_qscore', 'spicker_tm']:
            cluster_method_type = 'spicker'
            cluster_exe = self.spicker_exe
            if cluster_method == 'spicker_qscore':
                cluster_score_type = 'read_matrix'
            elif cluster_method == 'spicker_tm':
                cluster_score_type = 'tm'
        elif cluster_method in ['import', 'random', 'skip']:
            cluster_method_type = cluster_method
            cluster_exe = None
        else:
            msg = "Unrecognised cluster_method: {0}".format(cluster_method)
            raise RuntimeError(msg)
        logger.debug('cluster_method_type: %s cluster_score_type: %s cluster_exe %s', cluster_method_type, cluster_score_type, cluster_exe)
        return cluster_method_type, cluster_score_type, cluster_exe
    
[docs]    def scwrl_models(self, models, work_dir, scwrl_exe):
        """Add side chains to the models with Scwrl"""
        
        scwrl_directory = os.path.join(work_dir, "scrwl")
        if not os.path.isdir(scwrl_directory): os.mkdir(scwrl_directory)
        
        scwrled_models = scwrl_util.Scwrl(scwrl_exe=scwrl_exe).process_models(models, 
                                                                              scwrl_directory, 
                                                                              strip_oxt=True)
        return scwrled_models

[docs]    def subclusterer_factory(self, subcluster_program):
        """Return an instantiated subclusterer based on the given program"""
        if subcluster_program == 'gesamt':
            clusterer = subcluster.GesamtClusterer(self.gesamt_exe, nproc=self.nproc)
        elif subcluster_program == 'maxcluster':
            clusterer = subcluster.MaxClusterer(self.maxcluster_exe)
        elif subcluster_program == 'lsqkab':
            clusterer = subcluster.LsqkabClusterer(self.lsqkab_exe)
        else:
            raise RuntimeError("Unrecognised subcluster_program: {0}".format(subcluster_program))
        return clusterer

[docs]    def subcluster_models(self,
                          truncation,
                          subcluster_program=None,
                          radius_thresholds=None,
                          ensemble_max_models=None):
        if self.subcluster_method == "ORIGINAL":
            return self.subcluster_models_fixed_radii(truncation,
                                                      subcluster_program=subcluster_program,
                                                      ensemble_max_models=ensemble_max_models,
                                                      radius_thresholds=radius_thresholds)
        elif self.subcluster_method == "FLOATING_RADII":
            return self.subcluster_models_floating_radii(truncation,
                                                         subcluster_program=subcluster_program,
                                                         ensemble_max_models=ensemble_max_models)
        else:
            assert False
        
[docs]    def subcluster_models_fixed_radii(self,
                                      truncation,
                                      subcluster_program=None,
                                      ensemble_max_models=None,
                                      radius_thresholds=None):
        
        # Theseus only works with > 3 residues
        if truncation.num_residues <= 2: return []
        
        if not radius_thresholds: radius_thresholds = self.subcluster_radius_thresholds
        
        # Make sure everyting happens in the truncation directory
        owd = os.getcwd()
        os.chdir(truncation.directory)
            
        # Generate the distance matrix
        clusterer = self.subclusterer_factory(subcluster_program)
        clusterer.generate_distance_matrix(truncation.models)
        # clusterer.dump_matrix(os.path.join(truncation_dir,"subcluster_distance.matrix")) # for debugging

        # Loop through the radius thresholds
        ensembles = []
        previous_clusters = []
        for radius in radius_thresholds:
            logger.debug("subclustering models under radius: %.2f", radius)

            # Get list of pdbs clustered according to radius threshold
            cluster_files = clusterer.cluster_by_radius(radius)
            if not cluster_files:
                logger.debug("Skipping radius %.2f as no files clustered in directory %s", radius, truncation.directory)
                continue
                
            logger.debug("Clustered %d files", len(cluster_files))
            cluster_files = subcluster_util.slice_subcluster(cluster_files,
                                                             previous_clusters,
                                                             ensemble_max_models,
                                                             radius,
                                                             radius_thresholds)
            if not cluster_files:
                logger.debug('Could not create different cluster for radius %.2f in directory: %s', radius, truncation.directory)
                continue
            
            # Remember this cluster so we don't create duplicate clusters
            previous_clusters.append(cluster_files)
            ensemble = self.ensemble_from_subcluster(cluster_files, radius, truncation, cluster_score=clusterer.cluster_score)
            if ensemble is None: continue
            ensembles.append(ensemble)
        
        # back to where we started
        os.chdir(owd)
        
        return ensembles
    
[docs]    def subcluster_models_floating_radii(self,
                                         truncation,
                                         subcluster_program=None,
                                         ensemble_max_models=None):
        logger.info("subclustering with floating radii")

        clusterer = self.subclusterer_factory(subcluster_program)
        clusterer.generate_distance_matrix(truncation.models)
        # clusterer.dump_matrix(os.path.join(truncation_dir,"subcluster_distance.matrix")) # for debugging
        
        subclusters = []
        clusters = []
        radii = []
        len_truncated_models = len(truncation.models)
        for i in range(len(self.subcluster_radius_thresholds)):
            radius = None
            nmodels = None
            if i > 0 and radii[i - 1] > self.subcluster_radius_thresholds[i]:
                radius = radii[i - 1]
                nmodels = len(clusters[i - 1])
                cluster_files, radius = subcluster_util.subcluster_nmodels(nmodels, radius, clusterer, direction='up', increment=1)
            else:
                radius = self.subcluster_radius_thresholds[i]
                cluster_files = clusterer.cluster_by_radius(radius)
            
            if cluster_files:
                cluster_files = tuple(sorted(cluster_files))  # Need to sort so that we can check if we've had this lot before
                cluster_size = len(cluster_files)
            else:
                cluster_files = []
                cluster_size = 0

            if radius in radii or cluster_size == 0:
                # Increase radius till we have one more than the last one
                if cluster_size == 0:
                    nmodels = 2
                else:
                    radius = radii[i - 1]
                    nmodels = len(clusters[i - 1]) + 1
                cluster_files, radius = subcluster_util.subcluster_nmodels(nmodels, radius, clusterer, direction='up', increment=1)
                cluster_files = sorted(cluster_files)
            elif cluster_size >= ensemble_max_models or cluster_files in clusters:
                # Randomly pick ensemble_max_models
                cluster_files = subcluster_util.pick_nmodels(cluster_files, clusters, ensemble_max_models)
                if not cluster_files:
                    logger.debug('Could not cluster files under radius: %.2f - could not find different models', radius)
                    break
            
            # Need to check in case we couldn't cluster under this radius
            if cluster_size == 0 or radius in radii:
                logger.debug('Could not cluster files under radius: %.2f - got %d files', radius, len(cluster_files))
                break
            logger.debug('Subclustering %d files under radius %.2f', cluster_size, radius)
            cluster_ensemble = self.ensemble_from_subcluster(list(cluster_files), radius, truncation, cluster_score=clusterer.cluster_score)
            if cluster_ensemble is None:
                logger.debug('Could not cluster files under radius: %.2f', radius)
                break
            
            subclusters.append(cluster_ensemble)
            clusters.append(tuple(cluster_files))  # append as tuple so it is hashable
            radii.append(radius)
            if cluster_size == len_truncated_models: break
            
        return subclusters