Source code for ample.ensembler

"""The AMPLE ensembler sub-package

This sub-package contains the source to apply AMPLE's
cluster-and-truncate approach to protein structure models.
It contains a series of different modules to apply varying
cluster and truncation algorithms to those models.
"""

__author__ = "Jens Thomas, and Felix Simkovic"
__date__ = "14 Nov 2016"
__version__ = "1.0"

import collections
import glob
import iotbx.pdb
import logging
import operator
import os
import shutil
import sys

from constants import SIDE_CHAIN_TREATMENTS
from abinitio import AbinitioEnsembler
from homologs import HomologEnsembler
from single_model import SingleModelEnsembler
from ample.util import ample_util
from ample.util import argparse_util
from ample.util import exit_util
from ample.util import pdb_edit
from ample.util import printTable

logger = logging.getLogger(__name__)


[docs]def add_argparse_options(parser=None):
    """Function to add the ensemble-specific options
    
    Parameters
    ----------
    parser : :obj:`argparse.ArgumentParser`
    standalone : bool, optional
       Define if the parser is standalone [default: False]

    """
    if parser is None:
        import argparse
        parser = argparse.ArgumentParser()

    parser = parser.add_argument_group('Ensemble Options')

    # Ensenble-specific options here
    parser.add_argument('-cluster_dir',
                        help='Path to directory of pre-clustered models to import')

    parser.add_argument('-cluster_method',
                        help='How to cluster the models for ensembling (spicker|fast_protein_cluster')

    parser.add_argument('-ensembler_timeout', type=int,
                        help='Time in seconds before timing out ensembling')

    parser.add_argument('-gesamt_exe', metavar='gesamt_exe',
                        help='Path to the gesamt executable')

    parser.add_argument('-homologs', metavar='True/False',
                        help='Generate ensembles from homologs models (requires -alignment_file)')

    parser.add_argument('-homolog_aligner', metavar='homolog_aligner',
                        help='Program to use for structural alignment of homologs (gesamt|mustang)')

    parser.add_argument('-ensemble_max_models',
                        help='Maximum number of models permitted in an ensemble')

    parser.add_argument('-maxcluster_exe',
                        help='Path to Maxcluster executable')

    parser.add_argument('-mustang_exe', metavar='mustang_exe',
                        help='Path to the mustang executable')

    parser.add_argument('-num_clusters', type=int,
                        help='The number of Spicker clusters of the original decoys that will be sampled [1]')

    parser.add_argument('-percent', metavar='percent_truncation',
                        help='percent interval for truncation')

    parser.add_argument('-score_matrix',
                        help='Path to score matrix for spicker')

    parser.add_argument('-score_matrix_file_list',
                        help='File with list of ordered model names for the score_matrix')

    parser.add_argument('-side_chain_treatments', type=str, nargs='+',
                        help='The side chain treatments to use. Default: {0}'.format(SIDE_CHAIN_TREATMENTS))

    parser.add_argument('-subcluster_radius_thresholds', type=float, nargs='+',
                        help='The radii to use for subclustering the truncated ensembles')

    parser.add_argument('-subcluster_program',
                        help='Program for subclustering models [maxcluster]')

    parser.add_argument('-theseus_exe', metavar='Theseus exe (required)',
                        help='Path to theseus executable')

    parser.add_argument('-thin_clusters', metavar='True/False',
                        help='Create ensembles from 10 clusters with 1 + 3A subclustering and polyAlanine sidechains')

    parser.add_argument('-truncation_method',
                        help='How to truncate the models for ensembling percent|thresh|focussed|scores')

    parser.add_argument('-truncation_pruning',
                        help='Whether to remove isolated residues (single)')

    parser.add_argument('-truncation_scorefile',
                        help="CSV file containing per residue scores - COLUMN ONE MUST BE RESIDUE INDEX STARTING FROM 1")

    parser.add_argument('-truncation_scorefile_header', nargs='+',
                        help="column headers to be used to create ensembles")

    return parser


[docs]def cluster_script(amoptd, python_path="ccp4-python"):
    """Create the script for ensembling on a cluster
    
    Parameters
    ----------
    amoptd : dict
       An AMPLE option dictionary
    python_path : str, optional
       The path to the CCP4 python executable

    Returns
    -------
    str
       The path to the cluster script

    """
    # write out script
    work_dir = amoptd['work_dir']
    script_path = os.path.join(work_dir, "submit_ensemble.sh")
    with open(script_path, "w") as job_script:
        job_script.write(ample_util.SCRIPT_HEADER + os.linesep)
        job_script.write(
            "ccp4-python -m ample.ensembler -restart_pkl {0}".format(amoptd['results_path']) + os.linesep)

    # Make executable
    os.chmod(script_path, 0o777)
    return script_path


[docs]def create_ensembles(amoptd):
    """Create the ensembles using the values in the amoptd dictionary
    
    Parameters
    ----------
    amoptd : dict
       An AMPLE option dictionary

    """
    # Create instance of the ensembler
    ensembler = ensembler_factory(amoptd)

    ############################################################################
    # For a single model we don't need to use glob
    if not (amoptd['single_model'] or amoptd['models']):
        msg = 'AMPLE ensembler needs either a single_model or a list of models'
        exit_util.exit_error(msg, sys.exc_info()[2])
        if amoptd['single_model'] and not os.path.isfile(amoptd['single_model']):
            msg = 'Cannot find single_model pdb: {0}'.format(
                amoptd['single_model'])
            exit_util.exit_error(msg, sys.exc_info()[2])
        elif amoptd['models'] and len(amoptd['models'] < 2):
            msg = 'Not enough models provided for ensembling - use single_model_mode instead'
            exit_util.exit_error(msg, sys.exc_info()[2])

    models = list([amoptd['single_model']]
                  ) if amoptd['single_model_mode'] else amoptd['models']

    #if amoptd['cluster_method'] == 'spicker_tmscore':
    #    models = reorder_models(models, amoptd['score_matrix_file_list'])
    #    if not (os.path.isfile(amoptd['score_matrix']) and os.path.isfile(amoptd['score_matrix_file_list'])):
    #        raise RuntimeError("spicker_tmscore needs a score_matrix and score_matrix_file_list")
    #    ensembler.score_matrix = amoptd['score_matrix']

    # Run ensemble creation
    ensembles = ensembler.generate_ensembles_from_amoptd(models, amoptd)

    ############################################################################
    # Hack to pull out the data - need to update code to work with ensemble objects rather than dictionaries
    amoptd['ensembles'] = [e.pdb for e in ensembles]
    amoptd['ensembles_data'] = [e.__dict__ for e in ensembles]

    # We need to let the main process know that we have succeeded as this module could be run on a cluster node with no link
    # to the parent process, so we create a file here indicating that we got this far and didn't die from an exception
    with open(amoptd['ensemble_ok'], 'w') as f:
        f.write('ok\n')

    # Delete all intermediate files if we're purging
    if amoptd['purge']:
        shutil.rmtree(ensembler.work_dir)
    return


[docs]def ensembler_factory(amoptd):
    """Return an ensembler object for the required ensembles
    
    Parameters
    ----------
    amoptd : dict
       An AMPLE option dictionary

    Returns
    -------
    :obj:`HomologEnsembler`, :obj:`SingleModelEnsembler`, or :obj:`AbinitioEnsembler`
       Instantiated ensembler object

    """
    if amoptd['homologs']:
        ensembler_class = HomologEnsembler
    elif amoptd['single_model_mode']:
        ensembler_class = SingleModelEnsembler
    else:
        ensembler_class = AbinitioEnsembler

    # Paths to ensembling directories
    ensembles_directory = os.path.join(amoptd['work_dir'], 'ensembles')
    amoptd['ensembles_directory'] = ensembles_directory
    work_dir = os.path.join(amoptd['work_dir'], 'ensemble_workdir')

    return ensembler_class(
        ensembles_directory=ensembles_directory,
        ensemble_max_models=amoptd['ensemble_max_models'],
        nproc=amoptd['nproc'],
        work_dir=work_dir,
        # Executables
        fast_protein_cluster_exe=amoptd['fast_protein_cluster_exe'],
        gesamt_exe=amoptd['gesamt_exe'],
        lsqkab_exe=amoptd['lsqkab_exe'],
        maxcluster_exe=amoptd['maxcluster_exe'],
        scwrl_exe=amoptd['scwrl_exe'],
        spicker_exe=amoptd['spicker_exe'],
        theseus_exe=amoptd['theseus_exe'],
    )


[docs]def collate_cluster_data(ensembles_data):
    """Collate the cluster data into a dictionary structure

    Parameters
    ----------
    ensembles_data : list, tuple
       A list of ensembles' data dictionaries

    Returns
    -------
    A dictionary with the collated data
    
    """
    
    clusters = {}  # Loop through all ensemble data objects and build up a data tree
    cluster_method = None
    cluster_score_type = None
    truncation_method = None
    percent_truncation = None
    side_chain_treatments = []
    for e in ensembles_data:
        if not cluster_method:
            cluster_method = e['cluster_method']
            cluster_score_type = e['cluster_score_type']
            percent_truncation = e['truncation_percent']
            truncation_method = e['truncation_method']
            # num_clusters = e['num_clusters']
        cnum = e['cluster_num']
        if cnum not in clusters:
            clusters[cnum] = {}
            clusters[cnum]['cluster_centroid'] = e['cluster_centroid']
            clusters[cnum]['cluster_num_models'] = e['cluster_num_models']
            clusters[cnum]['tlevels'] = {}
        tlvl = e['truncation_level']
        if tlvl not in clusters[cnum]['tlevels']:
            clusters[cnum]['tlevels'][tlvl] = {}
            clusters[cnum]['tlevels'][tlvl]['truncation_variance'] = e['truncation_variance']
            clusters[cnum]['tlevels'][tlvl]['num_residues'] = e['num_residues']
            clusters[cnum]['tlevels'][tlvl]['radius_thresholds'] = {}
        srt = e['subcluster_radius_threshold']
        if srt not in clusters[cnum]['tlevels'][tlvl]['radius_thresholds']:
            clusters[cnum]['tlevels'][tlvl]['radius_thresholds'][srt] = {}
            clusters[cnum]['tlevels'][tlvl]['radius_thresholds'][srt]['num_models'] = e['subcluster_num_models']
            clusters[cnum]['tlevels'][tlvl]['radius_thresholds'][srt]['sct'] = {}
        sct = e['side_chain_treatment']
        if sct not in side_chain_treatments:
            side_chain_treatments.append(sct)
        if sct not in clusters[cnum]['tlevels'][tlvl]['radius_thresholds'][srt]['sct']:
            clusters[cnum]['tlevels'][tlvl]['radius_thresholds'][srt]['sct'][sct] = {}
            clusters[cnum]['tlevels'][tlvl]['radius_thresholds'][srt]['sct'][sct]['name'] = e['name']
            clusters[cnum]['tlevels'][tlvl]['radius_thresholds'][srt]['sct'][sct]['num_atoms'] = e['ensemble_num_atoms']

    return {
        'clusters': clusters,
        'cluster_method': cluster_method,
        'cluster_score_type': cluster_score_type,
        'truncation_method': truncation_method,
        'percent_truncation': percent_truncation,
        'side_chain_treatments': side_chain_treatments,
    }


[docs]def cluster_table_data(clusters, cluster_num, side_chain_treatments, header=True):
    """Generate a table containing the cluster data
    
    Parameters
    ----------
    clusters : list, tuple 
       A list of cluster dictionaries
    cluster_num : int
       The cluster number of interest
    side_chain_treatments : list, tuple
       A list of side chain treatments
    header: bool
      Return a header line

    Returns
    -------
    list
       A list of table data lines

    """
    # FIX TO IGNORE side_chain_treatments
    # tdata = [("Name", "Truncation Level", u"Variance Threshold (\u212B^2)", "No. Residues", u"Radius Threshold (\u212B)", "No. Decoys", "Number of Atoms", "Sidechain Treatment")]
    if header:
        tdata = [("Name", "Cluster", "Truncation Level", "Variance Threshold (A^2)", "No. Residues",
                  "Radius Threshold (A)", "No. Decoys", "Number of Atoms", "Sidechain Treatment")]
    else:
        tdata = []
    for tl in sorted(clusters[cluster_num]['tlevels']):
        tvar = clusters[cluster_num]['tlevels'][tl]['truncation_variance']
        nresidues = clusters[cluster_num]['tlevels'][tl]['num_residues']
        for i, rt in enumerate(sorted(clusters[cluster_num]['tlevels'][tl]['radius_thresholds'])):
            nmodels = clusters[cluster_num]['tlevels'][tl]['radius_thresholds'][rt]['num_models']
            side_chain_treatments = clusters[cluster_num]['tlevels'][tl]['radius_thresholds'][rt]['sct'].keys(
            )
            for j, sct in enumerate(side_chain_treatments):
                name = clusters[cluster_num]['tlevels'][tl]['radius_thresholds'][rt]['sct'][sct]['name']
                num_atoms = clusters[cluster_num]['tlevels'][tl]['radius_thresholds'][rt]['sct'][sct]['num_atoms']
                if i == 0 and j == 0:  # change of radius
                    tdata.append((name, cluster_num, tl, tvar,
                                  nresidues, rt, nmodels, num_atoms, sct))
                elif i > 0 and j == 0:  # change of side_chain
                    tdata.append((name, "", "", "", "", rt,
                                  nmodels, num_atoms, sct))
                else:
                    tdata.append(
                        (name, "", "", "", "", "", "", num_atoms, sct))
    return tdata


[docs]def ensemble_summary(ensembles_data):
    """Print a summary of the ensembling process
    
    Parameters
    ----------
    ensembles_data : list, tuple
       A list of ensembles' data dictionaries

    Returns
    -------
    str

    """

    tableFormat = printTable.Table()

    d = collate_cluster_data(ensembles_data)
    clusters = d['clusters']
    num_clusters = len(clusters)

    rstr = "\n"
    rstr += "Ensemble Results\n"
    rstr += "----------------\n\n"
    rstr += "Cluster method: {0}\n".format(d['cluster_method'])
    rstr += "Cluster score type: {0}\n".format(d['cluster_score_type'])
    rstr += "Number of clusters: {0}\n".format(num_clusters)
    rstr += "Truncation method: {0}\n".format(d['truncation_method'])
    rstr += "Percent truncation: {0}\n".format(d['percent_truncation'])
    rstr += "Side-chain treatments: {0}\n".format(d['side_chain_treatments'])

    for cluster_num in sorted(clusters.keys()):
        rstr += "\n"
        rstr += "Cluster {0}\n".format(cluster_num)
        rstr += "Number of models: {0}\n".format(
            clusters[cluster_num]['cluster_num_models'])
        rstr += "Cluster centroid: {0}\n".format(
            clusters[cluster_num]['cluster_centroid'])
        rstr += "\n"
        tdata = cluster_table_data(
            clusters, cluster_num, d['side_chain_treatments'])
        rstr += tableFormat.pprint_table(tdata)

    rstr += "\nGenerated {0} ensembles\n\n".format(len(ensembles_data))

    return rstr


[docs]def import_ensembles(amoptd):
    """Import ensembles using their file paths

    Parameters
    ----------
    amoptd : dict
       An AMPLE option dictionary
    
    Returns
    -------
    list
       A list of absolute files paths of the ensembles

    """
    if not pdb_edit.check_pdb_directory(amoptd['ensembles'], single=False):
        msg = "Cannot import ensembles from the directory: {0}".format(
            amoptd['ensembles'])
        exit_util.exit_error(msg)

    logger.info("Importing ensembles from directory: {0}".format(
        amoptd['ensembles']))

    ensembles = glob.glob(os.path.join(amoptd['ensembles'], '*.pdb'))
    amoptd['ensembles'] = ensembles

    # get the data on the ensemble
    ensembles_data = []
    for e in ensembles:
        d = {}
        d['name'] = os.path.splitext(os.path.basename(e))[0]
        d['ensemble_pdb'] = e

        # Get data on the models
        hierarchy = iotbx.pdb.pdb_input(file_name=e).construct_hierarchy()
        d['subcluster_num_models'] = len(hierarchy.models())
        d['num_residues'] = len(
            hierarchy.models()[0].chains()[0].residue_groups())
        d['ensemble_num_atoms'] = len(hierarchy.models()[0].atoms())

        ensembles_data.append(d)

    amoptd['ensembles_data'] = ensembles_data

    return ensembles


[docs]def reorder_models(models, ordered_list_file):
    """Reorder the list of models from a list of models (possibly in a different directory)
    
    Parameters
    ----------
    models : list, tuple
       A list of model file paths
    ordered_list_file : str
       The path to a file with a model order list

    Returns
    -------
    list
       The model list reordered

    """
    with open(ordered_list_file) as f:
        ordered_list = [line.strip() for line in f if line.strip()]
    mdir = os.path.dirname(models[0])
    model_names = set([os.path.basename(m) for m in models])
    reordered_models = []
    for tm_model in ordered_list:
        name = os.path.basename(tm_model)
        assert name in model_names, "TM model {0} is not in models!".format(
            name)
        reordered_models.append(os.path.join(mdir, name))
    return reordered_models


[docs]def set_phaser_rms_from_subcluster_score(optd):
    """Set the phaser_rms for each ensemble based on its score"""
    if optd['ensemble_options'] is None:
        optd['ensemble_options'] = {}
    for ensemble_data in optd['ensembles_data']:
        name = ensemble_data['name']
        if name not in optd['ensemble_options']:
            optd['ensemble_options'][name] = {}
        optd['ensemble_options'][name]['phaser_rms'] = ensemble_data['subcluster_score']
    return


[docs]def sort_ensembles(ensemble_pdbs, ensembles_data=None, keys=None, prioritise=True):
    """Sort AMPLE ensemble data.
    
    Parameters
    ----------
    ensemble_pdbs: list, tuple
       A list of ensemble file paths
    ensembles_data : list, tuple, optional
       A list of ensembles' data dictionaries
    keys : list, tuple, optional
       A list of keys in ensembles_data by which we sort. The key order must be high > middle > low!
    prioritise : bool, optional
       Favour ensembles in truncation level 20-50% [default: True]
    
    Returns
    -------
    list
       A sorted list of the ensemble pdb file paths
    
    Raises
    ------
    RuntimeError
       Not enough ensembles provided

    """
    if len(ensemble_pdbs) < 1:
        raise RuntimeError("Not enough ensembles")
    # Sort with out method only if we have data which contains our generated data
    if ensembles_data:
        ensemble_pdbs_sorted = _sort_ensembles(
            ensemble_pdbs, ensembles_data, keys, prioritise)
    else:
        ensemble_pdbs_sorted = sorted(ensemble_pdbs)
    return ensemble_pdbs_sorted


def _sort_ensembles(ensemble_pdbs, ensemble_data, keys, prioritise):
    """Sort ensembles based on ``keys`` or the following order
        1) Cluster
        2) Truncation level - favoured region of 20-50% first
        3) Subcluster radius threshold
        4) Side chain treatment
    """
    assert len(ensemble_pdbs) == len(
        ensemble_data), "Unequal ensembles data for sorting"

    # Keys we want to sort data by - differs for different source of ensembles
    if keys is None:
        keys = ['cluster_num', 'truncation_score_key', 'truncation_level',
                'subcluster_radius_threshold', 'side_chain_treatment']
    else:
        keys = [key for key in keys
                if key in ensemble_data[0] and ensemble_data[0][key]]

    if keys:
        # Zip the data so order remains identical between pdbs and data
        ensembles_zipped = zip(ensemble_pdbs, ensemble_data)

        # The sorting itself
        ensembles_zipped_ordered = sorted(
            ensembles_zipped, key=lambda e: [e[1][k] for k in keys])

        # Sort the ensembles further if we want to prioritise the "sweet spot"
        # between 20-50% truncation level
        if "truncation_level" in keys and prioritise:
            ensembles_zipped_ordered = _sweet_spotting(
                ensembles_zipped_ordered, keys)

        # We need to `unzip` the data to get a list of ordered pdbs
        ensemble_pdbs_sorted, _ = zip(*ensembles_zipped_ordered)
    else:
        ensemble_pdbs_sorted = sorted(ensemble_pdbs)

    return list(ensemble_pdbs_sorted)


def _sweet_spotting(ensembles_zipped_ordered, keys):
    """Sort the ensembles further if we want to prioritise the "sweet spot"
    between 20-50% truncation level

    Parameters
    ----------
    ensembles_zipped_ordered : list, tuple
       Zipped and __sorted__ data
    keys : list, tuple
       Sorting keys

    Returns
    -------
    list
       Sweet-spotted data

    """

    class Eimer(object):
        """Container unit for binnable data"""

        def __init__(self):
            self.low = []
            self.mid = []
            self.high = []

        def add(self, e):
            if e[1]['truncation_level'] > 50:
                self.high.append(e)
            elif e[1]['truncation_level'] < 20:
                self.low.append(e)
            else:
                self.mid.append(e)

    def magic(key, data):
        """Magic binning happening here"""
        container = {k[key]: [] for k in zip(*data)[1]}
        for e in data:
            container[e[1][key]].append(e)
        mulleimer = []
        for c in sorted(list(container)):
            bin = Eimer()
            for e in container[c]:
                bin.add(e)
            mulleimer.extend(bin.mid + bin.low + bin.high)
        return mulleimer

    # We need to bin the data here in case we want the cluster number or truncation score key to take priority
    # over the truncation level. The binning will, thus, respect the cluster number by binning it accordingly.
    if "cluster_num" in keys and keys.index("cluster_num") < keys.index("truncation_level"):
        return magic("cluster_num", ensembles_zipped_ordered)

    elif "truncation_score_key" in keys and keys.index("truncation_score_key") < keys.index("truncation_level"):
        return magic("truncation_score_key", ensembles_zipped_ordered)

    else:
        bin = Eimer()
        for ens in ensembles_zipped_ordered:
            bin.add(ens)
        return bin.mid + bin.low + bin.high