# Source code for ample.util.contact_util

"""Wrapper module for the ConKit package"""

from __future__ import division

__author__ = "Felix Simkovic"
__date__ = "18 Mar 2017"
__version__ = "2.1"

from distutils.version import StrictVersion

import inspect
import logging
import numpy
import os
import sys
import tempfile

from ample.modelling import energy_functions

import conkit
import conkit.io
import conkit.plot

logger = logging.getLogger(__name__)


class SubselectionAlgorithm(object):
    """A class to collect all subselection algorithms

    Every public algorithm takes a 1D sequence of per-decoy scores and
    returns two lists of indices into that sequence: the decoys to keep
    and the decoys to throw.

    """

    @staticmethod
    def _numpify(data):
        """Convert a Python array to a Numpy array

        Objects whose type is defined in the top-level ``numpy`` module are
        returned unchanged, everything else goes through
        :func:`numpy.asarray`.

        """
        if type(data).__module__ == numpy.__name__:
            return data
        else:
            return numpy.asarray(data)

    @staticmethod
    def cutoff(data, cutoff=0.287):
        """A cutoff-defined subselection algorithm

        Description
        -----------
        This algorithm removes a decoy, if its score is
        less than the cutoff.

        Parameters
        ----------
        data : list, tuple
           A 1D array of scores
        cutoff : float, optional
           The cutoff of keeping decoys

        Returns
        -------
        list
           The decoy indices to keep
        list
           The decoy indices to throw

        """
        data = SubselectionAlgorithm._numpify(data)
        keep = numpy.where(data >= cutoff)[0]
        throw = numpy.where(data < cutoff)[0]
        return keep.tolist(), throw.tolist()

    @staticmethod
    def linear(data, cutoff=0.5):
        """A linearly-defined subselection algorithm

        Description
        -----------
        This algorithm ranks the decoys by score and keeps the
        best-scoring proportion given by ``cutoff``. The number of
        decoys to keep is rounded up.

        Parameters
        ----------
        data : list, tuple
           A 1D array of scores
        cutoff : float, optional
           The proportion of the total number of decoys to keep

        Returns
        -------
        list
           The decoy indices to keep, best score first
        list
           The decoy indices to throw

        """
        sorted_indices = SubselectionAlgorithm._numpify(data).argsort()[::-1]
        ndecoys = sorted_indices.shape[0]
        # numpy.ceil returns a float, which is not a valid slice index
        point = int(numpy.ceil(ndecoys * cutoff))
        # Guard against a negative index silently slicing from the wrong end
        point = min(max(point, 0), ndecoys)
        keep = sorted_indices[:point]
        throw = sorted_indices[point:]
        return keep.tolist(), throw.tolist()

    @staticmethod
    def scaled(data, cutoff=0.5):
        """A scaling-defined subselection algorithm

        Description
        -----------
        This algorithm removes a decoy, if its scaled score
        is less than the cutoff. The scaled score is calculated by
        dividing the satisfaction score by the average of the
        set.

        Parameters
        ----------
        data : list, tuple
           A 1D array of scores
        cutoff : float, optional
           The cutoff of keeping decoys

        Returns
        -------
        list
           The decoy indices to keep
        list
           The decoy indices to throw

        """
        data = SubselectionAlgorithm._numpify(data)
        data_scaled = data / numpy.mean(data)
        keep = numpy.where(data_scaled >= cutoff)[0]
        throw = numpy.where(data_scaled < cutoff)[0]
        return keep.tolist(), throw.tolist()


# Names of all public members of SubselectionAlgorithm, i.e. the selectable modes
SUBSELECTION_MODES = [
    func_name
    for func_name, _ in inspect.getmembers(SubselectionAlgorithm)
    if func_name[:1] != '_'
]


class ContactUtil(object):
    """Process contact predictions with ConKit for decoy subselection, plotting and restraints

    Attributes
    ----------
    bbcontacts_file : str
       The path to the bbcontacts contact file
    bbcontacts_format : str
       The format of ``bbcontacts_file``
    contact_file : str
       The path to the contact file
    contact_format : str
       The format of ``contact_file``
    cutoff_factor : float
       The contact list truncation factor
    distance_to_neighbor : int
       The minimum distance between contacting residues
    sequence_file : str
       The path to the sequence file
    sequence_format : str
       The format of the ``sequence_file``

    """

    def __init__(self, contact_file, contact_format, sequence_file, sequence_format, bbcontacts_file=None,
                 cutoff_factor=1.0, distance_to_neighbor=5):
        """Initialise a new :obj:`ContactUtil` instance

        Parameters
        ----------
        contact_file : str
           The path to the contact file
        contact_format : str
           The format of ``contact_file``
        sequence_file : str
           The path to the sequence file
        sequence_format : str
           The format of the ``sequence_file``
        bbcontacts_file : str, optional
           The path to the bbcontacts contact file
        cutoff_factor : float, optional
           The contact list truncation factor [default: 1.0]
        distance_to_neighbor : int, optional
           The minimum distance between contacting residues [default: 5]

        """
        # Make sure every private slot exists before any setter runs
        for private_name in ('_bbcontacts_file', '_bbcontacts_format', '_contact_file', '_contact_format',
                             '_cutoff_factor', '_distance_to_neighbor', '_sequence_file', '_sequence_format'):
            setattr(self, private_name, None)

        # Formats first - the setters validate the values and may raise
        self.bbcontacts_format = 'bbcontacts'
        self.contact_format = contact_format
        self.sequence_format = sequence_format

        # File paths
        self.bbcontacts_file = bbcontacts_file
        self.contact_file = contact_file
        self.sequence_file = sequence_file

        # Numeric settings - validated and type-converted by their setters
        self.cutoff_factor = cutoff_factor
        self.distance_to_neighbor = distance_to_neighbor

    @property
    def bbcontacts_file(self):
        """The path to ``bbcontacts_file``, or whatever value was last assigned"""
        return self._bbcontacts_file

    @bbcontacts_file.setter
    def bbcontacts_file(self, fname):
        """Define the path to the ``bbcontacts_file``

        The value is stored as given; no existence or type check is done here.

        """
        self._bbcontacts_file = fname

    @property
    def bbcontacts_format(self):
        """The file format used to parse ``bbcontacts_file``"""
        return self._bbcontacts_format

    @bbcontacts_format.setter
    def bbcontacts_format(self, value):
        """Define the format of ``bbcontacts_file``

        Only the literal ``'bbcontacts'`` is accepted.

        Raises
        ------
        ValueError
           Unknown contact file format

        """
        if value == 'bbcontacts':
            self._bbcontacts_format = value
            return
        raise ValueError('Unknown contact file format: {0}'.format(value))

    @property
    def contact_file(self):
        """The path to ``contact_file``"""
        return self._contact_file

    @contact_file.setter
    def contact_file(self, fname):
        """Define the path to the ``contact_file``

        The value is stored as given; no existence or type check is done here.

        """
        self._contact_file = fname

    # The property used to be exposed as ``contacts_file`` only, while the rest
    # of the class reads and writes ``contact_file``. Keep the old name as an
    # alias of the same property so both spellings share ``_contact_file``.
    contacts_file = contact_file

    @property
    def contact_format(self):
        """The file format used to parse ``contact_file``"""
        return self._contact_format

    @contact_format.setter
    def contact_format(self, value):
        """Define the format of ``contact_file``

        The value must be one of the keys of ``conkit.io.CONTACT_FILE_PARSERS``.

        Raises
        ------
        ValueError
           Unknown contact file format

        """
        if value in conkit.io.CONTACT_FILE_PARSERS.keys():
            self._contact_format = value
        else:
            raise ValueError('Unknown contact file format: {0}'.format(value))

    @property
    def cutoff_factor(self):
        """The factor used to truncate the contact list"""
        return self._cutoff_factor

    @cutoff_factor.setter
    def cutoff_factor(self, value):
        """Define the contact list truncation factor

        The value is stored as a :obj:`float`.

        Raises
        ------
        ValueError
           The value is smaller than zero

        """
        if value < 0.0:
            raise ValueError("cutoff factor needs to be positive: {0}".format(value))
        self._cutoff_factor = float(value)

    @property
    def distance_to_neighbor(self):
        """The minimum distance between neighboring contacts"""
        return self._distance_to_neighbor

    @distance_to_neighbor.setter
    def distance_to_neighbor(self, value):
        """Define the minimum distance between neighboring contacts

        The value is stored as an :obj:`int`.

        Raises
        ------
        ValueError
           The value is smaller than zero

        """
        if value < 0:
            # Name the right attribute; this message used to mention the cutoff factor
            msg = "distance to neighbor needs to be positive: {0}".format(value)
            raise ValueError(msg)
        self._distance_to_neighbor = int(value)

    @property
    def sequence_file(self):
        """The path to ``sequence_file``, or whatever value was last assigned"""
        return self._sequence_file

    @sequence_file.setter
    def sequence_file(self, fname):
        """Define the path to the ``sequence_file``

        The value is stored as given; no existence or type check is done here.

        """
        self._sequence_file = fname

    @property
    def sequence_format(self):
        """The file format used to parse ``sequence_file``"""
        return self._sequence_format

    @sequence_format.setter
    def sequence_format(self, value):
        """Define the format of ``sequence_file``

        The value must be one of the keys of ``conkit.io.SEQUENCE_FILE_PARSERS``.

        Raises
        ------
        ValueError
           Unknown sequence file format

        """
        if value in conkit.io.SEQUENCE_FILE_PARSERS.keys():
            self._sequence_format = value
        else:
            raise ValueError('Unknown sequence file format: {0}'.format(value))

    def _preprocess(self):
        """Pre-process the data according to the data provided

        The contact file and sequence file are read with ConKit, the sequence
        is attached to the contact map, neighboring contacts are removed, the
        map is sorted by ``raw_score`` (highest first) and truncated to
        ``int(sequence length * cutoff_factor)`` contacts. If a
        ``bbcontacts_file`` is set, its contacts are merged into the map.

        Returns
        -------
        :obj:`conkit.core.ContactMap`
           The modified and processed contact map

        """
        logger.info('Provided contact file and format are: %s - %s', self.contact_file, self.contact_format)
        # Only the top map of the parsed file is used
        contact_map = conkit.io.read(self.contact_file, self.contact_format).top_map

        logger.info('Provided sequence file and format are: %s - %s', self.sequence_file, self.sequence_format)
        sequence = conkit.io.read(self.sequence_file, self.sequence_format).top_sequence
        contact_map.sequence = sequence
        contact_map.assign_sequence_register()

        logger.info('Calculating the scalar score')
        contact_map.calculate_scalar_score()

        dtn = self.distance_to_neighbor
        logger.info('Removing neighboring residues to distance of %d residues', dtn)
        contact_map.remove_neighbors(min_distance=dtn, inplace=True)

        sort_key = 'raw_score'
        logger.info('Sorting the contact map based on %s', sort_key)
        contact_map.sort(sort_key, reverse=True, inplace=True)

        # The number of contacts kept scales with the sequence length
        ncontacts = int(contact_map.sequence.seq_len * self.cutoff_factor)
        logger.info('Slicing contact map to contain top %d contacts only', ncontacts)
        contact_map = contact_map[:ncontacts]

        if self.bbcontacts_file:
            # NOTE(review): this message says "contact file" but reports the bbcontacts file
            logger.info('Provided contact file and format are: %s - %s', self.bbcontacts_file, self.bbcontacts_format)
            bbcontact_map = conkit.io.read(self.bbcontacts_file, self.bbcontacts_format).top_map
            bbcontact_map.sequence = sequence
            bbcontact_map.assign_sequence_register()
            bbcontact_map.rescale(inplace=True)
            bbcontact_map.calculate_scalar_score()
            bbcontact_map.sort(sort_key, reverse=True, inplace=True)

            for bbcontact in bbcontact_map:
                if bbcontact.id in contact_map:
                    # Contact present in both maps: set its weight to 2
                    contact_map[bbcontact.id].weight = 2
                    # Also set the weight of contacts already in the map whose
                    # first or second residue is shifted by 1 or 2 positions
                    for d in (1, 2):
                        alternate_positions = [
                            (bbcontact.res1_seq, bbcontact.res2_seq + d),
                            (bbcontact.res1_seq, bbcontact.res2_seq - d),
                            (bbcontact.res1_seq + d, bbcontact.res2_seq),
                            (bbcontact.res1_seq - d, bbcontact.res2_seq),
                        ]
                        for alt_pos in alternate_positions:
                            if alt_pos in contact_map:
                                contact_map[alt_pos].weight = 2
                else:
                    # Contact only found by bbcontacts: add it to the map
                    contact_map.add(bbcontact)

            # Restore the score ordering after contacts were added
            contact_map.sort(sort_key, reverse=True, inplace=True)

        return contact_map

[docs]    def subselect_decoys(self, decoys, decoy_format, mode='linear', subdistance_to_neighbor=24, **kwargs):
        """Subselect decoys excluding those not satisfying long-distance restraints

        Parameters
        ----------
        decoys : list, tuple
           A list containing paths to decoy files
        decoy_format : str
           The file format of ``decoys``
        mode : str, optional
           The subselection mode to use
            * scaled: keep the decoys with scaled scores of >= 0.5
            * linear: keep the top half of decoys
            * cutoff: Keep all decoys with satisfaction scores of >= 0.287
        subdistance_to_neighbor : int, optional
           The minimum distance between neighboring residues in the subselection [default: 24]
        **kwargs
           Job submission related keyword arguments

        Returns
        -------
        list
           A list of paths to the sub-selected decoys

        """
        from ample.util import ample_util
        from ample.util import workers_util

        # Compute the long range contact satisfaction on a per-decoy basis
        logger.info('Long-range contacts are defined with sequence separation of 24+')

        # Hack a custom copy of the contact map together that we can use with the script
        # All decoys should be sequence identical and thus we can just match it to the top
        contact_map = self._preprocess()
        contact_map.match(conkit.io.read(decoys[0], decoy_format).top_map, inplace=True)
        tmp_contact_file = tempfile.NamedTemporaryFile(delete=False)
        conkit.io.write(tmp_contact_file.name, 'casprr', contact_map)

        # Construct the job scripts
        job_scripts = []    # Hold job scripts
        log_files = []      # Hold paths to log files
        executable = 'conkit-precision.bat' if sys.platform.startswith('win') else 'conkit-precision'
        for decoy in decoys:
            # Some file names
            decoy_name = os.path.splitext(os.path.basename(decoy))[0]
            contact_name = os.path.splitext(os.path.basename(self.contact_file))[0]
            prefix = '{0}_{1}_'.format(contact_name, decoy_name)
            # Create the run scripts
            script = tempfile.NamedTemporaryFile(prefix=prefix, suffix=ample_util.SCRIPT_EXT, delete=False)

            # Construct the command
            # TODO: Get the log file business working properly
            cmd = [executable, '-d', subdistance_to_neighbor]
            # Decoy file and format - version dependent
            if StrictVersion(conkit.__version__) <= StrictVersion('0.6.3'):
                cmd += [decoy]
            else:
                cmd += [decoy, decoy_format]
            # Sequence file and format
            cmd += [self.sequence_file, self.sequence_format]
            # Contact file and format
            cmd += [tmp_contact_file.name, 'casprr']
            # Write the command to the script
            script.write(
                ample_util.SCRIPT_HEADER + os.linesep
                + " ".join(map(str, cmd)) + os.linesep
            )
            script.close()
            os.chmod(script.name, 0o777)
            job_scripts.append(script.name)
            # Save some more information
            log_files.append(os.path.splitext(script.name)[0] + ".log")

        # Execute the scripts
        success = workers_util.run_scripts(
            job_scripts=job_scripts,
            monitor=None,
            check_success=None,
            early_terminate=None,
            nproc=kwargs['nproc'] if 'nproc' in kwargs else 1,
            job_time=7200,          # Might be too long/short, taken from Rosetta modelling
            job_name='subselect',
            submit_cluster=kwargs['submit_cluster'] if 'submit_cluster' in kwargs else False,
            submit_qtype=kwargs['submit_qtype'] if 'submit_qtype' in kwargs else None,
            submit_queue=kwargs['submit_queue'] if 'submit_queue' in kwargs else False,
            submit_array=kwargs['submit_array'] if 'submit_array' in kwargs else None,
            submit_max_array=kwargs['submit_max_array'] if 'submit_max_array' in kwargs else None,
        )

        if not success:
            msg = "Error running decoy subselection"
            raise RuntimeError(msg)

        # Collate the scores
        scores = numpy.zeros(len(decoys))
        for i, (decoy, log, script) in enumerate(zip(decoys, log_files, job_scripts)):
            for line in open(log, 'r'):
                if line.startswith('Precision score'):
                    scores[i] = float(line.strip().split()[-1])
            os.unlink(log)
            os.unlink(script)

        # Subselect the decoys
        logger.info('Model selection mode: %s', mode)
        if mode == 'scaled':
            keep, throw = SubselectionAlgorithm.scaled(scores)
        elif mode == 'linear':
            keep, throw = SubselectionAlgorithm.linear(scores)
        elif mode == 'cutoff':
            keep, throw = SubselectionAlgorithm.cutoff(scores)
        else:
            msg = "Unknown sub-selection mode: {0}".format(mode)
            logger.critical(msg)
            raise ValueError(msg)

        # Some checks
        if len(keep) < 1:
            msg = "Number of decoys to keep is 0 - defaulting to keeping all"
            logger.warning(msg)
            keep, throw = range(len(decoys)), []

        logger.info('Excluding %d decoy(s) from ensembling', len(throw))

        # TODO: return the scores so we can store them in AMPLE dict
        # Return the list of decoys to keep
        return tuple([decoys[i] for i in keep])

[docs]    def summarize(self, plot_file, structure_file=None, structure_format=None, native_cutoff=8):
        """Process the contact file etc

        Parameters
        ----------
        plot_file : str
           The path to the contact map plot
        structure_file : str
           A reference structure file
        structure_format : str
           The format of ``structure_file``
        native_cutoff : int
           The distance cutoff for contact extraction from ``structure_file``

        Returns
        -------
        str
           The path to the contact map plot
        float
           The precision score, if calculated, else 0.0

        Raises
        ------
        ValueError
           A structure file also needs a structure format
        ValueError
           A structure format also needs structure file
        ValueError
           Unknown structure format

        """
        # Process the contact map according to the parameters defined here
        contact_map = self._preprocess()

        logger.debug(structure_file)
        logger.debug(structure_format)
        if structure_file and not structure_format:
            msg = "A structure file also needs a structure format"
            logger.critical(msg)
            raise ValueError(msg)
        elif structure_file and structure_format and structure_format not in conkit.io.CONTACT_FILE_PARSERS.keys():
            msg = "Unknown structure format"
            logger.critical(msg)
            raise ValueError(msg)
        elif structure_file and structure_format:
            logger.info(
                'Provided structure file and format are: {0} - {1}'.format(structure_file, structure_format)
            )
            structure_map = conkit.io.read(structure_file, structure_format).top_map
            contact_map.match(structure_map, inplace=True)

            # Calculate the precision score
            precision = contact_map.precision
        else:
            structure_map = None
            precision = 0.0

        # Draw a contact map plot
        conkit.plot.ContactMapFigure(contact_map, reference=structure_map, file_name=plot_file)

        return plot_file, precision

[docs]    def write_restraints(self, restraint_file, restraint_format, energy_function):
        """Write a list of restraints

        Parameters
        ----------
        restraint_file : str
           The file to write the restraints to
        restraint_format : str
           The restraints format, depends primarily on the program for which the restraints will be used
        energy_function : str
           The energy function

        Returns
        -------
        str
           The file the restraints were written to

        Raises
        ------
        ValueError
           Unknown restraint format
        ValueError
           Unknown Rosetta energy function
        ValueError
           Unknown SAINT2 energy function
        """
        # Process the contact map according to the parameters defined here
        contact_map = self._preprocess()

        if restraint_format not in ['rosetta', 'saint2']:
            msg = 'Unknown restraint format: {0}'.format(restraint_format)
            logger.critical(msg)
            raise ValueError(msg)
        elif restraint_format == 'rosetta' and not hasattr(energy_functions.RosettaFunctionConstructs, energy_function):
            msg = 'Unknown Rosetta energy function: {0} for {1}'.format(energy_function, restraint_format)
            logger.critical(msg)
            raise ValueError(msg)
        elif restraint_format == 'saint2' and not hasattr(energy_functions.Saint2FunctionConstructs, energy_function):
            msg = 'Unknown SAINT2 energy function: {0} for {1}'.format(energy_function, restraint_format)
            logger.critical(msg)
            raise ValueError(msg)

        with open(restraint_file, 'w') as f_out:

            if restraint_format == 'rosetta':
                construct = getattr(
                    energy_functions.RosettaFunctionConstructs, energy_function
                ).fget(energy_functions.RosettaFunctionConstructs)

                for contact in contact_map:
                    contact_dict = contact._to_dict()
                    contact_dict['atom1'] = 'CA' if contact.res1 == 'G' else 'CB'
                    contact_dict['atom2'] = 'CA' if contact.res2 == 'G' else 'CB'
                    contact_dict['energy_bonus'] = contact.weight * 15.00
                    contact_dict['scalar_score'] = contact.scalar_score * contact.weight
                    contact_dict['sigmoid_cutoff'] = energy_functions.DynamicDistances.cutoff(contact.res1, contact.res2)
                    contact_dict['sigmoid_slope'] = energy_functions.DynamicDistances.percentile(contact.res1, contact.res2)
                    f_out.write(construct.format(**contact_dict) + os.linesep)

            elif restraint_format == 'saint2':
                construct = getattr(
                    energy_functions.Saint2FunctionConstructs, 'DEFAULT'
                ).fget(energy_functions.Saint2FunctionConstructs)

                for contact in contact_map:
                    contact_dict = contact._to_dict()
                    f_out.write(construct.format(**contact_dict) + os.linesep)

        return restraint_file

[docs]    @staticmethod
    def check_options(optd):
        """Function to check that all contact files are available

        Raises
        ------
        ValueError
           You must provide ``-contact_file`` when using ``-bbcontacts_file`` or use as ``-contact_file`` instead
        ValueError
           Cannot find contact file
        ValueError
           Rosetta energy function unavailable

        """
        # Make sure contact file is provided with bbcontacts_file
        if not optd['contact_file'] and optd['bbcontacts_file']:
            msg = "You must provide -contact_file when using -bbcontacts_file or use as -contact_file instead"
            logger.critical(msg)
            raise ValueError(msg)

        # Check the existence of the contact file
        if optd['contact_file'] and not os.path.isfile(optd['contact_file']):
            msg = "Cannot find contact file:\n{0}".format(optd['contact_file'])
            logger.critical(msg)
            raise ValueError(msg)

        # Check the existence of the contact file
        if optd['bbcontacts_file'] and not os.path.isfile(optd['bbcontacts_file']):
            msg = "Cannot find contact file:\n{0}".format(optd['contact_file'])
            logger.critical(msg)
            raise ValueError(msg)

        # Check that the contact file format was provided
        if optd['contact_file'] and not optd['contact_format']:
            msg = "You must define the contact file format via -contact_format"
            logger.critical(msg)
            raise ValueError(msg)

        # Check that the contact file format is defined in ConKit
        if optd['contact_format'] not in conkit.io.CONTACT_FILE_PARSERS:
            msg = "The provided contact file format is not yet implemented"
            logger.critical(msg)
            raise ValueError(msg)

        # Make sure user selected energy function is pre-defined
        if optd['restraints_format'] == 'rosetta' and optd['energy_function']:
            if not hasattr(energy_functions.RosettaFunctionConstructs, optd['energy_function']):
                msg = "Rosetta energy function {0} unavailable".format(optd['energy_function'])
                logger.critical(msg)
                raise ValueError(msg)

        if optd['restraints_format'] == 'saint2' and optd['energy_function']:
            if not hasattr(energy_functions.Saint2FunctionConstructs, optd['energy_function']):
                msg = "SAINT2 energy function {0} unavailable".format(optd['energy_function'])
                logger.critical(msg)
                raise ValueError(msg)

        if optd['subselect_mode'] and optd['subselect_mode'].lower() not in SUBSELECTION_MODES:
            msg = "Subselection mode not valid"
            logger.critical(msg)
            raise ValueError(msg)