# Source code for ample.util.tm_util

#!/usr/bin/env ccp4-python

from __future__ import division

__author__ = "Felix Simkovic"
__date__ = "28 Jul 2016"
__version__ = 1.0

import itertools
import logging
import operator
import os
import random
import string
import sys
import tempfile
import warnings


from ample.parsers import alignment_parser
from ample.parsers import tm_parser
from ample.util import ample_util
from ample.util import pdb_edit
from ample.util import workers_util

try:
    from Bio import PDB
    from Bio import SeqIO
    BIOPYTHON_AVAILABLE = True
except ImportError:
    BIOPYTHON_AVAILABLE = False

logger = logging.getLogger(__name__)

class ModelData(object):
    """Container for the per-model results of a TM comparison.

    Slots are used to keep the memory footprint small when many
    models are scored in one run.
    """

    __slots__ = ('model_name', 'structure_name', 'model_fname', 'structure_fname',
                 'log_fname', 'tmscore', 'rmsd', 'nr_residues_common', 'gdtts',
                 'gdtha', 'maxsub', 'seq_id')

    def __init__(self, model_name, structure_name, model_fname, structure_fname,
                 log_fname, tmscore, rmsd, nr_residues_common):
        """Store the mandatory comparison data; optional scores default to zero."""
        provided = (model_name, structure_name, model_fname, structure_fname,
                    log_fname, tmscore, rmsd, nr_residues_common)
        # The first eight slots map 1:1 onto the constructor arguments
        for attr, value in zip(self.__slots__[:8], provided):
            setattr(self, attr, value)
        # Parser-specific scores — filled in later only if the parser has them
        for attr in ('gdtts', 'gdtha', 'maxsub'):
            setattr(self, attr, 0.0)
        self.seq_id = 0

    def _asdict(self):
        """Convert the object to a dictionary"""
        return dict((k, getattr(self, k)) for k in self.__slots__)
class TMapps(object):
    """
    Superclass of TMscore and TMalign

    Attributes
    ----------
    entries : list
        List containing the TMscore entries on a per-model basis
    structure : str
        Path to the reference structure
    method : str
        One of [TMscore|TMalign]
    executable : str
        Path to the TMscore executable
    work_dir : str
        Path to the working directory

    Todo
    ----
    * Function to return the entries as numpy matrix
    * Function to return the entries in a pandas dataframe

    """

    def __init__(self, executable, method, wdir=None, **kwargs):
        """
        Parameters
        ----------
        executable : str
            Path to the executable
        method : str
            One of [TMscore|TMalign]
        wdir : str, optional
            Path to the working directory [defaults to the current directory]
        **kwargs
            Cluster submission options: nproc, submit_cluster, submit_qtype,
            submit_queue, submit_array, submit_max_array
        """
        self.entries = []
        self.executable = executable
        # Stored lower-case so later comparisons are case-insensitive
        self.method = method.lower()
        self.work_dir = wdir if wdir else os.getcwd()

        # Cluster submission options — dict.get replaces the verbose
        # "kwargs[k] if k in kwargs else default" pattern
        self._nproc = kwargs.get('nproc', 1)
        self._submit_cluster = kwargs.get('submit_cluster', False)
        self._submit_qtype = kwargs.get('submit_qtype', None)
        self._submit_queue = kwargs.get('submit_queue', None)
        self._submit_array = kwargs.get('submit_array', True)
        self._submit_max_array = kwargs.get('submit_max_array', None)
[docs] def comparison(self, models, structures): """ Compare a list of model structures to a second list of reference structures Parameters ---------- models : list List containing the paths to the model structure files structures : list List containing the paths to the reference structure files Returns ------- entries : list List of TMscore data entries on a per-model basis """ if len(models) < 1 or len(structures) < 1: msg = 'No model structures provided' if len(models) < 1 else \ 'No reference structures provided' logger.critical(msg) raise RuntimeError(msg) elif len(structures) == 1: logger.info('Using single structure provided for all model comparisons') structures = [structures[0] for _ in xrange(len(models))] elif len(models) != len(structures): msg = "Unequal number of models and structures" logger.critical(msg) raise RuntimeError(msg) # Create a logfile parser if self.method == "tmalign": pt = tm_parser.TMalignLogParser() elif self.method == "tmscore": pt = tm_parser.TMscoreLogParser() else: msg = "Invalid method selected: ", self.method logger.critical(msg) raise RuntimeError(msg) # ======================================================================= # Iterate through the structure files and execute the TMscore comparisons # ======================================================================= logger.info('Using algorithm: {0}'.format(self.method)) logger.info('------- Evaluating decoys -------') # Construct the job scripts data_entries = [] # Store some data job_scripts = [] # Hold job scripts log_files = [] # Hold paths to log files for model_pdb, structure_pdb in zip(models, structures): # Some file names model_name = os.path.splitext(os.path.basename(model_pdb))[0] structure_name = os.path.splitext(os.path.basename(structure_pdb))[0] prefix = '{0}_{1}_{2}'.format(model_name, structure_name, self.method) if not os.path.isfile(model_pdb): logger.warning("Cannot find: {0}".format(model_pdb)) continue elif not os.path.isfile(structure_pdb): 
logger.warning("Cannot find: {0}".format(structure_pdb)) continue # Create the run scripts script = tempfile.NamedTemporaryFile(prefix=prefix, suffix=ample_util.SCRIPT_EXT, delete=False) script.write(ample_util.SCRIPT_HEADER + os.linesep * 2) script.write('{exe} {model} {reference} {sep}{sep}'.format( exe=self.executable, model=model_pdb, reference=structure_pdb, sep=os.linesep, )) script.close() os.chmod(script.name, 0o777) job_scripts.append(script.name) # Save some more information data_entries.append([model_name, structure_name, model_pdb, structure_pdb]) log_files.append(os.path.splitext(script.name)[0] + ".log") # Execute the scripts logger.info('Executing TManalysis scripts') logger.disabled = True success = workers_util.run_scripts( job_scripts=job_scripts, monitor=None, check_success=None, early_terminate=None, nproc=self._nproc, job_time=7200, # Might be too long/short, taken from Rosetta modelling job_name='tm_analysis', submit_cluster=self._submit_cluster, submit_qtype=self._submit_qtype, submit_queue=self._submit_queue, submit_array=self._submit_array, submit_max_array=self._submit_max_array ) logger.disabled = False if not success: msg = "Error running TManalysis" raise RuntimeError(msg) # Extract the data entries = [] for entry, log, script in zip(data_entries, log_files, job_scripts): try: # Reset the TM log parser to default values pt.reset() # Parse the TM method logfile to extract the data pt.parse(log) except Exception: msg = "Error processing the {0} log file: {1}".format(self.method, log) logger.critical(msg) log = "None" model_name, structure_name, model_pdb, structure_pdb = entry _entry = self._store(model_name, structure_name, model_pdb, structure_pdb, log, pt) entries.append(_entry) os.unlink(script) self.entries = entries return entries
def _get_iterator(self, all_vs_all): """ Parameters ---------- all_vs_all: bool Returns ------- function """ # Use different itertools functions depending on the comparison type if all_vs_all: logger.info("All-vs-all comparison of models and structures") iterator = itertools.product # yields an iterator of all unique combinations else: logger.info("Direct comparison of models and structures") iterator = itertools.izip # yields a zipped iterator return iterator def _store(self, model_name, structure_name, model_pdb, structure_pdb, logfile, pt): # Generic data that either both parsers have or are defined independently of the parser model = ModelData(model_name, structure_name, model_pdb, structure_pdb, logfile, pt.tm, pt.rmsd, pt.nr_residues_common) # Specific attributes that either but not both parsers have if hasattr(pt, 'gdtts'): model.gdtts = pt.gdtts if hasattr(pt, 'gdtha'): model.gdtha = pt.gdtha if hasattr(pt, 'maxsub'): model.maxsub = pt.maxsub if hasattr(pt, 'seq_id'): model.seq_id = pt.seq_id return model._asdict()
[docs] @staticmethod def binary_avail(binary): """Check if TM binary is available Paramaters ---------- binary : str The binary name of `TMscore` or `TMalign` Returns ------- bool Raises ------ ValueError The binary is not `TMalign` or `TMscore` """ if binary.lower() == 'tmalign': exe_name = "TMalign" + ample_util.EXE_EXT elif binary.lower() == 'tmscore': exe_name = "TMscore" + ample_util.EXE_EXT else: raise ValueError('Provide one of TMalign or TMscore') try: ample_util.find_exe(exe_name) except: return False return True
class TMalign(TMapps):
    """
    Wrapper to handle TMalign scoring for one or more structures

    Examples
    --------
    >>> models = ["<MODEL_1>", "<MODEL_2>", "<MODEL_3>"]
    >>> references = ["<REFERENCE_1>", "<REFERENCE>", "<REFERENCE>"]
    >>> tm = TMalign("<PATH_TO_EXE>")
    >>> entries = tm.compare_structures(models, references)
    """

    def __init__(self, executable, wdir=None, **kwargs):
        """Instantiate the wrapper around a TMalign `executable`."""
        # Delegate to the TMapps base class, pinning the method name
        TMapps.__init__(self, executable, "TMalign", wdir=wdir, **kwargs)
[docs] def compare_structures(self, models, structures, all_vs_all=False): """ Compare a list of model structures to a second list of reference structures Parameters ---------- models : list List containing the paths to the model structure files structures : list List containing the paths to the reference structure files all_vs_all : bool Flag to compare all models against all structures Returns ------- entries : list """ # Check what we are comparing if len(structures) == 1: logger.info('Using single structure provided for all model comparisons') structures = [structures[0] for _ in xrange(len(models))] # The models parsed forward to the comparison models_to_compare, structures_to_compare = [], [] combination_iterator = self._get_iterator(all_vs_all) for (model, structure) in combination_iterator(models, structures): models_to_compare.append(model) structures_to_compare.append(structure) return self.comparison(models_to_compare, structures_to_compare)
class TMscore(TMapps):
    """
    Wrapper to handle TMscoring for one or more structures

    Examples
    --------
    >>> models = ["<MODEL_1>", "<MODEL_2>", "<MODEL_3>"]
    >>> references = ["<REFERENCE_1>", "<REFERENCE>", "<REFERENCE>"]
    >>> tm = TMscore("<PATH_TO_EXE>")
    >>> entries = tm.compare_structures(models, references)
    """

    def __init__(self, executable, wdir=None, **kwargs):
        """Instantiate the wrapper around a TMscore `executable`."""
        # Delegate to the TMapps base class, pinning the method name
        TMapps.__init__(self, executable, "TMscore", wdir=wdir, **kwargs)
    def compare_structures(self, models, structures, fastas=None, all_vs_all=False):
        """
        Compare a list of model structures to a second list of reference structures

        Parameters
        ----------
        models : list
            List containing the paths to the model structure files
        structures : list
            List containing the paths to the reference structure files
        fastas : list
            List containing the paths to the FASTA files
        all_vs_all : bool
            Flag to compare all models against all structures

        Returns
        -------
        entries : list
            List of TMscore data entries on a per-model basis

        Raises
        ------
        RuntimeError
            Biopython is unavailable, or model/structure alignments differ in length

        Notes
        -----
        If a FASTA sequence is provided, a much more accurate comparison can be
        carried out. However, to by-pass this there is also an option to run the
        comparison without it. This might work just fine for larger models.
        """
        if not BIOPYTHON_AVAILABLE:
            raise RuntimeError("Biopython is not available")

        # Check what we are comparing — a single structure/FASTA is re-used
        # for every model
        if structures and len(structures) == 1:
            logger.info('Using single structure provided for all model comparisons')
            structures = [structures[0] for _ in xrange(len(models))]
        if fastas and len(fastas) == 1:
            logger.info('Using single FASTA provided for all model comparisons')
            fastas = [fastas[0] for _ in xrange(len(models))]

        # The models parsed forward to the comparison
        models_to_compare, structures_to_compare = [], []
        combination_iterator = self._get_iterator(all_vs_all)

        if fastas:
            # Determine the iterator and create the combinations to be compared
            for (model, structure, fasta) in combination_iterator(models, structures, fastas):
                # Extract the FASTA sequence; only the first record is used
                fasta_record = list(SeqIO.parse(open(fasta, 'r'), 'fasta'))[0]
                # 1-based (position, residue) pairs over the full sequence
                fasta_data = [(i + 1, j) for i, j in enumerate(str(fasta_record.seq))]

                # Extract per-residue (index, one-letter-code) info from each PDB
                model_data = list(self._pdb_info(model))
                structure_data = list(self._pdb_info(structure))

                # Align the model sequence to the FASTA sequence: pad positions
                # missing from the model with gap characters, then restore order
                for fasta_pos in fasta_data:
                    if fasta_pos not in model_data:
                        model_data.append((fasta_pos[0], "-"))
                model_data.sort(key=operator.itemgetter(0))

                # Align the structure sequence to the (FASTA-completed) model
                # sequence. NOTE(review): align_sequences presumably returns a
                # pair of equal-length gapped sequences — confirm in
                # alignment_parser. zip(*data)[1] extracts the residue column
                # (Python 2: zip returns a list, so indexing works).
                aln_parser = alignment_parser.AlignmentParser()
                fasta_structure_aln = aln_parser.align_sequences("".join(zip(*fasta_data)[1]),
                                                                 "".join(zip(*structure_data)[1]))

                # Remove parts of the alignment that are gaps in both sequences
                to_remove = []
                _alignment = zip("".join(zip(*model_data)[1]), fasta_structure_aln[1])
                for i, (model_res, structure_res) in enumerate(_alignment):
                    if model_res == "-" and structure_res == "-":
                        to_remove.append(i)
                # Pop from the back so earlier indices stay valid
                for i in reversed(to_remove):
                    _alignment.pop(i)

                model_aln = "".join(zip(*_alignment)[0])
                structure_aln = "".join(zip(*_alignment)[1])

                # Sanity check: the two gapped sequences must stay in register
                if len(model_aln) != len(structure_aln):
                    msg = "Unequal lengths of your model and structure sequences"
                    logger.critical(msg)
                    raise RuntimeError(msg)

                # Modify the structures based on the aligned sequences
                pdb_combo = self._mod_structures(model_aln, structure_aln, model, structure)

                # Save the models
                models_to_compare.append(pdb_combo[0])
                structures_to_compare.append(pdb_combo[1])
        else:
            # No FASTA processing etc, pure comparisons of sequences
            for (model, structure) in combination_iterator(models, structures):
                # Extract some information from each PDB structure file
                model_data = list(self._pdb_info(model))
                structure_data = list(self._pdb_info(structure))

                # Align the sequences to see how much of the predicted decoys
                # are in the xtal
                alignment = alignment_parser.AlignmentParser().align_sequences("".join(zip(*model_data)[1]),
                                                                               "".join(zip(*structure_data)[1]))

                # Redundant but keeps the data shape identical to the FASTA branch
                alignment = zip(alignment[0], alignment[1])

                model_aln = "".join(zip(*alignment)[0])
                structure_aln = "".join(zip(*alignment)[1])

                if len(model_aln) != len(structure_aln):
                    msg = "Unequal lengths of your model and structure sequences"
                    logger.critical(msg)
                    raise RuntimeError(msg)

                # Modify the structures based on the aligned sequences
                pdb_combo = self._mod_structures(model_aln, structure_aln, model, structure)

                # Save the models
                models_to_compare.append(pdb_combo[0])
                structures_to_compare.append(pdb_combo[1])

        return self.comparison(models_to_compare, structures_to_compare)
def _mod_structures(self, model_aln, structure_aln, model_pdb, structure_pdb): """ Parameters ---------- model_aln : str A string containing the aligned sequence of the model structure_aln : str A string containing the alignment sequence of the structure model_pdb : str The path to the model pdb file structure_pdb : str The path to the structure pdb file Returns ------- model_pdb_ret : str The path to the modified model pdb file structure_pdb_ret : str The path to the modified structure pdb file """ # ================================ # File definitions # ================================ # Create a storage for the files work_dir_mod = os.path.join(self.work_dir, "tm_util_pdbs") if not os.path.isdir(work_dir_mod): os.mkdir(work_dir_mod) # Create a random file suffix to avoid overwriting file names if duplicate # Taken from http://stackoverflow.com/a/2257449 random_suffix = ''.join(random.SystemRandom().choice(string.ascii_lowercase + string.digits) for _ in range(10)) # File names and output files model_name = os.path.basename(model_pdb).rsplit(".", 1)[0] model_pdb_ret = os.path.join(work_dir_mod, "_".join([model_name, random_suffix, "mod.pdb"])) structure_name = os.path.basename(structure_pdb).rsplit(".", 1)[0] structure_pdb_ret = os.path.join(work_dir_mod, "_".join([structure_name, random_suffix, "mod.pdb"])) # Check if the files we are to create for comparison do not exist if os.path.isfile(model_pdb_ret) or os.path.isfile(structure_pdb_ret): msg = "Comparison structures exist. 
Move, delete or rename before continuing" logger.critical(msg) raise RuntimeError(msg) # Create temporary files _model_pdb_tmp_stage1 = ample_util.tmp_file_name(delete=False, directory=work_dir_mod, suffix=".pdb") _model_pdb_tmp_stage2 = ample_util.tmp_file_name(delete=False, directory=work_dir_mod, suffix=".pdb") _structure_pdb_tmp_stage1 = ample_util.tmp_file_name(delete=False, directory=work_dir_mod, suffix=".pdb") _structure_pdb_tmp_stage2 = ample_util.tmp_file_name(delete=False, directory=work_dir_mod, suffix=".pdb") # ================================== # File manipulation and modification # ================================== # Get the gap positions in both sequences model_gaps = self._find_gaps(model_aln) structure_gaps = self._find_gaps(structure_aln) # Renumber the pdb files - required in case there are any gaps pdb_edit.renumber_residues_gaps(model_pdb, _model_pdb_tmp_stage1, model_gaps) pdb_edit.renumber_residues_gaps(structure_pdb, _structure_pdb_tmp_stage1, structure_gaps) # Determine the gap indeces model_gaps_indeces = [i+1 for i, is_gap in enumerate(model_gaps) if is_gap] structure_gaps_indeces = [i + 1 for i, is_gap in enumerate(structure_gaps) if is_gap] # Use gaps of other sequence to even out pdb_edit.select_residues(_model_pdb_tmp_stage1, _model_pdb_tmp_stage2, delete=structure_gaps_indeces) pdb_edit.select_residues(_structure_pdb_tmp_stage1, _structure_pdb_tmp_stage2, delete=model_gaps_indeces) # Renumber the pdb files - required by TMscore binary pdb_edit.renumber_residues(_model_pdb_tmp_stage2, model_pdb_ret) pdb_edit.renumber_residues(_structure_pdb_tmp_stage2, structure_pdb_ret) # ================================== # Checks and validations # ================================== # Extract some information from each PDB structure file _model_data = list(self._pdb_info(model_pdb_ret)) _structure_data = list(self._pdb_info(structure_pdb_ret)) # Make sure our structures contain the same residues with correct indeces if set(_model_data) != 
set(_structure_data): msg = "Residues in model and structure non-identical. Affected PDBs {0} - {1}".format(model_name, structure_name) logger.critical(msg) raise RuntimeError(msg) # Remove the temporary files for f in [_model_pdb_tmp_stage1, _model_pdb_tmp_stage2, _structure_pdb_tmp_stage1, _structure_pdb_tmp_stage2]: os.unlink(f) return model_pdb_ret, structure_pdb_ret def _find_gaps(self, seq): """ Identify gaps in the protein chain Parameters ---------- seq : str String of amino acids Returns ------- indexes : list List of booleans that contain gaps """ return [True if char == "-" else False for char in seq] def _pdb_info(self, pdb): """ Obtain the pdb indeces and residue names Parameters ---------- pdb : str The path to a PDB file Yields ------ list A list containing per residue information """ if not BIOPYTHON_AVAILABLE: raise RuntimeError("Biopython is not available") with warnings.catch_warnings(): logger.debug("Suppressing BIOPYTHON warnings for PDBParser") warnings.simplefilter("ignore") structure = PDB.PDBParser().get_structure("pdb", pdb) # Weird problem with get_...() methods if MODEL is not explicitly stated in # PDB file. Thus, just iterate over everything return when top chain completed for model in structure: for i, chain in enumerate(model): if i > 0: return for residue in chain: hetero, res_seq, _ = residue.get_id() if hetero.strip(): logger.debug("Hetero atom detected in {0}: {1}".format(pdb, res_seq)) continue resname_three = residue.resname if resname_three == "MSE": resname_three = "MET" resname_one = PDB.Polypeptide.three_to_one(resname_three) yield (res_seq, resname_one) def _residue_one(self, pdb): """ Find the first residue index in a pdb structure Parameters ---------- pdb : str Path to a structure file in PDB format Returns ------- int Residue sequence index of first residue in structure file """ for line in open(pdb, 'r'): if line.startswith("ATOM"): line = line.split() return int(line[5])
def main():
    """Command-line driver: score models against references with TMalign/TMscore
    and write the results to ``tm_results.csv`` in the run directory.

    Returns
    -------
    entries : list or None
        The comparison entries, or None when no TM application was selected
    """
    import argparse
    import csv

    parser = argparse.ArgumentParser()
    parser.add_argument('--allvall', action="store_true", help="All vs all comparison")
    # BUGFIX: the help text claimed the default was 'warning' while the actual
    # default is 'info'
    parser.add_argument('--log', default='info', choices=['debug', 'info', 'warning', 'error'],
                        help="logging level (defaults to 'info')")
    # parser.add_argument('--purge', action="store_true", help="Remove the run directory")
    parser.add_argument('--rundir', default=os.getcwd(), help="Run directory")
    parser.add_argument('-f', '-fasta', dest="fastas", nargs="+", default=None)
    parser.add_argument('-m', '-models', dest="models", nargs="+", required=True)
    parser.add_argument('-s', '-structures', dest="structures", nargs="+", required=True)
    # Exactly one TM application may be chosen
    tmapp = parser.add_mutually_exclusive_group()
    tmapp.add_argument('-tm', '-tmscore', dest="tmscore", type=str)
    tmapp.add_argument('-ta', '-tmalign', dest="tmalign", type=str)
    args = parser.parse_args()

    # Set up some very basic logging - taken from
    # http://stackoverflow.com/questions/30824981/do-i-need-to-explicitly-check-for-name-main-before-calling-getlogger
    logging.basicConfig(level=getattr(logging, args.log.upper(), None),
                        format='%(levelname)s: %(message)s')

    # Make the run directory if it doesn't exist
    if not os.path.isdir(args.rundir):
        os.mkdir(args.rundir)

    # Perform the comparison
    if args.tmalign:
        entries = TMalign(args.tmalign, wdir=args.rundir).compare_structures(
            args.models, args.structures, all_vs_all=args.allvall)
    elif args.tmscore:
        entries = TMscore(args.tmscore, wdir=args.rundir).compare_structures(
            args.models, args.structures, fastas=args.fastas, all_vs_all=args.allvall)
    else:
        entries = None

    if entries:
        csv_file = os.path.join(args.rundir, 'tm_results.csv')
        # 'wb' is correct for the csv module under Python 2 (this module uses
        # xrange/izip elsewhere); under Python 3 it would be 'w' with newline=''
        with open(csv_file, 'wb') as f_out:
            w = csv.DictWriter(f_out, entries[0].keys())
            w.writeheader()
            for entry in entries:
                w.writerow(entry)
    return entries
# Script entry point: run the comparison and exit cleanly
if __name__ == "__main__":
    main()
    sys.exit(0)