Source code for ample.util.tm_util

#!/usr/bin/env ccp4-python

from __future__ import division

__author__ = "Felix Simkovic & Jens Thomas"
__date__ = "11 Apr 2018"
__version__ = 1.1

import itertools
import logging
import operator
import os
import random
import string
import sys
import warnings

from ample.parsers import alignment_parser, tm_parser
from ample.util import ample_util, pdb_edit

from pyjob.factory import TaskFactory
from pyjob.script import ScriptCollector, Script

try:
    from Bio import PDB, SeqIO
    BIOPYTHON_AVAILABLE = True
except ImportError:
    BIOPYTHON_AVAILABLE = False

logger = logging.getLogger(__name__)


class ModelData(object):
    """Class to store model data"""

    __slots__ = (
        'model_name',
        'structure_name',
        'model_fname',
        'structure_fname',
        'log_fname',
        'tmscore',
        'rmsd',
        'nr_residues_common',
        'gdtts',
        'gdtha',
        'maxsub',
        'seq_id',
    )

    def __init__(
        self, model_name, structure_name, model_fname, structure_fname, log_fname, tmscore, rmsd, nr_residues_common
    ):
        self.model_name = model_name
        self.structure_name = structure_name
        self.model_fname = model_fname
        self.structure_fname = structure_fname
        self.log_fname = log_fname
        self.tmscore = tmscore
        self.rmsd = rmsd
        self.nr_residues_common = nr_residues_common
        self.gdtts = 0.0
        self.gdtha = 0.0
        self.maxsub = 0.0
        self.seq_id = 0

    def _asdict(self):
        """Convert the object to a dictionary"""
        return {k: getattr(self, k) for k in self.__slots__}

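# A minimal usage sketch for ModelData (the values below are hypothetical):
# the class simply mirrors the fields parsed from a TM log file and exposes
# them as a plain dictionary via _asdict(), which is what TMapps._store()
# ultimately returns for each comparison.
#
#   >>> entry = ModelData("model_1", "1abc", "model_1.pdb", "1abc.pdb",
#   ...                   "model_1_1abc_tmscore.log", 0.65, 2.3, 120)
#   >>> entry._asdict()["tmscore"]
#   0.65
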
class TMapps(object):
    """Superclass of TMscore and TMalign

    Attributes
    ----------
    entries : list
       List containing the TMscore entries on a per-model basis
    structure : str
       Path to the reference structure
    method : str
       One of [TMscore|TMalign]
    executable : str
       Path to the TMscore executable
    work_dir : str
       Path to the working directory

    Todo
    ----
    * Function to return the entries as a numpy matrix
    * Function to return the entries in a pandas dataframe

    """

    def __init__(self, executable, method, wdir=".", **kwargs):
        """
        Parameters
        ----------
        executable : str
           Path to the executable
        method : str
           One of [TMscore|TMalign]
        wdir : str
           Path to the working directory

        """
        self.entries = []
        self.executable = executable
        self.method = method.lower()
        self.tmp_dir = None

        if wdir:
            self.work_dir = os.path.abspath(wdir)
        else:
            self.work_dir = os.getcwd()

        self._nproc = 1
        self._qtype = 'local'
        self._queue = None
        self._max_array_jobs = None
        if 'nproc' in kwargs:
            self._nproc = kwargs.pop('nproc')
        # if 'submit_cluster' in kwargs:
        #     self._qtype = kwargs.pop('submit_cluster')
        # if 'submit_queue' in kwargs:
        #     self._queue = kwargs.pop('submit_queue')
        # if 'submit_max_array' in kwargs:
        #     self._max_array_jobs = kwargs.pop('submit_max_array')
        if len(kwargs) > 0:
            logger.debug('Ignoring keyword arguments: %s', ' '.join(kwargs.keys()))

        self.tmp_dir = os.path.join(self.work_dir, "tm_util_pdbs")
        if not os.path.isdir(self.tmp_dir):
            os.mkdir(self.tmp_dir)

    def comparison(self, models, structures):
        """
        Compare a list of model structures to a second list of reference structures

        Parameters
        ----------
        models : list
           List containing the paths to the model structure files
        structures : list
           List containing the paths to the reference structure files

        Returns
        -------
        entries : list
           List of TMscore data entries on a per-model basis

        """
        if len(models) < 1 or len(structures) < 1:
            msg = 'No model structures provided' if len(models) < 1 else 'No reference structures provided'
            logger.critical(msg)
            raise RuntimeError(msg)
        elif len(structures) == 1:
            logger.info('Using single structure provided for all model comparisons')
            structures = [structures[0] for _ in xrange(len(models))]
        elif len(models) != len(structures):
            msg = "Unequal number of models and structures!"
            logger.critical(msg)
            raise RuntimeError(msg)

        if self.method == "tmalign":
            pt = tm_parser.TMalignLogParser()
        elif self.method == "tmscore":
            pt = tm_parser.TMscoreLogParser()
        else:
            msg = "Invalid method selected: %s" % self.method
            logger.critical(msg)
            raise RuntimeError(msg)

        logger.info('Using algorithm: {0}'.format(self.method))
        logger.info('------- Evaluating decoys -------')

        # Keep the script paths so logs can be matched up and the scripts removed afterwards
        data_entries, log_files, job_scripts = [], [], []
        collector = ScriptCollector(None)
        for model_pdb, structure_pdb in zip(models, structures):
            model_name = os.path.splitext(os.path.basename(model_pdb))[0]
            structure_name = os.path.splitext(os.path.basename(structure_pdb))[0]
            stem = "_".join([model_name, structure_name, self.method])

            if os.path.isfile(model_pdb) and os.path.isfile(structure_pdb):
                data_entries.append([model_name, structure_name, model_pdb, structure_pdb])
                script = Script(directory=self.tmp_dir, prefix="tmscore_", stem=stem)
                script.append(" ".join([self.executable, model_pdb, structure_pdb]))
                collector.add(script)
                job_scripts.append(script.path)
                log_files.append(os.path.splitext(script.path)[0] + ".log")
            else:
                if not os.path.isfile(model_pdb):
                    logger.warning("Cannot find: %s", model_pdb)
                if not os.path.isfile(structure_pdb):
                    logger.warning("Cannot find: %s", structure_pdb)
                continue

        logger.info('Executing TManalysis scripts')
        with TaskFactory(
            self._qtype,
            collector,
            name="tmscore",
            nprocesses=self._nproc,
            max_array_size=self._max_array_jobs,
            queue=self._queue,
            shell="/bin/bash",
        ) as task:
            task.run()
            task.wait(interval=1)

        self.entries = []
        for entry, log, script in zip(data_entries, log_files, job_scripts):
            try:
                pt.reset()
                pt.parse(log)
            except Exception:
                logger.critical("Error processing the %s log file: %s", self.method, log)
                log = "None"
            model_name, structure_name, model_pdb, structure_pdb = entry
            _entry = self._store(model_name, structure_name, model_pdb, structure_pdb, log, pt)
            self.entries.append(_entry)
            os.unlink(script)
        return self.entries

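    # A sketch of the data returned by comparison(), assuming tm is a TMscore or
    # TMalign instance and the values below are hypothetical: each entry is the
    # dictionary produced by ModelData._asdict() for one model/reference pair,
    # so the list can be passed straight to pandas.DataFrame.
    #
    #   >>> entries = tm.comparison(["model_1.pdb"], ["1abc.pdb"])
    #   >>> entries[0]["model_name"], entries[0]["tmscore"], entries[0]["rmsd"]
    #   ('model_1', 0.65, 2.3)
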
    def _get_iterator(self, all_vs_all):
        """
        Parameters
        ----------
        all_vs_all : bool

        Returns
        -------
        function

        """
        if all_vs_all:
            logger.info("All-vs-all comparison of models and structures")
            return itertools.product
        else:
            logger.info("Direct comparison of models and structures")
            return itertools.izip

    def _store(self, model_name, structure_name, model_pdb, structure_pdb, logfile, pt):
        model = ModelData(
            model_name, structure_name, model_pdb, structure_pdb, logfile, pt.tm, pt.rmsd, pt.nr_residues_common
        )
        if hasattr(pt, 'gdtts'):
            model.gdtts = pt.gdtts
        if hasattr(pt, 'gdtha'):
            model.gdtha = pt.gdtha
        if hasattr(pt, 'maxsub'):
            model.maxsub = pt.maxsub
        if hasattr(pt, 'seq_id'):
            model.seq_id = pt.seq_id
        return model._asdict()

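    # For illustration (hypothetical lists): with all_vs_all=True the iterator
    # returned by _get_iterator() pairs every model with every structure,
    # whereas with all_vs_all=False the two lists are paired element-wise.
    #
    #   >>> list(itertools.product(["m1", "m2"], ["s1", "s2"]))
    #   [('m1', 's1'), ('m1', 's2'), ('m2', 's1'), ('m2', 's2')]
    #   >>> list(itertools.izip(["m1", "m2"], ["s1", "s2"]))
    #   [('m1', 's1'), ('m2', 's2')]
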
    @staticmethod
    def binary_avail(binary):
        """Check if a TM binary is available

        Parameters
        ----------
        binary : str
           The binary name of `TMscore` or `TMalign`

        Returns
        -------
        bool

        Raises
        ------
        ValueError
           The binary is not `TMalign` or `TMscore`

        """
        if binary.lower() == 'tmalign':
            exe_name = "TMalign" + ample_util.EXE_EXT
        elif binary.lower() == 'tmscore':
            exe_name = "TMscore" + ample_util.EXE_EXT
        else:
            raise ValueError('Provide one of TMalign or TMscore')
        try:
            ample_util.find_exe(exe_name)
        except Exception:
            return False
        return True

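# binary_avail() can be called before constructing a wrapper to check that the
# required executable is on the PATH; the return value below is illustrative
# and depends on the local TMscore/TMalign installation.
#
#   >>> TMapps.binary_avail("TMscore")
#   True
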
class TMalign(TMapps):
    """
    Wrapper to handle TMalign scoring for one or more structures

    Examples
    --------
    >>> models = ["<MODEL_1>", "<MODEL_2>", "<MODEL_3>"]
    >>> references = ["<REFERENCE_1>", "<REFERENCE_2>", "<REFERENCE_3>"]
    >>> tm = TMalign("<PATH_TO_EXE>")
    >>> entries = tm.compare_structures(models, references)

    """

    def __init__(self, executable, wdir=None, **kwargs):
        super(TMalign, self).__init__(executable, "TMalign", wdir=wdir, **kwargs)

    def compare_structures(self, models, structures, all_vs_all=False):
        """
        Compare a list of model structures to a second list of reference structures

        Parameters
        ----------
        models : list
           List containing the paths to the model structure files
        structures : list
           List containing the paths to the reference structure files
        all_vs_all : bool
           Flag to compare all models against all structures

        Returns
        -------
        entries : list

        """
        if len(structures) == 1:
            logger.info('Using single structure provided for all model comparisons')
            structures = [structures[0] for _ in xrange(len(models))]

        models_to_compare, structures_to_compare = [], []
        combination_iterator = self._get_iterator(all_vs_all)
        for (model, structure) in combination_iterator(models, structures):
            models_to_compare.append(model)
            structures_to_compare.append(structure)

        return self.comparison(models_to_compare, structures_to_compare)

class TMscore(TMapps):
    """
    Wrapper to handle TMscore scoring for one or more structures

    Examples
    --------
    >>> models = ["<MODEL_1>", "<MODEL_2>", "<MODEL_3>"]
    >>> references = ["<REFERENCE_1>", "<REFERENCE_2>", "<REFERENCE_3>"]
    >>> tm = TMscore("<PATH_TO_EXE>")
    >>> entries = tm.compare_structures(models, references)

    """

    def __init__(self, executable, wdir=None, **kwargs):
        super(TMscore, self).__init__(executable, "TMscore", wdir=wdir, **kwargs)

    def compare_structures(self, models, structures, fastas=None, all_vs_all=False):
        """
        Compare a list of model structures to a second list of reference structures

        Parameters
        ----------
        models : list
           List containing the paths to the model structure files
        structures : list
           List containing the paths to the reference structure files
        fastas : list
           List containing the paths to the FASTA files
        all_vs_all : bool
           Flag to compare all models against all structures

        Returns
        -------
        entries : list
           List of TMscore data entries on a per-model basis

        Notes
        -----
        If a FASTA sequence is provided, a much more accurate comparison can be
        carried out. However, to by-pass this there is also an option to run the
        comparison without it. This might work just fine for larger models.

        """
        if not BIOPYTHON_AVAILABLE:
            raise RuntimeError("Biopython is not available")

        if structures and len(structures) == 1:
            logger.info('Using single structure provided for all model comparisons')
            structures = [structures[0] for _ in xrange(len(models))]
        if fastas and len(fastas) == 1:
            logger.info('Using single FASTA provided for all model comparisons')
            fastas = [fastas[0] for _ in xrange(len(models))]

        models_to_compare, structures_to_compare = [], []
        combination_iterator = self._get_iterator(all_vs_all)

        if fastas:
            for (model, structure, fasta) in combination_iterator(models, structures, fastas):
                fasta_record = list(SeqIO.parse(open(fasta, 'r'), 'fasta'))[0]
                fasta_data = [(i + 1, j) for i, j in enumerate(str(fasta_record.seq))]
                model_data = list(self._pdb_info(model))
                structure_data = list(self._pdb_info(structure))

                # Pad the model with gap entries for any FASTA position it does not cover
                for fasta_pos in fasta_data:
                    if fasta_pos not in model_data:
                        model_data.append((fasta_pos[0], "-"))
                model_data.sort(key=operator.itemgetter(0))

                aln_parser = alignment_parser.AlignmentParser()
                fasta_structure_aln = aln_parser.align_sequences(
                    "".join(zip(*model_data)[1]), "".join(zip(*structure_data)[1])
                )

                # Remove positions that are gaps in both aligned sequences
                to_remove = []
                _alignment = zip(fasta_structure_aln[0], fasta_structure_aln[1])
                for i, (model_res, structure_res) in enumerate(_alignment):
                    if model_res == "-" and structure_res == "-":
                        to_remove.append(i)
                for i in reversed(to_remove):
                    _alignment.pop(i)

                model_aln = "".join(zip(*_alignment)[0])
                structure_aln = "".join(zip(*_alignment)[1])

                if len(model_aln) != len(structure_aln):
                    msg = "Unequal lengths of your model and structure sequences"
                    logger.critical(msg)
                    raise RuntimeError(msg)

                pdb_combo = self._mod_structures(model_aln, structure_aln, model, structure)
                models_to_compare.append(pdb_combo[0])
                structures_to_compare.append(pdb_combo[1])
        else:
            for (model, structure) in combination_iterator(models, structures):
                model_data = list(self._pdb_info(model))
                structure_data = list(self._pdb_info(structure))

                alignment = alignment_parser.AlignmentParser().align_sequences(
                    "".join(zip(*model_data)[1]), "".join(zip(*structure_data)[1])
                )
                alignment = zip(alignment[0], alignment[1])
                model_aln = "".join(zip(*alignment)[0])
                structure_aln = "".join(zip(*alignment)[1])

                if len(model_aln) != len(structure_aln):
                    msg = "Unequal lengths of your model and structure sequences"
                    logger.critical(msg)
                    raise RuntimeError(msg)

                pdb_combo = self._mod_structures(model_aln, structure_aln, model, structure)
                models_to_compare.append(pdb_combo[0])
                structures_to_compare.append(pdb_combo[1])

        return self.comparison(models_to_compare, structures_to_compare)

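    # A sketch of a FASTA-guided comparison (hypothetical paths): the target
    # sequence is used to renumber and trim both PDB files so that only
    # commonly resolved residues are scored.
    #
    #   >>> tm = TMscore("<PATH_TO_EXE>", wdir=".")
    #   >>> entries = tm.compare_structures(["model_1.pdb"], ["1abc.pdb"],
    #   ...                                 fastas=["target.fasta"])
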
    def _mod_structures(self, model_aln, structure_aln, model_pdb, structure_pdb):
        """
        Parameters
        ----------
        model_aln : str
           A string containing the aligned sequence of the model
        structure_aln : str
           A string containing the aligned sequence of the structure
        model_pdb : str
           The path to the model pdb file
        structure_pdb : str
           The path to the structure pdb file

        Returns
        -------
        model_pdb_ret : str
           The path to the modified model pdb file
        structure_pdb_ret : str
           The path to the modified structure pdb file

        """
        random_suffix = ''.join(
            random.SystemRandom().choice(string.ascii_lowercase + string.digits) for _ in range(10)
        )
        model_name = os.path.basename(model_pdb).rsplit(".", 1)[0]
        model_pdb_ret = os.path.join(self.tmp_dir, "_".join([model_name, random_suffix, "mod.pdb"]))
        structure_name = os.path.basename(structure_pdb).rsplit(".", 1)[0]
        structure_pdb_ret = os.path.join(self.tmp_dir, "_".join([structure_name, random_suffix, "mod.pdb"]))

        if os.path.isfile(model_pdb_ret) or os.path.isfile(structure_pdb_ret):
            msg = "Comparison structures exist. Move, delete or rename before continuing"
            logger.critical(msg)
            raise RuntimeError(msg)

        _model_pdb_tmp_stage1 = ample_util.tmp_file_name(delete=False, directory=self.tmp_dir, suffix=".pdb")
        _model_pdb_tmp_stage2 = ample_util.tmp_file_name(delete=False, directory=self.tmp_dir, suffix=".pdb")
        _structure_pdb_tmp_stage1 = ample_util.tmp_file_name(delete=False, directory=self.tmp_dir, suffix=".pdb")
        _structure_pdb_tmp_stage2 = ample_util.tmp_file_name(delete=False, directory=self.tmp_dir, suffix=".pdb")

        model_gaps = self._find_gaps(model_aln)
        structure_gaps = self._find_gaps(structure_aln)

        pdb_edit.renumber_residues_gaps(model_pdb, _model_pdb_tmp_stage1, model_gaps)
        pdb_edit.renumber_residues_gaps(structure_pdb, _structure_pdb_tmp_stage1, structure_gaps)

        model_gaps_indices = [i + 1 for i, is_gap in enumerate(model_gaps) if is_gap]
        structure_gaps_indices = [i + 1 for i, is_gap in enumerate(structure_gaps) if is_gap]

        pdb_edit.select_residues(_model_pdb_tmp_stage1, _model_pdb_tmp_stage2, delete=structure_gaps_indices)
        pdb_edit.select_residues(_structure_pdb_tmp_stage1, _structure_pdb_tmp_stage2, delete=model_gaps_indices)

        pdb_edit.renumber_residues(_model_pdb_tmp_stage2, model_pdb_ret)
        pdb_edit.renumber_residues(_structure_pdb_tmp_stage2, structure_pdb_ret)

        _model_data = list(self._pdb_info(model_pdb_ret))
        _structure_data = list(self._pdb_info(structure_pdb_ret))

        # Alignment not always identical, let's aim for 90%
        identical = set(_model_data).intersection(set(_structure_data))
        if len(identical) / float(len(_model_data)) < 0.90:
            msg = "Differing residues in model and structure. Affected PDBs %s - %s\n%s\n%s"
            raise RuntimeError(msg % (model_name, structure_name, model_aln, structure_aln))

        files = [_model_pdb_tmp_stage1, _model_pdb_tmp_stage2, _structure_pdb_tmp_stage1, _structure_pdb_tmp_stage2]
        for f in files:
            os.unlink(f)

        if not os.path.isfile(model_pdb_ret):
            raise RuntimeError("Modified model %s does not exist!" % model_pdb_ret)
        if not os.path.isfile(structure_pdb_ret):
            raise RuntimeError("Modified reference %s does not exist!" % structure_pdb_ret)

        return model_pdb_ret, structure_pdb_ret

    def _find_gaps(self, seq):
        """
        Identify gaps in the protein chain

        Parameters
        ----------
        seq : str
           String of amino acids

        Returns
        -------
        indexes : list
           List of booleans indicating gap positions

        """
        return [char == "-" for char in seq]

    def _pdb_info(self, pdb):
        """
        Obtain the PDB residue indices and residue names

        Parameters
        ----------
        pdb : str
           The path to a PDB file

        Yields
        ------
        tuple
           A (residue index, one-letter residue name) pair per residue

        """
        if not BIOPYTHON_AVAILABLE:
            raise RuntimeError("Biopython is not available")
        with warnings.catch_warnings():
            logger.debug("Suppressing BIOPYTHON warnings for PDBParser")
            warnings.simplefilter("ignore")
            structure = PDB.PDBParser().get_structure("pdb", pdb)

        # Weird problem with get_...() methods if MODEL is not explicitly stated in
        # the PDB file. Thus, just iterate over everything and return once the first
        # chain has been processed.
        for model in structure:
            for i, chain in enumerate(model):
                if i > 0:
                    return
                for residue in chain:
                    hetero, res_seq, _ = residue.get_id()
                    hetero = hetero.strip().lower()
                    if hetero == "w":
                        continue
                    elif hetero:
                        msg = "Hetero atom {} detected in {} in residue {} --- please rename to ATOM or remove!"
                        raise TypeError(msg.format(hetero, pdb, res_seq))

                    resname_three = residue.resname
                    if resname_three == "MSE":
                        logger.warning("Treating MSE as MET!")
                        resname_three = "MET"
                    resname_one = PDB.Polypeptide.three_to_one(resname_three)
                    yield (res_seq, resname_one)

    def _residue_one(self, pdb):
        """
        Find the first residue index in a pdb structure

        Parameters
        ----------
        pdb : str
           Path to a structure file in PDB format

        Returns
        -------
        int
           Residue sequence index of the first residue in the structure file

        """
        for line in open(pdb, 'r'):
            if line.startswith("ATOM"):
                return int(line.split()[5])

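# For illustration, the gap mask produced by _find_gaps() marks alignment gap
# positions; _mod_structures() uses it to renumber and prune residues before
# the TM comparison. The expression below is exactly what _find_gaps() computes:
#
#   >>> [char == "-" for char in "AC-D-"]
#   [False, False, True, False, True]
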
def main():
    import argparse
    import pandas as pd

    parser = argparse.ArgumentParser()
    parser.add_argument('--allvall', action="store_true", help="All vs all comparison")
    parser.add_argument('--debug', action="store_true")
    parser.add_argument('--purge', action="store_true", help="Remove all temporary files")
    parser.add_argument('--rundir', default='.', help="Run directory")
    parser.add_argument('-f', '--fasta', dest="fastas", nargs="+", default=None)
    parser.add_argument('-m', '--models', dest="models", nargs="+", required=True)
    parser.add_argument('-s', '--structures', dest="structures", nargs="+", required=True)
    parser.add_argument('-t', '--threads', default=1, type=int)
    tmapp = parser.add_mutually_exclusive_group(required=True)
    tmapp.add_argument('-tm', '--tmscore', dest="tmscore", type=str)
    tmapp.add_argument('-ta', '--tmalign', dest="tmalign", type=str)
    args = parser.parse_args()

    loglevel = logging.INFO
    if args.debug:
        loglevel = logging.DEBUG
    logging.basicConfig(level=loglevel, format="%(levelname)s: %(message)s")

    if not os.path.isdir(args.rundir):
        os.mkdir(args.rundir)

    if args.fastas:
        args.fastas = [os.path.abspath(f) for f in args.fastas]
    args.models = [os.path.abspath(m) for m in args.models]
    args.structures = [os.path.abspath(s) for s in args.structures]

    kwargs = dict(wdir=args.rundir, nproc=args.threads)
    if args.tmalign:
        tmapp = TMalign(args.tmalign, **kwargs)
        tmapp.compare_structures(args.models, args.structures, all_vs_all=args.allvall)
    elif args.tmscore:
        tmapp = TMscore(args.tmscore, **kwargs)
        tmapp.compare_structures(args.models, args.structures, fastas=args.fastas, all_vs_all=args.allvall)
    else:
        raise RuntimeError("You're doomed if you get here!")

    df = pd.DataFrame(tmapp.entries)
    df.to_csv(os.path.join(args.rundir, 'tm_results.csv'), index=False)
    logger.info("Final TMscore table:\n\n" + df.to_string(index=False) + "\n")

    if args.purge and tmapp.tmp_dir:
        from shutil import rmtree
        rmtree(tmapp.tmp_dir)

    return tmapp.entries

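# Example command line (hypothetical file names; assumes the ample package is
# importable by ccp4-python and that the TMscore executable is on the PATH):
#
#   ccp4-python -m ample.util.tm_util -tm TMscore \
#       -m model_1.pdb model_2.pdb -s reference.pdb -f target.fasta \
#       --rundir tm_run --threads 2 --purge
#
# Results are written to <rundir>/tm_results.csv and printed as a table.
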
if __name__ == "__main__":
    main()
    sys.exit(0)