#!/usr/bin/env ccp4-python
from __future__ import division
__author__ = "Felix Simkovic & Jens Thomas"
__date__ = "11 Apr 2018"
__version__ = 1.1
import itertools
import logging
import operator
import os
import random
import string
import sys
import warnings
from ample.parsers import alignment_parser, tm_parser
from ample.util import ample_util, pdb_edit
from pyjob.factory import TaskFactory
from pyjob.script import ScriptCollector, Script
try:
from Bio import PDB, SeqIO
BIOPYTHON_AVAILABLE = True
except ImportError:
BIOPYTHON_AVAILABLE = False
logger = logging.getLogger(__name__)
class ModelData(object):
    """Container for the comparison results of a single model/structure pair.

    Scores that only some parsers provide (GDT-TS, GDT-HA, MaxSub, sequence
    identity) default to zero and are filled in later by ``TMapps._store``
    when the corresponding log parser exposes them.
    """

    # __slots__ keeps per-instance memory low when scoring many decoys and
    # doubles as the key list for _asdict().
    __slots__ = (
        'model_name',
        'structure_name',
        'model_fname',
        'structure_fname',
        'log_fname',
        'tmscore',
        'rmsd',
        'nr_residues_common',
        'gdtts',
        'gdtha',
        'maxsub',
        'seq_id',
    )

    def __init__(
        self, model_name, structure_name, model_fname, structure_fname, log_fname, tmscore, rmsd, nr_residues_common
    ):
        """Store the per-comparison data.

        Parameters
        ----------
        model_name : str
            Name of the model decoy
        structure_name : str
            Name of the reference structure
        model_fname : str
            Path to the model PDB file
        structure_fname : str
            Path to the reference PDB file
        log_fname : str
            Path to the TMscore/TMalign log file
        tmscore : float
            TM-score of the comparison
        rmsd : float
            RMSD of the comparison
        nr_residues_common : int
            Number of residues common to model and structure
        """
        self.model_name = model_name
        self.structure_name = structure_name
        self.model_fname = model_fname
        self.structure_fname = structure_fname
        self.log_fname = log_fname
        self.tmscore = tmscore
        self.rmsd = rmsd
        self.nr_residues_common = nr_residues_common
        # Optional scores — overwritten by TMapps._store when available
        self.gdtts = 0.0
        self.gdtha = 0.0
        self.maxsub = 0.0
        self.seq_id = 0

    def _asdict(self):
        """Convert the object to a dictionary keyed by ``__slots__``."""
        return {k: getattr(self, k) for k in self.__slots__}
class TMapps(object):
    """Superclass of TMscore and TMalign wrappers.

    Attributes
    ----------
    entries : list
        List containing the TMscore entries on a per-model basis
    executable : str
        Path to the TMscore/TMalign executable
    method : str
        One of [tmscore|tmalign] (lower-cased)
    work_dir : str
        Path to the working directory
    tmp_dir : str
        Directory holding run scripts, logs and temporary PDB files

    Todo
    ----
    * Function to return the entries as numpy matrix
    * Function to return the entries in a pandas dataframe
    """

    def __init__(self, executable, method, wdir=".", **kwargs):
        """Initialise a TM application wrapper.

        Parameters
        ----------
        executable : str
            Path to the executable
        method : str
            One of [TMscore|TMalign]
        wdir : str
            Path to the working directory
        **kwargs
            Only ``nproc`` is recognised; any other keyword is ignored
            with a debug message.
        """
        self.entries = []
        self.executable = executable
        self.method = method.lower()
        self.tmp_dir = None
        if wdir:
            self.work_dir = os.path.abspath(wdir)
        else:
            self.work_dir = os.getcwd()
        # Defaults for local execution. The cluster-submission keywords were
        # retired when job handling moved to pyjob's TaskFactory.
        self._nproc = 1
        self._qtype = 'local'
        self._queue = None
        self._max_array_jobs = None
        if 'nproc' in kwargs:
            self._nproc = kwargs.pop('nproc')
        if len(kwargs) > 0:
            logger.debug('Ignoring keyword arguments: %s', ' '.join(kwargs.keys()))
        self.tmp_dir = os.path.join(self.work_dir, "tm_util_pdbs")
        if not os.path.isdir(self.tmp_dir):
            os.mkdir(self.tmp_dir)

    def comparison(self, models, structures):
        """Compare a list of model structures to a second list of reference structures.

        Parameters
        ----------
        models : list
            List containing the paths to the model structure files
        structures : list
            List containing the paths to the reference structure files

        Returns
        -------
        entries : list
            List of TMscore data entries on a per-model basis

        Raises
        ------
        RuntimeError
            If no models/structures are provided, the list lengths differ,
            or the comparison method is unknown
        """
        if len(models) < 1 or len(structures) < 1:
            msg = 'No model structures provided' if len(models) < 1 else 'No reference structures provided'
            logger.critical(msg)
            raise RuntimeError(msg)
        elif len(structures) == 1:
            logger.info('Using single structure provided for all model comparisons')
            # BUG FIX: xrange is Python2-only
            structures = [structures[0] for _ in range(len(models))]
        elif len(models) != len(structures):
            msg = "Unequal number of models and structures!"
            logger.critical(msg)
            raise RuntimeError(msg)

        if self.method == "tmalign":
            pt = tm_parser.TMalignLogParser()
        elif self.method == "tmscore":
            pt = tm_parser.TMscoreLogParser()
        else:
            # BUG FIX: the original assigned a (format, arg) tuple to msg, so
            # the raised message was a tuple repr instead of a string.
            msg = "Invalid method selected: %s" % self.method
            logger.critical(msg)
            raise RuntimeError(msg)

        logger.info('Using algorithm: {0}'.format(self.method))
        logger.info('------- Evaluating decoys -------')
        data_entries, log_files, job_scripts = [], [], []
        collector = ScriptCollector(None)
        for model_pdb, structure_pdb in zip(models, structures):
            model_name = os.path.splitext(os.path.basename(model_pdb))[0]
            structure_name = os.path.splitext(os.path.basename(structure_pdb))[0]
            stem = "_".join([model_name, structure_name, self.method])
            if os.path.isfile(model_pdb) and os.path.isfile(structure_pdb):
                data_entries.append([model_name, structure_name, model_pdb, structure_pdb])
                # BUG FIX: keyword was misspelled 'prefic' in the original
                script = Script(directory=self.tmp_dir, prefix="tmscore_", stem=stem)
                script.append(" ".join([self.executable, model_pdb, structure_pdb]))
                collector.add(script)
                job_scripts.append(script)
                # NOTE(review): assumes pyjob writes stdout to <script>.log
                # alongside the script file — confirm against installed pyjob.
                log_files.append(os.path.splitext(str(script.path))[0] + ".log")
            else:
                if not os.path.isfile(model_pdb):
                    logger.warning("Cannot find: %s", model_pdb)
                if not os.path.isfile(structure_pdb):
                    logger.warning("Cannot find: %s", structure_pdb)
                continue

        logger.info('Executing TManalysis scripts')
        # BUG FIX: removed a leftover pre-pyjob submission block that called an
        # undefined Job class with an undefined job_scripts variable; the
        # TaskFactory context below is the sole submission mechanism.
        with TaskFactory(
            self._qtype,
            collector,
            name="tmscore",
            nprocesses=self._nproc,
            max_array_size=self._max_array_jobs,
            queue=self._queue,
            shell="/bin/bash",
        ) as task:
            task.run()
            task.wait(interval=1)

        self.entries = []
        for entry, log, script in zip(data_entries, log_files, job_scripts):
            try:
                pt.reset()
                pt.parse(log)
            except Exception:
                logger.critical("Error processing the %s log file: %s", self.method, log)
                log = "None"
            model_name, structure_name, model_pdb, structure_pdb = entry
            _entry = self._store(model_name, structure_name, model_pdb, structure_pdb, log, pt)
            self.entries.append(_entry)
            # Remove the run script once its log has been parsed
            os.unlink(str(script.path))
        return self.entries

    def _get_iterator(self, all_vs_all):
        """Return the pairing function used to combine models and structures.

        Parameters
        ----------
        all_vs_all : bool
            If True, compare every model against every structure

        Returns
        -------
        function
            ``itertools.product`` for all-vs-all, built-in ``zip`` otherwise
        """
        if all_vs_all:
            logger.info("All-vs-all comparison of models and structures")
            return itertools.product
        else:
            logger.info("Direct comparison of models and structures")
            # BUG FIX: itertools.izip does not exist in Python 3; the
            # built-in zip provides the equivalent pairwise iteration.
            return zip

    def _store(self, model_name, structure_name, model_pdb, structure_pdb, logfile, pt):
        """Build a ModelData record from a parsed log and return it as a dict.

        Optional scores are copied only when the parser ``pt`` provides them.
        """
        model = ModelData(
            model_name, structure_name, model_pdb, structure_pdb, logfile, pt.tm, pt.rmsd, pt.nr_residues_common
        )
        if hasattr(pt, 'gdtts'):
            model.gdtts = pt.gdtts
        if hasattr(pt, 'gdtha'):
            model.gdtha = pt.gdtha
        if hasattr(pt, 'maxsub'):
            model.maxsub = pt.maxsub
        if hasattr(pt, 'seq_id'):
            model.seq_id = pt.seq_id
        return model._asdict()

    @staticmethod
    def binary_avail(binary):
        """Check if a TM binary is available on this system.

        Parameters
        ----------
        binary : str
            The binary name of `TMscore` or `TMalign`

        Returns
        -------
        bool
            True if the executable can be found

        Raises
        ------
        ValueError
            The binary is not `TMalign` or `TMscore`
        """
        if binary.lower() == 'tmalign':
            exe_name = "TMalign" + ample_util.EXE_EXT
        elif binary.lower() == 'tmscore':
            exe_name = "TMscore" + ample_util.EXE_EXT
        else:
            raise ValueError('Provide one of TMalign or TMscore')
        try:
            ample_util.find_exe(exe_name)
        # BUG FIX: was a bare except, which also swallowed SystemExit and
        # KeyboardInterrupt
        except Exception:
            return False
        return True
class TMalign(TMapps):
    """Wrapper to handle TMalign scoring for one or more structures.

    Examples
    --------
    >>> models = ["<MODEL_1>", "<MODEL_2>", "<MODEL_3>"]
    >>> references = ["<REFERENCE_1>", "<REFERENCE>", "<REFERENCE>"]
    >>> tm = TMalign("<PATH_TO_EXE>")
    >>> entries = tm.compare_structures(models, references)
    """

    def __init__(self, executable, wdir=None, **kwargs):
        """Initialise the TMalign wrapper (see :class:`TMapps`)."""
        super(TMalign, self).__init__(executable, "TMalign", wdir=wdir, **kwargs)

    def compare_structures(self, models, structures, all_vs_all=False):
        """Compare a list of model structures to a second list of reference structures.

        Parameters
        ----------
        models : list
            List containing the paths to the model structure files
        structures : list
            List containing the paths to the reference structure files
        all_vs_all : bool
            Flag to compare all models against all structures

        Returns
        -------
        entries : list
            List of TM data entries on a per-model basis
        """
        if len(structures) == 1:
            logger.info('Using single structure provided for all model comparisons')
            # BUG FIX: xrange is Python2-only
            structures = [structures[0] for _ in range(len(models))]
        models_to_compare, structures_to_compare = [], []
        combination_iterator = self._get_iterator(all_vs_all)
        for (model, structure) in combination_iterator(models, structures):
            models_to_compare.append(model)
            structures_to_compare.append(structure)
        return self.comparison(models_to_compare, structures_to_compare)
class TMscore(TMapps):
    """Wrapper to handle TMscoring for one or more structures.

    Examples
    --------
    >>> models = ["<MODEL_1>", "<MODEL_2>", "<MODEL_3>"]
    >>> references = ["<REFERENCE_1>", "<REFERENCE>", "<REFERENCE>"]
    >>> tm = TMscore("<PATH_TO_EXE>")
    >>> entries = tm.compare_structures(models, references)
    """

    def __init__(self, executable, wdir=None, **kwargs):
        """Initialise the TMscore wrapper (see :class:`TMapps`)."""
        super(TMscore, self).__init__(executable, "TMscore", wdir=wdir, **kwargs)

    def compare_structures(self, models, structures, fastas=None, all_vs_all=False):
        """Compare a list of model structures to a second list of reference structures.

        Parameters
        ----------
        models : list
            List containing the paths to the model structure files
        structures : list
            List containing the paths to the reference structure files
        fastas : list
            List containing the paths to the FASTA files
        all_vs_all : bool
            Flag to compare all models against all structures

        Returns
        -------
        entries : list
            List of TMscore data entries on a per-model basis

        Raises
        ------
        RuntimeError
            Biopython is unavailable, or the aligned sequences differ in length

        Notes
        -----
        If a FASTA sequence is provided, a much more accurate comparison can be carried out. However, to by-pass this
        there is also an option to run the comparison without it. This might work just fine for larger models.
        """
        if not BIOPYTHON_AVAILABLE:
            raise RuntimeError("Biopython is not available")
        if structures and len(structures) == 1:
            logger.info('Using single structure provided for all model comparisons')
            # BUG FIX: xrange is Python2-only
            structures = [structures[0] for _ in range(len(models))]
        if fastas and len(fastas) == 1:
            logger.info('Using single FASTA provided for all model comparisons')
            fastas = [fastas[0] for _ in range(len(models))]
        models_to_compare, structures_to_compare = [], []
        combination_iterator = self._get_iterator(all_vs_all)
        if fastas:
            for (model, structure, fasta) in combination_iterator(models, structures, fastas):
                # BUG FIX: close the FASTA file handle deterministically
                with open(fasta, 'r') as f_in:
                    fasta_record = list(SeqIO.parse(f_in, 'fasta'))[0]
                fasta_data = [(i + 1, j) for i, j in enumerate(str(fasta_record.seq))]
                model_data = list(self._pdb_info(model))
                structure_data = list(self._pdb_info(structure))
                # Pad the model with a gap at every FASTA position it lacks
                for fasta_pos in fasta_data:
                    if fasta_pos not in model_data:
                        model_data.append((fasta_pos[0], "-"))
                model_data.sort(key=operator.itemgetter(0))
                aln_parser = alignment_parser.AlignmentParser()
                # BUG FIX: zip(*data)[1] is Python2-only — zip returns an
                # iterator in Python3 and cannot be subscripted.
                fasta_structure_aln = aln_parser.align_sequences(
                    "".join(res for _, res in model_data), "".join(res for _, res in structure_data)
                )
                to_remove = []
                # BUG FIX: materialise the zip so that .pop() works in Python3
                _alignment = list(zip(fasta_structure_aln[0], fasta_structure_aln[1]))
                for i, (model_res, structure_res) in enumerate(_alignment):
                    if model_res == "-" and structure_res == "-":
                        to_remove.append(i)
                # Pop from the back so earlier indices remain valid
                for i in reversed(to_remove):
                    _alignment.pop(i)
                model_aln = "".join(pair[0] for pair in _alignment)
                structure_aln = "".join(pair[1] for pair in _alignment)
                if len(model_aln) != len(structure_aln):
                    msg = "Unequal lengths of your model and structure sequences"
                    logger.critical(msg)
                    raise RuntimeError(msg)
                pdb_combo = self._mod_structures(model_aln, structure_aln, model, structure)
                models_to_compare.append(pdb_combo[0])
                structures_to_compare.append(pdb_combo[1])
        else:
            for (model, structure) in combination_iterator(models, structures):
                model_data = list(self._pdb_info(model))
                structure_data = list(self._pdb_info(structure))
                # BUG FIX: same Python3 zip-subscripting fixes as above
                alignment = alignment_parser.AlignmentParser().align_sequences(
                    "".join(res for _, res in model_data), "".join(res for _, res in structure_data)
                )
                alignment = list(zip(alignment[0], alignment[1]))
                model_aln = "".join(pair[0] for pair in alignment)
                structure_aln = "".join(pair[1] for pair in alignment)
                if len(model_aln) != len(structure_aln):
                    msg = "Unequal lengths of your model and structure sequences"
                    logger.critical(msg)
                    raise RuntimeError(msg)
                pdb_combo = self._mod_structures(model_aln, structure_aln, model, structure)
                models_to_compare.append(pdb_combo[0])
                structures_to_compare.append(pdb_combo[1])
        return self.comparison(models_to_compare, structures_to_compare)

    def _mod_structures(self, model_aln, structure_aln, model_pdb, structure_pdb):
        """Write gap-reconciled, renumbered copies of model and structure.

        Parameters
        ----------
        model_aln : str
            A string containing the aligned sequence of the model
        structure_aln : str
            A string containing the aligned sequence of the structure
        model_pdb : str
            The path to the model pdb file
        structure_pdb : str
            The path to the structure pdb file

        Returns
        -------
        model_pdb_ret : str
            The path to the modified model pdb file
        structure_pdb_ret : str
            The path to the modified structure pdb file

        Raises
        ------
        RuntimeError
            Output files already exist, sequences differ by more than 10%,
            or a modified file was not written
        """
        # Random suffix avoids collisions between concurrent comparisons
        random_suffix = ''.join(random.SystemRandom().choice(string.ascii_lowercase + string.digits) for _ in range(10))
        model_name = os.path.basename(model_pdb).rsplit(".", 1)[0]
        model_pdb_ret = os.path.join(self.tmp_dir, "_".join([model_name, random_suffix, "mod.pdb"]))
        structure_name = os.path.basename(structure_pdb).rsplit(".", 1)[0]
        structure_pdb_ret = os.path.join(self.tmp_dir, "_".join([structure_name, random_suffix, "mod.pdb"]))
        if os.path.isfile(model_pdb_ret) or os.path.isfile(structure_pdb_ret):
            msg = "Comparison structures exist. Move, delete or rename before continuing"
            logger.critical(msg)
            raise RuntimeError(msg)
        # Two intermediate stages per file: renumber with gaps, then trim
        _model_pdb_tmp_stage1 = ample_util.tmp_file_name(delete=False, directory=self.tmp_dir, suffix=".pdb")
        _model_pdb_tmp_stage2 = ample_util.tmp_file_name(delete=False, directory=self.tmp_dir, suffix=".pdb")
        _structure_pdb_tmp_stage1 = ample_util.tmp_file_name(delete=False, directory=self.tmp_dir, suffix=".pdb")
        _structure_pdb_tmp_stage2 = ample_util.tmp_file_name(delete=False, directory=self.tmp_dir, suffix=".pdb")
        model_gaps = self._find_gaps(model_aln)
        structure_gaps = self._find_gaps(structure_aln)
        pdb_edit.renumber_residues_gaps(model_pdb, _model_pdb_tmp_stage1, model_gaps)
        pdb_edit.renumber_residues_gaps(structure_pdb, _structure_pdb_tmp_stage1, structure_gaps)
        model_gaps_indices = [i + 1 for i, is_gap in enumerate(model_gaps) if is_gap]
        structure_gaps_indices = [i + 1 for i, is_gap in enumerate(structure_gaps) if is_gap]
        # Remove from each file the residues the *other* sequence lacks
        pdb_edit.select_residues(_model_pdb_tmp_stage1, _model_pdb_tmp_stage2, delete=structure_gaps_indices)
        pdb_edit.select_residues(_structure_pdb_tmp_stage1, _structure_pdb_tmp_stage2, delete=model_gaps_indices)
        pdb_edit.renumber_residues(_model_pdb_tmp_stage2, model_pdb_ret)
        pdb_edit.renumber_residues(_structure_pdb_tmp_stage2, structure_pdb_ret)
        _model_data = list(self._pdb_info(model_pdb_ret))
        _structure_data = list(self._pdb_info(structure_pdb_ret))
        # Alignment not always identical, let's aim for 90%
        identical = set(_model_data).intersection(set(_structure_data))
        if len(identical) / float(len(_model_data)) < 0.90:
            msg = "Differing residues in model and structure. Affected PDBs %s - %s\n%s\n%s"
            raise RuntimeError(msg % (model_name, structure_name, model_aln, structure_aln))
        # Clean up the intermediate files
        for tmp_file in (
            _model_pdb_tmp_stage1,
            _model_pdb_tmp_stage2,
            _structure_pdb_tmp_stage1,
            _structure_pdb_tmp_stage2,
        ):
            os.unlink(tmp_file)
        if not os.path.isfile(model_pdb_ret):
            raise RuntimeError("Modified model %s does not exist!" % model_pdb_ret)
        if not os.path.isfile(structure_pdb_ret):
            raise RuntimeError("Modified reference %s does not exist!" % structure_pdb_ret)
        return model_pdb_ret, structure_pdb_ret

    def _find_gaps(self, seq):
        """Identify gaps in the protein chain.

        Parameters
        ----------
        seq : str
            String of amino acids

        Returns
        -------
        indexes : list
            Per-position booleans, True where the sequence has a gap
        """
        return [char == "-" for char in seq]

    def _pdb_info(self, pdb):
        """Obtain the pdb indices and one-letter residue names.

        Parameters
        ----------
        pdb : str
            The path to a PDB file

        Yields
        ------
        tuple
            ``(residue_sequence_index, one_letter_residue_name)``

        Raises
        ------
        RuntimeError
            Biopython is not available
        TypeError
            A non-water hetero atom is encountered
        """
        if not BIOPYTHON_AVAILABLE:
            raise RuntimeError("Biopython is not available")
        with warnings.catch_warnings():
            logger.debug("Suppressing BIOPYTHON warnings for PDBParser")
            warnings.simplefilter("ignore")
            structure = PDB.PDBParser().get_structure("pdb", pdb)
        # Weird problem with get_...() methods if MODEL is not explicitly stated in
        # PDB file. Thus, just iterate over everything and return when the first
        # chain of the first model is completed.
        for model in structure:
            for i, chain in enumerate(model):
                if i > 0:
                    return
                for residue in chain:
                    hetero, res_seq, _ = residue.get_id()
                    hetero = hetero.strip().lower()
                    if hetero == "w":
                        # Skip waters
                        continue
                    elif hetero:
                        msg = "Hetero atom {} detected in {} in residue {} --- please rename to ATOM or remove!"
                        raise TypeError(msg.format(hetero, pdb, res_seq))
                    resname_three = residue.resname
                    if resname_three == "MSE":
                        logger.warning("Treating MSE as MET!")
                        resname_three = "MET"
                    # NOTE(review): three_to_one is deprecated in recent
                    # Biopython releases — confirm against the pinned version.
                    resname_one = PDB.Polypeptide.three_to_one(resname_three)
                    yield (res_seq, resname_one)

    def _residue_one(self, pdb):
        """Find the first residue index in a pdb structure.

        Parameters
        ----------
        pdb : str
            Path to a structure file in PDB format

        Returns
        -------
        int or None
            Residue sequence index of the first ATOM record, or None if the
            file contains no ATOM records
        """
        # BUG FIX: close the file handle deterministically
        with open(pdb, 'r') as f_in:
            for line in f_in:
                if line.startswith("ATOM"):
                    return int(line.split()[5])
def main():
    """Command-line entry point: score models against reference structures.

    Parses the command line, runs either TMalign or TMscore over the given
    models/structures, writes ``tm_results.csv`` to the run directory and
    logs the result table.

    Returns
    -------
    list
        The per-model comparison entries
    """
    import argparse
    import pandas as pd

    parser = argparse.ArgumentParser()
    parser.add_argument('--allvall', action="store_true", help="All vs all comparison")
    parser.add_argument('--debug', action="store_true")
    parser.add_argument('--purge', action="store_true", help="Remove all temporary files")
    parser.add_argument('--rundir', default='.', help="Run directory")
    parser.add_argument('-f', '--fasta', dest="fastas", nargs="+", default=None)
    parser.add_argument('-m', '--models', dest="models", nargs="+", required=True)
    parser.add_argument('-s', '--structures', dest="structures", nargs="+", required=True)
    parser.add_argument('-t', '--threads', default=1, type=int)
    tmapp = parser.add_mutually_exclusive_group(required=True)
    tmapp.add_argument('-tm', '--tmscore', dest="tmscore", type=str)
    tmapp.add_argument('-ta', '--tmalign', dest="tmalign", type=str)
    args = parser.parse_args()

    loglevel = logging.DEBUG if args.debug else logging.INFO
    logging.basicConfig(level=loglevel, format="%(levelname)s: %(message)s")

    if not os.path.isdir(args.rundir):
        os.mkdir(args.rundir)
    # Resolve all input paths before any chdir-like behaviour downstream
    if args.fastas:
        args.fastas = [os.path.abspath(m) for m in args.fastas]
    args.models = [os.path.abspath(m) for m in args.models]
    args.structures = [os.path.abspath(m) for m in args.structures]

    kwargs = dict(wdir=args.rundir, nproc=args.threads)
    if args.tmalign:
        tmapp = TMalign(args.tmalign, **kwargs)
        tmapp.compare_structures(args.models, args.structures, all_vs_all=args.allvall)
    elif args.tmscore:
        tmapp = TMscore(args.tmscore, **kwargs)
        tmapp.compare_structures(args.models, args.structures, fastas=args.fastas, all_vs_all=args.allvall)
    else:
        # Unreachable: the mutually exclusive group above is required
        raise RuntimeError("You're doomed if you get here!")

    df = pd.DataFrame(tmapp.entries)
    df.to_csv(os.path.join(args.rundir, 'tm_results.csv'), index=False)
    logger.info("Final TMscore table:\n\n" + df.to_string(index=False) + "\n")
    if args.purge and tmapp.tmp_dir:
        from shutil import rmtree

        rmtree(tmapp.tmp_dir)
    return tmapp.entries
# Script entry point: run the comparison and exit cleanly.
if __name__ == "__main__":
    main()
    sys.exit(0)