Source code for ample.util.benchmark_util

"""
Created on 24 Oct 2014

@author: jmht
"""

import copy
import glob
import logging
import os
import pandas as pd
import shutil
import sys

from ample.util import ample_util, csymmatch, mtz_util, pdb_edit, pdb_model, reforigin, residue_map, rio, shelxe, \
    tm_util
from pyjob import Script

logger = logging.getLogger(__name__)

_oldroot = None
_newroot = None
SHELXE_STEM = 'shelxe'

_CSV_KEYLIST = [
    'ample_version',
    # Native info
    'native_pdb_code',
    'native_pdb_title',
    'native_pdb_resolution',
    'native_pdb_solvent_content',
    'native_pdb_space_group',
    'native_pdb_num_atoms',
    'native_pdb_num_residues',
    'native_pdb_num_chains',
    # The modelled sequence
    'fasta_length',
    # Get the ensemble data and add to the MRBUMP data
    'ensemble_name',
    'ensemble_percent_model',
    # cluster info
    'cluster_method',
    'num_clusters',
    'cluster_num',
    'cluster_centroid',
    'cluster_num_models',
    # truncation info
    'truncation_level',
    'percent_truncation',
    'truncation_method',
    'truncation_pruning',
    'truncation_variance',
    'num_residues',
    'pruned_residues',
    # subclustering info
    'subcluster_num_models',
    'subcluster_radius_threshold',
    'subcluster_centroid_model',
    'subcluster_centroid_model_RMSD',
    'subcluster_centroid_model_TM',
    # ensemble info
    # 'name',
    'side_chain_treatment',
    'ensemble_num_atoms',
    # MR result info
    # 'name',
    'MR_program',
    'Solution_Type',
    'PHASER_LLG',
    'PHASER_TFZ',
    'PHASER_RFZ',
    'PHASER_time',
    'PHASER_killed',
    'PHASER_version',
    'PHASER_errors',
    'MOLREP_score',
    'MOLREP_time',
    'MOLREP_version',
    'MR_MPE',
    'MR_wMPE',
    'REFMAC_Rfact',
    'REFMAC_Rfree',
    #     'REFMAC_MPE',
    #     'REFMAC_wMPE',
    'REFMAC_version',
    'BUCC_final_Rfact',
    'BUCC_final_Rfree',
    'BUCC_version',
    'ARP_final_Rfact',
    'ARP_final_Rfree',
    'ARP_version',
    'SHELXE_CC',
    'SHELXE_ACL',
    'SHELXE_MCL',
    'SHELXE_NC',
    'SHELXE_wPE',
    'SHELXE_wMPE',
    'SHELXE_os',
    'SHELXE_time',
    'SHELXE_version',
    'SXRBUCC_version',
    'SXRBUCC_final_Rfact',
    'SXRBUCC_final_Rfree',
    'SXRBUCC_MPE',
    'SXRBUCC_wMPE',
    'SXRARP_version',
    'SXRARP_final_Rfact',
    'SXRARP_final_Rfree',
    'SXRARP_MPE',
    'SXRARP_wMPE',
    'num_placed_chains',
    'num_placed_atoms',
    'reforigin_RMSD',
    'AA_num_contacts',
    'RIO_num_contacts',
    'RIO_in_register',
    'RIO_oo_register',
    'RIO_backwards',
    'RIO',
    'RIO_no_cat',
    'RIO_norm',
]


[docs]def analyse(amoptd, newroot=None):
    if newroot:
        assert os.path.isdir(newroot)
        global _oldroot, _newroot
        _newroot = newroot
        _oldroot = amoptd['work_dir']

    if not os.path.isdir(fixpath(amoptd['benchmark_dir'])):
        os.mkdir(fixpath(amoptd['benchmark_dir']))
    os.chdir(fixpath(amoptd['benchmark_dir']))

    # AnalysePdb may have already been called from the main script
    if amoptd['native_pdb'] and 'native_pdb_std' not in amoptd:
        analysePdb(amoptd)

    if amoptd['native_pdb_std']:
        # Generate an SHELXE HKL and ENT file so that we can calculate phase errors
        mtz_util.to_hkl(amoptd['mtz'], hkl_file=os.path.join(amoptd['benchmark_dir'], SHELXE_STEM + ".hkl"))
        shutil.copyfile(amoptd['native_pdb_std'], os.path.join(amoptd['benchmark_dir'], SHELXE_STEM + ".ent"))

    if amoptd['native_pdb'] and not (amoptd['homologs'] or amoptd['ideal_helices'] or amoptd['helical_ensembles']
                                     or amoptd['import_ensembles'] or amoptd['single_model_mode']):
        analyseModels(amoptd)

    # Get the ensembling data
    if 'ensembles_data' not in amoptd or not len(amoptd['ensembles_data']):
        logger.critical("Benchmark cannot find any ensemble data!")
        return

    # Get dict of ensemble name -> ensemble result
    ensemble_results = {e['name']: e for e in amoptd['ensembles_data']}

    # Get mrbump_results for cluster
    if 'mrbump_results' not in amoptd or not len(amoptd['mrbump_results']):
        logger.critical("Benchmark cannot find any mrbump results!")
        return

    data = []
    mrinfo = shelxe.MRinfo(amoptd['shelxe_exe'], amoptd['native_pdb_info'].pdb, amoptd['mtz'])
    for result in amoptd['mrbump_results']:

        # use mrbump dict as basis for result object
        d = copy.copy(result)

        # Add in the data from the ensemble
        d.update(ensemble_results[d['ensemble_name']])
        assert d['ensemble_name'] == d['name'], d

        # Hack for old results
        if 'truncation_num_residues' in d:
            d['num_residues'] = d['truncation_num_residues']
            del d['truncation_num_residues']

        # Hack for ideal helices where num_residues are missing
        if amoptd['ideal_helices'] and ('num_residues' not in d or d['num_residues'] is None):
            d['num_residues'] = int(d['ensemble_name'].lstrip('polyala_'))

        # Get the ensemble data and add to the MRBUMP data
        d['ensemble_percent_model'] = int((float(d['num_residues']) / float(amoptd['fasta_length'])) * 100)

        if amoptd['native_pdb']:
            # Add in stuff we've cleaned from the pdb
            native_keys = [
                'native_pdb_code',
                'native_pdb_title',
                'native_pdb_resolution',
                'native_pdb_solvent_content',
                'native_pdb_space_group',
                'native_pdb_num_chains',
                'native_pdb_num_atoms',
                'native_pdb_num_residues',
            ]
            d.update({key: amoptd[key] for key in native_keys})
            # Analyse the solution
            analyseSolution(amoptd, d, mrinfo)
        data.append(d)

    # Put everything in a pandas DataFrame
    dframe = pd.DataFrame(data)

    # General stuff
    dframe['ample_version'] = amoptd['ample_version']
    dframe['fasta_length'] = amoptd['fasta_length']

    # Analyse subcluster centroid models
    if 'subcluster_centroid_model' in dframe.columns and amoptd['native_pdb']:
        centroid_index = dframe.index
        centroid_models = [fixpath(f) for f in dframe.subcluster_centroid_model]
        native_pdb_std = fixpath(amoptd['native_pdb_std'])
        fasta = fixpath(amoptd['fasta'])

        # Calculation of TMscores for subcluster centroid models
        if amoptd['have_tmscore']:
            tm = tm_util.TMscore(amoptd['tmscore_exe'], wdir=fixpath(amoptd['benchmark_dir']), **amoptd)
            tm_results = tm.compare_structures(centroid_models, [native_pdb_std], [fasta])
            centroid_tmscores = [r['tmscore'] for r in tm_results]
            centroid_rmsds = [r['rmsd'] for r in tm_results]
        else:
            raise RuntimeError("No program to calculate tmscores!")

        dframe['subcluster_centroid_model_TM'] = pd.Series(centroid_tmscores, index=centroid_index)
        dframe['subcluster_centroid_model_RMSD'] = pd.Series(centroid_rmsds, index=centroid_index)

    # Save the data
    file_name = os.path.join(fixpath(amoptd['benchmark_dir']), 'results.csv')
    dframe.to_csv(file_name, columns=_CSV_KEYLIST, index=False, na_rep="N/A")
    amoptd['benchmark_results'] = dframe.to_dict('records')

    return


[docs]def analyseModels(amoptd):
    # Get hold of a full model so we can do the mapping of residues
    refModelPdb = glob.glob(os.path.join(amoptd['models_dir'], "*.pdb"))[0]

    nativePdbInfo = amoptd['native_pdb_info']
    refModelPdbInfo = pdb_edit.get_info(refModelPdb)
    amoptd['ref_model_pdb_info'] = refModelPdbInfo
    try:
        resSeqMap = residue_map.residueSequenceMap()
        resSeqMap.fromInfo(
            refInfo=refModelPdbInfo,
            refChainID=refModelPdbInfo.models[0].chains[0],  # Only 1 chain in model
            targetInfo=nativePdbInfo,
            targetChainID=nativePdbInfo.models[0].chains[0],
        )
        amoptd['res_seq_map'] = resSeqMap
    except Exception as e:
        logger.exception("Error calculating resSeqMap: %s" % e)
        amoptd['res_seq_map'] = None  # Won't be able to calculate RIO scores

    if amoptd['have_tmscore']:
        try:
            tm = tm_util.TMscore(amoptd['tmscore_exe'], wdir=fixpath(amoptd['benchmark_dir']))
            # Calculation of TMscores for all models
            logger.info("Analysing Rosetta models with TMscore")
            model_list = sorted(glob.glob(os.path.join(amoptd['models_dir'], "*pdb")))
            structure_list = [amoptd['native_pdb_std']]
            amoptd['tmComp'] = tm.compare_structures(model_list, structure_list, fastas=[amoptd['fasta']])
        except Exception as e:
            logger.exception("Unable to run TMscores: %s", e)
    else:
        raise RuntimeError("No program to calculate TMSCORES")


[docs]def analysePdb(amoptd):
    """Collect data on the native pdb structure"""

    nativePdb = fixpath(amoptd['native_pdb'])
    nativePdbInfo = pdb_edit.get_info(nativePdb)

    # number atoms/residues
    natoms, nresidues = pdb_edit.num_atoms_and_residues(nativePdb)

    # Get information on the origins for this spaceGroup
    try:
        originInfo = pdb_model.OriginInfo(spaceGroupLabel=nativePdbInfo.crystalInfo.spaceGroup)
    except Exception:
        originInfo = None

    # Do this here as a bug in pdbcur can knacker the CRYST1 data
    amoptd['native_pdb_code'] = nativePdbInfo.pdbCode
    amoptd['native_pdb_title'] = nativePdbInfo.title
    amoptd['native_pdb_resolution'] = nativePdbInfo.resolution
    amoptd['native_pdb_solvent_content'] = nativePdbInfo.solventContent
    amoptd['native_pdb_matthews_coefficient'] = nativePdbInfo.matthewsCoefficient
    if not originInfo:
        space_group = "P1"
    else:
        space_group = originInfo.spaceGroup()
    amoptd['native_pdb_space_group'] = space_group
    amoptd['native_pdb_num_atoms'] = natoms
    amoptd['native_pdb_num_residues'] = nresidues

    # First check if the native has > 1 model and extract the first if so
    if len(nativePdbInfo.models) > 1:
        logger.info("nativePdb has > 1 model - using first")
        nativePdb1 = ample_util.filename_append(
            filename=nativePdb, astr="model1", directory=fixpath(amoptd['work_dir'])
        )
        pdb_edit.extract_model(nativePdb, nativePdb1, modelID=nativePdbInfo.models[0].serial)
        nativePdb = nativePdb1

    # Standardise the PDB to rename any non-standard AA, remove solvent etc
    nativePdbStd = ample_util.filename_append(filename=nativePdb, astr="std", directory=fixpath(amoptd['work_dir']))
    pdb_edit.standardise(nativePdb, nativePdbStd, del_hetatm=True)
    nativePdb = nativePdbStd

    # Get the new Info about the native
    nativePdbInfo = pdb_edit.get_info(nativePdb)

    # For comparsion of shelxe model we need a single chain from the native so we get this here
    if len(nativePdbInfo.models[0].chains) > 1:
        nativeChain1 = ample_util.filename_append(
            filename=nativePdbInfo.pdb, astr="chain1", directory=fixpath(amoptd['work_dir'])
        )
        pdb_edit.merge_chains(nativePdbInfo.pdb, nativeChain1)
    else:
        nativeChain1 = nativePdbInfo.pdb

    # Additional data
    amoptd['native_pdb_num_chains'] = len(nativePdbInfo.models[0].chains)
    amoptd['native_pdb_info'] = nativePdbInfo
    amoptd['native_pdb_std'] = nativePdbStd
    amoptd['native_pdb_1chain'] = nativeChain1
    amoptd['native_pdb_origin_info'] = originInfo

    return


[docs]def analyseSolution(amoptd, d, mrinfo):
    logger.info("Benchmark: analysing result: {0}".format(d['ensemble_name']))

    mrPdb = None
    if d['MR_program'] == "PHASER":
        mrPdb = d['PHASER_pdbout']
        mrMTZ = d['PHASER_mtzout']
    elif d['MR_program'] == "MOLREP":
        mrPdb = d['MOLREP_pdbout']
    elif d['MR_program'] == "unknown":
        return

    if mrPdb is None or not os.path.isfile(mrPdb):
        # logger.critical("Cannot find mrPdb {0} for solution {1}".format(mrPdb,d))
        return

    # debug - copy into work directory as reforigin struggles with long pathnames
    shutil.copy(mrPdb, os.path.join(fixpath(amoptd['benchmark_dir']), os.path.basename(mrPdb)))

    mrPdbInfo = pdb_edit.get_info(mrPdb)

    d['num_placed_chains'] = mrPdbInfo.numChains()
    d['num_placed_atoms'] = mrPdbInfo.numAtoms()
    d['num_placed_CA'] = mrPdbInfo.numCalpha()

    if amoptd['native_pdb']:
        if not d['SHELXE_os']:
            logger.critical("mrPdb {0} has no SHELXE_os origin shift. Calculating...".format(mrPdb))
            mrinfo.analyse(mrPdb)
            mrOrigin = mrinfo.originShift
            d['SHELXE_MPE'] = mrinfo.MPE
            d['SHELXE_wMPE'] = mrinfo.wMPE
        else:
            mrOrigin = [c * -1 for c in d['SHELXE_os']]

        # Move pdb onto new origin
        originPdb = ample_util.filename_append(mrPdb, astr='offset', directory=fixpath(amoptd['benchmark_dir']))
        pdb_edit.translate(mrPdb, originPdb, mrOrigin)

        # offset.pdb is the mrModel shifted onto the new origin use csymmatch to wrap onto native
        csymmatch.Csymmatch().wrapModelToNative(
            originPdb,
            amoptd['native_pdb'],
            csymmatchPdb=os.path.join(
                fixpath(amoptd['benchmark_dir']), "phaser_{0}_csymmatch.pdb".format(d['ensemble_name'])
            ),
        )
        # can now delete origin pdb
        os.unlink(originPdb)

        # Calculate phase error for the MR PDB
        try:
            mrinfo.analyse(mrPdb)
            d['MR_MPE'] = mrinfo.MPE
            d['MR_wMPE'] = mrinfo.wMPE
        except Exception as e:
            logger.critical("Error analysing mrPdb: {0}\n{1}".format(mrPdb, e))

        # We cannot calculate the Reforigin RMSDs or RIO scores for runs where we don't have a full initial model
        # to compare to the native to allow us to determine which parts of the ensemble correspond to which parts of
        # the native structure - or if we were unable to calculate a res_seq_map
        if not (
                amoptd['homologs']
                or amoptd['ideal_helices']
                or amoptd['helical_ensembles']
                or amoptd['import_ensembles']
                or amoptd['single_model_mode']
                or amoptd['res_seq_map']
        ):

            # Get reforigin info
            rmsder = reforigin.ReforiginRmsd()
            try:
                rmsder.getRmsd(
                    nativePdbInfo=amoptd['native_pdb_info'],
                    placedPdbInfo=mrPdbInfo,
                    refModelPdbInfo=amoptd['ref_model_pdb_info'],
                    cAlphaOnly=True,
                    workdir=fixpath(amoptd['benchmark_dir']),
                )
                d['reforigin_RMSD'] = rmsder.rmsd
            except Exception as e:
                logger.critical("Error calculating RMSD: {0}".format(e))
                d['reforigin_RMSD'] = 999

            # Score the origin with all-atom and rio
            rioData = rio.Rio().scoreOrigin(
                mrOrigin,
                mrPdbInfo=mrPdbInfo,
                nativePdbInfo=amoptd['native_pdb_info'],
                resSeqMap=amoptd['res_seq_map'],
                workdir=fixpath(amoptd['benchmark_dir']),
            )

            # Set attributes
            d['AA_num_contacts'] = rioData.aaNumContacts
            d['RIO_num_contacts'] = rioData.rioNumContacts
            d['RIO_in_register'] = rioData.rioInRegister
            d['RIO_oo_register'] = rioData.rioOoRegister
            d['RIO_backwards'] = rioData.rioBackwards
            d['RIO'] = rioData.rioInRegister + rioData.rioOoRegister
            d['RIO_no_cat'] = rioData.rioNumContacts - (rioData.rioInRegister + rioData.rioOoRegister)
            d['RIO_norm'] = float(d['RIO']) / float(d['native_pdb_num_residues'])
        else:
            d['AA_num_contacts'] = None
            d['RIO_num_contacts'] = None
            d['RIO_in_register'] = None
            d['RIO_oo_register'] = None
            d['RIO_backwards'] = None
            d['RIO'] = None
            d['RIO_no_cat'] = None
            d['RIO_norm'] = None

        #     # Now get the helix
        #     helixSequence = contacts.Rio().helixFromContacts( contacts=rioData.contacts,
        #                                                            dsspLog=dsspLog )
        #     if helixSequence is not None:
        #         ampleResult.rioHelixSequence = helixSequence
        #         ampleResult.rioLenHelix      = len( helixSequence )
        #         hfile = os.path.join( workdir, "{0}.helix".format( ampleResult.ensembleName ) )
        #         with open( hfile, 'w' ) as f:
        #             f.write( helixSequence+"\n" )

        #
        # This purely for checking and so we have pdbs to view
        #
        # Wrap shelxe trace onto native using Csymmatch
        if not d['SHELXE_pdbout'] is None and os.path.isfile(fixpath(d['SHELXE_pdbout'])):
            csymmatch.Csymmatch().wrapModelToNative(
                fixpath(d['SHELXE_pdbout']),
                amoptd['native_pdb'],
                origin=mrOrigin,
                workdir=fixpath(amoptd['benchmark_dir']),
            )

        if not ('SHELXE_wMPE' in d and d['SHELXE_wMPE']):
            try:
                mrinfo.analyse(d['SHELXE_pdbout'])
                d['SHELXE_MPE'] = mrinfo.MPE
                d['SHELXE_wMPE'] = mrinfo.wMPE
            except Exception as e:
                logger.critical("Error analysing SHELXE_pdbout: {0}\n{1}".format(d['SHELXE_pdbout'], e))

        # Wrap parse_buccaneer model onto native
        if d['SXRBUCC_pdbout'] and os.path.isfile(fixpath(d['SXRBUCC_pdbout'])):
            # Need to rename Pdb as is just called buccSX_output.pdb
            csymmatchPdb = os.path.join(
                fixpath(amoptd['benchmark_dir']), "buccaneer_{0}_csymmatch.pdb".format(d['ensemble_name'])
            )

            csymmatch.Csymmatch().wrapModelToNative(
                fixpath(d['SXRBUCC_pdbout']),
                amoptd['native_pdb'],
                origin=mrOrigin,
                csymmatchPdb=csymmatchPdb,
                workdir=fixpath(amoptd['benchmark_dir']),
            )
            # Calculate phase error
            try:
                mrinfo.analyse(d['SXRBUCC_pdbout'])
                d['SXRBUCC_MPE'] = mrinfo.MPE
                d['SXRBUCC_wMPE'] = mrinfo.wMPE
            except Exception as e:
                logger.critical("Error analysing SXRBUCC_pdbout: {0}\n{1}".format(d['SXRBUCC_pdbout'], e))

        # Wrap parse_buccaneer model onto native
        if d['SXRARP_pdbout'] and os.path.isfile(fixpath(d['SXRARP_pdbout'])):
            # Need to rename Pdb as is just called buccSX_output.pdb
            csymmatchPdb = os.path.join(
                fixpath(amoptd['benchmark_dir']), "arpwarp_{0}_csymmatch.pdb".format(d['ensemble_name'])
            )

            csymmatch.Csymmatch().wrapModelToNative(
                fixpath(d['SXRARP_pdbout']),
                amoptd['native_pdb'],
                origin=mrOrigin,
                csymmatchPdb=csymmatchPdb,
                workdir=fixpath(amoptd['benchmark_dir']),
            )
            # Calculate phase error
            try:
                mrinfo.analyse(d['SXRARP_pdbout'])
                d['SXRARP_MPE'] = mrinfo.MPE
                d['SXRARP_wMPE'] = mrinfo.wMPE
            except Exception as e:
                logger.critical("Error analysing SXRARP_pdbout: {0}\n{1}".format(d['SXRARP_pdbout'], e))
    return


[docs]def cluster_script(amoptd, python_path="ccp4-python"):
    """Create the script for benchmarking on a cluster"""
    # write out script
    work_dir = amoptd['work_dir']
    script = Script(directory=work_dir, stem="submit_benchmark")
    pydir = os.path.abspath(os.path.dirname(__file__))
    benchmark_script = os.path.join(pydir, "benchmark_util.py")
    script.append("{0} {1} {2} {3}".format(python_path, "-u", benchmark_script, amoptd['results_path']))
    script.write()

    return script



[docs]def fixpath(path):
    # fix for analysing on a different machine
    if _oldroot and _newroot:
        return path.replace(_oldroot, _newroot)
    else:
        return path


# Run unit tests
if __name__ == "__main__":
    # Set up logging - could append to an existing log?
    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)

    # This runs the benchmarking starting from a pickled file containing an amopt dictionary.
    # - used when submitting the modelling jobs to a cluster
    if len(sys.argv) != 2 or not os.path.isfile(sys.argv[1]):
        logging.debug("benchmark script requires the path to a pickled amopt dictionary!")
        sys.exit(1)

    # Get the amopt dictionary
    amoptd = ample_util.read_amoptd(sys.argv[1])

    fl = logging.FileHandler(os.path.join(amoptd['work_dir'], "benchmark.log"))
    fl.setLevel(logging.DEBUG)
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    fl.setFormatter(formatter)
    logger.addHandler(fl)

    analyse(amoptd)
    ample_util.save_amoptd(amoptd)