Source code for ample.util.benchmark_util

'''
Created on 24 Oct 2014

@author: jmht
'''

# Python imports
import copy
import glob
import logging
import os
import pandas as pd
import shutil
import sys

# Our imports
from ample.util import ample_util
from ample.util import csymmatch
from ample.util import maxcluster
from ample.util import mtz_util
from ample.util import pdb_edit
from ample.util import pdb_model
from ample.util import reforigin
from ample.util import residue_map
from ample.util import rio
from ample.util import shelxe
from ample.util import tm_util

logger = logging.getLogger(__name__)

_oldroot = None
_newroot = None
_MAXCLUSTERER = None
SHELXE_STEM = 'shelxe'

_CSV_KEYLIST = [
    'ample_version',

    # Native info
    'native_pdb_code',
    'native_pdb_title',
    'native_pdb_resolution',
    'native_pdb_solvent_content',
    'native_pdb_space_group',
    'native_pdb_num_atoms',
    'native_pdb_num_residues',
    'native_pdb_num_chains',

    # The modelled sequence
    'fasta_length',

    # Get the ensemble data and add to the MRBUMP data
    'ensemble_name',
    'ensemble_percent_model',

    # cluster info
    'cluster_method',
    'num_clusters',
    'cluster_num',
    'cluster_centroid',
    'cluster_num_models',

    # truncation info
    'truncation_level',
    'percent_truncation',
    'truncation_method',
    'truncation_pruning',
    'truncation_variance',
    'num_residues',
    'pruned_residues',

    # subclustering info
    'subcluster_num_models',
    'subcluster_radius_threshold',
    'subcluster_centroid_model',
    'subcluster_centroid_model_RMSD',
    'subcluster_centroid_model_TM',

    # ensemble info
    # 'name',
    'side_chain_treatment',
    'ensemble_num_atoms',

    # MR result info
    # 'name',
    'MR_program',
    'Solution_Type',

    'PHASER_LLG',
    'PHASER_TFZ',
    'PHASER_RFZ',
    'PHASER_time',
    'PHASER_killed',
    'PHASER_version',
    'PHASER_errors',

    'MOLREP_score',
    'MOLREP_time',
    'MOLREP_version',

    'MR_MPE',
    'MR_wMPE',

    'REFMAC_Rfact',
    'REFMAC_Rfree',
#     'REFMAC_MPE',
#     'REFMAC_wMPE',
    'REFMAC_version',

    'BUCC_final_Rfact',
    'BUCC_final_Rfree',
    'BUCC_version',

    'ARP_final_Rfact',
    'ARP_final_Rfree',
    'ARP_version',

    'SHELXE_CC',
    'SHELXE_ACL',
    'SHELXE_MCL',
    'SHELXE_NC',
    'SHELXE_wPE',
    'SHELXE_wMPE',
    'SHELXE_os',
    'SHELXE_time',
    'SHELXE_version',

    'SXRBUCC_version',
    'SXRBUCC_final_Rfact',
    'SXRBUCC_final_Rfree',
    'SXRBUCC_MPE',
    'SXRBUCC_wMPE',

    'SXRARP_version',
    'SXRARP_final_Rfact',
    'SXRARP_final_Rfree',
    'SXRARP_MPE',
    'SXRARP_wMPE',

    'num_placed_chains',
    'num_placed_atoms',
    'reforigin_RMSD',

    'AA_num_contacts',
    'RIO_num_contacts',
    'RIO_in_register',
    'RIO_oo_register',
    'RIO_backwards',
    'RIO',
    'RIO_no_cat',
    'RIO_norm'
]


[docs]def analyse(amoptd, newroot=None):
    if newroot:
        assert os.path.isdir(newroot)
        global _oldroot,_newroot
        _newroot = newroot
        _oldroot = amoptd['work_dir']

    if not os.path.isdir(fixpath(amoptd['benchmark_dir'])):
        os.mkdir(fixpath(amoptd['benchmark_dir']))
    os.chdir(fixpath(amoptd['benchmark_dir']))
    
    # AnalysePdb may have already been called from the main script
    if amoptd['native_pdb'] and 'native_pdb_std' not in amoptd:
        analysePdb(amoptd)

    if amoptd['native_pdb_std']:
        # Generate an SHELXE HKL and ENT file so that we can calculate phase errors
        mtz_util.to_hkl(amoptd['mtz'], hkl_file=os.path.join(amoptd['benchmark_dir'], SHELXE_STEM + ".hkl"))
        shutil.copyfile(amoptd['native_pdb_std'], os.path.join(amoptd['benchmark_dir'], SHELXE_STEM + ".ent"))
        
    if amoptd['native_pdb'] and \
       not (amoptd['homologs'] or amoptd['ideal_helices']
            or amoptd['import_ensembles'] or amoptd['single_model_mode']):
        analyseModels(amoptd)
    
    # Get the ensembling data
    if 'ensembles_data' not in amoptd or not len(amoptd['ensembles_data']):
        logger.critical("Benchmark cannot find any ensemble data!")
        return

    # Get dict of ensemble name -> ensemble result
    ensemble_results = {
        e['name']: e for e in amoptd['ensembles_data']
    }
                    
    # Get mrbump_results for cluster
    if 'mrbump_results' not in amoptd or not len(amoptd['mrbump_results']):
        logger.critical("Benchmark cannot find any mrbump results!")
        return
    
    data = []
    mrinfo = shelxe.MRinfo(amoptd['shelxe_exe'], amoptd['native_pdb_info'].pdb, amoptd['mtz'])
    for result in amoptd['mrbump_results']:
        
        # use mrbump dict as basis for result object
        d = copy.copy(result)
        
        # Add in the data from the ensemble
        d.update(ensemble_results[d['ensemble_name']])
        assert d['ensemble_name'] == d['name'], d
        
        # Hack for old results
        if 'truncation_num_residues' in d:
            d['num_residues'] = d['truncation_num_residues']
            del d['truncation_num_residues']
            
        # Hack for ideal helices where num_residues are missing
        if amoptd['ideal_helices'] and ('num_residues' not in d or d['num_residues'] is None):
            d['num_residues'] = int(d['ensemble_name'].lstrip('polyala'))

        # Get the ensemble data and add to the MRBUMP data
        d['ensemble_percent_model'] = int((float(d['num_residues']) / float(amoptd['fasta_length'])) * 100)

        if amoptd['native_pdb']:
            # Add in stuff we've cleaned from the pdb
            native_keys = [
                'native_pdb_code', 'native_pdb_title', 'native_pdb_resolution', 'native_pdb_solvent_content',
                'native_pdb_space_group', 'native_pdb_num_chains', 'native_pdb_num_atoms', 'native_pdb_num_residues'
            ]
            d.update({key: amoptd[key] for key in native_keys})
            # Analyse the solution
            analyseSolution(amoptd, d, mrinfo)
        data.append(d)

    # Put everything in a pandas DataFrame
    dframe = pd.DataFrame(data)

    # General stuff
    dframe['ample_version'] = amoptd['ample_version']
    dframe['fasta_length'] = amoptd['fasta_length']

    # Analyse subcluster centroid models
    if 'subcluster_centroid_model' in dframe.columns and amoptd['native_pdb']:
        centroid_index = dframe.index
        centroid_models = [fixpath(f) for f in dframe.subcluster_centroid_model]
        native_pdb_std = fixpath(amoptd['native_pdb_std'])
        fasta = fixpath(amoptd['fasta'])

        # Calculation of TMscores for subcluster centroid models
        if amoptd['have_tmscore']:
            tm = tm_util.TMscore(amoptd['tmscore_exe'], wdir=fixpath(amoptd['benchmark_dir']), **amoptd)
            tm_results = tm.compare_structures(centroid_models, [native_pdb_std], [fasta])
            centroid_tmscores = [r['tmscore'] for r in tm_results]
            centroid_rmsds = [r['rmsd'] for r in tm_results]

        else:
            # Use maxcluster
            centroid_tmscores = []
            for centroid_model in centroid_models:
                n = os.path.splitext(os.path.basename(centroid_model))[0]
                cm = None
                for pdb in amoptd['models']:
                    if n.startswith(os.path.splitext(os.path.basename(pdb))[0]):
                        cm = pdb
                        break
                if cm:
                    centroid_tmscores.append(_MAXCLUSTERER.tm(cm))
                else:
                    msg = "Cannot find model for subcluster_centroid_model {0}".format(
                        dframe[0, 'subcluster_centroid_model']
                    )
                    raise RuntimeError(msg)

            # There is an issue here as this is the RMSD over the TM-aligned residues
            # NOT the global RMSD, which it is for the tm_util results
            centroid_rmsds = [None for _ in centroid_index]

        dframe['subcluster_centroid_model_TM'] = pd.Series(centroid_tmscores, index=centroid_index)
        dframe['subcluster_centroid_model_RMSD'] = pd.Series(centroid_rmsds, index=centroid_index)

    # Save the data
    file_name = os.path.join(fixpath(amoptd['benchmark_dir']), 'results.csv')
    dframe.to_csv(file_name, columns=_CSV_KEYLIST, index=False, na_rep="N/A")
    amoptd['benchmark_results'] = dframe.to_dict('records')

    return


[docs]def analyseModels(amoptd):
    
    # Get hold of a full model so we can do the mapping of residues
    refModelPdb = glob.glob(os.path.join(amoptd['models_dir'], "*.pdb"))[0]
    
    nativePdbInfo=amoptd['native_pdb_info']
    
    resSeqMap = residue_map.residueSequenceMap()
    refModelPdbInfo = pdb_edit.get_info(refModelPdb)
    resSeqMap.fromInfo( refInfo=refModelPdbInfo,
                        refChainID=refModelPdbInfo.models[0].chains[0], # Only 1 chain in model
                        targetInfo=nativePdbInfo,
                        targetChainID=nativePdbInfo.models[0].chains[0]
                      )
    amoptd['res_seq_map']=resSeqMap
    amoptd['ref_model_pdb_info']=refModelPdbInfo

    if amoptd['have_tmscore']:
        try:
            tm = tm_util.TMscore(amoptd['tmscore_exe'], wdir=fixpath(amoptd['benchmark_dir']))
            # Calculation of TMscores for all models
            logger.info("Analysing Rosetta models with TMscore")
            model_list = sorted(glob.glob(os.path.join(amoptd['models_dir'], "*pdb")))
            structure_list = [amoptd['native_pdb_std']]
            amoptd['tmComp'] = tm.compare_structures(model_list, structure_list, fastas=[amoptd['fasta']])
        except Exception as e:
            msg = "Unable to run TMscores: {0}".format(e)
            logger.critical(msg)
    else:
        global _MAXCLUSTERER # setting a module-level variable so need to use global keyword to it doesn't become a local variable
        _MAXCLUSTERER = maxcluster.Maxcluster(amoptd['maxcluster_exe'])
        logger.info("Analysing Rosetta models with Maxcluster")
        _MAXCLUSTERER.compareDirectory(nativePdbInfo=nativePdbInfo,
                                       resSeqMap=resSeqMap,
                                        modelsDirectory=amoptd['models_dir'],
                                        workdir=fixpath(amoptd['benchmark_dir']))
    return


[docs]def analysePdb(amoptd):
    """Collect data on the native pdb structure"""
    
    nativePdb = fixpath(amoptd['native_pdb'])
    nativePdbInfo = pdb_edit.get_info(nativePdb)
    
    # number atoms/residues
    natoms, nresidues = pdb_edit.num_atoms_and_residues(nativePdb)

    # Get information on the origins for this spaceGroup
    try:
        originInfo = pdb_model.OriginInfo(spaceGroupLabel=nativePdbInfo.crystalInfo.spaceGroup)
    except:
        originInfo = None

    # Do this here as a bug in pdbcur can knacker the CRYST1 data
    amoptd['native_pdb_code'] = nativePdbInfo.pdbCode
    amoptd['native_pdb_title'] = nativePdbInfo.title
    amoptd['native_pdb_resolution'] = nativePdbInfo.resolution
    amoptd['native_pdb_solvent_content'] = nativePdbInfo.solventContent
    amoptd['native_pdb_matthews_coefficient'] = nativePdbInfo.matthewsCoefficient
    if not originInfo:
        space_group = "P1"
    else:
        space_group = originInfo.spaceGroup()
    amoptd['native_pdb_space_group'] = space_group
    amoptd['native_pdb_num_atoms'] = natoms
    amoptd['native_pdb_num_residues'] = nresidues
    
    # First check if the native has > 1 model and extract the first if so
    if len( nativePdbInfo.models ) > 1:
        logger.info("nativePdb has > 1 model - using first")
        nativePdb1 = ample_util.filename_append( filename=nativePdb, astr="model1", directory=fixpath(amoptd['work_dir']))
        pdb_edit.extract_model( nativePdb, nativePdb1, modelID=nativePdbInfo.models[0].serial )
        nativePdb = nativePdb1
        
    # Standardise the PDB to rename any non-standard AA, remove solvent etc
    nativePdbStd = ample_util.filename_append( filename=nativePdb, astr="std", directory=fixpath(amoptd['work_dir']))
    pdb_edit.standardise(nativePdb, nativePdbStd, del_hetatm=True)
    nativePdb = nativePdbStd
    
    # Get the new Info about the native
    nativePdbInfo = pdb_edit.get_info( nativePdb )
    
    # For maxcluster comparsion of shelxe model we need a single chain from the native so we get this here
    if len( nativePdbInfo.models[0].chains ) > 1:
        chainID = nativePdbInfo.models[0].chains[0]
        nativeChain1  = ample_util.filename_append( filename=nativePdbInfo.pdb,
                                                       astr="chain1", 
                                                       directory=fixpath(amoptd['work_dir']))
        pdb_edit.to_single_chain( nativePdbInfo.pdb, nativeChain1 )
    else:
        nativeChain1 = nativePdbInfo.pdb
    
    # Additional data
    amoptd['native_pdb_num_chains'] = len( nativePdbInfo.models[0].chains )
    amoptd['native_pdb_info'] = nativePdbInfo
    amoptd['native_pdb_std'] = nativePdbStd
    amoptd['native_pdb_1chain'] = nativeChain1
    amoptd['native_pdb_origin_info'] = originInfo
    
    return


[docs]def analyseSolution(amoptd, d, mrinfo):

    logger.info("Benchmark: analysing result: {0}".format(d['ensemble_name']))

    mrPdb=None
    if d['MR_program']=="PHASER":
        mrPdb = d['PHASER_pdbout']
        mrMTZ = d['PHASER_mtzout']
    elif d['MR_program']=="MOLREP":
        mrPdb = d['MOLREP_pdbout']
    elif d['MR_program']=="unknown":
        return

    if mrPdb is None or not os.path.isfile(mrPdb):
        #logger.critical("Cannot find mrPdb {0} for solution {1}".format(mrPdb,d))
        return

    # debug - copy into work directory as reforigin struggles with long pathnames
    shutil.copy(mrPdb, os.path.join(fixpath(amoptd['benchmark_dir']), os.path.basename(mrPdb)))
    
    mrPdbInfo = pdb_edit.get_info( mrPdb )
    
    d['num_placed_chains'] = mrPdbInfo.numChains()
    d['num_placed_atoms'] = mrPdbInfo.numAtoms()
    d['num_placed_CA'] = mrPdbInfo.numCalpha()
    
    if amoptd['native_pdb']:
        if not d['SHELXE_os']:
            logger.critical("mrPdb {0} has no SHELXE_os origin shift. Calculating...".format(mrPdb))
            mrinfo.analyse(mrPdb)
            mrOrigin = mrinfo.originShift
            d['SHELXE_MPE'] = mrinfo.MPE
            d['SHELXE_wMPE'] = mrinfo.wMPE
        else:
            mrOrigin=[c*-1 for c in d['SHELXE_os']]
        
        # Move pdb onto new origin
        originPdb = ample_util.filename_append(mrPdb, astr='offset',directory=fixpath(amoptd['benchmark_dir']))
        #print(mrPdb, originPdb, mrOrigin)
        pdb_edit.translate(mrPdb, originPdb, mrOrigin)
        
        # offset.pdb is the mrModel shifted onto the new origin use csymmatch to wrap onto native
        csymmatch.Csymmatch().wrapModelToNative(originPdb,
                                                amoptd['native_pdb'],
                                                csymmatchPdb=os.path.join(fixpath(amoptd['benchmark_dir']),
                                                "phaser_{0}_csymmatch.pdb".format(d['ensemble_name'])))
        # can now delete origin pdb
        os.unlink(originPdb)
        
        # Calculate phase error for the MR PDB
        try:
            mrinfo.analyse(mrPdb)
            d['MR_MPE'] = mrinfo.MPE
            d['MR_wMPE'] = mrinfo.wMPE
        except Exception as e:
            logger.critical("Error analysing mrPdb: {0}\n{1}".format(mrPdb,e))         
    
        # We cannot calculate the Reforigin RMSDs or RIO scores for runs where we don't have a full initial model
        # to compare to the native to allow us to determine which parts of the ensemble correspond to which parts of 
        # the native structure.
        if not (amoptd['homologs'] or \
                amoptd['ideal_helices'] or \
                amoptd['import_ensembles'] or \
                amoptd['single_model_mode']):
    
            # Get reforigin info
            rmsder = reforigin.ReforiginRmsd()
            try:
                rmsder.getRmsd(nativePdbInfo=amoptd['native_pdb_info'],
                               placedPdbInfo=mrPdbInfo,
                               refModelPdbInfo=amoptd['ref_model_pdb_info'],
                               cAlphaOnly=True,
                               workdir=fixpath(amoptd['benchmark_dir']))
                d['reforigin_RMSD']=rmsder.rmsd
            except Exception,e:
                logger.critical("Error calculating RMSD: {0}".format(e))
                d['reforigin_RMSD']=999
    
    
            # Score the origin with all-atom and rio
            rioData=rio.Rio().scoreOrigin(mrOrigin,
                                          mrPdbInfo=mrPdbInfo,
                                          nativePdbInfo=amoptd['native_pdb_info'],
                                          resSeqMap=amoptd['res_seq_map'],
                                          workdir=fixpath(amoptd['benchmark_dir'])
                                          )
        
            # Set attributes
            d['AA_num_contacts']  = rioData.aaNumContacts
            d['RIO_num_contacts'] = rioData.rioNumContacts
            d['RIO_in_register']  = rioData.rioInRegister
            d['RIO_oo_register']  = rioData.rioOoRegister
            d['RIO_backwards']    = rioData.rioBackwards
            d['RIO']              = rioData.rioInRegister + rioData.rioOoRegister
            d['RIO_no_cat']       = rioData.rioNumContacts - ( rioData.rioInRegister + rioData.rioOoRegister )
            d['RIO_norm']         = float(d['RIO']) / float(d['native_pdb_num_residues'])
        else:
            d['AA_num_contacts']  = None
            d['RIO_num_contacts'] = None
            d['RIO_in_register']  = None
            d['RIO_oo_register']  = None
            d['RIO_backwards']    = None
            d['RIO']              = None
            d['RIO_no_cat']       = None
            d['RIO_norm']         = None
    
    #     # Now get the helix
    #     helixSequence = contacts.Rio().helixFromContacts( contacts=rioData.contacts,
    #                                                            dsspLog=dsspLog )
    #     if helixSequence is not None:
    #         ampleResult.rioHelixSequence = helixSequence
    #         ampleResult.rioLenHelix      = len( helixSequence )
    #         hfile = os.path.join( workdir, "{0}.helix".format( ampleResult.ensembleName ) )
    #         with open( hfile, 'w' ) as f:
    #             f.write( helixSequence+"\n" )
    
        #
        # This purely for checking and so we have pdbs to view
        # 
        # Wrap shelxe trace onto native using Csymmatch
        if not d['SHELXE_pdbout'] is None and os.path.isfile(fixpath(d['SHELXE_pdbout'])):
            csymmatch.Csymmatch().wrapModelToNative( fixpath(d['SHELXE_pdbout']),
                                                     amoptd['native_pdb'],
                                                     origin=mrOrigin,
                                                     workdir=fixpath(amoptd['benchmark_dir']))

        if not('SHELXE_wMPE' in d and d['SHELXE_wMPE']):
            try:
                mrinfo.analyse(d['SHELXE_pdbout'])
                d['SHELXE_MPE'] = mrinfo.MPE
                d['SHELXE_wMPE'] = mrinfo.wMPE
            except Exception as e:
                logger.critical("Error analysing SHELXE_pdbout: {0}\n{1}".format(d['SHELXE_pdbout'],e))    
                
        # Wrap parse_buccaneer model onto native
        if d['SXRBUCC_pdbout'] and os.path.isfile(fixpath(d['SXRBUCC_pdbout'])):
            # Need to rename Pdb as is just called buccSX_output.pdb
            csymmatchPdb = os.path.join(fixpath(amoptd['benchmark_dir']), "buccaneer_{0}_csymmatch.pdb".format(d['ensemble_name']))
    
            csymmatch.Csymmatch().wrapModelToNative( fixpath(d['SXRBUCC_pdbout']),
                                                     amoptd['native_pdb'],
                                                     origin=mrOrigin,
                                                     csymmatchPdb=csymmatchPdb,
                                                     workdir=fixpath(amoptd['benchmark_dir']))
            # Calculate phase error
            try:
                mrinfo.analyse(d['SXRBUCC_pdbout'])
                d['SXRBUCC_MPE'] = mrinfo.MPE
                d['SXRBUCC_wMPE'] = mrinfo.wMPE
            except Exception as e:
                logger.critical("Error analysing SXRBUCC_pdbout: {0}\n{1}".format(d['SXRBUCC_pdbout'],e))
                            
        # Wrap parse_buccaneer model onto native
        if d['SXRARP_pdbout'] and os.path.isfile(fixpath(d['SXRARP_pdbout'])):
            # Need to rename Pdb as is just called buccSX_output.pdb
            csymmatchPdb = os.path.join(fixpath(amoptd['benchmark_dir']), "arpwarp_{0}_csymmatch.pdb".format(d['ensemble_name']))
    
            csymmatch.Csymmatch().wrapModelToNative( fixpath(d['SXRARP_pdbout']),
                                                     amoptd['native_pdb'],
                                                     origin=mrOrigin,
                                                     csymmatchPdb=csymmatchPdb,
                                                     workdir=fixpath(amoptd['benchmark_dir']))
            # Calculate phase error
            try:
                mrinfo.analyse(d['SXRARP_pdbout'])
                d['SXRARP_MPE'] = mrinfo.MPE
                d['SXRARP_wMPE'] = mrinfo.wMPE
            except Exception as e:
                logger.critical("Error analysing SXRARP_pdbout: {0}\n{1}".format(d['SXRARP_pdbout'],e))
    return

[docs]def cluster_script(amoptd, python_path="ccp4-python"):
    """Create the script for benchmarking on a cluster"""
    # write out script
    work_dir = amoptd['work_dir']
    script_path = os.path.join(work_dir, "submit_benchmark.sh")
    with open(script_path, "w") as job_script:
        job_script.write("#!/bin/sh\n")
        # Find path to this directory to get path to python ensemble.py script
        pydir = os.path.abspath(os.path.dirname(__file__))
        benchmark_script = os.path.join(pydir, "benchmark_util.py")
        job_script.write("{0} {1} {2} {3}\n".format(python_path, "-u", benchmark_script, amoptd['results_path']))

    # Make executable
    os.chmod(script_path, 0o777)
    return script_path

[docs]def fixpath(path):
    # fix for analysing on a different machine
    if _oldroot and _newroot:
        return path.replace(_oldroot,_newroot)
    else:
        return path


# Run unit tests
if __name__ == "__main__":

    # This runs the benchmarking starting from a pickled file containing an amopt dictionary.
    # - used when submitting the modelling jobs to a cluster
    if len(sys.argv) != 2 or not os.path.isfile(sys.argv[1]):
        print("benchmark script requires the path to a pickled amopt dictionary!")
        sys.exit(1)

    # Get the amopt dictionary
    amoptd = ample_util.read_amoptd(sys.argv[1])

    # Set up logging - could append to an existing log?
    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)
    fl = logging.FileHandler(os.path.join(amoptd['work_dir'], "benchmark.log"))
    fl.setLevel(logging.DEBUG)
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    fl.setFormatter(formatter)
    logger.addHandler(fl)

    analyse(amoptd)
    ample_util.save_amoptd(amoptd)