Source code for ample.ensembler.single_model

"""Ensembler module for single model structures"""

__author__ = "Felix Simkovic, and Jens Thomas"
__date__ = "16 Feb 2016"
__version__ = "1.0"

import csv
import logging
import os
import pandas as pd

# NOTE(review): the next three imports have no package prefix, which looks
# like Python 2 implicit relative imports of sibling modules - confirm
# before running this module under Python 3.
import _ensembler
import truncation_util
from constants import SIDE_CHAIN_TREATMENTS
from ample.util import ample_util
from ample.util import pdb_edit

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


class SingleModelEnsembler(_ensembler.Ensembler):
    """Ensemble creator using a single input structure and a corresponding
    score file with per residue scores for truncation
    """

    def __init__(self, **kwargs):
        """Initialise the parent Ensembler and the score file attribute

        :kwargs: keyword arguments forwarded unchanged to the parent
                 Ensembler constructor
        """
        # Inherit all functions from Parent Ensembler
        super(SingleModelEnsembler, self).__init__(**kwargs)

        # Set SingleModelEnsembler specific parameters
        self.truncation_scorefile = None

    def generate_ensembles(self, models, ensembles_directory=None, nproc=None,
                           percent_truncation=None,
                           side_chain_treatments=SIDE_CHAIN_TREATMENTS,
                           truncation_method=None, truncation_pruning=None,
                           truncation_scorefile=None,
                           truncation_scorefile_header=None):
        """Method to generate ensembles from a single structure based on
        residue scores

        :models: sequence holding exactly one structure file path
        :ensembles_directory: accepted for interface compatibility
        :nproc: accepted for interface compatibility
        :percent_truncation: passed through to the truncator
        :side_chain_treatments: passed through to ``edit_side_chains``
        :truncation_method: falls back to ``self.truncation_method`` if unset
        :truncation_pruning: falls back to ``self.truncation_pruning`` if unset
        :truncation_scorefile: CSV score file INCLUDING header line; falls
                               back to ``self.truncation_scorefile`` if unset
        :truncation_scorefile_header: column labels; the first one is the
                                      residue column, every further one is a
                                      score column. The list is not modified.

        :returns: list of ensembles, also stored in ``self.ensembles``
        :raises RuntimeError: if the number of models is not exactly one, if
                              fewer than two header labels are given, if the
                              score file holds no rows or if a score label is
                              not a column of the score file
        """
        # NOTE(review): ``ensembles_directory`` and ``nproc`` are not used
        # here; the directory that is created is ``self.ensembles_directory``.
        # Presumably the parent class sets it - confirm.
        if not truncation_method:
            truncation_method = self.truncation_method
        if not truncation_pruning:
            truncation_pruning = self.truncation_pruning
        if not truncation_scorefile:
            truncation_scorefile = self.truncation_scorefile

        if not models:
            msg = "No structure provided"
            logger.critical(msg)
            raise RuntimeError(msg)

        if len(models) > 1:
            msg = "More than 1 structure provided"
            logger.critical(msg)
            raise RuntimeError(msg)

        # The default of None must be reported the same way as a short list
        if not truncation_scorefile_header or \
                len(truncation_scorefile_header) < 2:
            msg = "At least two header options for scorefile are required"
            logger.critical(msg)
            raise RuntimeError(msg)

        # Work on a copy so the caller's list (e.g. the one stored in the
        # ample dictionary) is left untouched
        header = list(truncation_scorefile_header)
        # Column names are stripped when the score file is read, so the
        # residue key is stripped as well before it is looked up
        residue_key = header[0].strip().lower()
        # A real list rather than map(): it is iterated twice below and
        # map() yields a one-shot iterator on Python 3
        score_keys = [label.strip() for label in header[1:]]

        # standardise the structure
        std_models_dir = os.path.join(self.work_dir, "std_models")
        self._ensure_directory(std_models_dir)

        std_model = ample_util.filename_append(models[0], 'std',
                                               std_models_dir)
        pdb_edit.standardise(pdbin=models[0], pdbout=std_model,
                             del_hetatm=True)
        std_models = [std_model]
        logger.info('Standardised input model: %s', std_models[0])

        # Create final ensembles directory
        self._ensure_directory(self.ensembles_directory)

        truncate_dir = os.path.join(self.work_dir, "single_truncate")
        self._ensure_directory(truncate_dir)

        # Read all the scores into a per residue dictionary
        residue_scores = self._read_scorefile(truncation_scorefile)
        if not residue_scores:
            msg = "No residue scores found in scorefile"
            logger.critical(msg)
            raise RuntimeError(msg)

        if not all(key in residue_scores[0] for key in score_keys):
            msg = "Not all column labels are in your CSV file"
            logger.critical(msg)
            raise RuntimeError(msg)

        self.ensembles = []
        for score_key in score_keys:
            zipped_scores = self._generate_residue_scorelist(residue_key,
                                                             score_key,
                                                             residue_scores)

            # Every score column gets its own truncation work directory
            score_truncate_dir = os.path.join(truncate_dir,
                                              "{}".format(score_key))
            self._ensure_directory(score_truncate_dir)

            self.truncator = truncation_util.Truncator(
                work_dir=score_truncate_dir)
            self.truncator.theseus_exe = self.theseus_exe

            truncations = self.truncator.truncate_models(
                models=std_models,
                truncation_method=truncation_method,
                percent_truncation=percent_truncation,
                truncation_pruning=truncation_pruning,
                residue_scores=zipped_scores)

            for truncation in truncations:
                pre_ensemble = _ensembler.Ensemble()
                pre_ensemble.num_residues = truncation.num_residues
                pre_ensemble.truncation_dir = truncation.directory
                pre_ensemble.truncation_level = truncation.level
                pre_ensemble.truncation_method = truncation.method
                pre_ensemble.truncation_percent = truncation.percent
                pre_ensemble.truncation_residues = truncation.residues
                pre_ensemble.truncation_variance = truncation.variances
                pre_ensemble.truncation_score_key = score_key.lower()
                pre_ensemble.pdb = truncation.models[0]

                self.ensembles.extend(
                    self.edit_side_chains(pre_ensemble,
                                          side_chain_treatments,
                                          single_structure=True))

        return self.ensembles

    def generate_ensembles_from_amoptd(self, models, amoptd):
        """Generate ensembles from data in supplied ample data dictionary.

        :models: sequence holding exactly one structure file path
        :amoptd: ample data dictionary; the keys read are ``percent``,
                 ``side_chain_treatments``, ``truncation_method``,
                 ``truncation_pruning``, ``truncation_scorefile`` and
                 ``truncation_scorefile_header``

        :returns: list of ensembles as returned by ``generate_ensembles``
        """
        kwargs = {
            'percent_truncation': amoptd['percent'],
            'side_chain_treatments': amoptd['side_chain_treatments'],
            'truncation_method': amoptd['truncation_method'],
            'truncation_pruning': amoptd['truncation_pruning'],
            'truncation_scorefile': amoptd['truncation_scorefile'],
            'truncation_scorefile_header':
                amoptd['truncation_scorefile_header'],
        }
        # Drop unset options so the defaults of generate_ensembles apply;
        # items() works on both Python 2 and Python 3
        kwargs = {k: v for k, v in kwargs.items() if v is not None}
        return self.generate_ensembles(models, **kwargs)

    @staticmethod
    def _ensure_directory(path):
        """Create the directory ``path`` unless it already exists

        :path: directory to create
        """
        if not os.path.isdir(path):
            os.mkdir(path)

    @staticmethod
    def _generate_residue_scorelist(residue_key, score_key, scores):
        """Generate a zipped list of residue indexes and corresponding scores

        :residue_key: residue column header keyword
        :score_key: score column header keyword
        :scores: list of dictionaries for each residue

        :returns: zipped list of residue index plus score
        :raises RuntimeError: if ``scores`` is empty or either key is not
                              present in the first entry of ``scores``
        """
        # Explicit raises rather than assert: asserts vanish under -O
        if not scores:
            raise RuntimeError("No residue scores provided")
        if residue_key not in scores[0]:
            raise RuntimeError("Cannot find residue key in scoresfile")
        if score_key not in scores[0]:
            raise RuntimeError("Cannot find score key in scoresfile")
        return [(row[residue_key], row[score_key]) for row in scores]

    @staticmethod
    def _read_scorefile(scorefile):
        """Read a CSV score file into per residue dictionaries

        :scorefile: CSV score file INCLUDING header line

        :returns: list of per residue dictionaries containing column data
        """
        df = pd.read_csv(scorefile)
        # Header labels may carry surrounding whitespace
        df.rename(columns=lambda x: x.strip(), inplace=True)
        # list() keeps the result indexable on Python 3, where values()
        # returns a view
        return list(df.T.to_dict().values())