"""Ensembler module for single model structures"""
__author__ = "Felix Simkovic, and Jens Thomas"
__date__ = "16 Feb 2016"
__version__ = "1.0"
import csv
import logging
import os
import pandas as pd
import _ensembler
import truncation_util
from constants import SIDE_CHAIN_TREATMENTS
from ample.util import ample_util
from ample.util import pdb_edit
logger = logging.getLogger(__name__)
class SingleModelEnsembler(_ensembler.Ensembler):
    """Ensemble creator using a single input structure and a corresponding
    score file with per residue scores for truncation
    """

    def __init__(self, **kwargs):
        """Initialise the ensembler.

        :kwargs: keyword arguments forwarded unchanged to the parent Ensembler
        """
        # Inherit all functions from Parent Ensembler
        super(SingleModelEnsembler, self).__init__(**kwargs)
        # Set SingleModelEnsembler specific parameters
        # Path to the CSV file holding per-residue truncation scores;
        # set later (or via generate_ensembles kwargs) before use.
        self.truncation_scorefile = None
[docs] def generate_ensembles(self,
models,
ensembles_directory=None,
nproc=None,
percent_truncation=None,
side_chain_treatments=SIDE_CHAIN_TREATMENTS,
truncation_method=None,
truncation_pruning=None,
truncation_scorefile=None,
truncation_scorefile_header=None):
"""Method to generate ensembles from a single structure based on
residue scores"""
if not truncation_method:
truncation_method = self.truncation_method
if not truncation_pruning:
truncation_pruning = self.truncation_pruning
if not truncation_scorefile:
truncation_scorefile = self.truncation_scorefile
if len(models) > 1:
msg = "More than 1 structure provided"
logger.critical(msg)
raise RuntimeError(msg)
if len(truncation_scorefile_header) < 2:
msg = "At least two header options for scorefile are required"
logger.critical(msg)
raise RuntimeError(msg)
# standardise the structure
std_models_dir = os.path.join(self.work_dir, "std_models")
os.mkdir(std_models_dir)
std_model = ample_util.filename_append(models[0], 'std',
std_models_dir)
pdb_edit.standardise(pdbin=models[0], pdbout=std_model,
del_hetatm=True)
std_models = [std_model]
logger.info('Standardised input model: %s', std_models[0])
# Create final ensembles directory
if not os.path.isdir(self.ensembles_directory):
os.mkdir(self.ensembles_directory)
truncate_dir = os.path.join(self.work_dir, "single_truncate")
if not os.path.isdir(truncate_dir):
os.mkdir(truncate_dir)
# Read all the scores into a per residue dictionary
assert len(truncation_scorefile_header) > 1, \
"At least two column labels are required"
residue_scores = self._read_scorefile(truncation_scorefile)
residue_key = truncation_scorefile_header.pop(0).lower()
truncation_scorefile_header = map(str.strip,
truncation_scorefile_header)
assert all(h in residue_scores[0] for h in truncation_scorefile_header), \
"Not all column labels are in your CSV file"
self.ensembles = []
for score_key in truncation_scorefile_header:
zipped_scores = self._generate_residue_scorelist(residue_key,
score_key,
residue_scores)
score_truncate_dir = os.path.join(truncate_dir,
"{}".format(score_key))
if not os.path.isdir(score_truncate_dir):
os.mkdir(score_truncate_dir)
self.truncator = truncation_util.Truncator(
work_dir=score_truncate_dir)
self.truncator.theseus_exe = self.theseus_exe
for truncation in self.truncator.truncate_models(models=std_models,
truncation_method=truncation_method,
percent_truncation=percent_truncation,
truncation_pruning=truncation_pruning,
residue_scores=zipped_scores):
pre_ensemble = _ensembler.Ensemble()
pre_ensemble.num_residues = truncation.num_residues
pre_ensemble.truncation_dir = truncation.directory
pre_ensemble.truncation_level = truncation.level
pre_ensemble.truncation_method = truncation.method
pre_ensemble.truncation_percent = truncation.percent
pre_ensemble.truncation_residues = truncation.residues
pre_ensemble.truncation_variance = truncation.variances
pre_ensemble.truncation_score_key = score_key.lower()
pre_ensemble.pdb = truncation.models[0]
for ensemble in self.edit_side_chains(pre_ensemble,
side_chain_treatments,
single_structure=True):
self.ensembles.append(ensemble)
return self.ensembles
[docs] def generate_ensembles_from_amoptd(self, models, amoptd):
"""Generate ensembles from data in supplied ample data dictionary."""
kwargs = {'percent_truncation': amoptd['percent'],
'side_chain_treatments': amoptd['side_chain_treatments'],
'truncation_method': amoptd['truncation_method'],
'truncation_pruning': amoptd['truncation_pruning'],
'truncation_scorefile': amoptd['truncation_scorefile'],
'truncation_scorefile_header': amoptd['truncation_scorefile_header']}
kwargs = {k: v for k, v in kwargs.iteritems() if v is not None}
return self.generate_ensembles(models, **kwargs)
@staticmethod
def _generate_residue_scorelist(residue_key, score_key, scores):
"""Generate a zipped list of residue indexes and corresponding scores
:residue_key: residue column header keyword
:score_key: score column header keyword
:scores: list of dictionaries for each residue
:returns: zipped list of residue index plus score
"""
assert residue_key in scores[0], "Cannot find residue key in scoresfile"
assert score_key in scores[0], "Cannot find score key in scoresfile"
return [(i[residue_key], i[score_key]) for i in scores]
@staticmethod
def _read_scorefile(scorefile):
"""
:scorefile: CSV score file INCLUDING header line
:returns: list of per residue dictionaries containing column data
"""
df = pd.read_csv(scorefile)
df.rename(columns=lambda x: x.strip(), inplace=True)
return df.T.to_dict().values()