Source code for ample.ensembler.subcluster_util
"""Subcluster utility module"""
__author__ = "Jens Thomas, and Felix Simkovic"
__date__ = "02 Mar 2016"
__version__ = "1.0"
import logging
import random
logger = logging.getLogger(__name__)
[docs]def pick_nmodels(models, clusters, ensemble_max_models):
MAXTRIES = 50
tries = 0
clusters = set(clusters)
nmodels = min(len(models), ensemble_max_models)
while True:
subcluster = random.sample(models, nmodels)
subcluster = tuple(sorted(subcluster))
if subcluster not in clusters: break
tries += 1
if tries >= MAXTRIES: return None
return subcluster
[docs]def slice_subcluster(cluster_files, previous_clusters, ensemble_max_models, radius, radius_thresholds):
"""Select a unique set of models from a subcluster of models.
"""
len_cluster = len(cluster_files)
if not len_cluster: return None
len_radius_thresholds = len(radius_thresholds)
if len_cluster <= ensemble_max_models:
if cluster_files not in previous_clusters: return cluster_files
else: return None
if len_cluster > ensemble_max_models:
idx = radius_thresholds.index(radius)
selected = cluster_files[:ensemble_max_models]
if idx == 0 or selected not in previous_clusters: return selected
# Here we have more models then we need, but the first slice has already been selected
# we therefore need to select another slice
# If last radius threshold, just take the slice to the end
if idx + 1 == len_radius_thresholds:
start = len_cluster - ensemble_max_models
selected = cluster_files[start:]
if selected not in previous_clusters:
return selected
else:
return None
# Work out how many residues are extra
remainder = len_cluster - ensemble_max_models
# Use the position of the radius in the list of radii to work out where to start this slice
prop = float(idx) / float(len(radius_thresholds) - 1) # -1 as the first is always at the start
# Work out how many residues in to the remainder to start
start = int(round(float(remainder) * prop))
selected = cluster_files[start : start + ensemble_max_models]
if selected and selected not in previous_clusters:
return selected
else:
return None
return None
[docs]def subcluster_nmodels(nmodels, radius, clusterer, direction, increment):
MINRADIUS = 0.0001
MAXRADIUS = 100
subcluster_models = clusterer.cluster_by_radius(radius)
len_models = len(subcluster_models) if subcluster_models else 0
logger.debug("subcluster nmodels: {0} {1} {2} {3} {4}".format(len_models, nmodels, radius, direction, increment))
if len_models == nmodels or radius < MINRADIUS or radius > MAXRADIUS:
logger.debug("nmodels: {0} radius: {1}".format(len_models, radius))
return subcluster_models, radius
def lower_increment(increment):
increment = increment / float(10)
if increment <= 0.00001: raise RuntimeError, "increment out of bounds"
return increment
# Am sure the logic could be improved here, but it seems to work
try:
if len_models > nmodels:
# If we have more models than we want and we are increasing the radius, we've overshot, so we need to
# decrease the radius but by a smaller increment
# If the radius is the same as the increment, we need to decrease the incrememnt before we subtract it
# as both of the above require decreasing the increment we have one test and just change the direction
# for the overshoot
if direction == 'up' or abs(radius - increment) < 0.0000001:
if direction == 'up': direction = 'down'
increment = lower_increment(increment)
radius -= increment
elif len_models < nmodels:
if direction == 'down' :
direction = 'up'
increment = lower_increment(increment)
radius += increment
except RuntimeError:
# Can't get a match so just return what we have
logger.debug("subcluster nmodels exceeded increment. Returning: nmodels: {0} radius: {1}".format(len(subcluster_models), radius))
return subcluster_models, radius
return subcluster_nmodels(nmodels, radius, clusterer, direction, increment)