# Source code for ample.ensembler.subcluster_util

"""Subcluster utility module"""

# Module metadata.
__author__ = "Jens Thomas, and Felix Simkovic"
__date__ = "02 Mar 2016"
__version__ = "1.0"

import logging
import random

# Module-level logger named after this module; subcluster_nmodels writes its
# debug trace through it.
logger = logging.getLogger(__name__)


def pick_nmodels(models, clusters, ensemble_max_models):
    """Randomly draw a combination of models that is not already in ``clusters``.

    Each draw is sorted and converted to a tuple before it is compared with
    the entries of ``clusters``, so only entries stored in that same form
    (sorted tuples) can match a draw.

    Parameters
    ----------
    models
        Population handed to ``random.sample``; its items must be sortable.
    clusters
        Iterable of previously used combinations. It is converted to a set,
        so its elements must be hashable.
    ensemble_max_models
        Upper limit on the number of models drawn. Fewer are drawn when
        ``models`` is shorter than this.

    Returns
    -------
    tuple or None
        The sorted tuple of drawn models, or ``None`` if 50 consecutive draws
        all produced a combination that is already in ``clusters``.
    """
    max_attempts = 50
    already_used = set(clusters)
    sample_size = min(len(models), ensemble_max_models)

    for _ in range(max_attempts):
        candidate = tuple(sorted(random.sample(models, sample_size)))
        if candidate not in already_used:
            return candidate

    # Every attempt collided with a previously used combination.
    return None
def slice_subcluster(cluster_files, previous_clusters, ensemble_max_models, radius, radius_thresholds):
    """Select a slice of ``cluster_files`` that has not been selected before.

    Parameters
    ----------
    cluster_files
        Sliceable sequence of models making up the subcluster.
    previous_clusters
        Container of previously selected slices; tested with ``in``.
    ensemble_max_models
        Maximum number of models in the returned slice.
    radius
        The radius threshold being processed; must be present in
        ``radius_thresholds`` when the cluster holds more than
        ``ensemble_max_models`` models.
    radius_thresholds
        Sequence of all radius thresholds. The position of ``radius`` in it
        decides where an alternative slice starts.

    Returns
    -------
    sequence or None
        The selected slice (or ``cluster_files`` itself when it is small
        enough), or ``None`` when the cluster is empty or the chosen slice is
        already in ``previous_clusters``.
    """
    n_files = len(cluster_files)
    if n_files == 0:
        return None

    n_thresholds = len(radius_thresholds)

    # Small cluster: there is nothing to slice, it is either new or not.
    if n_files <= ensemble_max_models:
        return None if cluster_files in previous_clusters else cluster_files

    position = radius_thresholds.index(radius)
    head = cluster_files[:ensemble_max_models]
    # The first threshold always gets the leading slice, even if it was seen before.
    if position == 0 or head not in previous_clusters:
        return head

    # More models than needed and the leading slice is taken, so pick a
    # different window. `surplus` is how many models exceed the maximum.
    surplus = n_files - ensemble_max_models

    # Last radius threshold: take the window that ends at the end of the list.
    if position + 1 == n_thresholds:
        tail = cluster_files[surplus:]
        return None if tail in previous_clusters else tail

    # Place the window start proportionally to the position of the radius in
    # the threshold list (-1 because the first threshold is always at the start).
    fraction = float(position) / float(n_thresholds - 1)
    offset = int(round(float(surplus) * fraction))
    window = cluster_files[offset:offset + ensemble_max_models]
    if not window or window in previous_clusters:
        return None
    return window
def subcluster_nmodels(nmodels, radius, clusterer, direction, increment):
    """Search for a clustering radius that yields exactly ``nmodels`` models.

    The radius is stepped up or down by ``increment``. Whenever the search
    overshoots (the required direction flips), the increment is divided by
    ten before the next step, so the search homes in on the target.

    Parameters
    ----------
    nmodels
        Number of models wanted in the subcluster.
    radius
        Radius to start the search from.
    clusterer
        Object providing ``cluster_by_radius(radius)``. The result must
        support ``len()`` or be falsy (e.g. ``None``), which counts as zero
        models.
    direction
        ``'up'`` or ``'down'``: the direction the radius was last moved in.
    increment
        Step size applied to the radius.

    Returns
    -------
    tuple
        ``(subcluster_models, radius)``: the last clustering result and the
        radius it was obtained with. The search stops when the model count
        matches ``nmodels``, when the radius leaves the range
        [0.0001, 100], or when the increment would drop to 0.00001 or below.
        In the latter two cases the model count may differ from ``nmodels``.
    """
    MINRADIUS = 0.0001
    MAXRADIUS = 100

    def lower_increment(increment):
        increment = increment / float(10)
        if increment <= 0.00001:
            # Call form of raise is valid on both Python 2 and Python 3.
            raise RuntimeError("increment out of bounds")
        return increment

    # Iterate rather than recurse so that a long search with a small
    # increment cannot exhaust the interpreter's recursion limit.
    while True:
        subcluster_models = clusterer.cluster_by_radius(radius)
        len_models = len(subcluster_models) if subcluster_models else 0
        logger.debug("subcluster nmodels: {0} {1} {2} {3} {4}".format(len_models, nmodels, radius, direction, increment))
        if len_models == nmodels or radius < MINRADIUS or radius > MAXRADIUS:
            logger.debug("nmodels: {0} radius: {1}".format(len_models, radius))
            return subcluster_models, radius

        try:
            if len_models > nmodels:
                # If we have more models than we want and we were increasing the
                # radius, we've overshot, so decrease the radius by a smaller
                # increment. If the radius equals the increment, the increment
                # must also be reduced before subtracting it. Both cases reduce
                # the increment; only the overshoot flips the direction.
                if direction == 'up' or abs(radius - increment) < 0.0000001:
                    if direction == 'up':
                        direction = 'down'
                    increment = lower_increment(increment)
                radius -= increment
            elif len_models < nmodels:
                # Too few models while decreasing the radius: overshot the
                # other way, so turn around with a smaller increment.
                if direction == 'down':
                    direction = 'up'
                    increment = lower_increment(increment)
                radius += increment
        except RuntimeError:
            # Can't get a match so just return what we have. Use len_models
            # rather than len(subcluster_models), which may be None here.
            logger.debug("subcluster nmodels exceeded increment. Returning: nmodels: {0} radius: {1}".format(len_models, radius))
            return subcluster_models, radius