Source code for ample.util.fast_protein_cluster


import os

from ample.util import ample_util
from ample.ensembler._ensembler import Cluster

[docs]class FPC(object): """ Class """ def __init__(self): pass
[docs] def cluster(self, models=None, num_clusters=None, nproc=1, score_type="rmsd", cluster_method="kmeans", work_dir=None, fpc_exe=None, max_cluster_size=200, benchmark=False ): # FPC default if 5 clusters - we just run with this for the time being FPC_NUM_CLUSTERS=5 if num_clusters is None or num_clusters > FPC_NUM_CLUSTERS: msg = "Cannot work with more than {0} clusters, got: {1}.".format(FPC_NUM_CLUSTERS,num_clusters) raise RuntimeError(msg) owd=os.getcwd() if not os.path.isdir(work_dir): os.mkdir(work_dir) os.chdir(work_dir) if not len(models) or not all([os.path.isfile(m) for m in models]): msg = "Missing models: {0}".format(models) raise RuntimeError(msg) # Create list of files flist='files.list' with open(flist,'w') as f: for m in models: f.write("{0}\n".format(os.path.abspath(m))) if not os.path.isfile(fpc_exe): msg = "Cannot find fast_protein_cluster executable: {0}".format(fpc_exe) raise RuntimeError(msg) # Build up the command-line cmd=[fpc_exe] if score_type=="rmsd": cmd += ['--rmsd'] elif score_type=="tm": cmd += ['--tmscore'] else: msg = "Unrecognised score_type: {0}".format(score_type) raise RuntimeError(msg) if cluster_method=="kmeans": cmd += ['--cluster_kmeans'] elif cluster_method=="hcomplete": cmd += ['--cluster_hcomplete'] else: msg = "Unrecognised cluster_method: {0}".format(cluster_method) raise RuntimeError(msg) if nproc > 1: cmd += ['--nthreads',str(nproc)] # Always save the distance matrix cmd += ['--write_text_matrix','matrix.txt'] # For benchmark we use a constant seed to make sure we get the same results if benchmark: cmd += ['-S','1'] # Finally the list of files cmd += ['-i',flist] logfile=os.path.abspath("fast_protein_cluster.log") retcode = ample_util.run_command(cmd,logfile=logfile) if retcode != 0: msg = "non-zero return code for fast_protein_cluster in cluster!\nCheck logfile:{0}".format(logfile) raise RuntimeError(msg) cluster_list='cluster_output.clusters' cluster_stats='cluster_output.cluster.stats' if not os.path.isfile(cluster_list) or not os.path.isfile(cluster_stats): msg = "Cannot find files: {0} and {1}".format(cluster_list,cluster_stats) raise RuntimeError(msg) # Check stats and get centroids csizes=[] centroids=[] with open(cluster_stats) as f: for line in f: if line.startswith("Cluster:"): fields=line.split() csizes.append(int(fields[4])) centroids.append(fields[7]) if len(csizes) != FPC_NUM_CLUSTERS: msg = "Found {0} clusters in {1} but was expecting {2}".format(len(csizes),cluster_stats,FPC_NUM_CLUSTERS) raise RuntimeError(msg) all_clusters=[[] for i in range(FPC_NUM_CLUSTERS)] # Read in the clusters with open(cluster_list) as f: for line in f: fields=line.split() model=fields[0] idxCluster=int(fields[1]) all_clusters[idxCluster].append(model) # Check if False: # Ignore this test for now as there seems to be a bug in fast_protein_cluster with the printing of sizes maxc=None for i,cs in enumerate(csizes): if not cs == len(all_clusters[i]): msg = "Cluster {0} size {1} does not match stats size {2}".format(i,len(all_clusters[i]),cs) raise RuntimeError(msg) if i==0: maxc=cs else: if cs > maxc: msg = "Clusters do not appear to be in size order!" raise RuntimeError(msg) # make sure all clusters are < max_cluster_size for i, c in enumerate(all_clusters): if len(c) > max_cluster_size: all_clusters[i]=c[:max_cluster_size] # Create the data - we loop through the number of clusters specified by the user clusters=[] for i in range(num_clusters): cluster = Cluster() cluster.method = cluster_method cluster.score_type = score_type cluster.index = i + 1 cluster.centroid = centroids[i] cluster.num_clusters = num_clusters cluster.models = all_clusters[i] os.chdir(owd) return clusters