Source code for ample.util.fast_protein_cluster


import os

from ample.util import ample_util
from ample.ensembler._ensembler import Cluster

[docs]class FPC(object):
    """
    Class 
    """

    def __init__(self):
        pass

[docs]    def cluster(self,
                models=None,
                num_clusters=None,
                nproc=1,
                score_type="rmsd",
                cluster_method="kmeans",
                work_dir=None,
                fpc_exe=None,
                max_cluster_size=200,
                benchmark=False
                ):
        
        # FPC default if 5 clusters - we just run with this for the time being
        FPC_NUM_CLUSTERS=5
        if num_clusters is None or num_clusters > FPC_NUM_CLUSTERS:
            msg = "Cannot work with more than {0} clusters, got: {1}.".format(FPC_NUM_CLUSTERS,num_clusters)
            raise RuntimeError(msg)
  
        owd=os.getcwd()
        if not os.path.isdir(work_dir): os.mkdir(work_dir)
        os.chdir(work_dir)
        
        if not len(models) or not all([os.path.isfile(m) for m in models]):
            msg = "Missing models: {0}".format(models)
            raise RuntimeError(msg)
        
        # Create list of files
        flist='files.list'
        with open(flist,'w') as f:
            for m in models:
                f.write("{0}\n".format(os.path.abspath(m)))
        
        if not os.path.isfile(fpc_exe):
            msg = "Cannot find fast_protein_cluster executable: {0}".format(fpc_exe)
            raise RuntimeError(msg)
        
        # Build up the command-line
        cmd=[fpc_exe]
        if score_type=="rmsd":
            cmd += ['--rmsd']
        elif score_type=="tm":
            cmd += ['--tmscore']
        else:
            msg = "Unrecognised score_type: {0}".format(score_type)
            raise RuntimeError(msg)
        
        if cluster_method=="kmeans":
            cmd += ['--cluster_kmeans']
        elif cluster_method=="hcomplete":
            cmd += ['--cluster_hcomplete']
        else:
            msg = "Unrecognised cluster_method: {0}".format(cluster_method)
            raise RuntimeError(msg)
        
        if nproc > 1: cmd += ['--nthreads',str(nproc)]
        
        # Always save the distance matrix
        cmd += ['--write_text_matrix','matrix.txt']
        
        # For benchmark we use a constant seed to make sure we get the same results
        if benchmark: cmd += ['-S','1']
        
        # Finally the list of files
        cmd += ['-i',flist]
        
        logfile=os.path.abspath("fast_protein_cluster.log")
        retcode = ample_util.run_command(cmd,logfile=logfile)
        if retcode != 0:
            msg = "non-zero return code for fast_protein_cluster in cluster!\nCheck logfile:{0}".format(logfile)
            raise RuntimeError(msg)
    
        cluster_list='cluster_output.clusters'
        cluster_stats='cluster_output.cluster.stats'
        if not os.path.isfile(cluster_list) or not os.path.isfile(cluster_stats):
            msg = "Cannot find files: {0} and {1}".format(cluster_list,cluster_stats)
            raise RuntimeError(msg)
        
        # Check stats and get centroids
        csizes=[]
        centroids=[]
        with open(cluster_stats) as f:
            for line in f:
                if line.startswith("Cluster:"):
                    fields=line.split()
                    csizes.append(int(fields[4]))
                    centroids.append(fields[7])
        
        if len(csizes) != FPC_NUM_CLUSTERS:
            msg = "Found {0} clusters in {1} but was expecting {2}".format(len(csizes),cluster_stats,FPC_NUM_CLUSTERS)
            raise RuntimeError(msg)
        
        all_clusters=[[] for i in range(FPC_NUM_CLUSTERS)]
        # Read in the clusters
        with open(cluster_list) as f:
            for line in f:
                fields=line.split()
                model=fields[0]
                idxCluster=int(fields[1])
                all_clusters[idxCluster].append(model)
        
        # Check
        if False:
            # Ignore this test for now as there seems to be a bug in fast_protein_cluster with the printing of sizes
            maxc=None
            for i,cs in enumerate(csizes):
                if not cs == len(all_clusters[i]):
                    msg = "Cluster {0} size {1} does not match stats size {2}".format(i,len(all_clusters[i]),cs)
                    raise RuntimeError(msg)
                if i==0:
                    maxc=cs
                else:
                    if cs > maxc:
                        msg = "Clusters do not appear to be in size order!"
                        raise RuntimeError(msg)
                    
        # make sure all clusters are < max_cluster_size
        for i, c in enumerate(all_clusters):
            if len(c) > max_cluster_size:
                all_clusters[i]=c[:max_cluster_size]
        
        # Create the data - we loop through the number of clusters specified by the user
        clusters=[]
        for i in range(num_clusters):
            cluster = Cluster()
            cluster.method = cluster_method
            cluster.score_type = score_type
            cluster.index = i + 1
            cluster.centroid = centroids[i]
            cluster.num_clusters = num_clusters
            cluster.models = all_clusters[i]
        os.chdir(owd)
        return clusters