Source code for ample.util.cluster_entropy

#!/usr/bin/python

#
# Cluster models by structural similarity: divide the models into parallel
# batches, select one model, compare it to the rest and collect the matches
# into a cluster; repeat until the largest cluster is found, then sort the
# clusters by size and rewrite them.
import glob
import os
import re
from multiprocessing import Process, JoinableQueue


##############
def count_files_in_dir(path):
    """Return the number of PDB files in the given directory."""
    counter = 0
    for infile in glob.glob(os.path.join(path, '*.pdb')):
        counter += 1
    return counter
##################
def split(models_path, outpath, NProc, NMODELS):
    """Split the models into folders so the comparisons can run in parallel."""
    divide = NMODELS / NProc  # number of models in each split folder (integer division)
    if divide < 1:
        divide = 1
    divided_files = []
    files = []
    for infile in glob.glob(os.path.join(models_path, '*.pdb')):
        files.append(infile)
    files.sort()
    for i in xrange(0, len(files), divide):
        divided_files.append(outpath + 'divide/' + str(i))
        if not os.path.exists(outpath + 'divide/'):
            os.system('mkdir ' + outpath + 'divide/')
        if not os.path.exists(outpath + 'divide/' + str(i)):
            os.system('mkdir ' + outpath + 'divide/' + str(i))
        for j in files[i:i + divide]:
            os.system('cp ' + str(j) + ' ' + outpath + 'divide/' + str(i))
    return divided_files
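# A worked example of the batching arithmetic above (hypothetical numbers):
# with NMODELS = 10 and NProc = 4, divide = 10 / 4 = 2 under Python 2
# integer division, so folders divide/0, divide/2, divide/4, divide/6 and
# divide/8 each receive two sorted models -- five batches, not four, which
# is why the caller may end up with more processes than requested.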
###############
def test_2_files(test1, test2, path_to_LGA, gdtout):
    """Run LGA on a pair of models and return the GDT score from its SUMMARY line."""
    SCORE = ''
    cur_path = os.getcwd()
    run_dir = cur_path + '/run_dir_' + gdtout
    if not os.path.exists(run_dir):
        os.system('mkdir ' + run_dir)
    os.system('cp ' + path_to_LGA + '/lga ' + run_dir)
    os.system('mkdir ' + run_dir + '/MOL2')
    os.system('mkdir ' + run_dir + '/TMP')
    os.chdir(run_dir)
    # write a shell script that runs LGA on CA atoms
    my_LGA_run = open(run_dir + '/LGA_run' + gdtout, "w")
    my_LGA_run.write('ulimit -s unlimited\n' + run_dir + '/lga run.pdb -3 -sda -atom:CA \n')
    my_LGA_run.close()
    os.system('chmod ugo=rwx ' + run_dir + '/LGA_run' + gdtout)
    # concatenate the two models into the MOLECULE/ENDMOL format LGA expects
    test1 = open(test1)
    test2 = open(test2)
    test1_out = open(run_dir + '/MOL2/run.pdb', "w")
    test1_out.write('MOLECULE 1\n')
    for line in test1:
        test1_out.write(line)
    test1_out.write('ENDMOL\n')
    test1_out.write('MOLECULE 2\n')
    for line in test2:
        test1_out.write(line)
    test1_out.write('END\n')
    test1_out.close()
    # run LGA and pull the score out of the SUMMARY(GDT) line
    os.system(run_dir + '/LGA_run' + gdtout + ' > RESULTS')
    RESULTS = open(run_dir + '/RESULTS')
    for line in RESULTS:
        if re.match('^SUMMARY', line):
            stat1_get = re.split('SUMMARY\(GDT\)\s*(\d*)\s*(\d*)\s*(\d*\.\d*)\s*(\d*)\s*(\d*\.\d*)\s*(\d*\.\d*)', line)
            SCORE = stat1_get[6]
    os.chdir(cur_path)
    return SCORE
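# A minimal, self-contained sketch of the SUMMARY(GDT) parsing above. The
# sample line is illustrative only; real LGA output may differ:
#
#   sample = 'SUMMARY(GDT) 100 98 3.00 95 1.85 82.50'
#   fields = re.split('SUMMARY\(GDT\)\s*(\d*)\s*(\d*)\s*(\d*\.\d*)\s*(\d*)\s*(\d*\.\d*)\s*(\d*\.\d*)', sample)
#   print fields[6]  # -> '82.50', the value compared against the threshold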
################
# compare one split folder - threaded
# takes the split folder, the top model, a name suffix, the cluster output
# directory and the GDT threshold for comparison (20, 25, ...)
def compare_one_split_folder(folder, random_model_to_compare, suffix, current_cluster_output, threshold, q, path_to_LGA):
    # loop through the models in the folder; move any model whose GDT score
    # against the top model reaches the threshold into the current cluster
    for infile in glob.glob(os.path.join(folder, '*.pdb')):
        name = os.path.splitext(os.path.basename(infile))[0]
        SCORE = test_2_files(random_model_to_compare, infile, path_to_LGA, suffix + '_' + name)
        if float(SCORE) >= float(threshold):
            os.system('mv ' + infile + ' ' + current_cluster_output + '/')
    # q (a JoinableQueue) is unused; counting via the queue is disabled:
    #Number_passed += 1
    #q.put(Number_passed)
########################
def SORT_clusters(LIST_of_clusters, outpath):
    """Sort clusters by size and renumber them; entries are [cluster, size]."""
    sorted_by_size = sorted(LIST_of_clusters, key=lambda entry: entry[1], reverse=True)
    print sorted_by_size
    largest_size = sorted_by_size[0][1]
    sorted_cluster = 0
    for cluster in sorted_by_size:
        os.system('mv ' + outpath + '/cluster' + str(cluster[0]) + ' ' + outpath + '/sorted_cluster_' + str(sorted_cluster))
        sorted_cluster += 1
    return largest_size, outpath + '/sorted_cluster_0'
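# Worked example of the sort above (hypothetical sizes): given
# LIST_of_clusters = [[0, 3], [1, 12], [2, 7]], sorting on the size field
# gives [[1, 12], [2, 7], [0, 3]], so cluster1 is renamed sorted_cluster_0,
# cluster2 becomes sorted_cluster_1, cluster0 becomes sorted_cluster_2, and
# the function returns (12, outpath + '/sorted_cluster_0').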
#########################
def cluster(path_of_models, LGA, nproc, NMODELS, outpath, list_of_cores):
    #list_of_cores = [20]#, 25, 30, 40, 50, 60, 70, 80]

    # work on a temporary copy of the models
    if os.path.exists(outpath + '/temp'):
        os.system('rm -r ' + outpath + '/temp')
    os.system('mkdir ' + outpath + '/temp')
    for infile in glob.glob(os.path.join(path_of_models, '*.pdb')):
        os.system('cp ' + infile + ' ' + outpath + '/temp/')
    path_of_models = outpath + '/temp'

    random_model_to_compare = ''
    LIST_of_clusters = []
    LIST_of_cluster_sizes = []
    used_random_models = []

    for core in list_of_cores:
        if not os.path.exists(outpath + 'cluster_' + str(core)):
            os.system('mkdir ' + outpath + 'cluster_' + str(core))
        os.chdir(outpath + 'cluster_' + str(core))
        if os.path.exists(outpath + 'split/'):
            os.system('rm -r ' + outpath + 'split/')
        if not os.path.exists(outpath + 'split/'):
            os.mkdir(outpath + 'split/')
        list_of_split_paths = split(path_of_models, outpath + 'split/', nproc, NMODELS)

        GOT_LARGEST_CLUSTER = False
        cluster = 0
        temp_path_name = path_of_models
        files = []
        for infile in glob.glob(os.path.join(path_of_models, '*.pdb')):
            files.append(infile)

        while not GOT_LARGEST_CLUSTER:
            current_cluster = outpath + 'cluster_' + str(core) + '/cluster' + str(cluster)
            os.system('mkdir ' + current_cluster)

            # pick the next unused model as the starting 'top model'
            for each_file in files:
                if each_file not in used_random_models:
                    random_model_to_compare = each_file
                    used_random_models.append(random_model_to_compare)
                    break
            os.chdir(outpath)

            # run the comparisons against the 'top model' in parallel, one
            # process per split folder, and keep the similar models. Each
            # comparison takes little processor power, so more processes
            # than cores is acceptable.
            q = JoinableQueue()
            threads = []
            counter = 0
            for split_path in list_of_split_paths:
                counter += 1
                suffix = str(counter)
                thread = Process(target=compare_one_split_folder,
                                 args=(split_path, random_model_to_compare, suffix,
                                       current_cluster, core, q, LGA))
                thread.start()
                threads.append(thread)
            for thread in threads:
                thread.join()

            # count the models that landed in the new cluster and drop them
            # from the list of unsorted files
            number_in_cluster = 0
            for infile in glob.glob(os.path.join(current_cluster, '*.pdb')):
                number_in_cluster += 1
                name = re.split('/', infile)
                name = name.pop()
                #if temp_path_name+'/'+name in files:
                files.remove(temp_path_name + '/' + name)
            LIST_of_clusters.append([cluster, number_in_cluster])
            LIST_of_cluster_sizes.append(int(number_in_cluster))
            print 'current cluster size= ', number_in_cluster

            if float(number_in_cluster) >= float(NMODELS / 2):
                # a cluster holding half the models must be the largest
                GOT_LARGEST_CLUSTER = True
                print 'got_largest_cluster', number_in_cluster, NMODELS, ' cluster_number ', cluster
                largest_size, largest_cluster_path = SORT_clusters(LIST_of_clusters, outpath + '/cluster_' + str(core))
                return largest_size, largest_cluster_path
            else:
                cluster += 1
                # check if all done: a cluster may already be larger than the
                # remaining unsorted files
                LIST_of_cluster_sizes.sort()
                LIST_of_cluster_sizes.reverse()
                total_files_in_all_clusters = 0
                for sizes in LIST_of_cluster_sizes:
                    total_files_in_all_clusters += sizes
                print 'CLUSTER= ', cluster, ' sorted= ', total_files_in_all_clusters, ' remaining= ', len(files), ' largest so far = ', LIST_of_cluster_sizes[0]
                if float(LIST_of_cluster_sizes[0]) >= float(len(files)):
                    # no later cluster can beat the current largest
                    largest_size, largest_cluster_path = SORT_clusters(LIST_of_clusters, outpath + '/cluster_' + str(core))
                    return largest_size, largest_cluster_path
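# Worked example of the two stopping tests above (hypothetical numbers):
# with NMODELS = 100, any cluster of 50 or more models ends the loop at
# once (number_in_cluster >= NMODELS/2). Failing that, suppose clusters of
# 45 and 30 models have been peeled off so far: 25 models remain unsorted,
# and the largest cluster (45) already exceeds them, so no later cluster
# can beat it and the function returns early.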
############
def RUN_CLUSTER(levels, path, path_to_LGA, outpath, nproc, NMODELS):
    """Cluster at each GDT level in turn, feeding the largest cluster forward."""
    LOG = open(outpath + '/CLUSTERING_LOG', "w")
    largest_cluster = path
    cluster_results = []
    index = 0
    for level in levels:
        using_level = [levels[index]]
        NMODELS, largest_cluster = cluster(largest_cluster, path_to_LGA, nproc, NMODELS, outpath, using_level)
        print 'DONE', largest_cluster
        print 'NEW NMODELS ', NMODELS
        LOG.write(str(level) + ' ' + str(NMODELS) + '\n')
        if NMODELS > nproc:
            nproc = NMODELS - 1
        cluster_results.append(NMODELS)
        index += 1
        if NMODELS <= 2:
            # stop at this threshold rather than crash on the next level
            return cluster_results
    return cluster_results
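# Illustration of the cascade above (hypothetical numbers): with
# levels = [20, 25, 30], the first pass clusters all models at GDT 20;
# only the largest cluster survives and is re-clustered at 25, then at 30,
# so NMODELS shrinks at each level and the loop stops early once two or
# fewer models remain.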
####################################
# example usage (historical test settings, commented out):
#levels = [20, 25, 30, 40, 50, 60]
#path = '/home/jaclyn/Desktop/backup_scripts/NEW_WORKFLOW/MASTER_parallel/PROGRAM/Version_0.1/test/models'
#path_to_LGA = '/home/jaclyn/Desktop/cmd/casp/LGA_package'
#outpath = '/home/jaclyn/Desktop/backup_scripts/NEW_WORKFLOW/MASTER_parallel/PROGRAM/Version_0.1/test/clusters/'
#nproc = 8
#NMODELS = 10
#cluster_results = RUN_CLUSTER(levels, path, path_to_LGA, outpath, nproc, NMODELS)
#print 'GOT', cluster_results