"""Source code for ample.util.workers_util

Created on Feb 28, 2013

@author: jmht
"""

import logging
import multiprocessing
import os
import time

from ample.util import ample_util
from ample.util import clusterize
from ample.util import worker

# logger = logging.getLogger(__name__)
# NOTE(review): the module logs through the root logger rather than a
# module-named one (the disabled line above); the reason is not visible in
# this file - confirm before switching back.
logger = logging.getLogger()

class JobServer(object):
    """Run a list of job scripts on the local machine with worker processes.

    Usage is two-step: call ``setJobs`` once with the script paths, then
    ``start`` with the number of worker processes to use.
    """

    def __init__(self):
        # Queue of job script paths handed to the workers; built by setJobs().
        self.inqueue = None
        # Initialised but not used by any method of this class.
        self.outqueue = None
        logger.info("Running jobs on a local machine")

    def setJobs(self, jobs):
        """Add the list of jobs we are to run.

        Args:
            jobs: iterable of paths to job scripts; each must be an existing file.

        Raises:
            RuntimeError: if jobs have already been set on this server, or if
                any job path is not an existing file.
        """
        if self.inqueue:
            raise RuntimeError("NOT THOUGHT ABOUT MULTIPLE INVOCATIONS!")

        # Queue to hold the jobs we want to run
        queue = multiprocessing.Queue()

        # Add jobs to the inqueue
        logger.info("Generating MRBUMP runscripts in: {0}".format(os.getcwd()))
        for job in jobs:
            if not os.path.isfile(job):
                raise RuntimeError("JobServer cannot find job: {0}".format(job))
            queue.put(job)

        self.inqueue = queue
        # We can't call inqueue.close() even though we've finished as it all
        # goes horribly wrong.
        # We sleep to allow enough time for the objects to be pickled and put
        # on the queue.
        time.sleep(2)
        return

    def start(self, nproc=None, early_terminate=False, check_success=None, monitor=None):
        """Start the worker processes and wait for all of them to finish.

        Args:
            nproc: number of worker processes to start (required).
            early_terminate: passed to each worker; when true, the first
                worker process to exit with code 0 causes all jobs still
                waiting in the queue to be discarded.
            check_success: passed through unchanged to each worker.
            monitor: optional zero-argument callable, invoked once before
                waiting and again after every per-process wait.

        Returns:
            True if every worker process exited with code 0, otherwise False.
        """
        # Kept as an assertion so the exception type seen by callers is unchanged.
        assert nproc is not None, "nproc must be specified"

        # Now start the jobs
        processes = []
        for _ in range(nproc):
            process = multiprocessing.Process(
                target=worker.worker,
                args=(self.inqueue, early_terminate, check_success)
            )
            process.start()
            processes.append(process)

        # Loop through the processes checking if any are done
        timeout = 1 * 60

        # Broken on OSX
        # qsize = self.outqueue.qsize()
        if monitor:
            monitor()

        success = True
        while processes:
            # Iterate over a snapshot: finished processes are removed from the
            # live list below, and mutating a list while iterating over it
            # would skip the process that follows each removed one.
            for process in list(processes):
                # Join process for timeout seconds and if we haven't finished
                # by then move onto the next process
                process.join(timeout)

                if not process.is_alive():
                    logger.debug("Checking completed process {0} with exitcode {1}".format(process, process.exitcode))

                    # Set failed if any job failed
                    if process.exitcode != 0:
                        logger.critical("Process {0} failed with exitcode {1}".format(process, process.exitcode))
                        success = False

                    # Finished so see what happened
                    if process.exitcode == 0 and early_terminate:
                        if not self.inqueue.empty():
                            msg = "Process {0} was successful so removing remaining jobs from inqueue".format(process.name)
                            print(msg)
                            logger.info(msg)
                            # Remove all remaining jobs from the inqueue. We do
                            # this rather than terminate the processes as
                            # terminating leaves the MRBUMP processes running.
                            # This way we hang around until all our running
                            # processes have finished.
                            while not self.inqueue.empty():
                                job = self.inqueue.get()
                                logger.debug("Removed job [{0}] from inqueue".format(job))
                        else:
                            print("Got empty queue - all jobs done")

                    # Remove from processes to check
                    processes.remove(process)

                # Run the monitor function
                if monitor:
                    monitor()

        # need to wait here as sometimes it takes a while for the results
        # files to get written
        time.sleep(3)
        return success
def run_scripts(job_scripts,
                monitor=None,
                check_success=None,
                early_terminate=None,
                nproc=None,
                job_time=None,
                job_name=None,
                submit_cluster=None,
                submit_qtype=None,
                submit_queue=None,
                submit_pe_lsf=None,
                submit_pe_sge=None,
                submit_array=None,
                submit_max_array=None):
    """Run the given job scripts, either locally or on a cluster.

    When ``submit_cluster`` is true the scripts go to ``run_scripts_cluster``
    together with the job/queue options; otherwise they go to
    ``run_scripts_serial`` together with ``early_terminate`` and
    ``check_success``. ``nproc`` and ``monitor`` are forwarded on both paths.

    Returns:
        Whatever the selected runner returns.
    """
    # Local execution: the cluster-only options are not forwarded.
    if not submit_cluster:
        return run_scripts_serial(job_scripts,
                                  nproc=nproc,
                                  monitor=monitor,
                                  early_terminate=early_terminate,
                                  check_success=check_success)

    # Cluster execution: check_success / early_terminate are not forwarded.
    return run_scripts_cluster(job_scripts,
                               nproc=nproc,
                               monitor=monitor,
                               job_time=job_time,
                               job_name=job_name,
                               submit_cluster=submit_cluster,
                               submit_qtype=submit_qtype,
                               submit_queue=submit_queue,
                               submit_pe_lsf=submit_pe_lsf,
                               submit_pe_sge=submit_pe_sge,
                               submit_array=submit_array,
                               submit_max_array=submit_max_array)
def run_scripts_cluster(job_scripts,
                        monitor=None,
                        job_time=None,
                        job_name=None,
                        submit_cluster=None,
                        submit_qtype=None,
                        submit_queue=None,
                        submit_pe_lsf=None,
                        submit_pe_sge=None,
                        submit_array=None,
                        submit_max_array=None,
                        nproc=None):
    """Submit the job scripts to a cluster queue and wait for them to finish.

    With ``submit_array`` set and more than one script, the scripts are
    submitted as a single array job. Otherwise each script is rewritten in
    place with queue directives inserted after its first line, made
    executable and submitted individually.

    Args:
        job_scripts: list of paths to the scripts to submit.
        monitor: passed to ``ClusterRun.monitorQueue``.
        job_time, submit_queue, submit_qtype, submit_pe_lsf, submit_pe_sge:
            forwarded to the queue-directive / array-job calls.
        job_name: name for the jobs; when not given, each individually
            submitted script is named after its own file stem.
        submit_cluster: accepted for interface compatibility; not used here.
        submit_array: submit as an array job when there are several scripts.
        submit_max_array: forwarded to ``submitArrayJob``.
        nproc: processor count for the queue directives; defaults to 1 for
            individually submitted scripts.

    Returns:
        Always True.
    """
    logger = logging.getLogger()
    logger.info("Running jobs on a cluster")

    cluster_run = clusterize.ClusterRun()
    cluster_run.QTYPE = submit_qtype

    use_array = bool(submit_array) and len(job_scripts) > 1
    if use_array:
        cluster_run.submitArrayJob(job_scripts,
                                   job_time=job_time,
                                   job_name=job_name,
                                   submit_max_array=submit_max_array,
                                   submit_qtype=submit_qtype,
                                   submit_queue=submit_queue)
    else:
        if nproc is None:
            nproc = 1
        for script in job_scripts:
            dirname, sname = os.path.split(script)
            name = os.path.splitext(sname)[0]
            logfile = os.path.join(dirname, "{0}.log".format(name))

            # Resolve the name per script: assigning back to job_name would
            # label every later script with the first script's name.
            script_job_name = job_name if job_name else name

            with open(script) as f:
                lines = f.readlines()

            slines = clusterize.ClusterRun().queueDirectives(nproc=nproc,
                                                             job_name=script_job_name,
                                                             job_time=job_time,
                                                             log_file=logfile,
                                                             submit_queue=submit_queue,
                                                             submit_qtype=submit_qtype,
                                                             submit_pe_lsf=submit_pe_lsf,
                                                             submit_pe_sge=submit_pe_sge)

            # We add the queue directives after the first line of the script.
            # lines[:1] (rather than lines[0]) copes with an empty script.
            with open(script, 'w') as f:
                f.write("".join(lines[:1] + slines + lines[1:]))

            # NOTE(review): 0o777 makes the script world-writable - confirm
            # this is required by the batch system.
            os.chmod(script, 0o777)
            cluster_run.submitJob(script)

    # Monitor the cluster queue to see when all jobs have finished
    cluster_run.monitorQueue(monitor=monitor)

    # Rename scripts for array jobs
    if use_array:
        cluster_run.cleanUpArrayJob()
    return True
def run_scripts_serial(job_scripts,
                       nproc=None,
                       monitor=None,
                       early_terminate=None,
                       check_success=None):
    """Run the job scripts on the local machine.

    Several scripts are run through a ``JobServer`` using ``nproc`` worker
    processes. A single script is run directly with
    ``ample_util.run_command`` after changing into the script's directory;
    its log file is named ``<script stem>.log``. In the single-script case
    ``nproc``, ``monitor``, ``early_terminate`` and ``check_success`` are not
    used.

    Args:
        job_scripts: non-empty list of paths to the scripts to run.
        nproc: number of worker processes (multi-script case).
        monitor: optional callable passed to ``JobServer.start``.
        early_terminate: coerced to bool and passed to ``JobServer.start``.
        check_success: passed to ``JobServer.start``.

    Returns:
        The result of ``JobServer.start`` for several scripts; for a single
        script, True if the command returned 0, otherwise False.
    """
    if len(job_scripts) > 1:
        # Don't need early terminate - check_success if it exists states
        # what's happening
        js = JobServer()
        js.setJobs(job_scripts)
        return js.start(nproc=nproc,
                        early_terminate=bool(early_terminate),
                        check_success=check_success,
                        monitor=monitor)

    # Make the path absolute BEFORE changing directory: a bare filename has an
    # empty dirname (os.chdir('') raises), and a relative path would no longer
    # resolve once the working directory has moved.
    script = os.path.abspath(job_scripts[0])
    name = os.path.splitext(os.path.basename(script))[0]
    logfile = "{0}.log".format(name)
    os.chdir(os.path.dirname(script))
    rtn = ample_util.run_command([script], logfile=logfile)
    return rtn == 0
# Need this defined outside of the test or it can't be pickled on Windoze def _check_success_test( job ): jobname = os.path.splitext(os.path.basename(job))[0] if jobname == "job_2": return True return False