# -*-Python-*-
# Created by bgriers at 04 Jul 2018 10:27
"""
This script runs a batch SLURM submission that is capable of either
waiting for the run to finish, or storing the job info for later
retrieval of the output.
This is a skeleton to be used when running an expensive executable that
will be queued on a large computational cluster.
"""
defaultVars(runid=None, cont=cont, parallel=False, wait=True)
# NOTE(review): `cont=cont` uses a pre-existing `cont` variable as the
# default, which raises NameError unless the caller (e.g. a prun) always
# supplies it -- confirm this is intended.

# Simulate an unreliable cluster: ~20% of submissions abort here so that
# downstream failure handling can be exercised.
# BUGFIX: previously this branch only printed and the submission proceeded
# anyway; end the script so the run actually fails to submit, matching the
# abort pattern used for the unknown-server case below.
if np.random.rand() >= 0.8:
    printe('**** Intentionally crashing this submission to simulate a failed run')
    OMFITx.End()

# The runid that labels this SLURM job name: take it from the tree settings
# when not passed in, otherwise record the passed value back into the tree.
if runid is None:
    runid = root['SETTINGS']['EXPERIMENT']['runid']
else:
    root['SETTINGS']['EXPERIMENT']['runid'] = runid

# Pick the SLURM partition appropriate for the selected server; abort when
# no partition is known for it. (SERVER lookup hoisted so it runs once.)
server = SERVER(root['SETTINGS']['REMOTE_SETUP']['serverPicker'])
if server == 'iris':
    partition = 'preemptable'
elif server == 'portal':
    partition = 'sque'
else:
    printe('SLURM partition not defined for chosen server')
    OMFITx.End()
# ---------------------------------------------------------------------------
# Build the SLURM batch submission script. The two `{}` placeholders are
# filled with the job name (runid) and the partition chosen above.
# ---------------------------------------------------------------------------
slurm_template = '''#!/bin/bash
#SBATCH -J {}
#SBATCH -o batch.out
#SBATCH -e batch.err
#SBATCH -t 0:01:00
#SBATCH -n 1
#SBATCH --mem=10M
#SBATCH -p {}
echo "The job's id is :$SLURM_JOBID"
# Lets sleep for a bit
sleep $[ ( $RANDOM %10 ) +1 ]s
chmod +x ex.sh
./ex.sh
exit 0
'''
slurm_script = slurm_template.format(runid, partition)

# The payload the batch job executes: greets on stdout/stderr, sleeps a few
# seconds, and records the remote working directory in pwd.txt.
ex = '''
#!/bin/bash
echo 'hello world std_out'
>&2 echo 'hello world std_err'
echo "We are sleeping a bit"
sleep $[ ( $RANDOM % 5 ) +1 ]s
echo "We are sleeping some more"
sleep $[ ( $RANDOM % 5 ) +1 ]s
echo "Printing directory..."
pwd > pwd.txt
echo "Done printing"
exit 0
'''

# Wrap the payload in a file object so it can be deployed with the job
ex_sh = OMFITascii('ex.sh', fromString=ex)

# Remote-execution bookkeeping: files to ship out, files to fetch back,
# the (contents, filename) submission script, and the submit command.
inputs = [ex_sh]
outputs = ['pwd.txt']
script = (slurm_script, 'slurm_script.sh')
executable = 'sbatch %s'
# Where the job's stdout/stderr come from: when waiting for completion the
# SLURM -o/-e files are retrieved; when detaching there is nothing to stream.
if wait:
    queued = True
    std_out = 'batch.out'
    std_err = 'batch.err'
else:
    queued = False
    std_out = []
    std_err = []

# Deploy the inputs, submit the batch script via `sbatch`, and optionally
# wait for completion.
# BUGFIX: pass the (contents, 'slurm_script.sh') tuple built above as
# `script=` -- previously the bare `slurm_script` string was passed, leaving
# the `script` variable (and its filename) unused.
OMFITx.executable(
    root,
    inputs=inputs,
    outputs=outputs,
    std_out=std_out,
    std_err=std_err,
    script=script,
    executable=executable,
    queued=queued,
    clean=True,
)
# Set up the output structure
root['OUTPUTS'].setdefault('SLURM', OMFITtree())
root['OUTPUTS']['SLURM'].setdefault('OUTPUTS', OMFITtree())
root['OUTPUTS']['SLURM']['OUTPUTS'].setdefault(runid, OMFITtree())
if wait:
# Load the output of the job into OMFIT
root['OUTPUTS']['SLURM']['OUTPUTS'][runid]['pwd'] = OMFITascii('./pwd.txt')
else:
# Store the job manager object
root['OUTPUTS']['SLURM']['OUTPUTS'][runid]['job'] = OMFITx.manage_job(
root, std_out, remotedir=evalExpr(root['SETTINGS']['REMOTE_SETUP']['workDir'])
)
# If this is a prun then the result is the OUTPUTS[runid] OMFITtree that contians the 'job'
if parallel:
result = root['OUTPUTS']['SLURM']['OUTPUTS'][runid]