# -*-Python-*-
# Created by bgriers at 04 Jul 2018 10:27
"""
This script runs a batch SLURM submission that is capable of either
waiting for the run to finish, or storing the job info for later
retrieval of the output.
This is a skeleton to be used when running an expensive executable that
will be queued on a large computational cluster.
"""
defaultVars(runid=None, cont=cont, parallel=False, wait=True)
# NOTE(review): `cont=cont` uses a pre-existing `cont` variable as the
# default, which raises NameError unless the caller (e.g. a prun) always
# supplies it -- confirm this is intended.

# Simulate an unreliable cluster: ~20% of submissions abort here so that
# downstream failure handling can be exercised.
# BUGFIX: previously this branch only printed and the submission proceeded
# anyway; end the script so the run actually fails to submit, matching the
# abort pattern used for the unknown-server case below.
if np.random.rand() >= 0.8:
    printe('**** Intentionally crashing this submission to simulate a failed run')
    OMFITx.End()

# The runid that labels this SLURM job name: take it from the tree settings
# when not passed in, otherwise record the passed value back into the tree.
if runid is None:
    runid = root['SETTINGS']['EXPERIMENT']['runid']
else:
    root['SETTINGS']['EXPERIMENT']['runid'] = runid

# Pick the SLURM partition appropriate for the selected server; abort when
# no partition is known for it. (SERVER lookup hoisted so it runs once.)
server = SERVER(root['SETTINGS']['REMOTE_SETUP']['serverPicker'])
if server == 'iris':
    partition = 'preemptable'
elif server == 'portal':
    partition = 'sque'
else:
    printe('SLURM partition not defined for chosen server')
    OMFITx.End()
# ---------------------------------------------------------------------------
# Build the SLURM batch submission script. The two `{}` placeholders are
# filled with the job name (runid) and the partition chosen above.
# ---------------------------------------------------------------------------
slurm_template = '''#!/bin/bash
#SBATCH -J {}
#SBATCH -o batch.out
#SBATCH -e batch.err
#SBATCH -t 0:01:00
#SBATCH -n 1
#SBATCH --mem=10M
#SBATCH -p {}
echo "The job's id is :$SLURM_JOBID"
# Lets sleep for a bit
sleep $[ ( $RANDOM %10 ) +1 ]s
chmod +x ex.sh
./ex.sh
exit 0
'''
slurm_script = slurm_template.format(runid, partition)

# The payload the batch job executes: greets on stdout/stderr, sleeps a few
# seconds, and records the remote working directory in pwd.txt.
ex = '''
#!/bin/bash
echo 'hello world std_out'
>&2 echo 'hello world std_err'
echo "We are sleeping a bit"
sleep $[ ( $RANDOM % 5 ) +1 ]s
echo "We are sleeping some more"
sleep $[ ( $RANDOM % 5 ) +1 ]s
echo "Printing directory..."
pwd > pwd.txt
echo "Done printing"
exit 0
'''

# Wrap the payload in a file object so it can be deployed with the job
ex_sh = OMFITascii('ex.sh', fromString=ex)

# Remote-execution bookkeeping: files to ship out, files to fetch back,
# the (contents, filename) submission script, and the submit command.
inputs = [ex_sh]
outputs = ['pwd.txt']
script = (slurm_script, 'slurm_script.sh')
executable = 'sbatch %s'
# Where the job's stdout/stderr come from: when waiting for completion the
# SLURM -o/-e files are retrieved; when detaching there is nothing to stream.
if wait:
    queued = True
    std_out = 'batch.out'
    std_err = 'batch.err'
else:
    queued = False
    std_out = []
    std_err = []

# Deploy the inputs, submit the batch script via `sbatch`, and optionally
# wait for completion.
# BUGFIX: pass the (contents, 'slurm_script.sh') tuple built above as
# `script=` -- previously the bare `slurm_script` string was passed, leaving
# the `script` variable (and its filename) unused.
OMFITx.executable(
    root,
    inputs=inputs,
    outputs=outputs,
    std_out=std_out,
    std_err=std_err,
    script=script,
    executable=executable,
    queued=queued,
    clean=True,
)
# Set up the output structure
root['OUTPUTS'].setdefault('SLURM', OMFITtree())
root['OUTPUTS']['SLURM'].setdefault('OUTPUTS', OMFITtree())
root['OUTPUTS']['SLURM']['OUTPUTS'].setdefault(runid, OMFITtree())
if wait:
# Load the output of the job into OMFIT
root['OUTPUTS']['SLURM']['OUTPUTS'][runid]['pwd'] = OMFITascii('./pwd.txt')
else:
# Store the job manager object
root['OUTPUTS']['SLURM']['OUTPUTS'][runid]['job'] = OMFITx.manage_job(
root, std_out, remotedir=evalExpr(root['SETTINGS']['REMOTE_SETUP']['workDir'])
)
# If this is a prun then the result is the OUTPUTS[runid] OMFITtree that contians the 'job'
if parallel:
result = root['OUTPUTS']['SLURM']['OUTPUTS'][runid]