I'm working with a the SAM cloud resolving model by Marat Khairoutdinov on Yellowstone and I'd like my run script to get the exit code from the current job and decide whether to resubmit or not. Here's what I have so far:#!/bin/tcsh
#
# LSF batch script to run an MPI application
#
#BSUB -P P35081334
#BSUB -W 02:00 # wall-clock time (hrs:mins)
#BSUB -n 16 # number of tasks in job
#BSUB -R "span[ptile=16]" # run 16 MPI tasks per node
#BSUB -J BUBBLE_500_64x64_08km_1.0k_3g # job name
#BSUB -o BUBBLE_500_64x64_08km_1.0k_3g.out.%J # output file name in which %J is replaced by the job ID
#BSUB -e BUBBLE_500_64x64_08km_1.0k_3g.err.%J # error file name in which %J is replaced by the job ID
#BSUB -q regular # queueset case = BUBBLE
set subcase = advsel
set jobfile = $case/resub.$subcase
set prmfile = $case/prm.$subcase
set prmloc = $case/prmsetenv LID "`date +%y%m%d-%H%M%S`"#--------------------------------------------------------------
#run the executable
#--------------------------------------------------------------
mpirun.lsf ./SAM_ADV_MPDATA_RAD_CAM_MICRO_SAM1MOM_64x64_B08km >&! sam.log.$LID
echo
echo sam.log.$LID
echo
#--------------------------------------------------------------
# Resubmit the job if not finished
#--------------------------------------------------------------
set exitstatus = $?
echo SAM stopped with exit status $exitstatusif [ $exitstatus -eq 0 ]
then
echo It appears the previous run ended properly and job not yet finished.
echo Resubmitting $jobfile
cat $prmfile | sed s/nrestart.*=.*0/nrestart = 1/ > temp.namelist
mv temp.namelist $prmfile
cp $prmfile $prmloc
bsub < sam_run
fi
#--------------------------------------------------------------
#--------------------------------------------------------------The variable $exitcode does not have the right value in the test runs that I've done so far. $exitcode has the value 0 when I know that the model indeed exited with exit code 9.So my question is, is there a different syntax for obtaining the exit code rather than $?, which I think was meant for a different system. I don't know wher eot look any of this up for the LSF. Thanks,Walter
#
# LSF batch script to run an MPI application
#
#BSUB -P P35081334
#BSUB -W 02:00 # wall-clock time (hrs:mins)
#BSUB -n 16 # number of tasks in job
#BSUB -R "span[ptile=16]" # run 16 MPI tasks per node
#BSUB -J BUBBLE_500_64x64_08km_1.0k_3g # job name
#BSUB -o BUBBLE_500_64x64_08km_1.0k_3g.out.%J # output file name in which %J is replaced by the job ID
#BSUB -e BUBBLE_500_64x64_08km_1.0k_3g.err.%J # error file name in which %J is replaced by the job ID
#BSUB -q regular # queueset case = BUBBLE
set subcase = advsel
set jobfile = $case/resub.$subcase
set prmfile = $case/prm.$subcase
set prmloc = $case/prmsetenv LID "`date +%y%m%d-%H%M%S`"#--------------------------------------------------------------
#run the executable
#--------------------------------------------------------------
mpirun.lsf ./SAM_ADV_MPDATA_RAD_CAM_MICRO_SAM1MOM_64x64_B08km >&! sam.log.$LID
echo
echo sam.log.$LID
echo
#--------------------------------------------------------------
# Resubmit the job if not finished
#--------------------------------------------------------------
set exitstatus = $?
echo SAM stopped with exit status $exitstatusif [ $exitstatus -eq 0 ]
then
echo It appears the previous run ended properly and job not yet finished.
echo Resubmitting $jobfile
cat $prmfile | sed s/nrestart.*=.*0/nrestart = 1/ > temp.namelist
mv temp.namelist $prmfile
cp $prmfile $prmloc
bsub < sam_run
fi
#--------------------------------------------------------------
#--------------------------------------------------------------The variable $exitcode does not have the right value in the test runs that I've done so far. $exitcode has the value 0 when I know that the model indeed exited with exit code 9.So my question is, is there a different syntax for obtaining the exit code rather than $?, which I think was meant for a different system. I don't know wher eot look any of this up for the LSF. Thanks,Walter