sms733@psu_edu
New Member
I recently submitted a 12-month job using the CAM5 component of CESM (component set F_2000_CAM5) with a configured wall-clock limit of 6:00 (6 hours). The prescribed SST forcing is the model default (I want to run a control simulation before running simulations with my own SST forcing). The run hits the wall-clock limit before the job completes, and I get the following error message:
LSBATCH: User input
#! /bin/tcsh -f
#==============================================================================
# This is a CCSM coupled model Load Leveler batch job script for bluefire
#==============================================================================
#BSUB -n 128
#BSUB -R "span[ptile=64]"
#BSUB -q regular
#BSUB -N
#BSUB -x
#BSUB -a poe
#BSUB -o poe.stdout.%J
#BSUB -e poe.stderr.%J
#BSUB -J mytest.04
#BSUB -W 6:00
#BSUB -P 36961017
setenv LSB_PJL_TASK_GEOMETRY
"{(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63)(64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127)}"
setenv BIND_THRD_GEOMETRY "1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1"
setenv OMP_NUM_THREADS 1
# ----------------------------------------
# PE LAYOUT:
# total number of tasks = 128
# maximum threads per task = 1
# cpl ntasks=128 nthreads=1 rootpe=0
# cam ntasks=128 nthreads=1 rootpe=0
# clm ntasks=128 nthreads=1 rootpe=0
# cice ntasks=128 nthreads=1 rootpe=0
# docn ntasks=128 nthreads=1 rootpe=0
# sglc ntasks=128 nthreads=1 rootpe=0
#
# total number of hw pes = 128
# cpl hw pe range ~ from 0 to 127
# cam hw pe range ~ from 0 to 127
# clm hw pe range ~ from 0 to 127
# cice hw pe range ~ from 0 to 127
# docn hw pe range ~ from 0 to 127
# sglc hw pe range ~ from 0 to 127
# ----------------------------------------
#-----------------------------------------------------------------------
# Determine necessary environment variables
#-----------------------------------------------------------------------
cd /glade/home/ssimon/cases/mytest.04
./Tools/ccsm_check_lockedfiles || exit -1
source ./Tools/ccsm_getenv || exit -2
if ($BUILD_COMPLETE != "TRUE") then
echo "BUILD_COMPLETE is not TRUE"
echo "Please rebuild the model interactively via"
(... more ...)
------------------------------------------------------------
TERM_RUNLIMIT: job killed after reaching LSF run time limit.
Exited with exit code 255.
Resource usage summary:
CPU time :1377384.88 sec.
Max Memory : 60924 MB
Max Swap : 59599 MB
Max Processes : 262
Max Threads : 6281
What modifications, if any, should I make to the files in my run directory and/or my shell scripts to resolve this problem?
LSBATCH: User input
#! /bin/tcsh -f
#==============================================================================
# This is a CCSM coupled model Load Leveler batch job script for bluefire
#==============================================================================
#BSUB -n 128
#BSUB -R "span[ptile=64]"
#BSUB -q regular
#BSUB -N
#BSUB -x
#BSUB -a poe
#BSUB -o poe.stdout.%J
#BSUB -e poe.stderr.%J
#BSUB -J mytest.04
#BSUB -W 6:00
#BSUB -P 36961017
setenv LSB_PJL_TASK_GEOMETRY
"{(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63)(64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127)}"
setenv BIND_THRD_GEOMETRY "1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:1"
setenv OMP_NUM_THREADS 1
# ----------------------------------------
# PE LAYOUT:
# total number of tasks = 128
# maximum threads per task = 1
# cpl ntasks=128 nthreads=1 rootpe=0
# cam ntasks=128 nthreads=1 rootpe=0
# clm ntasks=128 nthreads=1 rootpe=0
# cice ntasks=128 nthreads=1 rootpe=0
# docn ntasks=128 nthreads=1 rootpe=0
# sglc ntasks=128 nthreads=1 rootpe=0
#
# total number of hw pes = 128
# cpl hw pe range ~ from 0 to 127
# cam hw pe range ~ from 0 to 127
# clm hw pe range ~ from 0 to 127
# cice hw pe range ~ from 0 to 127
# docn hw pe range ~ from 0 to 127
# sglc hw pe range ~ from 0 to 127
# ----------------------------------------
#-----------------------------------------------------------------------
# Determine necessary environment variables
#-----------------------------------------------------------------------
cd /glade/home/ssimon/cases/mytest.04
./Tools/ccsm_check_lockedfiles || exit -1
source ./Tools/ccsm_getenv || exit -2
if ($BUILD_COMPLETE != "TRUE") then
echo "BUILD_COMPLETE is not TRUE"
echo "Please rebuild the model interactively via"
(... more ...)
------------------------------------------------------------
TERM_RUNLIMIT: job killed after reaching LSF run time limit.
Exited with exit code 255.
Resource usage summary:
CPU time :1377384.88 sec.
Max Memory : 60924 MB
Max Swap : 59599 MB
Max Processes : 262
Max Threads : 6281
What modifications, if any, should I make to the files in my run directory and/or my shell scripts to resolve this problem?