Run of compset B1850C5 at resolution ne120_t12 failed after 11 months of integration

Hello there, I am trying to run compset B1850C5 at resolution ne120_t12 for one year. Unfortunately, it failed after 11 months of integration, as can be seen from the cpl.log.xxx-yyy file below. The error message is also shown below. I use the Intel 17 compiler and MPT on an SGI ICE-XA machine, running on 72 nodes. The PE layout is: set COMPSET = B1850C5 set RES = ne120_t12  set NTHRDS = 1 set PES_PER_NODE = 36 @ MAX_TASKS_PER_NODE = $PES_PER_NODE / $NTHRDS  set NTASKS_LND=648 set NTASKS_ICE=1296 set NTASKS_OCN=648  @ NTASKS_ATM = $NTASKS_LND + $NTASKS_ICE @ NTASKS_ROF = $NTASKS_LND @ NTASKS_WAV = $NTASKS_ICE @ NTASKS_CPL = $NTASKS_ATM  @ TOTALPES = $NTASKS_ATM + $NTASKS_OCN  echo "NTASKS_ATM = $NTASKS_ATM" echo "TOTALPES = $TOTALPES"  setenv CASE            ${MPILIB}.${RES}.${COMPSET}.${TOTALPES}c.lnd${NTASKS_LND}_ice${NTASKS_ICE}_ocn${NTASKS_OCN}.omp${NTHRDS} ${CESMROOT}/scripts/create_newcase         -case ${CASEROOT}         -compset ${COMPSET}         -res ${RES}         -compiler intel         -mach cy002mpt  cd $CASEROOT  set ROOTPE_LND = 0 set ROOTPE_ROF = 0 set ROOTPE_ATM = 0 set ROOTPE_ICE = $NTASKS_LND set ROOTPE_WAV = $NTASKS_ROF set ROOTPE_OCN = $NTASKS_ATM  setenv MAX_TASKS_PER_NODE  xmlchange MAX_TASKS_PER_NODE=$MAX_TASKS_PER_NODE xmlchange PES_PER_NODE=$PES_PER_NODE xmlchange NTASKS_ATM=$NTASKS_ATM xmlchange NTASKS_OCN=$NTASKS_OCN xmlchange NTASKS_CPL=$NTASKS_ATM xmlchange NTASKS_GLC=$TOTALPES xmlchange NTASKS_LND=$NTASKS_LND xmlchange NTASKS_ICE=$NTASKS_ICE xmlchange NTASKS_ROF=$NTASKS_ROF xmlchange NTASKS_WAV=$NTASKS_WAV  xmlchange ROOTPE_LND=$ROOTPE_LND xmlchange ROOTPE_ICE=$ROOTPE_ICE xmlchange ROOTPE_ATM=$ROOTPE_ATM xmlchange ROOTPE_ROF=$ROOTPE_ROF xmlchange ROOTPE_WAV=$ROOTPE_WAV xmlchange ROOTPE_OCN=$ROOTPE_OCN  xmlchange NTHRDS_ATM=$NTHRDS xmlchange NTHRDS_LND=$NTHRDS xmlchange NTHRDS_ICE=$NTHRDS xmlchange NTHRDS_OCN=$NTHRDS xmlchange NTHRDS_CPL=$NTHRDS xmlchange NTHRDS_GLC=$NTHRDS xmlchange NTHRDS_ROF=$NTHRDS xmlchange NTHRDS_WAV=$NTHRDS  xmlchange TOTALPES=$TOTALPES  xmlchange 
DOUT_S="FALSE"  xmlchange STOP_OPTION="nyears" xmlchange STOP_N=1  xmlchange PIO_TYPENAME='pnetcdf' xmlchange PIO_STRIDE="36" Thanks for your help! Wei -----------------------------------------------------------------------tail -20 run/cpl.log.171013-185442     wrunoff     0.00000000    -1.85812889    -0.00060345     1.72222144     0.00000000     0.00000000    -0.13651090     wfrzrof     0.00000000    -0.18854875    -0.00637638     0.17526315     0.00000000     0.00000000    -0.01966198       *SUM*    -0.30192647     0.76028752    -0.00697983    -1.68492274     1.38497375    -0.30789814    -0.15646591  tStamp_write: model date =    11101       0 wall clock = 2017-10-15 08:26:24 avg dt =   443.90 dt =   447.61 memory_write: model date =    11101       0 memory =     670.78 MB (highwater)         -0.00 MB (usage)  (pe=    0 comps= cpl ATM LND GLC ROF) tStamp_write: model date =    11102       0 wall clock = 2017-10-15 08:33:47 avg dt =   443.90 dt =   443.21 memory_write: model date =    11102       0 memory =     670.78 MB (highwater)         -0.00 MB (usage)  (pe=    0 comps= cpl ATM LND GLC ROF) tStamp_write: model date =    11103       0 wall clock = 2017-10-15 08:41:18 avg dt =   443.92 dt =   450.87 memory_write: model date =    11103       0 memory =     670.78 MB (highwater)         -0.00 MB (usage)  (pe=    0 comps= cpl ATM LND GLC ROF) tStamp_write: model date =    11104       0 wall clock = 2017-10-15 08:48:48 avg dt =   443.94 dt =   450.60 memory_write: model date =    11104       0 memory =     670.78 MB (highwater)         -0.00 MB (usage)  (pe=    0 comps= cpl ATM LND GLC ROF) tStamp_write: model date =    11105       0 wall clock = 2017-10-15 08:56:20 avg dt =   443.97 dt =   451.15 memory_write: model date =    11105       0 memory =     670.78 MB (highwater)         -0.00 MB (usage)  (pe=    0 comps= cpl ATM LND GLC ROF) tStamp_write: model date =    11106       0 wall clock = 2017-10-15 09:03:47 avg dt =   443.98 dt =   447.64 memory_write: model 
date =    11106       0 memory =     670.78 MB (highwater)         -0.00 MB (usage)  (pe=    0 comps= cpl ATM LND GLC ROF) tStamp_write: model date =    11107       0 wall clock = 2017-10-15 09:11:13 avg dt =   443.98 dt =   445.88 memory_write: model date =    11107       0 memory =     670.78 MB (highwater)         -0.00 MB (usage)  (pe=    0 comps= cpl ATM LND GLC ROF) tStamp_write: model date =    11108       0 wall clock = 2017-10-15 09:18:39 avg dt =   443.99 dt =   445.98 memory_write: model date =    11108       0 memory =     670.78 MB (highwater)         -0.00 MB (usage)  (pe=    0 comps= cpl ATM LND GLC ROF) ----------------------------------------------------------------------------- Repeat ridging, niter =           1 Repeat ridging, niter =           1 Repeat ridging, niter =           2 Repeat ridging, niter =           2 Repeat ridging, niter =           3 Repeat ridging, niter =           3 Repeat ridging, niter =           4 Repeat ridging, niter =           4 Repeat ridging, niter =           5 Repeat ridging, niter =           5 Repeat ridging, niter =           6 Repeat ridging, niter =           7 Repeat ridging, niter =           6 Repeat ridging, niter =           7  Warning: Departure points out of bounds in remap my_task, i, j =         179           2           2 dpx, dpy =   3228.55094392047       -2414.66812790976 HTN(i,j), HTN(i+1,j) =   3179.87294777240        3179.87294777402 HTE(i,j), HTE(i,j+1) =   4698.56305327633        4698.56305327491 istep1, my_task, iblk =       29940         179           1 Global block:         596 Global i and j:        3451         121(shr_sys_abort) ERROR: remap transport: bad departure points(shr_sys_abort) WARNING: calling shr_mpi_abort() and stoppingMPT ERROR: Rank 827(g:827) is aborting with error code 1001.        
Process ID: 357750, Host: r2i0n22, Program: /gbc-lustre/whuang/wd4cesm1/mpt.ne120_t12.B1850C5.2592c.lnd648_ice1296_ocn648.omp1/bld/cesm.exe        MPT Version: HPE MPT 2.16  06/02/17 00:58:10 MPT: --------stack traceback-------MPT: Attaching to program: /proc/357750/exe, process 357750MPT: (no debugging symbols found)...done.MPT: [Thread debugging using libthread_db enabled]MPT: Using host libthread_db library "/lib64/libthread_db.so.1".MPT: (no debugging symbols found)...done.MPT: (no debugging symbols found)...done.MPT: (no debugging symbols found)...done.MPT: (no debugging symbols found)...done.MPT: (no debugging symbols found)...done.MPT: (no debugging symbols found)...done.MPT: (no debugging symbols found)...done.MPT: (no debugging symbols found)...done.MPT: (no debugging symbols found)...done.MPT: (no debugging symbols found)...done.MPT: 0x00002aaaac052ecc in waitpid () from /lib64/libpthread.so.0MPT: Missing separate debuginfos, use: debuginfo-install glibc-2.17-157.el7.x86_64 libbitmask-2.0-sgi716r63.rhel73.x86_64 libcpuset-1.0-sgi716r94.rhel73.x86_64 libcxgb3-1.3.1-8.el7.x86_64 libgcc-4.8.5-11.el7.x86_64 libibverbs-1.2.1mlnx1-OFED.3.4.2.1.4.34218.x86_64 libmlx4-1.2.1mlnx1-OFED.3.4.0.0.4.34218.x86_64 libmlx5-1.2.1mlnx1-OFED.3.4.2.1.4.34218.x86_64 libmthca-1.0.6-13.el7.x86_64 libnl-1.1.4-3.el7.x86_64 libnuma-3.0sgi-sgi716r61.rhel73.x86_64 numatools-2.0-sgi716r146.rhel73.x86_64 xpmem-1.6-sgi716r125.rhel73.x86_64MPT: (gdb) #0  0x00002aaaac052ecc in waitpid () from /lib64/libpthread.so.0MPT: #1  0x00002aaaaba8484c in mpi_sgi_system (command=,MPT:     __statbuf=, __fd=) at sig.c:98MPT: #2  MPI_SGI_stacktraceback (header=) at sig.c:339MPT: #3  0x00002aaaab9d789e in print_traceback (ecode=1001) at abort.c:238MPT: #4  0x00002aaaab9d7a21 in PMPI_Abort (comm=, errorcode=1001)MPT:     at abort.c:67MPT: #5  0x00002aaaab9d7a7a in pmpi_abort__ ()MPT:    from /sw/sdev/mpt-x86_64/2.16-p11435/lib/libmpi.soMPT: #6  0x0000000002003958 in shr_mpi_mod_mp_shr_mpi_abort_ ()MPT: 
#7  0x000000000206478a in shr_sys_mod_mp_shr_sys_abort_ ()MPT: #8  0x00000000015f58bf in ice_transport_remap_mp_horizontal_remap_ ()MPT: #9  0x00000000015e33d9 in ice_transport_driver_mp_transport_remap_ ()MPT: #10 0x00000000015c312d in ice_step_mod_mp_step_dynamics_ ()MPT: #11 0x00000000014fea84 in ice_comp_mct_mp_ice_run_mct_ ()MPT: #12 0x000000000041de32 in ccsm_comp_mod_mp_ccsm_run_ ()MPT: #13 0x000000000044b118 in MAIN__ ()MPT: #14 0x000000000041945e in main ()MPT: (gdb) A debugging session is active.MPT:MPT:    Inferior 1 [process 357750] will be detached.MPT:MPT: Quit anyway? (y or n) [answered Y; input not from terminal]MPT: Detaching from program: /proc/357750/exe, process 357750 MPT: -----stack traceback ends-----MPT ERROR: MPI_COMM_WORLD rank 827 has terminated without calling MPI_Finalize()        aborting job 
 
Back
Top