Main menu

Navigation

run compset ne120_t12 B1850C5 failed after 11 month integration

1 post / 0 new
wei.huang2@...
run compset ne120_t12 B1850C5 failed after 11 month integration

Hello there,

I am trying to run compset ne120_t12 B1850C5 for one year. Unfortunately, it failed after 11 months of integration,

as can be seen from the cpl.log.xxx-yyy file below.

The error message is below.

 

I use the Intel 17 compiler and MPT on an SGI ICE-XA machine, running on 72 nodes. The PE layout is:

 # Component set and grid resolution; both are passed to create_newcase below.
 set COMPSET = B1850C5

 set RES = ne120_t12

 

 # Threads per task and PEs per node. MAX_TASKS_PER_NODE is their quotient
 # (csh `@` does integer arithmetic): 36 / 1 = 36.
 set NTHRDS = 1

 set PES_PER_NODE = 36

 @ MAX_TASKS_PER_NODE = $PES_PER_NODE / $NTHRDS

 

 # Task counts chosen directly for land, sea ice and ocean.
 set NTASKS_LND=648

 set NTASKS_ICE=1296

 set NTASKS_OCN=648

 

 # Derived task counts: ATM = LND + ICE (1944); ROF mirrors LND, WAV mirrors
 # ICE, and CPL mirrors ATM.
 @ NTASKS_ATM = $NTASKS_LND + $NTASKS_ICE

 @ NTASKS_ROF = $NTASKS_LND

 @ NTASKS_WAV = $NTASKS_ICE

 @ NTASKS_CPL = $NTASKS_ATM

 

 # Total PE count = ATM + OCN (1944 + 648 = 2592).
 @ TOTALPES = $NTASKS_ATM + $NTASKS_OCN

 

 # Print the derived sizes so the layout can be checked before the case is built.
 echo "NTASKS_ATM = $NTASKS_ATM"

 echo "TOTALPES = $TOTALPES"

 

 # Case name encodes MPI library, resolution, compset, total PEs, per-component
 # task counts and thread count.
 # NOTE(review): MPILIB is not set anywhere in this fragment — confirm it is
 # defined earlier in the full script.
 setenv CASE            ${MPILIB}.${RES}.${COMPSET}.${TOTALPES}c.lnd${NTASKS_LND}_ice${NTASKS_ICE}_ocn${NTASKS_OCN}.omp${NTHRDS}

 ${CESMROOT}/scripts/create_newcase \        -case ${CASEROOT} \        -compset ${COMPSET} \        -res ${RES} \        -compiler intel \        -mach cy002mpt  cd $CASEROOT  set ROOTPE_LND = 0 set ROOTPE_ROF = 0 set ROOTPE_ATM = 0 set ROOTPE_ICE = $NTASKS_LND set ROOTPE_WAV = $NTASKS_ROF set ROOTPE_OCN = $NTASKS_ATM  setenv MAX_TASKS_PER_NODE  xmlchange MAX_TASKS_PER_NODE=$MAX_TASKS_PER_NODE xmlchange PES_PER_NODE=$PES_PER_NODE xmlchange NTASKS_ATM=$NTASKS_ATM xmlchange NTASKS_OCN=$NTASKS_OCN xmlchange NTASKS_CPL=$NTASKS_ATM xmlchange NTASKS_GLC=$TOTALPES xmlchange NTASKS_LND=$NTASKS_LND xmlchange NTASKS_ICE=$NTASKS_ICE xmlchange NTASKS_ROF=$NTASKS_ROF xmlchange NTASKS_WAV=$NTASKS_WAV  xmlchange ROOTPE_LND=$ROOTPE_LND xmlchange ROOTPE_ICE=$ROOTPE_ICE xmlchange ROOTPE_ATM=$ROOTPE_ATM xmlchange ROOTPE_ROF=$ROOTPE_ROF xmlchange ROOTPE_WAV=$ROOTPE_WAV xmlchange ROOTPE_OCN=$ROOTPE_OCN  xmlchange NTHRDS_ATM=$NTHRDS xmlchange NTHRDS_LND=$NTHRDS xmlchange NTHRDS_ICE=$NTHRDS xmlchange NTHRDS_OCN=$NTHRDS xmlchange NTHRDS_CPL=$NTHRDS xmlchange NTHRDS_GLC=$NTHRDS xmlchange NTHRDS_ROF=$NTHRDS xmlchange NTHRDS_WAV=$NTHRDS  xmlchange TOTALPES=$TOTALPES  xmlchange DOUT_S="FALSE"  xmlchange STOP_OPTION="nyears" xmlchange STOP_N=1  xmlchange PIO_TYPENAME='pnetcdf' xmlchange PIO_STRIDE="36" 

Thanks for your help!

 

Wei

 

-----------------------------------------------------------------------

tail -20 run/cpl.log.171013-185442

     wrunoff     0.00000000    -1.85812889    -0.00060345     1.72222144     0.00000000     0.00000000    -0.13651090

     wfrzrof     0.00000000    -0.18854875    -0.00637638     0.17526315     0.00000000     0.00000000    -0.01966198

       *SUM*    -0.30192647     0.76028752    -0.00697983    -1.68492274     1.38497375    -0.30789814    -0.15646591

 

 tStamp_write: model date =    11101       0 wall clock = 2017-10-15 08:26:24 avg dt =   443.90 dt =   447.61

 memory_write: model date =    11101       0 memory =     670.78 MB (highwater)         -0.00 MB (usage)  (pe=    0 comps= cpl ATM LND GLC ROF)

 tStamp_write: model date =    11102       0 wall clock = 2017-10-15 08:33:47 avg dt =   443.90 dt =   443.21

 memory_write: model date =    11102       0 memory =     670.78 MB (highwater)         -0.00 MB (usage)  (pe=    0 comps= cpl ATM LND GLC ROF)

 tStamp_write: model date =    11103       0 wall clock = 2017-10-15 08:41:18 avg dt =   443.92 dt =   450.87

 memory_write: model date =    11103       0 memory =     670.78 MB (highwater)         -0.00 MB (usage)  (pe=    0 comps= cpl ATM LND GLC ROF)

 tStamp_write: model date =    11104       0 wall clock = 2017-10-15 08:48:48 avg dt =   443.94 dt =   450.60

 memory_write: model date =    11104       0 memory =     670.78 MB (highwater)         -0.00 MB (usage)  (pe=    0 comps= cpl ATM LND GLC ROF)

 tStamp_write: model date =    11105       0 wall clock = 2017-10-15 08:56:20 avg dt =   443.97 dt =   451.15

 memory_write: model date =    11105       0 memory =     670.78 MB (highwater)         -0.00 MB (usage)  (pe=    0 comps= cpl ATM LND GLC ROF)

 tStamp_write: model date =    11106       0 wall clock = 2017-10-15 09:03:47 avg dt =   443.98 dt =   447.64

 memory_write: model date =    11106       0 memory =     670.78 MB (highwater)         -0.00 MB (usage)  (pe=    0 comps= cpl ATM LND GLC ROF)

 tStamp_write: model date =    11107       0 wall clock = 2017-10-15 09:11:13 avg dt =   443.98 dt =   445.88

 memory_write: model date =    11107       0 memory =     670.78 MB (highwater)         -0.00 MB (usage)  (pe=    0 comps= cpl ATM LND GLC ROF)

 tStamp_write: model date =    11108       0 wall clock = 2017-10-15 09:18:39 avg dt =   443.99 dt =   445.98

 memory_write: model date =    11108       0 memory =     670.78 MB (highwater)         -0.00 MB (usage)  (pe=    0 comps= cpl ATM LND GLC ROF)

 

-----------------------------------------------------------------------------

 Repeat ridging, niter =           1

 Repeat ridging, niter =           1

 Repeat ridging, niter =           2

 Repeat ridging, niter =           2

 Repeat ridging, niter =           3

 Repeat ridging, niter =           3

 Repeat ridging, niter =           4

 Repeat ridging, niter =           4

 Repeat ridging, niter =           5

 Repeat ridging, niter =           5

 Repeat ridging, niter =           6

 Repeat ridging, niter =           7

 Repeat ridging, niter =           6

 Repeat ridging, niter =           7

 

 Warning: Departure points out of bounds in remap

 my_task, i, j =         179           2           2

 dpx, dpy =   3228.55094392047       -2414.66812790976

 HTN(i,j), HTN(i+1,j) =   3179.87294777240        3179.87294777402

 HTE(i,j), HTE(i,j+1) =   4698.56305327633        4698.56305327491

 istep1, my_task, iblk =       29940         179           1

 Global block:         596

 Global i and j:        3451         121

(shr_sys_abort) ERROR: remap transport: bad departure points

(shr_sys_abort) WARNING: calling shr_mpi_abort() and stopping

MPT ERROR: Rank 827(g:827) is aborting with error code 1001.

        Process ID: 357750, Host: r2i0n22, Program: /gbc-lustre/whuang/wd4cesm1/mpt.ne120_t12.B1850C5.2592c.lnd648_ice1296_ocn648.omp1/bld/cesm.exe

        MPT Version: HPE MPT 2.16  06/02/17 00:58:10

 

MPT: --------stack traceback-------

MPT: Attaching to program: /proc/357750/exe, process 357750

MPT: (no debugging symbols found)...done.

MPT: [Thread debugging using libthread_db enabled]

MPT: Using host libthread_db library "/lib64/libthread_db.so.1".

MPT: (no debugging symbols found)...done.

MPT: (no debugging symbols found)...done.

MPT: (no debugging symbols found)...done.

MPT: (no debugging symbols found)...done.

MPT: (no debugging symbols found)...done.

MPT: (no debugging symbols found)...done.

MPT: (no debugging symbols found)...done.

MPT: (no debugging symbols found)...done.

MPT: (no debugging symbols found)...done.

MPT: (no debugging symbols found)...done.

MPT: 0x00002aaaac052ecc in waitpid () from /lib64/libpthread.so.0

MPT: Missing separate debuginfos, use: debuginfo-install glibc-2.17-157.el7.x86_64 libbitmask-2.0-sgi716r63.rhel73.x86_64 libcpuset-1.0-sgi716r94.rhel73.x86_64 libcxgb3-1.3.1-8.el7.x86_64 libgcc-4.8.5-11.el7.x86_64 libibverbs-1.2.1mlnx1-OFED.3.4.2.1.4.34218.x86_64 libmlx4-1.2.1mlnx1-OFED.3.4.0.0.4.34218.x86_64 libmlx5-1.2.1mlnx1-OFED.3.4.2.1.4.34218.x86_64 libmthca-1.0.6-13.el7.x86_64 libnl-1.1.4-3.el7.x86_64 libnuma-3.0sgi-sgi716r61.rhel73.x86_64 numatools-2.0-sgi716r146.rhel73.x86_64 xpmem-1.6-sgi716r125.rhel73.x86_64

MPT: (gdb) #0  0x00002aaaac052ecc in waitpid () from /lib64/libpthread.so.0

MPT: #1  0x00002aaaaba8484c in mpi_sgi_system (command=<optimized out>,

MPT:     __statbuf=<optimized out>, __fd=<optimized out>) at sig.c:98

MPT: #2  MPI_SGI_stacktraceback (header=<optimized out>) at sig.c:339

MPT: #3  0x00002aaaab9d789e in print_traceback (ecode=1001) at abort.c:238

MPT: #4  0x00002aaaab9d7a21 in PMPI_Abort (comm=<optimized out>, errorcode=1001)

MPT:     at abort.c:67

MPT: #5  0x00002aaaab9d7a7a in pmpi_abort__ ()

MPT:    from /sw/sdev/mpt-x86_64/2.16-p11435/lib/libmpi.so

MPT: #6  0x0000000002003958 in shr_mpi_mod_mp_shr_mpi_abort_ ()

MPT: #7  0x000000000206478a in shr_sys_mod_mp_shr_sys_abort_ ()

MPT: #8  0x00000000015f58bf in ice_transport_remap_mp_horizontal_remap_ ()

MPT: #9  0x00000000015e33d9 in ice_transport_driver_mp_transport_remap_ ()

MPT: #10 0x00000000015c312d in ice_step_mod_mp_step_dynamics_ ()

MPT: #11 0x00000000014fea84 in ice_comp_mct_mp_ice_run_mct_ ()

MPT: #12 0x000000000041de32 in ccsm_comp_mod_mp_ccsm_run_ ()

MPT: #13 0x000000000044b118 in MAIN__ ()

MPT: #14 0x000000000041945e in main ()

MPT: (gdb) A debugging session is active.

MPT:

MPT:    Inferior 1 [process 357750] will be detached.

MPT:

MPT: Quit anyway? (y or n) [answered Y; input not from terminal]

MPT: Detaching from program: /proc/357750/exe, process 357750

 

MPT: -----stack traceback ends-----

MPT ERROR: MPI_COMM_WORLD rank 827 has terminated without calling MPI_Finalize()

        aborting job

 

Wei Huang

Who's new

  • Nicholas.Davis@...
  • numarsanifa@...
  • bingdian_46@...
  • mxy2832029@...
  • nthg2000@...