weit08@lzu_cn
New Member
Hello,
I ported CCSM4 to our Linux cluster (16 pes/node). I compile it with mpicc and mpif90; the Fortran compiler is PGI. The batch system is PBS. I can run CCSM4 successfully. However, my problem is that it runs on only one node no matter how many pes I set.
My env_mach_pes.xml and $case.run files are set as follows. I use 64 pes and then qsub the job. The job occupies exactly 64 pes, but only 16 of them (one node) are actually used; the rest are idle. I also tested with 176 pes, and the result is the same.
Any help is greatly appreciated! Thank you all in advance!
Sincerely
Qing YAN
> *************************************************************************************************
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
> ****************************************************************************************************
>
> %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
> the $case.run file:
>
> #!/bin/csh -f
> #===============================================================================
> # GENERIC_USER
> # This is where the batch submission is set. The above code computes
> # the total number of tasks, nodes, and other things that can be useful
> # here. Use PBS, BSUB, or whatever the local environment supports.
> #===============================================================================
>
> #PBS -N test
> #PBS -q default
> #PBS -l nodes=4:ppn=16
> #PBS -l walltime=2400:00:00
> #PBS -r n
> #PBS -j oe
> #PBS -S /bin/csh -V
>
> #limit coredumpsize 1000000
> #limit stacksize unlimited
>
>
> # ----------------------------------------
> # PE LAYOUT:
> # total number of tasks = 64
> # maximum threads per task = 1
> # cpl ntasks=64 nthreads=1 rootpe=0
> # cam ntasks=64 nthreads=1 rootpe=0
> # clm ntasks=64 nthreads=1 rootpe=0
> # cice ntasks=64 nthreads=1 rootpe=0
> # pop2 ntasks=64 nthreads=1 rootpe=0
> # sglc ntasks=64 nthreads=1 rootpe=0
> #
> # total number of hw pes = 64
> # cpl hw pe range ~ from 0 to 63
> # cam hw pe range ~ from 0 to 63
> # clm hw pe range ~ from 0 to 63
> # cice hw pe range ~ from 0 to 63
> # pop2 hw pe range ~ from 0 to 63
> # sglc hw pe range ~ from 0 to 63
> # ----------------------------------------
> #-----------------------------------------------------------------------
> # Determine necessary environment variables
> #-----------------------------------------------------------------------
>
> cd /mnt/storage-space/disk1/yanq/ccsm/build/test
>
> ./Tools/ccsm_check_lockedfiles || exit -1
> source ./Tools/ccsm_getenv || exit -2
>
> if ($BUILD_COMPLETE != "TRUE") then
> echo "BUILD_COMPLETE is not TRUE"
> echo "Please rebuild the model interactively via"
> echo " ./${CASE}.${MACH}.build"
> exit -2
> endif
>
> setenv LBQUERY TRUE
> setenv LBSUBMIT TRUE
>
> #-----------------------------------------------------------------------
> # Determine time-stamp/file-ID string
> # Clean up previous run timing files
> #-----------------------------------------------------------------------
>
> setenv LID "`date +%y%m%d-%H%M%S`"
> env | egrep '(MP_|LOADL|XLS|FPE|DSM|OMP|MPC)' # document env vars
>
> # -------------------------------------------------------------------------
> # Build the namelists and check prestage
> # -------------------------------------------------------------------------
>
> cd $CASEROOT
> source $CASETOOLS/ccsm_buildnml.csh || exit -3
> cd $CASEROOT
> source $CASETOOLS/ccsm_prestage.csh || exit -3
>
> # -------------------------------------------------------------------------
> # Create and cleanup the timing directories
> # -------------------------------------------------------------------------
>
> if !(-d /mnt/storage-space/disk1/yanq/ccsm/build/test/run/timing) mkdir /mnt/storage-space/disk1/yanq/ccsm/build/test/run/timing
> if !(-d /mnt/storage-space/disk1/yanq/ccsm/build/test/run/timing/checkpoints) mkdir /mnt/storage-space/disk1/yanq/ccsm/build/test/run/timing/checkpoints
> rm -f /mnt/storage-space/disk1/yanq/ccsm/build/test/run/timing/ccsm_timing*
>
> # -------------------------------------------------------------------------
> # Run the model
> # -------------------------------------------------------------------------
>
> sleep 25
> cd $RUNDIR
> echo "`date` -- CSM EXECUTION BEGINS HERE"
>
> #===============================================================================
> # GENERIC_USER
> # Launch the job here. Some samples are commented out below
> #===============================================================================
>
> setenv OMP_NUM_THREADS 1
> #mpiexec -n 64 ./ccsm.exe >&! ccsm.log.$LID
> mpirun -np 64 ./ccsm.exe >&! ccsm.log.$LID
>
> wait
> echo "`date` -- CSM EXECUTION HAS FINISHED"
>
>
> cd $CASEROOT
> ./Tools/ccsm_postrun.csh || exit 1
>
> %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
I ported CCSM4 to our Linux cluster (16 pes/node). I compile it with mpicc and mpif90; the Fortran compiler is PGI. The batch system is PBS. I can run CCSM4 successfully. However, my problem is that it runs on only one node no matter how many pes I set.
My env_mach_pes.xml and $case.run files are set as follows. I use 64 pes and then qsub the job. The job occupies exactly 64 pes, but only 16 of them (one node) are actually used; the rest are idle. I also tested with 176 pes, and the result is the same.
Any help is greatly appreciated! Thank you all in advance!
Sincerely
Qing YAN
> *************************************************************************************************
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
> ****************************************************************************************************
>
> %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
> the $case.run file:
>
> #!/bin/csh -f
> #===============================================================================
> # GENERIC_USER
> # This is where the batch submission is set. The above code computes
> # the total number of tasks, nodes, and other things that can be useful
> # here. Use PBS, BSUB, or whatever the local environment supports.
> #===============================================================================
>
> #PBS -N test
> #PBS -q default
> #PBS -l nodes=4:ppn=16
> #PBS -l walltime=2400:00:00
> #PBS -r n
> #PBS -j oe
> #PBS -S /bin/csh -V
>
> #limit coredumpsize 1000000
> #limit stacksize unlimited
>
>
> # ----------------------------------------
> # PE LAYOUT:
> # total number of tasks = 64
> # maximum threads per task = 1
> # cpl ntasks=64 nthreads=1 rootpe=0
> # cam ntasks=64 nthreads=1 rootpe=0
> # clm ntasks=64 nthreads=1 rootpe=0
> # cice ntasks=64 nthreads=1 rootpe=0
> # pop2 ntasks=64 nthreads=1 rootpe=0
> # sglc ntasks=64 nthreads=1 rootpe=0
> #
> # total number of hw pes = 64
> # cpl hw pe range ~ from 0 to 63
> # cam hw pe range ~ from 0 to 63
> # clm hw pe range ~ from 0 to 63
> # cice hw pe range ~ from 0 to 63
> # pop2 hw pe range ~ from 0 to 63
> # sglc hw pe range ~ from 0 to 63
> # ----------------------------------------
> #-----------------------------------------------------------------------
> # Determine necessary environment variables
> #-----------------------------------------------------------------------
>
> cd /mnt/storage-space/disk1/yanq/ccsm/build/test
>
> ./Tools/ccsm_check_lockedfiles || exit -1
> source ./Tools/ccsm_getenv || exit -2
>
> if ($BUILD_COMPLETE != "TRUE") then
> echo "BUILD_COMPLETE is not TRUE"
> echo "Please rebuild the model interactively via"
> echo " ./${CASE}.${MACH}.build"
> exit -2
> endif
>
> setenv LBQUERY TRUE
> setenv LBSUBMIT TRUE
>
> #-----------------------------------------------------------------------
> # Determine time-stamp/file-ID string
> # Clean up previous run timing files
> #-----------------------------------------------------------------------
>
> setenv LID "`date +%y%m%d-%H%M%S`"
> env | egrep '(MP_|LOADL|XLS|FPE|DSM|OMP|MPC)' # document env vars
>
> # -------------------------------------------------------------------------
> # Build the namelists and check prestage
> # -------------------------------------------------------------------------
>
> cd $CASEROOT
> source $CASETOOLS/ccsm_buildnml.csh || exit -3
> cd $CASEROOT
> source $CASETOOLS/ccsm_prestage.csh || exit -3
>
> # -------------------------------------------------------------------------
> # Create and cleanup the timing directories
> # -------------------------------------------------------------------------
>
> if !(-d /mnt/storage-space/disk1/yanq/ccsm/build/test/run/timing) mkdir /mnt/storage-space/disk1/yanq/ccsm/build/test/run/timing
> if !(-d /mnt/storage-space/disk1/yanq/ccsm/build/test/run/timing/checkpoints) mkdir /mnt/storage-space/disk1/yanq/ccsm/build/test/run/timing/checkpoints
> rm -f /mnt/storage-space/disk1/yanq/ccsm/build/test/run/timing/ccsm_timing*
>
> # -------------------------------------------------------------------------
> # Run the model
> # -------------------------------------------------------------------------
>
> sleep 25
> cd $RUNDIR
> echo "`date` -- CSM EXECUTION BEGINS HERE"
>
> #===============================================================================
> # GENERIC_USER
> # Launch the job here. Some samples are commented out below
> #===============================================================================
>
> setenv OMP_NUM_THREADS 1
> #mpiexec -n 64 ./ccsm.exe >&! ccsm.log.$LID
> mpirun -np 64 ./ccsm.exe >&! ccsm.log.$LID
>
> wait
> echo "`date` -- CSM EXECUTION HAS FINISHED"
>
>
> cd $CASEROOT
> ./Tools/ccsm_postrun.csh || exit 1
>
> %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%