Commit a87e6b9c authored by Youngsung Kim's avatar Youngsung Kim
Browse files

moved kernels from file system to this repo with lfs support.

parent 79a3fc02
*.dat filter=lfs diff=lfs merge=lfs -text
sample.dat filter=lfs diff=lfs merge=lfs -text
# Makefile for KGEN-generated kernel
#
# Variables assigned with `?=` below are defaults; a value given on the make
# command line (e.g. `make build COMP=gfortran PROC=`) takes precedence.
#
########## application-original compiler and compiler options ############
#FC_0 := /opt/intel/compilers_and_libraries_2018.1.163/linux/bin/intel64/ifort
#FC_FLAGS_SET_0 := -xmic-avx512 -static -convert big_endian -assume byterecl -ftz -traceback -assume realloc_lhs -fp-model consistent -fimf-use-svml -O2 -debug minimal -O2 -debug minimal -qno-opt-dynamic-align -free -fpp
# default configuration used during kernel extraction
#
# compiler, processor, and machine
COMP ?= ifort
PROC ?= knl
MACH ?= cori
# MPI configuration
# set to 1 to enable MPI build/run; leave blank to disable
MPI ?=
# number of MPI ranks
NPROCS ?= 4
# internal variable for mpi run command (set in the MPI section when MPI is enabled)
_MPIRUN :=
# internal variable for mpi runner (set in the machine section)
_RUNNER :=
# data
# text list of input and output data files
DATA ?= kgen_statefile.lst.cori_knl_ifort
# maximum tolerance between kernel-generated data values and application original values
MAX_TOL ?= 1.D-14
# sanity sum check: maximum absolute difference between application-generated and kernel-calculated sums
MAX_ABS_DIFF ?= 1.D-307
# sanity sum check: maximum relative difference between application-generated and kernel-calculated sums
MAX_REL_DIFF ?= 1.D-10
# Misc.
## preprocessing compiler flag (set per compiler in the compiler section)
_PREPROCFLAG :=
# 1<= verbosity <= 3
VERBOSITY ?= 1
# repeat >= 1
REPEAT ?= 1
# skip sum check during data loading (set to 1 to skip; blank keeps the check)
SKIP_SUMCHECK ?=
############ compiler ####################
# Translate COMP into the compiler command (FC_0), its flag set
# (FC_FLAGS_SET_0), and the flag that turns on preprocessing (_PREPROCFLAG).
# Every branch except ifort also points DATA at the state-file list that
# belongs to that compiler.  An unknown COMP aborts make.
ifeq ($(COMP), ifort)
  _PREPROCFLAG := -fpp
  FC_FLAGS_SET_0 := -static -convert big_endian -assume byterecl -ftz -traceback -assume realloc_lhs -fp-model consistent -fimf-use-svml -O2 -debug minimal -qno-opt-dynamic-align -free -fpp
  FC_0 := ifort
else ifeq ($(COMP), gfortran)
  _PREPROCFLAG := -cpp
  DATA := kgen_statefile.lst.ubuntu_i5_gfortran
  FC_FLAGS_SET_0 := -O2 -ffree-form -cpp
  FC_0 := gfortran
else ifeq ($(COMP), xlf)
  _PREPROCFLAG := -qpreprocess
  DATA := kgen_statefile.lst.summit_power9_xlf
  FC_FLAGS_SET_0 := -O2 -qfree=f90 -qpreprocess
  FC_0 := xlf
else ifeq ($(COMP), crayftn)
  _PREPROCFLAG := -eF
  DATA := kgen_statefile.lst.tulip_epyc_crayftn
  FC_FLAGS_SET_0 := -O2 -ffree -eF
  FC_0 := ftn
else
  $(error "${COMP}" is not found.)
endif
############ processor ###################
# PROC selects a processor-specific code-generation flag.
# -xmic-avx512 is an Intel compiler option, so it is appended only when the
# selected compiler is ifort.  Other COMP values get no processor flag, which
# keeps the default PROC=knl from breaking e.g. `make build COMP=gfortran`.
ifeq (${PROC}, knl)
  ifeq (${COMP}, ifort)
    FC_FLAGS_SET_0 += -xmic-avx512
  endif
endif
############ machine #####################
# MACH picks the job launcher prefix stored in _RUNNER (the rank count is
# appended in the MPI section below).  When MPI has a non-empty value, cori
# and summit also replace FC_0 with a compiler wrapper (ftn / mpifort).
# A blank or unlisted MACH leaves _RUNNER empty.
ifeq (${MACH}, cori)
_RUNNER := srun -n
ifdef MPI
FC_0 := ftn
endif
else ifeq (${MACH}, summit)
_RUNNER := jsrun -n
ifdef MPI
FC_0 := mpifort
endif
else ifeq (${MACH}, tulip)
# tulip only sets the launcher; FC_0 stays as chosen in the compiler section
_RUNNER := srun -n
endif
############ MPI #########################
# With MPI enabled, define the _MPI preprocessor macro for every compile and,
# if a launcher was selected above, build the "<launcher> <NPROCS>" prefix
# that the run target places in front of ./kernel.exe.
ifdef MPI
FC_FLAGS_SET_0 += -D_MPI
ifdef _RUNNER
_MPIRUN := ${_RUNNER} ${NPROCS}
endif
endif
############ data sum check #########################
# Convert the SKIP_SUMCHECK switch (blank or non-blank) into a Fortran logical
# literal; it is passed to kgen_utils as -DSKIP_SUMCHECK=<literal>.
ifdef SKIP_SUMCHECK
_SKIP_SUMCHECK := .TRUE.
else
_SKIP_SUMCHECK := .FALSE.
endif
# These targets are commands, not files.  Declaring them phony keeps a stray
# file named help, build, run, or clean from making a target look up to date.
.PHONY: help build run clean

# Print a usage summary.  help is the first regular target in this Makefile,
# so a bare `make` runs it.  Current values of the configurable arguments are
# shown after "=".
help:
	@echo ""
	@echo "************* micro_mg_get_clols2_0 kernel *************"
	@echo "usage: make [targets] [arguments]"
	@echo ""
	@echo "targets: help, clean, build, run"
	@echo "arguments: COMP, PROC, MACH, MPI, NPROCS, DATA, MAX_TOL"
	@echo " MAX_ABS_DIFF, MAX_REL_DIFF, VERBOSITY, REPEAT"
	@echo " SKIP_SUMCHECK"
	@echo ""
	@echo " COMP: compiler (=ifort, gfortran, xlf, crayftn, <blank>)"
	@echo " PROC: processor (=knl, <blank>)"
	@echo " MACH: machine (=cori, summit, tulip, <blank>)"
	@echo " MPI: MPI (1, =<blank>)"
	@echo " NPROCS: number of MPI ranks (=${NPROCS})"
	@echo " DATA: file path to a list of application-generated data"
	@echo " (=${DATA})"
	@echo " MAX_TOL: maximum tolerance for kernel verification (=${MAX_TOL})"
	@echo " MAX_ABS_DIFF: max abs. difference for data check (=${MAX_ABS_DIFF})"
	@echo " MAX_REL_DIFF: max rel. difference for data check (=${MAX_REL_DIFF})"
	@echo " SKIP_SUMCHECK: skip array sum check (1, =<blank>)"
	@echo " VERBOSITY: kernel output verbosity (=1, 2, 3)"
	@echo " REPEAT: no. of kernel runs for averaging runtime (=${REPEAT})"
	@echo ""
	@echo "Please see 'README.txt' for further details."
	@echo ""
# Object files that are linked into kernel.exe.
ALL_OBJS := micro_mg_cam.o micro_mg2_0.o micro_mg_utils.o ref_pres.o physics_types.o shr_kind_mod.o kernel_driver.o kgen_utils.o tprof_mod.o
# Link all objects ($^ = every prerequisite) into kernel.exe.
build: ${ALL_OBJS}
${FC_0} ${FC_FLAGS_SET_0} -o kernel.exe $^
# Run the kernel; _MPIRUN is empty unless MPI is enabled and a launcher is known.
# Note: this target has no prerequisite, so kernel.exe must already be built.
run:
${_MPIRUN} ./kernel.exe
# Per-source compile rules ($@ = target object, $< = first prerequisite, the source).
# Each object also lists other objects as prerequisites, which orders the
# compiles (presumably so the modules a source uses are built first).
# micro_mg_cam receives MAX_TOL, VERBOSITY and REPEAT as preprocessor macros.
micro_mg_cam.o: micro_mg_cam.F90 micro_mg2_0.o micro_mg_utils.o ref_pres.o physics_types.o shr_kind_mod.o kgen_utils.o tprof_mod.o
${FC_0} ${FC_FLAGS_SET_0} ${_PREPROCFLAG} -DMAX_TOL=${MAX_TOL} -DVERBOSITY=${VERBOSITY} -DNUM_REPEAT=${REPEAT} -c -o $@ $<
micro_mg2_0.o: micro_mg2_0.F90 kgen_utils.o tprof_mod.o micro_mg_utils.o
${FC_0} ${FC_FLAGS_SET_0} -c -o $@ $<
micro_mg_utils.o: micro_mg_utils.F90 kgen_utils.o tprof_mod.o
${FC_0} ${FC_FLAGS_SET_0} -c -o $@ $<
ref_pres.o: ref_pres.F90 kgen_utils.o tprof_mod.o
${FC_0} ${FC_FLAGS_SET_0} -c -o $@ $<
physics_types.o: physics_types.F90 kgen_utils.o tprof_mod.o shr_kind_mod.o
${FC_0} ${FC_FLAGS_SET_0} -c -o $@ $<
shr_kind_mod.o: shr_kind_mod.F90 kgen_utils.o tprof_mod.o
${FC_0} ${FC_FLAGS_SET_0} -c -o $@ $<
# kernel_driver receives the DATA path as the quoted string macro KGEN_STATEFILES.
kernel_driver.o: kernel_driver.F90 micro_mg_cam.o micro_mg2_0.o micro_mg_utils.o ref_pres.o physics_types.o shr_kind_mod.o kgen_utils.o tprof_mod.o
${FC_0} ${FC_FLAGS_SET_0} -DKGEN_STATEFILES='"${DATA}"' -c -o $@ $<
# kgen_utils receives the sum-check thresholds and the skip switch as macros.
kgen_utils.o: kgen_utils.f90
${FC_0} ${FC_FLAGS_SET_0} ${_PREPROCFLAG} -DMAX_ABS_DIFF=${MAX_ABS_DIFF} -DMAX_REL_DIFF=${MAX_REL_DIFF} -DSKIP_SUMCHECK=${_SKIP_SUMCHECK} -c -o $@ $<
tprof_mod.o: tprof_mod.f90
${FC_0} ${FC_FLAGS_SET_0} -c -o $@ $<
# Remove the executable, compiler-generated module files, and all objects.
clean:
rm -f kernel.exe *.mod ${ALL_OBJS}
# Makefile for KGEN-generated kernel
#
# Fixed-configuration build: compiles with the ftn wrapper using the Intel
# flag set below, with MPI enabled (-D_MPI), and launches through srun.

# Originally used compiler(s)
#FC_0 := /opt/intel/compilers_and_libraries_2018.1.163/linux/bin/intel64/ifort
FC_0 := ftn
# KNL
#FC_FLAGS_SET_0 := -xmic-avx512 -static -convert big_endian -assume byterecl -ftz -traceback -assume realloc_lhs -fp-model consistent -fimf-use-svml -O2 -debug minimal -O2 -debug minimal -qno-opt-dynamic-align -free -fpp -D_MPI
# Active flag set: same as above without -xmic-avx512; the repeated
# "-O2 -debug minimal" pair is listed once.
FC_FLAGS_SET_0 := -static -convert big_endian -assume byterecl -ftz -traceback -assume realloc_lhs -fp-model consistent -fimf-use-svml -O2 -debug minimal -qno-opt-dynamic-align -free -fpp -D_MPI

# Number of MPI ranks used by the run target (override with NPROCS=<n>).
NPROCS ?= 4

ALL_OBJS := micro_mg_cam.o micro_mg2_0.o micro_mg_utils.o ref_pres.o physics_types.o shr_kind_mod.o kernel_driver.o kgen_utils.o tprof_mod.o

# Command targets, not files.
.PHONY: build run clean

# Link every object ($^) into kernel.exe.
build: ${ALL_OBJS}
	${FC_0} ${FC_FLAGS_SET_0} -o kernel.exe $^

# Build if needed, then launch the kernel on NPROCS ranks.
run: build
	srun -n ${NPROCS} ./kernel.exe

# Per-source compile rules ($@ = target object, $< = source file).
micro_mg_cam.o: micro_mg_cam.F90 micro_mg2_0.o micro_mg_utils.o ref_pres.o physics_types.o shr_kind_mod.o kgen_utils.o tprof_mod.o
	${FC_0} ${FC_FLAGS_SET_0} -c -o $@ $<

micro_mg2_0.o: micro_mg2_0.F90 kgen_utils.o tprof_mod.o micro_mg_utils.o
	${FC_0} ${FC_FLAGS_SET_0} -c -o $@ $<

micro_mg_utils.o: micro_mg_utils.F90 kgen_utils.o tprof_mod.o
	${FC_0} ${FC_FLAGS_SET_0} -c -o $@ $<

ref_pres.o: ref_pres.F90 kgen_utils.o tprof_mod.o
	${FC_0} ${FC_FLAGS_SET_0} -c -o $@ $<

physics_types.o: physics_types.F90 kgen_utils.o tprof_mod.o shr_kind_mod.o
	${FC_0} ${FC_FLAGS_SET_0} -c -o $@ $<

shr_kind_mod.o: shr_kind_mod.F90 kgen_utils.o tprof_mod.o
	${FC_0} ${FC_FLAGS_SET_0} -c -o $@ $<

kernel_driver.o: kernel_driver.F90 micro_mg_cam.o micro_mg2_0.o micro_mg_utils.o ref_pres.o physics_types.o shr_kind_mod.o kgen_utils.o tprof_mod.o
	${FC_0} ${FC_FLAGS_SET_0} -c -o $@ $<

kgen_utils.o: kgen_utils.f90
	${FC_0} ${FC_FLAGS_SET_0} -c -o $@ $<

tprof_mod.o: tprof_mod.f90
	${FC_0} ${FC_FLAGS_SET_0} -c -o $@ $<

# Remove the executable, module files, and all objects.
clean:
	rm -f kernel.exe *.mod ${ALL_OBJS}
micro_mg_get_clols2_0 kernel
===================================
micro_mg_get_clols2_0 kernel is a source code extraction of MG2 column selection
utility from atmospheric model in E3SM. In addition to MG2 column selection code,
it includes application-generated input data to drive the kernel execution and output
data to verify the correctness of kernel execution. It also has a Makefile
that makes it easy to use on multiple systems under various situations.
1. Overview
--------------
1.1 What is a kernel?
~~~~~~~~~~~~~~~~~~~~~~
A kernel is a small software that represents a certain characteristic of a larger
application. It can be compiled and run generally without using external library
on a single computing node. Due to its simple usage, it can greatly improve
productivity of various software engineering tasks such as performance optimization,
unit-testing, debugging, porting, verification, and so on.
(source: https://ncar.github.io/kgendocs/)
2. Kernel extraction
----------------------
The micro_mg_get_clols2_0 kernel was extracted using the `KGen kernel extraction tool
<https://github.com/E3SM-Project/KGen/>`_.
2.1 The scope of code extraction
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The scope of source code extraction includes any E3SM code that is required to
replicate the execution of the following code line(s) in the extracted kernel.
In ${E3SM_TOP}/components/cam/src/physics/cam/micro_mg_cam.F90
MODULE micro_mg_cam
...
SUBROUTINE micro_mg_cam_tend
...
! begin of kernel extraction
CALL micro_mg_get_cols2_0(...)
! end of kernel extraction
...
END SUBROUTINE
...
END MODULE
2.2 Extraction configurations
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
KGen : 0.9.0 (not released as of this kernel extraction)
E3SM commit : 477faf4cf7f76a8c779b1475585d01419bff3c4b
on master branch (03/18/2020)
System : Cori KNL of NERSC
Compiler : ifort (IFORT) 19.0.3.199 20190206
3. How to use the kernel
--------------------------
In principle, the user of this kernel has full control over its usage. The user can
access all kernel source code, input/output data, and the Makefile, and
can modify any part of the kernel as needed. Please keep in mind that
there could be some restrictions imposed on this kernel, such as licensing or
other regulations. Please see the "License" section for further details.
3.1 Getting help
~~~~~~~~~~~~~~~~
Makefile generates a short help on kernel usage. Go to kernel directory and
run following command.
.. code-block:: bash
>>> make [help]
************* micro_mg_get_clols2_0 kernel *************
usage: make [targets] [arguments]
targets: help, clean, build, run
arguments: COMP, PROC, MACH, MPI, NPROCS, DATA, MAX_TOL
MAX_ABS_DIFF, MAX_REL_DIFF, VERBOSITY, REPEAT
SKIP_SUMCHECK
COMP: compiler (=ifort, gfortran, xlf, crayftn, <blank>)
PROC: processor (=knl, <blank>)
MACH: machine (=cori, summit, tulip, <blank>)
MPI: MPI (1, =<blank>)
NPROCS: number of MPI ranks (=4)
DATA: file path to a list of application-generated data
(=kgen_statefile.lst.tulip_epyc_crayftn)
MAX_TOL: maximum tolerance for kernel verification (=1.D-14)
MAX_ABS_DIFF: max abs. difference for data check (=1.D-307)
MAX_REL_DIFF: max rel. difference for data check (=1.D-10)
SKIP_SUMCHECK: skip array sum check (1, =<blank>)
VERBOSITY: kernel output verbosity (=1, 2, 3)
REPEAT: no. of kernel runs for averaging runtime (=1)
As shown above, there are four makefile targets whose names are self-explanatory.
There are a number of makefile arguments that are explained in detail in the
following sections. Each argument can take a range of values, or possibly <blank> to
turn it off. In the above example, the default value for an argument is indicated with a "="
prefix.
Example) >>> make build COMP=ifort PROC=knl MACH=cori MPI=1
3.2 Kernel makefile arguments
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Makefile arguments give the user several controls over how to build and run
the kernel. While there are default values for most arguments, the user
can change and/or create values in the Makefile.
COMP ::
This argument sets the type of compiler to be used for kernel compilation.
Available values, and default value, are shown in help section above.
Within Makefile, this value is interpreted to a set of compiler flags.
PROC ::
This argument sets the type of processor to be used for kernel compilation.
The processor used during kernel extraction is always set to default value.
You may change this to other value. Or you can turn off this argument by using
"PROC=" in command line. Within Makefile, this value may set a compiler flag
to a particular CPU binary generation.
MACH ::
This argument sets the type of system to be used for kernel compilation and
execution. This may set MPI-related configuration such as the MPI
launcher.
MPI ::
This argument enables MPI-enabled compilation and execution. Add MPI=1
on the command line to enable MPI. When MPI is enabled, kernel execution
is duplicated on each MPI rank so that the timing result of the execution
is closer to the original timing from application execution.
NPROCS ::
This argument sets the number of MPI ranks to be used on execution.
DATA ::
This argument sets a filepath to a text file that contains a list of file
paths to binary data files.
MAX_TOL ::
This argument sets the maximum absolute difference between kernel output and
application-generated value to pass verification.
MAX_ABS_DIFF ::
This argument sets the maximum absolute difference of array sum between data read by
the kernel and data generated by the application. This ensures that the data read by the kernel
is the data actually used in the application execution.
MAX_REL_DIFF ::
This argument sets the maximum relative difference of array sum between data read by
the kernel and data generated by the application. This ensures that the data read by the kernel
is the data actually used in the application execution.
SKIP_SUMCHECK ::
This argument controls the data sumcheck feature in kernel. With "SKIP_SUMCHECK=1",
this kernel skips this sum check. This is sometimes required to handle some
unusual cases such as an array having NaN values.
VERBOSITY ::
This argument sets the verbosity of kernel output. Higher value shows more debugging
information.
REPEAT ::
This argument sets the number of repetitions of kernel execution. The kernel will be
executed repeatedly as many times as this argument value. The total execution time is then
divided by the number of repetitions to get an average time. This may help to overcome
the limit of coarse timing capability on the system.
3.3 System-specific command-line examples
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Makefile in this kernel is already pre-configured to support well-known systems.
In this section, several useful use-cases are presented per each system.
To run following example commands, you first need to go into kernel directory.
Make sure that you run "make clean" command first to reset kernel compilation.
Cori KNL of NERSC ::
This kernel was extracted on Cori KNL. Therefore it is simplest to use if you
are on Cori KNL system.
* to build : >>> make build
* to run : >>> make run
* to build with MPI : >>> make build MPI=1
* to run with MPI : >>> make run MPI=1
Cori Haswell of NERSC ::
To support Haswell, only PROC argument needs to be changed.
* to build : >>> make build PROC=
* to run : >>> make run PROC=
* to build with MPI : >>> make build PROC= MPI=1
* to run with MPI : >>> make run PROC= MPI=1
Summit of ORNL ::
On Summit, IBM xlf compiler is used as default compiler.
* to build : >>> make build COMP=xlf PROC= MACH=summit
* to run : >>> make run COMP=xlf PROC= MACH=summit
* to build with MPI : >>> make build COMP=xlf PROC= MACH=summit MPI=1
* to run with MPI : >>> make run COMP=xlf PROC= MACH=summit MPI=1
Tulip of Cray Frontier-testbed ::
On Tulip, the Cray crayftn compiler is used as the default compiler.
* to build : >>> make build COMP=crayftn PROC= MACH=tulip
* to run : >>> make run COMP=crayftn PROC= MACH=tulip
* to build with MPI : >>> make build COMP=crayftn PROC= MACH=tulip MPI=1
* to run with MPI : >>> make run COMP=crayftn PROC= MACH=tulip MPI=1
Generic Linux::
Gfortran is freely available for many Linux distros.
* to build : >>> make build COMP=gfortran PROC=
* to run : >>> make run COMP=gfortran PROC=
Feel free to add more system supports in Makefile.
3.4 Kernel Perturbation Experiment
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
It is sometimes useful to know how much the kernel output changes as
we perturb the input data. For example, this experiment may give us
insight to decide whether deviated results from porting are acceptable.
To conduct perturbation experiments, user needs to modify source code as
explained below.
1. Open "micro_mg_cam.F90" in a text editor.
2. Uncomment a code line shown below.
!CALL kgen_perturb_real( your_variable, 1.0D-15 )
3. Change "your_variable" to a variable name to be perturbed.
4. Optionally change 2nd argument to the level of perturbation.
Perturbed value = Original Value * (1. + 1.0D-15)
4. Kernel representativeness
---------------------------------
Kernel representativeness is about how well the kernel replicates
the original application in terms of the desired characteristic(s).
To provide a systematic way to evaluate/improve kernel representativeness
(timing as of this kernel), the kernel provides data collected from
application execution and a Python script for visualization.
Application-generated timing data ::
This kernel has a file whose name starts with "model" and ends
with the "ini" extension, similar to "model_cori_knl_ifort.ini".
The file is a text file that contains timestamps for the beginning
and end of the kernel region for every instance of MPI rank,
OpenMP thread, and invocation order. This timing data provides
us with the reference characteristic of the application.
Kernel timing generation ::
When the kernel runs, it shows verification and timing results
on screen. Please capture the output to a file, similar to the following:
>>> make run > kernel.out
Timing comparison between application and kernel ::
To visualize the comparison, the kernel has a Python script named
"kernel_representativeness_check.py". Please run the script similar
to the following.
>>> python kernel_representativeness_check.py model_cori_knl_ifort.ini \
kernel.out
It will show two histograms for application timing samples and kernel
timing results each. The histogram reveals how different the kernel timing
is from application timing.
How to improve timing representativeness ::
In most cases, the kernel executes much faster than the
application. This is mainly because the application runs the code in
a much more stressed situation than the kernel execution does.
To improve representativeness, we need to mimic the "stressed situation"
that the application went through. One effective method is to duplicate
kernel execution using MPI. Please try to clean/recompile the kernel
with "MPI=1 NPROCS=<some positive integer>" and recollect the kernel
output. Depending on the value of NPROCS and the current system setting,
the timing of the kernel should get closer to that of the application.
To further improve, you may try additional techniques such as cache
pollution or repetition with "REPEAT" makefile argument.
5. License
-------------
In principle, this kernel does not affect the license(s) of the original
source code extracted in this kernel. For further information
about source code licensing, please refer to the E3SM documentation.
6. Further resources
----------------------
KGen github repo.: https://github.com/E3SM-Project/KGen
KGen documentation: https://ncar.github.io/kgendocs/
E3SM public page: https://e3sm.org/
!KGEN-generated Fortran source file
!Generated at : 2020-03-21 12:44:20
!KGEN version : 0.9.0
PROGRAM kernel_driver
USE kgen_utils_mod, ONLY: kgen_get_newunit, kgen_error_stop, kgen_dp, kgen_array_sumcheck, kgen_rankthreadinvoke
USE tprof_mod, ONLY: tstart, tstop, tnull, tprnt
USE micro_mg_cam, ONLY: micro_mg_cam_tend
USE micro_mg_cam, ONLY: kr_externs_in_micro_mg_cam
USE physics_types, ONLY: physics_state
USE physics_types, ONLY: kr_physics_types_physics_state
USE micro_mg_utils, ONLY: kr_externs_in_micro_mg_utils
USE ref_pres, ONLY: kr_externs_in_ref_pres
IMPLICIT NONE
#ifdef _MPI
include "mpif.h"
#endif
LOGICAL :: kgen_isverified
INTEGER :: kgen_ierr_list, kgen_unit_list
INTEGER :: kgen_ierr, kgen_unit, kgen_case_count, kgen_count_verified
CHARACTER(LEN=1024) :: kgen_filepath
REAL(KIND=kgen_dp) :: kgen_measure, kgen_total_time, kgen_min_time, kgen_max_time
REAL(KIND=8) :: kgen_array_sum
INTEGER :: kgen_mpirank, kgen_openmptid, kgen_kernelinvoke
INTEGER :: myrank, mpisize
LOGICAL :: kgen_evalstage, kgen_warmupstage, kgen_mainstage
COMMON / state / kgen_mpirank, kgen_openmptid, kgen_kernelinvoke, kgen_evalstage, kgen_warmupstage, kgen_mainstage
TYPE(physics_state) :: state
#ifdef _MPI
CALL MPI_INIT(kgen_ierr)
IF (kgen_ierr .NE. MPI_SUCCESS) THEN
PRINT *, "MPI Initialization is failed."
CALL MPI_ABORT(MPI_COMM_WORLD, -1, kgen_ierr)
END IF
call mpi_comm_rank(mpi_comm_world, myrank, kgen_ierr)
call mpi_comm_size(mpi_comm_world, mpisize, kgen_ierr)
#else
myrank = 0
mpisize = 1
#endif
kgen_total_time = 0.0_kgen_dp
kgen_min_time = HUGE(0.0_kgen_dp)
kgen_max_time = 0.0_kgen_dp
kgen_case_count = 0
kgen_count_verified = 0
kgen_unit_list = kgen_get_newunit()