Commit 11bad468 authored by Youngsung Kim
Browse files

baseline performance is collected

parent f790c860
......@@ -36,7 +36,7 @@ _PREPROCFLAG :=
# 1<= verbosity <= 3
VERBOSITY ?= 1
# repeat >= 1
REPEAT ?= 1
REPEAT ?= 1 #2
# skip sum check during data loading
SKIP_SUMCHECK ?=
......@@ -69,7 +69,7 @@ else ifeq (${COMP}, crayftn)
else ifeq (${COMP}, pgfortran)
FC_0 := pgfortran
#FC_FLAGS_SET_0 := -i4 -time -Mstack_arrays -Mextend -byteswapio -Mflushz -Kieee -Mallocatable=03 -O2 -Mpreprocess
#FC_FLAGS_SET_0 := -gopt -i4 -time -Mstack_arrays -Mextend -byteswapio -Mflushz -Kieee -Mallocatable=03 -O2 -Mpreprocess
FC_FLAGS_SET_0 := -gopt -i4 -time -Mstack_arrays -Mextend -byteswapio -Mflushz -Kieee -Mallocatable=03 -O2 -Mpreprocess -ta=tesla:debug,lineinfo -Minfo=all,ccff
_PREPROCFLAG := -Mpreprocess
......@@ -170,11 +170,10 @@ ncu: ${OUTDIR}/${APP}.compute.ncu-rep
@echo "Output files are in ${OUTDIR}"
${OUTDIR}/${APP}.compute.ncu-rep: ${OUTDIR}/${APP}
#jsrun -n1 -c1 -g1 -a1 --smpiargs="-disable_gpu_hooks" -- ncu --target-processes=all -c 1500 --set=full --force-overwrite -o ${OUTDIR}/${APP}.compute ${OUTDIR}/${APP}
jsrun -n1 -c1 -g1 -a1 --smpiargs="-disable_gpu_hooks" -- ncu --target-processes=all --set=full --force-overwrite -o ${OUTDIR}/${APP}.compute ${OUTDIR}/${APP}
${OUTDIR}/${APP}.systems.qdrep: ${OUTDIR}/${APP}
jsrun -n1 -c1 -g1 -a1 -- nsys profile -o ${OUTDIR}/${APP}.systems.qdrep -f true -t cuda,osrt,openacc ${OUTDIR}/${APP}
jsrun -n1 -c1 -g1 -a1 -- nsys profile -o ${OUTDIR}/${APP}.systems.qdrep -f true -t cuda,osrt,openacc ${OUTDIR}/${APP} || true
${OUTDIR}/${APP}: build
......@@ -185,7 +184,7 @@ run: build
${_MPIRUN} ./kernel.exe
mpas_ocn_gm.o: mpas_ocn_gm.f90 mpas_ocn_constants.o mpas_constants.o mpas_kind_types.o kgen_utils.o tprof_mod.o
${FC_0} ${FC_FLAGS_SET_0} ${_PREPROCFLAG} -DMAX_TOL=${MAX_TOL} -DVERBOSITY=${VERBOSITY} -DNUM_REPEAT=${REPEAT} -c -o $@ $<
${FC_0} ${FC_FLAGS_SET_0} ${_PREPROCFLAG} -DMAX_TOL=${MAX_TOL} -DVERBOSITY=${VERBOSITY} -DNUM_REPEAT=${REPEAT} -c -o $@ $^
mpas_ocn_constants.o: mpas_ocn_constants.f90 kgen_utils.o tprof_mod.o mpas_kind_types.o
${FC_0} ${FC_FLAGS_SET_0} -c -o $@ $<
......
pgfortran -gopt -i4 -time -Mstack_arrays -Mextend -byteswapio -Mflushz -Kieee -Mallocatable=03 -O2 -Mpreprocess -ta=tesla:debug,lineinfo -Minfo=all,ccff -Mpreprocess -DMAX_ABS_DIFF=1.D-307 -DMAX_REL_DIFF=1.D-10 -DSKIP_SUMCHECK=.FALSE. -c -o kgen_utils.o kgen_utils.f90
kgen_perturb_real4_dim1:
188, Memory set idiom, loop replaced by call to __c_mset4
190, Loop not vectorized/parallelized: contains call
kgen_perturb_real4_dim2:
208, Memory set idiom, loop replaced by call to __c_mset4
211, Loop not vectorized/parallelized: contains call
kgen_perturb_real4_dim3:
230, Memory set idiom, loop replaced by call to __c_mset4
234, Loop not vectorized/parallelized: contains call
kgen_perturb_real8_dim1:
254, Memory set idiom, loop replaced by call to __c_mset4
256, Loop not vectorized/parallelized: contains call
kgen_perturb_real8_dim2:
274, Memory set idiom, loop replaced by call to __c_mset4
277, Loop not vectorized/parallelized: contains call
kgen_perturb_real8_dim3:
296, Memory set idiom, loop replaced by call to __c_mset4
300, Loop not vectorized/parallelized: contains call
kgen_get_newunit:
375, Loop not vectorized/parallelized: potential early exits
FMA (fused multiply-add) instruction(s) generated
kgen_rankthreadinvoke:
405, Loop not vectorized/parallelized: contains call
Timing stats:
init 16 millisecs 13%
import 50 millisecs 43%
expand 17 millisecs 14%
schedule 33 millisecs 28%
Total time 116 millisecs
pgfortran -gopt -i4 -time -Mstack_arrays -Mextend -byteswapio -Mflushz -Kieee -Mallocatable=03 -O2 -Mpreprocess -ta=tesla:debug,lineinfo -Minfo=all,ccff -c -o tprof_mod.o tprof_mod.f90
tinit:
41, Loop not vectorized/parallelized: contains call
tstart:
77, Loop not vectorized/parallelized: potential early exits
tstop:
107, Loop not vectorized/parallelized: potential early exits
tprnt:
128, Loop not vectorized/parallelized: contains call
to_upper:
142, Loop not vectorized: data dependency
Timing stats:
vectorize 16 millisecs 100%
Total time 16 millisecs
pgfortran -gopt -i4 -time -Mstack_arrays -Mextend -byteswapio -Mflushz -Kieee -Mallocatable=03 -O2 -Mpreprocess -ta=tesla:debug,lineinfo -Minfo=all,ccff -c -o mpas_kind_types.o mpas_kind_types.f90
Timing stats:
Total time 0 millisecs
pgfortran -gopt -i4 -time -Mstack_arrays -Mextend -byteswapio -Mflushz -Kieee -Mallocatable=03 -O2 -Mpreprocess -ta=tesla:debug,lineinfo -Minfo=all,ccff -c -o mpas_ocn_constants.o mpas_ocn_constants.f90
Timing stats:
Total time 0 millisecs
pgfortran -gopt -i4 -time -Mstack_arrays -Mextend -byteswapio -Mflushz -Kieee -Mallocatable=03 -O2 -Mpreprocess -ta=tesla:debug,lineinfo -Minfo=all,ccff -c -o mpas_constants.o mpas_constants.f90
Timing stats:
Total time 0 millisecs
pgfortran -gopt -i4 -time -Mstack_arrays -Mextend -byteswapio -Mflushz -Kieee -Mallocatable=03 -O2 -Mpreprocess -ta=tesla:debug,lineinfo -Minfo=all,ccff -Mpreprocess -DMAX_TOL=1.D-14 -DVERBOSITY=1 -DNUM_REPEAT=1 -c -o mpas_ocn_gm.o mpas_ocn_gm.f90 mpas_ocn_constants.o mpas_constants.o mpas_kind_types.o kgen_utils.o tprof_mod.o
mpas_ocn_gm.f90:
ocn_gm_compute_bolus_velocity:
349, Loop not vectorized: data dependency
362, Loop not vectorized: data dependency
377, Loop not vectorized: data dependency
403, Loop not vectorized: data dependency
424, Loop not vectorized: data dependency
427, Conflict or overlap between ddensitydztopofedge and ddensitydztopofcell
453, Loop not vectorized: data dependency
454, Conflict or overlap between graddensityedge and density
455, Conflict or overlap between gradzmidedge and zmid
471, Loop not vectorized: data dependency
475, Conflict or overlap between graddensitytopofedge and graddensityedge
476, Conflict or overlap between gradzmidtopofedge and gradzmidedge
482, Conflict or overlap between graddensitytopofedge and graddensityedge
483, Conflict or overlap between graddensitytopofedge and graddensityedge
484, Conflict or overlap between gradzmidtopofedge and gradzmidedge
485, Conflict or overlap between gradzmidtopofedge and gradzmidedge
504, Loop not vectorized: data dependency
505, Conflict or overlap between graddensityconstztopofedge and graddensitytopofedge
527, Loop not vectorized: non-stride-1 array reference
Loop unrolled 2 times
531, Loop not vectorized: data dependency
532, Conflict or overlap between relativeslopetopofedge and graddensitytopofedge
557, Loop not vectorized: data dependency
571, FMA (fused multiply-add) instruction(s) generated
572, Loop not vectorized: non-stride-1 array reference
Loop unrolled 2 times
579, Loop not vectorized: data dependency
595, Loop not vectorized: data dependency
611, Loop not vectorized: non-stride-1 array reference
Loop unrolled 2 times
617, Conflict or overlap between relativeslopetaperingcell and relativeslopetopofcell
Loop not vectorized: non-stride-1 array reference
Loop unrolled 4 times
624, Loop not vectorized/parallelized: contains call
626, Loop not vectorized: non-stride-1 array reference
Loop unrolled 2 times
628, Loop not vectorized: data dependency
633, Conflict or overlap between relativeslopetopofcell and relativeslopetaperingcell
Loop not vectorized: non-stride-1 array reference
Loop unrolled 4 times
646, Loop not vectorized: data dependency
647, Conflict or overlap between relativeslopetaperingcell and config_max_relative_slope
663, Loop not vectorized: data dependency
681, Loop not vectorized: data dependency
695, Loop not vectorized: non-stride-1 array reference
Loop unrolled 2 times
697, Loop not vectorized: data dependency
698, Conflict or overlap between k33 and relativeslopetopofcell
713, Loop not vectorized: data dependency
714, Conflict or overlap between relativeslopetapering and relativeslopetaperingcell
726, Loop not vectorized: non-stride-1 array reference
Loop unrolled 2 times
750, Loop not vectorized: mixed data types
761, Conflict or overlap between cgmphasespeed and config_gm_min_phase_speed
771, Loop not vectorized: data dependency
772, Conflict or overlap between cgmphasespeed and config_gravwavespeed_trunc
782, Conflict or overlap between kappagm3d and gmboluskappa
Memory set idiom, loop replaced by call to __c_mset8
Loop not vectorized: non-stride-1 array reference
Loop unrolled 4 times
797, Loop not vectorized: mixed data types
805, Loop not vectorized: data dependency
809, Conflict or overlap between kappagm3d and gmboluskappa
822, Loop not vectorized/parallelized: contains call
826, Loop not vectorized: non-stride-1 array reference
Loop unrolled 2 times
834, Conflict or overlap between tridiagb and cgmphasespeed
836, Conflict or overlap between tridiagc and cgmphasespeed
838, Conflict or overlap between righthandside and graddensityconstztopofedge
841, Loop not vectorized: data dependency
844, Conflict or overlap between tridiaga and cgmphasespeed
846, Conflict or overlap between tridiagb and cgmphasespeed
848, Conflict or overlap between tridiagc and cgmphasespeed
850, Conflict or overlap between righthandside and graddensityconstztopofedge
857, Conflict or overlap between tridiaga and cgmphasespeed
859, Conflict or overlap between tridiagb and cgmphasespeed
861, Conflict or overlap between righthandside and graddensityconstztopofedge
867, Possible copy in and copy out of gmstreamfunctopofedge in call to tridiagonal_solve
Possible copy in and copy out of righthandside in call to tridiagonal_solve
Possible copy in and copy out of tridiagc in call to tridiagonal_solve
Possible copy in and copy out of tridiagb in call to tridiagonal_solve
Possible copy in and copy out of tridiaga in call to tridiagonal_solve
880, Loop not vectorized: data dependency
881, Conflict or overlap between normalgmbolusvelocity and gmstreamfunctopofedge
895, Loop not vectorized: non-stride-1 array reference
Loop unrolled 2 times
902, Loop not vectorized: data dependency
914, Loop not vectorized/parallelized: contains call
915, Conflict or overlap between gmstreamfunctopofcell and areacell
Loop not vectorized: non-stride-1 array reference
Loop unrolled 4 times
1022, Generating copyin(nvertlevels) [if not already present]
1025, Loop not vectorized/parallelized: too deeply nested
1035, Generating Tesla code
1036, !$acc loop gang ! blockidx%x
1038, !$acc loop vector(128) ! threadidx%x
1035, Generating implicit copyout(normalgmbolusvelocity(:,1:nedges),gradzmidedge(:,1:nedges),graddensityedge(:,1:nedges)) [if not already present]
1038, Loop is parallelizable
1051, Loop not vectorized: data dependency
1066, Loop not vectorized: data dependency
1089, Loop not vectorized: data dependency
1106, Generating Tesla code
1107, !$acc loop gang ! blockidx%x
1109, !$acc loop vector(128) ! threadidx%x
1106, Generating implicit copyin(maxleveledgetop(1:nedges)) [if not already present]
Generating implicit copy(ddensitydztopofedge(:,1:nedges)) [if not already present]
Generating implicit copyin(cellsonedge(1:2,1:nedges),ddensitydztopofcell(:,:)) [if not already present]
1109, Loop is parallelizable
1112, Conflict or overlap between ddensitydztopofedge and ddensitydztopofcell
1135, Loop not vectorized: data dependency
1136, Conflict or overlap between graddensityedge and density
1137, Conflict or overlap between gradzmidedge and zmid
1149, Loop not vectorized: data dependency
1153, Conflict or overlap between graddensitytopofedge and graddensityedge
1154, Conflict or overlap between gradzmidtopofedge and gradzmidedge
1160, Conflict or overlap between graddensitytopofedge and graddensityedge
1161, Conflict or overlap between graddensitytopofedge and graddensityedge
1162, Conflict or overlap between gradzmidtopofedge and gradzmidedge
1163, Conflict or overlap between gradzmidtopofedge and gradzmidedge
1177, Generating Tesla code
1178, !$acc loop gang ! blockidx%x
1181, !$acc loop vector(128) ! threadidx%x
1177, Generating implicit copyin(maxleveledgetop(1:nedges),gradzmidtopofedge(:,1:nedges),ddensitydztopofedge(:,1:nedges),graddensitytopofedge(:,1:nedges)) [if not already present]
Generating implicit copy(graddensityconstztopofedge(:,1:nedges)) [if not already present]
1181, Loop is parallelizable
1182, Conflict or overlap between graddensityconstztopofedge and graddensitytopofedge
1202, Generating Tesla code
1203, !$acc loop gang ! blockidx%x
1204, !$acc loop vector(128) ! threadidx%x
1208, !$acc loop vector(128) ! threadidx%x
1202, Generating implicit copyin(graddensitytopofedge(:,1:nedges)) [if not already present]
Generating implicit copy(relativeslopetopofedge(:,1:nedges)) [if not already present]
Generating implicit copyin(maxleveledgetop(1:nedges),ddensitydztopofedge(:,1:nedges)) [if not already present]
1204, Loop is parallelizable
1208, Loop is parallelizable
1209, Conflict or overlap between relativeslopetopofedge and graddensitytopofedge
1231, Generating Tesla code
1232, !$acc loop gang ! blockidx%x
1234, !$acc loop vector(128) ! threadidx%x
1231, Generating implicit copy(relativeslopetopofedge(:,1:nedges)) [if not already present]
1234, Loop is parallelizable
1247, Generating Tesla code
1248, !$acc loop gang ! blockidx%x
1249, !$acc loop vector(128) ! threadidx%x
1250, !$acc loop seq
1256, !$acc loop vector(128) ! threadidx%x
1247, Generating implicit copyin(maxleveledgetop(:),edgesoncell(:,1:ncells),dvedge(:),dcedge(:)) [if not already present]
Generating implicit copy(areacellsum(:,1:ncells)) [if not already present]
Generating implicit copyin(relativeslopetopofedge(:,:)) [if not already present]
Generating implicit copy(relativeslopetopofcell(:,1:ncells)) [if not already present]
Generating implicit copyin(nedgesoncell(1:ncells)) [if not already present]
1249, Loop is parallelizable
1250, Complex loop carried dependence of edgesoncell,dvedge,dcedge,maxleveledgetop,relativeslopetopofedge,relativeslopetopofcell prevents parallelization
Loop carried reuse of relativeslopetopofcell,areacellsum prevents parallelization
Complex loop carried dependence of areacellsum prevents parallelization
1251, Accelerator restriction: induction variable live-out from loop: i
1256, Loop is parallelizable
1261, Accelerator restriction: induction variable live-out from loop: i
1269, Generating Tesla code
1270, !$acc loop gang ! blockidx%x
1272, !$acc loop vector(128) ! threadidx%x
1269, Generating implicit copyin(areacellsum(:,1:ncells)) [if not already present]
Generating implicit copy(relativeslopetopofcell(:,1:ncells)) [if not already present]
Generating implicit copyin(maxlevelcell(1:ncells)) [if not already present]
1272, Loop is parallelizable
1286, Loop not vectorized: non-stride-1 array reference
Loop unrolled 2 times
1291, Conflict or overlap between relativeslopetaperingcell and relativeslopetopofcell
Loop not vectorized: non-stride-1 array reference
Loop unrolled 4 times
1297, Loop not vectorized/parallelized: contains call
1299, Loop not vectorized: non-stride-1 array reference
Loop unrolled 2 times
1300, Loop not vectorized: data dependency
1305, Conflict or overlap between relativeslopetopofcell and relativeslopetaperingcell
Loop not vectorized: non-stride-1 array reference
Loop unrolled 4 times
1314, Loop not vectorized: data dependency
1315, Conflict or overlap between relativeslopetaperingcell and config_max_relative_slope
1327, Loop not vectorized: data dependency
1344, Loop not vectorized: data dependency
1356, Generating Tesla code
1357, !$acc loop gang ! blockidx%x
1358, !$acc loop vector(128) ! threadidx%x
1360, !$acc loop vector(128) ! threadidx%x
1356, Generating implicit copy(k33(:,1:ncells)) [if not already present]
Generating implicit copyin(relativeslopetopofcell(:,1:ncells),relativeslopetaperingcell(:,1:ncells),maxlevelcell(1:ncells)) [if not already present]
1358, Loop is parallelizable
1360, Loop is parallelizable
1361, Conflict or overlap between k33 and relativeslopetopofcell
1371, Generating Tesla code
1372, !$acc loop gang ! blockidx%x
1376, !$acc loop vector(128) ! threadidx%x
1371, Generating implicit copyin(cellsonedge(1:2,1:nedges),relativeslopetaperingcell(:,:)) [if not already present]
Generating implicit copy(relativeslopetapering(:,1:nedges)) [if not already present]
Generating implicit copyin(maxleveledgetop(1:nedges)) [if not already present]
1376, Loop is parallelizable
1377, Conflict or overlap between relativeslopetapering and relativeslopetaperingcell
1387, Generating Tesla code
1388, !$acc loop gang ! blockidx%x
1389, !$acc loop vector(128) ! threadidx%x
1387, Generating implicit copyout(k33(:,1:ncells)) [if not already present]
1389, Loop is parallelizable
1410, Loop not vectorized: mixed data types
1421, Conflict or overlap between cgmphasespeed and config_gm_min_phase_speed
1428, Loop not vectorized: data dependency
1429, Conflict or overlap between cgmphasespeed and config_gravwavespeed_trunc
1436, Conflict or overlap between kappagm3d and gmboluskappa
Memory set idiom, loop replaced by call to __c_mset8
Loop not vectorized: non-stride-1 array reference
Loop unrolled 4 times
1448, Loop not vectorized: mixed data types
1456, Loop not vectorized: data dependency
1460, Conflict or overlap between kappagm3d and gmboluskappa
1470, Loop not vectorized/parallelized: contains call
1474, Loop not vectorized: non-stride-1 array reference
Loop unrolled 2 times
1482, Conflict or overlap between tridiagb and cgmphasespeed
1484, Conflict or overlap between tridiagc and cgmphasespeed
1486, Conflict or overlap between righthandside and graddensityconstztopofedge
1489, Loop not vectorized: data dependency
1492, Conflict or overlap between tridiaga and cgmphasespeed
1494, Conflict or overlap between tridiagb and cgmphasespeed
1496, Conflict or overlap between tridiagc and cgmphasespeed
1498, Conflict or overlap between righthandside and graddensityconstztopofedge
1505, Conflict or overlap between tridiaga and cgmphasespeed
1507, Conflict or overlap between tridiagb and cgmphasespeed
1509, Conflict or overlap between righthandside and graddensityconstztopofedge
1515, Possible copy in and copy out of gmstreamfunctopofedge in call to tridiagonal_solve
Possible copy in and copy out of righthandside in call to tridiagonal_solve
Possible copy in and copy out of tridiagc in call to tridiagonal_solve
Possible copy in and copy out of tridiagb in call to tridiagonal_solve
Possible copy in and copy out of tridiaga in call to tridiagonal_solve
1524, Generating Tesla code
1525, !$acc loop gang ! blockidx%x
1527, !$acc loop vector(128) ! threadidx%x
1524, Generating implicit copyin(layerthicknessedge(:,1:nedges),gmstreamfunctopofedge(:,1:nedges)) [if not already present]
Generating implicit copy(normalgmbolusvelocity(:,1:nedges)) [if not already present]
Generating implicit copyin(maxleveledgetop(1:nedges)) [if not already present]
1527, Loop is parallelizable
1528, Conflict or overlap between normalgmbolusvelocity and gmstreamfunctopofedge
1540, Loop not vectorized: non-stride-1 array reference
Loop unrolled 2 times
1546, Loop not vectorized: data dependency
1555, Loop not vectorized/parallelized: contains call
1556, Conflict or overlap between gmstreamfunctopofcell and areacell
Loop not vectorized: non-stride-1 array reference
Loop unrolled 4 times
kr_kgen_ocn_gm_compute_bolus_velocity_subp0:
1613, sum reduction inlined
1615, sum reduction inlined
kr_kgen_ocn_gm_compute_bolus_velocity_subp1:
1641, sum reduction inlined
1643, sum reduction inlined
kr_kgen_ocn_gm_compute_bolus_velocity_subp2:
1669, sum reduction inlined
Loop unrolled 1 times
1671, sum reduction inlined
Loop unrolled 1 times
kr_kgen_ocn_gm_compute_bolus_velocity_subp3:
1699, sum reduction inlined
Loop unrolled 1 times
1701, sum reduction inlined
Loop unrolled 1 times
kv_kgen_ocn_gm_compute_bolus_velocity_subp0:
1744, all reduction inlined
Loop not vectorized: non-stride-1 array reference
Loop not vectorized: may not be beneficial
Loop unrolled 2 times
1756, Memory set idiom, loop replaced by call to __c_mset4
Loop not vectorized: non-stride-1 array reference
Loop not vectorized: may not be beneficial
Loop unrolled 4 times
1759, Generated vector simd code for the loop
1763, sum reduction inlined
Generated vector simd code for the loop containing reductions
1764, sum reduction inlined
Generated vector simd code for the loop containing reductions
1786, count reduction inlined
Loop not vectorized: non-stride-1 array reference
Loop not vectorized: may not be beneficial
Loop unrolled 1 times
1787, sum reduction inlined
Loop not vectorized: non-stride-1 array reference
Loop unrolled 2 times
1788, sum reduction inlined
Loop not vectorized: non-stride-1 array reference
Loop unrolled 2 times
1797, count reduction inlined
Loop not vectorized: non-stride-1 array reference
Loop not vectorized: may not be beneficial
Loop unrolled 1 times
1798, sum reduction inlined
Loop not vectorized: non-stride-1 array reference
Loop unrolled 2 times
1799, sum reduction inlined
Loop not vectorized: non-stride-1 array reference
Loop unrolled 2 times
1808, count reduction inlined
Loop not vectorized: non-stride-1 array reference
Loop not vectorized: may not be beneficial
Loop unrolled 1 times
1809, sum reduction inlined
Loop not vectorized: non-stride-1 array reference
Loop unrolled 2 times
1810, sum reduction inlined
Loop not vectorized: non-stride-1 array reference
Loop unrolled 2 times
kv_kgen_ocn_gm_compute_bolus_velocity_subp1:
1837, all reduction inlined
Loop not vectorized: non-stride-1 array reference
Loop not vectorized: may not be beneficial
Loop unrolled 2 times
1849, Memory set idiom, loop replaced by call to __c_mset4
Loop not vectorized: non-stride-1 array reference
Loop not vectorized: may not be beneficial
Loop unrolled 4 times
1852, Generated vector simd code for the loop
1856, sum reduction inlined
Generated vector simd code for the loop containing reductions
1857, sum reduction inlined
Generated vector simd code for the loop containing reductions
1879, count reduction inlined
Loop not vectorized: non-stride-1 array reference
Loop not vectorized: may not be beneficial
Loop unrolled 1 times
1880, sum reduction inlined
Loop not vectorized: non-stride-1 array reference
Loop unrolled 2 times
1881, sum reduction inlined
Loop not vectorized: non-stride-1 array reference
Loop unrolled 2 times
1890, count reduction inlined
Loop not vectorized: non-stride-1 array reference
Loop not vectorized: may not be beneficial
Loop unrolled 1 times
1891, sum reduction inlined
Loop not vectorized: non-stride-1 array reference
Loop unrolled 2 times
1892, sum reduction inlined
Loop not vectorized: non-stride-1 array reference
Loop unrolled 2 times
1901, count reduction inlined
Loop not vectorized: non-stride-1 array reference
Loop not vectorized: may not be beneficial
Loop unrolled 1 times
1902, sum reduction inlined
Loop not vectorized: non-stride-1 array reference
Loop unrolled 2 times
1903, sum reduction inlined
Loop not vectorized: non-stride-1 array reference
Loop unrolled 2 times
kv_kgen_ocn_gm_compute_bolus_velocity_subp2:
1930, all reduction inlined
Loop unrolled 2 times
1942, Memory set idiom, loop replaced by call to __c_mset4
Loop not vectorized: may not be beneficial
Loop unrolled 2 times
1945, Generated vector simd code for the loop
1949, sum reduction inlined
Generated vector simd code for the loop containing reductions
1950, sum reduction inlined
Generated vector simd code for the loop containing reductions
1972, count reduction inlined
Loop unrolled 2 times
1973, sum reduction inlined
Loop unrolled 1 times
1974, sum reduction inlined
Loop unrolled 1 times
1983, count reduction inlined
Loop unrolled 2 times
1984, sum reduction inlined
Loop unrolled 1 times
1985, sum reduction inlined
Loop unrolled 1 times
1994, count reduction inlined
Loop unrolled 2 times
1995, sum reduction inlined
Loop unrolled 1 times
1996, sum reduction inlined
Loop unrolled 1 times
tridiagonal_solve:
2189, Loop not vectorized: data dependency
FMA (fused multiply-add) instruction(s) generated
2197, Loop not vectorized: data dependency
Loop unrolled 2 times
ptxas warning : Conflicting options --device-debug and --generate-line-info specified, ignoring --generate-line-info option
Timing stats:
init 416 millisecs 25%
import 383 millisecs 23%
expand 67 millisecs 4%
vectorize 315 millisecs 19%
optimize 67 millisecs 4%
schedule 335 millisecs 20%
unroll 33 millisecs 2%
Total time 1616 millisecs
pgfortran -gopt -i4 -time -Mstack_arrays -Mextend -byteswapio -Mflushz -Kieee -Mallocatable=03 -O2 -Mpreprocess -ta=tesla:debug,lineinfo -Minfo=all,ccff -DKGEN_STATEFILES='"kgen_statefile.lst"' -c -o kernel_driver.o kernel_driver.F90
kernel_driver:
64, Loop not vectorized/parallelized: contains call
Timing stats:
vectorize 16 millisecs 100%
Total time 16 millisecs
pgfortran -gopt -i4 -time -Mstack_arrays -Mextend -byteswapio -Mflushz -Kieee -Mallocatable=03 -O2 -Mpreprocess -ta=tesla:debug,lineinfo -Minfo=all,ccff -L/sw/summit/cuda/10.1.243/lib64 -lnvToolsExt -o kernel.exe mpas_ocn_gm.o mpas_ocn_constants.o mpas_constants.o mpas_kind_types.o kernel_driver.o kgen_utils.o tprof_mod.o
./kernel.exe
***************** Verification against 'ocn_gm_compute_Bolus_velocity.1.0.1.dat' *****************
ocn_gm_compute_Bolus_velocity : Time per call (usec):
38347.90000000000
****************************************************
kernel execution summary: ocn_gm_compute_Bolus_velocity
****************************************************
number of processes 1
Average call time (usec): 0.383E+05
Minimum call time (usec): 0.383E+05
Maximum call time (usec): 0.383E+05
****************************************************
......@@ -138,14 +138,14 @@
IF (kgen_case_count == 0) THEN
WRITE (*, *) "No data file is verified."
ELSE
WRITE (*, "(4X, A36, A1, I6)") "Total number of verification cases ", ":", kgen_case_count
WRITE (*, "(4X, A36, A1, I6)") "Number of verification-passed cases ", ":", kgen_count_verified
WRITE (*, *) ""
IF (kgen_case_count == kgen_count_verified) THEN
WRITE (*, "(4X,A)") "kernel: ocn_gm_compute_Bolus_velocity: PASSED verification"
ELSE
WRITE (*, "(4X,A)") "kernel: ocn_gm_compute_Bolus_velocity: FAILED verification"
END IF
! WRITE (*, "(4X, A36, A1, I6)") "Total number of verification cases ", ":", kgen_case_count
! WRITE (*, "(4X, A36, A1, I6)") "Number of verification-passed cases ", ":", kgen_count_verified
! WRITE (*, *) ""
! IF (kgen_case_count == kgen_count_verified) THEN
! WRITE (*, "(4X,A)") "kernel: ocn_gm_compute_Bolus_velocity: PASSED verification"
! ELSE
! WRITE (*, "(4X,A)") "kernel: ocn_gm_compute_Bolus_velocity: FAILED verification"
! END IF
WRITE (*, *) ""
WRITE (*, "(4X,A19,I3)") "number of processes: ", mpisize
WRITE (*, *) ""
......
......@@ -14,7 +14,6 @@
!
module ocn_gm
!use nvtx
USE mpas_constants
USE ocn_constants
......@@ -187,7 +186,7 @@ SUBROUTINE ocn_gm_compute_bolus_velocity(kgen_unit, kgen_measure, kgen_isverifie
REAL(KIND=rkind) :: kgenref_maxn
INTEGER :: kgenref_ncells
INTEGER :: kgenref_nedges
!parent block preprocessing
#ifdef _MPI
......@@ -321,8 +320,6 @@ SUBROUTINE ocn_gm_compute_bolus_velocity(kgen_unit, kgen_measure, kgen_isverifie
!$kgen begin_callsite ocn_gm_compute_Bolus_velocity
!call nvtxStartRange("First label")
IF (kgen_evalstage) THEN
END IF
......@@ -337,7 +334,6 @@ SUBROUTINE ocn_gm_compute_bolus_velocity(kgen_unit, kgen_measure, kgen_isverifie
!call to kgen kernel
!$acc data copyin(nvertlevels)
nCells = nCellsArray( size(nCellsArray) )
nEdges = nEdgesArray( size(nEdgesArray) )
......@@ -347,22 +343,22 @@ SUBROUTINE ocn_gm_compute_bolus_velocity(kgen_unit, kgen_measure, kgen_isverifie
!$omp do schedule(runtime) private(k)
!print *, "NEDGES", nEdges
!print *, "NVERTLEVELS", nvertlevels
!$acc parallel loop gang
!!$acc parallel loop gang
do iEdge = 1, nEdges
!$acc loop vector
!!$acc loop vector
do k = 1, nVertLevels
gradDensityEdge(k, iEdge) = huge(0D0)
gradZMidEdge(k, iEdge) = huge(0D0)
normalGMBolusVelocity(k, iEdge) = 0.0_RKIND
end do
end do
!$acc end parallel
!!$acc end parallel
!$omp end do
!$omp do schedule(runtime) private(k)
!$acc parallel loop gang
!!$acc parallel loop gang
do iEdge = 1, nEdges
!$acc loop vector
!!$acc loop vector
do k = 1, nVertLevels + 1
gradDensityTopOfEdge(k, iEdge) = huge(0D0)
dDensityDzTopOfEdge(k, iEdge) = huge(0D0)
......@@ -371,13 +367,13 @@ SUBROUTINE ocn_gm_compute_bolus_velocity(kgen_unit, kgen_measure, kgen_isverifie
relativeSlopeTapering(k, iEdge) = 0.0_RKIND
end do
end do
!$acc end parallel
!!$acc end parallel
!$omp end do
!$omp do schedule(runtime) private(k)
!$acc parallel loop gang
!!$acc parallel loop gang
do iCell = 1, nCells + 1
!$acc loop vector
!!$acc loop vector
do k = 1, nVertLevels
dDensityDzTopOfCell(k, iCell) = huge(0D0)
k33(k, iCell) = 0.0_RKIND
......@@ -385,7 +381,7 @@ SUBROUTINE ocn_gm_compute_bolus_velocity(kgen_unit, kgen_measure, kgen_isverifie
relativeSlopeTaperingCell(k, iCell) = 0.0_RKIND
end do
end do
!$acc end parallel
!!$acc end parallel
!$omp end do
!--------------------------------------------------------------------
......@@ -422,16 +418,16 @@ SUBROUTINE ocn_gm_compute_bolus_velocity(kgen_unit, kgen_measure, kgen_isverifie