Commit b382ba2f authored by Budiardja, Reuben
Browse files

Updated Reconstruction_Kernel test case; reproduced slowness issue with offload under CCE

parent 4a3f3b74
# Makefile for the Reconstruction_Kernel_Test driver.
#
# GENASIS_MACHINE selects the Fortran compiler and its OpenMP (offload) flags.
# It is also appended to the executable name, so builds for different
# machines can coexist in one directory.
#
# NOTE(review): when GENASIS_MACHINE matches neither branch below, FC keeps
# whatever value make or the command line provides -- confirm that is intended.

.PHONY: Reconstruction_Kernel_Test clean clear

ifeq ($(strip $(GENASIS_MACHINE)),Cray_CCE)
FC := ftn -fopenmp
endif
ifeq ($(strip $(GENASIS_MACHINE)),POWER_XL)
FC := xlf2008_r -qsmp=omp -qoffload
endif

EXECUTABLE := Reconstruction_Kernel_Test_$(GENASIS_MACHINE)
SOURCES := Reconstruction_Kernel_Test.f90 Reconstruction_Kernel.f90

# Convenience name (and default goal); the real file target is $(EXECUTABLE),
# so an up-to-date executable is not rebuilt on every invocation.
Reconstruction_Kernel_Test: $(EXECUTABLE)

# Both sources are compiled in one recipe, in the original order, so the
# order is preserved even under `make -j`.
# NOTE(review): the order presumably matters for Fortran module files --
# verify before splitting this into per-object rules.
$(EXECUTABLE): $(SOURCES)
	$(FC) -c Reconstruction_Kernel_Test.f90
	$(FC) -c Reconstruction_Kernel.f90
	$(FC) Reconstruction_Kernel_Test.o Reconstruction_Kernel.o \
	  -o $@

# Remove compiler by-products (module files, objects, *.acc.* files).
clean:
	rm -f *.mod *.o *.acc.*

# Legacy alias for `clean`.
clear: clean
...@@ -10,7 +10,6 @@ contains ...@@ -10,7 +10,6 @@ contains
integer ( KDI ) :: & integer ( KDI ) :: &
iS, & iS, &
iF, & iF, &
iF_R, &
iV, jV, kV iV, jV, kV
integer ( KDI ), dimension ( 3 ) :: & integer ( KDI ), dimension ( 3 ) :: &
iaS, & iaS, &
...@@ -50,19 +49,18 @@ contains ...@@ -50,19 +49,18 @@ contains
if ( UseDevice ) then if ( UseDevice ) then
!$OMP target teams distribute parallel do simd collapse ( 4 ) & !$OMP target teams distribute parallel do collapse ( 4 ) &
!$OMP schedule ( static, 1 ) & !$OMP schedule ( static, 1 ) &
!$OMP private ( iF, iF_R, iaVP, iaVM, fM, fC, fP, fI, fO ) & !$OMP private ( iF, iaVP, iaVM, fM, fC, fP, fI, fO ) &
!$OMP private ( xAM, xAC, xAP, x2AM, x2AC, x2AP, xI, xO, xE ) & !$OMP private ( xAM, xAC, xAP, x2AM, x2AC, x2AP, xI, xO, xE ) &
!$OMP private ( c0, c1, c2, c2_S, d ) & !$OMP private ( c0, c1, c2, c2_S, d ) &
!$OMP firstprivate ( SqrtTiny ) !$OMP firstprivate ( SqrtTiny, iaS )
do iS = 1, size ( iaSlctd ) do iS = 1, size ( iaSlctd )
do kV = lV ( 3 ), uV ( 3 ) do kV = lV ( 3 ), uV ( 3 )
do jV = lV ( 2 ), uV ( 2 ) do jV = lV ( 2 ), uV ( 2 )
do iV = lV ( 1 ), uV ( 1 ) do iV = lV ( 1 ), uV ( 1 )
iF = iaSlctd ( iS ) iF = iaSlctd ( iS )
iF_R = iaSlctd_R ( iS )
iaVP = [ iV, jV, kV ] + iaS iaVP = [ iV, jV, kV ] + iaS
iaVM = [ iV, jV, kV ] - iaS iaVM = [ iV, jV, kV ] - iaS
...@@ -75,9 +73,9 @@ contains ...@@ -75,9 +73,9 @@ contains
xAC = XA ( iV, jV, kV ) xAC = XA ( iV, jV, kV )
xAP = XA ( iaVP ( 1 ), iaVP ( 2 ), iaVP ( 3 ) ) xAP = XA ( iaVP ( 1 ), iaVP ( 2 ), iaVP ( 3 ) )
x2AM = X2A ( iaVM ( 1 ), iaVM ( 2 ), iaVM ( 3 ) ) x2AM = XA ( iaVM ( 1 ), iaVM ( 2 ), iaVM ( 3 ) ) ** 2
x2AC = X2A ( iV, jV, kV ) x2AC = X ( iV, jV, kV ) ** 2
x2AP = X2A ( iaVP ( 1 ), iaVP ( 2 ), iaVP ( 3 ) ) x2AP = X ( iaVP ( 1 ), iaVP ( 2 ), iaVP ( 3 ) ) ** 2
xI = X ( iV, jV, kV ) - 0.5 * dX ( iV, jV, kV ) xI = X ( iV, jV, kV ) - 0.5 * dX ( iV, jV, kV )
xO = X ( iV, jV, kV ) + 0.5 * dX ( iV, jV, kV ) xO = X ( iV, jV, kV ) + 0.5 * dX ( iV, jV, kV )
...@@ -217,10 +215,10 @@ contains ...@@ -217,10 +215,10 @@ contains
end if !-- Local extremum end if !-- Local extremum
F_IR ( iV, jV, kV, iF_R ) & F_IR ( iV, jV, kV, iS ) &
= c0 + c1 * xI + c2 * xI**2 = c0 + c1 * xI + c2 * xI**2
F_IL ( iaVP ( 1 ), iaVP ( 2 ), iaVP ( 3 ), iF_R ) & F_IL ( iaVP ( 1 ), iaVP ( 2 ), iaVP ( 3 ), iS ) &
= c0 + c1 * xO + c2 * xO**2 = c0 + c1 * xO + c2 * xO**2
!call Show ( '>>> Final values' ) !call Show ( '>>> Final values' )
...@@ -231,23 +229,22 @@ contains ...@@ -231,23 +229,22 @@ contains
end do !-- jV end do !-- jV
end do !-- kV end do !-- kV
end do !-- iS end do !-- iS
!$OMP end target teams distribute parallel do simd !$OMP end target teams distribute parallel do
else !-- use host else !-- use host
!$OMP parallel do collapse ( 4 ) & !$OMP parallel do collapse ( 4 ) &
!$OMP schedule ( runtime ) & !$OMP schedule ( runtime ) &
!$OMP private ( iF, iF_R, iaVP, iaVM, fM, fC, fP, fI, fO ) & !$OMP private ( iF, iaVP, iaVM, fM, fC, fP, fI, fO ) &
!$OMP private ( xAM, xAC, xAP, x2AM, x2AC, x2AP, xI, xO, xE ) & !$OMP private ( xAM, xAC, xAP, x2AM, x2AC, x2AP, xI, xO, xE ) &
!$OMP private ( c0, c1, c2, c2_S, d ) & !$OMP private ( c0, c1, c2, c2_S, d ) &
!$OMP firstprivate ( SqrtTiny ) !$OMP firstprivate ( SqrtTiny, iaS )
do iS = 1, size ( iaSlctd ) do iS = 1, size ( iaSlctd )
do kV = lV ( 3 ), uV ( 3 ) do kV = lV ( 3 ), uV ( 3 )
do jV = lV ( 2 ), uV ( 2 ) do jV = lV ( 2 ), uV ( 2 )
do iV = lV ( 1 ), uV ( 1 ) do iV = lV ( 1 ), uV ( 1 )
iF = iaSlctd ( iS ) iF = iaSlctd ( iS )
iF_R = iaSlctd_R ( iS )
iaVP = [ iV, jV, kV ] + iaS iaVP = [ iV, jV, kV ] + iaS
iaVM = [ iV, jV, kV ] - iaS iaVM = [ iV, jV, kV ] - iaS
...@@ -402,10 +399,10 @@ contains ...@@ -402,10 +399,10 @@ contains
end if !-- Local extremum end if !-- Local extremum
F_IR ( iV, jV, kV, iF_R ) & F_IR ( iV, jV, kV, iS ) &
= c0 + c1 * xI + c2 * xI**2 = c0 + c1 * xI + c2 * xI**2
F_IL ( iaVP ( 1 ), iaVP ( 2 ), iaVP ( 3 ), iF_R ) & F_IL ( iaVP ( 1 ), iaVP ( 2 ), iaVP ( 3 ), iS ) &
= c0 + c1 * xO + c2 * xO**2 = c0 + c1 * xO + c2 * xO**2
!call Show ( '>>> Final values' ) !call Show ( '>>> Final values' )
......
...@@ -7,8 +7,19 @@ module Reconstruction_Form ...@@ -7,8 +7,19 @@ module Reconstruction_Form
KDR = kind ( 1.0d0 ), & KDR = kind ( 1.0d0 ), &
KDL = kind ( .true. ) KDL = kind ( .true. )
integer, parameter :: & !-- Change N_CELLS to adjust problem size,
N_CELLS = 8, & ! N_COMPUTE to adjust the number of time the kernel is run.
!-- A 'typical' run would have N_CELLS = 128 or 256, and
! 6 reconstruction kernels per time step ( 1 per dim x 2 for 2nd order),
! with a total of 1 million time steps (e.g., a typical supernova
! simulation).
integer ( KDI ), parameter :: &
N_CELLS = 64, &
N_COMPUTES = 10
integer ( KDI ), parameter :: &
N_FIELDS = 2, & N_FIELDS = 2, &
N_GHOSTS = 2 N_GHOSTS = 2
...@@ -65,6 +76,7 @@ module Reconstruction_Form ...@@ -65,6 +76,7 @@ module Reconstruction_Form
contains contains
subroutine Initialize ( R ) subroutine Initialize ( R )
class ( ReconstructionForm ), intent ( inout ) :: & class ( ReconstructionForm ), intent ( inout ) :: &
...@@ -74,6 +86,8 @@ contains ...@@ -74,6 +86,8 @@ contains
iC, & iC, &
nWavelength nWavelength
real ( KDR ) :: & real ( KDR ) :: &
Pi, &
TwoPi, &
Offset, & Offset, &
Amplitude Amplitude
real ( KDR ), dimension ( 3 ) :: & real ( KDR ), dimension ( 3 ) :: &
...@@ -100,24 +114,25 @@ contains ...@@ -100,24 +114,25 @@ contains
associate ( & associate ( &
dX => R % Width ( 1, 1, 1 ), & dX => R % Width ( 1, 1, 1 ), &
X => R % Center, & X => R % Center )
Pi => acos ( -1.0_KDR ), &
TwoPi => 2.0_KDR * acos ( -1.0_KDR ) ) Pi = acos ( -1.0_KDR )
TwoPi = 2.0_KDR * acos ( -1.0_KDR )
X = spread ( spread ( [ ( ( dX / 2 ) + ( dX * ( iC - 3 ) ), & X = spread ( spread ( [ ( ( dX / 2 ) + ( dX * ( iC - 3 ) ), &
iC = 1, N_CELLS + 4 ) ], & iC = 1, N_CELLS + 4 ) ], &
dim = 2, ncopies = size ( X, dim = 2 ) ), & dim = 2, ncopies = size ( X, dim = 2 ) ), &
dim = 3, ncopies = size ( R % Center, dim = 3 ) ) dim = 3, ncopies = size ( R % Center, dim = 3 ) )
associate & associate &
( Y => reshape ( R % Center, shape ( X ), order = [ 2, 3, 1 ] ), & ( Y => reshape ( R % Center, shape ( X ), order = [ 2, 3, 1 ] ), &
Z => reshape ( R % Center, shape ( X ), order = [ 3, 2, 1 ] ), & Z => reshape ( R % Center, shape ( X ), order = [ 3, 2, 1 ] ), &
F1 => R % Field ( :, :, :, 1 ), & F1 => R % Field ( :, :, :, 1 ), &
F2 => R % Field ( :, :, :, 2 ) ) F2 => R % Field ( :, :, :, 2 ) )
call Show_3D_R ( X, 'Center_X' ) !call Show_3D_R ( X, 'Center_X' )
call Show_3D_R ( Y, 'Center_Y' ) !call Show_3D_R ( Y, 'Center_Y' )
call Show_3D_R ( Z, 'Center_Z' ) !call Show_3D_R ( Z, 'Center_Z' )
!-- Set fields !-- Set fields
K = 1.0_KDR K = 1.0_KDR
...@@ -130,8 +145,8 @@ contains ...@@ -130,8 +145,8 @@ contains
+ Amplitude * 2.0_KDR & + Amplitude * 2.0_KDR &
* sin ( Pi * ( K ( 1 ) * X + K ( 2 ) * Y + K ( 3 ) * Z ) ) * sin ( Pi * ( K ( 1 ) * X + K ( 2 ) * Y + K ( 3 ) * Z ) )
call Show_3D_R ( F1, 'SetWave' ) !call Show_3D_R ( F1, 'SetWave' )
call Show_3D_R ( F2, 'SetWave_2' ) !call Show_3D_R ( F2, 'SetWave_2' )
end associate !-- X, Y, Z end associate !-- X, Y, Z
...@@ -183,10 +198,12 @@ contains ...@@ -183,10 +198,12 @@ contains
end subroutine Show_3D_R end subroutine Show_3D_R
subroutine Compute ( R ) subroutine Compute ( R, UseDeviceOption )
class ( ReconstructionForm ), intent ( inout ) :: & class ( ReconstructionForm ), intent ( inout ) :: &
R R
logical ( KDL ), intent ( in ), optional :: &
UseDeviceOption
integer ( KDI ) :: & integer ( KDI ) :: &
iDimension iDimension
...@@ -196,7 +213,7 @@ contains ...@@ -196,7 +213,7 @@ contains
( R % Field, R % Center, R % Width, & ( R % Field, R % Center, R % Width, &
R % Average_1_U, R % Average_2_U, R % iaSelected, & R % Average_1_U, R % Average_2_U, R % iaSelected, &
R % iaSelected, iDimension, N_GHOSTS, & R % iaSelected, iDimension, N_GHOSTS, &
R % Field_IL, R % Field_IR, UseDeviceOption = .true. ) R % Field_IL, R % Field_IR, UseDeviceOption = UseDeviceOption )
end subroutine Compute end subroutine Compute
...@@ -206,14 +223,43 @@ end module Reconstruction_Form ...@@ -206,14 +223,43 @@ end module Reconstruction_Form
program Reconstruction_Form_Test

  !-- Driver that initializes a ReconstructionForm, then calls its Compute
  !   method N_COMPUTES times with UseDeviceOption = .true. and N_COMPUTES
  !   times with UseDeviceOption = .false., printing the wall-clock time
  !   (from OMP_GET_WTIME) spent in each loop.

  use OMP_LIB
  use Reconstruction_Form

  implicit none

  integer ( KDI ) :: &
    iC  !-- loop counter over kernel invocations
  real ( KDR ) :: &
    TimeStart, &
    TimeStop
  type ( ReconstructionForm ) :: &
    RF

  call RF % Initialize ( )

  !-- Timed loop with device offload requested

  TimeStart = OMP_GET_WTIME ( )
  do iC = 1, N_COMPUTES
    call RF % Compute ( UseDeviceOption = .true. )
  end do
  TimeStop = OMP_GET_WTIME ( )

  print '(a30, i5)', 'N_CELLS : ', N_CELLS
  print '(a30, i5)', 'N_COMPUTES : ', N_COMPUTES
  print*
  print '(a30, es15.6e3)', 'Kernel Offload Timing (s) : ', TimeStop - TimeStart
  print*

  !-- Timed loop with device offload not requested

  TimeStart = OMP_GET_WTIME ( )
  do iC = 1, N_COMPUTES
    call RF % Compute ( UseDeviceOption = .false. )
  end do
  TimeStop = OMP_GET_WTIME ( )

  print '(a30, i5)', 'CPU nThreads : ', OMP_GET_MAX_THREADS ( )
  print '(a30, es15.6e3)', 'Kernel CPU Timing (s) : ', TimeStop - TimeStart

end program Reconstruction_Form_Test
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment