Commit 8f621f8e authored by Melesse Vergara, Veronica

added single node tests for nuccor_kernels

parent 69ee0151
/*---------------------------------------------------------------------------*/
/*!
* \file get_wall_time.c
* \author Wayne Joubert
* \date Wed Nov 4 09:56:36 EST 2015
* \brief Fortran/C interface to gettimeofday function.
* \note Copyright (C) 2015 Oak Ridge National Laboratory, UT-Battelle, LLC.
*/
/*---------------------------------------------------------------------------*/
#include <stddef.h>
#include <sys/time.h>
/*---------------------------------------------------------------------------*/
void get_wall_time (double* time) {
  struct timeval tv;
  gettimeofday(&tv, NULL);
  *time = ((double)tv.tv_sec + (double)tv.tv_usec * 1.e-6);
}
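/*---Aliases covering the common Fortran external-name manglings
     (lowercase with/without trailing underscore, uppercase), so the
     timer resolves from Fortran regardless of compiler.*/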
void get_wall_time_ (double* time) {
  get_wall_time (time);
}
void GET_WALL_TIME (double* time) {
  get_wall_time (time);
}
void GET_WALL_TIME_ (double* time) {
  get_wall_time (time);
}
/*---------------------------------------------------------------------------*/
#!/bin/bash
#==============================================================================
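# Build the nuccor_dgemm single-node test executables: the Titan (Cray)
# variants are selected when $PE_ENV is set by the PrgEnv modules; the
# Crest variants are selected by the hostname check further down.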
if [ "$PE_ENV" == "PGI" ] ; then
#------------------------------
echo "Compiling with PGI compiler, C version ..."
EXECUTABLE=nuccor_dgemm_titan_c_pgi
module load cudatoolkit
module load acml
#module load cray-libsci
cc -c get_wall_time.c
cc -o $EXECUTABLE nuccor_dgemm.c get_wall_time.o \
$CRAY_CUDATOOLKIT_INCLUDE_OPTS \
-fast -acc -ta=nvidia:rdc -Minfo=accel \
-lcublas -lacml
#-lsci_pgi
module unload acml
#module unload cray-libsci
module unload cudatoolkit
rm -f *.o
#------------------------------
echo "Compiling with PGI compiler, Fortran version ..."
EXECUTABLE=nuccor_dgemm_titan_f_pgi
module load cudatoolkit
module load acml
module unload cray-libsci_acc
cc -c get_wall_time.c
ftn -o $EXECUTABLE nuccor_dgemm.F90 get_wall_time.o \
$CRAY_CUDATOOLKIT_INCLUDE_OPTS \
-fast -acc -ta=nvidia -Minfo=accel -Mcuda \
-lacml
module load cray-libsci_acc
module unload acml
module unload cudatoolkit
rm -f *.o
#------------------------------
echo "Compiling with Cray compiler, Fortran version ..."
EXECUTABLE=nuccor_dgemm_titan_f_cray
module swap PrgEnv-pgi PrgEnv-cray
module load craype-accel-nvidia35
cc -c get_wall_time.c
ftn -o $EXECUTABLE nuccor_dgemm.F90 get_wall_time.o \
-m 3 -O 2 -e m -O noomp
# -lsci_cray
module swap PrgEnv-cray PrgEnv-pgi
module unload craype-accel-nvidia35
rm -f *.o
#------------------------------
fi
if [ "$(uname -n | grep '^crest' | wc -l)" = 1 ] ; then
#------------------------------
echo "Compiling with PGI compiler, C version ..."
EXECUTABLE=nuccor_dgemm_crest_c_pgi
module load pgi
module load cuda
pgcc -c get_wall_time.c
pgcc -o $EXECUTABLE nuccor_dgemm.c get_wall_time.o \
-fast -acc -ta=nvidia:rdc -Minfo=accel \
-L/usr/local/cuda-7.5/lib64/ \
-lcublas_static -lcudart -lculibos -ldl -lpthread \
-L/sw/power8/lapack/3.5.0/ppc64le_gnu4.9.2/lib -lrefblas \
/sw/power8/lapack/3.5.0/ppc64le_gnu4.9.2/lib/librefblas.a \
/usr/lib/gcc/powerpc64le-linux-gnu/4.9/libgfortran.a
#------------------------------
#pgcc -o nuccor_dgemm nuccor_dgemm.c get_wall_time.o -lcublas -Mcuda
#pgf90 -o nuccor_dgemm nuccor_dgemm.F90 get_wall_time.o -O3
rm -f *.o
fi
if [ -x "$EXECUTABLE" ]
then
echo "Build completed successfully."
else
echo "Build failed!"
exit 1
fi
exit 0
#==============================================================================
!------------------------------------------------------------------------------
!
! \file nuccor_dgemm.F90
! \author Wayne Joubert
! \date Wed Nov 4 09:56:36 EST 2015
! \brief Perform DGEMMs to simulate those of a NUCCOR run.
! \note Copyright (C) 2015 Oak Ridge National Laboratory, UT-Battelle, LLC.
!
!------------------------------------------------------------------------------
program main
implicit none
!--------------------
interface
subroutine get_wall_time (time)
use, intrinsic :: iso_c_binding, only : c_double
implicit none
real(c_double) :: time
end subroutine
end interface
interface
subroutine matrices_create (a1, a2, a3, m, n)
implicit none
doubleprecision, dimension(:,:), allocatable :: a1, a2, a3
integer :: m, n
end subroutine
end interface
interface
subroutine matrices_destroy (a1, a2, a3)
implicit none
doubleprecision, dimension(:,:), allocatable :: a1, a2, a3
end subroutine
end interface
interface
subroutine compute_product_gpu (c, a1, a2, a3)
implicit none
doubleprecision, dimension(:,:), allocatable :: c, a1, a2, a3
end subroutine
end interface
interface
subroutine compute_product_cpu (c, a1, a2, a3)
implicit none
doubleprecision, dimension(:,:), allocatable :: c, a1, a2, a3
end subroutine
end interface
interface
subroutine compute_product_ref (c, a1, a2, a3)
implicit none
doubleprecision, dimension(:,:), allocatable :: c, a1, a2, a3
end subroutine
end interface
!--------------------
integer, dimension(:,:), allocatable :: dims
integer :: num_triples
integer :: i, m, n
doubleprecision, dimension(:,:), allocatable :: a1, a2, a3, c
doubleprecision, dimension(:,:), allocatable :: c_correct
doubleprecision :: time_1, time_2
doubleprecision :: time_this_gpu, time_this_cpu
doubleprecision :: time_total_gpu, time_total_cpu
doubleprecision :: diff
integer :: num_failures
!doubleprecision, external :: MPI_Wtime
!--------------------
num_triples = 0
i = 0; m = 0; n = 0
time_1 = 0; time_2 = 0
time_this_gpu = 0; time_this_cpu = 0
time_total_gpu = 0; time_total_cpu = 0
diff = 0
num_failures = 0
!---Read in matrix sizes for each matrix triple.
open (unit=12, file='sizes.txt', action='READ')
read (12, *) num_triples
allocate (dims(2, num_triples))
do i = 1, num_triples
read (12, *) dims(:,i)
enddo
close (unit=12)
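!---Illustrative sizes.txt layout (values are arbitrary), matching the
!---reads above: the first line is the number of triples, then one
!---"m n" pair per line, e.g.
!---    3
!---    1000  500
!---    2000 1000
!---    4000 2000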
print *, num_triples
!---Loop over matrix triples.
do i = 1, num_triples
m = dims(1, i)
n = dims(2, i)
call matrices_create (a1, a2, a3, m, n)
allocate (c(n, n))
!---Compute c = transpose(a1) * a2 * a3 on the GPU.
call get_wall_time(time_1)
call compute_product_gpu (c, a1, a2, a3)
call get_wall_time(time_2)
time_this_gpu = time_2 - time_1
time_total_gpu = time_total_gpu + time_this_gpu
!---Compute reference answer.
allocate (c_correct(n, n))
call get_wall_time(time_1)
!time_1 = MPI_Wtime()
call compute_product_cpu (c_correct, a1, a2, a3)
call get_wall_time(time_2)
!time_2 = MPI_Wtime()
time_this_cpu = time_2 - time_1
time_total_cpu = time_total_cpu + time_this_cpu
!---Perform check.
diff = sum( abs( c - c_correct ) )
if ( diff .ne. 0.d0 ) then
num_failures = num_failures + 1
endif
deallocate (c_correct)
write(*, '(3(1x, i5), '' tgpu '', f10.6, '' tcpu '', f10.6, ' &
// ' '' rat '', f5.2, '' dif '', e6.1)') &
i, m, n, time_this_gpu, time_this_cpu, &
time_this_cpu/time_this_gpu, diff
call matrices_destroy (a1, a2, a3)
deallocate (c)
enddo
write(*, '('' FINAL: num_triples '', i5, '' tgpu '', f9.3, ' // &
' '' tcpu '', f9.3, '' num_failures '', i5)') &
num_triples, time_total_gpu, time_total_cpu, num_failures
deallocate (dims)
end
!-----------------------------------------------------------------------------
subroutine compute_product_gpu (c, a1, a2, a3)
#ifdef __PGI
use cublas
#endif
implicit none
doubleprecision, dimension(:,:), allocatable :: c, a1, a2, a3
integer, parameter :: COMPUTE_METHOD_CPU = 1
integer, parameter :: COMPUTE_METHOD_GPU_SYNC = 2
integer, parameter :: COMPUTE_METHOD_GPU_ASYNC = 3
integer, parameter :: COMPUTE_METHOD_GPU_PANELS = 4
doubleprecision, parameter :: alpha = 1, beta = 0
doubleprecision, dimension(:,:), allocatable :: b
integer :: m, n, i, j, k
integer :: m_, n_, k_
integer :: compute_method_requested
integer :: compute_method_actual
integer :: num_panel, panel_num
integer :: index_lo, index_hi
integer :: index_lo_prev, index_hi_prev
integer :: panel_width, panel_width_prev
logical :: is_compute_step, is_compute_step_prev
!----------------------------------------
m = 0; n = 0; i = 0; j = 0; k = 0
m_ = 0; n_ = 0; k_ = 0
compute_method_requested = 0
compute_method_actual = 0
num_panel = 0; panel_num = 0
index_lo = 0; index_hi = 0
index_lo_prev = 0; index_hi_prev = 0
panel_width = 0; panel_width_prev = 0
is_compute_step = .false.; is_compute_step_prev = .false.
m = size(a1, 1)
n = size(a1, 2)
allocate (b(m, n))
!---Select the desired compute method here.
!compute_method_requested = COMPUTE_METHOD_GPU_SYNC
compute_method_requested = COMPUTE_METHOD_GPU_ASYNC
!compute_method_requested = COMPUTE_METHOD_GPU_PANELS
!---End select.
compute_method_actual = compute_method_requested
#ifndef __PGI
!---The Cray compiler is buggy here, so fall back to the simple synchronous path.
if ( compute_method_actual .eq. COMPUTE_METHOD_GPU_ASYNC .or. &
compute_method_actual .eq. COMPUTE_METHOD_GPU_PANELS ) then
compute_method_actual = COMPUTE_METHOD_GPU_SYNC
endif
#endif
!========================================
!---B = A2 * A3.
!========================================
if ( compute_method_actual .eq. COMPUTE_METHOD_CPU ) then
do i = 1, m
do j = 1, n
b(i, j) = 0
do k = 1, m
b(i, j) = b(i, j) + a2(i, k) * a3(k, j)
enddo
enddo
enddo
endif
!----------------------------------------
if ( compute_method_actual .eq. COMPUTE_METHOD_GPU_SYNC ) then
m_ = m
k_ = m
n_ = n
!$acc enter data create(b)
!$acc enter data copyin(a2)
!$acc enter data copyin(a3)
!$acc wait(0)
!$acc host_data use_device(a2, a3, b)
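!---(cu)BLAS dgemm arguments: transa, transb, m, n, k, alpha, A, lda,
!---B, ldb, beta, C, ldc; with beta = 0 this computes b = a2 * a3.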
#ifdef __PGI
call cublasdgemm ('n', 'n', m_, n_, k_, alpha, a2, m_, &
a3, k_, beta, b, m_)
#else
call dgemm_acc ('n', 'n', m_, n_, k_, alpha, a2, m_, &
a3, k_, beta, b, m_)
#endif
!$acc wait(0)
!$acc end host_data
!$acc exit data delete(a3)
!$acc exit data delete(a2)
!$acc exit data
! !$acc exit data copyout(b)
!$acc wait(0)
endif
!----------------------------------------
if ( compute_method_actual .eq. COMPUTE_METHOD_GPU_ASYNC .or. &
compute_method_actual .eq. COMPUTE_METHOD_GPU_PANELS ) then
m_ = m
k_ = m
n_ = n
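!---Stage a2 and a3 on separate async queues so the two host-to-device
!---transfers can overlap; the waits below synchronize before the dgemm.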
!$acc enter data create(b)
!$acc enter data copyin(a2) async(2)
!$acc enter data copyin(a3) async(3)
!$acc wait(0)
!$acc wait(2)
!$acc wait(3)
!$acc host_data use_device(a2, a3, b)
#ifdef __PGI
call cublasdgemm ('n', 'n', m_, n_, k_, alpha, a2, m_, &
a3, k_, beta, b, m_)
#else
call dgemm_acc ('n', 'n', m_, n_, k_, alpha, a2, m_, &
a3, k_, beta, b, m_)
#endif
!$acc wait(0)
!$acc end host_data
!$acc exit data delete(a3)
!$acc exit data delete(a2)
!$acc exit data
! !$acc exit data copyout(b)
!$acc wait(0)
endif
!========================================
!---C = ( A1 ^ T ) * B.
!========================================
if ( compute_method_actual .eq. COMPUTE_METHOD_CPU ) then
do i = 1, n
do j = 1, n
c(i, j) = 0
do k = 1, m
c(i, j) = c(i, j) + a1(k, i) * b(k, j)
enddo
enddo
enddo
endif
!----------------------------------------
if ( compute_method_actual .eq. COMPUTE_METHOD_GPU_SYNC ) then
m_ = n
k_ = m
n_ = n
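!---Second product: the 't','n' call computes c = transpose(a1) * b,
!---with lda = ldb = k_ (rows of a1 and b) and ldc = m_ (rows of c).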
!$acc enter data create(c)
!$acc enter data copyin(a1)
!$acc enter data present_or_copyin(b)
!$acc host_data use_device(a1, b, c)
!$acc wait(0)
#ifdef __PGI
call cublasdgemm ('t', 'n', m_, n_, k_, alpha, a1, k_, &
b, k_, beta, c, m_)
#else
call dgemm_acc ('t', 'n', m_, n_, k_, alpha, a1, k_, &
b, k_, beta, c, m_)
#endif
!$acc wait(0)
!$acc end host_data
!$acc exit data delete(b)
!$acc exit data delete(a1)
!$acc exit data copyout(c)
!$acc wait(0)
endif
!----------------------------------------
if ( compute_method_actual .eq. COMPUTE_METHOD_GPU_ASYNC ) then
m_ = n
k_ = m
n_ = n
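!---Same product as above, but with the input transfers placed on async
!---queues and the copy-out of c on queue 1; the explicit waits
!---re-synchronize before and after the dgemm.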
!$acc enter data create(c)
!$acc enter data copyin(a1) async(2)
!$acc enter data present_or_copyin(b) async(3)
!$acc host_data use_device(a1, b, c)
!$acc wait(0)
!$acc wait(2)
!$acc wait(3)
#ifdef __PGI
call cublasdgemm ('t', 'n', m_, n_, k_, alpha, a1, k_, &
b, k_, beta, c, m_)
#else
call dgemm_acc ('t', 'n', m_, n_, k_, alpha, a1, k_, &
b, k_, beta, c, m_)
#endif
!$acc wait(0)
!$acc end host_data
!$acc exit data delete(b)
!$acc exit data delete(a1)
!$acc exit data copyout(c) async(1)
!$acc wait(0)
!$acc wait(1)
endif
!----------------------------------------
!---NOTE: the code below works correctly (PGI) but currently performs
!---no better than the other methods; apparently the transfers and
!---the compute are not being overlapped.
!----------------------------------------
!----------------------------------------
!---The following code is motivated by two observations:
!---1) the second, "tn" computation seems to take much more time
!---   than the first, "nn" computation;
!---2) the transfer-out from the GPU of the result c matrix seems to
!---   take more time than the transfer-in of the a matrix, due to size.
!---Thus the following code creates a pipeline: the dgemm is done
!---in panels, with the previous panel's transfer-out overlapped
!---with the current panel's compute.
!----------------------------------------
if ( compute_method_actual .eq. COMPUTE_METHOD_GPU_PANELS ) then
m_ = n
k_ = m
n_ = n
!$acc enter data copyin(a1) async(2)
!$acc enter data present_or_copyin(b) async(3)
!$acc enter data create(c)
!$acc wait (0)
!$acc wait (2)
!$acc wait (3)
if ( n .gt. 1000 ) then
num_panel = 2
else
num_panel = 1
endif
do panel_num = 0, num_panel
!---The following are zero-based.
is_compute_step = panel_num .ge. 0 .and. &
panel_num .lt. num_panel
is_compute_step_prev = panel_num .gt. 0 .and. &
panel_num .le. num_panel
index_lo = ( ( panel_num ) * n ) / num_panel
index_hi = ( ( panel_num + 1 ) * n ) / num_panel - 1
index_lo_prev = ( ( panel_num - 1 ) * n ) / num_panel
index_hi_prev = ( ( panel_num ) * n ) / num_panel - 1
panel_width = index_hi - index_lo + 1
panel_width_prev = index_hi_prev - index_lo_prev + 1
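!---Pipeline step: launch the dgemm for the current panel (if any)
!---while the previous panel's columns of c are copied back to the
!---host (per the NOTE above, the PGI launch is not yet asynchronous,
!---so the intended overlap does not actually occur).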
if ( is_compute_step ) then
if ( panel_width .gt. 0 ) then
!$acc host_data use_device(a1, b, c)
#ifdef __PGI
!---NEED to launch this asynchronously
call cublasdgemm ( 't', 'n', &
m_, panel_width, k_, alpha, a1, k_, &
b(:,1+index_lo:), k_, beta, c(:,1+index_lo:), m_)
#else
call dgemm_acc_openacc_async ( 14, 't', 'n', &
m_, panel_width, k_, alpha, a1, k_, &
b(:,1+index_lo:), k_, beta, c(:,1+index_lo:), m_)
#endif
!$acc end host_data
endif
endif
if ( is_compute_step_prev ) then
if ( panel_width_prev .gt. 0 ) then
!$acc update host(c(:,1+index_lo_prev:1+index_hi_prev))
endif
endif
!$acc wait(14)
!$acc wait(0)
enddo !---panel_num
!$acc exit data delete(c)
!$acc exit data delete(b)
!$acc exit data delete(a1)
!$acc wait (0)
endif
!----------------------------------------
deallocate (b)
end