olcf-acceptance-team / olcf4-acceptance-tests

Commit 8f621f8e, authored Feb 23, 2016 by Melesse Vergara, Veronica
Parent: 69ee0151
Changes: 25

    added single node tests for nuccor_kernels
olcf-harness-v1.0-tests/Crest/nuccor_kernels/Source/get_wall_time.c (new file, mode 100644)
/*---------------------------------------------------------------------------*/
/*!
 * \file get_wall_time.c
 * \author Wayne Joubert
 * \date Wed Nov 4 09:56:36 EST 2015
 * \brief Fortran/C interface to gettimeofday function.
 * \note Copyright (C) 2015 Oak Ridge National Laboratory, UT-Battelle, LLC.
 */
/*---------------------------------------------------------------------------*/

#include <stddef.h>
#include <sys/time.h>

/*---------------------------------------------------------------------------*/

void get_wall_time( double* time )
{
  struct timeval tv;
  gettimeofday( &tv, NULL );
  *time = ( (double) tv.tv_sec + (double) tv.tv_usec * 1.e-6 );
}

void get_wall_time_( double* time )
{
  get_wall_time( time );
}

void GET_WALL_TIME( double* time )
{
  get_wall_time( time );
}

void GET_WALL_TIME_( double* time )
{
  get_wall_time( time );
}

/*---------------------------------------------------------------------------*/
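The four entry points above cover the usual Fortran external-name manglings (lowercase and uppercase, with and without a trailing underscore), so the timer links against any of the compilers used in make.sh below. As a quick standalone smoke test, the timer could be driven directly from C; the following is a sketch only (the file name timer_check.c and the busy-loop workload are illustrative assumptions, not part of this commit):

/* timer_check.c (hypothetical): times a busy loop with get_wall_time.
   Build sketch: cc -o timer_check timer_check.c get_wall_time.c */
#include <stdio.h>

void get_wall_time( double* time );  /* provided by get_wall_time.c */

int main( void )
{
  double t1 = 0., t2 = 0.;
  volatile double sum = 0.;  /* volatile so the loop is not optimized away */
  int i = 0;

  get_wall_time( &t1 );
  for ( i = 0; i < 100000000; ++i )
  {
    sum += (double) i;
  }
  get_wall_time( &t2 );

  printf( "elapsed %.6f sec (checksum %e)\n", t2 - t1, (double) sum );
  return 0;
}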
olcf-harness-v1.0-tests/Crest/nuccor_kernels/Source/make.sh (new file, mode 100755)
#!/bin/bash
#==============================================================================

if [ "$PE_ENV" == "PGI" ] ; then

  #------------------------------
  echo "Compiling with PGI compiler, C version ..."
  EXECUTABLE=nuccor_dgemm_titan_c_pgi
  module load cudatoolkit
  module load acml
  #module load cray-libsci
  cc -c get_wall_time.c
  cc -o $EXECUTABLE nuccor_dgemm.c get_wall_time.o \
    $CRAY_CUDATOOLKIT_INCLUDE_OPTS \
    -fast -acc -ta=nvidia:rdc -Minfo=accel \
    -lcublas -lacml #-lsci_pgi
  module unload acml
  #module unload cray-libsci
  module unload cudatoolkit
  rm -f *.o

  #------------------------------
  echo "Compiling with PGI compiler, Fortran version ..."
  EXECUTABLE=nuccor_dgemm_titan_f_pgi
  module load cudatoolkit
  module load acml
  module unload cray-libsci_acc
  cc -c get_wall_time.c
  ftn -o $EXECUTABLE nuccor_dgemm.F90 get_wall_time.o \
    $CRAY_CUDATOOLKIT_INCLUDE_OPTS \
    -fast -acc -ta=nvidia -Minfo=accel -Mcuda \
    -lacml
  module load cray-libsci_acc
  module unload acml
  module unload cudatoolkit
  rm -f *.o

  #------------------------------
  echo "Compiling with Cray compiler, Fortran version ..."
  EXECUTABLE=nuccor_dgemm_titan_f_cray
  module swap PrgEnv-pgi PrgEnv-cray
  module load craype-accel-nvidia35
  cc -c get_wall_time.c
  ftn -o $EXECUTABLE nuccor_dgemm.F90 get_wall_time.o \
    -m 3 -O 2 -e m -O noomp # -lsci_cray
  module swap PrgEnv-cray PrgEnv-pgi
  module unload craype-accel-nvidia35
  rm -f *.o
  #------------------------------

fi

if [ "$( uname -n | grep '^crest' | wc -l )" = 1 ] ; then

  #------------------------------
  echo "Compiling with PGI compiler, C version ..."
  EXECUTABLE=nuccor_dgemm_crest_c_pgi
  module load pgi
  module load cuda
  pgcc -c get_wall_time.c
  pgcc -o $EXECUTABLE nuccor_dgemm.c get_wall_time.o \
    -fast -acc -ta=nvidia:rdc -Minfo=accel \
    -L/usr/local/cuda-7.5/lib64/ \
    -lcublas_static -lcudart -lculibos -ldl -lpthread \
    -L/sw/power8/lapack/3.5.0/ppc64le_gnu4.9.2/lib -lrefblas \
    /sw/power8/lapack/3.5.0/ppc64le_gnu4.9.2/lib/librefblas.a \
    /usr/lib/gcc/powerpc64le-linux-gnu/4.9/libgfortran.a
  #------------------------------
  #pgcc -o nuccor_dgemm nuccor_dgemm.c get_wall_time.o -lcublas -Mcuda
  #pgf90 -o nuccor_dgemm nuccor_dgemm.F90 get_wall_time.o -O3
  rm -f *.o

fi

# EXECUTABLE is quoted so that an empty value cannot make the test pass.
if [ -x "$EXECUTABLE" ] ; then
  echo "Build completed successfully."
else
  exit_status="$?"
  echo "Build failed! (Exit: ${exit_status})"
  exit ${exit_status}
fi

exit 0

#==============================================================================
olcf-harness-v1.0-tests/Crest/nuccor_kernels/Source/nuccor_dgemm.F90 (new file, mode 100644)
!------------------------------------------------------------------------------
!
! \file nuccor_dgemm.F90
! \author Wayne Joubert
! \date Wed Nov 4 09:56:36 EST 2015
! \brief Perform DGEMMs to simulate those of a NUCCOR run.
! \note Copyright (C) 2015 Oak Ridge National Laboratory, UT-Battelle, LLC.
!
!------------------------------------------------------------------------------

program main

  implicit none

  !--------------------

  interface
    subroutine get_wall_time( time )
      use, intrinsic :: iso_c_binding, only : c_double
      implicit none
      real( c_double ) :: time
    end subroutine
  end interface

  interface
    subroutine matrices_create( a1, a2, a3, m, n )
      implicit none
      doubleprecision, dimension(:,:), allocatable :: a1, a2, a3
      integer :: m, n
    end subroutine
  end interface

  interface
    subroutine matrices_destroy( a1, a2, a3 )
      implicit none
      doubleprecision, dimension(:,:), allocatable :: a1, a2, a3
    end subroutine
  end interface

  interface
    subroutine compute_product_gpu( c, a1, a2, a3 )
      implicit none
      doubleprecision, dimension(:,:), allocatable :: c, a1, a2, a3
    end subroutine
  end interface

  interface
    subroutine compute_product_cpu( c, a1, a2, a3 )
      implicit none
      doubleprecision, dimension(:,:), allocatable :: c, a1, a2, a3
    end subroutine
  end interface

  interface
    subroutine compute_product_ref( c, a1, a2, a3 )
      implicit none
      doubleprecision, dimension(:,:), allocatable :: c, a1, a2, a3
    end subroutine
  end interface
  !--------------------

  integer, dimension(:,:), allocatable :: dims
  integer :: num_triples
  integer :: i, m, n
  doubleprecision, dimension(:,:), allocatable :: a1, a2, a3, c
  doubleprecision, dimension(:,:), allocatable :: c_correct
  doubleprecision :: time_1, time_2
  doubleprecision :: time_this_gpu, time_this_cpu
  doubleprecision :: time_total_gpu, time_total_cpu
  doubleprecision :: diff
  integer :: num_failures
  !doubleprecision, external :: MPI_Wtime

  !--------------------

  num_triples = 0
  i = 0 ; m = 0 ; n = 0
  time_1 = 0 ; time_2 = 0
  time_this_gpu = 0 ; time_this_cpu = 0
  time_total_gpu = 0 ; time_total_cpu = 0
  diff = 0
  num_failures = 0
  !---Read in matrix sizes for each matrix triple.

  open( unit=12, file='sizes.txt', action='READ' )
  read( 12, * ) num_triples
  allocate( dims( 2, num_triples ) )
  do i = 1, num_triples
    read( 12, * ) dims(:,i)
  enddo
  close( unit=12 )

  print *, num_triples
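  !---A hypothetical sizes.txt, for illustration (not part of this
  !---commit): the first record is the number of matrix triples,
  !---followed by one "m n" pair per triple:
  !---    3
  !---    4000 2000
  !---    8000 1000
  !---    500 500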
  !---Loop over matrix triples.

  do i = 1, num_triples

    m = dims( 1, i )
    n = dims( 2, i )

    call matrices_create( a1, a2, a3, m, n )
    allocate( c( n, n ) )

    !---Compute mat^T * mat * mat.

    call get_wall_time( time_1 )
    call compute_product_gpu( c, a1, a2, a3 )
    call get_wall_time( time_2 )
    time_this_gpu = time_2 - time_1
    time_total_gpu = time_total_gpu + time_this_gpu

    !---Compute reference answer.

    allocate( c_correct( n, n ) )
    call get_wall_time( time_1 )
    !time_1 = MPI_Wtime()
    call compute_product_cpu( c_correct, a1, a2, a3 )
    call get_wall_time( time_2 )
    !time_2 = MPI_Wtime()
    time_this_cpu = time_2 - time_1
    time_total_cpu = time_total_cpu + time_this_cpu

    !---Perform check.

    diff = sum( abs( c - c_correct ) )
    if ( diff .ne. 0.d0 ) then
      num_failures = num_failures + 1
    endif
    deallocate( c_correct )

    write( *, '(3(1x, i5), '' tgpu '' f10.6, '' tcpu '', f10.6, ' &
      // ' '' rat '',f5.2, '' dif '', e6.1)' ) &
      i, m, n, time_this_gpu, time_this_cpu, &
      time_this_cpu / time_this_gpu, diff

    call matrices_destroy( a1, a2, a3 )
    deallocate( c )

  enddo

  write( *, '( '' FINAL: num_triples '', i5, '' tgpu '' f9.3, ' // &
    ' '' tcpu '', f9.3, '' num_failures '', i5)' ) &
    num_triples, time_total_gpu, time_total_cpu, num_failures

  deallocate( dims )

end
!-----------------------------------------------------------------------------

subroutine compute_product_gpu( c, a1, a2, a3 )

#ifdef __PGI
  use cublas
#endif
  implicit none

  doubleprecision, dimension(:,:), allocatable :: c, a1, a2, a3

  integer, parameter :: COMPUTE_METHOD_CPU = 1
  integer, parameter :: COMPUTE_METHOD_GPU_SYNC = 2
  integer, parameter :: COMPUTE_METHOD_GPU_ASYNC = 3
  integer, parameter :: COMPUTE_METHOD_GPU_PANELS = 4

  doubleprecision, parameter :: alpha = 1, beta = 0

  doubleprecision, dimension(:,:), allocatable :: b
  integer :: m, n, i, j, k
  integer :: m_, n_, k_
  integer :: compute_method_requested
  integer :: compute_method_actual
  integer :: num_panel, panel_num
  integer :: index_lo, index_hi
  integer :: index_lo_prev, index_hi_prev
  integer :: panel_width, panel_width_prev
  logical :: is_compute_step, is_compute_step_prev

  !----------------------------------------

  m = 0 ; n = 0 ; i = 0 ; j = 0 ; k = 0
  m_ = 0 ; n_ = 0 ; k_ = 0
  compute_method_requested = 0
  compute_method_actual = 0
  num_panel = 0 ; panel_num = 0
  index_lo = 0 ; index_hi = 0
  index_lo_prev = 0 ; index_hi_prev = 0
  panel_width = 0 ; panel_width_prev = 0
  is_compute_step = .false. ; is_compute_step_prev = .false.

  m = size( a1, 1 )
  n = size( a1, 2 )

  allocate( b( m, n ) )

  !---Select the desired compute method here.
  !compute_method_requested = COMPUTE_METHOD_GPU_SYNC
  compute_method_requested = COMPUTE_METHOD_GPU_ASYNC
  !compute_method_requested = COMPUTE_METHOD_GPU_PANELS
  !---End select.

  compute_method_actual = compute_method_requested

#ifndef __PGI
  !---Cray compiler is buggy so do simple way.
  if ( compute_method_actual .eq. COMPUTE_METHOD_GPU_ASYNC .or. &
       compute_method_actual .eq. COMPUTE_METHOD_GPU_PANELS ) then
    compute_method_actual = COMPUTE_METHOD_GPU_SYNC
  endif
#endif
  !========================================
  !---B = A2 * A3.
  !========================================

  if ( compute_method_actual .eq. COMPUTE_METHOD_CPU ) then

    do i = 1, m
      do j = 1, n
        b( i, j ) = 0
        do k = 1, m
          b( i, j ) = b( i, j ) + a2( i, k ) * a3( k, j )
        enddo
      enddo
    enddo

  endif

  !----------------------------------------

  if ( compute_method_actual .eq. COMPUTE_METHOD_GPU_SYNC ) then

    m_ = m
    k_ = m
    n_ = n

    !$acc enter data create(b)
    !$acc enter data copyin(a2)
    !$acc enter data copyin(a3)
    !$acc wait(0)
    !$acc host_data use_device(a2, a3, b)
#ifdef __PGI
    call cublasdgemm( 'n', 'n', m_, n_, k_, alpha, a2, m_, &
                      a3, k_, beta, b, m_ )
#else
    call dgemm_acc( 'n', 'n', m_, n_, k_, alpha, a2, m_, &
                    a3, k_, beta, b, m_ )
#endif
    !$acc wait(0)
    !$acc end host_data
    !$acc exit data delete(a3)
    !$acc exit data delete(a2)
    !$acc exit data
    ! !$acc exit data copyout(b)
    !$acc wait(0)

  endif
  !----------------------------------------

  if ( compute_method_actual .eq. COMPUTE_METHOD_GPU_ASYNC .or. &
       compute_method_actual .eq. COMPUTE_METHOD_GPU_PANELS ) then

    m_ = m
    k_ = m
    n_ = n

    !$acc enter data create(b)
    !$acc enter data copyin(a2) async(2)
    !$acc enter data copyin(a3) async(3)
    !$acc wait(0)
    !$acc wait(2)
    !$acc wait(3)
    !$acc host_data use_device(a2, a3, b)
#ifdef __PGI
    call cublasdgemm( 'n', 'n', m_, n_, k_, alpha, a2, m_, &
                      a3, k_, beta, b, m_ )
#else
    call dgemm_acc( 'n', 'n', m_, n_, k_, alpha, a2, m_, &
                    a3, k_, beta, b, m_ )
#endif
    !$acc wait(0)
    !$acc end host_data
    !$acc exit data delete(a3)
    !$acc exit data delete(a2)
    !$acc exit data
    ! !$acc exit data copyout(b)
    !$acc wait(0)

  endif
  !========================================
  !---C = ( A1 ^ T ) * B.
  !========================================

  if ( compute_method_actual .eq. COMPUTE_METHOD_CPU ) then

    do i = 1, n
      do j = 1, n
        c( i, j ) = 0
        do k = 1, m
          c( i, j ) = c( i, j ) + a1( k, i ) * b( k, j )
        enddo
      enddo
    enddo

  endif

  !----------------------------------------

  if ( compute_method_actual .eq. COMPUTE_METHOD_GPU_SYNC ) then

    m_ = n
    k_ = m
    n_ = n

    !$acc enter data create(c)
    !$acc enter data copyin(a1)
    !$acc enter data present_or_copyin(b)
    !$acc host_data use_device(a1, b, c)
    !$acc wait(0)
#ifdef __PGI
    call cublasdgemm( 't', 'n', m_, n_, k_, alpha, a1, k_, &
                      b, k_, beta, c, m_ )
#else
    call dgemm_acc( 't', 'n', m_, n_, k_, alpha, a1, k_, &
                    b, k_, beta, c, m_ )
#endif
    !$acc wait(0)
    !$acc end host_data
    !$acc exit data delete(b)
    !$acc exit data delete(a1)
    !$acc exit data copyout(c)
    !$acc wait(0)

  endif
  !----------------------------------------

  if ( compute_method_actual .eq. COMPUTE_METHOD_GPU_ASYNC ) then

    m_ = n
    k_ = m
    n_ = n

    !$acc enter data create(c)
    !$acc enter data copyin(a1) async(2)
    !$acc enter data present_or_copyin(b) async(3)
    !$acc host_data use_device(a1, b, c)
    !$acc wait(0)
    !$acc wait(2)
    !$acc wait(3)
#ifdef __PGI
    call cublasdgemm( 't', 'n', m_, n_, k_, alpha, a1, k_, &
                      b, k_, beta, c, m_ )
#else
    call dgemm_acc( 't', 'n', m_, n_, k_, alpha, a1, k_, &
                    b, k_, beta, c, m_ )
#endif
    !$acc wait(0)
    !$acc end host_data
    !$acc exit data delete(b)
    !$acc exit data delete(a1)
    !$acc exit data copyout(c) async(1)
    !$acc wait(0)
    !$acc wait(1)

  endif
  !----------------------------------------
  !---NOTE: the code below works correctly (PGI) but currently performs
  !---no better than the other methods.
  !---Apparently the transfers and compute are not being overlapped.
  !----------------------------------------

  !----------------------------------------
  !---The following code is motivated by two observations:
  !---1) the second, "tn" computation seems to take much more time
  !---   than the first, "nn" computation;
  !---2) the transfer-out from the GPU of the result c matrix seems to
  !---   take more time than the transfer-in of the a matrix, due to size.
  !---Thus the following code creates a pipeline: the dgemm is done in
  !---panels, with the previous panel's transfer-out overlapped with the
  !---current panel's compute.  (A standalone sketch of this schedule
  !---follows the listing.)
  !----------------------------------------
  if ( compute_method_actual .eq. COMPUTE_METHOD_GPU_PANELS ) then

    m_ = n
    k_ = m
    n_ = n

    !$acc enter data copyin(a1) async(2)
    !$acc enter data present_or_copyin(b) async(3)
    !$acc enter data create(c)
    !$acc wait (0)
    !$acc wait (2)
    !$acc wait (3)

    if ( n .gt. 1000 ) then
      num_panel = 2
    else
      num_panel = 1
    endif

    do panel_num = 0, num_panel

      !---The following are zero-based.
      is_compute_step = panel_num .ge. 0 .and. &
                        panel_num .lt. num_panel
      is_compute_step_prev = panel_num .gt. 0 .and. &
                             panel_num .le. num_panel

      index_lo = ( ( panel_num ) * n ) / num_panel
      index_hi = ( ( panel_num + 1 ) * n ) / num_panel - 1
      index_lo_prev = ( ( panel_num - 1 ) * n ) / num_panel
      index_hi_prev = ( ( panel_num ) * n ) / num_panel - 1

      panel_width = index_hi - index_lo + 1
      panel_width_prev = index_hi_prev - index_lo_prev + 1

      if ( is_compute_step ) then
        if ( panel_width .gt. 0 ) then
          !$acc host_data use_device(a1, b, c)
#ifdef __PGI
          !---NEED to launch this asynchronously
          call cublasdgemm( 't', 'n', &
            m_, panel_width, k_, alpha, a1, k_, &
            b(:,1+index_lo:), k_, beta, c(:,1+index_lo:), m_ )
#else
          call dgemm_acc_openacc_async( 14, 't', 'n', &
            m_, panel_width, k_, alpha, a1, k_, &
            b(:,1+index_lo:), k_, beta, c(:,1+index_lo:), m_ )
#endif
          !$acc end host_data
        endif
      endif

      if ( is_compute_step_prev ) then
        if ( panel_width_prev .gt. 0 ) then
          !$acc update host(c(:,1+index_lo_prev:1+index_hi_prev))
        endif
      endif

      !$acc wait(14)
      !$acc wait(0)

    enddo !---panel_num

    !$acc exit data delete(c)
    !$acc exit data delete(b)
    !$acc exit data delete(a1)
    !$acc wait (0)

  endif
  !----------------------------------------

  deallocate( b )

end
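For readers tracing COMPUTE_METHOD_GPU_PANELS above, the following standalone C sketch reproduces the zero-based panel-index arithmetic and prints the resulting compute/copy-out schedule. It is illustrative only: the value of n is an assumption for demonstration, not part of this commit.

/* Sketch of the panel pipeline schedule in compute_product_gpu
   (COMPUTE_METHOD_GPU_PANELS): at step p the dgemm for panel p is
   launched while panel p-1 is copied back to the host. */
#include <stdio.h>

int main( void )
{
  const int n = 2500;                           /* illustrative size */
  const int num_panel = n > 1000 ? 2 : 1;       /* same rule as listing */
  int p = 0;

  for ( p = 0; p <= num_panel; ++p )
  {
    /* Zero-based column ranges, mirroring the Fortran index formulas. */
    const int index_lo = ( p * n ) / num_panel;
    const int index_hi = ( ( p + 1 ) * n ) / num_panel - 1;
    const int index_lo_prev = ( ( p - 1 ) * n ) / num_panel;
    const int index_hi_prev = ( p * n ) / num_panel - 1;

    if ( p < num_panel )
      printf( "step %d: compute columns %d..%d\n", p, index_lo, index_hi );
    if ( p > 0 )
      printf( "step %d: copy out columns %d..%d\n", p, index_lo_prev,
              index_hi_prev );
  }
  return 0;
}

With n = 2500 this prints: step 0 computes columns 0..1249; step 1 computes 1250..2499 while copying out 0..1249; step 2 copies out 1250..2499, which is the overlap the comment block in the listing describes.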