Commit 7a5aba00 authored by Joubert, Wayne
Browse files

Fixes to nuccor_kernels for better GPU performance.

parent 5d72d309
......@@ -237,7 +237,6 @@
!$acc enter data create(b)
!$acc enter data copyin(a2)
!$acc enter data copyin(a3)
!$acc wait(0)
!$acc host_data use_device(a2, a3, b)
#ifdef __PGI
call cublasdgemm ('n', 'n', m_, n_, k_, alpha, a2, m_, &
......@@ -250,11 +249,7 @@
!$acc end host_data
!$acc exit data delete(a3)
!$acc exit data delete(a2)
! vvv TODO: FIX
! !$acc exit data
!$acc exit data copyout(b)
! ^^^ TODO: FIX
!$acc wait(0)
! !$acc exit data copyout(b)
endif
!----------------------------------------
......@@ -267,7 +262,6 @@
!$acc enter data create(b)
!$acc enter data copyin(a2) async(2)
!$acc enter data copyin(a3) async(3)
!$acc wait(0)
!$acc wait(2)
!$acc wait(3)
!$acc host_data use_device(a2, a3, b)
......@@ -282,11 +276,7 @@
!$acc end host_data
!$acc exit data delete(a3)
!$acc exit data delete(a2)
! vvv TODO: FIX
! !$acc exit data
!$acc exit data copyout(b)
! ^^^ TODO: FIX
!$acc wait(0)
! !$acc exit data copyout(b)
endif
!========================================
......@@ -312,9 +302,8 @@
n_ = n
!$acc enter data create(c)
!$acc enter data copyin(a1)
!$acc enter data present_or_copyin(b)
! !$acc enter data present_or_copyin(b)
!$acc host_data use_device(a1, b, c)
!$acc wait(0)
#ifdef __PGI
call cublasdgemm ('t', 'n', m_, n_, k_, alpha, a1, k_, &
b, k_, beta, c, m_)
......@@ -327,7 +316,6 @@
!$acc exit data delete(b)
!$acc exit data delete(a1)
!$acc exit data copyout(c)
!$acc wait(0)
endif
!----------------------------------------
......@@ -338,11 +326,10 @@
n_ = n
!$acc enter data create(c)
!$acc enter data copyin(a1) async(2)
!$acc enter data present_or_copyin(b) async(3)
! !$acc enter data present_or_copyin(b) async(3)
!$acc host_data use_device(a1, b, c)
!$acc wait(0)
!$acc wait(2)
!$acc wait(3)
! !$acc wait(3)
#ifdef __PGI
call cublasdgemm ('t', 'n', m_, n_, k_, alpha, a1, k_, &
b, k_, beta, c, m_)
......@@ -355,7 +342,6 @@
!$acc exit data delete(b)
!$acc exit data delete(a1)
!$acc exit data copyout(c) async(1)
!$acc wait(0)
!$acc wait(1)
endif
......@@ -383,8 +369,7 @@
n_ = n
!$acc enter data copyin(a1) async(2)
!$acc enter data present_or_copyin(b) async(3)
!$acc enter data create(c)
!$acc wait (0)
! !$acc enter data create(c)
!$acc wait (2)
!$acc wait (3)
......
......@@ -225,7 +225,7 @@ void compute_product_gpu(Matrix* c, Matrix* a1, Matrix* a2, Matrix* a3,
#pragma acc enter data create(bp_data[:m*n])
#pragma acc enter data copyin(a2_data[:m*m])
#pragma acc enter data copyin(a3_data[:m*n])
#pragma acc wait(0)
//#pragma acc wait(0)
#pragma acc host_data use_device(a2_data, a3_data, bp_data)
{
......@@ -236,11 +236,8 @@ void compute_product_gpu(Matrix* c, Matrix* a1, Matrix* a2, Matrix* a3,
#pragma acc exit data delete(a3_data[:m*n])
#pragma acc exit data delete(a2_data[:m*m])
// vvv TODO: FIX
//#pragma acc exit data
#pragma acc exit data copyout(bp_data[:m*n])
// ^^^ TODO: FIX
#pragma acc wait(0)
//#pragma acc exit data copyout(bp_data[:m*n])
//#pragma acc wait(0)
}
/*------------------------------------*/
......@@ -275,8 +272,8 @@ void compute_product_gpu(Matrix* c, Matrix* a1, Matrix* a2, Matrix* a3,
#pragma acc enter data create(c_data[:n*n])
#pragma acc enter data copyin(a1_data[:m*n])
#pragma acc enter data present_or_copyin(bp_data[:m*n])
#pragma acc wait(0)
//#pragma acc enter data present_or_copyin(bp_data[:m*n])
//#pragma acc wait(0)
#pragma acc host_data use_device(a1_data, bp_data, c_data)
{
......@@ -288,7 +285,7 @@ void compute_product_gpu(Matrix* c, Matrix* a1, Matrix* a2, Matrix* a3,
#pragma acc exit data delete(bp_data[:m*n])
#pragma acc exit data delete(a1_data[:m*n])
#pragma acc exit data copyout(c_data[:n*n])
#pragma acc wait(0)
//#pragma acc wait(0)
}
/*------------------------------------*/
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment