Commit 98614345 authored by Nichols, Stephen
Browse files

Changes to be committed:

	modified:   main_cudaStreamAddCallback.F90
	new file:   run_it.sh
	modified:   setUpModules.sh

Bug fix: the dpitch, spitch, and width arguments of cudaMemCpy2DAsync are in bytes, not array elements.

The code now copies the complete array. This is verified by an error
check that runs after OpenMP has copied the array back from the device
at the end of the target data region.
parent c34ef36c
Loading
Loading
Loading
Loading
+28 −7
Original line number Diff line number Diff line
@@ -45,6 +45,7 @@ program CudaStreamAddCallback_detach
   real, allocatable, target :: ubuf(:,:,:,:)
   integer :: nx, ny, nz, mz, inyi, iny1, ierr
   integer :: ip, np, nbuf, ibuf, next(2), i1, i2
   integer :: i,j,k
   integer(kind=omp_event_handle_kind), target :: h2d_event
   integer(kind=omp_event_handle_kind) :: d2h_event, &
      fft_event
@@ -62,6 +63,7 @@ program CudaStreamAddCallback_detach
   nx = 768
   ny = 768
   nz = 768

   mz = nz/4
   np = 3
   inyi = ny/np
@@ -70,6 +72,9 @@ program CudaStreamAddCallback_detach
   allocate (vxz(nx, ny, mz))
   allocate (ubuf(nx, inyi, mz, nbuf))

   ubuf(:,:,:,:) = 0.0
   vxz(:,:,:) = 1.0

   ! pointer to callback function
   ptr_callback = C_FUNLOC(streamAddCallback_callback)

@@ -80,7 +85,7 @@ program CudaStreamAddCallback_detach
   flush(6)

   temp1 = omp_get_wtime()
   !$OMP TARGET DATA MAP(alloc:ubuf)
   !$OMP TARGET DATA MAP(from:ubuf)
   temp1 = omp_get_wtime() - temp1
   write(6,*) "After map",temp1

@@ -100,9 +105,9 @@ program CudaStreamAddCallback_detach
   !!call flush(6)
   flush(6)
   !$OMP TARGET DATA USE_DEVICE_PTR(ubuf)
   ierr = cudaMemCpy2DAsync (C_LOC(ubuf(1,1,1,1)), nx*inyi, &
                             C_LOC(vxz(1,iny1,1)), nx*ny, &
                             nx*inyi, mz, stream=h2d_stream)
   ierr = cudaMemCpy2DAsync (C_LOC(ubuf(1,1,1,1)), 4*nx*inyi, &
                             C_LOC(vxz(1,iny1,1)), 4*nx*ny, &
                             4*nx*inyi, mz, stream=h2d_stream)
   !$OMP END TARGET DATA
   ierr = cudaStreamAddCallback (h2d_stream, ptr_callback, C_LOC(h2d_event), zero)
   write(6,*) "After Add callback",1
@@ -137,9 +142,9 @@ program CudaStreamAddCallback_detach
         !!call flush(6)
         flush(6)
         !$OMP TARGET DATA USE_DEVICE_PTR(ubuf)
         ierr = cudaMemCpy2DAsync (C_LOC(ubuf(1,1,1,i2)), nx*inyi, &
            C_LOC(vxz(1,iny1,1)), nx*ny, &
            nx*inyi, mz, stream=h2d_stream)
         ierr = cudaMemCpy2DAsync (C_LOC(ubuf(1,1,1,i2)), 4*nx*inyi, &
                                   C_LOC(vxz(1,iny1,1)), 4*nx*ny, &
                                   4*nx*inyi, mz, stream=h2d_stream)
         !$OMP END TARGET DATA
         ierr = cudaStreamAddCallback (h2d_stream, ptr_callback, C_LOC(h2d_event), zero)
         write(6,*) "After Add callback",i1
@@ -171,6 +176,22 @@ program CudaStreamAddCallback_detach

   !$OMP END TARGET DATA

   !! Error check on the host copy of ubuf, performed after the target data
   !! region has ended. vxz was filled with 1.0 and ubuf with 0.0 before the
   !! copies, so any element that is not 1.0 is reported.
   inyi = ny/np
   do ip = 1, nbuf
      do k = 1, mz
         do j = 1, inyi
            do i = 1, nx
               ! Elements holding the expected value need no report.
               if (ubuf(i,j,k,ip) == 1.0) cycle
               write(6,*) "ERROR: i,j,k,ip,ubuf =", i,j,k,ip,ubuf(i,j,k,ip)
               ! stop is left disabled, so every mismatch is listed.
               !stop
            end do
         end do
      end do
   end do
   ! Push any reports out before the arrays are released.
   flush(6)

   deallocate (vxz, ubuf)

   write(6,*) "Finished"   

run_it.sh

0 → 100755
+6 −0
Original line number Diff line number Diff line
#!/bin/bash
# Load the module environment, report it, then run the
# cudaStreamAddCallback_detach test executable from the current directory.

# Abort if the environment cannot be set up; otherwise the executable would
# be launched without the modules from setUpModules.sh.
source setUpModules.sh || {
    echo "run_it.sh: failed to source setUpModules.sh" >&2
    exit 1
}

# Record which modules are loaded in the run output.
module list

./cudaStreamAddCallback_detach.x
+1 −2
Original line number Diff line number Diff line
#!/bin/bash 

# Module environment; run_it.sh sources this file before launching the
# test executable.

# Compiler module (presumably the IBM XL toolchain -- TODO confirm).
module load xl/16.1.1-beta103
# Alternative CUDA toolkit version, currently disabled.
#module load cuda/10.1.243
module load cuda/11.0.3



# Number of OpenMP threads for the run.
export OMP_NUM_THREADS=4