Commit b876e712 authored by Nichols, Stephen's avatar Nichols, Stephen
Browse files

Changes to be committed:

	modified:   main_cudaStreamAddCallback.F90
	modified:   setUpModules.sh
	modified:   setUpModules_gcc.sh

Added some basic enumerators.  "ip" is now private.

Gfortran fails when using the device ptr for "ubuf" but has no
trouble doing a host-to-host copy.
parent 293f0a36
Loading
Loading
Loading
Loading
+126 −34
Original line number Diff line number Diff line
module my_enums
  implicit none

  enum, bind(c)
     enumerator :: mySuccess = 0

     ! memcpyDefault infers from the dst and src pointers (RECOMMENDED)
     enumerator :: memcpyHostToHost = 0
     enumerator :: memcpyHostToDevice = 1
     enumerator :: memcpyDeviceToHost = 2
     enumerator :: memcpyDevicToDevice = 3
     enumerator :: memcpyDefault = 4
  end enum

end module my_enums

module stream_addcallback_mod
   use iso_c_binding

   interface cudaStreamAddCallback
      integer function cudaStreamAddCallback (stream, ptr_callback, event, flag) &
      function cudaStreamAddCallback (stream, ptr_callback, event, flag) &
                              bind(c, name ='cudaStreamAddCallback')
         use my_enums
         use iso_c_binding
         use omp_lib
         implicit none
         integer(c_size_t), value :: stream
         !! not sure if stream needs "value" or not
         type(c_ptr), value :: stream

         type(c_funptr),value :: ptr_callback
         type(c_ptr),value :: event
         integer, value :: flag
         integer(kind(mySuccess)) :: cudaStreamAddCallback
      end function cudaStreamAddCallback
   end interface cudaStreamAddCallback

   interface cudaStreamCreate
      integer function cudaStreamCreate(stream) &
      function cudaStreamCreate(stream) &
                              bind(c, name = 'cudaStreamCreate')
         use my_enums
         use iso_c_binding
         integer(c_size_t), intent(out) :: stream
         type(c_ptr), intent(out) :: stream
         integer(kind(mySuccess)) :: cudaStreamCreate
      end function cudaStreamCreate
   end interface cudaStreamCreate

   interface cudaMemCpy
      function cudaMemCpy(dst,src,sizeBytes,my_kind) &
                              bind(c, name = 'cudaMemcpy')
         use my_enums
         use iso_c_binding
         type(c_ptr), value :: dst,src
         integer(c_size_t),value :: sizeBytes
         integer(kind(memcpyDefault)), value :: my_kind
         integer(kind(mySuccess)) :: cudaMemCpy
      end function cudaMemCpy
   end interface cudaMemCpy

   interface cudaMemCpy2D
      function cudaMemCpy2D(dst,dpitch,src,spitch,width,height,my_kind) &
                              bind(c, name = 'cudaMemcpy2D')
         use my_enums
         use iso_c_binding
         type(c_ptr), value :: dst,src
         integer(c_size_t),value :: dpitch, spitch, width, height
         integer(kind(memcpyDefault)), value :: my_kind
         integer(kind(mySuccess)) :: cudaMemCpy2D
      end function cudaMemCpy2D
   end interface cudaMemCpy2D

   interface cudaMemCpy2DAsync
      integer function cudaMemCpy2DAsync(dst,dpitch,src,spitch,width,height,kdir,stream) &
      function cudaMemCpy2DAsync(dst,dpitch,src,spitch,width,height,my_kind,stream) &
                              bind(c, name = 'cudaMemcpy2DAsync')
         use my_enums
         use iso_c_binding
         type(c_ptr) :: dst,src
         type(c_ptr),value :: dst,src
         integer(c_size_t),value :: dpitch, spitch, width, height
         !!! for kdir: 0 is host to host
         !!!           1 is host to device
         !!!           2 is device to host
         !!!           3 is device to device
         !!!           4 infers from the dst and src pointers (RECOMMENDED)
         integer, value :: kdir
         integer(c_size_t) :: stream
         integer(kind(memcpyDefault)), value :: my_kind
         type(c_ptr), value :: stream
         integer(kind(mySuccess)) :: cudaMemCpy2DAsync
      end function cudaMemCpy2DAsync
   end interface cudaMemCpy2DAsync

@@ -44,7 +87,7 @@ contains
      use iso_c_binding
      use omp_lib
      implicit none
      integer(c_size_t), value :: stream
      type(c_ptr),value :: stream
      integer(C_INT),value :: fstatus
      type(C_PTR),value :: event
      integer(kind=omp_event_handle_kind), pointer :: f_event
@@ -58,29 +101,37 @@ end module stream_addcallback_mod
program CudaStreamAddCallback_detach
   use ISO_C_BINDING
   use omp_lib
   use my_enums
   use stream_addcallback_mod
   implicit none

   real, allocatable, target :: vxz(:,:,:)
   real, allocatable, target :: ubuf(:,:,:,:)
   integer :: nx, ny, nz, mz, inyi, iny1, ierr
   integer :: nx, ny, nz, mz, inyi, iny1
   integer(kind(mySuccess)) :: ierr
   integer :: ip, np, nbuf, ibuf, next(2), i1, i2
   integer :: i,j,k
   integer(kind=omp_event_handle_kind), target :: h2d_event
   integer(kind=omp_event_handle_kind) :: d2h_event, &
      fft_event
   integer(c_size_t) :: h2d_stream
   type(c_ptr) :: h2d_stream
   type(c_funptr) :: ptr_callback
   integer :: zero = 0
   integer(kind=c_size_t) :: dpitch, spitch, width, height
   integer(kind=c_size_t) :: sizeBytes, dpitch, spitch, width, height
   real(kind=8) :: temp1

   write(6,*) "Enter main"
   !!call flush(6)
   flush(6)

   nx = 768
   ny = 768
   nz = 768
   !nx = 768
   !ny = 768
   !nz = 768

   nx = 12
   ny = 12
   nz = 12

   mz = nz/4
   np = 3
   inyi = ny/np
@@ -89,30 +140,36 @@ program CudaStreamAddCallback_detach
   allocate (vxz(nx, ny, mz))
   allocate (ubuf(nx, inyi, mz, nbuf))

   ubuf(:,:,:,:) = 0.0
   vxz(:,:,:) = 1.0

   ! pointer to callback function
   ptr_callback = C_FUNLOC(streamAddCallback_callback)

   ierr = cudaStreamCreate(h2d_stream)
   if (ierr .gt. 0) write(6,*) "ERROR: StreamCreate ierr = ", ierr

   write(6,*) "Before parallel"
   !!call flush(6)
   flush(6)

   temp1 = omp_get_wtime()
   !$OMP TARGET DATA MAP(alloc:ubuf)
   !$OMP TARGET DATA MAP(tofrom:ubuf)
   temp1 = omp_get_wtime() - temp1
   write(6,*) "After map",temp1

   dpitch = int((nx*inyi), kind=c_size_t)
   spitch = int((nx*ny), kind=c_size_t)
   width = int((nx*inyi), kind=c_size_t)
   height = int(mz, kind=c_size_t)

   !$OMP PARALLEL DEFAULT(NONE) PRIVATE(temp1, ierr, dpitch, spitch, &
   !$OMP width, height, i1, i2, iny1) SHARED(h2d_event, ip, vxz, ubuf, nx, &
   !$OMP width, height, sizeBytes, i1, i2, iny1, ip) SHARED(h2d_event, vxz, ubuf, nx, &
   !$OMP ny, nz, np, zero, ptr_callback, h2d_stream, ibuf, nbuf, next, &
   !$OMP inyi, mz)

   dpitch = int((4*nx*inyi), kind=c_size_t)
   spitch = int((4*nx*ny), kind=c_size_t)
   width = int((4*nx*inyi), kind=c_size_t)
   height = int(mz, kind=c_size_t)

   sizeBytes = int((4*nx*inyi*mz), kind=c_size_t)

   !$OMP SINGLE

   temp1 = omp_get_wtime()
@@ -123,22 +180,29 @@ program CudaStreamAddCallback_detach
   write(6,*) "After parameter set",1
   !!call flush(6)
   flush(6)
   !$OMP TARGET DATA USE_DEVICE_PTR(ubuf)

   ierr = 0
   !!$OMP TARGET DATA USE_DEVICE_PTR(ubuf)
   ierr = cudaMemCpy2DAsync(C_LOC(ubuf(1,1,1,1)), dpitch, &
                            C_LOC(vxz(1,iny1,1)), spitch, &
                            width, height, 4, h2d_stream)
   !$OMP END TARGET DATA
                            width, height, memcpyDefault, h2d_stream)
   !!$OMP END TARGET DATA
   if (ierr .gt. 0) write(6,*) "ERROR: MemCpy2DAsync ip, ierr = ", 1, ierr

   ierr = cudaStreamAddCallback (h2d_stream, ptr_callback, C_LOC(h2d_event), zero)
   if (ierr .gt. 0) write(6,*) "ERROR: LaunchHostFunc ip, ierr = ", 1, ierr
   write(6,*) "After Add callback",1
   !!call flush(6)
   flush(6)

   !$OMP END TASK

   temp1 = omp_get_wtime() - temp1
   write(6,*) "ubuf HtoD",1,temp1
   !!call flush(6)
   flush(6)

   do ip=1,np
   do ip=1,np-1

      ibuf = mod(ip,nbuf)
      if(ibuf.eq.0) ibuf = nbuf
@@ -160,16 +224,23 @@ program CudaStreamAddCallback_detach
         write(6,*) "After parameter set",i1
         !!call flush(6)
         flush(6)
         !$OMP TARGET DATA USE_DEVICE_PTR(ubuf)

         ierr = 0
         !!$OMP TARGET DATA USE_DEVICE_PTR(ubuf)
         ierr = cudaMemCpy2DAsync(C_LOC(ubuf(1,1,1,i2)), dpitch, &
                                  C_LOC(vxz(1,iny1,1)), spitch, &
                                  width, height, 4, h2d_stream)
         !$OMP END TARGET DATA
                                  width, height, memcpyDefault, h2d_stream)
         !!$OMP END TARGET DATA
         if (ierr .gt. 0) write(6,*) "ERROR: MemCpy2DAsync ip, ierr = ", i1, ierr

         ierr = cudaStreamAddCallback (h2d_stream, ptr_callback, C_LOC(h2d_event), zero)
         if (ierr .gt. 0) write(6,*) "ERROR: LaunchHostFunc ip, ierr = ", i1, ierr
         write(6,*) "After Add callback",i1
         !!call flush(6)
         flush(6)

         !$OMP END TASK

         temp1 = omp_get_wtime() - temp1
         write(6,*) "ubuf HtoD",i1,temp1
         !!call flush(6)
@@ -189,12 +260,33 @@ program CudaStreamAddCallback_detach

   !$OMP TASKWAIT

   !$OMP TARGET UPDATE TO(ubuf)

   !$OMP END SINGLE

   !$OMP END PARALLEL

   !$OMP END TARGET DATA

   !! error check
   inyi = ny/np
   do ip = 1,nbuf
      do k = 1,mz
         do j = 1,inyi
            do i = 1,nx
   !do ip = 1,1
   !   do k = 1,1
   !      do j = 1,1
   !         do i = 1,nx
               if (ubuf(i,j,k,ip) .ne. 1.0) then
                  write(6,*) "ERROR: i,j,k,ip,ubuf =", i,j,k,ip,ubuf(i,j,k,ip)
                  !stop
               endif
            enddo
         enddo
      enddo
   enddo

   deallocate (vxz, ubuf)

   write(6,*) "Finished"   
+1 −0
Original line number Diff line number Diff line
@@ -7,5 +7,6 @@ module load xl/16.1.1-beta103
module load cuda/11.0.3


export OMP_NUM_THREADS=4

+32 −4
Original line number Diff line number Diff line
#!/bin/bash 

USE_MPI=0

if [ ${USE_MPI} -eq 0 ]; then
   # use next two lines for GCC with OpenMP Offload
   module use /sw/summit/modulefiles/ums/stf010/Core
   module load gcc/11.1.0-20220305
   module load cuda/11.0.3

else

   # for mpi, we need to do a little trickery
   # first, load a production gcc and cuda
   module load gcc/11.1.0
   module load cuda/11.0.3

   # then prepend bin and and LD_LIBRARY_PATH
   GCC_UMS_DIR=/sw/summit/ums/stf010/gcc
   #latest=$(ls --color=never ${GCC_UMS_DIR} | tail -n1)
   latest="11.1.0-20220305"
   export GCC_ROOT=$GCC_UMS_DIR/$latest

   echo "Using GCC in $GCC_ROOT"

   export PATH=$GCC_ROOT/bin:${PATH}

   export OMPI_CC=${GCC_ROOT}/bin/gcc
   export OMPI_CXX=${GCC_ROOT}/bin/g++
   export OMPI_FC=${GCC_ROOT}/bin/gfortran
   export LD_LIBRARY_PATH=${GCC_ROOT}/lib64:${LD_LIBRARY_PATH}

   #gfortran --version
   #mpif90 --version

fi

export OMP_NUM_THREADS=4