Commit 3d5ccd10 authored by Nichols, Stephen's avatar Nichols, Stephen
Browse files

Initial commit

parents
Loading
Loading
Loading
Loading

Makefile

0 → 100644
+35 −0
Original line number Diff line number Diff line
# Build rules for the cudaStreamAddCallback / OpenMP detach test executable.

# Compiler driver (xlcuf kept as a commented-out alternative)
FC=mpif90
#FC=xlcuf

# Linker driver
LD=mpif90
#LD=xlcuf

# Architecture / tuning flags, used for compilation only
FFLAGS=-qarch=pwr9 -qtune=pwr9 -qtgtarch=sm_70 #-g -Rb

# OpenMP (with offload) flags; used both when compiling and when linking
OPENMP_FLAG=-qsmp=omp -qoffload -I$(OLCF_CUDA_ROOT)/include

OPT_FLAGS=-O3
#OPT_FLAGS=-Ofast

LDFLAGS=-L$(OLCF_CUDA_ROOT)/lib64 #-lcudart

# Optional preprocessor definitions (empty by default); passed to the compiler
FDEFINES=

OBJS=main_cudaStreamAddCallback.o 
EXE=cudaStreamAddCallback_detach.x

# These targets do not produce files of the same name; declaring them phony
# keeps them working even if a file called "all", "clean" or "clobber" exists.
.PHONY: all clean clobber

# Default goal: build the executable
all: $(EXE)

# Compile a preprocessed Fortran source into an object file
%.o: %.F90
	$(FC) -c $< $(OPT_FLAGS) $(FFLAGS) $(CPPFLAGS) $(FDEFINES) $(OPENMP_FLAG) -o $@

# Link the executable from all objects
$(EXE) : $(OBJS)
	$(LD) $(OBJS) $(OPT_FLAGS) $(OPENMP_FLAG) $(LDFLAGS) -o $@

# Remove build products
clean:
	rm -f *.o *.mod *.s *.x

# Remove build products and editor backup files
clobber:
	rm -f *.o *.mod *.x *.s *~
+31 −0
Original line number Diff line number Diff line
#!/bin/bash

#BSUB -P STF006
#BSUB -J StreamAddCallback_Detach_Test
#BSUB -o StreamAddCallback_Detach_Test
#BSUB -e StreamAddCallback.err
#BSUB -W 0:05
#BSUB -nnodes 1
#BSUB -alloc_flags smt4

# Load the compiler/CUDA modules and record them in the job output.
source ./setUpModules.sh

module list

# jsrun options used below:
#   -n : number of resource sets
#   -a : number of MPI ranks per resource set
#   -c : number of CPUs/cores per resource set
#   -r : number of resource sets per host
#   -g : number of GPUs per resource set
#   -b : binding of tasks (exact semantics not verified by the author)
#   -l : latency priority
#   -d : how tasks are started on resource sets
#   -E : environment variable passed to the tasks; the value given to
#        OMP_NUM_THREADS is the upper bound on threads per MPI rank (4 here)
# NOTE: the author believes that exporting OMP_NUM_THREADS before the jsrun
#       command, and excluding it from the jsrun command line, also works.

exe=./cudaStreamAddCallback_detach.x
omp_threads=4

# 1 MPI rank and 4 threads per MPI rank
jsrun -n1 -a1 -c1 -r1 -g1 \
      -b packed:1 -l cpu-cpu -d packed \
      -E OMP_NUM_THREADS="${omp_threads}" \
      "${exe}"

build.sh

0 → 100755
+9 −0
Original line number Diff line number Diff line
#!/bin/bash

# Build the test executable with the project module environment and list the
# shared libraries it resolves to.

# Use an explicit relative path so the file in this directory is sourced
# rather than a same-named file found earlier on PATH.
source ./setUpModules.sh
module list

# Stop if the build fails so ldd is not run on a missing or stale executable.
make || exit 1

ldd cudaStreamAddCallback_detach.x
+180 −0
Original line number Diff line number Diff line
!> Glue between CUDA stream callbacks and OpenMP task-detach events.
!!
!! Provides an explicit C-interoperable interface to the CUDA runtime routine
!! cudaStreamAddCallback and a callback that fulfills an OpenMP event.
module stream_addcallback_mod
   use iso_c_binding
   implicit none

   !> Interface to the C routine cudaStreamAddCallback.
   !!
   !! stream       : stream on which the callback is enqueued
   !! ptr_callback : C function pointer of the callback
   !! event        : opaque pointer handed back to the callback
   !! flag         : flags argument forwarded to the C routine
   !! Returns the integer status code of the C routine.
   interface cudaStreamAddCallback
      function cudaStreamAddCallback (stream, ptr_callback, event, flag) &
                              result(istat) &
                              bind(c, name ='cudaStreamAddCallback')
         use iso_c_binding
         use omp_lib
         use cudafor
         implicit none
         integer(kind=cuda_stream_kind), value :: stream
         type(c_funptr),value :: ptr_callback
         type(c_ptr),value :: event
         ! C-interoperable kinds for the flag and the returned status
         integer(c_int), value :: flag
         integer(c_int) :: istat
      end function cudaStreamAddCallback
   end interface cudaStreamAddCallback

contains

   !> Stream callback: fulfills the OpenMP event whose handle is stored at
   !! the address passed in `event`.
   !!
   !! Declared bind(c) because its address is taken with C_FUNLOC and the
   !! routine is then invoked from C code, so it must use the C calling
   !! convention.
   !!
   !! stream  : stream the callback was attached to (unused)
   !! fstatus : status value supplied by the caller of the callback (unused)
   !! event   : C address of an integer(omp_event_handle_kind) event handle
   subroutine streamAddCallback_callback (stream, fstatus, event) bind(c)
      use iso_c_binding
      use omp_lib
      use cudafor
      implicit none
      integer(kind=cuda_stream_kind), value :: stream
      integer(C_INT),value :: fstatus
      type(C_PTR),value :: event
      integer(kind=omp_event_handle_kind), pointer :: f_event

      ! Reinterpret the opaque C pointer as a Fortran event handle ...
      call C_F_POINTER (event, f_event)
      ! ... and fulfill that event.
      call omp_fulfill_event(f_event)
   end subroutine streamAddCallback_callback

end module stream_addcallback_mod

!> Test driver for OpenMP detached tasks completed by CUDA stream callbacks.
!!
!! Slabs of the host array vxz are copied asynchronously into slots of the
!! buffer ubuf on a CUDA stream. Each copy is issued from an OpenMP task with
!! a DETACH clause; a callback enqueued on the same stream fulfills the
!! task's event. A second set of tasks, dependent on the buffer slots,
!! reports when each copy task has completed.
program CudaStreamAddCallback_detach

   use ISO_C_BINDING
   use omp_lib
   use stream_addcallback_mod
   use cudafor
   implicit none

   real, allocatable, target :: vxz(:,:,:)
   real, allocatable, target :: ubuf(:,:,:,:)
   integer :: nx, ny, nz, mz, inyi, iny1, ierr
   integer :: ip, np, nbuf, ibuf, next(2), i1, i2
   integer(kind=omp_event_handle_kind), target :: h2d_event
   integer(kind=cuda_stream_kind) :: h2d_stream
   type(c_funptr) :: ptr_callback
   integer :: zero = 0
   real(kind=8) :: temp1

   write(6,*) "Enter main"
   !!call flush(6)
   flush(6)

   ! Problem dimensions: vxz is split along its second dimension into np
   ! slabs of inyi rows each; ubuf holds nbuf such slabs.
   nx = 768
   ny = 768
   nz = 768
   mz = nz/4
   np = 3
   inyi = ny/np
   nbuf = 3

   allocate (vxz(nx, ny, mz))
   allocate (ubuf(nx, inyi, mz, nbuf))

   ! pointer to callback function
   ptr_callback = C_FUNLOC(streamAddCallback_callback)

   ierr = cudaStreamCreate(h2d_stream)
   ! Without a valid stream none of the copies below can work.
   if (ierr /= 0) error stop "cudaStreamCreate failed"

   write(6,*) "Before parallel"
   !!call flush(6)
   flush(6)

   ! Time the entry into the TARGET DATA region that maps ubuf.
   temp1 = omp_get_wtime()
   !$OMP TARGET DATA MAP(alloc:ubuf)
   temp1 = omp_get_wtime() - temp1
   write(6,*) "After map",temp1

   !$OMP PARALLEL DEFAULT(NONE) PRIVATE(temp1, ierr, i1, i2, iny1) &
   !$OMP SHARED(h2d_event, ip, vxz, ubuf, nx, ny, nz, np, zero, &
   !$OMP ptr_callback, h2d_stream, ibuf, nbuf, next, inyi, mz)

   ! One thread generates all tasks.
   !$OMP SINGLE

   ! NOTE(review): every copy task names the same variable h2d_event in its
   ! DETACH clause and passes C_LOC(h2d_event) to the callback. Whether that
   ! address refers to a per-task copy of the handle depends on how the
   ! implementation treats the detach event handle inside the task - confirm.

   ! First copy task: slab 1 of vxz into buffer slot 1.
   temp1 = omp_get_wtime()
   !$OMP TASK DEPEND(OUT:ubuf(:,:,:,1)) DETACH(h2d_event) &
   !$OMP DEFAULT(NONE) PRIVATE(ierr,iny1) SHARED(ubuf,vxz,nx,inyi, &
   !$OMP ny,mz,ptr_callback,zero,h2d_stream)
   iny1 = 1
   write(6,*) "After parameter set",1
   !!call flush(6)
   flush(6)
   ! Inside this region C_LOC(ubuf(...)) is taken with USE_DEVICE_PTR(ubuf)
   ! in effect.
   !$OMP TARGET DATA USE_DEVICE_PTR(ubuf)
   ierr = cudaMemCpy2DAsync (C_LOC(ubuf(1,1,1,1)), nx*inyi, &
                             C_LOC(vxz(1,iny1,1)), nx*ny, &
                             nx*inyi, mz, stream=h2d_stream)
   !$OMP END TARGET DATA
   ! Enqueue the callback that fulfills the task's event on the same stream.
   ierr = cudaStreamAddCallback (h2d_stream, ptr_callback, C_LOC(h2d_event), zero)
   write(6,*) "After Add callback",1
   !!call flush(6)
   flush(6)
   !$OMP END TASK
   temp1 = omp_get_wtime() - temp1
   write(6,*) "ubuf HtoD",1,temp1
   !!call flush(6)
   flush(6)

   do ip=1,np

      ! Buffer slot of the current slab, cycling through 1..nbuf
      ibuf = mod(ip,nbuf)
      if(ibuf.eq.0) ibuf = nbuf

      ! next(1): index of the following slab, next(2): following buffer
      ! slot; both wrap around to 1.
      next(1) = ip+1
      if(next(1).gt.np) next(1) = 1
      next(2) = ibuf+1
      if(next(2).gt.nbuf) next(2) = 1

      ! Issue the copy of the following slab, except after the last one.
      if (ip.ne.np) then
         temp1 = omp_get_wtime()
         i1 = next(1)
         i2 = next(2)
         !$OMP TASK DEPEND(OUT:ubuf(:,:,:,i2)) DETACH(h2d_event) &
         !$OMP DEFAULT(NONE) PRIVATE(ierr,iny1) FIRSTPRIVATE(i1,i2) &
         !$OMP SHARED(ubuf,vxz,nx,inyi,ny,mz,ptr_callback, &
         !$OMP zero,h2d_stream)
         ! First row of slab i1 along the second dimension of vxz
         iny1 = (i1-1)*inyi+1
         write(6,*) "After parameter set",i1
         !!call flush(6)
         flush(6)
         !$OMP TARGET DATA USE_DEVICE_PTR(ubuf)
         ierr = cudaMemCpy2DAsync (C_LOC(ubuf(1,1,1,i2)), nx*inyi, &
            C_LOC(vxz(1,iny1,1)), nx*ny, &
            nx*inyi, mz, stream=h2d_stream)
         !$OMP END TARGET DATA
         ierr = cudaStreamAddCallback (h2d_stream, ptr_callback, C_LOC(h2d_event), zero)
         write(6,*) "After Add callback",i1
         !!call flush(6)
         flush(6)
         !$OMP END TASK
         temp1 = omp_get_wtime() - temp1
         write(6,*) "ubuf HtoD",i1,temp1
         !!call flush(6)
         flush(6)
      end if

   end do

   ! One check task per buffer slot; each depends on the copy task that
   ! writes that slot.
   do ip=1,np
      ! ip is shared in the enclosing parallel region, so without
      ! FIRSTPRIVATE the deferred task would read ip while this loop keeps
      ! changing it and could print the wrong index.
      !$OMP TASK DEPEND(IN:ubuf(:,:,:,ip)) FIRSTPRIVATE(ip)
      write(6,*) "Copy task complete, ip=",ip
      !!call flush(6)
      flush(6)
      !$OMP END TASK
      write(6,*) "After check task",ip
   end do

   ! Wait for all tasks generated above.
   !$OMP TASKWAIT

   !$OMP END SINGLE

   !$OMP END PARALLEL

   !$OMP END TARGET DATA

   ! All tasks have completed at this point; release the stream.
   ierr = cudaStreamDestroy(h2d_stream)

   deallocate (vxz, ubuf)

   write(6,*) "Finished"   
   !!call flush(6)
   flush(6)

end program

setUpModules.sh

0 → 100755
+9 −0
Original line number Diff line number Diff line
#!/bin/bash

# Module environment for building and running the test; meant to be sourced.
# The XL compiler module is loaded first, then CUDA.
# Alternative CUDA version kept for reference: cuda/10.1.243
module load xl/16.1.1-beta103 cuda/11.0.3