Loading Makefile 0 → 100644 +35 −0 Original line number Diff line number Diff line # set compilers FC=mpif90 #FC=xlcuf # set linker LD=mpif90 #LD=xlcuf # set flags FFLAGS=-qarch=pwr9 -qtune=pwr9 -qtgtarch=sm_70 #-g -Rb OPENMP_FLAG=-qsmp=omp -qoffload -I$(OLCF_CUDA_ROOT)/include OPT_FLAGS=-O3 #OPT_FLAGS=-Ofast LDFLAGS=-L$(OLCF_CUDA_ROOT)/lib64 #-lcudart FDEFINES= OBJS=main_cudaStreamAddCallback.o EXE=cudaStreamAddCallback_detach.x %.o: %.F90 $(FC) -c $< $(OPT_FLAGS) $(FFLAGS) $(CPPFLAGS) $(OPENMP_FLAG) -o $@ $(EXE) : $(OBJS) $(LD) $(OBJS) $(OPT_FLAGS) $(OPENMP_FLAG) $(LDFLAGS) -o $@ clean: rm -f *.o *.mod *.s *.x clobber: rm -f *.o *.mod *.x *.s *~ batch_cudaStreamAddCallback.sh 0 → 100755 +31 −0 Original line number Diff line number Diff line #!/bin/bash #BSUB -P STF006 #BSUB -J StreamAddCallback_Detach_Test #BSUB -o StreamAddCallback_Detach_Test #BSUB -e StreamAddCallback.err #BSUB -W 0:05 #BSUB -nnodes 1 #BSUB -alloc_flags smt4 source ./setUpModules.sh module list # -n : number of resource sets # -a : number of MPI ranks per resource set # -c : number of CPUs/cores per resource set # -r : number of resource sets per host # -g : number of GPUs per resource set # -b : binding of tasks (not sure how this really works...) # -l : latency priority # -d : how tasks are started on resource sets # -E : OMP_NUM_THREADS=4 allows up to four threads per MPI rank # -E : OMP_NUM_THREADS=168 allows up to four threads per MPI rank ### NOTE: I think that you can also "export OMP_NUM_THREADS=***" before the jsrun cmd and exlude it from ### the jsrun cmd line # 1 MPI rank and 4 threads per MPI rank jsrun -n1 -a1 -c1 -r1 -g1 -b packed:1 -l cpu-cpu -d packed -E OMP_NUM_THREADS=4 ./cudaStreamAddCallback_detach.x build.sh 0 → 100755 +9 −0 Original line number Diff line number Diff line #!/bin/bash source setUpModules.sh module list make ldd cudaStreamAddCallback_detach.x main_cudaStreamAddCallback.F90 0 → 100644 +180 −0 Original line number Diff line number Diff line module stream_addcallback_mod use iso_c_binding interface cudaStreamAddCallback integer function cudaStreamAddCallback (stream, ptr_callback, event, flag) & bind(c, name ='cudaStreamAddCallback') use iso_c_binding use omp_lib use cudafor implicit none integer(kind=cuda_stream_kind), value :: stream type(c_funptr),value :: ptr_callback type(c_ptr),value :: event integer, value :: flag end function cudaStreamAddCallback end interface cudaStreamAddCallback contains subroutine streamAddCallback_callback (stream, fstatus, event) use iso_c_binding use omp_lib use cudafor implicit none integer(kind=cuda_stream_kind), value :: stream integer(C_INT),value :: fstatus type(C_PTR),value :: event integer(kind=omp_event_handle_kind), pointer :: f_event call C_F_POINTER (event, f_event) call omp_fulfill_event(f_event) end subroutine streamAddCallback_callback end module stream_addcallback_mod program CudaStreamAddCallback_detach use ISO_C_BINDING use omp_lib use stream_addcallback_mod use cudafor implicit none real, allocatable, target :: vxz(:,:,:) real, allocatable, target :: ubuf(:,:,:,:) integer :: nx, ny, nz, mz, inyi, iny1, ierr integer :: ip, np, nbuf, ibuf, next(2), i1, i2 integer(kind=omp_event_handle_kind), target :: h2d_event integer(kind=omp_event_handle_kind) :: d2h_event, & fft_event integer(kind=cuda_stream_kind) :: h2d_stream type(c_funptr) :: ptr_callback integer :: zero = 0 ! variables for hipMemCpy2D integer(kind=c_size_t) :: dpitch, spitch, width, height real(kind=8) :: temp1 write(6,*) "Enter main" !!call flush(6) flush(6) nx = 768 ny = 768 nz = 768 mz = nz/4 np = 3 inyi = ny/np nbuf = 3 allocate (vxz(nx, ny, mz)) allocate (ubuf(nx, inyi, mz, nbuf)) ! pointer to callback function ptr_callback = C_FUNLOC(streamAddCallback_callback) ierr = cudaStreamCreate(h2d_stream) write(6,*) "Before parallel" !!call flush(6) flush(6) temp1 = omp_get_wtime() !$OMP TARGET DATA MAP(alloc:ubuf) temp1 = omp_get_wtime() - temp1 write(6,*) "After map",temp1 !$OMP PARALLEL DEFAULT(NONE) PRIVATE(temp1, ierr, dpitch, spitch, & !$OMP width, height, i1, i2, iny1) SHARED(h2d_event, ip, vxz, ubuf, nx, & !$OMP ny, nz, np, zero, ptr_callback, h2d_stream, ibuf, nbuf, next, & !$OMP inyi, mz) !$OMP SINGLE temp1 = omp_get_wtime() !$OMP TASK DEPEND(OUT:ubuf(:,:,:,1)) DETACH(h2d_event) & !$OMP DEFAULT(NONE) PRIVATE(ierr,iny1) SHARED(ubuf,vxz,nx,inyi, & !$OMP ny,mz,ptr_callback,zero,h2d_stream) iny1 = 1 write(6,*) "After parameter set",1 !!call flush(6) flush(6) !$OMP TARGET DATA USE_DEVICE_PTR(ubuf) ierr = cudaMemCpy2DAsync (C_LOC(ubuf(1,1,1,1)), nx*inyi, & C_LOC(vxz(1,iny1,1)), nx*ny, & nx*inyi, mz, stream=h2d_stream) !$OMP END TARGET DATA ierr = cudaStreamAddCallback (h2d_stream, ptr_callback, C_LOC(h2d_event), zero) write(6,*) "After Add callback",1 !!call flush(6) flush(6) !$OMP END TASK temp1 = omp_get_wtime() - temp1 write(6,*) "ubuf HtoD",1,temp1 !!call flush(6) flush(6) do ip=1,np ibuf = mod(ip,nbuf) if(ibuf.eq.0) ibuf = nbuf next(1) = ip+1 if(next(1).gt.np) next(1) = 1 next(2) = ibuf+1 if(next(2).gt.nbuf) next(2) = 1 if (ip.ne.np) then temp1 = omp_get_wtime() i1 = next(1) i2 = next(2) !$OMP TASK DEPEND(OUT:ubuf(:,:,:,i2)) DETACH(h2d_event) & !$OMP DEFAULT(NONE) PRIVATE(ierr,iny1) FIRSTPRIVATE(i1,i2) & !$OMP SHARED(ubuf,vxz,nx,inyi,ny,mz,ptr_callback, & !$OMP zero,h2d_stream) iny1 = (i1-1)*inyi+1 write(6,*) "After parameter set",i1 !!call flush(6) flush(6) !$OMP TARGET DATA USE_DEVICE_PTR(ubuf) ierr = cudaMemCpy2DAsync (C_LOC(ubuf(1,1,1,i2)), nx*inyi, & C_LOC(vxz(1,iny1,1)), nx*ny, & nx*inyi, mz, stream=h2d_stream) !$OMP END TARGET DATA ierr = cudaStreamAddCallback (h2d_stream, ptr_callback, C_LOC(h2d_event), zero) write(6,*) "After Add callback",i1 !!call flush(6) flush(6) !$OMP END TASK temp1 = omp_get_wtime() - temp1 write(6,*) "ubuf HtoD",i1,temp1 !!call flush(6) flush(6) end if end do do ip=1,np !$OMP TASK DEPEND(IN:ubuf(:,:,:,ip)) write(6,*) "Copy task complete, ip=",ip !!call flush(6) flush(6) !$OMP END TASK write(6,*) "After check task",ip end do !$OMP TASKWAIT !$OMP END SINGLE !$OMP END PARALLEL !$OMP END TARGET DATA deallocate (vxz, ubuf) write(6,*) "Finished" !!call flush(6) flush(6) end program setUpModules.sh 0 → 100755 +9 −0 Original line number Diff line number Diff line #!/bin/bash module load xl/16.1.1-beta103 #module load cuda/10.1.243 module load cuda/11.0.3 Loading
Makefile 0 → 100644 +35 −0 Original line number Diff line number Diff line # set compilers FC=mpif90 #FC=xlcuf # set linker LD=mpif90 #LD=xlcuf # set flags FFLAGS=-qarch=pwr9 -qtune=pwr9 -qtgtarch=sm_70 #-g -Rb OPENMP_FLAG=-qsmp=omp -qoffload -I$(OLCF_CUDA_ROOT)/include OPT_FLAGS=-O3 #OPT_FLAGS=-Ofast LDFLAGS=-L$(OLCF_CUDA_ROOT)/lib64 #-lcudart FDEFINES= OBJS=main_cudaStreamAddCallback.o EXE=cudaStreamAddCallback_detach.x %.o: %.F90 $(FC) -c $< $(OPT_FLAGS) $(FFLAGS) $(CPPFLAGS) $(OPENMP_FLAG) -o $@ $(EXE) : $(OBJS) $(LD) $(OBJS) $(OPT_FLAGS) $(OPENMP_FLAG) $(LDFLAGS) -o $@ clean: rm -f *.o *.mod *.s *.x clobber: rm -f *.o *.mod *.x *.s *~
batch_cudaStreamAddCallback.sh 0 → 100755 +31 −0 Original line number Diff line number Diff line #!/bin/bash #BSUB -P STF006 #BSUB -J StreamAddCallback_Detach_Test #BSUB -o StreamAddCallback_Detach_Test #BSUB -e StreamAddCallback.err #BSUB -W 0:05 #BSUB -nnodes 1 #BSUB -alloc_flags smt4 source ./setUpModules.sh module list # -n : number of resource sets # -a : number of MPI ranks per resource set # -c : number of CPUs/cores per resource set # -r : number of resource sets per host # -g : number of GPUs per resource set # -b : binding of tasks (not sure how this really works...) # -l : latency priority # -d : how tasks are started on resource sets # -E : OMP_NUM_THREADS=4 allows up to four threads per MPI rank # -E : OMP_NUM_THREADS=168 allows up to four threads per MPI rank ### NOTE: I think that you can also "export OMP_NUM_THREADS=***" before the jsrun cmd and exlude it from ### the jsrun cmd line # 1 MPI rank and 4 threads per MPI rank jsrun -n1 -a1 -c1 -r1 -g1 -b packed:1 -l cpu-cpu -d packed -E OMP_NUM_THREADS=4 ./cudaStreamAddCallback_detach.x
build.sh 0 → 100755 +9 −0 Original line number Diff line number Diff line #!/bin/bash source setUpModules.sh module list make ldd cudaStreamAddCallback_detach.x
main_cudaStreamAddCallback.F90 0 → 100644 +180 −0 Original line number Diff line number Diff line module stream_addcallback_mod use iso_c_binding interface cudaStreamAddCallback integer function cudaStreamAddCallback (stream, ptr_callback, event, flag) & bind(c, name ='cudaStreamAddCallback') use iso_c_binding use omp_lib use cudafor implicit none integer(kind=cuda_stream_kind), value :: stream type(c_funptr),value :: ptr_callback type(c_ptr),value :: event integer, value :: flag end function cudaStreamAddCallback end interface cudaStreamAddCallback contains subroutine streamAddCallback_callback (stream, fstatus, event) use iso_c_binding use omp_lib use cudafor implicit none integer(kind=cuda_stream_kind), value :: stream integer(C_INT),value :: fstatus type(C_PTR),value :: event integer(kind=omp_event_handle_kind), pointer :: f_event call C_F_POINTER (event, f_event) call omp_fulfill_event(f_event) end subroutine streamAddCallback_callback end module stream_addcallback_mod program CudaStreamAddCallback_detach use ISO_C_BINDING use omp_lib use stream_addcallback_mod use cudafor implicit none real, allocatable, target :: vxz(:,:,:) real, allocatable, target :: ubuf(:,:,:,:) integer :: nx, ny, nz, mz, inyi, iny1, ierr integer :: ip, np, nbuf, ibuf, next(2), i1, i2 integer(kind=omp_event_handle_kind), target :: h2d_event integer(kind=omp_event_handle_kind) :: d2h_event, & fft_event integer(kind=cuda_stream_kind) :: h2d_stream type(c_funptr) :: ptr_callback integer :: zero = 0 ! variables for hipMemCpy2D integer(kind=c_size_t) :: dpitch, spitch, width, height real(kind=8) :: temp1 write(6,*) "Enter main" !!call flush(6) flush(6) nx = 768 ny = 768 nz = 768 mz = nz/4 np = 3 inyi = ny/np nbuf = 3 allocate (vxz(nx, ny, mz)) allocate (ubuf(nx, inyi, mz, nbuf)) ! pointer to callback function ptr_callback = C_FUNLOC(streamAddCallback_callback) ierr = cudaStreamCreate(h2d_stream) write(6,*) "Before parallel" !!call flush(6) flush(6) temp1 = omp_get_wtime() !$OMP TARGET DATA MAP(alloc:ubuf) temp1 = omp_get_wtime() - temp1 write(6,*) "After map",temp1 !$OMP PARALLEL DEFAULT(NONE) PRIVATE(temp1, ierr, dpitch, spitch, & !$OMP width, height, i1, i2, iny1) SHARED(h2d_event, ip, vxz, ubuf, nx, & !$OMP ny, nz, np, zero, ptr_callback, h2d_stream, ibuf, nbuf, next, & !$OMP inyi, mz) !$OMP SINGLE temp1 = omp_get_wtime() !$OMP TASK DEPEND(OUT:ubuf(:,:,:,1)) DETACH(h2d_event) & !$OMP DEFAULT(NONE) PRIVATE(ierr,iny1) SHARED(ubuf,vxz,nx,inyi, & !$OMP ny,mz,ptr_callback,zero,h2d_stream) iny1 = 1 write(6,*) "After parameter set",1 !!call flush(6) flush(6) !$OMP TARGET DATA USE_DEVICE_PTR(ubuf) ierr = cudaMemCpy2DAsync (C_LOC(ubuf(1,1,1,1)), nx*inyi, & C_LOC(vxz(1,iny1,1)), nx*ny, & nx*inyi, mz, stream=h2d_stream) !$OMP END TARGET DATA ierr = cudaStreamAddCallback (h2d_stream, ptr_callback, C_LOC(h2d_event), zero) write(6,*) "After Add callback",1 !!call flush(6) flush(6) !$OMP END TASK temp1 = omp_get_wtime() - temp1 write(6,*) "ubuf HtoD",1,temp1 !!call flush(6) flush(6) do ip=1,np ibuf = mod(ip,nbuf) if(ibuf.eq.0) ibuf = nbuf next(1) = ip+1 if(next(1).gt.np) next(1) = 1 next(2) = ibuf+1 if(next(2).gt.nbuf) next(2) = 1 if (ip.ne.np) then temp1 = omp_get_wtime() i1 = next(1) i2 = next(2) !$OMP TASK DEPEND(OUT:ubuf(:,:,:,i2)) DETACH(h2d_event) & !$OMP DEFAULT(NONE) PRIVATE(ierr,iny1) FIRSTPRIVATE(i1,i2) & !$OMP SHARED(ubuf,vxz,nx,inyi,ny,mz,ptr_callback, & !$OMP zero,h2d_stream) iny1 = (i1-1)*inyi+1 write(6,*) "After parameter set",i1 !!call flush(6) flush(6) !$OMP TARGET DATA USE_DEVICE_PTR(ubuf) ierr = cudaMemCpy2DAsync (C_LOC(ubuf(1,1,1,i2)), nx*inyi, & C_LOC(vxz(1,iny1,1)), nx*ny, & nx*inyi, mz, stream=h2d_stream) !$OMP END TARGET DATA ierr = cudaStreamAddCallback (h2d_stream, ptr_callback, C_LOC(h2d_event), zero) write(6,*) "After Add callback",i1 !!call flush(6) flush(6) !$OMP END TASK temp1 = omp_get_wtime() - temp1 write(6,*) "ubuf HtoD",i1,temp1 !!call flush(6) flush(6) end if end do do ip=1,np !$OMP TASK DEPEND(IN:ubuf(:,:,:,ip)) write(6,*) "Copy task complete, ip=",ip !!call flush(6) flush(6) !$OMP END TASK write(6,*) "After check task",ip end do !$OMP TASKWAIT !$OMP END SINGLE !$OMP END PARALLEL !$OMP END TARGET DATA deallocate (vxz, ubuf) write(6,*) "Finished" !!call flush(6) flush(6) end program
setUpModules.sh 0 → 100755 +9 −0 Original line number Diff line number Diff line #!/bin/bash module load xl/16.1.1-beta103 #module load cuda/10.1.243 module load cuda/11.0.3