Initial commit (3d5ccd10) · Commits · OpenMP-Offloading / CudaStreamAddCallback With Openmp Detach

Makefile

0 → 100644

+35 −0

Original line number	Diff line number	Diff line
		# set compilers
		FC=mpif90
		#FC=xlcuf

		# set linker
		LD=mpif90
		#LD=xlcuf

		# set flags
		FFLAGS=-qarch=pwr9 -qtune=pwr9 -qtgtarch=sm_70 #-g -Rb

		OPENMP_FLAG=-qsmp=omp -qoffload -I$(OLCF_CUDA_ROOT)/include

		OPT_FLAGS=-O3
		#OPT_FLAGS=-Ofast

		LDFLAGS=-L$(OLCF_CUDA_ROOT)/lib64 #-lcudart

		FDEFINES=

		OBJS=main_cudaStreamAddCallback.o
		EXE=cudaStreamAddCallback_detach.x

		%.o: %.F90
		$(FC) -c $< $(OPT_FLAGS) $(FFLAGS) $(CPPFLAGS) $(OPENMP_FLAG) -o $@

		$(EXE) : $(OBJS)
		$(LD) $(OBJS) $(OPT_FLAGS) $(OPENMP_FLAG) $(LDFLAGS) -o $@

		clean:
		rm -f .o .mod .s .x

		clobber:
		rm -f .o .mod .x .s *~

batch_cudaStreamAddCallback.sh

0 → 100755

+31 −0

Original line number	Diff line number	Diff line
		#!/bin/bash

		#BSUB -P STF006
		#BSUB -J StreamAddCallback_Detach_Test
		#BSUB -o StreamAddCallback_Detach_Test
		#BSUB -e StreamAddCallback.err
		#BSUB -W 0:05
		#BSUB -nnodes 1
		#BSUB -alloc_flags smt4

		source ./setUpModules.sh

		module list

		# -n : number of resource sets
		# -a : number of MPI ranks per resource set
		# -c : number of CPUs/cores per resource set
		# -r : number of resource sets per host
		# -g : number of GPUs per resource set
		# -b : binding of tasks (not sure how this really works...)
		# -l : latency priority
		# -d : how tasks are started on resource sets
		# -E : OMP_NUM_THREADS=4 allows up to four threads per MPI rank
		# -E : OMP_NUM_THREADS=168 allows up to four threads per MPI rank
		### NOTE: I think that you can also "export OMP_NUM_THREADS=***" before the jsrun cmd and exlude it from
		### the jsrun cmd line

		# 1 MPI rank and 4 threads per MPI rank
		jsrun -n1 -a1 -c1 -r1 -g1 -b packed:1 -l cpu-cpu -d packed -E OMP_NUM_THREADS=4 ./cudaStreamAddCallback_detach.x

build.sh

0 → 100755

+9 −0

Original line number	Diff line number	Diff line
		#!/bin/bash

		source setUpModules.sh
		module list

		make

		ldd cudaStreamAddCallback_detach.x

main_cudaStreamAddCallback.F90

0 → 100644

+180 −0

Original line number	Diff line number	Diff line
		module stream_addcallback_mod
		use iso_c_binding

		interface cudaStreamAddCallback
		integer function cudaStreamAddCallback (stream, ptr_callback, event, flag) &
		bind(c, name ='cudaStreamAddCallback')
		use iso_c_binding
		use omp_lib
		use cudafor
		implicit none
		integer(kind=cuda_stream_kind), value :: stream
		type(c_funptr),value :: ptr_callback
		type(c_ptr),value :: event
		integer, value :: flag
		end function cudaStreamAddCallback
		end interface cudaStreamAddCallback

		contains

		subroutine streamAddCallback_callback (stream, fstatus, event)
		use iso_c_binding
		use omp_lib
		use cudafor
		implicit none
		integer(kind=cuda_stream_kind), value :: stream
		integer(C_INT),value :: fstatus
		type(C_PTR),value :: event
		integer(kind=omp_event_handle_kind), pointer :: f_event

		call C_F_POINTER (event, f_event)
		call omp_fulfill_event(f_event)
		end subroutine streamAddCallback_callback

		end module stream_addcallback_mod

		program CudaStreamAddCallback_detach

		use ISO_C_BINDING
		use omp_lib
		use stream_addcallback_mod
		use cudafor
		implicit none

		real, allocatable, target :: vxz(:,:,:)
		real, allocatable, target :: ubuf(:,:,:,:)
		integer :: nx, ny, nz, mz, inyi, iny1, ierr
		integer :: ip, np, nbuf, ibuf, next(2), i1, i2
		integer(kind=omp_event_handle_kind), target :: h2d_event
		integer(kind=omp_event_handle_kind) :: d2h_event, &
		fft_event
		integer(kind=cuda_stream_kind) :: h2d_stream
		type(c_funptr) :: ptr_callback
		integer :: zero = 0
		! variables for hipMemCpy2D
		integer(kind=c_size_t) :: dpitch, spitch, width, height
		real(kind=8) :: temp1

		write(6,*) "Enter main"
		!!call flush(6)
		flush(6)

		nx = 768
		ny = 768
		nz = 768
		mz = nz/4
		np = 3
		inyi = ny/np
		nbuf = 3

		allocate (vxz(nx, ny, mz))
		allocate (ubuf(nx, inyi, mz, nbuf))

		! pointer to callback function
		ptr_callback = C_FUNLOC(streamAddCallback_callback)

		ierr = cudaStreamCreate(h2d_stream)

		write(6,*) "Before parallel"
		!!call flush(6)
		flush(6)

		temp1 = omp_get_wtime()
		!$OMP TARGET DATA MAP(alloc:ubuf)
		temp1 = omp_get_wtime() - temp1
		write(6,*) "After map",temp1

		!$OMP PARALLEL DEFAULT(NONE) PRIVATE(temp1, ierr, dpitch, spitch, &
		!$OMP width, height, i1, i2, iny1) SHARED(h2d_event, ip, vxz, ubuf, nx, &
		!$OMP ny, nz, np, zero, ptr_callback, h2d_stream, ibuf, nbuf, next, &
		!$OMP inyi, mz)

		!$OMP SINGLE

		temp1 = omp_get_wtime()
		!$OMP TASK DEPEND(OUT:ubuf(:,:,:,1)) DETACH(h2d_event) &
		!$OMP DEFAULT(NONE) PRIVATE(ierr,iny1) SHARED(ubuf,vxz,nx,inyi, &
		!$OMP ny,mz,ptr_callback,zero,h2d_stream)
		iny1 = 1
		write(6,*) "After parameter set",1
		!!call flush(6)
		flush(6)
		!$OMP TARGET DATA USE_DEVICE_PTR(ubuf)
		ierr = cudaMemCpy2DAsync (C_LOC(ubuf(1,1,1,1)), nx*inyi, &
		C_LOC(vxz(1,iny1,1)), nx*ny, &
		nx*inyi, mz, stream=h2d_stream)
		!$OMP END TARGET DATA
		ierr = cudaStreamAddCallback (h2d_stream, ptr_callback, C_LOC(h2d_event), zero)
		write(6,*) "After Add callback",1
		!!call flush(6)
		flush(6)
		!$OMP END TASK
		temp1 = omp_get_wtime() - temp1
		write(6,*) "ubuf HtoD",1,temp1
		!!call flush(6)
		flush(6)

		do ip=1,np

		ibuf = mod(ip,nbuf)
		if(ibuf.eq.0) ibuf = nbuf

		next(1) = ip+1
		if(next(1).gt.np) next(1) = 1
		next(2) = ibuf+1
		if(next(2).gt.nbuf) next(2) = 1

		if (ip.ne.np) then
		temp1 = omp_get_wtime()
		i1 = next(1)
		i2 = next(2)
		!$OMP TASK DEPEND(OUT:ubuf(:,:,:,i2)) DETACH(h2d_event) &
		!$OMP DEFAULT(NONE) PRIVATE(ierr,iny1) FIRSTPRIVATE(i1,i2) &
		!$OMP SHARED(ubuf,vxz,nx,inyi,ny,mz,ptr_callback, &
		!$OMP zero,h2d_stream)
		iny1 = (i1-1)*inyi+1
		write(6,*) "After parameter set",i1
		!!call flush(6)
		flush(6)
		!$OMP TARGET DATA USE_DEVICE_PTR(ubuf)
		ierr = cudaMemCpy2DAsync (C_LOC(ubuf(1,1,1,i2)), nx*inyi, &
		C_LOC(vxz(1,iny1,1)), nx*ny, &
		nx*inyi, mz, stream=h2d_stream)
		!$OMP END TARGET DATA
		ierr = cudaStreamAddCallback (h2d_stream, ptr_callback, C_LOC(h2d_event), zero)
		write(6,*) "After Add callback",i1
		!!call flush(6)
		flush(6)
		!$OMP END TASK
		temp1 = omp_get_wtime() - temp1
		write(6,*) "ubuf HtoD",i1,temp1
		!!call flush(6)
		flush(6)
		end if

		end do

		do ip=1,np
		!$OMP TASK DEPEND(IN:ubuf(:,:,:,ip))
		write(6,*) "Copy task complete, ip=",ip
		!!call flush(6)
		flush(6)
		!$OMP END TASK
		write(6,*) "After check task",ip
		end do

		!$OMP TASKWAIT

		!$OMP END SINGLE

		!$OMP END PARALLEL

		!$OMP END TARGET DATA

		deallocate (vxz, ubuf)

		write(6,*) "Finished"
		!!call flush(6)
		flush(6)

		end program

setUpModules.sh

0 → 100755

+9 −0

Original line number	Diff line number	Diff line
		#!/bin/bash

		module load xl/16.1.1-beta103
		#module load cuda/10.1.243
		module load cuda/11.0.3