On branch gcc_openmp (e4fe3d53) · Commits · OpenMP-Offloading / CudaStreamAddCallback With Openmp Detach

Makefile

+1 −1

Original line number	Diff line number	Diff line
		@@ -14,7 +14,7 @@ OPENMP_FLAG=-qsmp=omp -qoffload -I$(OLCF_CUDA_ROOT)/include
		OPT_FLAGS=-O3
		#OPT_FLAGS=-Ofast

		LDFLAGS=-L$(OLCF_CUDA_ROOT)/lib64 #-lcudart
		LDFLAGS=-L$(OLCF_CUDA_ROOT)/lib64 -lcudart

		FDEFINES=

Makefile_gcc

0 → 100644

+37 −0

Original line number	Diff line number	Diff line
		# ---- Toolchain ------------------------------------------------------
		# Fortran compiler driver; the commented lines are alternatives kept
		# for reference.
		FC := gfortran
		#FC := mpif90
		#FC := xlcuf

		# Link driver (same choices as the compiler).
		LD := gfortran
		#LD := mpif90
		#LD := xlcuf

		# ---- Flags ----------------------------------------------------------
		# Source-form / preprocessing options handed to every compile.
		FFLAGS := -ffree-form -cpp

		# OpenMP and offload options, plus the CUDA include directory taken
		# from $(OLCF_CUDA_ROOT). Used for both compiling and linking.
		OPENMP_FLAG := -fopenmp -foffload="-lm -latomic" -I$(OLCF_CUDA_ROOT)/include

		# Optimisation level; -Ofast is kept as a commented alternative.
		OPT_FLAGS := -O3
		#OPT_FLAGS := -Ofast

		# Link against the CUDA runtime found under $(OLCF_CUDA_ROOT)/lib64.
		LDFLAGS := -L$(OLCF_CUDA_ROOT)/lib64 -lcudart #-lnvToolsExt

		# Optional extra definitions; empty by default.
		FDEFINES :=

		# ---- Build products -------------------------------------------------
		# NOTE(review): the README lists main_cudaStreamAddCallback_gcc.F90 as
		# the GCC test source, but this object name selects
		# main_cudaStreamAddCallback.F90 -- confirm which one is intended.
		OBJS := main_cudaStreamAddCallback.o
		EXE := cudaStreamAddCallback_detach_gcc.x

		# Compile a .F90 source into an object file.
		# $(FDEFINES) is passed here so that definitions supplied by the user
		# (e.g. `make -f Makefile_gcc FDEFINES=-DFOO`) actually reach the
		# compiler; it is empty by default, so the default build is unchanged.
		%.o: %.F90
			$(FC) -c $< $(OPT_FLAGS) $(FFLAGS) $(CPPFLAGS) $(FDEFINES) $(OPENMP_FLAG) -o $@

		# Link the executable from all of its object prerequisites ($^).
		# $(OPENMP_FLAG) is repeated at link time, as in the compile step.
		$(EXE): $(OBJS)
			$(LD) $^ $(OPT_FLAGS) $(OPENMP_FLAG) $(LDFLAGS) -o $@

		# clean and clobber are commands, not files: declare them phony so a
		# stray file named "clean" or "clobber" cannot disable them.
		.PHONY: clean clobber

		# Remove build products: objects, module files, .s files and
		# executables (*.x). The shell globs are required; without the
		# asterisks rm would only look for files literally named ".o", ".mod", ...
		clean:
			rm -f *.o *.mod *.s *.x

		# Same as clean, and additionally removes editor backup files (*~).
		clobber:
			rm -f *.o *.mod *.x *.s *~

README.md

+13 −8

Original line number	Diff line number	Diff line
		Simple code to test OpenMP Detach functionality using cudaMemCpy2DAsync for asynchronous copies and cudaStreamAddCallback to perform the callback which fulfills the OpenMP detached event.
		Simple code to test OpenMP Detach functionality using cudaMemcpy2DAsync for asynchronous copies and cudaStreamAddCallback to perform the callback which fulfills the OpenMP detached event. This branch can build with XLF or with the pre-production GCC that supports OpenMP offloading. To use the pre-production version of GCC, the code doesn't use "cudafor" and includes a few more Fortran interfaces for CUDA functionality.

		setUpModules.sh : loads appropriate modules
		IBM XLF versions
		setUpModules.sh : loads appropriate modules for IBM XLF
		build.sh : sources setUpModules and builds executable for IBM XLF
		batch_cudaStreamAddCallback.sh : submission script for IBM systems at OLCF for IBM XLF
		Makefile : makefile to build with IBM compilers for IBM XLF
		main_cudaStreamAddCallback.F90 : test code for IBM XLF

		build.sh : sources setUpModules and builds executable

		batch_cudaStreamAddCallback.sh : submission script for IBM systems at OLCF

		Makefile : makefile to build with IBM compilers

		main_cudaStreamAddCallback.F90 : test code
		GCC Versions
		setUpModules_gcc.sh : loads appropriate modules for GCC gfortran
		build_gcc.sh : sources setUpModules_gcc.sh and builds executable for GCC gfortran
		batch_cudaStreamAddCallback_gcc.sh : submission script for IBM systems at OLCF for GCC gfortran
		Makefile_gcc : makefile to build with GCC gfortran
		main_cudaStreamAddCallback_gcc.F90 : test code for GCC gfortran

batch_cudaStreamAddCallback_gcc.sh

0 → 100755

+31 −0

Original line number	Diff line number	Diff line
		#!/bin/bash
		# Batch submission script: loads the GCC module environment from
		# setUpModules_gcc.sh, prints the loaded modules, and launches
		# cudaStreamAddCallback_detach_gcc.x through jsrun.

		#BSUB -P STF006
		#BSUB -J StreamAddCallback_Detach_Test
		#BSUB -o StreamAddCallback_Detach_Test
		#BSUB -e StreamAddCallback.err
		#BSUB -W 0:05
		#BSUB -nnodes 1
		#BSUB -alloc_flags smt4

		# Set up the module environment for the GCC build (path is relative to
		# the directory the job runs in).
		source ./setUpModules_gcc.sh

		# Record the loaded modules in the job output.
		module list

		# jsrun options used below:
		# -n : number of resource sets
		# -a : number of MPI ranks per resource set
		# -c : number of CPUs/cores per resource set
		# -r : number of resource sets per host
		# -g : number of GPUs per resource set
		# -b : binding of tasks (not sure how this really works...)
		# -l : latency priority
		# -d : how tasks are started on resource sets
		# -E : OMP_NUM_THREADS=4 allows up to four threads per MPI rank
		# -E : OMP_NUM_THREADS=168 allows up to 168 threads per MPI rank
		### NOTE: I think that you can also "export OMP_NUM_THREADS=***" before the jsrun cmd and exclude it from
		### the jsrun cmd line

		# 1 MPI rank and 4 threads per MPI rank
		jsrun -n1 -a1 -c1 -r1 -g1 -b packed:1 -l cpu-cpu -d packed -E OMP_NUM_THREADS=4 ./cudaStreamAddCallback_detach_gcc.x

build_gcc.sh

0 → 100755

+10 −0

Original line number	Diff line number	Diff line
		#!/bin/bash
		# Build helper for the GCC/gfortran variant: loads the module
		# environment, rebuilds from scratch with Makefile_gcc, then lists the
		# shared libraries the resulting executable links against.

		# Use an explicit ./ so the local setup script is sourced rather than
		# whatever a PATH search might find first.
		source ./setUpModules_gcc.sh
		module list

		# Remove previous build products, then build. Stop on a failed build so
		# ldd is not run against a missing or stale executable.
		make -f Makefile_gcc clobber
		make -f Makefile_gcc || exit 1

		ldd cudaStreamAddCallback_detach_gcc.x