Adding hip option (33270b99) · Commits · Berrill, Mark A / RayTrace-miniapp

build/configure

+4 −2

Original line number	Diff line number	Diff line
		@@ -9,11 +9,13 @@ cmake \
		-D CMAKE_CXX_COMPILER=mpic++ \
		-D CMAKE_CXX_STANDARD=11 \
		-D USE_OPENACC=0 \
		-D USE_OPENMP=1 \
		-D USE_OPENMP=0 \
		-D USE_KOKKOS=0 \
		-D KOKKOS_DIRECTORY=${KOKKOS_DIR} \
		-D KOKKOS_WRAPPER=${KOKKOS_DIR}/nvcc_wrapper \
		-D USE_CUDA=1 \
		-D USE_CUDA=0 \
		-D CUDA_FLAGS="-arch sm_30" \
		-D USE_HIP=1 \
		-D HIP_NVCC_OPTIONS="-arch sm_30" \
		../src

src/CMakeLists.txt

+34 −3

Original line number	Diff line number	Diff line
		@@ -137,6 +137,27 @@ IF ( USE_CUDA )
		ENABLE_LANGUAGE( CUDA )
		ENDIF()

		# Enable HIP
		CHECK_ENABLE_FLAG( USE_HIP 0 )
		IF ( USE_HIP )
		IF ( NOT DEFINED HIP_PATH )
		IF ( NOT DEFINED ENV{HIP_PATH} )
		SET( HIP_PATH "/opt/rocm/hip" CACHE PATH "Path to which HIP has been installed" )
		ELSE()
		SET( HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to which HIP has been installed" )
		ENDIF()
		ENDIF()
		SET( CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH} )
		FIND_PACKAGE( HIP REQUIRED )
		FIND_PACKAGE( CUDA QUIET )
		MESSAGE( "HIP Found")
		MESSAGE( " HIP version: ${HIP_VERSION_STRING}")
		MESSAGE( " HIP platform: ${HIP_PLATFORM}")
		MESSAGE( " HIP Include Path: ${HIP_INCLUDE_DIRS}")
		MESSAGE( " HIP Libraries: ${HIP_LIBRARIES}")
		ADD_DEFINITIONS( -DUSE_HIP )
		ENDIF()


		# Include some paths
		INCLUDE_DIRECTORIES( "${RAYTRACE_INSTALL_DIR}/include" )
		@@ -173,16 +194,26 @@ ENDIF()
		IF ( USE_KOKKOS )
		#SET( SOURCES ${SOURCES} RayTraceImageKokkos.cpp )
		ENDIF()
		IF (USE_HIP )
		SET( SOURCES ${SOURCES} RayTraceImageHIP.cpp )
		ENDIF()
		IF ( USE_CUDA )
		SET( SOURCES ${SOURCES} RayTraceImageCuda.cu )
		ENDIF()
		ADD_LIBRARY( RayTrace ${SOURCES} ${CUBINS} )
		ADD_DEPENDENCIES( RayTrace copy-include )
		ADD_LIBRARY( RayTrace SHARED ${SOURCES} ${CUBINS} )
		IF ( USE_HIP )
		SET( HIP_SEPERABLE_COMPILATION ON )
		SET_SOURCE_FILES_PROPERTIES( RayTraceImageHIP.hip PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1 )
		HIP_ADD_LIBRARY( RayTraceHIP RayTraceImageHIP.hip SHARED HIPCC_OPTIONS ${HIP_HIPCC_OPTIONS} HCC_OPTIONS ${HIP_HCC_OPTIONS} NVCC_OPTIONS ${HIP_NVCC_OPTIONS} ${HIP_NVCC_FLAGS} )
		TARGET_LINK_LIBRARIES( RayTraceHIP /opt/rocm-3.3.0/lib/libhip_hcc.so )
		TARGET_LINK_LIBRARIES( RayTrace RayTraceHIP )
		ADD_DEPENDENCIES( RayTraceHIP copy-include )
		ENDIF()


		# Add the applications
		ADD_EXECUTABLE( CreateImage CreateImage.cpp )
		TARGET_LINK_LIBRARIES( CreateImage RayTrace ${KOKKOS_LIB} ${TIMER_LIBS} ${LDFLAGS} ${LDLIBS} )
		TARGET_LINK_LIBRARIES( CreateImage RayTrace RayTraceHIP ${KOKKOS_LIB} ${TIMER_LIBS} ${LDFLAGS} ${LDLIBS} )
		INSTALL( TARGETS CreateImage DESTINATION ${RAYTRACE_INSTALL_DIR}/bin )

src/CreateImageHelpers.cpp

+5 −1

Original line number	Diff line number	Diff line
		@@ -6,6 +6,10 @@
		#include <thread>


		#if defined( USE_MPI )
		#include "mpi.h"
		#endif

		#ifdef USE_CUDA
		#include <cuda_runtime_api.h>
		#endif
		@@ -368,7 +372,7 @@ Options::Options( int argc, char *argv[] ) : iterations( 1 ), scale( 1.0 )
		" CreateImage <args> file.dat\n"
		"Optional arguments:\n"
		" -methods=METHODS Comma seperated list of methods to test\n"
		" cpu, threads, OpenMP, Cuda, Cuda-MultiGPU, OpenAcc, Kokkos-Serial, "
		" cpu, threads, OpenMP, Cuda, Cuda-MultiGPU, HIP, OpenAcc, Kokkos-Serial, "
		"Kokkos-Thread, Kokkos-OpenMP, Kokkos-Cuda\n"
		" all - run all availible tests (default)\n"
		" parallel - run all availible parallel tests\n"

src/RayTraceImage.cpp

+19 −0

Original line number	Diff line number	Diff line
		@@ -28,6 +28,10 @@
		#define ENABLE_CUDA
		#undef USE_CUDA
		#endif
		#ifdef USE_HIP
		#define ENABLE_HIP
		#undef USE_HIP
		#endif
		#ifdef USE_OPENMP
		#define ENABLE_OPENMP
		#undef USE_OPENMP
		@@ -66,6 +70,12 @@ extern void RayTraceImageCudaLoop( int N, const RayTrace::EUV_beam_struct &euv_b
		const std::vector<ray_struct> &rays, double scale, double image, double I_ang,
		unsigned int &failure_code, std::vector<ray_struct> &failed_rays );
		#endif
		#if defined( ENABLE_HIP )
		extern void RayTraceImageHIPLoop( int N, const RayTrace::EUV_beam_struct &euv_beam,
		const RayTrace::ray_gain_struct gain, const RayTrace::ray_seed_struct seed, int method,
		const std::vector<ray_struct> &rays, double scale, double image, double I_ang,
		unsigned int &failure_code, std::vector<ray_struct> &failed_rays );
		#endif


		/**********************************************************************
		@@ -300,6 +310,8 @@ void RayTrace::create_image( create_image_struct *info, std::string compute_meth
		compute_method = "openacc";
		#elif defined( ENABLE_CUDA )
		compute_method = "cuda";
		#elif defined( ENABLE_HIP )
		compute_method = "hip";
		#elif defined( ENABLE_OPENMP )
		compute_method = "openmp";
		#elif defined( ENABLE_KOKKOS )
		@@ -365,6 +377,13 @@ void RayTrace::create_image( create_image_struct *info, std::string compute_meth
		failure_code, failed_rays );
		#else
		RAY_ERROR( "Cuda-MultiGPU is not availible" );
		#endif
		} else if ( compute_method == "hip" ) {
		#if defined( ENABLE_HIP )
		RayTraceImageHIPLoop( N, std::ref( *info->euv_beam ), info->gain, info->seed, method, rays,
		scale, image, I_ang, failure_code, failed_rays );
		#else
		RAY_ERROR( "HIP is not availible" );
		#endif
		} else if ( compute_method == "cpu" ) {
		RayTraceImageCPULoop( N, std::ref( *info->euv_beam ), info->gain, info->seed, method, rays,

src/RayTraceImageHIP.hip

0 → 100644

+344 −0

Original line number	Diff line number	Diff line
		#include "RayTrace/RayTraceStructures.h"
		#include "RayTrace/utilities/RayUtilities.h"

		#define HCC_COMPILE
		#include "RayTrace/common/RayTraceImageHelper.h"

		#include "hip/hip_runtime.h"
		#include "cuda.h"


		// Check for errors
		#define CUDA_CHECK() \
		do { \
		if ( cudaPeekAtLastError() != cudaSuccess ) { \
		cudaError_t error = cudaGetLastError(); \
		printf("cuda error: %i\n",error); \
		printf(" %s\n",cudaGetErrorString(error)); \
		printf(" line: %i\n",(int)__LINE__); \
		printf(" file: %s\n",__FILE__); \
		exit(-1); \
		} \
		} while (0)
		#define HIP_CHECK() \
		do { \
		} while (0)


		#define CUDA_PRINT_FUNCTION( fun ) \
		do { \
		hipFuncAttributes attr; \
		hipFuncGetAttributes(&attr,fun); \
		printf("%s:\n",#fun); \
		printf(" version = %i\n",attr.binaryVersion); \
		printf(" ptx = %i\n",attr.ptxVersion); \
		printf(" constSize = %i\n",attr.constSizeBytes); \
		printf(" localSize = %i\n",attr.localSizeBytes); \
		printf(" sharedSize = %i\n",attr.sharedSizeBytes); \
		printf(" maxThreads = %i\n",attr.maxThreadsPerBlock); \
		printf(" numRegs = %i\n",attr.numRegs); \
		} while (0)


		// Atomic add operation for double
		#if defined( __CUDA_ARCH__ ) && __CUDA_ARCH__ >= 600
		#define atomicAdd2 atomicAdd
		#else
		__device__ double atomicAdd2(double* address, double val)
		{
		unsigned long long int* address_as_ull = (unsigned long long int*)address;
		unsigned long long int old = *address_as_ull, assumed;
		do {
		assumed = old;
		old = atomicCAS(address_as_ull, assumed,
		__double_as_longlong(val +
		__longlong_as_double(assumed)));
		} while (assumed != old);
		return __longlong_as_double(old);
		}
		#endif



		// Get the globally unique thread id
		__device__ int getGlobalIdx3D()
		{
		int blockId = blockIdx.x
		+ blockIdx.y * gridDim.x
		+ gridDim.x * gridDim.y * blockIdx.z;
		int threadId = blockId * (blockDim.x * blockDim.y * blockDim.z)
		+ (threadIdx.z * (blockDim.x * blockDim.y))
		+ (threadIdx.y * blockDim.x)
		+ threadIdx.x;
		return threadId;
		}


		// Get the index
		__device__ inline int getIndex( int n, const double *x, double dx, double y )
		{
		if ( y < x[0] - 0.5 * dx \|\| y > x[n - 1] + 0.5 * dx )
		return -1;
		return findfirstsingle( x, n, y - 0.5 * dx );
		}


		// Kernel that executes on the CUDA device
		__global__
		__launch_bounds__(128,8) // Set bounds to limit the number of registers
		void RayTraceImageHIPKernel( int N, int nx, int ny, int na, int nb, int nv,
		const double x, const double y, const double a, const double b,
		double dx, double dy, double dz, double da, double db, const double *dv,
		const RayTrace::ray_gain_struct* gain, const RayTrace::ray_seed_struct* seed,
		int method, int N_rays, const ray_struct *rays, double scale,
		double image, double I_ang )
		{
		int idx = getGlobalIdx3D();
		if ( idx<N_rays ) {
		const ray_struct ray = rays[idx];
		double Iv[K_MAX];
		ray_struct ray2;
		int error = RayTrace_calc_ray( ray, N, dz, gain, seed, nv, method, Iv, ray2 );
		if ( error!=0 ) {
		//failed_rays.push_back(ray);
		//set_bit(-error,failure_code);
		} else {
		if ( method == 1 ) {
		// We are propagating backward, use ray for the cell updates
		ray2 = ray;
		} else {
		// We are propagating forward, use ray2 for the cell updates
		// Note: The sign of the angle is reversed with respect to the euv_beam
		ray2.a = -ray2.a;
		ray2.b = -ray2.b;
		if ( ray2.y<0.0 && y[0]>=0.0 ) {
		// We need to change the sign of y
		ray2.y = -ray2.y;
		}
		}
		// Get the indicies to the cells in image and I_ang
		int i1 = getIndex( nx, x, dx, ray2.x );
		int i2 = getIndex( ny, y, dy, ray2.y );
		int i3 = getIndex( na, a, da, ray2.a );
		int i4 = getIndex( nb, b, db, ray2.b );
		// Copy I_out into image
		if (i1>=0 && i2>=0){
		double Iv2 = &image[nv(i1+i2*nx)];
		for (int iv=0; iv<nv; iv++)
		atomicAdd2(&Iv2[iv],Iv[iv]*scale);
		}
		// Copy I_out into I_ang
		if (i3>=0 && i4>=0) {
		double tmp = 0.0;
		for (int iv=0; iv<nv; iv++)
		tmp += 2.0dv[iv]Iv[iv];
		atomicAdd2(&I_ang[i3+i4*na],tmp);
		}
		}
		}
		}


		// Compute the block size to use
		inline dim3 calcBlockSize( size_t N_blocks )
		{
		dim3 block_size;
		if ( N_blocks < 65535 ) {
		block_size.x = N_blocks;
		} else {
		block_size.y = N_blocks/32768;
		block_size.x = N_blocks/block_size.y + (N_blocks%block_size.y == 0 ? 0:1);
		}
		return block_size;
		}


		// Create the image and call the cuda kernel
		void RayTraceImageHIPLoop( int N, const RayTrace::EUV_beam_struct& beam,
		const RayTrace::ray_gain_struct* gain_in, const RayTrace::ray_seed_struct* seed_in,
		int method, const std::vector<ray_struct>& rays, double scale,
		double image, double I_ang,
		unsigned int& failure_code, std::vector<ray_struct>& failed_rays )
		{
		failure_code = 0; // Need to track failures on GPU
		// Get device properties
		static int maxThreadsPerBlock = 128;
		/*if ( maxThreadsPerBlock == 0 ) {
		hipFuncAttributes attr;
		hipFuncGetAttributes(&attr,RayTraceImageHIPKernel);
		maxThreadsPerBlock = attr.maxThreadsPerBlock;
		//CUDA_PRINT_FUNCTION(RayTraceImageCudaKernel);
		RAY_ASSERT(maxThreadsPerBlock>0);
		}*/
		// place the ray gain and seed structures on the device
		const int nx = beam.nx;
		const int ny = beam.ny;
		const int na = beam.na;
		const int nb = beam.nb;
		const int nv = beam.nv;
		const double dx = beam.dx;
		const double dy = beam.dy;
		const double dz = beam.dz;
		const double da = beam.da;
		const double db = beam.db;
		const RayTrace::ray_gain_struct* gain = RayTrace::ray_gain_struct::copy_device( N, gain_in );
		const RayTrace::ray_seed_struct* seed = NULL;
		if ( seed_in!=NULL )
		seed = seed_in->copy_device();
		// Allocate device memory
		size_t N_rays = rays.size();
		double x2, y2, a2, b2, dv2, image2, *I_ang2;
		ray_struct *rays2;
		hipMalloc(&x2,nx*sizeof(double));
		hipMalloc(&y2,ny*sizeof(double));
		hipMalloc(&a2,na*sizeof(double));
		hipMalloc(&b2,nb*sizeof(double));
		hipMalloc(&dv2,nv*sizeof(double));
		hipMalloc(&image2,nxnynv*sizeof(double));
		hipMalloc(&I_ang2,nanbsizeof(double));
		hipMalloc(&rays2,N_rays*sizeof(ray_struct));
		hipMemcpy(x2,beam.x,nx*sizeof(double),hipMemcpyHostToDevice);
		hipMemcpy(y2,beam.y,ny*sizeof(double),hipMemcpyHostToDevice);
		hipMemcpy(a2,beam.a,na*sizeof(double),hipMemcpyHostToDevice);
		hipMemcpy(b2,beam.b,nb*sizeof(double),hipMemcpyHostToDevice);
		hipMemcpy(dv2,beam.dv,nv*sizeof(double),hipMemcpyHostToDevice);
		hipMemcpy(rays2,&rays[0],N_rays*sizeof(ray_struct),hipMemcpyHostToDevice);
		hipMemset(image2,0,nxnynv*sizeof(double));
		hipMemset(I_ang2,0,nanbsizeof(double));
		HIP_CHECK();
		// Do calculation on device:
		size_t threads = maxThreadsPerBlock;
		size_t N_blocks = N_rays/threads + (N_rays%threads == 0 ? 0:1);
		dim3 block_size = calcBlockSize(N_blocks);
		block_size.x = N_rays/threads + (N_rays%threads == 0 ? 0:1);
		hipLaunchKernelGGL( RayTraceImageHIPKernel, dim3(block_size), dim3(threads), 0, 0,
		N,nx,ny,na,nb,nv,x2,y2,a2,b2,
		dx,dy,dz,da,db,dv2,gain,seed,method,N_rays,rays2,scale,image2,I_ang2);
		HIP_CHECK();
		// Retrieve result from device and store it in host array
		hipMemcpy(image,image2,nxnynv*sizeof(double),hipMemcpyDeviceToHost);
		hipMemcpy(I_ang,I_ang2,nanbsizeof(double),hipMemcpyDeviceToHost);
		HIP_CHECK();
		// Cleanup
		hipFree(x2);
		hipFree(y2);
		hipFree(a2);
		hipFree(b2);
		hipFree(dv2);
		hipFree(rays2);
		hipFree(image2);
		hipFree(I_ang2);
		HIP_CHECK();
		RayTrace::ray_gain_struct::free_device( N, gain_in, gain );
		RayTrace::ray_seed_struct::free_device( seed_in, seed );
		}


		// Copy ray_gain_struct to GPU
		const RayTrace::ray_gain_struct* ray_gain_struct_copy_device_hip( size_t N, const RayTrace::ray_gain_struct* arr )
		{
		RayTrace::ray_gain_struct* host_ptr = new RayTrace::ray_gain_struct[N];
		for (size_t i=0; i<N; i++) {
		host_ptr[i].Nx = arr[i].Nx;
		host_ptr[i].Ny = arr[i].Ny;
		host_ptr[i].Nv = arr[i].Nv;
		host_ptr[i].E0 = nullptr;
		hipMalloc(&host_ptr[i].x,arr[i].Nx*sizeof(double));
		hipMalloc(&host_ptr[i].y,arr[i].Ny*sizeof(double));
		hipMalloc(&host_ptr[i].n,arr[i].Nxarr[i].Nysizeof(double));
		hipMalloc(&host_ptr[i].g0,arr[i].Nxarr[i].Nysizeof(float));
		hipMalloc(&host_ptr[i].E0,arr[i].Nxarr[i].Nysizeof(float));
		hipMalloc(&host_ptr[i].gv,arr[i].Nxarr[i].Nyarr[i].Nv*sizeof(float));
		hipMalloc(&host_ptr[i].gv0,arr[i].Nxarr[i].Nysizeof(float));
		hipMemcpy(host_ptr[i].x,arr[i].x,arr[i].Nx*sizeof(double),hipMemcpyHostToDevice);
		hipMemcpy(host_ptr[i].y,arr[i].y,arr[i].Ny*sizeof(double),hipMemcpyHostToDevice);
		hipMemcpy(host_ptr[i].n,arr[i].n,arr[i].Nxarr[i].Nysizeof(double),hipMemcpyHostToDevice);
		hipMemcpy(host_ptr[i].g0,arr[i].g0,arr[i].Nxarr[i].Nysizeof(float),hipMemcpyHostToDevice);
		if ( arr[i].E0 != nullptr )
		hipMemcpy(host_ptr[i].E0,arr[i].E0,arr[i].Nxarr[i].Nysizeof(float),hipMemcpyHostToDevice);
		hipMemcpy(host_ptr[i].gv,arr[i].gv,arr[i].Nxarr[i].Nyarr[i].Nv*sizeof(float),hipMemcpyHostToDevice);
		hipMemcpy(host_ptr[i].gv0,arr[i].gv0,arr[i].Nxarr[i].Nysizeof(float),hipMemcpyHostToDevice);
		}
		RayTrace::ray_gain_struct* dev_ptr=NULL;
		hipMalloc(&dev_ptr,N*sizeof(RayTrace::ray_gain_struct));
		hipMemcpy(dev_ptr,host_ptr,N*sizeof(RayTrace::ray_gain_struct),hipMemcpyHostToDevice);
		for (size_t i=0; i<N; i++) {
		host_ptr[i].x = NULL;
		host_ptr[i].y = NULL;
		host_ptr[i].n = NULL;
		host_ptr[i].g0 = NULL;
		host_ptr[i].E0 = NULL;
		host_ptr[i].gv = NULL;
		host_ptr[i].gv0 = NULL;
		}
		delete [] host_ptr;
		HIP_CHECK();
		return dev_ptr;
		}
		// Free ray_gain_struct from GPU
		void ray_gain_struct_free_device_cuda( size_t N, const RayTrace::ray_gain_struct* dev_ptr )
		{
		RayTrace::ray_gain_struct* host_ptr = new RayTrace::ray_gain_struct[N];
		hipMemcpy(host_ptr,dev_ptr,N*sizeof(RayTrace::ray_gain_struct),hipMemcpyDeviceToHost);
		for (size_t i=0; i<N; i++) {
		hipFree(host_ptr[i].x);
		hipFree(host_ptr[i].y);
		hipFree(host_ptr[i].n);
		hipFree(host_ptr[i].g0);
		hipFree(host_ptr[i].E0);
		hipFree(host_ptr[i].gv);
		hipFree(host_ptr[i].gv0);
		host_ptr[i].x = NULL;
		host_ptr[i].y = NULL;
		host_ptr[i].n = NULL;
		host_ptr[i].g0 = NULL;
		host_ptr[i].E0 = NULL;
		host_ptr[i].gv = NULL;
		host_ptr[i].gv0 = NULL;
		}
		hipFree((void*)dev_ptr);
		delete [] host_ptr;
		HIP_CHECK();
		}


		// Copy ray_seed_struct to GPU
		const RayTrace::ray_seed_struct* ray_seed_struct_copy_device_hip( const RayTrace::ray_seed_struct& seed )
		{
		RayTrace::ray_seed_struct* host_ptr = new RayTrace::ray_seed_struct();
		host_ptr->f0 = seed.f0;
		for (size_t i=0; i<5; i++) {
		host_ptr->dim[i] = seed.dim[i];
		hipMalloc(&host_ptr->x[i],seed.dim[i]*sizeof(double));
		hipMalloc(&host_ptr->f[i],seed.dim[i]*sizeof(double));
		hipMemcpy(host_ptr->x[i],seed.x[i],seed.dim[i]*sizeof(double),hipMemcpyHostToDevice);
		hipMemcpy(host_ptr->f[i],seed.f[i],seed.dim[i]*sizeof(double),hipMemcpyHostToDevice);
		}
		RayTrace::ray_seed_struct* dev_ptr=NULL;
		hipMalloc(&dev_ptr,sizeof(RayTrace::ray_seed_struct));
		hipMemcpy(dev_ptr,host_ptr,sizeof(RayTrace::ray_seed_struct),hipMemcpyHostToDevice);
		for (size_t i=0; i<5; i++) {
		host_ptr->x[i] = NULL;
		host_ptr->f[i] = NULL;
		}
		delete host_ptr;
		HIP_CHECK();
		return dev_ptr;
		}
		// Free ray_seed_struct from GPU
		void ray_seed_struct_free_device_hip( const RayTrace::ray_seed_struct* dev_ptr )
		{
		RayTrace::ray_seed_struct* host_ptr = new RayTrace::ray_seed_struct;
		hipMemcpy(host_ptr,dev_ptr,sizeof(RayTrace::ray_seed_struct),hipMemcpyDeviceToHost);
		for (size_t i=0; i<5; i++) {
		hipFree(host_ptr->x[i]);
		hipFree(host_ptr->f[i]);
		host_ptr->x[i] = NULL;
		host_ptr->f[i] = NULL;
		}
		hipFree((void*)dev_ptr);
		delete host_ptr;
		HIP_CHECK();
		}