Loading build/configure +4 −2 Original line number Diff line number Diff line Loading @@ -9,11 +9,13 @@ cmake \ -D CMAKE_CXX_COMPILER=mpic++ \ -D CMAKE_CXX_STANDARD=11 \ -D USE_OPENACC=0 \ -D USE_OPENMP=1 \ -D USE_OPENMP=0 \ -D USE_KOKKOS=0 \ -D KOKKOS_DIRECTORY=${KOKKOS_DIR} \ -D KOKKOS_WRAPPER=${KOKKOS_DIR}/nvcc_wrapper \ -D USE_CUDA=1 \ -D USE_CUDA=0 \ -D CUDA_FLAGS="-arch sm_30" \ -D USE_HIP=1 \ -D HIP_NVCC_OPTIONS="-arch sm_30" \ ../src src/CMakeLists.txt +34 −3 Original line number Diff line number Diff line Loading @@ -137,6 +137,27 @@ IF ( USE_CUDA ) ENABLE_LANGUAGE( CUDA ) ENDIF() # Enable HIP CHECK_ENABLE_FLAG( USE_HIP 0 ) IF ( USE_HIP ) IF ( NOT DEFINED HIP_PATH ) IF ( NOT DEFINED ENV{HIP_PATH} ) SET( HIP_PATH "/opt/rocm/hip" CACHE PATH "Path to which HIP has been installed" ) ELSE() SET( HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to which HIP has been installed" ) ENDIF() ENDIF() SET( CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH} ) FIND_PACKAGE( HIP REQUIRED ) FIND_PACKAGE( CUDA QUIET ) MESSAGE( "HIP Found") MESSAGE( " HIP version: ${HIP_VERSION_STRING}") MESSAGE( " HIP platform: ${HIP_PLATFORM}") MESSAGE( " HIP Include Path: ${HIP_INCLUDE_DIRS}") MESSAGE( " HIP Libraries: ${HIP_LIBRARIES}") ADD_DEFINITIONS( -DUSE_HIP ) ENDIF() # Include some paths INCLUDE_DIRECTORIES( "${RAYTRACE_INSTALL_DIR}/include" ) Loading Loading @@ -173,16 +194,26 @@ ENDIF() IF ( USE_KOKKOS ) #SET( SOURCES ${SOURCES} RayTraceImageKokkos.cpp ) ENDIF() IF (USE_HIP ) SET( SOURCES ${SOURCES} RayTraceImageHIP.cpp ) ENDIF() IF ( USE_CUDA ) SET( SOURCES ${SOURCES} RayTraceImageCuda.cu ) ENDIF() ADD_LIBRARY( RayTrace ${SOURCES} ${CUBINS} ) ADD_DEPENDENCIES( RayTrace copy-include ) ADD_LIBRARY( RayTrace SHARED ${SOURCES} ${CUBINS} ) IF ( USE_HIP ) SET( HIP_SEPERABLE_COMPILATION ON ) SET_SOURCE_FILES_PROPERTIES( RayTraceImageHIP.hip PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1 ) HIP_ADD_LIBRARY( RayTraceHIP RayTraceImageHIP.hip SHARED HIPCC_OPTIONS ${HIP_HIPCC_OPTIONS} HCC_OPTIONS ${HIP_HCC_OPTIONS} NVCC_OPTIONS ${HIP_NVCC_OPTIONS} ${HIP_NVCC_FLAGS} ) TARGET_LINK_LIBRARIES( RayTraceHIP /opt/rocm-3.3.0/lib/libhip_hcc.so ) TARGET_LINK_LIBRARIES( RayTrace RayTraceHIP ) ADD_DEPENDENCIES( RayTraceHIP copy-include ) ENDIF() # Add the applications ADD_EXECUTABLE( CreateImage CreateImage.cpp ) TARGET_LINK_LIBRARIES( CreateImage RayTrace ${KOKKOS_LIB} ${TIMER_LIBS} ${LDFLAGS} ${LDLIBS} ) TARGET_LINK_LIBRARIES( CreateImage RayTrace RayTraceHIP ${KOKKOS_LIB} ${TIMER_LIBS} ${LDFLAGS} ${LDLIBS} ) INSTALL( TARGETS CreateImage DESTINATION ${RAYTRACE_INSTALL_DIR}/bin ) src/CreateImageHelpers.cpp +5 −1 Original line number Diff line number Diff line Loading @@ -6,6 +6,10 @@ #include <thread> #if defined( USE_MPI ) #include "mpi.h" #endif #ifdef USE_CUDA #include <cuda_runtime_api.h> #endif Loading Loading @@ -368,7 +372,7 @@ Options::Options( int argc, char *argv[] ) : iterations( 1 ), scale( 1.0 ) " CreateImage <args> file.dat\n" "Optional arguments:\n" " -methods=METHODS Comma seperated list of methods to test\n" " cpu, threads, OpenMP, Cuda, Cuda-MultiGPU, OpenAcc, Kokkos-Serial, " " cpu, threads, OpenMP, Cuda, Cuda-MultiGPU, HIP, OpenAcc, Kokkos-Serial, " "Kokkos-Thread, Kokkos-OpenMP, Kokkos-Cuda\n" " all - run all availible tests (default)\n" " parallel - run all availible parallel tests\n" Loading src/RayTraceImage.cpp +19 −0 Original line number Diff line number Diff line Loading @@ -28,6 +28,10 @@ #define ENABLE_CUDA #undef USE_CUDA #endif #ifdef USE_HIP #define ENABLE_HIP #undef USE_HIP #endif #ifdef USE_OPENMP #define ENABLE_OPENMP #undef USE_OPENMP Loading Loading @@ -66,6 +70,12 @@ extern void RayTraceImageCudaLoop( int N, const RayTrace::EUV_beam_struct &euv_b const std::vector<ray_struct> &rays, double scale, double *image, double *I_ang, unsigned int &failure_code, std::vector<ray_struct> &failed_rays ); #endif #if defined( ENABLE_HIP ) extern void RayTraceImageHIPLoop( int N, const RayTrace::EUV_beam_struct &euv_beam, const RayTrace::ray_gain_struct *gain, const RayTrace::ray_seed_struct *seed, int method, const std::vector<ray_struct> &rays, double scale, double *image, double *I_ang, unsigned int &failure_code, std::vector<ray_struct> &failed_rays ); #endif /********************************************************************** Loading Loading @@ -300,6 +310,8 @@ void RayTrace::create_image( create_image_struct *info, std::string compute_meth compute_method = "openacc"; #elif defined( ENABLE_CUDA ) compute_method = "cuda"; #elif defined( ENABLE_HIP ) compute_method = "hip"; #elif defined( ENABLE_OPENMP ) compute_method = "openmp"; #elif defined( ENABLE_KOKKOS ) Loading Loading @@ -365,6 +377,13 @@ void RayTrace::create_image( create_image_struct *info, std::string compute_meth failure_code, failed_rays ); #else RAY_ERROR( "Cuda-MultiGPU is not availible" ); #endif } else if ( compute_method == "hip" ) { #if defined( ENABLE_HIP ) RayTraceImageHIPLoop( N, std::ref( *info->euv_beam ), info->gain, info->seed, method, rays, scale, image, I_ang, failure_code, failed_rays ); #else RAY_ERROR( "HIP is not availible" ); #endif } else if ( compute_method == "cpu" ) { RayTraceImageCPULoop( N, std::ref( *info->euv_beam ), info->gain, info->seed, method, rays, Loading src/RayTraceImageHIP.hip 0 → 100644 +344 −0 Original line number Diff line number Diff line #include "RayTrace/RayTraceStructures.h" #include "RayTrace/utilities/RayUtilities.h" #define HCC_COMPILE #include "RayTrace/common/RayTraceImageHelper.h" #include "hip/hip_runtime.h" #include "cuda.h" // Check for errors #define CUDA_CHECK() \ do { \ if ( cudaPeekAtLastError() != cudaSuccess ) { \ cudaError_t error = cudaGetLastError(); \ printf("cuda error: %i\n",error); \ printf(" %s\n",cudaGetErrorString(error)); \ printf(" line: %i\n",(int)__LINE__); \ printf(" file: %s\n",__FILE__); \ exit(-1); \ } \ } while (0) #define HIP_CHECK() \ do { \ } while (0) #define CUDA_PRINT_FUNCTION( fun ) \ do { \ hipFuncAttributes attr; \ hipFuncGetAttributes(&attr,fun); \ printf("%s:\n",#fun); \ printf(" version = %i\n",attr.binaryVersion); \ printf(" ptx = %i\n",attr.ptxVersion); \ printf(" constSize = %i\n",attr.constSizeBytes); \ printf(" localSize = %i\n",attr.localSizeBytes); \ printf(" sharedSize = %i\n",attr.sharedSizeBytes); \ printf(" maxThreads = %i\n",attr.maxThreadsPerBlock); \ printf(" numRegs = %i\n",attr.numRegs); \ } while (0) // Atomic add operation for double #if defined( __CUDA_ARCH__ ) && __CUDA_ARCH__ >= 600 #define atomicAdd2 atomicAdd #else __device__ double atomicAdd2(double* address, double val) { unsigned long long int* address_as_ull = (unsigned long long int*)address; unsigned long long int old = *address_as_ull, assumed; do { assumed = old; old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed))); } while (assumed != old); return __longlong_as_double(old); } #endif // Get the globally unique thread id __device__ int getGlobalIdx3D() { int blockId = blockIdx.x + blockIdx.y * gridDim.x + gridDim.x * gridDim.y * blockIdx.z; int threadId = blockId * (blockDim.x * blockDim.y * blockDim.z) + (threadIdx.z * (blockDim.x * blockDim.y)) + (threadIdx.y * blockDim.x) + threadIdx.x; return threadId; } // Get the index __device__ inline int getIndex( int n, const double *x, double dx, double y ) { if ( y < x[0] - 0.5 * dx || y > x[n - 1] + 0.5 * dx ) return -1; return findfirstsingle( x, n, y - 0.5 * dx ); } // Kernel that executes on the CUDA device __global__ __launch_bounds__(128,8) // Set bounds to limit the number of registers void RayTraceImageHIPKernel( int N, int nx, int ny, int na, int nb, int nv, const double *x, const double *y, const double *a, const double *b, double dx, double dy, double dz, double da, double db, const double *dv, const RayTrace::ray_gain_struct* gain, const RayTrace::ray_seed_struct* seed, int method, int N_rays, const ray_struct *rays, double scale, double *image, double *I_ang ) { int idx = getGlobalIdx3D(); if ( idx<N_rays ) { const ray_struct ray = rays[idx]; double Iv[K_MAX]; ray_struct ray2; int error = RayTrace_calc_ray( ray, N, dz, gain, seed, nv, method, Iv, ray2 ); if ( error!=0 ) { //failed_rays.push_back(ray); //set_bit(-error,failure_code); } else { if ( method == 1 ) { // We are propagating backward, use ray for the cell updates ray2 = ray; } else { // We are propagating forward, use ray2 for the cell updates // Note: The sign of the angle is reversed with respect to the euv_beam ray2.a = -ray2.a; ray2.b = -ray2.b; if ( ray2.y<0.0 && y[0]>=0.0 ) { // We need to change the sign of y ray2.y = -ray2.y; } } // Get the indicies to the cells in image and I_ang int i1 = getIndex( nx, x, dx, ray2.x ); int i2 = getIndex( ny, y, dy, ray2.y ); int i3 = getIndex( na, a, da, ray2.a ); int i4 = getIndex( nb, b, db, ray2.b ); // Copy I_out into image if (i1>=0 && i2>=0){ double *Iv2 = &image[nv*(i1+i2*nx)]; for (int iv=0; iv<nv; iv++) atomicAdd2(&Iv2[iv],Iv[iv]*scale); } // Copy I_out into I_ang if (i3>=0 && i4>=0) { double tmp = 0.0; for (int iv=0; iv<nv; iv++) tmp += 2.0*dv[iv]*Iv[iv]; atomicAdd2(&I_ang[i3+i4*na],tmp); } } } } // Compute the block size to use inline dim3 calcBlockSize( size_t N_blocks ) { dim3 block_size; if ( N_blocks < 65535 ) { block_size.x = N_blocks; } else { block_size.y = N_blocks/32768; block_size.x = N_blocks/block_size.y + (N_blocks%block_size.y == 0 ? 0:1); } return block_size; } // Create the image and call the cuda kernel void RayTraceImageHIPLoop( int N, const RayTrace::EUV_beam_struct& beam, const RayTrace::ray_gain_struct* gain_in, const RayTrace::ray_seed_struct* seed_in, int method, const std::vector<ray_struct>& rays, double scale, double *image, double *I_ang, unsigned int& failure_code, std::vector<ray_struct>& failed_rays ) { failure_code = 0; // Need to track failures on GPU // Get device properties static int maxThreadsPerBlock = 128; /*if ( maxThreadsPerBlock == 0 ) { hipFuncAttributes attr; hipFuncGetAttributes(&attr,RayTraceImageHIPKernel); maxThreadsPerBlock = attr.maxThreadsPerBlock; //CUDA_PRINT_FUNCTION(RayTraceImageCudaKernel); RAY_ASSERT(maxThreadsPerBlock>0); }*/ // place the ray gain and seed structures on the device const int nx = beam.nx; const int ny = beam.ny; const int na = beam.na; const int nb = beam.nb; const int nv = beam.nv; const double dx = beam.dx; const double dy = beam.dy; const double dz = beam.dz; const double da = beam.da; const double db = beam.db; const RayTrace::ray_gain_struct* gain = RayTrace::ray_gain_struct::copy_device( N, gain_in ); const RayTrace::ray_seed_struct* seed = NULL; if ( seed_in!=NULL ) seed = seed_in->copy_device(); // Allocate device memory size_t N_rays = rays.size(); double *x2, *y2, *a2, *b2, *dv2, *image2, *I_ang2; ray_struct *rays2; hipMalloc(&x2,nx*sizeof(double)); hipMalloc(&y2,ny*sizeof(double)); hipMalloc(&a2,na*sizeof(double)); hipMalloc(&b2,nb*sizeof(double)); hipMalloc(&dv2,nv*sizeof(double)); hipMalloc(&image2,nx*ny*nv*sizeof(double)); hipMalloc(&I_ang2,na*nb*sizeof(double)); hipMalloc(&rays2,N_rays*sizeof(ray_struct)); hipMemcpy(x2,beam.x,nx*sizeof(double),hipMemcpyHostToDevice); hipMemcpy(y2,beam.y,ny*sizeof(double),hipMemcpyHostToDevice); hipMemcpy(a2,beam.a,na*sizeof(double),hipMemcpyHostToDevice); hipMemcpy(b2,beam.b,nb*sizeof(double),hipMemcpyHostToDevice); hipMemcpy(dv2,beam.dv,nv*sizeof(double),hipMemcpyHostToDevice); hipMemcpy(rays2,&rays[0],N_rays*sizeof(ray_struct),hipMemcpyHostToDevice); hipMemset(image2,0,nx*ny*nv*sizeof(double)); hipMemset(I_ang2,0,na*nb*sizeof(double)); HIP_CHECK(); // Do calculation on device: size_t threads = maxThreadsPerBlock; size_t N_blocks = N_rays/threads + (N_rays%threads == 0 ? 0:1); dim3 block_size = calcBlockSize(N_blocks); block_size.x = N_rays/threads + (N_rays%threads == 0 ? 0:1); hipLaunchKernelGGL( RayTraceImageHIPKernel, dim3(block_size), dim3(threads), 0, 0, N,nx,ny,na,nb,nv,x2,y2,a2,b2, dx,dy,dz,da,db,dv2,gain,seed,method,N_rays,rays2,scale,image2,I_ang2); HIP_CHECK(); // Retrieve result from device and store it in host array hipMemcpy(image,image2,nx*ny*nv*sizeof(double),hipMemcpyDeviceToHost); hipMemcpy(I_ang,I_ang2,na*nb*sizeof(double),hipMemcpyDeviceToHost); HIP_CHECK(); // Cleanup hipFree(x2); hipFree(y2); hipFree(a2); hipFree(b2); hipFree(dv2); hipFree(rays2); hipFree(image2); hipFree(I_ang2); HIP_CHECK(); RayTrace::ray_gain_struct::free_device( N, gain_in, gain ); RayTrace::ray_seed_struct::free_device( seed_in, seed ); } // Copy ray_gain_struct to GPU const RayTrace::ray_gain_struct* ray_gain_struct_copy_device_hip( size_t N, const RayTrace::ray_gain_struct* arr ) { RayTrace::ray_gain_struct* host_ptr = new RayTrace::ray_gain_struct[N]; for (size_t i=0; i<N; i++) { host_ptr[i].Nx = arr[i].Nx; host_ptr[i].Ny = arr[i].Ny; host_ptr[i].Nv = arr[i].Nv; host_ptr[i].E0 = nullptr; hipMalloc(&host_ptr[i].x,arr[i].Nx*sizeof(double)); hipMalloc(&host_ptr[i].y,arr[i].Ny*sizeof(double)); hipMalloc(&host_ptr[i].n,arr[i].Nx*arr[i].Ny*sizeof(double)); hipMalloc(&host_ptr[i].g0,arr[i].Nx*arr[i].Ny*sizeof(float)); hipMalloc(&host_ptr[i].E0,arr[i].Nx*arr[i].Ny*sizeof(float)); hipMalloc(&host_ptr[i].gv,arr[i].Nx*arr[i].Ny*arr[i].Nv*sizeof(float)); hipMalloc(&host_ptr[i].gv0,arr[i].Nx*arr[i].Ny*sizeof(float)); hipMemcpy(host_ptr[i].x,arr[i].x,arr[i].Nx*sizeof(double),hipMemcpyHostToDevice); hipMemcpy(host_ptr[i].y,arr[i].y,arr[i].Ny*sizeof(double),hipMemcpyHostToDevice); hipMemcpy(host_ptr[i].n,arr[i].n,arr[i].Nx*arr[i].Ny*sizeof(double),hipMemcpyHostToDevice); hipMemcpy(host_ptr[i].g0,arr[i].g0,arr[i].Nx*arr[i].Ny*sizeof(float),hipMemcpyHostToDevice); if ( arr[i].E0 != nullptr ) hipMemcpy(host_ptr[i].E0,arr[i].E0,arr[i].Nx*arr[i].Ny*sizeof(float),hipMemcpyHostToDevice); hipMemcpy(host_ptr[i].gv,arr[i].gv,arr[i].Nx*arr[i].Ny*arr[i].Nv*sizeof(float),hipMemcpyHostToDevice); hipMemcpy(host_ptr[i].gv0,arr[i].gv0,arr[i].Nx*arr[i].Ny*sizeof(float),hipMemcpyHostToDevice); } RayTrace::ray_gain_struct* dev_ptr=NULL; hipMalloc(&dev_ptr,N*sizeof(RayTrace::ray_gain_struct)); hipMemcpy(dev_ptr,host_ptr,N*sizeof(RayTrace::ray_gain_struct),hipMemcpyHostToDevice); for (size_t i=0; i<N; i++) { host_ptr[i].x = NULL; host_ptr[i].y = NULL; host_ptr[i].n = NULL; host_ptr[i].g0 = NULL; host_ptr[i].E0 = NULL; host_ptr[i].gv = NULL; host_ptr[i].gv0 = NULL; } delete [] host_ptr; HIP_CHECK(); return dev_ptr; } // Free ray_gain_struct from GPU void ray_gain_struct_free_device_cuda( size_t N, const RayTrace::ray_gain_struct* dev_ptr ) { RayTrace::ray_gain_struct* host_ptr = new RayTrace::ray_gain_struct[N]; hipMemcpy(host_ptr,dev_ptr,N*sizeof(RayTrace::ray_gain_struct),hipMemcpyDeviceToHost); for (size_t i=0; i<N; i++) { hipFree(host_ptr[i].x); hipFree(host_ptr[i].y); hipFree(host_ptr[i].n); hipFree(host_ptr[i].g0); hipFree(host_ptr[i].E0); hipFree(host_ptr[i].gv); hipFree(host_ptr[i].gv0); host_ptr[i].x = NULL; host_ptr[i].y = NULL; host_ptr[i].n = NULL; host_ptr[i].g0 = NULL; host_ptr[i].E0 = NULL; host_ptr[i].gv = NULL; host_ptr[i].gv0 = NULL; } hipFree((void*)dev_ptr); delete [] host_ptr; HIP_CHECK(); } // Copy ray_seed_struct to GPU const RayTrace::ray_seed_struct* ray_seed_struct_copy_device_hip( const RayTrace::ray_seed_struct& seed ) { RayTrace::ray_seed_struct* host_ptr = new RayTrace::ray_seed_struct(); host_ptr->f0 = seed.f0; for (size_t i=0; i<5; i++) { host_ptr->dim[i] = seed.dim[i]; hipMalloc(&host_ptr->x[i],seed.dim[i]*sizeof(double)); hipMalloc(&host_ptr->f[i],seed.dim[i]*sizeof(double)); hipMemcpy(host_ptr->x[i],seed.x[i],seed.dim[i]*sizeof(double),hipMemcpyHostToDevice); hipMemcpy(host_ptr->f[i],seed.f[i],seed.dim[i]*sizeof(double),hipMemcpyHostToDevice); } RayTrace::ray_seed_struct* dev_ptr=NULL; hipMalloc(&dev_ptr,sizeof(RayTrace::ray_seed_struct)); hipMemcpy(dev_ptr,host_ptr,sizeof(RayTrace::ray_seed_struct),hipMemcpyHostToDevice); for (size_t i=0; i<5; i++) { host_ptr->x[i] = NULL; host_ptr->f[i] = NULL; } delete host_ptr; HIP_CHECK(); return dev_ptr; } // Free ray_seed_struct from GPU void ray_seed_struct_free_device_hip( const RayTrace::ray_seed_struct* dev_ptr ) { RayTrace::ray_seed_struct* host_ptr = new RayTrace::ray_seed_struct; hipMemcpy(host_ptr,dev_ptr,sizeof(RayTrace::ray_seed_struct),hipMemcpyDeviceToHost); for (size_t i=0; i<5; i++) { hipFree(host_ptr->x[i]); hipFree(host_ptr->f[i]); host_ptr->x[i] = NULL; host_ptr->f[i] = NULL; } hipFree((void*)dev_ptr); delete host_ptr; HIP_CHECK(); } Loading
build/configure +4 −2 Original line number Diff line number Diff line Loading @@ -9,11 +9,13 @@ cmake \ -D CMAKE_CXX_COMPILER=mpic++ \ -D CMAKE_CXX_STANDARD=11 \ -D USE_OPENACC=0 \ -D USE_OPENMP=1 \ -D USE_OPENMP=0 \ -D USE_KOKKOS=0 \ -D KOKKOS_DIRECTORY=${KOKKOS_DIR} \ -D KOKKOS_WRAPPER=${KOKKOS_DIR}/nvcc_wrapper \ -D USE_CUDA=1 \ -D USE_CUDA=0 \ -D CUDA_FLAGS="-arch sm_30" \ -D USE_HIP=1 \ -D HIP_NVCC_OPTIONS="-arch sm_30" \ ../src
src/CMakeLists.txt +34 −3 Original line number Diff line number Diff line Loading @@ -137,6 +137,27 @@ IF ( USE_CUDA ) ENABLE_LANGUAGE( CUDA ) ENDIF() # Enable HIP CHECK_ENABLE_FLAG( USE_HIP 0 ) IF ( USE_HIP ) IF ( NOT DEFINED HIP_PATH ) IF ( NOT DEFINED ENV{HIP_PATH} ) SET( HIP_PATH "/opt/rocm/hip" CACHE PATH "Path to which HIP has been installed" ) ELSE() SET( HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to which HIP has been installed" ) ENDIF() ENDIF() SET( CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH} ) FIND_PACKAGE( HIP REQUIRED ) FIND_PACKAGE( CUDA QUIET ) MESSAGE( "HIP Found") MESSAGE( " HIP version: ${HIP_VERSION_STRING}") MESSAGE( " HIP platform: ${HIP_PLATFORM}") MESSAGE( " HIP Include Path: ${HIP_INCLUDE_DIRS}") MESSAGE( " HIP Libraries: ${HIP_LIBRARIES}") ADD_DEFINITIONS( -DUSE_HIP ) ENDIF() # Include some paths INCLUDE_DIRECTORIES( "${RAYTRACE_INSTALL_DIR}/include" ) Loading Loading @@ -173,16 +194,26 @@ ENDIF() IF ( USE_KOKKOS ) #SET( SOURCES ${SOURCES} RayTraceImageKokkos.cpp ) ENDIF() IF (USE_HIP ) SET( SOURCES ${SOURCES} RayTraceImageHIP.cpp ) ENDIF() IF ( USE_CUDA ) SET( SOURCES ${SOURCES} RayTraceImageCuda.cu ) ENDIF() ADD_LIBRARY( RayTrace ${SOURCES} ${CUBINS} ) ADD_DEPENDENCIES( RayTrace copy-include ) ADD_LIBRARY( RayTrace SHARED ${SOURCES} ${CUBINS} ) IF ( USE_HIP ) SET( HIP_SEPERABLE_COMPILATION ON ) SET_SOURCE_FILES_PROPERTIES( RayTraceImageHIP.hip PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1 ) HIP_ADD_LIBRARY( RayTraceHIP RayTraceImageHIP.hip SHARED HIPCC_OPTIONS ${HIP_HIPCC_OPTIONS} HCC_OPTIONS ${HIP_HCC_OPTIONS} NVCC_OPTIONS ${HIP_NVCC_OPTIONS} ${HIP_NVCC_FLAGS} ) TARGET_LINK_LIBRARIES( RayTraceHIP /opt/rocm-3.3.0/lib/libhip_hcc.so ) TARGET_LINK_LIBRARIES( RayTrace RayTraceHIP ) ADD_DEPENDENCIES( RayTraceHIP copy-include ) ENDIF() # Add the applications ADD_EXECUTABLE( CreateImage CreateImage.cpp ) TARGET_LINK_LIBRARIES( CreateImage RayTrace ${KOKKOS_LIB} ${TIMER_LIBS} ${LDFLAGS} ${LDLIBS} ) TARGET_LINK_LIBRARIES( CreateImage RayTrace RayTraceHIP ${KOKKOS_LIB} ${TIMER_LIBS} ${LDFLAGS} ${LDLIBS} ) INSTALL( TARGETS CreateImage DESTINATION ${RAYTRACE_INSTALL_DIR}/bin )
src/CreateImageHelpers.cpp +5 −1 Original line number Diff line number Diff line Loading @@ -6,6 +6,10 @@ #include <thread> #if defined( USE_MPI ) #include "mpi.h" #endif #ifdef USE_CUDA #include <cuda_runtime_api.h> #endif Loading Loading @@ -368,7 +372,7 @@ Options::Options( int argc, char *argv[] ) : iterations( 1 ), scale( 1.0 ) " CreateImage <args> file.dat\n" "Optional arguments:\n" " -methods=METHODS Comma seperated list of methods to test\n" " cpu, threads, OpenMP, Cuda, Cuda-MultiGPU, OpenAcc, Kokkos-Serial, " " cpu, threads, OpenMP, Cuda, Cuda-MultiGPU, HIP, OpenAcc, Kokkos-Serial, " "Kokkos-Thread, Kokkos-OpenMP, Kokkos-Cuda\n" " all - run all availible tests (default)\n" " parallel - run all availible parallel tests\n" Loading
src/RayTraceImage.cpp +19 −0 Original line number Diff line number Diff line Loading @@ -28,6 +28,10 @@ #define ENABLE_CUDA #undef USE_CUDA #endif #ifdef USE_HIP #define ENABLE_HIP #undef USE_HIP #endif #ifdef USE_OPENMP #define ENABLE_OPENMP #undef USE_OPENMP Loading Loading @@ -66,6 +70,12 @@ extern void RayTraceImageCudaLoop( int N, const RayTrace::EUV_beam_struct &euv_b const std::vector<ray_struct> &rays, double scale, double *image, double *I_ang, unsigned int &failure_code, std::vector<ray_struct> &failed_rays ); #endif #if defined( ENABLE_HIP ) extern void RayTraceImageHIPLoop( int N, const RayTrace::EUV_beam_struct &euv_beam, const RayTrace::ray_gain_struct *gain, const RayTrace::ray_seed_struct *seed, int method, const std::vector<ray_struct> &rays, double scale, double *image, double *I_ang, unsigned int &failure_code, std::vector<ray_struct> &failed_rays ); #endif /********************************************************************** Loading Loading @@ -300,6 +310,8 @@ void RayTrace::create_image( create_image_struct *info, std::string compute_meth compute_method = "openacc"; #elif defined( ENABLE_CUDA ) compute_method = "cuda"; #elif defined( ENABLE_HIP ) compute_method = "hip"; #elif defined( ENABLE_OPENMP ) compute_method = "openmp"; #elif defined( ENABLE_KOKKOS ) Loading Loading @@ -365,6 +377,13 @@ void RayTrace::create_image( create_image_struct *info, std::string compute_meth failure_code, failed_rays ); #else RAY_ERROR( "Cuda-MultiGPU is not availible" ); #endif } else if ( compute_method == "hip" ) { #if defined( ENABLE_HIP ) RayTraceImageHIPLoop( N, std::ref( *info->euv_beam ), info->gain, info->seed, method, rays, scale, image, I_ang, failure_code, failed_rays ); #else RAY_ERROR( "HIP is not availible" ); #endif } else if ( compute_method == "cpu" ) { RayTraceImageCPULoop( N, std::ref( *info->euv_beam ), info->gain, info->seed, method, rays, Loading
src/RayTraceImageHIP.hip 0 → 100644 +344 −0 Original line number Diff line number Diff line #include "RayTrace/RayTraceStructures.h" #include "RayTrace/utilities/RayUtilities.h" #define HCC_COMPILE #include "RayTrace/common/RayTraceImageHelper.h" #include "hip/hip_runtime.h" #include "cuda.h" // Check for errors #define CUDA_CHECK() \ do { \ if ( cudaPeekAtLastError() != cudaSuccess ) { \ cudaError_t error = cudaGetLastError(); \ printf("cuda error: %i\n",error); \ printf(" %s\n",cudaGetErrorString(error)); \ printf(" line: %i\n",(int)__LINE__); \ printf(" file: %s\n",__FILE__); \ exit(-1); \ } \ } while (0) #define HIP_CHECK() \ do { \ } while (0) #define CUDA_PRINT_FUNCTION( fun ) \ do { \ hipFuncAttributes attr; \ hipFuncGetAttributes(&attr,fun); \ printf("%s:\n",#fun); \ printf(" version = %i\n",attr.binaryVersion); \ printf(" ptx = %i\n",attr.ptxVersion); \ printf(" constSize = %i\n",attr.constSizeBytes); \ printf(" localSize = %i\n",attr.localSizeBytes); \ printf(" sharedSize = %i\n",attr.sharedSizeBytes); \ printf(" maxThreads = %i\n",attr.maxThreadsPerBlock); \ printf(" numRegs = %i\n",attr.numRegs); \ } while (0) // Atomic add operation for double #if defined( __CUDA_ARCH__ ) && __CUDA_ARCH__ >= 600 #define atomicAdd2 atomicAdd #else __device__ double atomicAdd2(double* address, double val) { unsigned long long int* address_as_ull = (unsigned long long int*)address; unsigned long long int old = *address_as_ull, assumed; do { assumed = old; old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed))); } while (assumed != old); return __longlong_as_double(old); } #endif // Get the globally unique thread id __device__ int getGlobalIdx3D() { int blockId = blockIdx.x + blockIdx.y * gridDim.x + gridDim.x * gridDim.y * blockIdx.z; int threadId = blockId * (blockDim.x * blockDim.y * blockDim.z) + (threadIdx.z * (blockDim.x * blockDim.y)) + (threadIdx.y * blockDim.x) + threadIdx.x; return threadId; } // Get the index __device__ inline int getIndex( int n, const double *x, double dx, double y ) { if ( y < x[0] - 0.5 * dx || y > x[n - 1] + 0.5 * dx ) return -1; return findfirstsingle( x, n, y - 0.5 * dx ); } // Kernel that executes on the CUDA device __global__ __launch_bounds__(128,8) // Set bounds to limit the number of registers void RayTraceImageHIPKernel( int N, int nx, int ny, int na, int nb, int nv, const double *x, const double *y, const double *a, const double *b, double dx, double dy, double dz, double da, double db, const double *dv, const RayTrace::ray_gain_struct* gain, const RayTrace::ray_seed_struct* seed, int method, int N_rays, const ray_struct *rays, double scale, double *image, double *I_ang ) { int idx = getGlobalIdx3D(); if ( idx<N_rays ) { const ray_struct ray = rays[idx]; double Iv[K_MAX]; ray_struct ray2; int error = RayTrace_calc_ray( ray, N, dz, gain, seed, nv, method, Iv, ray2 ); if ( error!=0 ) { //failed_rays.push_back(ray); //set_bit(-error,failure_code); } else { if ( method == 1 ) { // We are propagating backward, use ray for the cell updates ray2 = ray; } else { // We are propagating forward, use ray2 for the cell updates // Note: The sign of the angle is reversed with respect to the euv_beam ray2.a = -ray2.a; ray2.b = -ray2.b; if ( ray2.y<0.0 && y[0]>=0.0 ) { // We need to change the sign of y ray2.y = -ray2.y; } } // Get the indicies to the cells in image and I_ang int i1 = getIndex( nx, x, dx, ray2.x ); int i2 = getIndex( ny, y, dy, ray2.y ); int i3 = getIndex( na, a, da, ray2.a ); int i4 = getIndex( nb, b, db, ray2.b ); // Copy I_out into image if (i1>=0 && i2>=0){ double *Iv2 = &image[nv*(i1+i2*nx)]; for (int iv=0; iv<nv; iv++) atomicAdd2(&Iv2[iv],Iv[iv]*scale); } // Copy I_out into I_ang if (i3>=0 && i4>=0) { double tmp = 0.0; for (int iv=0; iv<nv; iv++) tmp += 2.0*dv[iv]*Iv[iv]; atomicAdd2(&I_ang[i3+i4*na],tmp); } } } } // Compute the block size to use inline dim3 calcBlockSize( size_t N_blocks ) { dim3 block_size; if ( N_blocks < 65535 ) { block_size.x = N_blocks; } else { block_size.y = N_blocks/32768; block_size.x = N_blocks/block_size.y + (N_blocks%block_size.y == 0 ? 0:1); } return block_size; } // Create the image and call the cuda kernel void RayTraceImageHIPLoop( int N, const RayTrace::EUV_beam_struct& beam, const RayTrace::ray_gain_struct* gain_in, const RayTrace::ray_seed_struct* seed_in, int method, const std::vector<ray_struct>& rays, double scale, double *image, double *I_ang, unsigned int& failure_code, std::vector<ray_struct>& failed_rays ) { failure_code = 0; // Need to track failures on GPU // Get device properties static int maxThreadsPerBlock = 128; /*if ( maxThreadsPerBlock == 0 ) { hipFuncAttributes attr; hipFuncGetAttributes(&attr,RayTraceImageHIPKernel); maxThreadsPerBlock = attr.maxThreadsPerBlock; //CUDA_PRINT_FUNCTION(RayTraceImageCudaKernel); RAY_ASSERT(maxThreadsPerBlock>0); }*/ // place the ray gain and seed structures on the device const int nx = beam.nx; const int ny = beam.ny; const int na = beam.na; const int nb = beam.nb; const int nv = beam.nv; const double dx = beam.dx; const double dy = beam.dy; const double dz = beam.dz; const double da = beam.da; const double db = beam.db; const RayTrace::ray_gain_struct* gain = RayTrace::ray_gain_struct::copy_device( N, gain_in ); const RayTrace::ray_seed_struct* seed = NULL; if ( seed_in!=NULL ) seed = seed_in->copy_device(); // Allocate device memory size_t N_rays = rays.size(); double *x2, *y2, *a2, *b2, *dv2, *image2, *I_ang2; ray_struct *rays2; hipMalloc(&x2,nx*sizeof(double)); hipMalloc(&y2,ny*sizeof(double)); hipMalloc(&a2,na*sizeof(double)); hipMalloc(&b2,nb*sizeof(double)); hipMalloc(&dv2,nv*sizeof(double)); hipMalloc(&image2,nx*ny*nv*sizeof(double)); hipMalloc(&I_ang2,na*nb*sizeof(double)); hipMalloc(&rays2,N_rays*sizeof(ray_struct)); hipMemcpy(x2,beam.x,nx*sizeof(double),hipMemcpyHostToDevice); hipMemcpy(y2,beam.y,ny*sizeof(double),hipMemcpyHostToDevice); hipMemcpy(a2,beam.a,na*sizeof(double),hipMemcpyHostToDevice); hipMemcpy(b2,beam.b,nb*sizeof(double),hipMemcpyHostToDevice); hipMemcpy(dv2,beam.dv,nv*sizeof(double),hipMemcpyHostToDevice); hipMemcpy(rays2,&rays[0],N_rays*sizeof(ray_struct),hipMemcpyHostToDevice); hipMemset(image2,0,nx*ny*nv*sizeof(double)); hipMemset(I_ang2,0,na*nb*sizeof(double)); HIP_CHECK(); // Do calculation on device: size_t threads = maxThreadsPerBlock; size_t N_blocks = N_rays/threads + (N_rays%threads == 0 ? 0:1); dim3 block_size = calcBlockSize(N_blocks); block_size.x = N_rays/threads + (N_rays%threads == 0 ? 0:1); hipLaunchKernelGGL( RayTraceImageHIPKernel, dim3(block_size), dim3(threads), 0, 0, N,nx,ny,na,nb,nv,x2,y2,a2,b2, dx,dy,dz,da,db,dv2,gain,seed,method,N_rays,rays2,scale,image2,I_ang2); HIP_CHECK(); // Retrieve result from device and store it in host array hipMemcpy(image,image2,nx*ny*nv*sizeof(double),hipMemcpyDeviceToHost); hipMemcpy(I_ang,I_ang2,na*nb*sizeof(double),hipMemcpyDeviceToHost); HIP_CHECK(); // Cleanup hipFree(x2); hipFree(y2); hipFree(a2); hipFree(b2); hipFree(dv2); hipFree(rays2); hipFree(image2); hipFree(I_ang2); HIP_CHECK(); RayTrace::ray_gain_struct::free_device( N, gain_in, gain ); RayTrace::ray_seed_struct::free_device( seed_in, seed ); } // Copy ray_gain_struct to GPU const RayTrace::ray_gain_struct* ray_gain_struct_copy_device_hip( size_t N, const RayTrace::ray_gain_struct* arr ) { RayTrace::ray_gain_struct* host_ptr = new RayTrace::ray_gain_struct[N]; for (size_t i=0; i<N; i++) { host_ptr[i].Nx = arr[i].Nx; host_ptr[i].Ny = arr[i].Ny; host_ptr[i].Nv = arr[i].Nv; host_ptr[i].E0 = nullptr; hipMalloc(&host_ptr[i].x,arr[i].Nx*sizeof(double)); hipMalloc(&host_ptr[i].y,arr[i].Ny*sizeof(double)); hipMalloc(&host_ptr[i].n,arr[i].Nx*arr[i].Ny*sizeof(double)); hipMalloc(&host_ptr[i].g0,arr[i].Nx*arr[i].Ny*sizeof(float)); hipMalloc(&host_ptr[i].E0,arr[i].Nx*arr[i].Ny*sizeof(float)); hipMalloc(&host_ptr[i].gv,arr[i].Nx*arr[i].Ny*arr[i].Nv*sizeof(float)); hipMalloc(&host_ptr[i].gv0,arr[i].Nx*arr[i].Ny*sizeof(float)); hipMemcpy(host_ptr[i].x,arr[i].x,arr[i].Nx*sizeof(double),hipMemcpyHostToDevice); hipMemcpy(host_ptr[i].y,arr[i].y,arr[i].Ny*sizeof(double),hipMemcpyHostToDevice); hipMemcpy(host_ptr[i].n,arr[i].n,arr[i].Nx*arr[i].Ny*sizeof(double),hipMemcpyHostToDevice); hipMemcpy(host_ptr[i].g0,arr[i].g0,arr[i].Nx*arr[i].Ny*sizeof(float),hipMemcpyHostToDevice); if ( arr[i].E0 != nullptr ) hipMemcpy(host_ptr[i].E0,arr[i].E0,arr[i].Nx*arr[i].Ny*sizeof(float),hipMemcpyHostToDevice); hipMemcpy(host_ptr[i].gv,arr[i].gv,arr[i].Nx*arr[i].Ny*arr[i].Nv*sizeof(float),hipMemcpyHostToDevice); hipMemcpy(host_ptr[i].gv0,arr[i].gv0,arr[i].Nx*arr[i].Ny*sizeof(float),hipMemcpyHostToDevice); } RayTrace::ray_gain_struct* dev_ptr=NULL; hipMalloc(&dev_ptr,N*sizeof(RayTrace::ray_gain_struct)); hipMemcpy(dev_ptr,host_ptr,N*sizeof(RayTrace::ray_gain_struct),hipMemcpyHostToDevice); for (size_t i=0; i<N; i++) { host_ptr[i].x = NULL; host_ptr[i].y = NULL; host_ptr[i].n = NULL; host_ptr[i].g0 = NULL; host_ptr[i].E0 = NULL; host_ptr[i].gv = NULL; host_ptr[i].gv0 = NULL; } delete [] host_ptr; HIP_CHECK(); return dev_ptr; } // Free ray_gain_struct from GPU void ray_gain_struct_free_device_cuda( size_t N, const RayTrace::ray_gain_struct* dev_ptr ) { RayTrace::ray_gain_struct* host_ptr = new RayTrace::ray_gain_struct[N]; hipMemcpy(host_ptr,dev_ptr,N*sizeof(RayTrace::ray_gain_struct),hipMemcpyDeviceToHost); for (size_t i=0; i<N; i++) { hipFree(host_ptr[i].x); hipFree(host_ptr[i].y); hipFree(host_ptr[i].n); hipFree(host_ptr[i].g0); hipFree(host_ptr[i].E0); hipFree(host_ptr[i].gv); hipFree(host_ptr[i].gv0); host_ptr[i].x = NULL; host_ptr[i].y = NULL; host_ptr[i].n = NULL; host_ptr[i].g0 = NULL; host_ptr[i].E0 = NULL; host_ptr[i].gv = NULL; host_ptr[i].gv0 = NULL; } hipFree((void*)dev_ptr); delete [] host_ptr; HIP_CHECK(); } // Copy ray_seed_struct to GPU const RayTrace::ray_seed_struct* ray_seed_struct_copy_device_hip( const RayTrace::ray_seed_struct& seed ) { RayTrace::ray_seed_struct* host_ptr = new RayTrace::ray_seed_struct(); host_ptr->f0 = seed.f0; for (size_t i=0; i<5; i++) { host_ptr->dim[i] = seed.dim[i]; hipMalloc(&host_ptr->x[i],seed.dim[i]*sizeof(double)); hipMalloc(&host_ptr->f[i],seed.dim[i]*sizeof(double)); hipMemcpy(host_ptr->x[i],seed.x[i],seed.dim[i]*sizeof(double),hipMemcpyHostToDevice); hipMemcpy(host_ptr->f[i],seed.f[i],seed.dim[i]*sizeof(double),hipMemcpyHostToDevice); } RayTrace::ray_seed_struct* dev_ptr=NULL; hipMalloc(&dev_ptr,sizeof(RayTrace::ray_seed_struct)); hipMemcpy(dev_ptr,host_ptr,sizeof(RayTrace::ray_seed_struct),hipMemcpyHostToDevice); for (size_t i=0; i<5; i++) { host_ptr->x[i] = NULL; host_ptr->f[i] = NULL; } delete host_ptr; HIP_CHECK(); return dev_ptr; } // Free ray_seed_struct from GPU void ray_seed_struct_free_device_hip( const RayTrace::ray_seed_struct* dev_ptr ) { RayTrace::ray_seed_struct* host_ptr = new RayTrace::ray_seed_struct; hipMemcpy(host_ptr,dev_ptr,sizeof(RayTrace::ray_seed_struct),hipMemcpyDeviceToHost); for (size_t i=0; i<5; i++) { hipFree(host_ptr->x[i]); hipFree(host_ptr->f[i]); host_ptr->x[i] = NULL; host_ptr->f[i] = NULL; } hipFree((void*)dev_ptr); delete host_ptr; HIP_CHECK(); }