Loading src/CreateImageHelpers.cpp +11 −14 Original line number Diff line number Diff line Loading @@ -13,6 +13,9 @@ #ifdef USE_CUDA #include <cuda_runtime_api.h> #endif #ifdef USE_HIP #include "hip/hip_runtime.h" #endif // Detect the OS // clang-format off Loading Loading @@ -249,18 +252,6 @@ std::string hostname() gethostname( hostname, sizeof( hostname ) ); return std::string( hostname ); } int getDeviceID() { #ifdef USE_CUDA int device; struct cudaDeviceProp prop; cudaGetDevice( &device ); cudaGetDeviceProperties( &prop, device ); return prop.pciDeviceID; #else return -1; #endif } void printHardware() { // Get number of threads Loading @@ -269,6 +260,8 @@ void printHardware() int N_gpu = 0; #ifdef USE_CUDA cudaGetDeviceCount( &N_gpu ); #elif defined( USE_HIP ) hipGetDeviceCount( &N_gpu ); #endif // Get system memory #if defined( USE_LINUX ) Loading Loading @@ -313,6 +306,10 @@ static inline std::vector<std::string> allMethods() methods.push_back( "Cuda" ); methods.push_back( "Cuda-MultiGPU" ); #endif #ifdef USE_HIP methods.push_back( "HIP" ); methods.push_back( "HIP-MultiGPU" ); #endif #ifdef USE_OPENACC methods.push_back( "OpenAcc" ); #endif Loading Loading @@ -344,8 +341,8 @@ Options::Options( int argc, char *argv[] ) : iterations( 1 ), scale( 1.0 ) " CreateImage <args> file.dat\n" "Optional arguments:\n" " -methods=METHODS Comma seperated list of methods to test\n" " cpu, threads, OpenMP, Cuda, Cuda-MultiGPU, HIP, OpenAcc," " Kokkos-Serial, Kokkos-Thread, Kokkos-OpenMP, Kokkos-Cuda\n" " cpu, threads, OpenMP, Cuda, Cuda-MultiGPU, HIP, HIP-MultiGPU,\n" " OpenAcc, Kokkos-Serial, Kokkos-Thread, Kokkos-OpenMP, Kokkos-Cuda\n" " all - run all availible tests (default)\n" " parallel - run all availible parallel tests\n" " -iterations=N Number of iterations to run.\n" Loading src/RayTraceImage.cpp +10 −0 Original line number Diff line number Diff line Loading @@ -373,6 +373,16 @@ void RayTrace::create_image( create_image_struct *info, std::string compute_meth scale, image, I_ang, failure_code, failed_rays ); #else RAY_ERROR( "HIP is not availible" ); #endif } else if ( compute_method == "hip-multigpu" ) { #if defined( ENABLE_HIP ) int N_gpu; hipGetDeviceCount( &N_gpu ); RayTraceImageThreadLoop( N_gpu, RayTraceImageHIPLoop, setGPU, N, std::ref( *info->euv_beam ), info->gain, info->seed, method, rays, scale, image, I_ang, failure_code, failed_rays ); #else RAY_ERROR( "Cuda-MultiGPU is not availible" ); #endif } else if ( compute_method == "cpu" ) { RayTraceImageCPULoop( N, std::ref( *info->euv_beam ), info->gain, info->seed, method, rays, Loading src/RayTraceImageHIP.cu +2 −49 Original line number Diff line number Diff line Loading @@ -4,46 +4,10 @@ #define HCC_COMPILE #include "RayTrace/common/RayTraceImageHelper.h" #include "cuda.h" #include "hip/hip_runtime.h" // Check for errors #define CUDA_CHECK() \ do { \ if ( cudaPeekAtLastError() != cudaSuccess ) { \ cudaError_t error = cudaGetLastError(); \ printf( "cuda error: %i\n", error ); \ printf( " %s\n", cudaGetErrorString( error ) ); \ printf( " line: %i\n", (int) __LINE__ ); \ printf( " file: %s\n", __FILE__ ); \ exit( -1 ); \ } \ } while ( 0 ) #define HIP_CHECK() \ do { \ } while ( 0 ) #define CUDA_PRINT_FUNCTION( fun ) \ do { \ hipFuncAttributes attr; \ hipFuncGetAttributes( &attr, fun ); \ printf( "%s:\n", #fun ); \ printf( " version = %i\n", attr.binaryVersion ); \ printf( " ptx = %i\n", attr.ptxVersion ); \ printf( " constSize = %i\n", attr.constSizeBytes ); \ printf( " localSize = %i\n", attr.localSizeBytes ); \ printf( " sharedSize = %i\n", attr.sharedSizeBytes ); \ printf( " maxThreads = %i\n", attr.maxThreadsPerBlock ); \ printf( " numRegs = %i\n", attr.numRegs ); \ } while ( 0 ) // Atomic add operation for double #if defined( __CUDA_ARCH__ ) && __CUDA_ARCH__ >= 600 #define atomicAdd2 atomicAdd #else __device__ double atomicAdd2( double* address, double val ) { unsigned long long int* address_as_ull = (unsigned long long int*) address; Loading @@ -55,7 +19,6 @@ __device__ double atomicAdd2( double* address, double val ) } while ( assumed != old ); return __longlong_as_double( old ); } #endif // Get the globally unique thread id Loading @@ -79,7 +42,7 @@ __device__ inline int getIndex( int n, const double x_range[2], double dx, doubl } // Kernel that executes on the CUDA device // Kernel that executes on the device __global__ __launch_bounds__( 128, 8 ) // Set bounds to limit the number of registers void RayTraceImageHIPKernel( int N, int nx, int ny, int na, int nb, int nv, double x0, double x1, double y0, double y1, double a0, double a1, double b0, double b1, double dx, Loading Loading @@ -151,7 +114,7 @@ inline dim3 calcBlockSize( size_t N_blocks ) } // Create the image and call the cuda kernel // Create the image and call the kernel void RayTraceImageHIPLoop( int N, const RayTrace::EUV_beam_struct& beam, const RayTrace::ray_gain_struct* gain_in, const RayTrace::ray_seed_struct* seed_in, int method, const std::vector<ray_struct>& rays, double scale, double* image, double* I_ang, Loading @@ -160,16 +123,6 @@ void RayTraceImageHIPLoop( int N, const RayTrace::EUV_beam_struct& beam, failure_code = 0; // Need to track failures on GPU // Get device properties static int maxThreadsPerBlock = 128; /*if ( maxThreadsPerBlock == 0 ) { hipFuncAttributes attr; hipFuncGetAttributes(&attr,RayTraceImageHIPKernel); maxThreadsPerBlock = attr.maxThreadsPerBlock; if ( maxThreadsPerBlock == 0 ) { //CUDA_PRINT_FUNCTION(RayTraceImageCudaKernel); RAY_WARNING("maxThreadsPerBlock=0 - Setting to default 128"); maxThreadsPerBlock = 128; } }*/ // place the ray gain and seed structures on the device const int nx = beam.nx; const int ny = beam.ny; Loading src/common/RayTraceImageHelper.h +1 −1 Original line number Diff line number Diff line Loading @@ -26,7 +26,7 @@ #define HOST_DEVICE #endif #if !defined( USE_OPENACC ) && !defined( USE_KOKKOS ) && !defined( USE_CUDA ) #if !defined( USE_OPENACC ) && !defined( USE_KOKKOS ) && !defined( USE_CUDA ) && !defined( USE_HIP ) #define RAY_DEBUG #endif Loading Loading
src/CreateImageHelpers.cpp +11 −14 Original line number Diff line number Diff line Loading @@ -13,6 +13,9 @@ #ifdef USE_CUDA #include <cuda_runtime_api.h> #endif #ifdef USE_HIP #include "hip/hip_runtime.h" #endif // Detect the OS // clang-format off Loading Loading @@ -249,18 +252,6 @@ std::string hostname() gethostname( hostname, sizeof( hostname ) ); return std::string( hostname ); } int getDeviceID() { #ifdef USE_CUDA int device; struct cudaDeviceProp prop; cudaGetDevice( &device ); cudaGetDeviceProperties( &prop, device ); return prop.pciDeviceID; #else return -1; #endif } void printHardware() { // Get number of threads Loading @@ -269,6 +260,8 @@ void printHardware() int N_gpu = 0; #ifdef USE_CUDA cudaGetDeviceCount( &N_gpu ); #elif defined( USE_HIP ) hipGetDeviceCount( &N_gpu ); #endif // Get system memory #if defined( USE_LINUX ) Loading Loading @@ -313,6 +306,10 @@ static inline std::vector<std::string> allMethods() methods.push_back( "Cuda" ); methods.push_back( "Cuda-MultiGPU" ); #endif #ifdef USE_HIP methods.push_back( "HIP" ); methods.push_back( "HIP-MultiGPU" ); #endif #ifdef USE_OPENACC methods.push_back( "OpenAcc" ); #endif Loading Loading @@ -344,8 +341,8 @@ Options::Options( int argc, char *argv[] ) : iterations( 1 ), scale( 1.0 ) " CreateImage <args> file.dat\n" "Optional arguments:\n" " -methods=METHODS Comma seperated list of methods to test\n" " cpu, threads, OpenMP, Cuda, Cuda-MultiGPU, HIP, OpenAcc," " Kokkos-Serial, Kokkos-Thread, Kokkos-OpenMP, Kokkos-Cuda\n" " cpu, threads, OpenMP, Cuda, Cuda-MultiGPU, HIP, HIP-MultiGPU,\n" " OpenAcc, Kokkos-Serial, Kokkos-Thread, Kokkos-OpenMP, Kokkos-Cuda\n" " all - run all availible tests (default)\n" " parallel - run all availible parallel tests\n" " -iterations=N Number of iterations to run.\n" Loading
src/RayTraceImage.cpp +10 −0 Original line number Diff line number Diff line Loading @@ -373,6 +373,16 @@ void RayTrace::create_image( create_image_struct *info, std::string compute_meth scale, image, I_ang, failure_code, failed_rays ); #else RAY_ERROR( "HIP is not availible" ); #endif } else if ( compute_method == "hip-multigpu" ) { #if defined( ENABLE_HIP ) int N_gpu; hipGetDeviceCount( &N_gpu ); RayTraceImageThreadLoop( N_gpu, RayTraceImageHIPLoop, setGPU, N, std::ref( *info->euv_beam ), info->gain, info->seed, method, rays, scale, image, I_ang, failure_code, failed_rays ); #else RAY_ERROR( "Cuda-MultiGPU is not availible" ); #endif } else if ( compute_method == "cpu" ) { RayTraceImageCPULoop( N, std::ref( *info->euv_beam ), info->gain, info->seed, method, rays, Loading
src/RayTraceImageHIP.cu +2 −49 Original line number Diff line number Diff line Loading @@ -4,46 +4,10 @@ #define HCC_COMPILE #include "RayTrace/common/RayTraceImageHelper.h" #include "cuda.h" #include "hip/hip_runtime.h" // Check for errors #define CUDA_CHECK() \ do { \ if ( cudaPeekAtLastError() != cudaSuccess ) { \ cudaError_t error = cudaGetLastError(); \ printf( "cuda error: %i\n", error ); \ printf( " %s\n", cudaGetErrorString( error ) ); \ printf( " line: %i\n", (int) __LINE__ ); \ printf( " file: %s\n", __FILE__ ); \ exit( -1 ); \ } \ } while ( 0 ) #define HIP_CHECK() \ do { \ } while ( 0 ) #define CUDA_PRINT_FUNCTION( fun ) \ do { \ hipFuncAttributes attr; \ hipFuncGetAttributes( &attr, fun ); \ printf( "%s:\n", #fun ); \ printf( " version = %i\n", attr.binaryVersion ); \ printf( " ptx = %i\n", attr.ptxVersion ); \ printf( " constSize = %i\n", attr.constSizeBytes ); \ printf( " localSize = %i\n", attr.localSizeBytes ); \ printf( " sharedSize = %i\n", attr.sharedSizeBytes ); \ printf( " maxThreads = %i\n", attr.maxThreadsPerBlock ); \ printf( " numRegs = %i\n", attr.numRegs ); \ } while ( 0 ) // Atomic add operation for double #if defined( __CUDA_ARCH__ ) && __CUDA_ARCH__ >= 600 #define atomicAdd2 atomicAdd #else __device__ double atomicAdd2( double* address, double val ) { unsigned long long int* address_as_ull = (unsigned long long int*) address; Loading @@ -55,7 +19,6 @@ __device__ double atomicAdd2( double* address, double val ) } while ( assumed != old ); return __longlong_as_double( old ); } #endif // Get the globally unique thread id Loading @@ -79,7 +42,7 @@ __device__ inline int getIndex( int n, const double x_range[2], double dx, doubl } // Kernel that executes on the CUDA device // Kernel that executes on the device __global__ __launch_bounds__( 128, 8 ) // Set bounds to limit the number of registers void RayTraceImageHIPKernel( int N, int nx, int ny, int na, int nb, int nv, double x0, double x1, double y0, double y1, double a0, double a1, double b0, double b1, double dx, Loading Loading @@ -151,7 +114,7 @@ inline dim3 calcBlockSize( size_t N_blocks ) } // Create the image and call the cuda kernel // Create the image and call the kernel void RayTraceImageHIPLoop( int N, const RayTrace::EUV_beam_struct& beam, const RayTrace::ray_gain_struct* gain_in, const RayTrace::ray_seed_struct* seed_in, int method, const std::vector<ray_struct>& rays, double scale, double* image, double* I_ang, Loading @@ -160,16 +123,6 @@ void RayTraceImageHIPLoop( int N, const RayTrace::EUV_beam_struct& beam, failure_code = 0; // Need to track failures on GPU // Get device properties static int maxThreadsPerBlock = 128; /*if ( maxThreadsPerBlock == 0 ) { hipFuncAttributes attr; hipFuncGetAttributes(&attr,RayTraceImageHIPKernel); maxThreadsPerBlock = attr.maxThreadsPerBlock; if ( maxThreadsPerBlock == 0 ) { //CUDA_PRINT_FUNCTION(RayTraceImageCudaKernel); RAY_WARNING("maxThreadsPerBlock=0 - Setting to default 128"); maxThreadsPerBlock = 128; } }*/ // place the ray gain and seed structures on the device const int nx = beam.nx; const int ny = beam.ny; Loading
src/common/RayTraceImageHelper.h +1 −1 Original line number Diff line number Diff line Loading @@ -26,7 +26,7 @@ #define HOST_DEVICE #endif #if !defined( USE_OPENACC ) && !defined( USE_KOKKOS ) && !defined( USE_CUDA ) #if !defined( USE_OPENACC ) && !defined( USE_KOKKOS ) && !defined( USE_CUDA ) && !defined( USE_HIP ) #define RAY_DEBUG #endif Loading