Commit 33270b99 authored by Berrill, Mark's avatar Berrill, Mark
Browse files

Adding hip option

parent 0aa84331
Loading
Loading
Loading
Loading
+4 −2
Original line number Diff line number Diff line
@@ -9,11 +9,13 @@ cmake \
   -D CMAKE_CXX_COMPILER=mpic++         \
   -D CMAKE_CXX_STANDARD=11             \
   -D USE_OPENACC=0                     \
   -D USE_OPENMP=1                      \
   -D USE_OPENMP=0                      \
   -D USE_KOKKOS=0                      \
      -D KOKKOS_DIRECTORY=${KOKKOS_DIR} \
      -D KOKKOS_WRAPPER=${KOKKOS_DIR}/nvcc_wrapper \
   -D USE_CUDA=1                        \
   -D USE_CUDA=0                        \
      -D CUDA_FLAGS="-arch sm_30"       \
   -D USE_HIP=1                         \
      -D HIP_NVCC_OPTIONS="-arch sm_30"   \
   ../src
+34 −3
Original line number Diff line number Diff line
@@ -137,6 +137,27 @@ IF ( USE_CUDA )
    ENABLE_LANGUAGE( CUDA )
ENDIF()

# Enable HIP
CHECK_ENABLE_FLAG( USE_HIP 0 )
IF ( USE_HIP )
    IF ( NOT DEFINED HIP_PATH )
        IF ( NOT DEFINED ENV{HIP_PATH} )
            SET( HIP_PATH "/opt/rocm/hip" CACHE PATH "Path to which HIP has been installed" )
        ELSE()
            SET( HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to which HIP has been installed" )
        ENDIF()
    ENDIF()
    SET( CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH} )
    FIND_PACKAGE( HIP REQUIRED )
    FIND_PACKAGE( CUDA QUIET )
    MESSAGE( "HIP Found")
    MESSAGE( "   HIP version:      ${HIP_VERSION_STRING}")
    MESSAGE( "   HIP platform:     ${HIP_PLATFORM}")
    MESSAGE( "   HIP Include Path: ${HIP_INCLUDE_DIRS}")
    MESSAGE( "   HIP Libraries:    ${HIP_LIBRARIES}")
    ADD_DEFINITIONS( -DUSE_HIP )
ENDIF()


# Include some paths
INCLUDE_DIRECTORIES( "${RAYTRACE_INSTALL_DIR}/include" )
@@ -173,16 +194,26 @@ ENDIF()
IF ( USE_KOKKOS )
    #SET( SOURCES ${SOURCES} RayTraceImageKokkos.cpp )
ENDIF()
IF (USE_HIP )
    SET( SOURCES ${SOURCES} RayTraceImageHIP.cpp )
ENDIF()
IF ( USE_CUDA )
    SET( SOURCES ${SOURCES} RayTraceImageCuda.cu )
ENDIF()
ADD_LIBRARY( RayTrace ${SOURCES} ${CUBINS} )
ADD_DEPENDENCIES( RayTrace copy-include )
ADD_LIBRARY( RayTrace SHARED ${SOURCES} ${CUBINS} )
IF ( USE_HIP )
    SET( HIP_SEPERABLE_COMPILATION ON )
    SET_SOURCE_FILES_PROPERTIES( RayTraceImageHIP.hip PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1 )
    HIP_ADD_LIBRARY( RayTraceHIP RayTraceImageHIP.hip SHARED HIPCC_OPTIONS ${HIP_HIPCC_OPTIONS} HCC_OPTIONS ${HIP_HCC_OPTIONS} NVCC_OPTIONS ${HIP_NVCC_OPTIONS} ${HIP_NVCC_FLAGS} )
    TARGET_LINK_LIBRARIES( RayTraceHIP /opt/rocm-3.3.0/lib/libhip_hcc.so )
    TARGET_LINK_LIBRARIES( RayTrace RayTraceHIP )
    ADD_DEPENDENCIES( RayTraceHIP copy-include )
ENDIF()


# Add the applications
ADD_EXECUTABLE( CreateImage CreateImage.cpp )
TARGET_LINK_LIBRARIES( CreateImage RayTrace ${KOKKOS_LIB} ${TIMER_LIBS} ${LDFLAGS} ${LDLIBS} )
TARGET_LINK_LIBRARIES( CreateImage RayTrace RayTraceHIP  ${KOKKOS_LIB} ${TIMER_LIBS} ${LDFLAGS} ${LDLIBS} )
INSTALL( TARGETS CreateImage DESTINATION ${RAYTRACE_INSTALL_DIR}/bin )

+5 −1
Original line number Diff line number Diff line
@@ -6,6 +6,10 @@
#include <thread>


#if defined( USE_MPI )
#include "mpi.h"
#endif

#ifdef USE_CUDA
#include <cuda_runtime_api.h>
#endif
@@ -368,7 +372,7 @@ Options::Options( int argc, char *argv[] ) : iterations( 1 ), scale( 1.0 )
        "  CreateImage <args> file.dat\n"
        "Optional arguments:\n"
        "  -methods=METHODS  Comma seperated list of methods to test\n"
        "                    cpu, threads, OpenMP, Cuda, Cuda-MultiGPU, OpenAcc, Kokkos-Serial, "
        "                    cpu, threads, OpenMP, Cuda, Cuda-MultiGPU, HIP, OpenAcc, Kokkos-Serial, "
        "Kokkos-Thread, Kokkos-OpenMP, Kokkos-Cuda\n"
        "                    all - run all availible tests (default)\n"
        "                    parallel - run all availible parallel tests\n"
+19 −0
Original line number Diff line number Diff line
@@ -28,6 +28,10 @@
#define ENABLE_CUDA
#undef USE_CUDA
#endif
#ifdef USE_HIP
#define ENABLE_HIP
#undef USE_HIP
#endif
#ifdef USE_OPENMP
#define ENABLE_OPENMP
#undef USE_OPENMP
@@ -66,6 +70,12 @@ extern void RayTraceImageCudaLoop( int N, const RayTrace::EUV_beam_struct &euv_b
    const std::vector<ray_struct> &rays, double scale, double *image, double *I_ang,
    unsigned int &failure_code, std::vector<ray_struct> &failed_rays );
#endif
#if defined( ENABLE_HIP )
extern void RayTraceImageHIPLoop( int N, const RayTrace::EUV_beam_struct &euv_beam,
    const RayTrace::ray_gain_struct *gain, const RayTrace::ray_seed_struct *seed, int method,
    const std::vector<ray_struct> &rays, double scale, double *image, double *I_ang,
    unsigned int &failure_code, std::vector<ray_struct> &failed_rays );
#endif


/**********************************************************************
@@ -300,6 +310,8 @@ void RayTrace::create_image( create_image_struct *info, std::string compute_meth
        compute_method = "openacc";
#elif defined( ENABLE_CUDA )
        compute_method = "cuda";
#elif defined( ENABLE_HIP )
        compute_method = "hip";
#elif defined( ENABLE_OPENMP )
        compute_method = "openmp";
#elif defined( ENABLE_KOKKOS )
@@ -365,6 +377,13 @@ void RayTrace::create_image( create_image_struct *info, std::string compute_meth
            failure_code, failed_rays );
#else
        RAY_ERROR( "Cuda-MultiGPU is not availible" );
#endif
    } else if ( compute_method == "hip" ) {
#if defined( ENABLE_HIP )
        RayTraceImageHIPLoop( N, std::ref( *info->euv_beam ), info->gain, info->seed, method, rays,
            scale, image, I_ang, failure_code, failed_rays );
#else
        RAY_ERROR( "HIP is not availible" );
#endif
    } else if ( compute_method == "cpu" ) {
        RayTraceImageCPULoop( N, std::ref( *info->euv_beam ), info->gain, info->seed, method, rays,
+344 −0
Original line number Diff line number Diff line
#include "RayTrace/RayTraceStructures.h"
#include "RayTrace/utilities/RayUtilities.h"

#define HCC_COMPILE
#include "RayTrace/common/RayTraceImageHelper.h"

#include "hip/hip_runtime.h"
#include "cuda.h"


// Check for errors
#define CUDA_CHECK()                                    \
    do {                                                \
        if ( cudaPeekAtLastError() != cudaSuccess ) {   \
            cudaError_t error = cudaGetLastError();     \
            printf("cuda error: %i\n",error);           \
            printf("   %s\n",cudaGetErrorString(error)); \
            printf("   line: %i\n",(int)__LINE__);      \
            printf("   file: %s\n",__FILE__);           \
            exit(-1);                                   \
        }                                               \
    } while (0)
#define HIP_CHECK()                                    \
    do {                                                \
    } while (0)


#define CUDA_PRINT_FUNCTION( fun )                      \
    do {                                                \
        hipFuncAttributes attr;                        \
        hipFuncGetAttributes(&attr,fun);               \
        printf("%s:\n",#fun);                           \
        printf("  version = %i\n",attr.binaryVersion);  \
        printf("  ptx = %i\n",attr.ptxVersion);         \
        printf("  constSize = %i\n",attr.constSizeBytes); \
        printf("  localSize = %i\n",attr.localSizeBytes); \
        printf("  sharedSize = %i\n",attr.sharedSizeBytes); \
        printf("  maxThreads = %i\n",attr.maxThreadsPerBlock); \
        printf("  numRegs = %i\n",attr.numRegs);        \
    } while (0)


// Atomic add operation for double
#if defined( __CUDA_ARCH__ ) && __CUDA_ARCH__ >= 600
    #define atomicAdd2 atomicAdd
#else
__device__ double atomicAdd2(double* address, double val)
{
    unsigned long long int* address_as_ull = (unsigned long long int*)address;
    unsigned long long int old = *address_as_ull, assumed;
    do {
        assumed = old;
        old = atomicCAS(address_as_ull, assumed, 
                        __double_as_longlong(val + 
                        __longlong_as_double(assumed)));
    } while (assumed != old);
    return __longlong_as_double(old);
}
#endif



// Get the globally unique thread id
__device__ int getGlobalIdx3D()
{
	int blockId = blockIdx.x 
			 + blockIdx.y * gridDim.x 
			 + gridDim.x * gridDim.y * blockIdx.z; 
	int threadId = blockId * (blockDim.x * blockDim.y * blockDim.z)
			  + (threadIdx.z * (blockDim.x * blockDim.y))
			  + (threadIdx.y * blockDim.x)
			  + threadIdx.x;
	return threadId;
}


// Get the index
__device__ inline int getIndex( int n, const double *x, double dx, double y )
{
    if ( y < x[0] - 0.5 * dx || y > x[n - 1] + 0.5 * dx )
        return -1;
    return findfirstsingle( x, n, y - 0.5 * dx );
}


// Kernel that executes on the CUDA device
__global__
__launch_bounds__(128,8)    // Set bounds to limit the number of registers
void RayTraceImageHIPKernel( int N, int nx, int ny, int na, int nb, int nv, 
    const double *x, const double *y, const double *a, const double *b,
    double dx, double dy, double dz, double da, double db, const double *dv,
    const RayTrace::ray_gain_struct* gain, const RayTrace::ray_seed_struct* seed,
    int method, int N_rays, const ray_struct *rays, double scale,
    double *image, double *I_ang ) 
{
    int idx = getGlobalIdx3D();
    if ( idx<N_rays ) {
        const ray_struct ray = rays[idx];
        double Iv[K_MAX];
        ray_struct ray2;
        int error = RayTrace_calc_ray( ray, N, dz, gain, seed, nv, method, Iv, ray2 );
        if ( error!=0 ) {
            //failed_rays.push_back(ray);
            //set_bit(-error,failure_code);
        } else {
            if ( method == 1 ) {
                // We are propagating backward, use ray for the cell updates
                ray2 = ray;
            } else {
                // We are propagating forward, use ray2 for the cell updates
                // Note: The sign of the angle is reversed with respect to the euv_beam
                ray2.a = -ray2.a;
                ray2.b = -ray2.b;
                if ( ray2.y<0.0 && y[0]>=0.0 ) {
                    // We need to change the sign of y
                    ray2.y = -ray2.y;
                }
            }
            // Get the indicies to the cells in image and I_ang
            int i1 = getIndex( nx, x, dx, ray2.x );
            int i2 = getIndex( ny, y, dy, ray2.y );
            int i3 = getIndex( na, a, da, ray2.a );
            int i4 = getIndex( nb, b, db, ray2.b );
            // Copy I_out into image 
            if (i1>=0 && i2>=0){ 
                double *Iv2 = &image[nv*(i1+i2*nx)];
                for (int iv=0; iv<nv; iv++)
                    atomicAdd2(&Iv2[iv],Iv[iv]*scale);
            }
            // Copy I_out into I_ang 
            if (i3>=0 && i4>=0) {    
                double tmp = 0.0;
                for (int iv=0; iv<nv; iv++)
                    tmp += 2.0*dv[iv]*Iv[iv];
                atomicAdd2(&I_ang[i3+i4*na],tmp);
            }
        }
    }
}


// Compute the block size to use
inline dim3 calcBlockSize( size_t N_blocks )
{
    dim3 block_size;
    if ( N_blocks < 65535 ) {
        block_size.x = N_blocks;
    } else {
        block_size.y = N_blocks/32768;
        block_size.x = N_blocks/block_size.y + (N_blocks%block_size.y == 0 ? 0:1);
    }
    return block_size;
}


// Create the image and call the cuda kernel
void RayTraceImageHIPLoop( int N, const RayTrace::EUV_beam_struct& beam,
    const RayTrace::ray_gain_struct* gain_in, const RayTrace::ray_seed_struct* seed_in,
    int method, const std::vector<ray_struct>& rays, double scale,
    double *image, double *I_ang, 
    unsigned int& failure_code, std::vector<ray_struct>& failed_rays ) 
{
    failure_code = 0;   // Need to track failures on GPU
    // Get device properties
    static int maxThreadsPerBlock = 128;
    /*if ( maxThreadsPerBlock == 0 ) {
        hipFuncAttributes attr;
        hipFuncGetAttributes(&attr,RayTraceImageHIPKernel);
        maxThreadsPerBlock = attr.maxThreadsPerBlock;
        //CUDA_PRINT_FUNCTION(RayTraceImageCudaKernel);
        RAY_ASSERT(maxThreadsPerBlock>0);
    }*/
    // place the ray gain and seed structures on the device
    const int nx = beam.nx;
    const int ny = beam.ny;
    const int na = beam.na;
    const int nb = beam.nb;
    const int nv = beam.nv;
    const double dx = beam.dx;
    const double dy = beam.dy;
    const double dz = beam.dz;
    const double da = beam.da;
    const double db = beam.db;
    const RayTrace::ray_gain_struct* gain = RayTrace::ray_gain_struct::copy_device( N, gain_in );
    const RayTrace::ray_seed_struct* seed = NULL;
    if ( seed_in!=NULL )
        seed = seed_in->copy_device();
    // Allocate device memory
    size_t N_rays = rays.size();
    double *x2, *y2, *a2, *b2, *dv2, *image2, *I_ang2;
    ray_struct *rays2;
    hipMalloc(&x2,nx*sizeof(double));
    hipMalloc(&y2,ny*sizeof(double));
    hipMalloc(&a2,na*sizeof(double));
    hipMalloc(&b2,nb*sizeof(double));
    hipMalloc(&dv2,nv*sizeof(double));
    hipMalloc(&image2,nx*ny*nv*sizeof(double));
    hipMalloc(&I_ang2,na*nb*sizeof(double));
    hipMalloc(&rays2,N_rays*sizeof(ray_struct));
    hipMemcpy(x2,beam.x,nx*sizeof(double),hipMemcpyHostToDevice);
    hipMemcpy(y2,beam.y,ny*sizeof(double),hipMemcpyHostToDevice);
    hipMemcpy(a2,beam.a,na*sizeof(double),hipMemcpyHostToDevice);
    hipMemcpy(b2,beam.b,nb*sizeof(double),hipMemcpyHostToDevice);
    hipMemcpy(dv2,beam.dv,nv*sizeof(double),hipMemcpyHostToDevice);
    hipMemcpy(rays2,&rays[0],N_rays*sizeof(ray_struct),hipMemcpyHostToDevice);
    hipMemset(image2,0,nx*ny*nv*sizeof(double));
    hipMemset(I_ang2,0,na*nb*sizeof(double));
    HIP_CHECK();
    // Do calculation on device:
    size_t threads = maxThreadsPerBlock;
    size_t N_blocks = N_rays/threads + (N_rays%threads == 0 ? 0:1);
    dim3 block_size = calcBlockSize(N_blocks);
    block_size.x = N_rays/threads + (N_rays%threads == 0 ? 0:1);
    hipLaunchKernelGGL( RayTraceImageHIPKernel, dim3(block_size), dim3(threads), 0, 0,
        N,nx,ny,na,nb,nv,x2,y2,a2,b2,
        dx,dy,dz,da,db,dv2,gain,seed,method,N_rays,rays2,scale,image2,I_ang2);
    HIP_CHECK();
    // Retrieve result from device and store it in host array
    hipMemcpy(image,image2,nx*ny*nv*sizeof(double),hipMemcpyDeviceToHost);
    hipMemcpy(I_ang,I_ang2,na*nb*sizeof(double),hipMemcpyDeviceToHost);
    HIP_CHECK();
    // Cleanup
    hipFree(x2);
    hipFree(y2);
    hipFree(a2);
    hipFree(b2);
    hipFree(dv2);
    hipFree(rays2);
    hipFree(image2);
    hipFree(I_ang2);
    HIP_CHECK();
    RayTrace::ray_gain_struct::free_device( N, gain_in, gain );
    RayTrace::ray_seed_struct::free_device( seed_in, seed );
}


// Copy ray_gain_struct to GPU
const RayTrace::ray_gain_struct* ray_gain_struct_copy_device_hip( size_t N, const RayTrace::ray_gain_struct* arr )
{
    RayTrace::ray_gain_struct* host_ptr = new RayTrace::ray_gain_struct[N];
    for (size_t i=0; i<N; i++) {
        host_ptr[i].Nx = arr[i].Nx;
        host_ptr[i].Ny = arr[i].Ny;
        host_ptr[i].Nv = arr[i].Nv;
        host_ptr[i].E0 = nullptr;
        hipMalloc(&host_ptr[i].x,arr[i].Nx*sizeof(double));
        hipMalloc(&host_ptr[i].y,arr[i].Ny*sizeof(double));
        hipMalloc(&host_ptr[i].n,arr[i].Nx*arr[i].Ny*sizeof(double));
        hipMalloc(&host_ptr[i].g0,arr[i].Nx*arr[i].Ny*sizeof(float));
        hipMalloc(&host_ptr[i].E0,arr[i].Nx*arr[i].Ny*sizeof(float));
        hipMalloc(&host_ptr[i].gv,arr[i].Nx*arr[i].Ny*arr[i].Nv*sizeof(float));
        hipMalloc(&host_ptr[i].gv0,arr[i].Nx*arr[i].Ny*sizeof(float));
        hipMemcpy(host_ptr[i].x,arr[i].x,arr[i].Nx*sizeof(double),hipMemcpyHostToDevice);
        hipMemcpy(host_ptr[i].y,arr[i].y,arr[i].Ny*sizeof(double),hipMemcpyHostToDevice);
        hipMemcpy(host_ptr[i].n,arr[i].n,arr[i].Nx*arr[i].Ny*sizeof(double),hipMemcpyHostToDevice);
        hipMemcpy(host_ptr[i].g0,arr[i].g0,arr[i].Nx*arr[i].Ny*sizeof(float),hipMemcpyHostToDevice);
        if ( arr[i].E0 != nullptr )
            hipMemcpy(host_ptr[i].E0,arr[i].E0,arr[i].Nx*arr[i].Ny*sizeof(float),hipMemcpyHostToDevice);
        hipMemcpy(host_ptr[i].gv,arr[i].gv,arr[i].Nx*arr[i].Ny*arr[i].Nv*sizeof(float),hipMemcpyHostToDevice);
        hipMemcpy(host_ptr[i].gv0,arr[i].gv0,arr[i].Nx*arr[i].Ny*sizeof(float),hipMemcpyHostToDevice);
    }
    RayTrace::ray_gain_struct* dev_ptr=NULL;
    hipMalloc(&dev_ptr,N*sizeof(RayTrace::ray_gain_struct));
    hipMemcpy(dev_ptr,host_ptr,N*sizeof(RayTrace::ray_gain_struct),hipMemcpyHostToDevice);
    for (size_t i=0; i<N; i++) {
        host_ptr[i].x = NULL;
        host_ptr[i].y = NULL;
        host_ptr[i].n = NULL;
        host_ptr[i].g0 = NULL;
        host_ptr[i].E0 = NULL;
        host_ptr[i].gv = NULL;
        host_ptr[i].gv0 = NULL;
    }
    delete [] host_ptr;
    HIP_CHECK();
    return dev_ptr;
}
// Free ray_gain_struct from GPU
void ray_gain_struct_free_device_cuda( size_t N, const RayTrace::ray_gain_struct* dev_ptr )
{
    RayTrace::ray_gain_struct* host_ptr = new RayTrace::ray_gain_struct[N];
    hipMemcpy(host_ptr,dev_ptr,N*sizeof(RayTrace::ray_gain_struct),hipMemcpyDeviceToHost);
    for (size_t i=0; i<N; i++) {
        hipFree(host_ptr[i].x);
        hipFree(host_ptr[i].y);
        hipFree(host_ptr[i].n);
        hipFree(host_ptr[i].g0);
        hipFree(host_ptr[i].E0);
        hipFree(host_ptr[i].gv);
        hipFree(host_ptr[i].gv0);
        host_ptr[i].x = NULL;
        host_ptr[i].y = NULL;
        host_ptr[i].n = NULL;
        host_ptr[i].g0 = NULL;
        host_ptr[i].E0 = NULL;
        host_ptr[i].gv = NULL;
        host_ptr[i].gv0 = NULL;
    }
    hipFree((void*)dev_ptr);
    delete [] host_ptr;
    HIP_CHECK();
}


// Copy ray_seed_struct to GPU
const RayTrace::ray_seed_struct* ray_seed_struct_copy_device_hip( const RayTrace::ray_seed_struct& seed )
{
    RayTrace::ray_seed_struct* host_ptr = new RayTrace::ray_seed_struct();
    host_ptr->f0 = seed.f0;
    for (size_t i=0; i<5; i++) {
        host_ptr->dim[i] = seed.dim[i];
        hipMalloc(&host_ptr->x[i],seed.dim[i]*sizeof(double));
        hipMalloc(&host_ptr->f[i],seed.dim[i]*sizeof(double));
        hipMemcpy(host_ptr->x[i],seed.x[i],seed.dim[i]*sizeof(double),hipMemcpyHostToDevice);
        hipMemcpy(host_ptr->f[i],seed.f[i],seed.dim[i]*sizeof(double),hipMemcpyHostToDevice);
    }
    RayTrace::ray_seed_struct* dev_ptr=NULL;
    hipMalloc(&dev_ptr,sizeof(RayTrace::ray_seed_struct));
    hipMemcpy(dev_ptr,host_ptr,sizeof(RayTrace::ray_seed_struct),hipMemcpyHostToDevice);
    for (size_t i=0; i<5; i++) {
        host_ptr->x[i] = NULL;
        host_ptr->f[i] = NULL;
    }
    delete host_ptr;
    HIP_CHECK();
    return dev_ptr;
}
// Free ray_seed_struct from GPU
void ray_seed_struct_free_device_hip( const RayTrace::ray_seed_struct* dev_ptr )
{
    RayTrace::ray_seed_struct* host_ptr = new RayTrace::ray_seed_struct;
    hipMemcpy(host_ptr,dev_ptr,sizeof(RayTrace::ray_seed_struct),hipMemcpyDeviceToHost);
    for (size_t i=0; i<5; i++) {
        hipFree(host_ptr->x[i]);
        hipFree(host_ptr->f[i]);
        host_ptr->x[i] = NULL;
        host_ptr->f[i] = NULL;
    }
    hipFree((void*)dev_ptr);
    delete host_ptr;
    HIP_CHECK();
}

Loading