Commit 6726c4de authored by Berrill, Mark's avatar Berrill, Mark
Browse files

Creating hip branch

parent 87d98088
Loading
Loading
Loading
Loading
+5 −4
Original line number Diff line number Diff line
@@ -4,13 +4,14 @@ export KOKKOS_DIR=/packages/TPLs/install/opt/kokkos

rm -rf CMake*

cmake                                   \
/ccs/home/mbt/bin/cmake                 \
   -D CMAKE_BUILD_TYPE=Release          \
   -D CMAKE_CXX_COMPILER=g++            \
   -D CMAKE_CXX_COMPILER=hipcc            \
   -D CXX_STD=11                        \
      -D LDFLAGS="-pthread"  \
   -D USE_OPENACC=0                     \
   -D USE_OPENMP=1                      \
   -D USE_KOKKOS=1                      \
   -D USE_OPENMP=0                      \
   -D USE_KOKKOS=0                      \
      -D KOKKOS_DIRECTORY=${KOKKOS_DIR} \
      -D KOKKOS_WRAPPER=${KOKKOS_DIR}/nvcc_wrapper \
   -D USE_CUDA=1                        \
+34 −33
Original line number Diff line number Diff line
@@ -69,8 +69,8 @@ IF ( USE_OPENMP )
        SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mp")
        SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mp")
    ELSEIF ( USING_CLANG )
        SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fopenmp -pthread")
        SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp -pthread")
        SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fopenmp -pthread -fPIC")
        SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp -pthread -fPIC")
    ELSEIF ( USING_XL )
        SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -qsmp=omp")
        SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -qsmp=omp")
@@ -124,32 +124,32 @@ ENDIF()

# Enable cuda
IF ( USE_CUDA )
    # Include FindCUDA
    INCLUDE( FindCUDA )
    IF ( NOT CUDA_FOUND )
        MESSAGE ( FATAL_ERROR "CUDA not found" )
    ENDIF()
    IF ( CUDA_FLAGS )
        SET( CUDA_NVCC_FLAGS "${CUDA_FLAGS} ${CXX_STD_FLAG}" )
    ENDIF()
    IF(NOT CUDA_NVCC_FLAGS)
        # Set minimum requirements
        SET( CUDA_NVCC_FLAGS "-arch=sm_30 ${CXX_STD_FLAG}" )
    ENDIF()
    IF ( ${CMAKE_BUILD_TYPE} STREQUAL "Debug" )
        SET( CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -lineinfo" )
    ENDIF()
    # SET( CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} ${CXX_STD_FLAG}" )
    IF( NOT CMAKE_BUILD_TYPE )
        MESSAGE(FATAL_ERROR "CMAKE_BUILD_TYPE is not set")
    ELSEIF( ${CMAKE_BUILD_TYPE} STREQUAL "Debug" )
        SET( CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -g -O0" )
    ELSEIF( ${CMAKE_BUILD_TYPE} STREQUAL "Release" )
        SET( CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -O3" )
    ENDIF()
    SET( CUDA_PROPAGATE_HOST_FLAGS OFF )
    SET( CUDA_FIND_QUIETLY )
    STRING( REPLACE " " ";" CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} )
#    # Include FindCUDA
#    INCLUDE( FindCUDA )
#    IF ( NOT CUDA_FOUND )
#        MESSAGE ( FATAL_ERROR "CUDA not found" )
#    ENDIF()
#    IF ( CUDA_FLAGS )
#        SET( CUDA_NVCC_FLAGS "${CUDA_FLAGS} ${CXX_STD_FLAG}" )
#    ENDIF()
#    IF(NOT CUDA_NVCC_FLAGS)
#        # Set minimum requirements
#        SET( CUDA_NVCC_FLAGS "-arch=sm_30 ${CXX_STD_FLAG}" )
#    ENDIF()
#    IF ( ${CMAKE_BUILD_TYPE} STREQUAL "Debug" )
#        SET( CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -lineinfo" )
#    ENDIF()
#    # SET( CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} ${CXX_STD_FLAG}" )
#    IF( NOT CMAKE_BUILD_TYPE )
#        MESSAGE(FATAL_ERROR "CMAKE_BUILD_TYPE is not set")
#    ELSEIF( ${CMAKE_BUILD_TYPE} STREQUAL "Debug" )
#        SET( CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -g -O0" )
#    ELSEIF( ${CMAKE_BUILD_TYPE} STREQUAL "Release" )
#        SET( CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -O3" )
#    ENDIF()
#    SET( CUDA_PROPAGATE_HOST_FLAGS OFF )
#    SET( CUDA_FIND_QUIETLY )
#    STRING( REPLACE " " ";" CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} )
    INCLUDE_DIRECTORIES ( ${CUDA_INCLUDE_DIRS} )
    ADD_DEFINITIONS( "-DUSE_CUDA" ) 
    MESSAGE( "Using CUDA ${CUDA_VERSION}" ) 
@@ -176,7 +176,7 @@ ADD_DISTCLEAN( libRayTrace.* null_timer CreateImage* )
# Create the library
INCLUDE_DIRECTORIES( ${RAYTRACE_SOURCE_DIR} )
ADD_DEFINITIONS( -DDISABLE_WRITE_FAILED_RAYS )
SET( SOURCES RayTrace RayTraceImage.cpp RayTraceStructures.cpp utilities/RayUtilities.cpp AtomicModel/interp.cpp RayTraceImageCPU.cpp CreateImageHelpers.cpp )
SET( SOURCES RayTrace RayTraceImage.cpp RayTraceStructures.cpp utilities/RayUtilities.cpp AtomicModel/interp.cpp RayTraceImageCPU.cpp CreateImageHelpers.cpp RayTraceImageHip.cpp )
IF ( USE_OPENACC )
    SET( SOURCES ${SOURCES} RayTraceImageOpenACC.cpp )
ENDIF()
@@ -184,14 +184,15 @@ IF ( USE_KOKKOS )
    #SET( SOURCES ${SOURCES} RayTraceImageKokkos.cpp )
ENDIF()
IF ( USE_CUDA )
    SET( CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE )
    SET( CMAKE_BUILD_WITH_INSTALL_RPATH TRUE )
    CUDA_COMPILE( CUBINS RayTraceImageCuda.cu )
    #SET( CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE )
    #SET( CMAKE_BUILD_WITH_INSTALL_RPATH TRUE )
    #CUDA_COMPILE( CUBINS RayTraceImageCuda.cu )
ENDIF()
ADD_LIBRARY( ${SOURCES} ${CUBINS} )
ADD_LIBRARY( RayTrace STATIC ${SOURCES} ${CUBINS} )


# Add the applications

ADD_EXECUTABLE( CreateImage CreateImage.cpp )
TARGET_LINK_LIBRARIES( CreateImage RayTrace ${CUDA_LIBRARIES} ${KOKKOS_LIB} ${TIMER_LIBS} ${LDFLAGS} ${LDLIBS} )
INSTALL( TARGETS CreateImage DESTINATION ${RAYTRACE_INSTALL_DIR}/bin )
+2 −2
Original line number Diff line number Diff line
@@ -6,7 +6,7 @@


#ifdef USE_CUDA
#include <cuda_runtime_api.h>
#include <hip/hip_runtime_api.h>
#endif

// Detect the OS
@@ -250,7 +250,7 @@ void printHardware()
    // Get number of gpus
    int N_gpu = 0;
#ifdef USE_CUDA
    cudaGetDeviceCount( &N_gpu );
    hipGetDeviceCount( &N_gpu );
#endif
    // Get system memory
#if defined( USE_LINUX )
+3 −3
Original line number Diff line number Diff line
@@ -37,7 +37,7 @@
#undef USE_OPENMP
#endif
#ifdef ENABLE_CUDA
#include <cuda_runtime_api.h>
#include <hip/hip_runtime_api.h>
#endif
#include "common/RayTraceDefinitions.h"
#include "common/RayTraceImageHelper.h"
@@ -79,7 +79,7 @@ void setGPU( int id )
{
    NULL_USE( id );
#if defined( ENABLE_CUDA )
    cudaSetDevice( id );
    hipSetDevice( id );
#endif
}
void setDeviceAndRun( int id, int N_threads,
@@ -369,7 +369,7 @@ void RayTrace::create_image( create_image_struct *info, std::string compute_meth
    } else if ( compute_method == "cuda-multigpu" ) {
#if defined( ENABLE_CUDA )
        int N_gpu;
        cudaGetDeviceCount( &N_gpu );
        hipGetDeviceCount( &N_gpu );
        RayTraceImageThreadLoop( N_gpu, RayTraceImageCudaLoop, setGPU, N,
            std::ref( *info->euv_beam ), info->gain, info->seed, method, rays, scale, image, I_ang,
            failure_code, failed_rays );
+78 −77
Original line number Diff line number Diff line
#include "hip/hip_runtime.h"
#include "RayTraceStructures.h"
#include "common/RayTraceImageHelper.h"
#include "utilities/RayUtilities.h"
@@ -7,10 +8,10 @@
// Check for errors
#define CUDA_CHECK()                                    \
    do {                                                \
        if ( cudaPeekAtLastError() != cudaSuccess ) {   \
            cudaError_t error = cudaGetLastError();     \
        if ( hipPeekAtLastError() != hipSuccess ) {   \
            hipError_t error = hipGetLastError();     \
            printf("cuda error: %i\n",error);           \
            printf("   %s\n",cudaGetErrorString(error)); \
            printf("   %s\n",hipGetErrorString(error)); \
            printf("   line: %i\n",(int)__LINE__);      \
            printf("   file: %s\n",__FILE__);           \
            exit(-1);                                   \
@@ -18,7 +19,7 @@
    } while (0)


#define CUDA_PRINT_FUNCTION( fun )                      \
#define CUDA_PRINT_FUNCTION( fun )     /*                 \
    do {                                                \
        cudaFuncAttributes attr;                        \
        cudaFuncGetAttributes(&attr,fun);               \
@@ -30,7 +31,7 @@
        printf("  sharedSize = %i\n",attr.sharedSizeBytes); \
        printf("  maxThreads = %i\n",attr.maxThreadsPerBlock); \
        printf("  numRegs = %i\n",attr.numRegs);        \
    } while (0)
    } while (0)*/


// Atomic add operation for double
@@ -56,13 +57,13 @@ __device__ double atomicAdd2(double* address, double val)
// Get the globally unique thread id
__device__ int getGlobalIdx3D()
{
	int blockId = blockIdx.x 
			 + blockIdx.y * gridDim.x 
			 + gridDim.x * gridDim.y * blockIdx.z; 
	int threadId = blockId * (blockDim.x * blockDim.y * blockDim.z)
			  + (threadIdx.z * (blockDim.x * blockDim.y))
			  + (threadIdx.y * blockDim.x)
			  + threadIdx.x;
	int blockId = hipBlockIdx_x 
			 + hipBlockIdx_y * hipGridDim_x 
			 + hipGridDim_x * hipGridDim_y * hipBlockIdx_z; 
	int threadId = blockId * (hipBlockDim_x * hipBlockDim_y * hipBlockDim_z)
			  + (hipThreadIdx_z * (hipBlockDim_x * hipBlockDim_y))
			  + (hipThreadIdx_y * hipBlockDim_x)
			  + hipThreadIdx_x;
	return threadId;
}

@@ -155,14 +156,14 @@ void RayTraceImageCudaLoop( int N, const RayTrace::EUV_beam_struct& beam,
{
    failure_code = 0;   // Need to track failures on GPU
    // Get device properties
    static int maxThreadsPerBlock = 0;
    if ( maxThreadsPerBlock == 0 ) {
    static int maxThreadsPerBlock = 8000;
    /*if ( maxThreadsPerBlock == 0 ) {
        cudaFuncAttributes attr;
        cudaFuncGetAttributes(&attr,RayTraceImageCudaKernel);
        maxThreadsPerBlock = attr.maxThreadsPerBlock;
        RAY_ASSERT(maxThreadsPerBlock>0);
        //CUDA_PRINT_FUNCTION(RayTraceImageCudaKernel);
    }    
    }  */  
    // place the ray gain and seed structures on the device
    const int nx = beam.nx;
    const int ny = beam.ny;
@@ -182,44 +183,44 @@ void RayTraceImageCudaLoop( int N, const RayTrace::EUV_beam_struct& beam,
    size_t N_rays = rays.size();
    double *x2, *y2, *a2, *b2, *dv2, *image2, *I_ang2;
    ray_struct *rays2;
    cudaMalloc(&x2,nx*sizeof(double));
    cudaMalloc(&y2,ny*sizeof(double));
    cudaMalloc(&a2,na*sizeof(double));
    cudaMalloc(&b2,nb*sizeof(double));
    cudaMalloc(&dv2,nv*sizeof(double));
    cudaMalloc(&image2,nx*ny*nv*sizeof(double));
    cudaMalloc(&I_ang2,na*nb*sizeof(double));
    cudaMalloc(&rays2,N_rays*sizeof(ray_struct));
    cudaMemcpy(x2,beam.x,nx*sizeof(double),cudaMemcpyHostToDevice);
    cudaMemcpy(y2,beam.y,ny*sizeof(double),cudaMemcpyHostToDevice);
    cudaMemcpy(a2,beam.a,na*sizeof(double),cudaMemcpyHostToDevice);
    cudaMemcpy(b2,beam.b,nb*sizeof(double),cudaMemcpyHostToDevice);
    cudaMemcpy(dv2,beam.dv,nv*sizeof(double),cudaMemcpyHostToDevice);
    cudaMemcpy(rays2,&rays[0],N_rays*sizeof(ray_struct),cudaMemcpyHostToDevice);
    cudaMemset(image2,0,nx*ny*nv*sizeof(double));
    cudaMemset(I_ang2,0,na*nb*sizeof(double));
    hipMalloc(&x2,nx*sizeof(double));
    hipMalloc(&y2,ny*sizeof(double));
    hipMalloc(&a2,na*sizeof(double));
    hipMalloc(&b2,nb*sizeof(double));
    hipMalloc(&dv2,nv*sizeof(double));
    hipMalloc(&image2,nx*ny*nv*sizeof(double));
    hipMalloc(&I_ang2,na*nb*sizeof(double));
    hipMalloc(&rays2,N_rays*sizeof(ray_struct));
    hipMemcpy(x2,beam.x,nx*sizeof(double),hipMemcpyHostToDevice);
    hipMemcpy(y2,beam.y,ny*sizeof(double),hipMemcpyHostToDevice);
    hipMemcpy(a2,beam.a,na*sizeof(double),hipMemcpyHostToDevice);
    hipMemcpy(b2,beam.b,nb*sizeof(double),hipMemcpyHostToDevice);
    hipMemcpy(dv2,beam.dv,nv*sizeof(double),hipMemcpyHostToDevice);
    hipMemcpy(rays2,&rays[0],N_rays*sizeof(ray_struct),hipMemcpyHostToDevice);
    hipMemset(image2,0,nx*ny*nv*sizeof(double));
    hipMemset(I_ang2,0,na*nb*sizeof(double));
    CUDA_CHECK();
    // Do calculation on device:
    size_t threads = maxThreadsPerBlock;
    size_t N_blocks = N_rays/threads + (N_rays%threads == 0 ? 0:1);
    dim3 block_size = calcBlockSize(N_blocks);
    block_size.x = N_rays/threads + (N_rays%threads == 0 ? 0:1);
    RayTraceImageCudaKernel <<< block_size,threads >>> (N,nx,ny,na,nb,nv,x2,y2,a2,b2,
    hipLaunchKernelGGL((RayTraceImageCudaKernel), dim3(block_size), dim3(threads ), 0, 0, N,nx,ny,na,nb,nv,x2,y2,a2,b2,
        dx,dy,dz,da,db,dv2,gain,seed,method,N_rays,rays2,scale,image2,I_ang2);
    CUDA_CHECK();
    // Retrieve result from device and store it in host array
    cudaMemcpy(image,image2,nx*ny*nv*sizeof(double),cudaMemcpyDeviceToHost);
    cudaMemcpy(I_ang,I_ang2,na*nb*sizeof(double),cudaMemcpyDeviceToHost);
    hipMemcpy(image,image2,nx*ny*nv*sizeof(double),hipMemcpyDeviceToHost);
    hipMemcpy(I_ang,I_ang2,na*nb*sizeof(double),hipMemcpyDeviceToHost);
    CUDA_CHECK();
    // Cleanup
    cudaFree(x2);
    cudaFree(y2);
    cudaFree(a2);
    cudaFree(b2);
    cudaFree(dv2);
    cudaFree(rays2);
    cudaFree(image2);
    cudaFree(I_ang2);
    hipFree(x2);
    hipFree(y2);
    hipFree(a2);
    hipFree(b2);
    hipFree(dv2);
    hipFree(rays2);
    hipFree(image2);
    hipFree(I_ang2);
    CUDA_CHECK();
    RayTrace::ray_gain_struct::free_device( N, gain_in, gain );
    RayTrace::ray_seed_struct::free_device( seed_in, seed );
@@ -235,25 +236,25 @@ const RayTrace::ray_gain_struct* ray_gain_struct_copy_device_cuda( size_t N, con
        host_ptr[i].Ny = arr[i].Ny;
        host_ptr[i].Nv = arr[i].Nv;
        host_ptr[i].E0 = nullptr;
        cudaMalloc(&host_ptr[i].x,arr[i].Nx*sizeof(double));
        cudaMalloc(&host_ptr[i].y,arr[i].Ny*sizeof(double));
        cudaMalloc(&host_ptr[i].n,arr[i].Nx*arr[i].Ny*sizeof(double));
        cudaMalloc(&host_ptr[i].g0,arr[i].Nx*arr[i].Ny*sizeof(float));
        cudaMalloc(&host_ptr[i].E0,arr[i].Nx*arr[i].Ny*sizeof(float));
        cudaMalloc(&host_ptr[i].gv,arr[i].Nx*arr[i].Ny*arr[i].Nv*sizeof(float));
        cudaMalloc(&host_ptr[i].gv0,arr[i].Nx*arr[i].Ny*sizeof(float));
        cudaMemcpy(host_ptr[i].x,arr[i].x,arr[i].Nx*sizeof(double),cudaMemcpyHostToDevice);
        cudaMemcpy(host_ptr[i].y,arr[i].y,arr[i].Ny*sizeof(double),cudaMemcpyHostToDevice);
        cudaMemcpy(host_ptr[i].n,arr[i].n,arr[i].Nx*arr[i].Ny*sizeof(double),cudaMemcpyHostToDevice);
        cudaMemcpy(host_ptr[i].g0,arr[i].g0,arr[i].Nx*arr[i].Ny*sizeof(float),cudaMemcpyHostToDevice);
        hipMalloc(&host_ptr[i].x,arr[i].Nx*sizeof(double));
        hipMalloc(&host_ptr[i].y,arr[i].Ny*sizeof(double));
        hipMalloc(&host_ptr[i].n,arr[i].Nx*arr[i].Ny*sizeof(double));
        hipMalloc(&host_ptr[i].g0,arr[i].Nx*arr[i].Ny*sizeof(float));
        hipMalloc(&host_ptr[i].E0,arr[i].Nx*arr[i].Ny*sizeof(float));
        hipMalloc(&host_ptr[i].gv,arr[i].Nx*arr[i].Ny*arr[i].Nv*sizeof(float));
        hipMalloc(&host_ptr[i].gv0,arr[i].Nx*arr[i].Ny*sizeof(float));
        hipMemcpy(host_ptr[i].x,arr[i].x,arr[i].Nx*sizeof(double),hipMemcpyHostToDevice);
        hipMemcpy(host_ptr[i].y,arr[i].y,arr[i].Ny*sizeof(double),hipMemcpyHostToDevice);
        hipMemcpy(host_ptr[i].n,arr[i].n,arr[i].Nx*arr[i].Ny*sizeof(double),hipMemcpyHostToDevice);
        hipMemcpy(host_ptr[i].g0,arr[i].g0,arr[i].Nx*arr[i].Ny*sizeof(float),hipMemcpyHostToDevice);
        if ( arr[i].E0 != nullptr )
            cudaMemcpy(host_ptr[i].E0,arr[i].E0,arr[i].Nx*arr[i].Ny*sizeof(float),cudaMemcpyHostToDevice);
        cudaMemcpy(host_ptr[i].gv,arr[i].gv,arr[i].Nx*arr[i].Ny*arr[i].Nv*sizeof(float),cudaMemcpyHostToDevice);
        cudaMemcpy(host_ptr[i].gv0,arr[i].gv0,arr[i].Nx*arr[i].Ny*sizeof(float),cudaMemcpyHostToDevice);
            hipMemcpy(host_ptr[i].E0,arr[i].E0,arr[i].Nx*arr[i].Ny*sizeof(float),hipMemcpyHostToDevice);
        hipMemcpy(host_ptr[i].gv,arr[i].gv,arr[i].Nx*arr[i].Ny*arr[i].Nv*sizeof(float),hipMemcpyHostToDevice);
        hipMemcpy(host_ptr[i].gv0,arr[i].gv0,arr[i].Nx*arr[i].Ny*sizeof(float),hipMemcpyHostToDevice);
    }
    RayTrace::ray_gain_struct* dev_ptr=NULL;
    cudaMalloc(&dev_ptr,N*sizeof(RayTrace::ray_gain_struct));
    cudaMemcpy(dev_ptr,host_ptr,N*sizeof(RayTrace::ray_gain_struct),cudaMemcpyHostToDevice);
    hipMalloc(&dev_ptr,N*sizeof(RayTrace::ray_gain_struct));
    hipMemcpy(dev_ptr,host_ptr,N*sizeof(RayTrace::ray_gain_struct),hipMemcpyHostToDevice);
    for (size_t i=0; i<N; i++) {
        host_ptr[i].x = NULL;
        host_ptr[i].y = NULL;
@@ -271,15 +272,15 @@ const RayTrace::ray_gain_struct* ray_gain_struct_copy_device_cuda( size_t N, con
void ray_gain_struct_free_device_cuda( size_t N, const RayTrace::ray_gain_struct* dev_ptr )
{
    RayTrace::ray_gain_struct* host_ptr = new RayTrace::ray_gain_struct[N];
    cudaMemcpy(host_ptr,dev_ptr,N*sizeof(RayTrace::ray_gain_struct),cudaMemcpyDeviceToHost);
    hipMemcpy(host_ptr,dev_ptr,N*sizeof(RayTrace::ray_gain_struct),hipMemcpyDeviceToHost);
    for (size_t i=0; i<N; i++) {
        cudaFree(host_ptr[i].x);
        cudaFree(host_ptr[i].y);
        cudaFree(host_ptr[i].n);
        cudaFree(host_ptr[i].g0);
        cudaFree(host_ptr[i].E0);
        cudaFree(host_ptr[i].gv);
        cudaFree(host_ptr[i].gv0);
        hipFree(host_ptr[i].x);
        hipFree(host_ptr[i].y);
        hipFree(host_ptr[i].n);
        hipFree(host_ptr[i].g0);
        hipFree(host_ptr[i].E0);
        hipFree(host_ptr[i].gv);
        hipFree(host_ptr[i].gv0);
        host_ptr[i].x = NULL;
        host_ptr[i].y = NULL;
        host_ptr[i].n = NULL;
@@ -288,7 +289,7 @@ void ray_gain_struct_free_device_cuda( size_t N, const RayTrace::ray_gain_struct
        host_ptr[i].gv = NULL;
        host_ptr[i].gv0 = NULL;
    }
    cudaFree((void*)dev_ptr);
    hipFree((void*)dev_ptr);
    delete [] host_ptr;
    CUDA_CHECK();
}
@@ -301,14 +302,14 @@ const RayTrace::ray_seed_struct* ray_seed_struct_copy_device_cuda( const RayTrac
    host_ptr->f0 = seed.f0;
    for (size_t i=0; i<5; i++) {
        host_ptr->dim[i] = seed.dim[i];
        cudaMalloc(&host_ptr->x[i],seed.dim[i]*sizeof(double));
        cudaMalloc(&host_ptr->f[i],seed.dim[i]*sizeof(double));
        cudaMemcpy(host_ptr->x[i],seed.x[i],seed.dim[i]*sizeof(double),cudaMemcpyHostToDevice);
        cudaMemcpy(host_ptr->f[i],seed.f[i],seed.dim[i]*sizeof(double),cudaMemcpyHostToDevice);
        hipMalloc(&host_ptr->x[i],seed.dim[i]*sizeof(double));
        hipMalloc(&host_ptr->f[i],seed.dim[i]*sizeof(double));
        hipMemcpy(host_ptr->x[i],seed.x[i],seed.dim[i]*sizeof(double),hipMemcpyHostToDevice);
        hipMemcpy(host_ptr->f[i],seed.f[i],seed.dim[i]*sizeof(double),hipMemcpyHostToDevice);
    }
    RayTrace::ray_seed_struct* dev_ptr=NULL;
    cudaMalloc(&dev_ptr,sizeof(RayTrace::ray_seed_struct));
    cudaMemcpy(dev_ptr,host_ptr,sizeof(RayTrace::ray_seed_struct),cudaMemcpyHostToDevice);
    hipMalloc(&dev_ptr,sizeof(RayTrace::ray_seed_struct));
    hipMemcpy(dev_ptr,host_ptr,sizeof(RayTrace::ray_seed_struct),hipMemcpyHostToDevice);
    for (size_t i=0; i<5; i++) {
        host_ptr->x[i] = NULL;
        host_ptr->f[i] = NULL;
@@ -321,14 +322,14 @@ const RayTrace::ray_seed_struct* ray_seed_struct_copy_device_cuda( const RayTrac
void ray_seed_struct_free_device_cuda( const RayTrace::ray_seed_struct* dev_ptr )
{
    RayTrace::ray_seed_struct* host_ptr = new RayTrace::ray_seed_struct;
    cudaMemcpy(host_ptr,dev_ptr,sizeof(RayTrace::ray_seed_struct),cudaMemcpyDeviceToHost);
    hipMemcpy(host_ptr,dev_ptr,sizeof(RayTrace::ray_seed_struct),hipMemcpyDeviceToHost);
    for (size_t i=0; i<5; i++) {
        cudaFree(host_ptr->x[i]);
        cudaFree(host_ptr->f[i]);
        hipFree(host_ptr->x[i]);
        hipFree(host_ptr->f[i]);
        host_ptr->x[i] = NULL;
        host_ptr->f[i] = NULL;
    }
    cudaFree((void*)dev_ptr);
    hipFree((void*)dev_ptr);
    delete host_ptr;
    CUDA_CHECK();
}
Loading