Creating hip branch (6726c4de) · Commits · Berrill, Mark A / RayTrace-miniapp

build/configure

+5 −4

Original line number	Diff line number	Diff line
		@@ -4,13 +4,14 @@ export KOKKOS_DIR=/packages/TPLs/install/opt/kokkos

		rm -rf CMake*

		cmake \
		/ccs/home/mbt/bin/cmake \
		-D CMAKE_BUILD_TYPE=Release \
		-D CMAKE_CXX_COMPILER=g++ \
		-D CMAKE_CXX_COMPILER=hipcc \
		-D CXX_STD=11 \
		-D LDFLAGS="-pthread" \
		-D USE_OPENACC=0 \
		-D USE_OPENMP=1 \
		-D USE_KOKKOS=1 \
		-D USE_OPENMP=0 \
		-D USE_KOKKOS=0 \
		-D KOKKOS_DIRECTORY=${KOKKOS_DIR} \
		-D KOKKOS_WRAPPER=${KOKKOS_DIR}/nvcc_wrapper \
		-D USE_CUDA=1 \

src/CMakeLists.txt

+34 −33

Original line number	Diff line number	Diff line
		@@ -69,8 +69,8 @@ IF ( USE_OPENMP )
		SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mp")
		SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mp")
		ELSEIF ( USING_CLANG )
		SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fopenmp -pthread")
		SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp -pthread")
		SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fopenmp -pthread -fPIC")
		SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp -pthread -fPIC")
		ELSEIF ( USING_XL )
		SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -qsmp=omp")
		SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -qsmp=omp")
		@@ -124,32 +124,32 @@ ENDIF()

		# Enable cuda
		IF ( USE_CUDA )
		# Include FindCUDA
		INCLUDE( FindCUDA )
		IF ( NOT CUDA_FOUND )
		MESSAGE ( FATAL_ERROR "CUDA not found" )
		ENDIF()
		IF ( CUDA_FLAGS )
		SET( CUDA_NVCC_FLAGS "${CUDA_FLAGS} ${CXX_STD_FLAG}" )
		ENDIF()
		IF(NOT CUDA_NVCC_FLAGS)
		# Set minimum requirements
		SET( CUDA_NVCC_FLAGS "-arch=sm_30 ${CXX_STD_FLAG}" )
		ENDIF()
		IF ( ${CMAKE_BUILD_TYPE} STREQUAL "Debug" )
		SET( CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -lineinfo" )
		ENDIF()
		# SET( CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} ${CXX_STD_FLAG}" )
		IF( NOT CMAKE_BUILD_TYPE )
		MESSAGE(FATAL_ERROR "CMAKE_BUILD_TYPE is not set")
		ELSEIF( ${CMAKE_BUILD_TYPE} STREQUAL "Debug" )
		SET( CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -g -O0" )
		ELSEIF( ${CMAKE_BUILD_TYPE} STREQUAL "Release" )
		SET( CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -O3" )
		ENDIF()
		SET( CUDA_PROPAGATE_HOST_FLAGS OFF )
		SET( CUDA_FIND_QUIETLY )
		STRING( REPLACE " " ";" CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} )
		# # Include FindCUDA
		# INCLUDE( FindCUDA )
		# IF ( NOT CUDA_FOUND )
		# MESSAGE ( FATAL_ERROR "CUDA not found" )
		# ENDIF()
		# IF ( CUDA_FLAGS )
		# SET( CUDA_NVCC_FLAGS "${CUDA_FLAGS} ${CXX_STD_FLAG}" )
		# ENDIF()
		# IF(NOT CUDA_NVCC_FLAGS)
		# # Set minimum requirements
		# SET( CUDA_NVCC_FLAGS "-arch=sm_30 ${CXX_STD_FLAG}" )
		# ENDIF()
		# IF ( ${CMAKE_BUILD_TYPE} STREQUAL "Debug" )
		# SET( CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -lineinfo" )
		# ENDIF()
		# # SET( CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} ${CXX_STD_FLAG}" )
		# IF( NOT CMAKE_BUILD_TYPE )
		# MESSAGE(FATAL_ERROR "CMAKE_BUILD_TYPE is not set")
		# ELSEIF( ${CMAKE_BUILD_TYPE} STREQUAL "Debug" )
		# SET( CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -g -O0" )
		# ELSEIF( ${CMAKE_BUILD_TYPE} STREQUAL "Release" )
		# SET( CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -O3" )
		# ENDIF()
		# SET( CUDA_PROPAGATE_HOST_FLAGS OFF )
		# SET( CUDA_FIND_QUIETLY )
		# STRING( REPLACE " " ";" CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} )
		INCLUDE_DIRECTORIES ( ${CUDA_INCLUDE_DIRS} )
		ADD_DEFINITIONS( "-DUSE_CUDA" )
		MESSAGE( "Using CUDA ${CUDA_VERSION}" )
		@@ -176,7 +176,7 @@ ADD_DISTCLEAN( libRayTrace.* null_timer CreateImage* )
		# Create the library
		INCLUDE_DIRECTORIES( ${RAYTRACE_SOURCE_DIR} )
		ADD_DEFINITIONS( -DDISABLE_WRITE_FAILED_RAYS )
		SET( SOURCES RayTrace RayTraceImage.cpp RayTraceStructures.cpp utilities/RayUtilities.cpp AtomicModel/interp.cpp RayTraceImageCPU.cpp CreateImageHelpers.cpp )
		SET( SOURCES RayTrace RayTraceImage.cpp RayTraceStructures.cpp utilities/RayUtilities.cpp AtomicModel/interp.cpp RayTraceImageCPU.cpp CreateImageHelpers.cpp RayTraceImageHip.cpp )
		IF ( USE_OPENACC )
		SET( SOURCES ${SOURCES} RayTraceImageOpenACC.cpp )
		ENDIF()
		@@ -184,14 +184,15 @@ IF ( USE_KOKKOS )
		#SET( SOURCES ${SOURCES} RayTraceImageKokkos.cpp )
		ENDIF()
		IF ( USE_CUDA )
		SET( CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE )
		SET( CMAKE_BUILD_WITH_INSTALL_RPATH TRUE )
		CUDA_COMPILE( CUBINS RayTraceImageCuda.cu )
		#SET( CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE )
		#SET( CMAKE_BUILD_WITH_INSTALL_RPATH TRUE )
		#CUDA_COMPILE( CUBINS RayTraceImageCuda.cu )
		ENDIF()
		ADD_LIBRARY( ${SOURCES} ${CUBINS} )
		ADD_LIBRARY( RayTrace STATIC ${SOURCES} ${CUBINS} )


		# Add the applications

		ADD_EXECUTABLE( CreateImage CreateImage.cpp )
		TARGET_LINK_LIBRARIES( CreateImage RayTrace ${CUDA_LIBRARIES} ${KOKKOS_LIB} ${TIMER_LIBS} ${LDFLAGS} ${LDLIBS} )
		INSTALL( TARGETS CreateImage DESTINATION ${RAYTRACE_INSTALL_DIR}/bin )

src/CreateImageHelpers.cpp

+2 −2

Original line number	Diff line number	Diff line
		@@ -6,7 +6,7 @@


		#ifdef USE_CUDA
		#include <cuda_runtime_api.h>
		#include <hip/hip_runtime_api.h>
		#endif

		// Detect the OS
		@@ -250,7 +250,7 @@ void printHardware()
		// Get number of gpus
		int N_gpu = 0;
		#ifdef USE_CUDA
		cudaGetDeviceCount( &N_gpu );
		hipGetDeviceCount( &N_gpu );
		#endif
		// Get system memory
		#if defined( USE_LINUX )

src/RayTraceImage.cpp

+3 −3

Original line number	Diff line number	Diff line
		@@ -37,7 +37,7 @@
		#undef USE_OPENMP
		#endif
		#ifdef ENABLE_CUDA
		#include <cuda_runtime_api.h>
		#include <hip/hip_runtime_api.h>
		#endif
		#include "common/RayTraceDefinitions.h"
		#include "common/RayTraceImageHelper.h"
		@@ -79,7 +79,7 @@ void setGPU( int id )
		{
		NULL_USE( id );
		#if defined( ENABLE_CUDA )
		cudaSetDevice( id );
		hipSetDevice( id );
		#endif
		}
		void setDeviceAndRun( int id, int N_threads,
		@@ -369,7 +369,7 @@ void RayTrace::create_image( create_image_struct *info, std::string compute_meth
		} else if ( compute_method == "cuda-multigpu" ) {
		#if defined( ENABLE_CUDA )
		int N_gpu;
		cudaGetDeviceCount( &N_gpu );
		hipGetDeviceCount( &N_gpu );
		RayTraceImageThreadLoop( N_gpu, RayTraceImageCudaLoop, setGPU, N,
		std::ref( *info->euv_beam ), info->gain, info->seed, method, rays, scale, image, I_ang,
		failure_code, failed_rays );

src/RayTraceImageCuda.cu→src/RayTraceImageHip.cpp

+78 −77

Original line number	Diff line number	Diff line
		#include "hip/hip_runtime.h"
		#include "RayTraceStructures.h"
		#include "common/RayTraceImageHelper.h"
		#include "utilities/RayUtilities.h"
		@@ -7,10 +8,10 @@
		// Check for errors
		#define CUDA_CHECK() \
		do { \
		if ( cudaPeekAtLastError() != cudaSuccess ) { \
		cudaError_t error = cudaGetLastError(); \
		if ( hipPeekAtLastError() != hipSuccess ) { \
		hipError_t error = hipGetLastError(); \
		printf("cuda error: %i\n",error); \
		printf(" %s\n",cudaGetErrorString(error)); \
		printf(" %s\n",hipGetErrorString(error)); \
		printf(" line: %i\n",(int)__LINE__); \
		printf(" file: %s\n",__FILE__); \
		exit(-1); \
		@@ -18,7 +19,7 @@
		} while (0)


		#define CUDA_PRINT_FUNCTION( fun ) \
		#define CUDA_PRINT_FUNCTION( fun ) /* \
		do { \
		cudaFuncAttributes attr; \
		cudaFuncGetAttributes(&attr,fun); \
		@@ -30,7 +31,7 @@
		printf(" sharedSize = %i\n",attr.sharedSizeBytes); \
		printf(" maxThreads = %i\n",attr.maxThreadsPerBlock); \
		printf(" numRegs = %i\n",attr.numRegs); \
		} while (0)
		} while (0)*/


		// Atomic add operation for double
		@@ -56,13 +57,13 @@ __device__ double atomicAdd2(double* address, double val)
		// Get the globally unique thread id
		__device__ int getGlobalIdx3D()
		{
		int blockId = blockIdx.x
		+ blockIdx.y * gridDim.x
		+ gridDim.x * gridDim.y * blockIdx.z;
		int threadId = blockId * (blockDim.x * blockDim.y * blockDim.z)
		+ (threadIdx.z * (blockDim.x * blockDim.y))
		+ (threadIdx.y * blockDim.x)
		+ threadIdx.x;
		int blockId = hipBlockIdx_x
		+ hipBlockIdx_y * hipGridDim_x
		+ hipGridDim_x * hipGridDim_y * hipBlockIdx_z;
		int threadId = blockId * (hipBlockDim_x * hipBlockDim_y * hipBlockDim_z)
		+ (hipThreadIdx_z * (hipBlockDim_x * hipBlockDim_y))
		+ (hipThreadIdx_y * hipBlockDim_x)
		+ hipThreadIdx_x;
		return threadId;
		}

		@@ -155,14 +156,14 @@ void RayTraceImageCudaLoop( int N, const RayTrace::EUV_beam_struct& beam,
		{
		failure_code = 0; // Need to track failures on GPU
		// Get device properties
		static int maxThreadsPerBlock = 0;
		if ( maxThreadsPerBlock == 0 ) {
		static int maxThreadsPerBlock = 8000;
		/*if ( maxThreadsPerBlock == 0 ) {
		cudaFuncAttributes attr;
		cudaFuncGetAttributes(&attr,RayTraceImageCudaKernel);
		maxThreadsPerBlock = attr.maxThreadsPerBlock;
		RAY_ASSERT(maxThreadsPerBlock>0);
		//CUDA_PRINT_FUNCTION(RayTraceImageCudaKernel);
		}
		} */
		// place the ray gain and seed structures on the device
		const int nx = beam.nx;
		const int ny = beam.ny;
		@@ -182,44 +183,44 @@ void RayTraceImageCudaLoop( int N, const RayTrace::EUV_beam_struct& beam,
		size_t N_rays = rays.size();
		double x2, y2, a2, b2, dv2, image2, *I_ang2;
		ray_struct *rays2;
		cudaMalloc(&x2,nx*sizeof(double));
		cudaMalloc(&y2,ny*sizeof(double));
		cudaMalloc(&a2,na*sizeof(double));
		cudaMalloc(&b2,nb*sizeof(double));
		cudaMalloc(&dv2,nv*sizeof(double));
		cudaMalloc(&image2,nxnynv*sizeof(double));
		cudaMalloc(&I_ang2,nanbsizeof(double));
		cudaMalloc(&rays2,N_rays*sizeof(ray_struct));
		cudaMemcpy(x2,beam.x,nx*sizeof(double),cudaMemcpyHostToDevice);
		cudaMemcpy(y2,beam.y,ny*sizeof(double),cudaMemcpyHostToDevice);
		cudaMemcpy(a2,beam.a,na*sizeof(double),cudaMemcpyHostToDevice);
		cudaMemcpy(b2,beam.b,nb*sizeof(double),cudaMemcpyHostToDevice);
		cudaMemcpy(dv2,beam.dv,nv*sizeof(double),cudaMemcpyHostToDevice);
		cudaMemcpy(rays2,&rays[0],N_rays*sizeof(ray_struct),cudaMemcpyHostToDevice);
		cudaMemset(image2,0,nxnynv*sizeof(double));
		cudaMemset(I_ang2,0,nanbsizeof(double));
		hipMalloc(&x2,nx*sizeof(double));
		hipMalloc(&y2,ny*sizeof(double));
		hipMalloc(&a2,na*sizeof(double));
		hipMalloc(&b2,nb*sizeof(double));
		hipMalloc(&dv2,nv*sizeof(double));
		hipMalloc(&image2,nxnynv*sizeof(double));
		hipMalloc(&I_ang2,nanbsizeof(double));
		hipMalloc(&rays2,N_rays*sizeof(ray_struct));
		hipMemcpy(x2,beam.x,nx*sizeof(double),hipMemcpyHostToDevice);
		hipMemcpy(y2,beam.y,ny*sizeof(double),hipMemcpyHostToDevice);
		hipMemcpy(a2,beam.a,na*sizeof(double),hipMemcpyHostToDevice);
		hipMemcpy(b2,beam.b,nb*sizeof(double),hipMemcpyHostToDevice);
		hipMemcpy(dv2,beam.dv,nv*sizeof(double),hipMemcpyHostToDevice);
		hipMemcpy(rays2,&rays[0],N_rays*sizeof(ray_struct),hipMemcpyHostToDevice);
		hipMemset(image2,0,nxnynv*sizeof(double));
		hipMemset(I_ang2,0,nanbsizeof(double));
		CUDA_CHECK();
		// Do calculation on device:
		size_t threads = maxThreadsPerBlock;
		size_t N_blocks = N_rays/threads + (N_rays%threads == 0 ? 0:1);
		dim3 block_size = calcBlockSize(N_blocks);
		block_size.x = N_rays/threads + (N_rays%threads == 0 ? 0:1);
		RayTraceImageCudaKernel <<< block_size,threads >>> (N,nx,ny,na,nb,nv,x2,y2,a2,b2,
		hipLaunchKernelGGL((RayTraceImageCudaKernel), dim3(block_size), dim3(threads ), 0, 0, N,nx,ny,na,nb,nv,x2,y2,a2,b2,
		dx,dy,dz,da,db,dv2,gain,seed,method,N_rays,rays2,scale,image2,I_ang2);
		CUDA_CHECK();
		// Retrieve result from device and store it in host array
		cudaMemcpy(image,image2,nxnynv*sizeof(double),cudaMemcpyDeviceToHost);
		cudaMemcpy(I_ang,I_ang2,nanbsizeof(double),cudaMemcpyDeviceToHost);
		hipMemcpy(image,image2,nxnynv*sizeof(double),hipMemcpyDeviceToHost);
		hipMemcpy(I_ang,I_ang2,nanbsizeof(double),hipMemcpyDeviceToHost);
		CUDA_CHECK();
		// Cleanup
		cudaFree(x2);
		cudaFree(y2);
		cudaFree(a2);
		cudaFree(b2);
		cudaFree(dv2);
		cudaFree(rays2);
		cudaFree(image2);
		cudaFree(I_ang2);
		hipFree(x2);
		hipFree(y2);
		hipFree(a2);
		hipFree(b2);
		hipFree(dv2);
		hipFree(rays2);
		hipFree(image2);
		hipFree(I_ang2);
		CUDA_CHECK();
		RayTrace::ray_gain_struct::free_device( N, gain_in, gain );
		RayTrace::ray_seed_struct::free_device( seed_in, seed );
		@@ -235,25 +236,25 @@ const RayTrace::ray_gain_struct* ray_gain_struct_copy_device_cuda( size_t N, con
		host_ptr[i].Ny = arr[i].Ny;
		host_ptr[i].Nv = arr[i].Nv;
		host_ptr[i].E0 = nullptr;
		cudaMalloc(&host_ptr[i].x,arr[i].Nx*sizeof(double));
		cudaMalloc(&host_ptr[i].y,arr[i].Ny*sizeof(double));
		cudaMalloc(&host_ptr[i].n,arr[i].Nxarr[i].Nysizeof(double));
		cudaMalloc(&host_ptr[i].g0,arr[i].Nxarr[i].Nysizeof(float));
		cudaMalloc(&host_ptr[i].E0,arr[i].Nxarr[i].Nysizeof(float));
		cudaMalloc(&host_ptr[i].gv,arr[i].Nxarr[i].Nyarr[i].Nv*sizeof(float));
		cudaMalloc(&host_ptr[i].gv0,arr[i].Nxarr[i].Nysizeof(float));
		cudaMemcpy(host_ptr[i].x,arr[i].x,arr[i].Nx*sizeof(double),cudaMemcpyHostToDevice);
		cudaMemcpy(host_ptr[i].y,arr[i].y,arr[i].Ny*sizeof(double),cudaMemcpyHostToDevice);
		cudaMemcpy(host_ptr[i].n,arr[i].n,arr[i].Nxarr[i].Nysizeof(double),cudaMemcpyHostToDevice);
		cudaMemcpy(host_ptr[i].g0,arr[i].g0,arr[i].Nxarr[i].Nysizeof(float),cudaMemcpyHostToDevice);
		hipMalloc(&host_ptr[i].x,arr[i].Nx*sizeof(double));
		hipMalloc(&host_ptr[i].y,arr[i].Ny*sizeof(double));
		hipMalloc(&host_ptr[i].n,arr[i].Nxarr[i].Nysizeof(double));
		hipMalloc(&host_ptr[i].g0,arr[i].Nxarr[i].Nysizeof(float));
		hipMalloc(&host_ptr[i].E0,arr[i].Nxarr[i].Nysizeof(float));
		hipMalloc(&host_ptr[i].gv,arr[i].Nxarr[i].Nyarr[i].Nv*sizeof(float));
		hipMalloc(&host_ptr[i].gv0,arr[i].Nxarr[i].Nysizeof(float));
		hipMemcpy(host_ptr[i].x,arr[i].x,arr[i].Nx*sizeof(double),hipMemcpyHostToDevice);
		hipMemcpy(host_ptr[i].y,arr[i].y,arr[i].Ny*sizeof(double),hipMemcpyHostToDevice);
		hipMemcpy(host_ptr[i].n,arr[i].n,arr[i].Nxarr[i].Nysizeof(double),hipMemcpyHostToDevice);
		hipMemcpy(host_ptr[i].g0,arr[i].g0,arr[i].Nxarr[i].Nysizeof(float),hipMemcpyHostToDevice);
		if ( arr[i].E0 != nullptr )
		cudaMemcpy(host_ptr[i].E0,arr[i].E0,arr[i].Nxarr[i].Nysizeof(float),cudaMemcpyHostToDevice);
		cudaMemcpy(host_ptr[i].gv,arr[i].gv,arr[i].Nxarr[i].Nyarr[i].Nv*sizeof(float),cudaMemcpyHostToDevice);
		cudaMemcpy(host_ptr[i].gv0,arr[i].gv0,arr[i].Nxarr[i].Nysizeof(float),cudaMemcpyHostToDevice);
		hipMemcpy(host_ptr[i].E0,arr[i].E0,arr[i].Nxarr[i].Nysizeof(float),hipMemcpyHostToDevice);
		hipMemcpy(host_ptr[i].gv,arr[i].gv,arr[i].Nxarr[i].Nyarr[i].Nv*sizeof(float),hipMemcpyHostToDevice);
		hipMemcpy(host_ptr[i].gv0,arr[i].gv0,arr[i].Nxarr[i].Nysizeof(float),hipMemcpyHostToDevice);
		}
		RayTrace::ray_gain_struct* dev_ptr=NULL;
		cudaMalloc(&dev_ptr,N*sizeof(RayTrace::ray_gain_struct));
		cudaMemcpy(dev_ptr,host_ptr,N*sizeof(RayTrace::ray_gain_struct),cudaMemcpyHostToDevice);
		hipMalloc(&dev_ptr,N*sizeof(RayTrace::ray_gain_struct));
		hipMemcpy(dev_ptr,host_ptr,N*sizeof(RayTrace::ray_gain_struct),hipMemcpyHostToDevice);
		for (size_t i=0; i<N; i++) {
		host_ptr[i].x = NULL;
		host_ptr[i].y = NULL;
		@@ -271,15 +272,15 @@ const RayTrace::ray_gain_struct* ray_gain_struct_copy_device_cuda( size_t N, con
		void ray_gain_struct_free_device_cuda( size_t N, const RayTrace::ray_gain_struct* dev_ptr )
		{
		RayTrace::ray_gain_struct* host_ptr = new RayTrace::ray_gain_struct[N];
		cudaMemcpy(host_ptr,dev_ptr,N*sizeof(RayTrace::ray_gain_struct),cudaMemcpyDeviceToHost);
		hipMemcpy(host_ptr,dev_ptr,N*sizeof(RayTrace::ray_gain_struct),hipMemcpyDeviceToHost);
		for (size_t i=0; i<N; i++) {
		cudaFree(host_ptr[i].x);
		cudaFree(host_ptr[i].y);
		cudaFree(host_ptr[i].n);
		cudaFree(host_ptr[i].g0);
		cudaFree(host_ptr[i].E0);
		cudaFree(host_ptr[i].gv);
		cudaFree(host_ptr[i].gv0);
		hipFree(host_ptr[i].x);
		hipFree(host_ptr[i].y);
		hipFree(host_ptr[i].n);
		hipFree(host_ptr[i].g0);
		hipFree(host_ptr[i].E0);
		hipFree(host_ptr[i].gv);
		hipFree(host_ptr[i].gv0);
		host_ptr[i].x = NULL;
		host_ptr[i].y = NULL;
		host_ptr[i].n = NULL;
		@@ -288,7 +289,7 @@ void ray_gain_struct_free_device_cuda( size_t N, const RayTrace::ray_gain_struct
		host_ptr[i].gv = NULL;
		host_ptr[i].gv0 = NULL;
		}
		cudaFree((void*)dev_ptr);
		hipFree((void*)dev_ptr);
		delete [] host_ptr;
		CUDA_CHECK();
		}
		@@ -301,14 +302,14 @@ const RayTrace::ray_seed_struct* ray_seed_struct_copy_device_cuda( const RayTrac
		host_ptr->f0 = seed.f0;
		for (size_t i=0; i<5; i++) {
		host_ptr->dim[i] = seed.dim[i];
		cudaMalloc(&host_ptr->x[i],seed.dim[i]*sizeof(double));
		cudaMalloc(&host_ptr->f[i],seed.dim[i]*sizeof(double));
		cudaMemcpy(host_ptr->x[i],seed.x[i],seed.dim[i]*sizeof(double),cudaMemcpyHostToDevice);
		cudaMemcpy(host_ptr->f[i],seed.f[i],seed.dim[i]*sizeof(double),cudaMemcpyHostToDevice);
		hipMalloc(&host_ptr->x[i],seed.dim[i]*sizeof(double));
		hipMalloc(&host_ptr->f[i],seed.dim[i]*sizeof(double));
		hipMemcpy(host_ptr->x[i],seed.x[i],seed.dim[i]*sizeof(double),hipMemcpyHostToDevice);
		hipMemcpy(host_ptr->f[i],seed.f[i],seed.dim[i]*sizeof(double),hipMemcpyHostToDevice);
		}
		RayTrace::ray_seed_struct* dev_ptr=NULL;
		cudaMalloc(&dev_ptr,sizeof(RayTrace::ray_seed_struct));
		cudaMemcpy(dev_ptr,host_ptr,sizeof(RayTrace::ray_seed_struct),cudaMemcpyHostToDevice);
		hipMalloc(&dev_ptr,sizeof(RayTrace::ray_seed_struct));
		hipMemcpy(dev_ptr,host_ptr,sizeof(RayTrace::ray_seed_struct),hipMemcpyHostToDevice);
		for (size_t i=0; i<5; i++) {
		host_ptr->x[i] = NULL;
		host_ptr->f[i] = NULL;
		@@ -321,14 +322,14 @@ const RayTrace::ray_seed_struct* ray_seed_struct_copy_device_cuda( const RayTrac
		void ray_seed_struct_free_device_cuda( const RayTrace::ray_seed_struct* dev_ptr )
		{
		RayTrace::ray_seed_struct* host_ptr = new RayTrace::ray_seed_struct;
		cudaMemcpy(host_ptr,dev_ptr,sizeof(RayTrace::ray_seed_struct),cudaMemcpyDeviceToHost);
		hipMemcpy(host_ptr,dev_ptr,sizeof(RayTrace::ray_seed_struct),hipMemcpyDeviceToHost);
		for (size_t i=0; i<5; i++) {
		cudaFree(host_ptr->x[i]);
		cudaFree(host_ptr->f[i]);
		hipFree(host_ptr->x[i]);
		hipFree(host_ptr->f[i]);
		host_ptr->x[i] = NULL;
		host_ptr->f[i] = NULL;
		}
		cudaFree((void*)dev_ptr);
		hipFree((void*)dev_ptr);
		delete host_ptr;
		CUDA_CHECK();
		}