Loading build/configure +5 −4 Original line number Diff line number Diff line Loading @@ -4,13 +4,14 @@ export KOKKOS_DIR=/packages/TPLs/install/opt/kokkos rm -rf CMake* cmake \ /ccs/home/mbt/bin/cmake \ -D CMAKE_BUILD_TYPE=Release \ -D CMAKE_CXX_COMPILER=g++ \ -D CMAKE_CXX_COMPILER=hipcc \ -D CXX_STD=11 \ -D LDFLAGS="-pthread" \ -D USE_OPENACC=0 \ -D USE_OPENMP=1 \ -D USE_KOKKOS=1 \ -D USE_OPENMP=0 \ -D USE_KOKKOS=0 \ -D KOKKOS_DIRECTORY=${KOKKOS_DIR} \ -D KOKKOS_WRAPPER=${KOKKOS_DIR}/nvcc_wrapper \ -D USE_CUDA=1 \ Loading src/CMakeLists.txt +34 −33 Original line number Diff line number Diff line Loading @@ -69,8 +69,8 @@ IF ( USE_OPENMP ) SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mp") SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mp") ELSEIF ( USING_CLANG ) SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fopenmp -pthread") SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp -pthread") SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fopenmp -pthread -fPIC") SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp -pthread -fPIC") ELSEIF ( USING_XL ) SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -qsmp=omp") SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -qsmp=omp") Loading Loading @@ -124,32 +124,32 @@ ENDIF() # Enable cuda IF ( USE_CUDA ) # Include FindCUDA INCLUDE( FindCUDA ) IF ( NOT CUDA_FOUND ) MESSAGE ( FATAL_ERROR "CUDA not found" ) ENDIF() IF ( CUDA_FLAGS ) SET( CUDA_NVCC_FLAGS "${CUDA_FLAGS} ${CXX_STD_FLAG}" ) ENDIF() IF(NOT CUDA_NVCC_FLAGS) # Set minimum requirements SET( CUDA_NVCC_FLAGS "-arch=sm_30 ${CXX_STD_FLAG}" ) ENDIF() IF ( ${CMAKE_BUILD_TYPE} STREQUAL "Debug" ) SET( CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -lineinfo" ) ENDIF() # SET( CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} ${CXX_STD_FLAG}" ) IF( NOT CMAKE_BUILD_TYPE ) MESSAGE(FATAL_ERROR "CMAKE_BUILD_TYPE is not set") ELSEIF( ${CMAKE_BUILD_TYPE} STREQUAL "Debug" ) SET( CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -g -O0" ) ELSEIF( ${CMAKE_BUILD_TYPE} STREQUAL "Release" ) SET( CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -O3" ) ENDIF() SET( CUDA_PROPAGATE_HOST_FLAGS OFF ) SET( CUDA_FIND_QUIETLY ) STRING( REPLACE " " ";" CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ) # # Include FindCUDA # INCLUDE( FindCUDA ) # IF ( NOT CUDA_FOUND ) # MESSAGE ( FATAL_ERROR "CUDA not found" ) # ENDIF() # IF ( CUDA_FLAGS ) # SET( CUDA_NVCC_FLAGS "${CUDA_FLAGS} ${CXX_STD_FLAG}" ) # ENDIF() # IF(NOT CUDA_NVCC_FLAGS) # # Set minimum requirements # SET( CUDA_NVCC_FLAGS "-arch=sm_30 ${CXX_STD_FLAG}" ) # ENDIF() # IF ( ${CMAKE_BUILD_TYPE} STREQUAL "Debug" ) # SET( CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -lineinfo" ) # ENDIF() # # SET( CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} ${CXX_STD_FLAG}" ) # IF( NOT CMAKE_BUILD_TYPE ) # MESSAGE(FATAL_ERROR "CMAKE_BUILD_TYPE is not set") # ELSEIF( ${CMAKE_BUILD_TYPE} STREQUAL "Debug" ) # SET( CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -g -O0" ) # ELSEIF( ${CMAKE_BUILD_TYPE} STREQUAL "Release" ) # SET( CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -O3" ) # ENDIF() # SET( CUDA_PROPAGATE_HOST_FLAGS OFF ) # SET( CUDA_FIND_QUIETLY ) # STRING( REPLACE " " ";" CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ) INCLUDE_DIRECTORIES ( ${CUDA_INCLUDE_DIRS} ) ADD_DEFINITIONS( "-DUSE_CUDA" ) MESSAGE( "Using CUDA ${CUDA_VERSION}" ) Loading @@ -176,7 +176,7 @@ ADD_DISTCLEAN( libRayTrace.* null_timer CreateImage* ) # Create the library INCLUDE_DIRECTORIES( ${RAYTRACE_SOURCE_DIR} ) ADD_DEFINITIONS( -DDISABLE_WRITE_FAILED_RAYS ) SET( SOURCES RayTrace RayTraceImage.cpp RayTraceStructures.cpp utilities/RayUtilities.cpp AtomicModel/interp.cpp RayTraceImageCPU.cpp CreateImageHelpers.cpp ) SET( SOURCES RayTrace RayTraceImage.cpp RayTraceStructures.cpp utilities/RayUtilities.cpp AtomicModel/interp.cpp RayTraceImageCPU.cpp CreateImageHelpers.cpp RayTraceImageHip.cpp ) IF ( USE_OPENACC ) SET( SOURCES ${SOURCES} RayTraceImageOpenACC.cpp ) ENDIF() Loading @@ -184,14 +184,15 @@ IF ( USE_KOKKOS ) #SET( SOURCES ${SOURCES} RayTraceImageKokkos.cpp ) ENDIF() IF ( USE_CUDA ) SET( CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE ) SET( CMAKE_BUILD_WITH_INSTALL_RPATH TRUE ) CUDA_COMPILE( CUBINS RayTraceImageCuda.cu ) #SET( CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE ) #SET( CMAKE_BUILD_WITH_INSTALL_RPATH TRUE ) #CUDA_COMPILE( CUBINS RayTraceImageCuda.cu ) ENDIF() ADD_LIBRARY( ${SOURCES} ${CUBINS} ) ADD_LIBRARY( RayTrace STATIC ${SOURCES} ${CUBINS} ) # Add the applications ADD_EXECUTABLE( CreateImage CreateImage.cpp ) TARGET_LINK_LIBRARIES( CreateImage RayTrace ${CUDA_LIBRARIES} ${KOKKOS_LIB} ${TIMER_LIBS} ${LDFLAGS} ${LDLIBS} ) INSTALL( TARGETS CreateImage DESTINATION ${RAYTRACE_INSTALL_DIR}/bin ) Loading src/CreateImageHelpers.cpp +2 −2 Original line number Diff line number Diff line Loading @@ -6,7 +6,7 @@ #ifdef USE_CUDA #include <cuda_runtime_api.h> #include <hip/hip_runtime_api.h> #endif // Detect the OS Loading Loading @@ -250,7 +250,7 @@ void printHardware() // Get number of gpus int N_gpu = 0; #ifdef USE_CUDA cudaGetDeviceCount( &N_gpu ); hipGetDeviceCount( &N_gpu ); #endif // Get system memory #if defined( USE_LINUX ) Loading src/RayTraceImage.cpp +3 −3 Original line number Diff line number Diff line Loading @@ -37,7 +37,7 @@ #undef USE_OPENMP #endif #ifdef ENABLE_CUDA #include <cuda_runtime_api.h> #include <hip/hip_runtime_api.h> #endif #include "common/RayTraceDefinitions.h" #include "common/RayTraceImageHelper.h" Loading Loading @@ -79,7 +79,7 @@ void setGPU( int id ) { NULL_USE( id ); #if defined( ENABLE_CUDA ) cudaSetDevice( id ); hipSetDevice( id ); #endif } void setDeviceAndRun( int id, int N_threads, Loading Loading @@ -369,7 +369,7 @@ void RayTrace::create_image( create_image_struct *info, std::string compute_meth } else if ( compute_method == "cuda-multigpu" ) { #if defined( ENABLE_CUDA ) int N_gpu; cudaGetDeviceCount( &N_gpu ); hipGetDeviceCount( &N_gpu ); RayTraceImageThreadLoop( N_gpu, RayTraceImageCudaLoop, setGPU, N, std::ref( *info->euv_beam ), info->gain, info->seed, method, rays, scale, image, I_ang, failure_code, failed_rays ); Loading src/RayTraceImageCuda.cu→src/RayTraceImageHip.cpp +78 −77 Original line number Diff line number Diff line #include "hip/hip_runtime.h" #include "RayTraceStructures.h" #include "common/RayTraceImageHelper.h" #include "utilities/RayUtilities.h" Loading @@ -7,10 +8,10 @@ // Check for errors #define CUDA_CHECK() \ do { \ if ( cudaPeekAtLastError() != cudaSuccess ) { \ cudaError_t error = cudaGetLastError(); \ if ( hipPeekAtLastError() != hipSuccess ) { \ hipError_t error = hipGetLastError(); \ printf("cuda error: %i\n",error); \ printf(" %s\n",cudaGetErrorString(error)); \ printf(" %s\n",hipGetErrorString(error)); \ printf(" line: %i\n",(int)__LINE__); \ printf(" file: %s\n",__FILE__); \ exit(-1); \ Loading @@ -18,7 +19,7 @@ } while (0) #define CUDA_PRINT_FUNCTION( fun ) \ #define CUDA_PRINT_FUNCTION( fun ) /* \ do { \ cudaFuncAttributes attr; \ cudaFuncGetAttributes(&attr,fun); \ Loading @@ -30,7 +31,7 @@ printf(" sharedSize = %i\n",attr.sharedSizeBytes); \ printf(" maxThreads = %i\n",attr.maxThreadsPerBlock); \ printf(" numRegs = %i\n",attr.numRegs); \ } while (0) } while (0)*/ // Atomic add operation for double Loading @@ -56,13 +57,13 @@ __device__ double atomicAdd2(double* address, double val) // Get the globally unique thread id __device__ int getGlobalIdx3D() { int blockId = blockIdx.x + blockIdx.y * gridDim.x + gridDim.x * gridDim.y * blockIdx.z; int threadId = blockId * (blockDim.x * blockDim.y * blockDim.z) + (threadIdx.z * (blockDim.x * blockDim.y)) + (threadIdx.y * blockDim.x) + threadIdx.x; int blockId = hipBlockIdx_x + hipBlockIdx_y * hipGridDim_x + hipGridDim_x * hipGridDim_y * hipBlockIdx_z; int threadId = blockId * (hipBlockDim_x * hipBlockDim_y * hipBlockDim_z) + (hipThreadIdx_z * (hipBlockDim_x * hipBlockDim_y)) + (hipThreadIdx_y * hipBlockDim_x) + hipThreadIdx_x; return threadId; } Loading Loading @@ -155,14 +156,14 @@ void RayTraceImageCudaLoop( int N, const RayTrace::EUV_beam_struct& beam, { failure_code = 0; // Need to track failures on GPU // Get device properties static int maxThreadsPerBlock = 0; if ( maxThreadsPerBlock == 0 ) { static int maxThreadsPerBlock = 8000; /*if ( maxThreadsPerBlock == 0 ) { cudaFuncAttributes attr; cudaFuncGetAttributes(&attr,RayTraceImageCudaKernel); maxThreadsPerBlock = attr.maxThreadsPerBlock; RAY_ASSERT(maxThreadsPerBlock>0); //CUDA_PRINT_FUNCTION(RayTraceImageCudaKernel); } } */ // place the ray gain and seed structures on the device const int nx = beam.nx; const int ny = beam.ny; Loading @@ -182,44 +183,44 @@ void RayTraceImageCudaLoop( int N, const RayTrace::EUV_beam_struct& beam, size_t N_rays = rays.size(); double *x2, *y2, *a2, *b2, *dv2, *image2, *I_ang2; ray_struct *rays2; cudaMalloc(&x2,nx*sizeof(double)); cudaMalloc(&y2,ny*sizeof(double)); cudaMalloc(&a2,na*sizeof(double)); cudaMalloc(&b2,nb*sizeof(double)); cudaMalloc(&dv2,nv*sizeof(double)); cudaMalloc(&image2,nx*ny*nv*sizeof(double)); cudaMalloc(&I_ang2,na*nb*sizeof(double)); cudaMalloc(&rays2,N_rays*sizeof(ray_struct)); cudaMemcpy(x2,beam.x,nx*sizeof(double),cudaMemcpyHostToDevice); cudaMemcpy(y2,beam.y,ny*sizeof(double),cudaMemcpyHostToDevice); cudaMemcpy(a2,beam.a,na*sizeof(double),cudaMemcpyHostToDevice); cudaMemcpy(b2,beam.b,nb*sizeof(double),cudaMemcpyHostToDevice); cudaMemcpy(dv2,beam.dv,nv*sizeof(double),cudaMemcpyHostToDevice); cudaMemcpy(rays2,&rays[0],N_rays*sizeof(ray_struct),cudaMemcpyHostToDevice); cudaMemset(image2,0,nx*ny*nv*sizeof(double)); cudaMemset(I_ang2,0,na*nb*sizeof(double)); hipMalloc(&x2,nx*sizeof(double)); hipMalloc(&y2,ny*sizeof(double)); hipMalloc(&a2,na*sizeof(double)); hipMalloc(&b2,nb*sizeof(double)); hipMalloc(&dv2,nv*sizeof(double)); hipMalloc(&image2,nx*ny*nv*sizeof(double)); hipMalloc(&I_ang2,na*nb*sizeof(double)); hipMalloc(&rays2,N_rays*sizeof(ray_struct)); hipMemcpy(x2,beam.x,nx*sizeof(double),hipMemcpyHostToDevice); hipMemcpy(y2,beam.y,ny*sizeof(double),hipMemcpyHostToDevice); hipMemcpy(a2,beam.a,na*sizeof(double),hipMemcpyHostToDevice); hipMemcpy(b2,beam.b,nb*sizeof(double),hipMemcpyHostToDevice); hipMemcpy(dv2,beam.dv,nv*sizeof(double),hipMemcpyHostToDevice); hipMemcpy(rays2,&rays[0],N_rays*sizeof(ray_struct),hipMemcpyHostToDevice); hipMemset(image2,0,nx*ny*nv*sizeof(double)); hipMemset(I_ang2,0,na*nb*sizeof(double)); CUDA_CHECK(); // Do calculation on device: size_t threads = maxThreadsPerBlock; size_t N_blocks = N_rays/threads + (N_rays%threads == 0 ? 0:1); dim3 block_size = calcBlockSize(N_blocks); block_size.x = N_rays/threads + (N_rays%threads == 0 ? 0:1); RayTraceImageCudaKernel <<< block_size,threads >>> (N,nx,ny,na,nb,nv,x2,y2,a2,b2, hipLaunchKernelGGL((RayTraceImageCudaKernel), dim3(block_size), dim3(threads ), 0, 0, N,nx,ny,na,nb,nv,x2,y2,a2,b2, dx,dy,dz,da,db,dv2,gain,seed,method,N_rays,rays2,scale,image2,I_ang2); CUDA_CHECK(); // Retrieve result from device and store it in host array cudaMemcpy(image,image2,nx*ny*nv*sizeof(double),cudaMemcpyDeviceToHost); cudaMemcpy(I_ang,I_ang2,na*nb*sizeof(double),cudaMemcpyDeviceToHost); hipMemcpy(image,image2,nx*ny*nv*sizeof(double),hipMemcpyDeviceToHost); hipMemcpy(I_ang,I_ang2,na*nb*sizeof(double),hipMemcpyDeviceToHost); CUDA_CHECK(); // Cleanup cudaFree(x2); cudaFree(y2); cudaFree(a2); cudaFree(b2); cudaFree(dv2); cudaFree(rays2); cudaFree(image2); cudaFree(I_ang2); hipFree(x2); hipFree(y2); hipFree(a2); hipFree(b2); hipFree(dv2); hipFree(rays2); hipFree(image2); hipFree(I_ang2); CUDA_CHECK(); RayTrace::ray_gain_struct::free_device( N, gain_in, gain ); RayTrace::ray_seed_struct::free_device( seed_in, seed ); Loading @@ -235,25 +236,25 @@ const RayTrace::ray_gain_struct* ray_gain_struct_copy_device_cuda( size_t N, con host_ptr[i].Ny = arr[i].Ny; host_ptr[i].Nv = arr[i].Nv; host_ptr[i].E0 = nullptr; cudaMalloc(&host_ptr[i].x,arr[i].Nx*sizeof(double)); cudaMalloc(&host_ptr[i].y,arr[i].Ny*sizeof(double)); cudaMalloc(&host_ptr[i].n,arr[i].Nx*arr[i].Ny*sizeof(double)); cudaMalloc(&host_ptr[i].g0,arr[i].Nx*arr[i].Ny*sizeof(float)); cudaMalloc(&host_ptr[i].E0,arr[i].Nx*arr[i].Ny*sizeof(float)); cudaMalloc(&host_ptr[i].gv,arr[i].Nx*arr[i].Ny*arr[i].Nv*sizeof(float)); cudaMalloc(&host_ptr[i].gv0,arr[i].Nx*arr[i].Ny*sizeof(float)); cudaMemcpy(host_ptr[i].x,arr[i].x,arr[i].Nx*sizeof(double),cudaMemcpyHostToDevice); cudaMemcpy(host_ptr[i].y,arr[i].y,arr[i].Ny*sizeof(double),cudaMemcpyHostToDevice); cudaMemcpy(host_ptr[i].n,arr[i].n,arr[i].Nx*arr[i].Ny*sizeof(double),cudaMemcpyHostToDevice); cudaMemcpy(host_ptr[i].g0,arr[i].g0,arr[i].Nx*arr[i].Ny*sizeof(float),cudaMemcpyHostToDevice); hipMalloc(&host_ptr[i].x,arr[i].Nx*sizeof(double)); hipMalloc(&host_ptr[i].y,arr[i].Ny*sizeof(double)); hipMalloc(&host_ptr[i].n,arr[i].Nx*arr[i].Ny*sizeof(double)); hipMalloc(&host_ptr[i].g0,arr[i].Nx*arr[i].Ny*sizeof(float)); hipMalloc(&host_ptr[i].E0,arr[i].Nx*arr[i].Ny*sizeof(float)); hipMalloc(&host_ptr[i].gv,arr[i].Nx*arr[i].Ny*arr[i].Nv*sizeof(float)); hipMalloc(&host_ptr[i].gv0,arr[i].Nx*arr[i].Ny*sizeof(float)); hipMemcpy(host_ptr[i].x,arr[i].x,arr[i].Nx*sizeof(double),hipMemcpyHostToDevice); hipMemcpy(host_ptr[i].y,arr[i].y,arr[i].Ny*sizeof(double),hipMemcpyHostToDevice); hipMemcpy(host_ptr[i].n,arr[i].n,arr[i].Nx*arr[i].Ny*sizeof(double),hipMemcpyHostToDevice); hipMemcpy(host_ptr[i].g0,arr[i].g0,arr[i].Nx*arr[i].Ny*sizeof(float),hipMemcpyHostToDevice); if ( arr[i].E0 != nullptr ) cudaMemcpy(host_ptr[i].E0,arr[i].E0,arr[i].Nx*arr[i].Ny*sizeof(float),cudaMemcpyHostToDevice); cudaMemcpy(host_ptr[i].gv,arr[i].gv,arr[i].Nx*arr[i].Ny*arr[i].Nv*sizeof(float),cudaMemcpyHostToDevice); cudaMemcpy(host_ptr[i].gv0,arr[i].gv0,arr[i].Nx*arr[i].Ny*sizeof(float),cudaMemcpyHostToDevice); hipMemcpy(host_ptr[i].E0,arr[i].E0,arr[i].Nx*arr[i].Ny*sizeof(float),hipMemcpyHostToDevice); hipMemcpy(host_ptr[i].gv,arr[i].gv,arr[i].Nx*arr[i].Ny*arr[i].Nv*sizeof(float),hipMemcpyHostToDevice); hipMemcpy(host_ptr[i].gv0,arr[i].gv0,arr[i].Nx*arr[i].Ny*sizeof(float),hipMemcpyHostToDevice); } RayTrace::ray_gain_struct* dev_ptr=NULL; cudaMalloc(&dev_ptr,N*sizeof(RayTrace::ray_gain_struct)); cudaMemcpy(dev_ptr,host_ptr,N*sizeof(RayTrace::ray_gain_struct),cudaMemcpyHostToDevice); hipMalloc(&dev_ptr,N*sizeof(RayTrace::ray_gain_struct)); hipMemcpy(dev_ptr,host_ptr,N*sizeof(RayTrace::ray_gain_struct),hipMemcpyHostToDevice); for (size_t i=0; i<N; i++) { host_ptr[i].x = NULL; host_ptr[i].y = NULL; Loading @@ -271,15 +272,15 @@ const RayTrace::ray_gain_struct* ray_gain_struct_copy_device_cuda( size_t N, con void ray_gain_struct_free_device_cuda( size_t N, const RayTrace::ray_gain_struct* dev_ptr ) { RayTrace::ray_gain_struct* host_ptr = new RayTrace::ray_gain_struct[N]; cudaMemcpy(host_ptr,dev_ptr,N*sizeof(RayTrace::ray_gain_struct),cudaMemcpyDeviceToHost); hipMemcpy(host_ptr,dev_ptr,N*sizeof(RayTrace::ray_gain_struct),hipMemcpyDeviceToHost); for (size_t i=0; i<N; i++) { cudaFree(host_ptr[i].x); cudaFree(host_ptr[i].y); cudaFree(host_ptr[i].n); cudaFree(host_ptr[i].g0); cudaFree(host_ptr[i].E0); cudaFree(host_ptr[i].gv); cudaFree(host_ptr[i].gv0); hipFree(host_ptr[i].x); hipFree(host_ptr[i].y); hipFree(host_ptr[i].n); hipFree(host_ptr[i].g0); hipFree(host_ptr[i].E0); hipFree(host_ptr[i].gv); hipFree(host_ptr[i].gv0); host_ptr[i].x = NULL; host_ptr[i].y = NULL; host_ptr[i].n = NULL; Loading @@ -288,7 +289,7 @@ void ray_gain_struct_free_device_cuda( size_t N, const RayTrace::ray_gain_struct host_ptr[i].gv = NULL; host_ptr[i].gv0 = NULL; } cudaFree((void*)dev_ptr); hipFree((void*)dev_ptr); delete [] host_ptr; CUDA_CHECK(); } Loading @@ -301,14 +302,14 @@ const RayTrace::ray_seed_struct* ray_seed_struct_copy_device_cuda( const RayTrac host_ptr->f0 = seed.f0; for (size_t i=0; i<5; i++) { host_ptr->dim[i] = seed.dim[i]; cudaMalloc(&host_ptr->x[i],seed.dim[i]*sizeof(double)); cudaMalloc(&host_ptr->f[i],seed.dim[i]*sizeof(double)); cudaMemcpy(host_ptr->x[i],seed.x[i],seed.dim[i]*sizeof(double),cudaMemcpyHostToDevice); cudaMemcpy(host_ptr->f[i],seed.f[i],seed.dim[i]*sizeof(double),cudaMemcpyHostToDevice); hipMalloc(&host_ptr->x[i],seed.dim[i]*sizeof(double)); hipMalloc(&host_ptr->f[i],seed.dim[i]*sizeof(double)); hipMemcpy(host_ptr->x[i],seed.x[i],seed.dim[i]*sizeof(double),hipMemcpyHostToDevice); hipMemcpy(host_ptr->f[i],seed.f[i],seed.dim[i]*sizeof(double),hipMemcpyHostToDevice); } RayTrace::ray_seed_struct* dev_ptr=NULL; cudaMalloc(&dev_ptr,sizeof(RayTrace::ray_seed_struct)); cudaMemcpy(dev_ptr,host_ptr,sizeof(RayTrace::ray_seed_struct),cudaMemcpyHostToDevice); hipMalloc(&dev_ptr,sizeof(RayTrace::ray_seed_struct)); hipMemcpy(dev_ptr,host_ptr,sizeof(RayTrace::ray_seed_struct),hipMemcpyHostToDevice); for (size_t i=0; i<5; i++) { host_ptr->x[i] = NULL; host_ptr->f[i] = NULL; Loading @@ -321,14 +322,14 @@ const RayTrace::ray_seed_struct* ray_seed_struct_copy_device_cuda( const RayTrac void ray_seed_struct_free_device_cuda( const RayTrace::ray_seed_struct* dev_ptr ) { RayTrace::ray_seed_struct* host_ptr = new RayTrace::ray_seed_struct; cudaMemcpy(host_ptr,dev_ptr,sizeof(RayTrace::ray_seed_struct),cudaMemcpyDeviceToHost); hipMemcpy(host_ptr,dev_ptr,sizeof(RayTrace::ray_seed_struct),hipMemcpyDeviceToHost); for (size_t i=0; i<5; i++) { cudaFree(host_ptr->x[i]); cudaFree(host_ptr->f[i]); hipFree(host_ptr->x[i]); hipFree(host_ptr->f[i]); host_ptr->x[i] = NULL; host_ptr->f[i] = NULL; } cudaFree((void*)dev_ptr); hipFree((void*)dev_ptr); delete host_ptr; CUDA_CHECK(); } Loading Loading
build/configure +5 −4 Original line number Diff line number Diff line Loading @@ -4,13 +4,14 @@ export KOKKOS_DIR=/packages/TPLs/install/opt/kokkos rm -rf CMake* cmake \ /ccs/home/mbt/bin/cmake \ -D CMAKE_BUILD_TYPE=Release \ -D CMAKE_CXX_COMPILER=g++ \ -D CMAKE_CXX_COMPILER=hipcc \ -D CXX_STD=11 \ -D LDFLAGS="-pthread" \ -D USE_OPENACC=0 \ -D USE_OPENMP=1 \ -D USE_KOKKOS=1 \ -D USE_OPENMP=0 \ -D USE_KOKKOS=0 \ -D KOKKOS_DIRECTORY=${KOKKOS_DIR} \ -D KOKKOS_WRAPPER=${KOKKOS_DIR}/nvcc_wrapper \ -D USE_CUDA=1 \ Loading
src/CMakeLists.txt +34 −33 Original line number Diff line number Diff line Loading @@ -69,8 +69,8 @@ IF ( USE_OPENMP ) SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mp") SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mp") ELSEIF ( USING_CLANG ) SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fopenmp -pthread") SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp -pthread") SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fopenmp -pthread -fPIC") SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp -pthread -fPIC") ELSEIF ( USING_XL ) SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -qsmp=omp") SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -qsmp=omp") Loading Loading @@ -124,32 +124,32 @@ ENDIF() # Enable cuda IF ( USE_CUDA ) # Include FindCUDA INCLUDE( FindCUDA ) IF ( NOT CUDA_FOUND ) MESSAGE ( FATAL_ERROR "CUDA not found" ) ENDIF() IF ( CUDA_FLAGS ) SET( CUDA_NVCC_FLAGS "${CUDA_FLAGS} ${CXX_STD_FLAG}" ) ENDIF() IF(NOT CUDA_NVCC_FLAGS) # Set minimum requirements SET( CUDA_NVCC_FLAGS "-arch=sm_30 ${CXX_STD_FLAG}" ) ENDIF() IF ( ${CMAKE_BUILD_TYPE} STREQUAL "Debug" ) SET( CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -lineinfo" ) ENDIF() # SET( CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} ${CXX_STD_FLAG}" ) IF( NOT CMAKE_BUILD_TYPE ) MESSAGE(FATAL_ERROR "CMAKE_BUILD_TYPE is not set") ELSEIF( ${CMAKE_BUILD_TYPE} STREQUAL "Debug" ) SET( CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -g -O0" ) ELSEIF( ${CMAKE_BUILD_TYPE} STREQUAL "Release" ) SET( CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -O3" ) ENDIF() SET( CUDA_PROPAGATE_HOST_FLAGS OFF ) SET( CUDA_FIND_QUIETLY ) STRING( REPLACE " " ";" CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ) # # Include FindCUDA # INCLUDE( FindCUDA ) # IF ( NOT CUDA_FOUND ) # MESSAGE ( FATAL_ERROR "CUDA not found" ) # ENDIF() # IF ( CUDA_FLAGS ) # SET( CUDA_NVCC_FLAGS "${CUDA_FLAGS} ${CXX_STD_FLAG}" ) # ENDIF() # IF(NOT CUDA_NVCC_FLAGS) # # Set minimum requirements # SET( CUDA_NVCC_FLAGS "-arch=sm_30 ${CXX_STD_FLAG}" ) # ENDIF() # IF ( ${CMAKE_BUILD_TYPE} STREQUAL "Debug" ) # SET( CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -lineinfo" ) # ENDIF() # # SET( CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} ${CXX_STD_FLAG}" ) # IF( NOT CMAKE_BUILD_TYPE ) # MESSAGE(FATAL_ERROR "CMAKE_BUILD_TYPE is not set") # ELSEIF( ${CMAKE_BUILD_TYPE} STREQUAL "Debug" ) # SET( CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -g -O0" ) # ELSEIF( ${CMAKE_BUILD_TYPE} STREQUAL "Release" ) # SET( CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -O3" ) # ENDIF() # SET( CUDA_PROPAGATE_HOST_FLAGS OFF ) # SET( CUDA_FIND_QUIETLY ) # STRING( REPLACE " " ";" CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ) INCLUDE_DIRECTORIES ( ${CUDA_INCLUDE_DIRS} ) ADD_DEFINITIONS( "-DUSE_CUDA" ) MESSAGE( "Using CUDA ${CUDA_VERSION}" ) Loading @@ -176,7 +176,7 @@ ADD_DISTCLEAN( libRayTrace.* null_timer CreateImage* ) # Create the library INCLUDE_DIRECTORIES( ${RAYTRACE_SOURCE_DIR} ) ADD_DEFINITIONS( -DDISABLE_WRITE_FAILED_RAYS ) SET( SOURCES RayTrace RayTraceImage.cpp RayTraceStructures.cpp utilities/RayUtilities.cpp AtomicModel/interp.cpp RayTraceImageCPU.cpp CreateImageHelpers.cpp ) SET( SOURCES RayTrace RayTraceImage.cpp RayTraceStructures.cpp utilities/RayUtilities.cpp AtomicModel/interp.cpp RayTraceImageCPU.cpp CreateImageHelpers.cpp RayTraceImageHip.cpp ) IF ( USE_OPENACC ) SET( SOURCES ${SOURCES} RayTraceImageOpenACC.cpp ) ENDIF() Loading @@ -184,14 +184,15 @@ IF ( USE_KOKKOS ) #SET( SOURCES ${SOURCES} RayTraceImageKokkos.cpp ) ENDIF() IF ( USE_CUDA ) SET( CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE ) SET( CMAKE_BUILD_WITH_INSTALL_RPATH TRUE ) CUDA_COMPILE( CUBINS RayTraceImageCuda.cu ) #SET( CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE ) #SET( CMAKE_BUILD_WITH_INSTALL_RPATH TRUE ) #CUDA_COMPILE( CUBINS RayTraceImageCuda.cu ) ENDIF() ADD_LIBRARY( ${SOURCES} ${CUBINS} ) ADD_LIBRARY( RayTrace STATIC ${SOURCES} ${CUBINS} ) # Add the applications ADD_EXECUTABLE( CreateImage CreateImage.cpp ) TARGET_LINK_LIBRARIES( CreateImage RayTrace ${CUDA_LIBRARIES} ${KOKKOS_LIB} ${TIMER_LIBS} ${LDFLAGS} ${LDLIBS} ) INSTALL( TARGETS CreateImage DESTINATION ${RAYTRACE_INSTALL_DIR}/bin ) Loading
src/CreateImageHelpers.cpp +2 −2 Original line number Diff line number Diff line Loading @@ -6,7 +6,7 @@ #ifdef USE_CUDA #include <cuda_runtime_api.h> #include <hip/hip_runtime_api.h> #endif // Detect the OS Loading Loading @@ -250,7 +250,7 @@ void printHardware() // Get number of gpus int N_gpu = 0; #ifdef USE_CUDA cudaGetDeviceCount( &N_gpu ); hipGetDeviceCount( &N_gpu ); #endif // Get system memory #if defined( USE_LINUX ) Loading
src/RayTraceImage.cpp +3 −3 Original line number Diff line number Diff line Loading @@ -37,7 +37,7 @@ #undef USE_OPENMP #endif #ifdef ENABLE_CUDA #include <cuda_runtime_api.h> #include <hip/hip_runtime_api.h> #endif #include "common/RayTraceDefinitions.h" #include "common/RayTraceImageHelper.h" Loading Loading @@ -79,7 +79,7 @@ void setGPU( int id ) { NULL_USE( id ); #if defined( ENABLE_CUDA ) cudaSetDevice( id ); hipSetDevice( id ); #endif } void setDeviceAndRun( int id, int N_threads, Loading Loading @@ -369,7 +369,7 @@ void RayTrace::create_image( create_image_struct *info, std::string compute_meth } else if ( compute_method == "cuda-multigpu" ) { #if defined( ENABLE_CUDA ) int N_gpu; cudaGetDeviceCount( &N_gpu ); hipGetDeviceCount( &N_gpu ); RayTraceImageThreadLoop( N_gpu, RayTraceImageCudaLoop, setGPU, N, std::ref( *info->euv_beam ), info->gain, info->seed, method, rays, scale, image, I_ang, failure_code, failed_rays ); Loading
src/RayTraceImageCuda.cu→src/RayTraceImageHip.cpp +78 −77 Original line number Diff line number Diff line #include "hip/hip_runtime.h" #include "RayTraceStructures.h" #include "common/RayTraceImageHelper.h" #include "utilities/RayUtilities.h" Loading @@ -7,10 +8,10 @@ // Check for errors #define CUDA_CHECK() \ do { \ if ( cudaPeekAtLastError() != cudaSuccess ) { \ cudaError_t error = cudaGetLastError(); \ if ( hipPeekAtLastError() != hipSuccess ) { \ hipError_t error = hipGetLastError(); \ printf("cuda error: %i\n",error); \ printf(" %s\n",cudaGetErrorString(error)); \ printf(" %s\n",hipGetErrorString(error)); \ printf(" line: %i\n",(int)__LINE__); \ printf(" file: %s\n",__FILE__); \ exit(-1); \ Loading @@ -18,7 +19,7 @@ } while (0) #define CUDA_PRINT_FUNCTION( fun ) \ #define CUDA_PRINT_FUNCTION( fun ) /* \ do { \ cudaFuncAttributes attr; \ cudaFuncGetAttributes(&attr,fun); \ Loading @@ -30,7 +31,7 @@ printf(" sharedSize = %i\n",attr.sharedSizeBytes); \ printf(" maxThreads = %i\n",attr.maxThreadsPerBlock); \ printf(" numRegs = %i\n",attr.numRegs); \ } while (0) } while (0)*/ // Atomic add operation for double Loading @@ -56,13 +57,13 @@ __device__ double atomicAdd2(double* address, double val) // Get the globally unique thread id __device__ int getGlobalIdx3D() { int blockId = blockIdx.x + blockIdx.y * gridDim.x + gridDim.x * gridDim.y * blockIdx.z; int threadId = blockId * (blockDim.x * blockDim.y * blockDim.z) + (threadIdx.z * (blockDim.x * blockDim.y)) + (threadIdx.y * blockDim.x) + threadIdx.x; int blockId = hipBlockIdx_x + hipBlockIdx_y * hipGridDim_x + hipGridDim_x * hipGridDim_y * hipBlockIdx_z; int threadId = blockId * (hipBlockDim_x * hipBlockDim_y * hipBlockDim_z) + (hipThreadIdx_z * (hipBlockDim_x * hipBlockDim_y)) + (hipThreadIdx_y * hipBlockDim_x) + hipThreadIdx_x; return threadId; } Loading Loading @@ -155,14 +156,14 @@ void RayTraceImageCudaLoop( int N, const RayTrace::EUV_beam_struct& beam, { failure_code = 0; // Need to track failures on GPU // Get device properties static int maxThreadsPerBlock = 0; if ( maxThreadsPerBlock == 0 ) { static int maxThreadsPerBlock = 8000; /*if ( maxThreadsPerBlock == 0 ) { cudaFuncAttributes attr; cudaFuncGetAttributes(&attr,RayTraceImageCudaKernel); maxThreadsPerBlock = attr.maxThreadsPerBlock; RAY_ASSERT(maxThreadsPerBlock>0); //CUDA_PRINT_FUNCTION(RayTraceImageCudaKernel); } } */ // place the ray gain and seed structures on the device const int nx = beam.nx; const int ny = beam.ny; Loading @@ -182,44 +183,44 @@ void RayTraceImageCudaLoop( int N, const RayTrace::EUV_beam_struct& beam, size_t N_rays = rays.size(); double *x2, *y2, *a2, *b2, *dv2, *image2, *I_ang2; ray_struct *rays2; cudaMalloc(&x2,nx*sizeof(double)); cudaMalloc(&y2,ny*sizeof(double)); cudaMalloc(&a2,na*sizeof(double)); cudaMalloc(&b2,nb*sizeof(double)); cudaMalloc(&dv2,nv*sizeof(double)); cudaMalloc(&image2,nx*ny*nv*sizeof(double)); cudaMalloc(&I_ang2,na*nb*sizeof(double)); cudaMalloc(&rays2,N_rays*sizeof(ray_struct)); cudaMemcpy(x2,beam.x,nx*sizeof(double),cudaMemcpyHostToDevice); cudaMemcpy(y2,beam.y,ny*sizeof(double),cudaMemcpyHostToDevice); cudaMemcpy(a2,beam.a,na*sizeof(double),cudaMemcpyHostToDevice); cudaMemcpy(b2,beam.b,nb*sizeof(double),cudaMemcpyHostToDevice); cudaMemcpy(dv2,beam.dv,nv*sizeof(double),cudaMemcpyHostToDevice); cudaMemcpy(rays2,&rays[0],N_rays*sizeof(ray_struct),cudaMemcpyHostToDevice); cudaMemset(image2,0,nx*ny*nv*sizeof(double)); cudaMemset(I_ang2,0,na*nb*sizeof(double)); hipMalloc(&x2,nx*sizeof(double)); hipMalloc(&y2,ny*sizeof(double)); hipMalloc(&a2,na*sizeof(double)); hipMalloc(&b2,nb*sizeof(double)); hipMalloc(&dv2,nv*sizeof(double)); hipMalloc(&image2,nx*ny*nv*sizeof(double)); hipMalloc(&I_ang2,na*nb*sizeof(double)); hipMalloc(&rays2,N_rays*sizeof(ray_struct)); hipMemcpy(x2,beam.x,nx*sizeof(double),hipMemcpyHostToDevice); hipMemcpy(y2,beam.y,ny*sizeof(double),hipMemcpyHostToDevice); hipMemcpy(a2,beam.a,na*sizeof(double),hipMemcpyHostToDevice); hipMemcpy(b2,beam.b,nb*sizeof(double),hipMemcpyHostToDevice); hipMemcpy(dv2,beam.dv,nv*sizeof(double),hipMemcpyHostToDevice); hipMemcpy(rays2,&rays[0],N_rays*sizeof(ray_struct),hipMemcpyHostToDevice); hipMemset(image2,0,nx*ny*nv*sizeof(double)); hipMemset(I_ang2,0,na*nb*sizeof(double)); CUDA_CHECK(); // Do calculation on device: size_t threads = maxThreadsPerBlock; size_t N_blocks = N_rays/threads + (N_rays%threads == 0 ? 0:1); dim3 block_size = calcBlockSize(N_blocks); block_size.x = N_rays/threads + (N_rays%threads == 0 ? 0:1); RayTraceImageCudaKernel <<< block_size,threads >>> (N,nx,ny,na,nb,nv,x2,y2,a2,b2, hipLaunchKernelGGL((RayTraceImageCudaKernel), dim3(block_size), dim3(threads ), 0, 0, N,nx,ny,na,nb,nv,x2,y2,a2,b2, dx,dy,dz,da,db,dv2,gain,seed,method,N_rays,rays2,scale,image2,I_ang2); CUDA_CHECK(); // Retrieve result from device and store it in host array cudaMemcpy(image,image2,nx*ny*nv*sizeof(double),cudaMemcpyDeviceToHost); cudaMemcpy(I_ang,I_ang2,na*nb*sizeof(double),cudaMemcpyDeviceToHost); hipMemcpy(image,image2,nx*ny*nv*sizeof(double),hipMemcpyDeviceToHost); hipMemcpy(I_ang,I_ang2,na*nb*sizeof(double),hipMemcpyDeviceToHost); CUDA_CHECK(); // Cleanup cudaFree(x2); cudaFree(y2); cudaFree(a2); cudaFree(b2); cudaFree(dv2); cudaFree(rays2); cudaFree(image2); cudaFree(I_ang2); hipFree(x2); hipFree(y2); hipFree(a2); hipFree(b2); hipFree(dv2); hipFree(rays2); hipFree(image2); hipFree(I_ang2); CUDA_CHECK(); RayTrace::ray_gain_struct::free_device( N, gain_in, gain ); RayTrace::ray_seed_struct::free_device( seed_in, seed ); Loading @@ -235,25 +236,25 @@ const RayTrace::ray_gain_struct* ray_gain_struct_copy_device_cuda( size_t N, con host_ptr[i].Ny = arr[i].Ny; host_ptr[i].Nv = arr[i].Nv; host_ptr[i].E0 = nullptr; cudaMalloc(&host_ptr[i].x,arr[i].Nx*sizeof(double)); cudaMalloc(&host_ptr[i].y,arr[i].Ny*sizeof(double)); cudaMalloc(&host_ptr[i].n,arr[i].Nx*arr[i].Ny*sizeof(double)); cudaMalloc(&host_ptr[i].g0,arr[i].Nx*arr[i].Ny*sizeof(float)); cudaMalloc(&host_ptr[i].E0,arr[i].Nx*arr[i].Ny*sizeof(float)); cudaMalloc(&host_ptr[i].gv,arr[i].Nx*arr[i].Ny*arr[i].Nv*sizeof(float)); cudaMalloc(&host_ptr[i].gv0,arr[i].Nx*arr[i].Ny*sizeof(float)); cudaMemcpy(host_ptr[i].x,arr[i].x,arr[i].Nx*sizeof(double),cudaMemcpyHostToDevice); cudaMemcpy(host_ptr[i].y,arr[i].y,arr[i].Ny*sizeof(double),cudaMemcpyHostToDevice); cudaMemcpy(host_ptr[i].n,arr[i].n,arr[i].Nx*arr[i].Ny*sizeof(double),cudaMemcpyHostToDevice); cudaMemcpy(host_ptr[i].g0,arr[i].g0,arr[i].Nx*arr[i].Ny*sizeof(float),cudaMemcpyHostToDevice); hipMalloc(&host_ptr[i].x,arr[i].Nx*sizeof(double)); hipMalloc(&host_ptr[i].y,arr[i].Ny*sizeof(double)); hipMalloc(&host_ptr[i].n,arr[i].Nx*arr[i].Ny*sizeof(double)); hipMalloc(&host_ptr[i].g0,arr[i].Nx*arr[i].Ny*sizeof(float)); hipMalloc(&host_ptr[i].E0,arr[i].Nx*arr[i].Ny*sizeof(float)); hipMalloc(&host_ptr[i].gv,arr[i].Nx*arr[i].Ny*arr[i].Nv*sizeof(float)); hipMalloc(&host_ptr[i].gv0,arr[i].Nx*arr[i].Ny*sizeof(float)); hipMemcpy(host_ptr[i].x,arr[i].x,arr[i].Nx*sizeof(double),hipMemcpyHostToDevice); hipMemcpy(host_ptr[i].y,arr[i].y,arr[i].Ny*sizeof(double),hipMemcpyHostToDevice); hipMemcpy(host_ptr[i].n,arr[i].n,arr[i].Nx*arr[i].Ny*sizeof(double),hipMemcpyHostToDevice); hipMemcpy(host_ptr[i].g0,arr[i].g0,arr[i].Nx*arr[i].Ny*sizeof(float),hipMemcpyHostToDevice); if ( arr[i].E0 != nullptr ) cudaMemcpy(host_ptr[i].E0,arr[i].E0,arr[i].Nx*arr[i].Ny*sizeof(float),cudaMemcpyHostToDevice); cudaMemcpy(host_ptr[i].gv,arr[i].gv,arr[i].Nx*arr[i].Ny*arr[i].Nv*sizeof(float),cudaMemcpyHostToDevice); cudaMemcpy(host_ptr[i].gv0,arr[i].gv0,arr[i].Nx*arr[i].Ny*sizeof(float),cudaMemcpyHostToDevice); hipMemcpy(host_ptr[i].E0,arr[i].E0,arr[i].Nx*arr[i].Ny*sizeof(float),hipMemcpyHostToDevice); hipMemcpy(host_ptr[i].gv,arr[i].gv,arr[i].Nx*arr[i].Ny*arr[i].Nv*sizeof(float),hipMemcpyHostToDevice); hipMemcpy(host_ptr[i].gv0,arr[i].gv0,arr[i].Nx*arr[i].Ny*sizeof(float),hipMemcpyHostToDevice); } RayTrace::ray_gain_struct* dev_ptr=NULL; cudaMalloc(&dev_ptr,N*sizeof(RayTrace::ray_gain_struct)); cudaMemcpy(dev_ptr,host_ptr,N*sizeof(RayTrace::ray_gain_struct),cudaMemcpyHostToDevice); hipMalloc(&dev_ptr,N*sizeof(RayTrace::ray_gain_struct)); hipMemcpy(dev_ptr,host_ptr,N*sizeof(RayTrace::ray_gain_struct),hipMemcpyHostToDevice); for (size_t i=0; i<N; i++) { host_ptr[i].x = NULL; host_ptr[i].y = NULL; Loading @@ -271,15 +272,15 @@ const RayTrace::ray_gain_struct* ray_gain_struct_copy_device_cuda( size_t N, con void ray_gain_struct_free_device_cuda( size_t N, const RayTrace::ray_gain_struct* dev_ptr ) { RayTrace::ray_gain_struct* host_ptr = new RayTrace::ray_gain_struct[N]; cudaMemcpy(host_ptr,dev_ptr,N*sizeof(RayTrace::ray_gain_struct),cudaMemcpyDeviceToHost); hipMemcpy(host_ptr,dev_ptr,N*sizeof(RayTrace::ray_gain_struct),hipMemcpyDeviceToHost); for (size_t i=0; i<N; i++) { cudaFree(host_ptr[i].x); cudaFree(host_ptr[i].y); cudaFree(host_ptr[i].n); cudaFree(host_ptr[i].g0); cudaFree(host_ptr[i].E0); cudaFree(host_ptr[i].gv); cudaFree(host_ptr[i].gv0); hipFree(host_ptr[i].x); hipFree(host_ptr[i].y); hipFree(host_ptr[i].n); hipFree(host_ptr[i].g0); hipFree(host_ptr[i].E0); hipFree(host_ptr[i].gv); hipFree(host_ptr[i].gv0); host_ptr[i].x = NULL; host_ptr[i].y = NULL; host_ptr[i].n = NULL; Loading @@ -288,7 +289,7 @@ void ray_gain_struct_free_device_cuda( size_t N, const RayTrace::ray_gain_struct host_ptr[i].gv = NULL; host_ptr[i].gv0 = NULL; } cudaFree((void*)dev_ptr); hipFree((void*)dev_ptr); delete [] host_ptr; CUDA_CHECK(); } Loading @@ -301,14 +302,14 @@ const RayTrace::ray_seed_struct* ray_seed_struct_copy_device_cuda( const RayTrac host_ptr->f0 = seed.f0; for (size_t i=0; i<5; i++) { host_ptr->dim[i] = seed.dim[i]; cudaMalloc(&host_ptr->x[i],seed.dim[i]*sizeof(double)); cudaMalloc(&host_ptr->f[i],seed.dim[i]*sizeof(double)); cudaMemcpy(host_ptr->x[i],seed.x[i],seed.dim[i]*sizeof(double),cudaMemcpyHostToDevice); cudaMemcpy(host_ptr->f[i],seed.f[i],seed.dim[i]*sizeof(double),cudaMemcpyHostToDevice); hipMalloc(&host_ptr->x[i],seed.dim[i]*sizeof(double)); hipMalloc(&host_ptr->f[i],seed.dim[i]*sizeof(double)); hipMemcpy(host_ptr->x[i],seed.x[i],seed.dim[i]*sizeof(double),hipMemcpyHostToDevice); hipMemcpy(host_ptr->f[i],seed.f[i],seed.dim[i]*sizeof(double),hipMemcpyHostToDevice); } RayTrace::ray_seed_struct* dev_ptr=NULL; cudaMalloc(&dev_ptr,sizeof(RayTrace::ray_seed_struct)); cudaMemcpy(dev_ptr,host_ptr,sizeof(RayTrace::ray_seed_struct),cudaMemcpyHostToDevice); hipMalloc(&dev_ptr,sizeof(RayTrace::ray_seed_struct)); hipMemcpy(dev_ptr,host_ptr,sizeof(RayTrace::ray_seed_struct),hipMemcpyHostToDevice); for (size_t i=0; i<5; i++) { host_ptr->x[i] = NULL; host_ptr->f[i] = NULL; Loading @@ -321,14 +322,14 @@ const RayTrace::ray_seed_struct* ray_seed_struct_copy_device_cuda( const RayTrac void ray_seed_struct_free_device_cuda( const RayTrace::ray_seed_struct* dev_ptr ) { RayTrace::ray_seed_struct* host_ptr = new RayTrace::ray_seed_struct; cudaMemcpy(host_ptr,dev_ptr,sizeof(RayTrace::ray_seed_struct),cudaMemcpyDeviceToHost); hipMemcpy(host_ptr,dev_ptr,sizeof(RayTrace::ray_seed_struct),hipMemcpyDeviceToHost); for (size_t i=0; i<5; i++) { cudaFree(host_ptr->x[i]); cudaFree(host_ptr->f[i]); hipFree(host_ptr->x[i]); hipFree(host_ptr->f[i]); host_ptr->x[i] = NULL; host_ptr->f[i] = NULL; } cudaFree((void*)dev_ptr); hipFree((void*)dev_ptr); delete host_ptr; CUDA_CHECK(); } Loading