Use generic "gpu" in place of "cuda" calls (455a5a3e) · Commits · HYDRO / TRITON

src/gputils.h

0 → 100644

+49 −0

Original line number	Diff line number	Diff line
		/** @file gputils.h
		* @brief Header containing map of generic "gpu" to hip or cuda calls/constants/types
		*
		* @author Michael Kelleher
		* @author Mario Morales Hernandez
		* @author Md Bulbul Sharif
		* @author Tigstu T. Dullo
		* @author Sudershan Gangrade
		* @author Alfred Kalyanapu
		* @author Sheikh Ghafoor
		* @author Shih-Chieh Kao
		* @author Katherine J. Evans
		* @bug No known bugs.
		*/

		#ifdef AMDGPU
		#include "hip/hip_runtime.h"
		#define gpuStream_t hipStream_t
		#define gpuError_t hipError_t
		#define gpuGetDevice hipGetDevice
		#define gpuGetDeviceCount hipGetDeviceCount
		#define gpuGetErrorString hipGetErrorString
		#define gpuSuccess hipSuccess
		#define gpuSetDevice hipSetDevice
		#define gpuStreamCreate hipStreamCreate
		#define gpuMalloc hipMalloc
		#define gpuMemcpyAsync hipMemcpyAsync
		#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
		#define gpuStreamSynchronize hipStreamSynchronize
		#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
		#define gpuFree hipFree
		#define gpuStreamDestroy hipStreamDestroy
		#else
		#define gpuStream_t cudaStream_t
		#define gpuError_t cudaError_t
		#define gpuGetDevice cudaGetDevice
		#define gpuGetDeviceCount cudaGetDeviceCount
		#define gpuGetErrorString cudaGetErrorString
		#define gpuSuccess cudaSuccess
		#define gpuSetDevice cudaSetDevice
		#define gpuStreamCreate cudaStreamCreate
		#define gpuMalloc cudaMalloc
		#define gpuMemcpyAsync cudaMemcpyAsync
		#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
		#define gpuStreamSynchronize cudaStreamSynchronize
		#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
		#define gpuFree cudaFree
		#define gpuStreamDestroy cudaStreamDestroy
		#endif

src/triton.h

+94 −93

Original line number	Diff line number	Diff line
		@@ -22,6 +22,7 @@
		#include "kernels.h"
		#include "output.h"
		#include "mpi_utils.h"
		#include "gputils.h"

		namespace Triton
		{
		@@ -155,7 +156,7 @@ namespace Triton
		Output::output<T> out; /*Object to manage output files. /

		#ifdef ACTIVE_GPU
		cudaStream_t streams; /*< Cuda stream /
		gpuStream_t streams; /*< Cuda stream /
		std::vector<T> device_vec; /< Device vector that contains all floating point array to use in simulation. /
		std::vector<int> device_vec_int; /< Device vector that contains all integer array to use in simulation. /
		#endif
		@@ -1704,32 +1705,32 @@ namespace Triton
		#ifdef ACTIVE_GPU

		int deviceId = 0;
		cudaError_t err = cudaGetDevice(&deviceId);
		if (err != cudaSuccess)
		gpuError_t err = gpuGetDevice(&deviceId);
		if (err != gpuSuccess)
		{
		std::cerr << cudaGetErrorString(err) << std::endl;
		std::cerr << gpuGetErrorString(err) << std::endl;
		exit(EXIT_FAILURE);
		}

		int deviceCount = 0;
		err = cudaGetDeviceCount(&deviceCount);
		if (err != cudaSuccess)
		err = gpuGetDeviceCount(&deviceCount);
		if (err != gpuSuccess)
		{
		std::cerr << cudaGetErrorString(err) << std::endl;
		std::cerr << gpuGetErrorString(err) << std::endl;
		exit(EXIT_FAILURE);
		}

		if(deviceId != rank % deviceCount)
		{
		err = cudaSetDevice(rank % deviceCount);
		if (err != cudaSuccess)
		err = gpuSetDevice(rank % deviceCount);
		if (err != gpuSuccess)
		{
		std::cerr << cudaGetErrorString(err) << std::endl;
		std::cerr << gpuGetErrorString(err) << std::endl;
		exit(EXIT_FAILURE);
		}
		}

		cudaStreamCreate(&streams);
		gpuStreamCreate(&streams);

		device_vec = std::vector<T*>();
		device_vec_int = std::vector<int*>();
		@@ -1754,36 +1755,36 @@ namespace Triton
		int device_src_pos_arr, device_runoff_id_arr, device_relative_bc_index, device_bc_type,
		device_bc_start_index, device_bc_nrows_vars;

		cudaMalloc((void**)&device_h, nbytes);
		cudaMalloc((void**)&device_qx, nbytes);
		cudaMalloc((void**)&device_qy, nbytes);
		cudaMalloc((void**)&device_n, nbytes);
		cudaMalloc((void**)&device_dem, nbytes);
		cudaMalloc((void**)&device_max_value_h, nbytes);

		cudaMalloc((void**)&device_rhsh0, nbytes);
		cudaMalloc((void**)&device_rhsh1, nbytes);
		cudaMalloc((void**)&device_rhsqx0, nbytes);
		cudaMalloc((void**)&device_rhsqx1, nbytes);
		cudaMalloc((void**)&device_rhsqy0, nbytes);
		cudaMalloc((void**)&device_rhsqy1, nbytes);

		cudaMalloc((void**)&device_sqrth_arr, nbytes);
		cudaMalloc((void**)&device_halo_arr, nbytes_halo);
		cudaMalloc((void**)&device_dt_values_arr, nbytes_dt);
		cudaMalloc((void**)&device_hyg_time_arr, nbytes_hyg_time);
		cudaMalloc((void**)&device_hyg_val_arr, nbytes_hyg_val);
		cudaMalloc((void**)&device_runoff_intensity_arr, nbytes_runoff_intensity);
		cudaMalloc((void**)&device_bc_var1_arr, nbytes_bc_vars);
		cudaMalloc((void**)&device_bc_var2_arr, nbytes_bc_vars);


		cudaMalloc((void**)&device_src_pos_arr, nbytes_src_pos);
		cudaMalloc((void**)&device_runoff_id_arr, nbytes_runoff_id);
		cudaMalloc((void**)&device_relative_bc_index, nbytes_bc_cell_size);
		cudaMalloc((void**)&device_bc_type, nbytes_bc_cell_size);
		cudaMalloc((void**)&device_bc_start_index, nbytes_bc_cell_size);
		cudaMalloc((void**)&device_bc_nrows_vars, nbytes_bc_cell_size);
		gpuMalloc((void**)&device_h, nbytes);
		gpuMalloc((void**)&device_qx, nbytes);
		gpuMalloc((void**)&device_qy, nbytes);
		gpuMalloc((void**)&device_n, nbytes);
		gpuMalloc((void**)&device_dem, nbytes);
		gpuMalloc((void**)&device_max_value_h, nbytes);

		gpuMalloc((void**)&device_rhsh0, nbytes);
		gpuMalloc((void**)&device_rhsh1, nbytes);
		gpuMalloc((void**)&device_rhsqx0, nbytes);
		gpuMalloc((void**)&device_rhsqx1, nbytes);
		gpuMalloc((void**)&device_rhsqy0, nbytes);
		gpuMalloc((void**)&device_rhsqy1, nbytes);

		gpuMalloc((void**)&device_sqrth_arr, nbytes);
		gpuMalloc((void**)&device_halo_arr, nbytes_halo);
		gpuMalloc((void**)&device_dt_values_arr, nbytes_dt);
		gpuMalloc((void**)&device_hyg_time_arr, nbytes_hyg_time);
		gpuMalloc((void**)&device_hyg_val_arr, nbytes_hyg_val);
		gpuMalloc((void**)&device_runoff_intensity_arr, nbytes_runoff_intensity);
		gpuMalloc((void**)&device_bc_var1_arr, nbytes_bc_vars);
		gpuMalloc((void**)&device_bc_var2_arr, nbytes_bc_vars);


		gpuMalloc((void**)&device_src_pos_arr, nbytes_src_pos);
		gpuMalloc((void**)&device_runoff_id_arr, nbytes_runoff_id);
		gpuMalloc((void**)&device_relative_bc_index, nbytes_bc_cell_size);
		gpuMalloc((void**)&device_bc_type, nbytes_bc_cell_size);
		gpuMalloc((void**)&device_bc_start_index, nbytes_bc_cell_size);
		gpuMalloc((void**)&device_bc_nrows_vars, nbytes_bc_cell_size);

		device_vec.push_back(device_h);
		device_vec.push_back(device_qx);
		@@ -1816,37 +1817,37 @@ namespace Triton
		device_vec_int.push_back(device_bc_nrows_vars);


		cudaMemcpyAsync(device_vec[H], host_vec[H], nbytes, cudaMemcpyHostToDevice, streams);
		cudaMemcpyAsync(device_vec[QX], host_vec[QX], nbytes, cudaMemcpyHostToDevice, streams);
		cudaMemcpyAsync(device_vec[QY], host_vec[QY], nbytes, cudaMemcpyHostToDevice, streams);
		cudaMemcpyAsync(device_vec[NMAN], host_vec[NMAN], nbytes, cudaMemcpyHostToDevice, streams);
		cudaMemcpyAsync(device_vec[DEM], host_vec[DEM], nbytes, cudaMemcpyHostToDevice, streams);
		cudaMemcpyAsync(device_vec[MAXH], host_vec[MAXH], nbytes, cudaMemcpyHostToDevice, streams);

		cudaMemcpyAsync(device_vec[RHSH0], host_vec[RHSH0], nbytes, cudaMemcpyHostToDevice, streams);
		cudaMemcpyAsync(device_vec[RHSH1], host_vec[RHSH1], nbytes, cudaMemcpyHostToDevice, streams);
		cudaMemcpyAsync(device_vec[RHSQX0], host_vec[RHSQX0], nbytes, cudaMemcpyHostToDevice, streams);
		cudaMemcpyAsync(device_vec[RHSQX1], host_vec[RHSQX1], nbytes, cudaMemcpyHostToDevice, streams);
		cudaMemcpyAsync(device_vec[RHSQY0], host_vec[RHSQY0], nbytes, cudaMemcpyHostToDevice, streams);
		cudaMemcpyAsync(device_vec[RHSQY1], host_vec[RHSQY1], nbytes, cudaMemcpyHostToDevice, streams);

		cudaMemcpyAsync(device_vec[SQRTH], host_vec[SQRTH], nbytes, cudaMemcpyHostToDevice, streams);
		cudaMemcpyAsync(device_vec[HALO], host_vec[HALO], nbytes_halo, cudaMemcpyHostToDevice, streams);
		cudaMemcpyAsync(device_vec[DT], host_vec[DT], nbytes_dt, cudaMemcpyHostToDevice, streams);
		cudaMemcpyAsync(device_vec[HYGT], host_vec[HYGT], nbytes_hyg_time, cudaMemcpyHostToDevice, streams);
		cudaMemcpyAsync(device_vec[HYGV], host_vec[HYGV], nbytes_hyg_val, cudaMemcpyHostToDevice, streams);
		cudaMemcpyAsync(device_vec[RUNIN], host_vec[RUNIN], nbytes_runoff_intensity, cudaMemcpyHostToDevice, streams);
		cudaMemcpyAsync(device_vec[EXTBCV1], host_vec[EXTBCV1], nbytes_bc_vars, cudaMemcpyHostToDevice, streams);
		cudaMemcpyAsync(device_vec[EXTBCV2], host_vec[EXTBCV2], nbytes_bc_vars, cudaMemcpyHostToDevice, streams);


		cudaMemcpyAsync(device_vec_int[SRCP], host_vec_int[SRCP], nbytes_src_pos, cudaMemcpyHostToDevice, streams);
		cudaMemcpyAsync(device_vec_int[RUNID], host_vec_int[RUNID], nbytes_runoff_id, cudaMemcpyHostToDevice, streams);
		cudaMemcpyAsync(device_vec_int[BCRELATIVEINDEX], host_vec_int[BCRELATIVEINDEX], nbytes_bc_cell_size, cudaMemcpyHostToDevice, streams);
		cudaMemcpyAsync(device_vec_int[BCTYPE], host_vec_int[BCTYPE], nbytes_bc_cell_size, cudaMemcpyHostToDevice, streams);
		cudaMemcpyAsync(device_vec_int[BCINDEXSTART], host_vec_int[BCINDEXSTART], nbytes_bc_cell_size, cudaMemcpyHostToDevice, streams);
		cudaMemcpyAsync(device_vec_int[BCNROWSVARS], host_vec_int[BCNROWSVARS], nbytes_bc_cell_size, cudaMemcpyHostToDevice, streams);
		cudaStreamSynchronize(streams);
		gpuMemcpyAsync(device_vec[H], host_vec[H], nbytes, gpuMemcpyHostToDevice, streams);
		gpuMemcpyAsync(device_vec[QX], host_vec[QX], nbytes, gpuMemcpyHostToDevice, streams);
		gpuMemcpyAsync(device_vec[QY], host_vec[QY], nbytes, gpuMemcpyHostToDevice, streams);
		gpuMemcpyAsync(device_vec[NMAN], host_vec[NMAN], nbytes, gpuMemcpyHostToDevice, streams);
		gpuMemcpyAsync(device_vec[DEM], host_vec[DEM], nbytes, gpuMemcpyHostToDevice, streams);
		gpuMemcpyAsync(device_vec[MAXH], host_vec[MAXH], nbytes, gpuMemcpyHostToDevice, streams);

		gpuMemcpyAsync(device_vec[RHSH0], host_vec[RHSH0], nbytes, gpuMemcpyHostToDevice, streams);
		gpuMemcpyAsync(device_vec[RHSH1], host_vec[RHSH1], nbytes, gpuMemcpyHostToDevice, streams);
		gpuMemcpyAsync(device_vec[RHSQX0], host_vec[RHSQX0], nbytes, gpuMemcpyHostToDevice, streams);
		gpuMemcpyAsync(device_vec[RHSQX1], host_vec[RHSQX1], nbytes, gpuMemcpyHostToDevice, streams);
		gpuMemcpyAsync(device_vec[RHSQY0], host_vec[RHSQY0], nbytes, gpuMemcpyHostToDevice, streams);
		gpuMemcpyAsync(device_vec[RHSQY1], host_vec[RHSQY1], nbytes, gpuMemcpyHostToDevice, streams);

		gpuMemcpyAsync(device_vec[SQRTH], host_vec[SQRTH], nbytes, gpuMemcpyHostToDevice, streams);
		gpuMemcpyAsync(device_vec[HALO], host_vec[HALO], nbytes_halo, gpuMemcpyHostToDevice, streams);
		gpuMemcpyAsync(device_vec[DT], host_vec[DT], nbytes_dt, gpuMemcpyHostToDevice, streams);
		gpuMemcpyAsync(device_vec[HYGT], host_vec[HYGT], nbytes_hyg_time, gpuMemcpyHostToDevice, streams);
		gpuMemcpyAsync(device_vec[HYGV], host_vec[HYGV], nbytes_hyg_val, gpuMemcpyHostToDevice, streams);
		gpuMemcpyAsync(device_vec[RUNIN], host_vec[RUNIN], nbytes_runoff_intensity, gpuMemcpyHostToDevice, streams);
		gpuMemcpyAsync(device_vec[EXTBCV1], host_vec[EXTBCV1], nbytes_bc_vars, gpuMemcpyHostToDevice, streams);
		gpuMemcpyAsync(device_vec[EXTBCV2], host_vec[EXTBCV2], nbytes_bc_vars, gpuMemcpyHostToDevice, streams);


		gpuMemcpyAsync(device_vec_int[SRCP], host_vec_int[SRCP], nbytes_src_pos, gpuMemcpyHostToDevice, streams);
		gpuMemcpyAsync(device_vec_int[RUNID], host_vec_int[RUNID], nbytes_runoff_id, gpuMemcpyHostToDevice, streams);
		gpuMemcpyAsync(device_vec_int[BCRELATIVEINDEX], host_vec_int[BCRELATIVEINDEX], nbytes_bc_cell_size, gpuMemcpyHostToDevice, streams);
		gpuMemcpyAsync(device_vec_int[BCTYPE], host_vec_int[BCTYPE], nbytes_bc_cell_size, gpuMemcpyHostToDevice, streams);
		gpuMemcpyAsync(device_vec_int[BCINDEXSTART], host_vec_int[BCINDEXSTART], nbytes_bc_cell_size, gpuMemcpyHostToDevice, streams);
		gpuMemcpyAsync(device_vec_int[BCNROWSVARS], host_vec_int[BCNROWSVARS], nbytes_bc_cell_size, gpuMemcpyHostToDevice, streams);
		gpuStreamSynchronize(streams);
		#endif
		}

		@@ -1894,13 +1895,13 @@ namespace Triton
		#ifdef ACTIVE_GPU
		while (!device_vec.empty())
		{
		cudaFree(device_vec.back());
		gpuFree(device_vec.back());
		device_vec.pop_back();
		}

		while (!device_vec_int.empty())
		{
		cudaFree(device_vec_int.back());
		gpuFree(device_vec_int.back());
		device_vec_int.pop_back();
		}
		#endif
		@@ -1962,14 +1963,14 @@ namespace Triton

		#ifdef ACTIVE_GPU
		st.start(COMPUTE_TIME);
		cudaMemcpyAsync(host_vec[H], device_vec[H], nbytes, cudaMemcpyDeviceToHost, streams);
		cudaMemcpyAsync(host_vec[QX], device_vec[QX], nbytes, cudaMemcpyDeviceToHost, streams);
		cudaMemcpyAsync(host_vec[QY], device_vec[QY], nbytes, cudaMemcpyDeviceToHost, streams);
		gpuMemcpyAsync(host_vec[H], device_vec[H], nbytes, gpuMemcpyDeviceToHost, streams);
		gpuMemcpyAsync(host_vec[QX], device_vec[QX], nbytes, gpuMemcpyDeviceToHost, streams);
		gpuMemcpyAsync(host_vec[QY], device_vec[QY], nbytes, gpuMemcpyDeviceToHost, streams);
		if (arglist.max_value_print_option.size() > 0)
		{
		cudaMemcpyAsync(host_vec[MAXH], device_vec[MAXH], nbytes, cudaMemcpyDeviceToHost, streams);
		gpuMemcpyAsync(host_vec[MAXH], device_vec[MAXH], nbytes, gpuMemcpyDeviceToHost, streams);
		}
		cudaStreamSynchronize(streams);
		gpuStreamSynchronize(streams);
		st.stop(COMPUTE_TIME);
		#endif

		@@ -2052,8 +2053,8 @@ namespace Triton
		cur_dt_arr_sz = temp_dt_arr_sz;
		}

		cudaMemcpyAsync(&local_dt, device_vec[DT], sizeof(T), cudaMemcpyDeviceToHost, streams);
		cudaStreamSynchronize(streams);
		gpuMemcpyAsync(&local_dt, device_vec[DT], sizeof(T), gpuMemcpyDeviceToHost, streams);
		gpuStreamSynchronize(streams);
		#else
		Kernels::compute_dt_and_sqrt(rows*cols, cell_size, host_vec[QX], host_vec[QY], host_vec[H], host_vec[SQRTH], host_vec[DT], arglist.courant, arglist.hextra);
		Kernels::find_min_dt(rows*cols, host_vec[DT]);
		@@ -2212,7 +2213,7 @@ namespace Triton

		if (arglist.gpu_direct_flag)
		{
		cudaStreamSynchronize(streams);
		gpuStreamSynchronize(streams);
		st.stop(COMPUTE_TIME);

		st.start(BALANCING_MPI_TIME);
		@@ -2225,8 +2226,8 @@ namespace Triton
		}
		else
		{
		cudaMemcpyAsync(host_vec[HALO], device_vec[HALO], nbytes_halo, cudaMemcpyDeviceToHost, streams);
		cudaStreamSynchronize(streams);
		gpuMemcpyAsync(host_vec[HALO], device_vec[HALO], nbytes_halo, gpuMemcpyDeviceToHost, streams);
		gpuStreamSynchronize(streams);
		st.stop(COMPUTE_TIME);

		st.start(BALANCING_MPI_TIME);
		@@ -2236,7 +2237,7 @@ namespace Triton
		st.stop(BALANCING_MPI_TIME);

		st.start(COMPUTE_TIME);
		cudaMemcpyAsync(device_vec[HALO], host_vec[HALO], nbytes_halo, cudaMemcpyHostToDevice, streams);
		gpuMemcpyAsync(device_vec[HALO], host_vec[HALO], nbytes_halo, gpuMemcpyHostToDevice, streams);
		}

		Kernels::halo_copy_to_gpu << <(2 * colsGHOST_CELL_PADDING + THREAD_BLOCK - 1) / THREAD_BLOCK, THREAD_BLOCK, 0, streams >> > (2 cols*GHOST_CELL_PADDING, rows, cols, device_vec[H], device_vec[QX], device_vec[QY], device_vec[HALO]);
		@@ -2404,24 +2405,24 @@ namespace Triton
		#ifdef ACTIVE_GPU

		//not neccessary since we are inside output so we already copied this data to the CPU
		/*cudaMemcpyAsync(host_vec[H], device_vec[H], nbytes, cudaMemcpyDeviceToHost, streams);
		cudaMemcpyAsync(host_vec[QX], device_vec[QX], nbytes, cudaMemcpyDeviceToHost, streams);
		cudaMemcpyAsync(host_vec[QY], device_vec[QY], nbytes, cudaMemcpyDeviceToHost, streams);
		/*gpuMemcpyAsync(host_vec[H], device_vec[H], nbytes, gpuMemcpyDeviceToHost, streams);
		gpuMemcpyAsync(host_vec[QX], device_vec[QX], nbytes, gpuMemcpyDeviceToHost, streams);
		gpuMemcpyAsync(host_vec[QY], device_vec[QY], nbytes, gpuMemcpyDeviceToHost, streams);
		if (arglist.max_value_print_option.size() > 0)
		{
		cudaMemcpyAsync(host_vec[MAXH], device_vec[MAXH], nbytes, cudaMemcpyDeviceToHost, streams);
		gpuMemcpyAsync(host_vec[MAXH], device_vec[MAXH], nbytes, gpuMemcpyDeviceToHost, streams);
		}
		cudaStreamSynchronize(streams);*/
		gpuStreamSynchronize(streams);*/

		cudaStreamDestroy(streams);
		gpuStreamDestroy(streams);
		while (!device_vec.empty())
		{
		cudaFree(device_vec.back());
		gpuFree(device_vec.back());
		device_vec.pop_back();
		}
		while (!device_vec_int.empty())
		{
		cudaFree(device_vec_int.back());
		gpuFree(device_vec_int.back());
		device_vec_int.pop_back();
		}