Verified Commit 455a5a3e authored by Kelleher, Michael's avatar Kelleher, Michael
Browse files

Use generic "gpu" in place of "cuda" calls

Add a gputils.h header that maps "cuda..." / "hip..." calls, types, and constants to generic "gpu..." names
parent ed973c09
Loading
Loading
Loading
Loading

src/gputils.h

0 → 100644
+49 −0
Original line number Diff line number Diff line
/** @file gputils.h
 *  @brief Maps generic "gpu"-prefixed runtime names to their HIP or CUDA
 *         equivalents, selected at compile time.
 *
 *  Define AMDGPU when building for AMD GPUs to select the HIP runtime;
 *  otherwise the CUDA runtime names are used.  The CUDA branch relies on
 *  nvcc implicitly including cuda_runtime.h, so no explicit include is
 *  needed there — TODO confirm if this header is ever compiled by a
 *  plain host compiler in CUDA mode.
 *
 *  @author Michael Kelleher
 *  @author Mario Morales Hernandez
 *  @author Md Bulbul Sharif
 *  @author Tigstu T. Dullo
 *  @author Sudershan Gangrade
 *  @author Alfred Kalyanapu
 *  @author Sheikh Ghafoor
 *  @author Shih-Chieh Kao
 *  @author Katherine J. Evans
 *  @bug No known bugs.
 */

/* Include guard: without it, a second inclusion would redefine every
 * gpu... macro and trigger redefinition warnings/errors. */
#ifndef GPUTILS_H
#define GPUTILS_H

#ifdef AMDGPU
/* AMD build: map generic names onto the HIP runtime. */
#include "hip/hip_runtime.h"
#define gpuStream_t hipStream_t
#define gpuError_t hipError_t
#define gpuGetDevice hipGetDevice
#define gpuGetDeviceCount hipGetDeviceCount
#define gpuGetErrorString hipGetErrorString
#define gpuSuccess hipSuccess
#define gpuSetDevice hipSetDevice
#define gpuStreamCreate hipStreamCreate
#define gpuMalloc hipMalloc
#define gpuMemcpyAsync hipMemcpyAsync
#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
#define gpuStreamSynchronize hipStreamSynchronize
#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
#define gpuFree hipFree
#define gpuStreamDestroy hipStreamDestroy
#else
/* NVIDIA build: map generic names onto the CUDA runtime. */
#define gpuStream_t cudaStream_t
#define gpuError_t cudaError_t
#define gpuGetDevice cudaGetDevice
#define gpuGetDeviceCount cudaGetDeviceCount
#define gpuGetErrorString cudaGetErrorString
#define gpuSuccess cudaSuccess
#define gpuSetDevice cudaSetDevice
#define gpuStreamCreate cudaStreamCreate
#define gpuMalloc cudaMalloc
#define gpuMemcpyAsync cudaMemcpyAsync
#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
#define gpuStreamSynchronize cudaStreamSynchronize
#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
#define gpuFree cudaFree
#define gpuStreamDestroy cudaStreamDestroy
#endif

#endif /* GPUTILS_H */
+94 −93
Original line number Diff line number Diff line
@@ -22,6 +22,7 @@
#include "kernels.h"
#include "output.h"
#include "mpi_utils.h"
#include "gputils.h"

namespace Triton
{
@@ -155,7 +156,7 @@ namespace Triton
		Output::output<T> out; /**Object to manage output files. */ 

#ifdef ACTIVE_GPU
		cudaStream_t streams;	/**< Cuda stream */
		gpuStream_t streams;	/**< GPU stream (CUDA or HIP, per gputils.h) */
		std::vector<T*> device_vec;	/**< Device vector that contains all floating point array to use in simulation. */
		std::vector<int*> device_vec_int;	/**< Device vector that contains all integer array to use in simulation. */
#endif
@@ -1704,32 +1705,32 @@ namespace Triton
#ifdef ACTIVE_GPU

		int deviceId = 0;
		cudaError_t err = cudaGetDevice(&deviceId);
		if (err != cudaSuccess) 
		gpuError_t err = gpuGetDevice(&deviceId);
		if (err != gpuSuccess) 
		{
			std::cerr << cudaGetErrorString(err) << std::endl;
			std::cerr << gpuGetErrorString(err) << std::endl;
			exit(EXIT_FAILURE);
		}
		
		int deviceCount = 0;
		err = cudaGetDeviceCount(&deviceCount);
		if (err != cudaSuccess) 
		err = gpuGetDeviceCount(&deviceCount);
		if (err != gpuSuccess) 
		{
			std::cerr << cudaGetErrorString(err) << std::endl;
			std::cerr << gpuGetErrorString(err) << std::endl;
			exit(EXIT_FAILURE);
		}
		
		if(deviceId != rank % deviceCount)
		{
			err = cudaSetDevice(rank % deviceCount);
			if (err != cudaSuccess) 
			err = gpuSetDevice(rank % deviceCount);
			if (err != gpuSuccess) 
			{
				std::cerr << cudaGetErrorString(err) << std::endl;
				std::cerr << gpuGetErrorString(err) << std::endl;
				exit(EXIT_FAILURE);
			}
		}

		cudaStreamCreate(&streams);
		gpuStreamCreate(&streams);

		device_vec = std::vector<T*>();
		device_vec_int = std::vector<int*>();
@@ -1754,36 +1755,36 @@ namespace Triton
		int *device_src_pos_arr, *device_runoff_id_arr, *device_relative_bc_index, *device_bc_type,
		*device_bc_start_index, *device_bc_nrows_vars;
		
		cudaMalloc((void**)&device_h, nbytes);
		cudaMalloc((void**)&device_qx, nbytes);
		cudaMalloc((void**)&device_qy, nbytes);
		cudaMalloc((void**)&device_n, nbytes);
		cudaMalloc((void**)&device_dem, nbytes);
		cudaMalloc((void**)&device_max_value_h, nbytes);

		cudaMalloc((void**)&device_rhsh0, nbytes);
		cudaMalloc((void**)&device_rhsh1, nbytes);
		cudaMalloc((void**)&device_rhsqx0, nbytes);
		cudaMalloc((void**)&device_rhsqx1, nbytes);
		cudaMalloc((void**)&device_rhsqy0, nbytes);
		cudaMalloc((void**)&device_rhsqy1, nbytes);

		cudaMalloc((void**)&device_sqrth_arr, nbytes);
		cudaMalloc((void**)&device_halo_arr, nbytes_halo);
		cudaMalloc((void**)&device_dt_values_arr, nbytes_dt);
		cudaMalloc((void**)&device_hyg_time_arr, nbytes_hyg_time);
		cudaMalloc((void**)&device_hyg_val_arr, nbytes_hyg_val);
		cudaMalloc((void**)&device_runoff_intensity_arr, nbytes_runoff_intensity);
		cudaMalloc((void**)&device_bc_var1_arr, nbytes_bc_vars);
		cudaMalloc((void**)&device_bc_var2_arr, nbytes_bc_vars);


		cudaMalloc((void**)&device_src_pos_arr, nbytes_src_pos);
		cudaMalloc((void**)&device_runoff_id_arr, nbytes_runoff_id);
		cudaMalloc((void**)&device_relative_bc_index, nbytes_bc_cell_size);
		cudaMalloc((void**)&device_bc_type, nbytes_bc_cell_size);
		cudaMalloc((void**)&device_bc_start_index, nbytes_bc_cell_size);
		cudaMalloc((void**)&device_bc_nrows_vars, nbytes_bc_cell_size);
		gpuMalloc((void**)&device_h, nbytes);
		gpuMalloc((void**)&device_qx, nbytes);
		gpuMalloc((void**)&device_qy, nbytes);
		gpuMalloc((void**)&device_n, nbytes);
		gpuMalloc((void**)&device_dem, nbytes);
		gpuMalloc((void**)&device_max_value_h, nbytes);

		gpuMalloc((void**)&device_rhsh0, nbytes);
		gpuMalloc((void**)&device_rhsh1, nbytes);
		gpuMalloc((void**)&device_rhsqx0, nbytes);
		gpuMalloc((void**)&device_rhsqx1, nbytes);
		gpuMalloc((void**)&device_rhsqy0, nbytes);
		gpuMalloc((void**)&device_rhsqy1, nbytes);

		gpuMalloc((void**)&device_sqrth_arr, nbytes);
		gpuMalloc((void**)&device_halo_arr, nbytes_halo);
		gpuMalloc((void**)&device_dt_values_arr, nbytes_dt);
		gpuMalloc((void**)&device_hyg_time_arr, nbytes_hyg_time);
		gpuMalloc((void**)&device_hyg_val_arr, nbytes_hyg_val);
		gpuMalloc((void**)&device_runoff_intensity_arr, nbytes_runoff_intensity);
		gpuMalloc((void**)&device_bc_var1_arr, nbytes_bc_vars);
		gpuMalloc((void**)&device_bc_var2_arr, nbytes_bc_vars);


		gpuMalloc((void**)&device_src_pos_arr, nbytes_src_pos);
		gpuMalloc((void**)&device_runoff_id_arr, nbytes_runoff_id);
		gpuMalloc((void**)&device_relative_bc_index, nbytes_bc_cell_size);
		gpuMalloc((void**)&device_bc_type, nbytes_bc_cell_size);
		gpuMalloc((void**)&device_bc_start_index, nbytes_bc_cell_size);
		gpuMalloc((void**)&device_bc_nrows_vars, nbytes_bc_cell_size);

		device_vec.push_back(device_h);
		device_vec.push_back(device_qx);
@@ -1816,37 +1817,37 @@ namespace Triton
		device_vec_int.push_back(device_bc_nrows_vars);


		cudaMemcpyAsync(device_vec[H], host_vec[H], nbytes, cudaMemcpyHostToDevice, streams);
		cudaMemcpyAsync(device_vec[QX], host_vec[QX], nbytes, cudaMemcpyHostToDevice, streams);
		cudaMemcpyAsync(device_vec[QY], host_vec[QY], nbytes, cudaMemcpyHostToDevice, streams);
		cudaMemcpyAsync(device_vec[NMAN], host_vec[NMAN], nbytes, cudaMemcpyHostToDevice, streams);
		cudaMemcpyAsync(device_vec[DEM], host_vec[DEM], nbytes, cudaMemcpyHostToDevice, streams);
		cudaMemcpyAsync(device_vec[MAXH], host_vec[MAXH], nbytes, cudaMemcpyHostToDevice, streams);

		cudaMemcpyAsync(device_vec[RHSH0], host_vec[RHSH0], nbytes, cudaMemcpyHostToDevice, streams);
		cudaMemcpyAsync(device_vec[RHSH1], host_vec[RHSH1], nbytes, cudaMemcpyHostToDevice, streams);
		cudaMemcpyAsync(device_vec[RHSQX0], host_vec[RHSQX0], nbytes, cudaMemcpyHostToDevice, streams);
		cudaMemcpyAsync(device_vec[RHSQX1], host_vec[RHSQX1], nbytes, cudaMemcpyHostToDevice, streams);
		cudaMemcpyAsync(device_vec[RHSQY0], host_vec[RHSQY0], nbytes, cudaMemcpyHostToDevice, streams);
		cudaMemcpyAsync(device_vec[RHSQY1], host_vec[RHSQY1], nbytes, cudaMemcpyHostToDevice, streams);

		cudaMemcpyAsync(device_vec[SQRTH], host_vec[SQRTH], nbytes, cudaMemcpyHostToDevice, streams);
		cudaMemcpyAsync(device_vec[HALO], host_vec[HALO], nbytes_halo, cudaMemcpyHostToDevice, streams);
		cudaMemcpyAsync(device_vec[DT], host_vec[DT], nbytes_dt, cudaMemcpyHostToDevice, streams);
		cudaMemcpyAsync(device_vec[HYGT], host_vec[HYGT], nbytes_hyg_time, cudaMemcpyHostToDevice, streams);
		cudaMemcpyAsync(device_vec[HYGV], host_vec[HYGV], nbytes_hyg_val, cudaMemcpyHostToDevice, streams);
		cudaMemcpyAsync(device_vec[RUNIN], host_vec[RUNIN], nbytes_runoff_intensity, cudaMemcpyHostToDevice, streams);
		cudaMemcpyAsync(device_vec[EXTBCV1], host_vec[EXTBCV1], nbytes_bc_vars, cudaMemcpyHostToDevice, streams);
		cudaMemcpyAsync(device_vec[EXTBCV2], host_vec[EXTBCV2], nbytes_bc_vars, cudaMemcpyHostToDevice, streams);


		cudaMemcpyAsync(device_vec_int[SRCP], host_vec_int[SRCP], nbytes_src_pos, cudaMemcpyHostToDevice, streams);
		cudaMemcpyAsync(device_vec_int[RUNID], host_vec_int[RUNID], nbytes_runoff_id, cudaMemcpyHostToDevice, streams);
		cudaMemcpyAsync(device_vec_int[BCRELATIVEINDEX], host_vec_int[BCRELATIVEINDEX], nbytes_bc_cell_size, cudaMemcpyHostToDevice, streams);
		cudaMemcpyAsync(device_vec_int[BCTYPE], host_vec_int[BCTYPE], nbytes_bc_cell_size, cudaMemcpyHostToDevice, streams);
		cudaMemcpyAsync(device_vec_int[BCINDEXSTART], host_vec_int[BCINDEXSTART], nbytes_bc_cell_size, cudaMemcpyHostToDevice, streams);
		cudaMemcpyAsync(device_vec_int[BCNROWSVARS], host_vec_int[BCNROWSVARS], nbytes_bc_cell_size, cudaMemcpyHostToDevice, streams);
		cudaStreamSynchronize(streams);
		gpuMemcpyAsync(device_vec[H], host_vec[H], nbytes, gpuMemcpyHostToDevice, streams);
		gpuMemcpyAsync(device_vec[QX], host_vec[QX], nbytes, gpuMemcpyHostToDevice, streams);
		gpuMemcpyAsync(device_vec[QY], host_vec[QY], nbytes, gpuMemcpyHostToDevice, streams);
		gpuMemcpyAsync(device_vec[NMAN], host_vec[NMAN], nbytes, gpuMemcpyHostToDevice, streams);
		gpuMemcpyAsync(device_vec[DEM], host_vec[DEM], nbytes, gpuMemcpyHostToDevice, streams);
		gpuMemcpyAsync(device_vec[MAXH], host_vec[MAXH], nbytes, gpuMemcpyHostToDevice, streams);

		gpuMemcpyAsync(device_vec[RHSH0], host_vec[RHSH0], nbytes, gpuMemcpyHostToDevice, streams);
		gpuMemcpyAsync(device_vec[RHSH1], host_vec[RHSH1], nbytes, gpuMemcpyHostToDevice, streams);
		gpuMemcpyAsync(device_vec[RHSQX0], host_vec[RHSQX0], nbytes, gpuMemcpyHostToDevice, streams);
		gpuMemcpyAsync(device_vec[RHSQX1], host_vec[RHSQX1], nbytes, gpuMemcpyHostToDevice, streams);
		gpuMemcpyAsync(device_vec[RHSQY0], host_vec[RHSQY0], nbytes, gpuMemcpyHostToDevice, streams);
		gpuMemcpyAsync(device_vec[RHSQY1], host_vec[RHSQY1], nbytes, gpuMemcpyHostToDevice, streams);

		gpuMemcpyAsync(device_vec[SQRTH], host_vec[SQRTH], nbytes, gpuMemcpyHostToDevice, streams);
		gpuMemcpyAsync(device_vec[HALO], host_vec[HALO], nbytes_halo, gpuMemcpyHostToDevice, streams);
		gpuMemcpyAsync(device_vec[DT], host_vec[DT], nbytes_dt, gpuMemcpyHostToDevice, streams);
		gpuMemcpyAsync(device_vec[HYGT], host_vec[HYGT], nbytes_hyg_time, gpuMemcpyHostToDevice, streams);
		gpuMemcpyAsync(device_vec[HYGV], host_vec[HYGV], nbytes_hyg_val, gpuMemcpyHostToDevice, streams);
		gpuMemcpyAsync(device_vec[RUNIN], host_vec[RUNIN], nbytes_runoff_intensity, gpuMemcpyHostToDevice, streams);
		gpuMemcpyAsync(device_vec[EXTBCV1], host_vec[EXTBCV1], nbytes_bc_vars, gpuMemcpyHostToDevice, streams);
		gpuMemcpyAsync(device_vec[EXTBCV2], host_vec[EXTBCV2], nbytes_bc_vars, gpuMemcpyHostToDevice, streams);


		gpuMemcpyAsync(device_vec_int[SRCP], host_vec_int[SRCP], nbytes_src_pos, gpuMemcpyHostToDevice, streams);
		gpuMemcpyAsync(device_vec_int[RUNID], host_vec_int[RUNID], nbytes_runoff_id, gpuMemcpyHostToDevice, streams);
		gpuMemcpyAsync(device_vec_int[BCRELATIVEINDEX], host_vec_int[BCRELATIVEINDEX], nbytes_bc_cell_size, gpuMemcpyHostToDevice, streams);
		gpuMemcpyAsync(device_vec_int[BCTYPE], host_vec_int[BCTYPE], nbytes_bc_cell_size, gpuMemcpyHostToDevice, streams);
		gpuMemcpyAsync(device_vec_int[BCINDEXSTART], host_vec_int[BCINDEXSTART], nbytes_bc_cell_size, gpuMemcpyHostToDevice, streams);
		gpuMemcpyAsync(device_vec_int[BCNROWSVARS], host_vec_int[BCNROWSVARS], nbytes_bc_cell_size, gpuMemcpyHostToDevice, streams);
		gpuStreamSynchronize(streams);
#endif
	}

@@ -1894,13 +1895,13 @@ namespace Triton
#ifdef ACTIVE_GPU
		while (!device_vec.empty())
		{
			cudaFree(device_vec.back());
			gpuFree(device_vec.back());
			device_vec.pop_back();
		}

		while (!device_vec_int.empty())
		{
			cudaFree(device_vec_int.back());
			gpuFree(device_vec_int.back());
			device_vec_int.pop_back();
		}
#endif
@@ -1962,14 +1963,14 @@ namespace Triton

#ifdef ACTIVE_GPU
				st.start(COMPUTE_TIME);
				cudaMemcpyAsync(host_vec[H], device_vec[H], nbytes, cudaMemcpyDeviceToHost, streams);
				cudaMemcpyAsync(host_vec[QX], device_vec[QX], nbytes, cudaMemcpyDeviceToHost, streams);
				cudaMemcpyAsync(host_vec[QY], device_vec[QY], nbytes, cudaMemcpyDeviceToHost, streams);
				gpuMemcpyAsync(host_vec[H], device_vec[H], nbytes, gpuMemcpyDeviceToHost, streams);
				gpuMemcpyAsync(host_vec[QX], device_vec[QX], nbytes, gpuMemcpyDeviceToHost, streams);
				gpuMemcpyAsync(host_vec[QY], device_vec[QY], nbytes, gpuMemcpyDeviceToHost, streams);
				if (arglist.max_value_print_option.size() > 0)
				{
					cudaMemcpyAsync(host_vec[MAXH], device_vec[MAXH], nbytes, cudaMemcpyDeviceToHost, streams);
					gpuMemcpyAsync(host_vec[MAXH], device_vec[MAXH], nbytes, gpuMemcpyDeviceToHost, streams);
				}
				cudaStreamSynchronize(streams);
				gpuStreamSynchronize(streams);
				st.stop(COMPUTE_TIME);
#endif

@@ -2052,8 +2053,8 @@ namespace Triton
			cur_dt_arr_sz = temp_dt_arr_sz;
		}

		cudaMemcpyAsync(&local_dt, device_vec[DT], sizeof(T), cudaMemcpyDeviceToHost, streams);
		cudaStreamSynchronize(streams);
		gpuMemcpyAsync(&local_dt, device_vec[DT], sizeof(T), gpuMemcpyDeviceToHost, streams);
		gpuStreamSynchronize(streams);
#else
		Kernels::compute_dt_and_sqrt(rows*cols, cell_size, host_vec[QX], host_vec[QY], host_vec[H], host_vec[SQRTH], host_vec[DT], arglist.courant, arglist.hextra);
		Kernels::find_min_dt(rows*cols, host_vec[DT]);
@@ -2212,7 +2213,7 @@ namespace Triton

			if (arglist.gpu_direct_flag)
			{
				cudaStreamSynchronize(streams);
				gpuStreamSynchronize(streams);
				st.stop(COMPUTE_TIME);

				st.start(BALANCING_MPI_TIME);
@@ -2225,8 +2226,8 @@ namespace Triton
			}
			else
			{
				cudaMemcpyAsync(host_vec[HALO], device_vec[HALO], nbytes_halo, cudaMemcpyDeviceToHost, streams);
				cudaStreamSynchronize(streams);
				gpuMemcpyAsync(host_vec[HALO], device_vec[HALO], nbytes_halo, gpuMemcpyDeviceToHost, streams);
				gpuStreamSynchronize(streams);
				st.stop(COMPUTE_TIME);

				st.start(BALANCING_MPI_TIME);
@@ -2236,7 +2237,7 @@ namespace Triton
				st.stop(BALANCING_MPI_TIME);

				st.start(COMPUTE_TIME);
				cudaMemcpyAsync(device_vec[HALO], host_vec[HALO], nbytes_halo, cudaMemcpyHostToDevice, streams);
				gpuMemcpyAsync(device_vec[HALO], host_vec[HALO], nbytes_halo, gpuMemcpyHostToDevice, streams);
			}

			Kernels::halo_copy_to_gpu << <(2 * cols*GHOST_CELL_PADDING + THREAD_BLOCK - 1) / THREAD_BLOCK, THREAD_BLOCK, 0, streams >> > (2 * cols*GHOST_CELL_PADDING, rows, cols, device_vec[H], device_vec[QX], device_vec[QY], device_vec[HALO]);
@@ -2404,24 +2405,24 @@ namespace Triton
#ifdef ACTIVE_GPU

		// not necessary: we are inside output, so this data has already been copied to the CPU
		/*cudaMemcpyAsync(host_vec[H], device_vec[H], nbytes, cudaMemcpyDeviceToHost, streams);
		cudaMemcpyAsync(host_vec[QX], device_vec[QX], nbytes, cudaMemcpyDeviceToHost, streams);
		cudaMemcpyAsync(host_vec[QY], device_vec[QY], nbytes, cudaMemcpyDeviceToHost, streams);
		/*gpuMemcpyAsync(host_vec[H], device_vec[H], nbytes, gpuMemcpyDeviceToHost, streams);
		gpuMemcpyAsync(host_vec[QX], device_vec[QX], nbytes, gpuMemcpyDeviceToHost, streams);
		gpuMemcpyAsync(host_vec[QY], device_vec[QY], nbytes, gpuMemcpyDeviceToHost, streams);
		if (arglist.max_value_print_option.size() > 0)
		{
			cudaMemcpyAsync(host_vec[MAXH], device_vec[MAXH], nbytes, cudaMemcpyDeviceToHost, streams);
			gpuMemcpyAsync(host_vec[MAXH], device_vec[MAXH], nbytes, gpuMemcpyDeviceToHost, streams);
		}
		cudaStreamSynchronize(streams);*/
		gpuStreamSynchronize(streams);*/

		cudaStreamDestroy(streams);
		gpuStreamDestroy(streams);
		while (!device_vec.empty())
		{
			cudaFree(device_vec.back());
			gpuFree(device_vec.back());
			device_vec.pop_back();
		}
		while (!device_vec_int.empty())
		{
			cudaFree(device_vec_int.back());
			gpuFree(device_vec_int.back());
			device_vec_int.pop_back();
		}