Commit 637d953e authored by Mario Morales Hernandez

Overlap tiling implemented. Not working for n>1

Overlap tiling implementation with constant bounds

The previous overlap tiling implementation used variable bounds that
extended into neighboring ranks' domains. This approach was incorrect
and violated the constraint that flux_y requires ilo >= GHOST_CELL_PADDING.

Changes:
- Use constant bounds [G, rows-G) for all substeps in both baseline
 and overlap tiling modes
- Fix flux_y default ihi from (rows-G+1) to (rows-G) for consistency
- Simplify halo exchange logic: exchange every step for baseline,
 every G steps for overlap tiling
- Clean up validation warnings for GHOST_CELL_PADDING=1

With this fix:
- GHOST_CELL_PADDING=1: Bit-for-bit identical results (m=1, no
 communication reduction)
- GHOST_CELL_PADDING>=2: Communication reduction at the cost of small
 numerical differences at MPI boundaries due to stale ghost cells

Single-process runs are always bit-for-bit identical regardless of
GHOST_CELL_PADDING value.
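
A minimal sketch of the exchange scheduling described above, assuming G stands for
GHOST_CELL_PADDING and omitting the dt computation, I/O and load-balancing logic of
the real loop (overlap_tiling and sim_duration abbreviate the arglist fields):

    // Simplified time loop: constant bounds every substep; halo exchange every
    // G substeps when overlap tiling is enabled, every substep otherwise.
    const int ilo = G, ihi = rows - G;     // constant bounds for all substeps
    int substep_count = 0;
    while (simtime < sim_duration) {
      compute_new_state(ilo, ihi);         // flux_x, flux_y, update_cells, wet_dry
      simtime += global_dt;
      ++substep_count;

      bool do_halo = !overlap_tiling || (substep_count >= G);
      if (do_halo && size > 1) {           // size = number of MPI ranks
        perform_halo_exchange();
        substep_count = 0;                 // reset after each exchange
      }
    }
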
parent 2eb26832
+5 −0
@@ -15,6 +15,11 @@ set(CMAKE_CXX_STANDARD 17) # minimum C++ standard to support Kokkos
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)

# GHOST_CELL_PADDING option
set(GHOST_CELL_PADDING 1 CACHE STRING "Number of ghost cell layers for domain decomposition")
message(STATUS "GHOST_CELL_PADDING = ${GHOST_CELL_PADDING}")
add_compile_definitions(GHOST_CELL_PADDING=${GHOST_CELL_PADDING})

message(STATUS "CMAKE_CXX_COMPILER = ${CMAKE_CXX_COMPILER}")

# check if MPI is available
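
For reference, the new cache variable can be set at configure time; a minimal example
(the build directory and the value 2 are placeholders, not taken from this commit):

    cmake -S . -B build -DGHOST_CELL_PADDING=2
    cmake --build build
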
+3 −1
@@ -30,7 +30,8 @@ namespace ConfigUtils
		time_increment_fixed,	/**< Flag to indicate time step size characteristics. True = Constant time step size, False = Variable time step size. */
		time_series_flag,	/**< Flag to allow time series output. True = Output time series, False = Don't output time series. */
		gpu_direct_flag,	/**< Flag to allow GPU-Direct use. True = Use GPU-Direct, False = Don't use GPU-Direct. */
		open_boundaries; /**<Flag to impose open or closed boundaries. By default, open_boundaries=0*/ 
		open_boundaries, /**<Flag to impose open or closed boundaries. By default, open_boundaries=0*/
		overlap_tiling;	/**< Flag to enable overlap tiling/communication-avoiding. True = Enable overlap tiling, False = Standard mode. */ 

		int
		checkpoint_id,	/**< Use for hot start. If 0 then that means a clean start. Greater than 0 means start from that specific checkpoint. */
@@ -450,6 +451,7 @@ namespace ConfigUtils
		arglist.hextra = atof((args("hextra", argmap)).c_str());
		arglist.gpu_direct_flag = atoi((args("gpu_direct_flag", argmap)).c_str());
		arglist.open_boundaries = atoi((args("open_boundaries", argmap)).c_str());
		arglist.overlap_tiling = atoi((argsd("overlap_tiling", argmap, "0")).c_str());

		arglist.num_sources = atoi((args("num_sources", argmap)).c_str());
		arglist.num_runoffs = atoi((args("num_runoffs", argmap)).c_str());
+2 −0
@@ -61,7 +61,9 @@ typedef double value_t; /**< Data type to represent floating-point number. It
#define TIME_SERIES_DIR "series"    /**< Default folder name containing time series outputs. */
#define DEFAULT_CFG "case4.cfg"    /**< Default configuration (cfg) file name. */

#ifndef GHOST_CELL_PADDING
#define GHOST_CELL_PADDING 1    /**< Number of extra rows and columns to use beside each domain border. */
#endif
#define USE_MATRIX 0    /**< Use the whole matrix when performing MPI halo exchange. */
#define USE_HALO 1    /**< Use only halo rows bundle when performing MPI halo exchange. */
#define SRC_LOCATION 0    /**< Define to use flow locations. */
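
A compile-time guard on the padding value is not part of this change; a minimal sketch
of one, assuming it would sit next to the fallback definition above:

    // Hypothetical guard (not in this commit): fail the build on an invalid value
    // instead of relying on the runtime check in the simulation driver.
    static_assert(GHOST_CELL_PADDING >= 1,
                  "GHOST_CELL_PADDING must be at least 1 ghost-cell layer");
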
+116 −99
@@ -43,6 +43,8 @@ namespace Kernels
*  @param rhsqy0 RHS array to store partial discharge in y direction contributions (east-north)
*  @param rhsqy1 RHS array to store partial discharge in y direction contributions (west-south)
*  @param hextra Minimum depth (tolerance below water is at rest)
*  @param ilo Lower row bound (inclusive). Default: GHOST_CELL_PADDING
*  @param ihi Upper row bound (exclusive). Default: nrows - GHOST_CELL_PADDING
*/
  template<typename T>
  void flux_x(int size, int nrows, int ncols, T dx, T dt,
@@ -57,17 +59,21 @@ namespace Kernels
              T       * KOKKOS_RESTRICT rhsqx1,
              T       * KOKKOS_RESTRICT rhsqy0,
              T       * KOKKOS_RESTRICT rhsqy1,
              T hextra)
              T hextra,
              int ilo = GHOST_CELL_PADDING,
              int ihi = -1)
  {
    // Default ihi to nrows - GHOST_CELL_PADDING if not provided
    if (ihi < 0) ihi = nrows - GHOST_CELL_PADDING;

    /**************************************
    /****
     *  RHS sketch
     *       _____ _____
     *       ____ ____
     *      |     |     |
     *      |    0|1    |
     *      |_____|_____|
     *      |____|____|
     *
     * ************************************/
     * ****/

    triton::parallel_for( AUTO_LABEL() , size , KOKKOS_LAMBDA (int id) {

@@ -76,8 +82,8 @@ namespace Kernels
      int rx = (ix * ncols);  //row offset

      bool
      is_top = (ix <= GHOST_CELL_PADDING-1),
      is_btm = (ix >= nrows - GHOST_CELL_PADDING),
      is_top = (ix < ilo),
      is_btm = (ix >= ihi),
      is_lt = (iy <= GHOST_CELL_PADDING-2),
      is_rt = (iy >= ncols - GHOST_CELL_PADDING);

@@ -320,6 +326,8 @@ namespace Kernels
*  @param rhsqy0 RHS array to store partial discharge in y direction contributions (east-north)
*  @param rhsqy1 RHS array to store partial discharge in y direction contributions (west-south)
*  @param hextra Minimum depth (tolerance below water is at rest)
*  @param ilo Lower row bound (inclusive). Default: GHOST_CELL_PADDING
*  @param ihi Upper row bound (exclusive). Default: nrows - GHOST_CELL_PADDING
*/
  template<typename T>
  void flux_y(int size, int nrows, int ncols, T dx, T dt,
@@ -334,20 +342,24 @@ namespace Kernels
              T       * KOKKOS_RESTRICT rhsqx1,
              T       * KOKKOS_RESTRICT rhsqy0,
              T       * KOKKOS_RESTRICT rhsqy1,
              T hextra)
              T hextra,
              int ilo = GHOST_CELL_PADDING,
              int ihi = -1)
  {
    // Default ihi to nrows - GHOST_CELL_PADDING if not provided
    if (ihi < 0) ihi = nrows - GHOST_CELL_PADDING;

    /**************************************
    /****
     *  RHS sketch
     *       _____
     *       ____
     *      |     |
     *      |     |
     *      |__1__|
     *      |  0  |
     *      |     |
     *      |_____|
     *      |____|
     *
     * ************************************/
     * ****/

    triton::parallel_for( AUTO_LABEL() , size , KOKKOS_LAMBDA (int id) {

@@ -356,8 +368,8 @@ namespace Kernels
      int rx = (ix * ncols);  //row offset

      bool
      is_top = (ix <= GHOST_CELL_PADDING-1),
      is_btm = (ix >= nrows - GHOST_CELL_PADDING+1),
      is_top = (ix < ilo),
      is_btm = (ix >= ihi),
      is_lt = (iy <= GHOST_CELL_PADDING-1),
      is_rt = (iy >= ncols - GHOST_CELL_PADDING);

@@ -593,6 +605,8 @@ namespace Kernels
*  @param rhsqy0 RHS array to store partial discharge in y direction contributions (east-north)
*  @param rhsqy1 RHS array to store partial discharge in y direction contributions (west-south)
*  @param hextra Minimum depth (tolerance below water is at rest)
*  @param ilo Lower row bound (inclusive) for overlap tiling. Default: GHOST_CELL_PADDING
*  @param ihi Upper row bound (exclusive) for overlap tiling. Default: nrows - GHOST_CELL_PADDING
*/
  template<typename T>
  void update_cells(int size, int nrows, int ncols, T dt,
@@ -607,16 +621,22 @@ namespace Kernels
                    T       * KOKKOS_RESTRICT rhsqx1,
                    T       * KOKKOS_RESTRICT rhsqy0,
                    T       * KOKKOS_RESTRICT rhsqy1,
                    T hextra)
                    T hextra,
                    int ilo = GHOST_CELL_PADDING,
                    int ihi = -1)
  {
    // Default ihi to nrows - GHOST_CELL_PADDING if not provided
    if (ihi < 0) ihi = nrows - GHOST_CELL_PADDING;

    triton::parallel_for( AUTO_LABEL() , size , KOKKOS_LAMBDA (int id) {

      int ix = (id / ncols);  //row id
      int iy = (id % ncols);  //col id

      bool
      is_top = (ix <= GHOST_CELL_PADDING-1),
      is_btm = (ix >= nrows - GHOST_CELL_PADDING),
      is_top = (ix < ilo),
      is_btm = (ix >= ihi),
      is_lt = (iy <= GHOST_CELL_PADDING-1),
      is_rt = (iy >= ncols - GHOST_CELL_PADDING);

@@ -676,6 +696,7 @@ namespace Kernels
*  @param size Array size
*  @param nrows Number of rows in that domain/subdomain
*  @param ncols Number of columns in that domain/subdomain
*  @param dt Time step size
*  @param h_arr Water depth array
*  @param qx_arr Discharge in x direction array
*  @param qy_arr Discharge in y direction array
@@ -683,6 +704,8 @@ namespace Kernels
*  @param max_h_arr Max water depth array
*  @param hextra Minimum depth (tolerance below water is at rest)
*  @param mpi_tasks Number of MPI tasks (domain decomposition)
*  @param ilo Lower row bound (inclusive) for overlap tiling. Default: GHOST_CELL_PADDING
*  @param ihi Upper row bound (exclusive) for overlap tiling. Default: nrows - GHOST_CELL_PADDING
*/
  template<typename T>
  void wet_dry(int size, int nrows, int ncols, T dt,
@@ -691,8 +714,13 @@ namespace Kernels
               T       * KOKKOS_RESTRICT qy_arr   ,
               T const * KOKKOS_RESTRICT dem      ,
               T       * KOKKOS_RESTRICT max_h_arr,
               T hextra, int mpi_tasks)
               T hextra, int mpi_tasks,
               int ilo = GHOST_CELL_PADDING,
               int ihi = -1)
  {
    // Default ihi to nrows - GHOST_CELL_PADDING if not provided
    if (ihi < 0) ihi = nrows - GHOST_CELL_PADDING;

    triton::parallel_for( AUTO_LABEL() , size , KOKKOS_LAMBDA (int id) {

      int ix = (id / ncols);  //row id
@@ -700,8 +728,8 @@ namespace Kernels
      int rx = (ix * ncols);  //row offset

      bool
      is_top = (ix <= GHOST_CELL_PADDING-1),
      is_btm = (ix >= nrows - GHOST_CELL_PADDING),
      is_top = (ix < ilo),
      is_btm = (ix >= ihi),
      is_lt = (iy <= GHOST_CELL_PADDING-1),
      is_rt = (iy >= ncols - GHOST_CELL_PADDING);

@@ -720,14 +748,14 @@ namespace Kernels
        {
          qx_arr[id] = 0.0;
        }
        if(ix==GHOST_CELL_PADDING && mpi_tasks>1){ //first real row
        if(ix==ilo && mpi_tasks>1){ //first real row
            if ((hij + zij < dem[rx + ncols + iy]) && (h_arr[rx + ncols + iy] < EPS12))
            {
              qy_arr[id] = 0.0;
            }

        }else{
          if(ix==nrows - GHOST_CELL_PADDING - 1 && mpi_tasks>1){ //last real row
          if(ix==ihi-1 && mpi_tasks>1){ //last real row
            if ((hij + zij < dem[rx - ncols + iy]) && (h_arr[rx - ncols + iy] < EPS12))                    {
              qy_arr[id] = 0.0;
            }
@@ -816,31 +844,21 @@ namespace Kernels
  {
    triton::parallel_for( AUTO_LABEL() , size , KOKKOS_LAMBDA (int id) {

      // After MPI exchange, the halo array contains:
      // - Rows 0 to 3N-1: received top halo (from prev rank's bottom interior edge)
      // - Rows 3N to 9N-1: old data (not updated by exchange)
      // - Rows 9N to 12N-1: received bottom halo (from next rank's top interior edge)
      //
      // We should ONLY copy received data to ghost cells in the main arrays.
      // Do NOT overwrite interior edge cells with old data!

      int index =  id + 2*ncols*GHOST_CELL_PADDING;
      if(id < ncols*GHOST_CELL_PADDING)
      {
        // First half: copy received top halo to top ghost cells only
        int index = id;
        index = id;
      }
      
      h_arr[id]  = halo[index + 0*GHOST_CELL_PADDING*ncols];
      qx_arr[id] = halo[index + 1*GHOST_CELL_PADDING*ncols];
      qy_arr[id] = halo[index + 2*GHOST_CELL_PADDING*ncols];

        // Copy received bottom halo to bottom ghost cells
        // Bottom ghost cells are at rows (nrows-N) to (nrows-1)
        int temp = id + (nrows - GHOST_CELL_PADDING)*ncols;
        h_arr[temp]  = halo[index + 9*GHOST_CELL_PADDING*ncols];
        qx_arr[temp] = halo[index + 10*GHOST_CELL_PADDING*ncols];
        qy_arr[temp] = halo[index + 11*GHOST_CELL_PADDING*ncols];
      }
      // Second half (id >= N*ncols) was incorrectly copying old interior edge data
      // to interior edge positions. This is now skipped.
      int temp = id + (nrows - 2*GHOST_CELL_PADDING)*ncols;

      h_arr[temp]  = halo[index + 6*GHOST_CELL_PADDING*ncols];
      qx_arr[temp] = halo[index + 7*GHOST_CELL_PADDING*ncols];
      qy_arr[temp] = halo[index + 8*GHOST_CELL_PADDING*ncols];

    });
  }
@@ -1115,10 +1133,10 @@ namespace Kernels
      int iy = (ii % ncols);  //col id

      bool
      is_top = (ix == GHOST_CELL_PADDING),
      is_btm = (ix == nrows - GHOST_CELL_PADDING - 1),
      is_lt = (iy == GHOST_CELL_PADDING),
      is_rt = (iy == ncols - GHOST_CELL_PADDING - 1);
      is_top = (ix == 1),
      is_btm = (ix == nrows - 2),
      is_lt = (iy == 1),
      is_rt = (iy == ncols - 2);
			
			//if (!(rank == 0 && is_top) || !is_lt || !is_rt || !(rank == total_process - 1 && is_btm))
		if (!((rank == 0 && is_top) || is_lt || is_rt || (rank == total_process - 1 && is_btm)))
@@ -1388,4 +1406,3 @@ namespace Kernels
}

#endif
 No newline at end of file
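
The kernels above now take optional row bounds with a -1 sentinel for ihi; a minimal,
self-contained illustration of the pattern (the names below are illustrative, not the
real kernel signatures):

    #include <vector>

    constexpr int G = 1;  // stands in for GHOST_CELL_PADDING

    // Illustrative only: mirrors the ilo/ihi sentinel-default pattern of the kernels.
    template <typename T>
    void apply_interior(int nrows, int ncols, std::vector<T>& field,
                        int ilo = G, int ihi = -1)
    {
      if (ihi < 0) ihi = nrows - G;             // default: full interior in rows
      for (int ix = ilo; ix < ihi; ++ix)        // rows restricted to [ilo, ihi)
        for (int iy = G; iy < ncols - G; ++iy)  // columns always exclude ghost cells
          field[ix * ncols + iy] += T(0);       // placeholder update
    }

    // apply_interior(rows, cols, h);                // whole interior (defaults)
    // apply_interior(rows, cols, h, G, rows - G);   // the constant bounds used here
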
+233 −212
@@ -191,8 +191,15 @@ namespace Triton
    
/** @brief This function is used to compute next state. All main computation is done inside this function.
*
*  @param ilo Lower row bound (inclusive) for overlap tiling
*  @param ihi Upper row bound (exclusive) for overlap tiling
*/
    void compute_new_state();
    void compute_new_state(int ilo, int ihi);

/** @brief This function performs the halo exchange after a tile of substeps completes.
*
*/
    void perform_halo_exchange();

/** @brief This function is used to get observation data to host. 
*
@@ -1979,10 +1986,8 @@ namespace Triton

  MPI_Barrier(ENSIFY_COMM_WORLD);


  st.start(SIMULATION_TIME);

		
  T xll = dem.get_xll_corner();
  T yll = dem.get_yll_corner();
  T cellsize = dem.get_cell_size();
@@ -1997,7 +2002,6 @@ namespace Triton
    out.write_observation_data(host_obs_h, host_obs_qx, host_obs_qy, simtime, arglist.print_option);
  }


  global_dt = arglist.time_step;
  average_dt = 0.0;
  local_dt = arglist.time_step;
@@ -2011,44 +2015,63 @@ namespace Triton
    print_id=simtime/arglist.print_interval;
  }

  // Validation for overlap tiling
  if (arglist.overlap_tiling && GHOST_CELL_PADDING < 1) {
    std::cerr << ERROR "overlap_tiling requires GHOST_CELL_PADDING >= 1" << std::endl;
    exit(EXIT_FAILURE);
  }

  if (arglist.overlap_tiling && GHOST_CELL_PADDING == 1 && rank == 0) {
    std::cerr << WARN "overlap_tiling with GHOST_CELL_PADDING=1: m=1 (no communication reduction)" << std::endl;
  }

  // Domain bounds (constant for all substeps)
  const int ilo = GHOST_CELL_PADDING;
  const int ihi = rows - GHOST_CELL_PADDING;

  // Substep counter for halo exchange scheduling
  int substep_count = 0;

  while (simtime < arglist.sim_duration)
  {
      it_count++;
      it_count_average++;

    // Compute dt EVERY substep for b4b correctness
    if (!arglist.time_increment_fixed)
    {
      compute_local_dt();
      compute_global_dt(print_id);
    }
      compute_new_state();

    it_count++;
    it_count_average++;

    // Call unified compute_new_state with CONSTANT bounds
    compute_new_state(ilo, ihi);

    simtime += global_dt;
    average_dt += global_dt;
    substep_count++;

	if(it_count%arglist.it_print == 0){
		if(rank==0){
			std::cerr << INFO " Time: " << simtime << "\tdt: " << average_dt/it_count_average << "\tit: " << it_count << std::endl;
    // Status print
    if (it_count % arglist.it_print == 0 && rank == 0) {
      std::cerr << INFO " Time: " << simtime << "\tdt: " << average_dt/it_count_average
                << "\tit: " << it_count << std::endl;
      it_count_average = 0;
      average_dt = 0.0;
    }
	}


    // Observation output
    if (arglist.time_series_flag && simtime - last_print_obs_time >= arglist.print_observation) {
    	// variable to print observation data files  
      last_print_obs_time += arglist.print_observation;
		//transfer only the observation data points to the host
      st.start(COMPUTE_TIME);
      get_observation_data_to_host();
      st.stop(COMPUTE_TIME);

      st.start(IO_TIME);
      out.write_observation_data(host_obs_h, host_obs_qx, host_obs_qy, simtime, arglist.print_option);
		//out.write_observation_data(host_vec[OBSH], host_vec[OBSQX], host_vec[OBSQY], simtime, arglist.print_option);
      st.stop(IO_TIME);
    }

    // File output
    if (simtime >= arglist.print_interval * (print_id + 1))
    {
      print_id++;
@@ -2088,13 +2111,18 @@ namespace Triton
        new_domain_decomposition();
        st.stop(RESIZE_TIME);
      }

    }

    // Halo exchange: every step for baseline, every GHOST_CELL_PADDING steps for overlap tiling
    bool do_halo = !arglist.overlap_tiling || (substep_count >= GHOST_CELL_PADDING);
    if (do_halo && size > 1) {
      perform_halo_exchange();
      substep_count = 0;  // Reset counter after halo exchange
    }
  }
    //write the last state in the observation data files

  // Final observation write
  if (arglist.time_series_flag) {  
	//transfer only the observation data points to the host
    st.start(COMPUTE_TIME);
    get_observation_data_to_host();
    st.stop(COMPUTE_TIME);
@@ -2104,7 +2132,6 @@ namespace Triton
    st.stop(IO_TIME);
  }


  st.stop(SIMULATION_TIME);
  st.stop(TOTAL_TIME);
  
@@ -2114,8 +2141,6 @@ namespace Triton
    std::cerr << INFO " Time: " << simtime << "\tdt: " << average_dt/it_count_average << "\tit: " << it_count << std::endl;
    std::cerr << OK "Simulation ends" << std::endl;
  }


}


@@ -2187,21 +2212,25 @@ namespace Triton


  template<typename T>
  void triton<T>::compute_new_state()
void triton<T>::compute_new_state(int ilo, int ihi)
{
  st.start(COMPUTE_TIME);

  Kernels::flux_x(rows*cols, rows, cols, cell_size, global_dt,
  device_vec[H], device_vec[QX], device_vec[QY], device_vec[DEM], device_vec[SQRTH],
    device_vec[RHSH0], device_vec[RHSH1], device_vec[RHSQX0], device_vec[RHSQX1], device_vec[RHSQY0], device_vec[RHSQY1], arglist.hextra);
  device_vec[RHSH0], device_vec[RHSH1], device_vec[RHSQX0], device_vec[RHSQX1], device_vec[RHSQY0], device_vec[RHSQY1],
  arglist.hextra, ilo, ihi);

  // Use same bounds for flux_y as flux_x
  Kernels::flux_y(rows*cols, rows, cols, cell_size, global_dt,
  device_vec[H], device_vec[QX], device_vec[QY], device_vec[DEM], device_vec[SQRTH],
    device_vec[RHSH0], device_vec[RHSH1], device_vec[RHSQX0], device_vec[RHSQX1], device_vec[RHSQY0], device_vec[RHSQY1], arglist.hextra);
  device_vec[RHSH0], device_vec[RHSH1], device_vec[RHSQX0], device_vec[RHSQX1], device_vec[RHSQY0], device_vec[RHSQY1],
  arglist.hextra, ilo, ihi);

  Kernels::update_cells(rows*cols, rows, cols, global_dt,
  device_vec[H], device_vec[QX], device_vec[QY], device_vec[DEM], device_vec[NMAN],
    device_vec[RHSH0], device_vec[RHSH1], device_vec[RHSQX0], device_vec[RHSQX1], device_vec[RHSQY0], device_vec[RHSQY1], arglist.hextra);
  device_vec[RHSH0], device_vec[RHSH1], device_vec[RHSQX0], device_vec[RHSQX1], device_vec[RHSQY0], device_vec[RHSQY1],
  arglist.hextra, ilo, ihi);

  if (arglist.num_runoffs > 0)
  {
@@ -2244,46 +2273,41 @@ namespace Triton
    cell_size, global_dt, simtime, idx_low, idx_high, device_vec[H], device_vec_int[SRCP]);
  }


  if(arglist.open_boundaries){

    Kernels::copy_info_to_exterior_boundaries_west_east(2*rows*GHOST_CELL_PADDING, rows, cols, device_vec[H], device_vec[QX], device_vec[QY]);
    Kernels::copy_info_to_exterior_boundaries_north_south(cols*GHOST_CELL_PADDING, rows, cols, device_vec[H], device_vec[QX], device_vec[QY],rank, size);

  }

  if (num_of_extbc > 0 && num_extbc_cells > 0)
  {
    Kernels::compute_extbc_values(num_extbc_cells, rows, cols, global_dt, device_vec[H], device_vec[QX], device_vec[QY], device_vec[DEM], device_vec[NMAN], device_vec_int[BCRELATIVEINDEX], device_vec_int[BCTYPE], device_vec_int[BCINDEXSTART], device_vec_int[BCNROWSVARS], device_vec[EXTBCV1], device_vec[EXTBCV2], simtime, rank, size);

  }

    Kernels::wet_dry(rows*cols, rows, cols, global_dt, device_vec[H], device_vec[QX], device_vec[QY], device_vec[DEM], device_vec[MAXH], arglist.hextra,size);
  Kernels::wet_dry(rows*cols, rows, cols, global_dt, device_vec[H], device_vec[QX], device_vec[QY], device_vec[DEM], device_vec[MAXH], arglist.hextra, size, ilo, ihi);

  st.stop(COMPUTE_TIME);
}


    if (size > 1)
template<typename T>
void triton<T>::perform_halo_exchange()
{
      Kernels::halo_copy_from_gpu(2 * cols*GHOST_CELL_PADDING, rows, cols, device_vec[H], device_vec[QX], device_vec[QY], device_vec[HALO]);
  if (size <= 1) return;

      if (arglist.gpu_direct_flag)
      {
  Kernels::halo_copy_from_gpu(2 * cols*GHOST_CELL_PADDING, rows, cols,
      device_vec[H], device_vec[QX], device_vec[QY], device_vec[HALO]);

  if (arglist.gpu_direct_flag) {
    gpuStreamSynchronize(streams);
        st.stop(COMPUTE_TIME);

    st.start(BALANCING_MPI_TIME);
    st.start(MPI_TIME);
    MpiUtils::exchange(device_vec[HALO], 12*GHOST_CELL_PADDING, cols, rank, size, USE_HALO);
    st.stop(MPI_TIME);
    st.stop(BALANCING_MPI_TIME);

        st.start(COMPUTE_TIME);
      }
      else
      {
  } else {
    gpuMemcpyAsync(host_vec[HALO], device_vec[HALO], nbytes_halo, gpuMemcpyDeviceToHost, streams);
    gpuStreamSynchronize(streams);
        st.stop(COMPUTE_TIME);

    st.start(BALANCING_MPI_TIME);
    st.start(MPI_TIME);
@@ -2291,20 +2315,17 @@ namespace Triton
    st.stop(MPI_TIME);
    st.stop(BALANCING_MPI_TIME);

        st.start(COMPUTE_TIME);
    gpuMemcpyAsync(device_vec[HALO], host_vec[HALO], nbytes_halo, gpuMemcpyHostToDevice, streams);
  }

      Kernels::halo_copy_to_gpu(2 * cols*GHOST_CELL_PADDING, rows, cols, device_vec[H], device_vec[QX], device_vec[QY], device_vec[HALO]);

      Kernels::wet_dry_qy_halo(2 * cols*GHOST_CELL_PADDING, rows, cols, device_vec[H], device_vec[QY], device_vec[DEM], arglist.hextra);
  Kernels::halo_copy_to_gpu(2 * cols*GHOST_CELL_PADDING, rows, cols,
      device_vec[H], device_vec[QX], device_vec[QY], device_vec[HALO]);

  Kernels::wet_dry_qy_halo(2 * cols*GHOST_CELL_PADDING, rows, cols,
      device_vec[H], device_vec[QY], device_vec[DEM], arglist.hextra);
}


    st.stop(COMPUTE_TIME);
  }

	template<typename T>
	void triton<T>::get_observation_data_to_host()
	{