b4b for np=2 (for a tiny problem) (522f55aa) · Commits · HYDRO / TRITON

src/kernels.h

+34 −22

Original line number	Diff line number	Diff line
		@@ -63,8 +63,6 @@ namespace Kernels
		int ilo = GHOST_CELL_PADDING,
		int ihi = -1)
		{
		// Default ihi to nrows - GHOST_CELL_PADDING if not provided
		if (ihi < 0) ihi = nrows - GHOST_CELL_PADDING;

		/****
		* RHS sketch
		@@ -82,7 +80,7 @@ namespace Kernels
		int rx = (ix * ncols); //row offset

		bool
		is_top = (ix < ilo),
		is_top = (ix <= ilo),
		is_btm = (ix >= ihi),
		is_lt = (iy <= GHOST_CELL_PADDING-2),
		is_rt = (iy >= ncols - GHOST_CELL_PADDING);
		@@ -346,9 +344,6 @@ namespace Kernels
		int ilo = GHOST_CELL_PADDING,
		int ihi = -1)
		{
		if (ihi < 0) ihi = nrows - GHOST_CELL_PADDING;
		// flux_y needs one extra row at bottom (computes N edge = south of row above)
		int ihi_flux_y = ihi + 1;

		/****
		* RHS sketch
		@@ -369,8 +364,8 @@ namespace Kernels
		int rx = (ix * ncols); //row offset

		bool
		is_top = (ix < ilo),
		is_btm = (ix >= ihi_flux_y),
		is_top = (ix <= ilo),
		is_btm = (ix >= ihi+1),
		is_lt = (iy <= GHOST_CELL_PADDING-1),
		is_rt = (iy >= ncols - GHOST_CELL_PADDING);

		@@ -626,15 +621,13 @@ namespace Kernels
		int ilo = GHOST_CELL_PADDING,
		int ihi = -1)
		{
		if (ihi < 0) ihi = nrows - GHOST_CELL_PADDING;

		triton::parallel_for( AUTO_LABEL() , size , KOKKOS_LAMBDA (int id) {

		int ix = (id / ncols); //row id
		int iy = (id % ncols); //col id

		bool
		is_top = (ix < ilo),
		is_top = (ix <= ilo),
		is_btm = (ix >= ihi),
		is_lt = (iy <= GHOST_CELL_PADDING-1),
		is_rt = (iy >= ncols - GHOST_CELL_PADDING);
		@@ -717,9 +710,6 @@ namespace Kernels
		int ilo = GHOST_CELL_PADDING,
		int ihi = -1)
		{
		// Default ihi to nrows - GHOST_CELL_PADDING if not provided
		if (ihi < 0) ihi = nrows - GHOST_CELL_PADDING;

		triton::parallel_for( AUTO_LABEL() , size , KOKKOS_LAMBDA (int id) {

		int ix = (id / ncols); //row id
		@@ -727,7 +717,7 @@ namespace Kernels
		int rx = (ix * ncols); //row offset

		bool
		is_top = (ix < ilo),
		is_top = (ix <= ilo),
		is_btm = (ix >= ihi),
		is_lt = (iy <= GHOST_CELL_PADDING-1),
		is_rt = (iy >= ncols - GHOST_CELL_PADDING);
		@@ -754,7 +744,7 @@ namespace Kernels
		}

		}else{
		if(ix==ihi-1 && mpi_tasks>1){ //last real row
		if(ix==ihi && mpi_tasks>1){ //last real row
		if ((hij + zij < dem[rx - ncols + iy]) && (h_arr[rx - ncols + iy] < EPS12)) {
		qy_arr[id] = 0.0;
		}
		@@ -1031,16 +1021,38 @@ namespace Kernels
		* @param hextra Minimum depth (tolerance below water is at rest)
		*/
		template<typename T>
		void compute_dt_and_sqrt(int size, T dx,
		void compute_dt_and_sqrt(int nrows, int ncols, T dx,
		T const * KOKKOS_RESTRICT input_qx ,
		T const * KOKKOS_RESTRICT input_qy ,
		T const * KOKKOS_RESTRICT input_h ,
		T * KOKKOS_RESTRICT input_sqrth,
		T * KOKKOS_RESTRICT output ,
		T cn, T hextra)
		T cn, T hextra,
		int ilo = GHOST_CELL_PADDING,
		int ihi = -1)
		{

		int size = nrows * ncols;

		triton::parallel_for( AUTO_LABEL() , size , KOKKOS_LAMBDA (int id) {
		int ix = (id / ncols); //row id
		int iy = (id % ncols); //col id

		bool

		is_top = (ix <= ilo),
		is_btm = (ix >= ihi),
		is_lt = (iy <= GHOST_CELL_PADDING-1),
		is_rt = (iy >= ncols - GHOST_CELL_PADDING);


		// Set all cells to MAX_VALUE first (excluded cells won't be computed)
		output[id] = MAX_VALUE;

		if (is_top \|\| is_lt \|\| is_rt \|\| is_btm) //exclude halo cells
		{
		return;
		}
		T hij = input_h[id];
		T sqrthij = FMAX(sqrt(hij),0.0);
		input_sqrth[id] = sqrthij;
		@@ -1132,10 +1144,10 @@ namespace Kernels
		int iy = (ii % ncols); //col id

		bool
		is_top = (ix == 1),
		is_btm = (ix == nrows - 2),
		is_lt = (iy == 1),
		is_rt = (iy == ncols - 2);
		is_top = (ix == GHOST_CELL_PADDING),
		is_btm = (ix == nrows - GHOST_CELL_PADDING - 1),
		is_lt = (iy == GHOST_CELL_PADDING),
		is_rt = (iy == ncols - GHOST_CELL_PADDING - 1);

		//if (!(rank == 0 && is_top) \|\| !is_lt \|\| !is_rt \|\| !(rank == total_process - 1 && is_btm))
		if (!((rank == 0 && is_top) \|\| is_lt \|\| is_rt \|\| (rank == total_process - 1 && is_btm)))

src/triton.h

+33 −34

Original line number	Diff line number	Diff line
		@@ -179,7 +179,7 @@ namespace Triton
		/** @brief This function is used to calculate minimum time step size for each sub domain.
		*
		*/
		void compute_local_dt();
		void compute_local_dt(int ilo = GHOST_CELL_PADDING, int ihi = -1);


		/** @brief This function is used to calculate minimum time step size between all sub domain.
		@@ -192,7 +192,9 @@ namespace Triton
		/** @brief This function is used to compute next state. All main computation is done inside this function.
		*
		* @param ilo Lower row bound (inclusive) for overlap tiling
		* @param ihi Upper row bound (exclusive) for overlap tiling
		* @param ihi Upper row bound (exclusive) for state update
		* @param ilo_flux Lower row bound (inclusive) for flux computation (default: same as ilo)
		* @param ihi_flux Upper row bound (exclusive) for flux computation (default: same as ihi)
		*/
		void compute_new_state(int ilo, int ihi);

		@@ -2034,39 +2036,49 @@ void triton<T>::simulate()

		while (simtime < arglist.sim_duration)
		{
		// Compute dt EVERY substep for b4b correctness
		if (!arglist.time_increment_fixed)
		{
		compute_local_dt();
		compute_global_dt(print_id);
		}

		it_count++;
		it_count_average++;
		substep_count++;
		total_iterations++;

		// Determine bounds based on substep
		// Each substep "consumes" one layer of valid halo data from boundaries inward
		// Substep 1: full domain, Substep 2: shrink by 1 row, etc.
		// Only shrink on MPI-facing boundaries (where we have neighbors)
		int ilo = GHOST_CELL_PADDING;
		// Determine bounds for computation
		// For overlap tiling:
		// - At MPI boundaries: extend to G-1 (1 row into ghost)
		// - At physical boundaries: use G (interior only)
		// - Shrink in later substeps to avoid stale ghost data
		// Note: Physical boundary bounds differ between G=1 and G=2 because
		// the ghost cell structure is different. This is expected.
		int ilo = GHOST_CELL_PADDING-1;
		int ihi = rows - GHOST_CELL_PADDING;

		if (arglist.overlap_tiling && substep_count > 1 && size > 1) {
		int shrink = substep_count - 1;
		// Shrink ilo only if we have a lower neighbor (rank > 0)
		if (arglist.overlap_tiling && size > 1) {
		// Extend bounds by 1 row at MPI boundaries only
		if (rank > 0) {
		ilo = GHOST_CELL_PADDING + shrink;
		ilo = substep_count-2;
		}
		// Shrink ihi only if we have an upper neighbor (rank < size - 1)
		if (rank < size - 1) {
		ihi = rows - GHOST_CELL_PADDING - shrink;
		ihi = rows - substep_count + 1;
		}
		}

		// Compute dt EVERY substep for b4b correctness
		if (!arglist.time_increment_fixed)
		{
		compute_local_dt(ilo,ihi);
		compute_global_dt(print_id);
		}

		compute_new_state(ilo, ihi);

		// Halo exchange: every step for baseline, every GHOST_CELL_PADDING steps for overlap tiling
		bool do_halo_exchange = !arglist.overlap_tiling \|\| (substep_count >= size);
		if (do_halo_exchange && size > 1) {
		perform_halo_exchange();
		substep_count = 0;
		total_halo_exchanges++;
		}

		simtime += global_dt;
		average_dt += global_dt;

		@@ -2132,19 +2144,6 @@ void triton<T>::simulate()
		}
		}

		// Halo exchange: every step for baseline, every GHOST_CELL_PADDING steps for overlap tiling
		bool do_halo_exchange = !arglist.overlap_tiling \|\| (substep_count >= GHOST_CELL_PADDING);
		if (do_halo_exchange && size > 1) {
		perform_halo_exchange();
		substep_count = 0;
		total_halo_exchanges++;
		} else if (size > 1) {
		// wet_dry_qy must be called every iteration when size > 1
		st.start(COMPUTE_TIME);
		Kernels::wet_dry_qy(2 * cols*GHOST_CELL_PADDING, rows, cols,
		device_vec[H], device_vec[QY], device_vec[DEM], arglist.hextra);
		st.stop(COMPUTE_TIME);
		}
		}

		// Final observation write
		@@ -2209,10 +2208,10 @@ void triton<T>::simulate()


		template<typename T>
		void triton<T>::compute_local_dt()
		void triton<T>::compute_local_dt(int ilo, int ihi)
		{
		st.start(COMPUTE_TIME);
		Kernels::compute_dt_and_sqrt(rows*cols, cell_size, device_vec[QX], device_vec[QY], device_vec[H], device_vec[SQRTH], device_vec[DT], arglist.courant, arglist.hextra);
		Kernels::compute_dt_and_sqrt(rows, cols, cell_size, device_vec[QX], device_vec[QY], device_vec[H], device_vec[SQRTH], device_vec[DT], arglist.courant, arglist.hextra, ilo, ihi);
		local_dt = Kernels::find_min_dt(rows*cols, device_vec[DT]);
		gpuStreamSynchronize(streams);
		st.stop(COMPUTE_TIME);