Loading src/kernels.h +34 −22 Original line number Diff line number Diff line Loading @@ -63,8 +63,6 @@ namespace Kernels int ilo = GHOST_CELL_PADDING, int ihi = -1) { // Default ihi to nrows - GHOST_CELL_PADDING if not provided if (ihi < 0) ihi = nrows - GHOST_CELL_PADDING; /**** * RHS sketch Loading @@ -82,7 +80,7 @@ namespace Kernels int rx = (ix * ncols); //row offset bool is_top = (ix < ilo), is_top = (ix <= ilo), is_btm = (ix >= ihi), is_lt = (iy <= GHOST_CELL_PADDING-2), is_rt = (iy >= ncols - GHOST_CELL_PADDING); Loading Loading @@ -346,9 +344,6 @@ namespace Kernels int ilo = GHOST_CELL_PADDING, int ihi = -1) { if (ihi < 0) ihi = nrows - GHOST_CELL_PADDING; // flux_y needs one extra row at bottom (computes N edge = south of row above) int ihi_flux_y = ihi + 1; /**** * RHS sketch Loading @@ -369,8 +364,8 @@ namespace Kernels int rx = (ix * ncols); //row offset bool is_top = (ix < ilo), is_btm = (ix >= ihi_flux_y), is_top = (ix <= ilo), is_btm = (ix >= ihi+1), is_lt = (iy <= GHOST_CELL_PADDING-1), is_rt = (iy >= ncols - GHOST_CELL_PADDING); Loading Loading @@ -626,15 +621,13 @@ namespace Kernels int ilo = GHOST_CELL_PADDING, int ihi = -1) { if (ihi < 0) ihi = nrows - GHOST_CELL_PADDING; triton::parallel_for( AUTO_LABEL() , size , KOKKOS_LAMBDA (int id) { int ix = (id / ncols); //row id int iy = (id % ncols); //col id bool is_top = (ix < ilo), is_top = (ix <= ilo), is_btm = (ix >= ihi), is_lt = (iy <= GHOST_CELL_PADDING-1), is_rt = (iy >= ncols - GHOST_CELL_PADDING); Loading Loading @@ -717,9 +710,6 @@ namespace Kernels int ilo = GHOST_CELL_PADDING, int ihi = -1) { // Default ihi to nrows - GHOST_CELL_PADDING if not provided if (ihi < 0) ihi = nrows - GHOST_CELL_PADDING; triton::parallel_for( AUTO_LABEL() , size , KOKKOS_LAMBDA (int id) { int ix = (id / ncols); //row id Loading @@ -727,7 +717,7 @@ namespace Kernels int rx = (ix * ncols); //row offset bool is_top = (ix < ilo), is_top = (ix <= ilo), is_btm = (ix >= ihi), is_lt = (iy <= GHOST_CELL_PADDING-1), is_rt = (iy >= ncols - GHOST_CELL_PADDING); Loading @@ -754,7 +744,7 @@ namespace Kernels } }else{ if(ix==ihi-1 && mpi_tasks>1){ //last real row if(ix==ihi && mpi_tasks>1){ //last real row if ((hij + zij < dem[rx - ncols + iy]) && (h_arr[rx - ncols + iy] < EPS12)) { qy_arr[id] = 0.0; } Loading Loading @@ -1031,16 +1021,38 @@ namespace Kernels * @param hextra Minimum depth (tolerance below water is at rest) */ template<typename T> void compute_dt_and_sqrt(int size, T dx, void compute_dt_and_sqrt(int nrows, int ncols, T dx, T const * KOKKOS_RESTRICT input_qx , T const * KOKKOS_RESTRICT input_qy , T const * KOKKOS_RESTRICT input_h , T * KOKKOS_RESTRICT input_sqrth, T * KOKKOS_RESTRICT output , T cn, T hextra) T cn, T hextra, int ilo = GHOST_CELL_PADDING, int ihi = -1) { int size = nrows * ncols; triton::parallel_for( AUTO_LABEL() , size , KOKKOS_LAMBDA (int id) { int ix = (id / ncols); //row id int iy = (id % ncols); //col id bool is_top = (ix <= ilo), is_btm = (ix >= ihi), is_lt = (iy <= GHOST_CELL_PADDING-1), is_rt = (iy >= ncols - GHOST_CELL_PADDING); // Set all cells to MAX_VALUE first (excluded cells won't be computed) output[id] = MAX_VALUE; if (is_top || is_lt || is_rt || is_btm) //exclude halo cells { return; } T hij = input_h[id]; T sqrthij = FMAX(sqrt(hij),0.0); input_sqrth[id] = sqrthij; Loading Loading @@ -1132,10 +1144,10 @@ namespace Kernels int iy = (ii % ncols); //col id bool is_top = (ix == 1), is_btm = (ix == nrows - 2), is_lt = (iy == 1), is_rt = (iy == ncols - 2); is_top = (ix == GHOST_CELL_PADDING), is_btm = (ix == nrows - GHOST_CELL_PADDING - 1), is_lt = (iy == GHOST_CELL_PADDING), is_rt = (iy == ncols - GHOST_CELL_PADDING - 1); //if (!(rank == 0 && is_top) || !is_lt || !is_rt || !(rank == total_process - 1 && is_btm)) if (!((rank == 0 && is_top) || is_lt || is_rt || (rank == total_process - 1 && is_btm))) Loading src/triton.h +33 −34 Original line number Diff line number Diff line Loading @@ -179,7 +179,7 @@ namespace Triton /** @brief This function is used to calculate minimum time step size for each sub domain. * */ void compute_local_dt(); void compute_local_dt(int ilo = GHOST_CELL_PADDING, int ihi = -1); /** @brief This function is used to calculate minimum time step size between all sub domain. Loading @@ -192,7 +192,9 @@ namespace Triton /** @brief This function is used to compute next state. All main computation is done inside this function. * * @param ilo Lower row bound (inclusive) for overlap tiling * @param ihi Upper row bound (exclusive) for overlap tiling * @param ihi Upper row bound (exclusive) for state update * @param ilo_flux Lower row bound (inclusive) for flux computation (default: same as ilo) * @param ihi_flux Upper row bound (exclusive) for flux computation (default: same as ihi) */ void compute_new_state(int ilo, int ihi); Loading Loading @@ -2034,39 +2036,49 @@ void triton<T>::simulate() while (simtime < arglist.sim_duration) { // Compute dt EVERY substep for b4b correctness if (!arglist.time_increment_fixed) { compute_local_dt(); compute_global_dt(print_id); } it_count++; it_count_average++; substep_count++; total_iterations++; // Determine bounds based on substep // Each substep "consumes" one layer of valid halo data from boundaries inward // Substep 1: full domain, Substep 2: shrink by 1 row, etc. // Only shrink on MPI-facing boundaries (where we have neighbors) int ilo = GHOST_CELL_PADDING; // Determine bounds for computation // For overlap tiling: // - At MPI boundaries: extend to G-1 (1 row into ghost) // - At physical boundaries: use G (interior only) // - Shrink in later substeps to avoid stale ghost data // Note: Physical boundary bounds differ between G=1 and G=2 because // the ghost cell structure is different. This is expected. int ilo = GHOST_CELL_PADDING-1; int ihi = rows - GHOST_CELL_PADDING; if (arglist.overlap_tiling && substep_count > 1 && size > 1) { int shrink = substep_count - 1; // Shrink ilo only if we have a lower neighbor (rank > 0) if (arglist.overlap_tiling && size > 1) { // Extend bounds by 1 row at MPI boundaries only if (rank > 0) { ilo = GHOST_CELL_PADDING + shrink; ilo = substep_count-2; } // Shrink ihi only if we have an upper neighbor (rank < size - 1) if (rank < size - 1) { ihi = rows - GHOST_CELL_PADDING - shrink; ihi = rows - substep_count + 1; } } // Compute dt EVERY substep for b4b correctness if (!arglist.time_increment_fixed) { compute_local_dt(ilo,ihi); compute_global_dt(print_id); } compute_new_state(ilo, ihi); // Halo exchange: every step for baseline, every GHOST_CELL_PADDING steps for overlap tiling bool do_halo_exchange = !arglist.overlap_tiling || (substep_count >= size); if (do_halo_exchange && size > 1) { perform_halo_exchange(); substep_count = 0; total_halo_exchanges++; } simtime += global_dt; average_dt += global_dt; Loading Loading @@ -2132,19 +2144,6 @@ void triton<T>::simulate() } } // Halo exchange: every step for baseline, every GHOST_CELL_PADDING steps for overlap tiling bool do_halo_exchange = !arglist.overlap_tiling || (substep_count >= GHOST_CELL_PADDING); if (do_halo_exchange && size > 1) { perform_halo_exchange(); substep_count = 0; total_halo_exchanges++; } else if (size > 1) { // wet_dry_qy must be called every iteration when size > 1 st.start(COMPUTE_TIME); Kernels::wet_dry_qy(2 * cols*GHOST_CELL_PADDING, rows, cols, device_vec[H], device_vec[QY], device_vec[DEM], arglist.hextra); st.stop(COMPUTE_TIME); } } // Final observation write Loading Loading @@ -2209,10 +2208,10 @@ void triton<T>::simulate() template<typename T> void triton<T>::compute_local_dt() void triton<T>::compute_local_dt(int ilo, int ihi) { st.start(COMPUTE_TIME); Kernels::compute_dt_and_sqrt(rows*cols, cell_size, device_vec[QX], device_vec[QY], device_vec[H], device_vec[SQRTH], device_vec[DT], arglist.courant, arglist.hextra); Kernels::compute_dt_and_sqrt(rows, cols, cell_size, device_vec[QX], device_vec[QY], device_vec[H], device_vec[SQRTH], device_vec[DT], arglist.courant, arglist.hextra, ilo, ihi); local_dt = Kernels::find_min_dt(rows*cols, device_vec[DT]); gpuStreamSynchronize(streams); st.stop(COMPUTE_TIME); Loading Loading
src/kernels.h +34 −22 Original line number Diff line number Diff line Loading @@ -63,8 +63,6 @@ namespace Kernels int ilo = GHOST_CELL_PADDING, int ihi = -1) { // Default ihi to nrows - GHOST_CELL_PADDING if not provided if (ihi < 0) ihi = nrows - GHOST_CELL_PADDING; /**** * RHS sketch Loading @@ -82,7 +80,7 @@ namespace Kernels int rx = (ix * ncols); //row offset bool is_top = (ix < ilo), is_top = (ix <= ilo), is_btm = (ix >= ihi), is_lt = (iy <= GHOST_CELL_PADDING-2), is_rt = (iy >= ncols - GHOST_CELL_PADDING); Loading Loading @@ -346,9 +344,6 @@ namespace Kernels int ilo = GHOST_CELL_PADDING, int ihi = -1) { if (ihi < 0) ihi = nrows - GHOST_CELL_PADDING; // flux_y needs one extra row at bottom (computes N edge = south of row above) int ihi_flux_y = ihi + 1; /**** * RHS sketch Loading @@ -369,8 +364,8 @@ namespace Kernels int rx = (ix * ncols); //row offset bool is_top = (ix < ilo), is_btm = (ix >= ihi_flux_y), is_top = (ix <= ilo), is_btm = (ix >= ihi+1), is_lt = (iy <= GHOST_CELL_PADDING-1), is_rt = (iy >= ncols - GHOST_CELL_PADDING); Loading Loading @@ -626,15 +621,13 @@ namespace Kernels int ilo = GHOST_CELL_PADDING, int ihi = -1) { if (ihi < 0) ihi = nrows - GHOST_CELL_PADDING; triton::parallel_for( AUTO_LABEL() , size , KOKKOS_LAMBDA (int id) { int ix = (id / ncols); //row id int iy = (id % ncols); //col id bool is_top = (ix < ilo), is_top = (ix <= ilo), is_btm = (ix >= ihi), is_lt = (iy <= GHOST_CELL_PADDING-1), is_rt = (iy >= ncols - GHOST_CELL_PADDING); Loading Loading @@ -717,9 +710,6 @@ namespace Kernels int ilo = GHOST_CELL_PADDING, int ihi = -1) { // Default ihi to nrows - GHOST_CELL_PADDING if not provided if (ihi < 0) ihi = nrows - GHOST_CELL_PADDING; triton::parallel_for( AUTO_LABEL() , size , KOKKOS_LAMBDA (int id) { int ix = (id / ncols); //row id Loading @@ -727,7 +717,7 @@ namespace Kernels int rx = (ix * ncols); //row offset bool is_top = (ix < ilo), is_top = (ix <= ilo), is_btm = (ix >= ihi), is_lt = (iy <= GHOST_CELL_PADDING-1), is_rt = (iy >= ncols - GHOST_CELL_PADDING); Loading @@ -754,7 +744,7 @@ namespace Kernels } }else{ if(ix==ihi-1 && mpi_tasks>1){ //last real row if(ix==ihi && mpi_tasks>1){ //last real row if ((hij + zij < dem[rx - ncols + iy]) && (h_arr[rx - ncols + iy] < EPS12)) { qy_arr[id] = 0.0; } Loading Loading @@ -1031,16 +1021,38 @@ namespace Kernels * @param hextra Minimum depth (tolerance below water is at rest) */ template<typename T> void compute_dt_and_sqrt(int size, T dx, void compute_dt_and_sqrt(int nrows, int ncols, T dx, T const * KOKKOS_RESTRICT input_qx , T const * KOKKOS_RESTRICT input_qy , T const * KOKKOS_RESTRICT input_h , T * KOKKOS_RESTRICT input_sqrth, T * KOKKOS_RESTRICT output , T cn, T hextra) T cn, T hextra, int ilo = GHOST_CELL_PADDING, int ihi = -1) { int size = nrows * ncols; triton::parallel_for( AUTO_LABEL() , size , KOKKOS_LAMBDA (int id) { int ix = (id / ncols); //row id int iy = (id % ncols); //col id bool is_top = (ix <= ilo), is_btm = (ix >= ihi), is_lt = (iy <= GHOST_CELL_PADDING-1), is_rt = (iy >= ncols - GHOST_CELL_PADDING); // Set all cells to MAX_VALUE first (excluded cells won't be computed) output[id] = MAX_VALUE; if (is_top || is_lt || is_rt || is_btm) //exclude halo cells { return; } T hij = input_h[id]; T sqrthij = FMAX(sqrt(hij),0.0); input_sqrth[id] = sqrthij; Loading Loading @@ -1132,10 +1144,10 @@ namespace Kernels int iy = (ii % ncols); //col id bool is_top = (ix == 1), is_btm = (ix == nrows - 2), is_lt = (iy == 1), is_rt = (iy == ncols - 2); is_top = (ix == GHOST_CELL_PADDING), is_btm = (ix == nrows - GHOST_CELL_PADDING - 1), is_lt = (iy == GHOST_CELL_PADDING), is_rt = (iy == ncols - GHOST_CELL_PADDING - 1); //if (!(rank == 0 && is_top) || !is_lt || !is_rt || !(rank == total_process - 1 && is_btm)) if (!((rank == 0 && is_top) || is_lt || is_rt || (rank == total_process - 1 && is_btm))) Loading
src/triton.h +33 −34 Original line number Diff line number Diff line Loading @@ -179,7 +179,7 @@ namespace Triton /** @brief This function is used to calculate minimum time step size for each sub domain. * */ void compute_local_dt(); void compute_local_dt(int ilo = GHOST_CELL_PADDING, int ihi = -1); /** @brief This function is used to calculate minimum time step size between all sub domain. Loading @@ -192,7 +192,9 @@ namespace Triton /** @brief This function is used to compute next state. All main computation is done inside this function. * * @param ilo Lower row bound (inclusive) for overlap tiling * @param ihi Upper row bound (exclusive) for overlap tiling * @param ihi Upper row bound (exclusive) for state update * @param ilo_flux Lower row bound (inclusive) for flux computation (default: same as ilo) * @param ihi_flux Upper row bound (exclusive) for flux computation (default: same as ihi) */ void compute_new_state(int ilo, int ihi); Loading Loading @@ -2034,39 +2036,49 @@ void triton<T>::simulate() while (simtime < arglist.sim_duration) { // Compute dt EVERY substep for b4b correctness if (!arglist.time_increment_fixed) { compute_local_dt(); compute_global_dt(print_id); } it_count++; it_count_average++; substep_count++; total_iterations++; // Determine bounds based on substep // Each substep "consumes" one layer of valid halo data from boundaries inward // Substep 1: full domain, Substep 2: shrink by 1 row, etc. // Only shrink on MPI-facing boundaries (where we have neighbors) int ilo = GHOST_CELL_PADDING; // Determine bounds for computation // For overlap tiling: // - At MPI boundaries: extend to G-1 (1 row into ghost) // - At physical boundaries: use G (interior only) // - Shrink in later substeps to avoid stale ghost data // Note: Physical boundary bounds differ between G=1 and G=2 because // the ghost cell structure is different. This is expected. int ilo = GHOST_CELL_PADDING-1; int ihi = rows - GHOST_CELL_PADDING; if (arglist.overlap_tiling && substep_count > 1 && size > 1) { int shrink = substep_count - 1; // Shrink ilo only if we have a lower neighbor (rank > 0) if (arglist.overlap_tiling && size > 1) { // Extend bounds by 1 row at MPI boundaries only if (rank > 0) { ilo = GHOST_CELL_PADDING + shrink; ilo = substep_count-2; } // Shrink ihi only if we have an upper neighbor (rank < size - 1) if (rank < size - 1) { ihi = rows - GHOST_CELL_PADDING - shrink; ihi = rows - substep_count + 1; } } // Compute dt EVERY substep for b4b correctness if (!arglist.time_increment_fixed) { compute_local_dt(ilo,ihi); compute_global_dt(print_id); } compute_new_state(ilo, ihi); // Halo exchange: every step for baseline, every GHOST_CELL_PADDING steps for overlap tiling bool do_halo_exchange = !arglist.overlap_tiling || (substep_count >= size); if (do_halo_exchange && size > 1) { perform_halo_exchange(); substep_count = 0; total_halo_exchanges++; } simtime += global_dt; average_dt += global_dt; Loading Loading @@ -2132,19 +2144,6 @@ void triton<T>::simulate() } } // Halo exchange: every step for baseline, every GHOST_CELL_PADDING steps for overlap tiling bool do_halo_exchange = !arglist.overlap_tiling || (substep_count >= GHOST_CELL_PADDING); if (do_halo_exchange && size > 1) { perform_halo_exchange(); substep_count = 0; total_halo_exchanges++; } else if (size > 1) { // wet_dry_qy must be called every iteration when size > 1 st.start(COMPUTE_TIME); Kernels::wet_dry_qy(2 * cols*GHOST_CELL_PADDING, rows, cols, device_vec[H], device_vec[QY], device_vec[DEM], arglist.hextra); st.stop(COMPUTE_TIME); } } // Final observation write Loading Loading @@ -2209,10 +2208,10 @@ void triton<T>::simulate() template<typename T> void triton<T>::compute_local_dt() void triton<T>::compute_local_dt(int ilo, int ihi) { st.start(COMPUTE_TIME); Kernels::compute_dt_and_sqrt(rows*cols, cell_size, device_vec[QX], device_vec[QY], device_vec[H], device_vec[SQRTH], device_vec[DT], arglist.courant, arglist.hextra); Kernels::compute_dt_and_sqrt(rows, cols, cell_size, device_vec[QX], device_vec[QY], device_vec[H], device_vec[SQRTH], device_vec[DT], arglist.courant, arglist.hextra, ilo, ihi); local_dt = Kernels::find_min_dt(rows*cols, device_vec[DT]); gpuStreamSynchronize(streams); st.stop(COMPUTE_TIME); Loading