Commit 522f55aa authored by Mario Morales Hernandez
Browse files

b4b for np=2 (for a tiny problem)

Fixed b4b for np=2. np=4 also works for 4 iterations, but it doesn't for 8
iterations (2 loops), so there must still be a bug in one of the indices.
parent 878cdf9d
Loading
Loading
Loading
Loading
+34 −22
Original line number Diff line number Diff line
@@ -63,8 +63,6 @@ namespace Kernels
              int ilo = GHOST_CELL_PADDING,
              int ihi = -1)
  {
    // Default ihi to nrows - GHOST_CELL_PADDING if not provided
    if (ihi < 0) ihi = nrows - GHOST_CELL_PADDING;

    /****
     *  RHS sketch
@@ -82,7 +80,7 @@ namespace Kernels
      int rx = (ix * ncols);  //row offset

      bool
      is_top = (ix < ilo),
      is_top = (ix <= ilo),
      is_btm = (ix >= ihi),
      is_lt = (iy <= GHOST_CELL_PADDING-2),
      is_rt = (iy >= ncols - GHOST_CELL_PADDING);
@@ -346,9 +344,6 @@ namespace Kernels
              int ilo = GHOST_CELL_PADDING,
              int ihi = -1)
  {
    if (ihi < 0) ihi = nrows - GHOST_CELL_PADDING;
    // flux_y needs one extra row at bottom (computes N edge = south of row above)
    int ihi_flux_y = ihi + 1;

    /****
     *  RHS sketch
@@ -369,8 +364,8 @@ namespace Kernels
      int rx = (ix * ncols);  //row offset

      bool
      is_top = (ix < ilo),
      is_btm = (ix >= ihi_flux_y),
      is_top = (ix <= ilo),
      is_btm = (ix >= ihi+1),
      is_lt = (iy <= GHOST_CELL_PADDING-1),
      is_rt = (iy >= ncols - GHOST_CELL_PADDING);

@@ -626,15 +621,13 @@ namespace Kernels
                    int ilo = GHOST_CELL_PADDING,
                    int ihi = -1)
  {
    if (ihi < 0) ihi = nrows - GHOST_CELL_PADDING;

    triton::parallel_for( AUTO_LABEL() , size , KOKKOS_LAMBDA (int id) {

      int ix = (id / ncols);  //row id
      int iy = (id % ncols);  //col id

      bool
      is_top = (ix < ilo),
      is_top = (ix <= ilo),
      is_btm = (ix >= ihi),
      is_lt = (iy <= GHOST_CELL_PADDING-1),
      is_rt = (iy >= ncols - GHOST_CELL_PADDING);
@@ -717,9 +710,6 @@ namespace Kernels
               int ilo = GHOST_CELL_PADDING,
               int ihi = -1)
  {
    // Default ihi to nrows - GHOST_CELL_PADDING if not provided
    if (ihi < 0) ihi = nrows - GHOST_CELL_PADDING;

    triton::parallel_for( AUTO_LABEL() , size , KOKKOS_LAMBDA (int id) {

      int ix = (id / ncols);  //row id
@@ -727,7 +717,7 @@ namespace Kernels
      int rx = (ix * ncols);  //row offset

      bool
      is_top = (ix < ilo),
      is_top = (ix <= ilo),
      is_btm = (ix >= ihi),
      is_lt = (iy <= GHOST_CELL_PADDING-1),
      is_rt = (iy >= ncols - GHOST_CELL_PADDING);
@@ -754,7 +744,7 @@ namespace Kernels
            }

        }else{
          if(ix==ihi-1 && mpi_tasks>1){ //last real row
          if(ix==ihi && mpi_tasks>1){ //last real row
            if ((hij + zij < dem[rx - ncols + iy]) && (h_arr[rx - ncols + iy] < EPS12))                    {
              qy_arr[id] = 0.0;
            }
@@ -1031,16 +1021,38 @@ namespace Kernels
*  @param hextra Minimum depth (tolerance below water is at rest)
*/
  template<typename T>
  void compute_dt_and_sqrt(int size, T dx,
  void compute_dt_and_sqrt(int nrows, int ncols, T dx,
                    T const * KOKKOS_RESTRICT input_qx   ,
                    T const * KOKKOS_RESTRICT input_qy   ,
                    T const * KOKKOS_RESTRICT input_h    ,
                    T       * KOKKOS_RESTRICT input_sqrth,
                    T       * KOKKOS_RESTRICT output     ,
                    T cn, T hextra)
                    T cn, T hextra,
                    int ilo = GHOST_CELL_PADDING,
                    int ihi = -1)
  {

    int size = nrows * ncols;

    triton::parallel_for( AUTO_LABEL() , size , KOKKOS_LAMBDA (int id) {
      int ix = (id / ncols);  //row id
      int iy = (id % ncols);  //col id

      bool

      is_top = (ix <= ilo),
      is_btm = (ix >= ihi),
      is_lt = (iy <= GHOST_CELL_PADDING-1),
      is_rt = (iy >= ncols - GHOST_CELL_PADDING);


      // Set all cells to MAX_VALUE first (excluded cells won't be computed)
      output[id] = MAX_VALUE;

      if (is_top || is_lt || is_rt || is_btm) //exclude halo cells
      {
        return;
      }
      T hij = input_h[id];
      T sqrthij = FMAX(sqrt(hij),0.0);
      input_sqrth[id] = sqrthij;
@@ -1132,10 +1144,10 @@ namespace Kernels
      int iy = (ii % ncols);  //col id

      bool
      is_top = (ix == 1),
      is_btm = (ix == nrows - 2),
      is_lt = (iy == 1),
      is_rt = (iy == ncols - 2);
      is_top = (ix == GHOST_CELL_PADDING),
      is_btm = (ix == nrows - GHOST_CELL_PADDING - 1),
      is_lt = (iy == GHOST_CELL_PADDING),
      is_rt = (iy == ncols - GHOST_CELL_PADDING - 1);
			
			//if (!(rank == 0 && is_top) || !is_lt || !is_rt || !(rank == total_process - 1 && is_btm))
		if (!((rank == 0 && is_top) || is_lt || is_rt || (rank == total_process - 1 && is_btm)))
+33 −34
Original line number Diff line number Diff line
@@ -179,7 +179,7 @@ namespace Triton
/** @brief This function is used to calculate minimum time step size for each sub domain.
*
*/
    void compute_local_dt();
    void compute_local_dt(int ilo = GHOST_CELL_PADDING, int ihi = -1);
    
    
/** @brief This function is used to calculate minimum time step size between all sub domain.
@@ -192,7 +192,9 @@ namespace Triton
/** @brief This function is used to compute next state. All main computation is done inside this function.
*
*  @param ilo Lower row bound (inclusive) for overlap tiling
*  @param ihi Upper row bound (exclusive) for overlap tiling
*  @param ihi Upper row bound (exclusive) for state update
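*  @param ilo_flux Lower row bound (inclusive) for flux computation (default: same as ilo)
*  @param ihi_flux Upper row bound (exclusive) for flux computation (default: same as ihi)
*  @note NOTE(review): ilo_flux / ihi_flux are documented here, but the declaration below only takes (ilo, ihi) — confirm whether the signature or this comment is out of date.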
*/
    void compute_new_state(int ilo, int ihi);

@@ -2034,39 +2036,49 @@ void triton<T>::simulate()

  while (simtime < arglist.sim_duration)
  {
    // Compute dt EVERY substep for b4b correctness
    if (!arglist.time_increment_fixed)
    {
      compute_local_dt();
      compute_global_dt(print_id);
    }

    it_count++;
    it_count_average++;
    substep_count++;
    total_iterations++;

    // Determine bounds based on substep
    // Each substep "consumes" one layer of valid halo data from boundaries inward
    // Substep 1: full domain, Substep 2: shrink by 1 row, etc.
    // Only shrink on MPI-facing boundaries (where we have neighbors)
    int ilo = GHOST_CELL_PADDING;
    // Determine bounds for computation
    // For overlap tiling:
    //   - At MPI boundaries: extend to G-1 (1 row into ghost)
    //   - At physical boundaries: use G (interior only)
    //   - Shrink in later substeps to avoid stale ghost data
    // Note: Physical boundary bounds differ between G=1 and G=2 because
    // the ghost cell structure is different. This is expected.
    int ilo = GHOST_CELL_PADDING-1;
    int ihi = rows - GHOST_CELL_PADDING;

    if (arglist.overlap_tiling && substep_count > 1 && size > 1) {
      int shrink = substep_count - 1;
      // Shrink ilo only if we have a lower neighbor (rank > 0)
    if (arglist.overlap_tiling && size > 1) {
      // Extend bounds by 1 row at MPI boundaries only
      if (rank > 0) {
        ilo = GHOST_CELL_PADDING + shrink;
        ilo = substep_count-2; 
      }
      // Shrink ihi only if we have an upper neighbor (rank < size - 1)
      if (rank < size - 1) {
        ihi = rows - GHOST_CELL_PADDING - shrink;
        ihi = rows - substep_count + 1;
      }
    }

    // Compute dt EVERY substep for b4b correctness
    if (!arglist.time_increment_fixed)
    {
      compute_local_dt(ilo,ihi);
      compute_global_dt(print_id);
    }

    compute_new_state(ilo, ihi);

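    // Halo exchange: every step for baseline; with overlap tiling, only once substep_count >= size.
    // NOTE(review): this threshold was previously GHOST_CELL_PADDING; `size` appears to be the MPI rank count — confirm which is intended.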
    bool do_halo_exchange = !arglist.overlap_tiling || (substep_count >= size);
    if (do_halo_exchange && size > 1) {
      perform_halo_exchange();
      substep_count = 0;
      total_halo_exchanges++;
    }

    simtime += global_dt;
    average_dt += global_dt;

@@ -2132,19 +2144,6 @@ void triton<T>::simulate()
      }
    }

    // Halo exchange: every step for baseline, every GHOST_CELL_PADDING steps for overlap tiling
    bool do_halo_exchange = !arglist.overlap_tiling || (substep_count >= GHOST_CELL_PADDING);
    if (do_halo_exchange && size > 1) {
      perform_halo_exchange();
      substep_count = 0;
      total_halo_exchanges++;
    } else if (size > 1) {
      // wet_dry_qy must be called every iteration when size > 1
      st.start(COMPUTE_TIME);
      Kernels::wet_dry_qy(2 * cols*GHOST_CELL_PADDING, rows, cols,
          device_vec[H], device_vec[QY], device_vec[DEM], arglist.hextra);
      st.stop(COMPUTE_TIME);
    }
  }

  // Final observation write
@@ -2209,10 +2208,10 @@ void triton<T>::simulate()


  template<typename T>
  void triton<T>::compute_local_dt()
  void triton<T>::compute_local_dt(int ilo, int ihi)
  {
    st.start(COMPUTE_TIME);
    Kernels::compute_dt_and_sqrt(rows*cols, cell_size, device_vec[QX], device_vec[QY], device_vec[H], device_vec[SQRTH], device_vec[DT], arglist.courant, arglist.hextra);
    Kernels::compute_dt_and_sqrt(rows, cols, cell_size, device_vec[QX], device_vec[QY], device_vec[H], device_vec[SQRTH], device_vec[DT], arglist.courant, arglist.hextra, ilo, ihi);
    local_dt = Kernels::find_min_dt(rows*cols, device_vec[DT]);
    gpuStreamSynchronize(streams);
    st.stop(COMPUTE_TIME);