Commit 878cdf9d authored by Mario Morales Hernandez
Browse files

Fix overlap-tiling bounds bug, fix COMPUTE_TIME timers, and add communication stats

- Fix: shrink compute bounds only on MPI-facing boundaries (lower bound only when rank > 0, upper bound only when rank < size-1)
- Fix: proper COMPUTE_TIME tracking in perform_halo_exchange()
- Added: communication statistics diagnostics
parent 781999e9
Loading
Loading
Loading
Loading
+32 −2
Original line number Diff line number Diff line
@@ -2028,6 +2028,10 @@ void triton<T>::simulate()
  // Substep counter for halo exchange scheduling
  int substep_count = 0;

  // Diagnostics: count iterations and halo exchanges
  int total_iterations = 0;
  int total_halo_exchanges = 0;

  while (simtime < arglist.sim_duration)
  {
    // Compute dt EVERY substep for b4b correctness
@@ -2040,18 +2044,26 @@ void triton<T>::simulate()
    it_count++;
    it_count_average++;
    substep_count++;
    total_iterations++;

    // Determine bounds based on substep
    // Each substep "consumes" one layer of valid halo data from boundaries inward
    // Substep 1: full domain, Substep 2: shrink by 1 row, etc.
    // Only shrink on MPI-facing boundaries (where we have neighbors)
    int ilo = GHOST_CELL_PADDING;
    int ihi = rows - GHOST_CELL_PADDING;

    if (arglist.overlap_tiling && substep_count > 1 && size > 1) {
      int shrink = substep_count - 1;
      // Shrink ilo only if we have a lower neighbor (rank > 0)
      if (rank > 0) {
        ilo = GHOST_CELL_PADDING + shrink;
      }
      // Shrink ihi only if we have an upper neighbor (rank < size - 1)
      if (rank < size - 1) {
        ihi = rows - GHOST_CELL_PADDING - shrink;
      }
    }

    compute_new_state(ilo, ihi);

@@ -2125,6 +2137,7 @@ void triton<T>::simulate()
    if (do_halo_exchange && size > 1) {
      perform_halo_exchange();
      substep_count = 0;
      total_halo_exchanges++;
    } else if (size > 1) {
      // wet_dry_qy must be called every iteration when size > 1
      st.start(COMPUTE_TIME);
@@ -2152,6 +2165,16 @@ void triton<T>::simulate()
  
  if(rank==0){
    std::cerr << INFO " Time: " << simtime << "\tdt: " << average_dt/it_count_average << "\tit: " << it_count << std::endl;
    if (size > 1) {
      double ratio = (double)total_halo_exchanges / total_iterations * 100.0;
      std::cerr << INFO " Comm stats: iterations=" << total_iterations
                << ", halo_exchanges=" << total_halo_exchanges
                << " (" << ratio << "%)" << std::endl;
      if (arglist.overlap_tiling) {
        int expected = (total_iterations + GHOST_CELL_PADDING - 1) / GHOST_CELL_PADDING;
        std::cerr << INFO " Overlap tiling: expected ~" << expected << " exchanges for G=" << GHOST_CELL_PADDING << std::endl;
      }
    }
    std::cerr << OK "Simulation ends" << std::endl;
  }
}
@@ -2306,20 +2329,25 @@ void triton<T>::perform_halo_exchange()
{
  if (size <= 1) return;

  st.start(COMPUTE_TIME);
  Kernels::halo_copy_from_gpu(2 * cols*GHOST_CELL_PADDING, rows, cols,
      device_vec[H], device_vec[QX], device_vec[QY], device_vec[HALO]);

  if (arglist.gpu_direct_flag) {
    gpuStreamSynchronize(streams);
    st.stop(COMPUTE_TIME);

    st.start(BALANCING_MPI_TIME);
    st.start(MPI_TIME);
    MpiUtils::exchange(device_vec[HALO], 12*GHOST_CELL_PADDING, cols, rank, size, USE_HALO);
    st.stop(MPI_TIME);
    st.stop(BALANCING_MPI_TIME);

    st.start(COMPUTE_TIME);
  } else {
    gpuMemcpyAsync(host_vec[HALO], device_vec[HALO], nbytes_halo, gpuMemcpyDeviceToHost, streams);
    gpuStreamSynchronize(streams);
    st.stop(COMPUTE_TIME);

    st.start(BALANCING_MPI_TIME);
    st.start(MPI_TIME);
@@ -2327,6 +2355,7 @@ void triton<T>::perform_halo_exchange()
    st.stop(MPI_TIME);
    st.stop(BALANCING_MPI_TIME);

    st.start(COMPUTE_TIME);
    gpuMemcpyAsync(device_vec[HALO], host_vec[HALO], nbytes_halo, gpuMemcpyHostToDevice, streams);
  }

@@ -2335,6 +2364,7 @@ void triton<T>::perform_halo_exchange()

  Kernels::wet_dry_qy(2 * cols*GHOST_CELL_PADDING, rows, cols,
      device_vec[H], device_vec[QY], device_vec[DEM], arglist.hextra);
  st.stop(COMPUTE_TIME);
}