Loading src/triton.h +32 −2 Original line number Diff line number Diff line Loading @@ -2028,6 +2028,10 @@ void triton<T>::simulate() // Substep counter for halo exchange scheduling int substep_count = 0; // Diagnostics: count iterations and halo exchanges int total_iterations = 0; int total_halo_exchanges = 0; while (simtime < arglist.sim_duration) { // Compute dt EVERY substep for b4b correctness Loading @@ -2040,18 +2044,26 @@ void triton<T>::simulate() it_count++; it_count_average++; substep_count++; total_iterations++; // Determine bounds based on substep // Each substep "consumes" one layer of valid halo data from boundaries inward // Substep 1: full domain, Substep 2: shrink by 1 row, etc. // Only shrink on MPI-facing boundaries (where we have neighbors) int ilo = GHOST_CELL_PADDING; int ihi = rows - GHOST_CELL_PADDING; if (arglist.overlap_tiling && substep_count > 1 && size > 1) { int shrink = substep_count - 1; // Shrink ilo only if we have a lower neighbor (rank > 0) if (rank > 0) { ilo = GHOST_CELL_PADDING + shrink; } // Shrink ihi only if we have an upper neighbor (rank < size - 1) if (rank < size - 1) { ihi = rows - GHOST_CELL_PADDING - shrink; } } compute_new_state(ilo, ihi); Loading Loading @@ -2125,6 +2137,7 @@ void triton<T>::simulate() if (do_halo_exchange && size > 1) { perform_halo_exchange(); substep_count = 0; total_halo_exchanges++; } else if (size > 1) { // wet_dry_qy must be called every iteration when size > 1 st.start(COMPUTE_TIME); Loading Loading @@ -2152,6 +2165,16 @@ void triton<T>::simulate() if(rank==0){ std::cerr << INFO " Time: " << simtime << "\tdt: " << average_dt/it_count_average << "\tit: " << it_count << std::endl; if (size > 1) { double ratio = (double)total_halo_exchanges / total_iterations * 100.0; std::cerr << INFO " Comm stats: iterations=" << total_iterations << ", halo_exchanges=" << total_halo_exchanges << " (" << ratio << "%)" << std::endl; if (arglist.overlap_tiling) { int expected = (total_iterations + GHOST_CELL_PADDING - 1) / GHOST_CELL_PADDING; std::cerr << INFO " Overlap tiling: expected ~" << expected << " exchanges for G=" << GHOST_CELL_PADDING << std::endl; } } std::cerr << OK "Simulation ends" << std::endl; } } Loading Loading @@ -2306,20 +2329,25 @@ void triton<T>::perform_halo_exchange() { if (size <= 1) return; st.start(COMPUTE_TIME); Kernels::halo_copy_from_gpu(2 * cols*GHOST_CELL_PADDING, rows, cols, device_vec[H], device_vec[QX], device_vec[QY], device_vec[HALO]); if (arglist.gpu_direct_flag) { gpuStreamSynchronize(streams); st.stop(COMPUTE_TIME); st.start(BALANCING_MPI_TIME); st.start(MPI_TIME); MpiUtils::exchange(device_vec[HALO], 12*GHOST_CELL_PADDING, cols, rank, size, USE_HALO); st.stop(MPI_TIME); st.stop(BALANCING_MPI_TIME); st.start(COMPUTE_TIME); } else { gpuMemcpyAsync(host_vec[HALO], device_vec[HALO], nbytes_halo, gpuMemcpyDeviceToHost, streams); gpuStreamSynchronize(streams); st.stop(COMPUTE_TIME); st.start(BALANCING_MPI_TIME); st.start(MPI_TIME); Loading @@ -2327,6 +2355,7 @@ void triton<T>::perform_halo_exchange() st.stop(MPI_TIME); st.stop(BALANCING_MPI_TIME); st.start(COMPUTE_TIME); gpuMemcpyAsync(device_vec[HALO], host_vec[HALO], nbytes_halo, gpuMemcpyHostToDevice, streams); } Loading @@ -2335,6 +2364,7 @@ void triton<T>::perform_halo_exchange() Kernels::wet_dry_qy(2 * cols*GHOST_CELL_PADDING, rows, cols, device_vec[H], device_vec[QY], device_vec[DEM], arglist.hextra); st.stop(COMPUTE_TIME); } Loading Loading
src/triton.h +32 −2 Original line number Diff line number Diff line Loading @@ -2028,6 +2028,10 @@ void triton<T>::simulate() // Substep counter for halo exchange scheduling int substep_count = 0; // Diagnostics: count iterations and halo exchanges int total_iterations = 0; int total_halo_exchanges = 0; while (simtime < arglist.sim_duration) { // Compute dt EVERY substep for b4b correctness Loading @@ -2040,18 +2044,26 @@ void triton<T>::simulate() it_count++; it_count_average++; substep_count++; total_iterations++; // Determine bounds based on substep // Each substep "consumes" one layer of valid halo data from boundaries inward // Substep 1: full domain, Substep 2: shrink by 1 row, etc. // Only shrink on MPI-facing boundaries (where we have neighbors) int ilo = GHOST_CELL_PADDING; int ihi = rows - GHOST_CELL_PADDING; if (arglist.overlap_tiling && substep_count > 1 && size > 1) { int shrink = substep_count - 1; // Shrink ilo only if we have a lower neighbor (rank > 0) if (rank > 0) { ilo = GHOST_CELL_PADDING + shrink; } // Shrink ihi only if we have an upper neighbor (rank < size - 1) if (rank < size - 1) { ihi = rows - GHOST_CELL_PADDING - shrink; } } compute_new_state(ilo, ihi); Loading Loading @@ -2125,6 +2137,7 @@ void triton<T>::simulate() if (do_halo_exchange && size > 1) { perform_halo_exchange(); substep_count = 0; total_halo_exchanges++; } else if (size > 1) { // wet_dry_qy must be called every iteration when size > 1 st.start(COMPUTE_TIME); Loading Loading @@ -2152,6 +2165,16 @@ void triton<T>::simulate() if(rank==0){ std::cerr << INFO " Time: " << simtime << "\tdt: " << average_dt/it_count_average << "\tit: " << it_count << std::endl; if (size > 1) { double ratio = (double)total_halo_exchanges / total_iterations * 100.0; std::cerr << INFO " Comm stats: iterations=" << total_iterations << ", halo_exchanges=" << total_halo_exchanges << " (" << ratio << "%)" << std::endl; if (arglist.overlap_tiling) { int expected = (total_iterations + GHOST_CELL_PADDING - 1) / GHOST_CELL_PADDING; std::cerr << INFO " Overlap tiling: expected ~" << expected << " exchanges for G=" << GHOST_CELL_PADDING << std::endl; } } std::cerr << OK "Simulation ends" << std::endl; } } Loading Loading @@ -2306,20 +2329,25 @@ void triton<T>::perform_halo_exchange() { if (size <= 1) return; st.start(COMPUTE_TIME); Kernels::halo_copy_from_gpu(2 * cols*GHOST_CELL_PADDING, rows, cols, device_vec[H], device_vec[QX], device_vec[QY], device_vec[HALO]); if (arglist.gpu_direct_flag) { gpuStreamSynchronize(streams); st.stop(COMPUTE_TIME); st.start(BALANCING_MPI_TIME); st.start(MPI_TIME); MpiUtils::exchange(device_vec[HALO], 12*GHOST_CELL_PADDING, cols, rank, size, USE_HALO); st.stop(MPI_TIME); st.stop(BALANCING_MPI_TIME); st.start(COMPUTE_TIME); } else { gpuMemcpyAsync(host_vec[HALO], device_vec[HALO], nbytes_halo, gpuMemcpyDeviceToHost, streams); gpuStreamSynchronize(streams); st.stop(COMPUTE_TIME); st.start(BALANCING_MPI_TIME); st.start(MPI_TIME); Loading @@ -2327,6 +2355,7 @@ void triton<T>::perform_halo_exchange() st.stop(MPI_TIME); st.stop(BALANCING_MPI_TIME); st.start(COMPUTE_TIME); gpuMemcpyAsync(device_vec[HALO], host_vec[HALO], nbytes_halo, gpuMemcpyHostToDevice, streams); } Loading @@ -2335,6 +2364,7 @@ void triton<T>::perform_halo_exchange() Kernels::wet_dry_qy(2 * cols*GHOST_CELL_PADDING, rows, cols, device_vec[H], device_vec[QY], device_vec[DEM], arglist.hextra); st.stop(COMPUTE_TIME); } Loading