fix a bug, timers and add communication stats (878cdf9d) · Commits · HYDRO / TRITON

src/triton.h

+32 −2

Original line number	Diff line number	Diff line
		@@ -2028,6 +2028,10 @@ void triton<T>::simulate()
		// Substep counter for halo exchange scheduling
		int substep_count = 0;

		// Diagnostics: count iterations and halo exchanges
		int total_iterations = 0;
		int total_halo_exchanges = 0;

		while (simtime < arglist.sim_duration)
		{
		// Compute dt EVERY substep for bit-for-bit (b4b) correctness
		@@ -2040,18 +2044,26 @@ void triton<T>::simulate()
		it_count++;
		it_count_average++;
		substep_count++;
		total_iterations++;

		// Determine bounds based on substep
		// Each substep "consumes" one layer of valid halo data from boundaries inward
		// Substep 1: full domain, Substep 2: shrink by 1 row, etc.
		// Only shrink on MPI-facing boundaries (where we have neighbors)
		int ilo = GHOST_CELL_PADDING;
		int ihi = rows - GHOST_CELL_PADDING;

		if (arglist.overlap_tiling && substep_count > 1 && size > 1) {
		int shrink = substep_count - 1;
		// Shrink ilo only if we have a lower neighbor (rank > 0)
		if (rank > 0) {
		ilo = GHOST_CELL_PADDING + shrink;
		}
		// Shrink ihi only if we have an upper neighbor (rank < size - 1)
		if (rank < size - 1) {
		ihi = rows - GHOST_CELL_PADDING - shrink;
		}
		}

		compute_new_state(ilo, ihi);

		@@ -2125,6 +2137,7 @@ void triton<T>::simulate()
		if (do_halo_exchange && size > 1) {
		perform_halo_exchange();
		substep_count = 0;
		total_halo_exchanges++;
		} else if (size > 1) {
		// wet_dry_qy must be called every iteration when size > 1
		st.start(COMPUTE_TIME);
		@@ -2152,6 +2165,16 @@ void triton<T>::simulate()

		if(rank==0){
		std::cerr << INFO " Time: " << simtime << "\tdt: " << average_dt/it_count_average << "\tit: " << it_count << std::endl;
		if (size > 1) {
		double ratio = (double)total_halo_exchanges / total_iterations * 100.0;
		std::cerr << INFO " Comm stats: iterations=" << total_iterations
		<< ", halo_exchanges=" << total_halo_exchanges
		<< " (" << ratio << "%)" << std::endl;
		if (arglist.overlap_tiling) {
		int expected = (total_iterations + GHOST_CELL_PADDING - 1) / GHOST_CELL_PADDING;
		std::cerr << INFO " Overlap tiling: expected ~" << expected << " exchanges for G=" << GHOST_CELL_PADDING << std::endl;
		}
		}
		std::cerr << OK "Simulation ends" << std::endl;
		}
		}
		@@ -2306,20 +2329,25 @@ void triton<T>::perform_halo_exchange()
		{
		if (size <= 1) return;

		st.start(COMPUTE_TIME);
		Kernels::halo_copy_from_gpu(2 * cols*GHOST_CELL_PADDING, rows, cols,
		device_vec[H], device_vec[QX], device_vec[QY], device_vec[HALO]);

		if (arglist.gpu_direct_flag) {
		gpuStreamSynchronize(streams);
		st.stop(COMPUTE_TIME);

		st.start(BALANCING_MPI_TIME);
		st.start(MPI_TIME);
		MpiUtils::exchange(device_vec[HALO], 12*GHOST_CELL_PADDING, cols, rank, size, USE_HALO);
		st.stop(MPI_TIME);
		st.stop(BALANCING_MPI_TIME);

		st.start(COMPUTE_TIME);
		} else {
		gpuMemcpyAsync(host_vec[HALO], device_vec[HALO], nbytes_halo, gpuMemcpyDeviceToHost, streams);
		gpuStreamSynchronize(streams);
		st.stop(COMPUTE_TIME);

		st.start(BALANCING_MPI_TIME);
		st.start(MPI_TIME);
		@@ -2327,6 +2355,7 @@ void triton<T>::perform_halo_exchange()
		st.stop(MPI_TIME);
		st.stop(BALANCING_MPI_TIME);

		st.start(COMPUTE_TIME);
		gpuMemcpyAsync(device_vec[HALO], host_vec[HALO], nbytes_halo, gpuMemcpyHostToDevice, streams);
		}

		@@ -2335,6 +2364,7 @@ void triton<T>::perform_halo_exchange()

		Kernels::wet_dry_qy(2 * cols*GHOST_CELL_PADDING, rows, cols,
		device_vec[H], device_vec[QY], device_vec[DEM], arglist.hextra);
		st.stop(COMPUTE_TIME);
		}