Commit 637d953e authored by Mario Morales Hernandez

Overlap tiling implemented. Not working for n>1

Overlap tiling implementation with constant bounds

The previous overlap tiling implementation used variable bounds that
extended into neighboring ranks' domains. This approach was incorrect
and violated the constraint that flux_y requires ilo >= GHOST_CELL_PADDING.

Changes:
- Use constant bounds [G, rows-G) for all substeps in both baseline
 and overlap tiling modes
- Fix flux_y default ihi from (rows-G+1) to (rows-G) for consistency
- Simplify halo exchange logic: exchange every step for baseline,
 every G steps for overlap tiling
- Clean up validation warnings for GHOST_CELL_PADDING=1

With this fix:
- GHOST_CELL_PADDING=1: Bit-for-bit identical results (m=1, no
 communication reduction)
- GHOST_CELL_PADDING>=2: Communication reduction at the cost of small
 numerical differences at MPI boundaries due to stale ghost cells

Single-process runs are always bit-for-bit identical regardless of
GHOST_CELL_PADDING value.
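
A minimal sketch of the exchange scheduling described above, assuming G stands for
GHOST_CELL_PADDING and omitting the dt computation, I/O and load-balancing logic of
the real loop (overlap_tiling and sim_duration abbreviate the arglist fields):

    // Simplified time loop: constant bounds every substep; halo exchange every
    // G substeps when overlap tiling is enabled, every substep otherwise.
    const int ilo = G, ihi = rows - G;     // constant bounds for all substeps
    int substep_count = 0;
    while (simtime < sim_duration) {
      compute_new_state(ilo, ihi);         // flux_x, flux_y, update_cells, wet_dry
      simtime += global_dt;
      ++substep_count;

      bool do_halo = !overlap_tiling || (substep_count >= G);
      if (do_halo && size > 1) {           // size = number of MPI ranks
        perform_halo_exchange();
        substep_count = 0;                 // reset after each exchange
      }
    }
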
parent 2eb26832
+5 −0
@@ -15,6 +15,11 @@ set(CMAKE_CXX_STANDARD 17) # minimum C++ standard to support Kokkos
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)

# GHOST_CELL_PADDING option
set(GHOST_CELL_PADDING 1 CACHE STRING "Number of ghost cell layers for domain decomposition")
message(STATUS "GHOST_CELL_PADDING = ${GHOST_CELL_PADDING}")
add_compile_definitions(GHOST_CELL_PADDING=${GHOST_CELL_PADDING})

message(STATUS "CMAKE_CXX_COMPILER = ${CMAKE_CXX_COMPILER}")

# check if MPI is available
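
For reference, the new cache variable can be set at configure time; a minimal example
(the build directory and the value 2 are placeholders, not taken from this commit):

    cmake -S . -B build -DGHOST_CELL_PADDING=2
    cmake --build build
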
+3 −1
@@ -30,7 +30,8 @@ namespace ConfigUtils
		time_increment_fixed,	/**< Flag to indicate time step size characteristics. True = Constant time step size, False = Variable time step size. */
		time_series_flag,	/**< Flag to allow time series output. True = Output time series, False = Don't output time series. */
		gpu_direct_flag,	/**< Flag to allow GPU-Direct use. True = Use GPU-Direct, False = Don't use GPU-Direct. */
		open_boundaries; /**<Flag to impose open or closed boundaries. By default, open_boundaries=0*/ 
		open_boundaries, /**<Flag to impose open or closed boundaries. By default, open_boundaries=0*/
		overlap_tiling;	/**< Flag to enable overlap tiling/communication-avoiding. True = Enable overlap tiling, False = Standard mode. */ 

		int
		checkpoint_id,	/**< Use for hot start. If 0 then that means a clean start. Greater than 0 means start from that specific checkpoint. */
@@ -450,6 +451,7 @@ namespace ConfigUtils
		arglist.hextra = atof((args("hextra", argmap)).c_str());
		arglist.gpu_direct_flag = atoi((args("gpu_direct_flag", argmap)).c_str());
		arglist.open_boundaries = atoi((args("open_boundaries", argmap)).c_str());
		arglist.overlap_tiling = atoi((argsd("overlap_tiling", argmap, "0")).c_str());

		arglist.num_sources = atoi((args("num_sources", argmap)).c_str());
		arglist.num_runoffs = atoi((args("num_runoffs", argmap)).c_str());
+2 −0
@@ -61,7 +61,9 @@ typedef double value_t; /**< Data type to represent floating-point number. It
#define TIME_SERIES_DIR "series"    /**< Default folder name containing time series outputs. */
#define DEFAULT_CFG "case4.cfg"    /**< Default configuration (cfg) file name. */

#ifndef GHOST_CELL_PADDING
#define GHOST_CELL_PADDING 1    /**< Number of extra rows and columns to use beside each domain border. */
#endif
#define USE_MATRIX 0    /**< Use the whole matrix when performing MPI halo exchange. */
#define USE_HALO 1    /**< Use only halo rows bundle when performing MPI halo exchange. */
#define SRC_LOCATION 0    /**< Define to use flow locations. */
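
A compile-time guard on the padding value is not part of this change; a minimal sketch
of one, assuming it would sit next to the fallback definition above:

    // Hypothetical guard (not in this commit): fail the build on an invalid value
    // instead of relying on the runtime check in the simulation driver.
    static_assert(GHOST_CELL_PADDING >= 1,
                  "GHOST_CELL_PADDING must be at least 1 ghost-cell layer");
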
+116 −99
@@ -43,6 +43,8 @@ namespace Kernels
*  @param rhsqy0 RHS array to store partial discharge in y direction contributions (east-north)
*  @param rhsqy1 RHS array to store partial discharge in y direction contributions (west-south)
*  @param hextra Minimum depth (tolerance below water is at rest)
*  @param ilo Lower row bound (inclusive). Default: GHOST_CELL_PADDING
*  @param ihi Upper row bound (exclusive). Default: nrows - GHOST_CELL_PADDING
*/
  template<typename T>
  void flux_x(int size, int nrows, int ncols, T dx, T dt,
@@ -57,17 +59,21 @@ namespace Kernels
              T       * KOKKOS_RESTRICT rhsqx1,
              T       * KOKKOS_RESTRICT rhsqy0,
              T       * KOKKOS_RESTRICT rhsqy1,
              T hextra)
              T hextra,
              int ilo = GHOST_CELL_PADDING,
              int ihi = -1)
  {
    // Default ihi to nrows - GHOST_CELL_PADDING if not provided
    if (ihi < 0) ihi = nrows - GHOST_CELL_PADDING;

    /**************************************
    /****
     *  RHS sketch
     *       _____ _____
     *       ____ ____
     *      |     |     |
     *      |    0|1    |
     *      |_____|_____|
     *      |____|____|
     *
     * ************************************/
     * ****/

    triton::parallel_for( AUTO_LABEL() , size , KOKKOS_LAMBDA (int id) {

@@ -76,8 +82,8 @@ namespace Kernels
      int rx = (ix * ncols);  //row offset

      bool
      is_top = (ix <= GHOST_CELL_PADDING-1),
      is_btm = (ix >= nrows - GHOST_CELL_PADDING),
      is_top = (ix < ilo),
      is_btm = (ix >= ihi),
      is_lt = (iy <= GHOST_CELL_PADDING-2),
      is_rt = (iy >= ncols - GHOST_CELL_PADDING);

@@ -320,6 +326,8 @@ namespace Kernels
*  @param rhsqy0 RHS array to store partial discharge in y direction contributions (east-north)
*  @param rhsqy1 RHS array to store partial discharge in y direction contributions (west-south)
*  @param hextra Minimum depth (tolerance below water is at rest)
*  @param ilo Lower row bound (inclusive). Default: GHOST_CELL_PADDING
*  @param ihi Upper row bound (exclusive). Default: nrows - GHOST_CELL_PADDING
*/
  template<typename T>
  void flux_y(int size, int nrows, int ncols, T dx, T dt,
@@ -334,20 +342,24 @@ namespace Kernels
              T       * KOKKOS_RESTRICT rhsqx1,
              T       * KOKKOS_RESTRICT rhsqy0,
              T       * KOKKOS_RESTRICT rhsqy1,
              T hextra)
              T hextra,
              int ilo = GHOST_CELL_PADDING,
              int ihi = -1)
  {
    // Default ihi to nrows - GHOST_CELL_PADDING if not provided
    if (ihi < 0) ihi = nrows - GHOST_CELL_PADDING;

    /**************************************
    /****
     *  RHS sketch
     *       _____
     *       ____
     *      |     |
     *      |     |
     *      |__1__|
     *      |  0  |
     *      |     |
     *      |_____|
     *      |____|
     *
     * ************************************/
     * ****/

    triton::parallel_for( AUTO_LABEL() , size , KOKKOS_LAMBDA (int id) {

@@ -356,8 +368,8 @@ namespace Kernels
      int rx = (ix * ncols);  //row offset

      bool
      is_top = (ix <= GHOST_CELL_PADDING-1),
      is_btm = (ix >= nrows - GHOST_CELL_PADDING+1),
      is_top = (ix < ilo),
      is_btm = (ix >= ihi),
      is_lt = (iy <= GHOST_CELL_PADDING-1),
      is_rt = (iy >= ncols - GHOST_CELL_PADDING);

@@ -593,6 +605,8 @@ namespace Kernels
*  @param rhsqy0 RHS array to store partial discharge in y direction contributions (east-north)
*  @param rhsqy1 RHS array to store partial discharge in y direction contributions (west-south)
*  @param hextra Minimum depth (tolerance below water is at rest)
*  @param ilo Lower row bound (inclusive) for overlap tiling. Default: GHOST_CELL_PADDING
*  @param ihi Upper row bound (exclusive) for overlap tiling. Default: nrows - GHOST_CELL_PADDING
*/
  template<typename T>
  void update_cells(int size, int nrows, int ncols, T dt,
@@ -607,16 +621,22 @@ namespace Kernels
                    T       * KOKKOS_RESTRICT rhsqx1,
                    T       * KOKKOS_RESTRICT rhsqy0,
                    T       * KOKKOS_RESTRICT rhsqy1,
                    T hextra)
                    T hextra,
                    int ilo = GHOST_CELL_PADDING,
                    int ihi = -1)
  {
    // Default ihi to nrows - GHOST_CELL_PADDING if not provided
    if (ihi < 0) ihi = nrows - GHOST_CELL_PADDING;

    triton::parallel_for( AUTO_LABEL() , size , KOKKOS_LAMBDA (int id) {

      int ix = (id / ncols);  //row id
      int iy = (id % ncols);  //col id

      bool
      is_top = (ix <= GHOST_CELL_PADDING-1),
      is_btm = (ix >= nrows - GHOST_CELL_PADDING),
      is_top = (ix < ilo),
      is_btm = (ix >= ihi),
      is_lt = (iy <= GHOST_CELL_PADDING-1),
      is_rt = (iy >= ncols - GHOST_CELL_PADDING);

@@ -676,6 +696,7 @@ namespace Kernels
*  @param size Array size
*  @param nrows Number of rows in that domain/subdomain
*  @param ncols Number of columns in that domain/subdomain
*  @param dt Time step size
*  @param h_arr Water depth array
*  @param qx_arr Discharge in x direction array
*  @param qy_arr Discharge in y direction array
@@ -683,6 +704,8 @@ namespace Kernels
*  @param max_h_arr Max water depth array
*  @param hextra Minimum depth (tolerance below water is at rest)
*  @param mpi_tasks Number of MPI tasks (domain decomposition)
*  @param ilo Lower row bound (inclusive) for overlap tiling. Default: GHOST_CELL_PADDING
*  @param ihi Upper row bound (exclusive) for overlap tiling. Default: nrows - GHOST_CELL_PADDING
*/
  template<typename T>
  void wet_dry(int size, int nrows, int ncols, T dt,
@@ -691,8 +714,13 @@ namespace Kernels
               T       * KOKKOS_RESTRICT qy_arr   ,
               T const * KOKKOS_RESTRICT dem      ,
               T       * KOKKOS_RESTRICT max_h_arr,
               T hextra, int mpi_tasks)
               T hextra, int mpi_tasks,
               int ilo = GHOST_CELL_PADDING,
               int ihi = -1)
  {
    // Default ihi to nrows - GHOST_CELL_PADDING if not provided
    if (ihi < 0) ihi = nrows - GHOST_CELL_PADDING;

    triton::parallel_for( AUTO_LABEL() , size , KOKKOS_LAMBDA (int id) {

      int ix = (id / ncols);  //row id
@@ -700,8 +728,8 @@ namespace Kernels
      int rx = (ix * ncols);  //row offset

      bool
      is_top = (ix <= GHOST_CELL_PADDING-1),
      is_btm = (ix >= nrows - GHOST_CELL_PADDING),
      is_top = (ix < ilo),
      is_btm = (ix >= ihi),
      is_lt = (iy <= GHOST_CELL_PADDING-1),
      is_rt = (iy >= ncols - GHOST_CELL_PADDING);

@@ -720,14 +748,14 @@ namespace Kernels
        {
          qx_arr[id] = 0.0;
        }
        if(ix==GHOST_CELL_PADDING && mpi_tasks>1){ //first real row
        if(ix==ilo && mpi_tasks>1){ //first real row
            if ((hij + zij < dem[rx + ncols + iy]) && (h_arr[rx + ncols + iy] < EPS12))
            {
              qy_arr[id] = 0.0;
            }

        }else{
          if(ix==nrows - GHOST_CELL_PADDING - 1 && mpi_tasks>1){ //last real row
          if(ix==ihi-1 && mpi_tasks>1){ //last real row
            if ((hij + zij < dem[rx - ncols + iy]) && (h_arr[rx - ncols + iy] < EPS12))                    {
              qy_arr[id] = 0.0;
            }
@@ -816,31 +844,21 @@ namespace Kernels
  {
    triton::parallel_for( AUTO_LABEL() , size , KOKKOS_LAMBDA (int id) {

      // After MPI exchange, the halo array contains:
      // - Rows 0 to 3N-1: received top halo (from prev rank's bottom interior edge)
      // - Rows 3N to 9N-1: old data (not updated by exchange)
      // - Rows 9N to 12N-1: received bottom halo (from next rank's top interior edge)
      //
      // We should ONLY copy received data to ghost cells in the main arrays.
      // Do NOT overwrite interior edge cells with old data!

      int index =  id + 2*ncols*GHOST_CELL_PADDING;
      if(id < ncols*GHOST_CELL_PADDING)
      {
        // First half: copy received top halo to top ghost cells only
        int index = id;
        index = id;
      }
      
      h_arr[id]  = halo[index + 0*GHOST_CELL_PADDING*ncols];
      qx_arr[id] = halo[index + 1*GHOST_CELL_PADDING*ncols];
      qy_arr[id] = halo[index + 2*GHOST_CELL_PADDING*ncols];

        // Copy received bottom halo to bottom ghost cells
        // Bottom ghost cells are at rows (nrows-N) to (nrows-1)
        int temp = id + (nrows - GHOST_CELL_PADDING)*ncols;
        h_arr[temp]  = halo[index + 9*GHOST_CELL_PADDING*ncols];
        qx_arr[temp] = halo[index + 10*GHOST_CELL_PADDING*ncols];
        qy_arr[temp] = halo[index + 11*GHOST_CELL_PADDING*ncols];
      }
      // Second half (id >= N*ncols) was incorrectly copying old interior edge data
      // to interior edge positions. This is now skipped.
      int temp = id + (nrows - 2*GHOST_CELL_PADDING)*ncols;

      h_arr[temp]  = halo[index + 6*GHOST_CELL_PADDING*ncols];
      qx_arr[temp] = halo[index + 7*GHOST_CELL_PADDING*ncols];
      qy_arr[temp] = halo[index + 8*GHOST_CELL_PADDING*ncols];

    });
  }
@@ -1115,10 +1133,10 @@ namespace Kernels
      int iy = (ii % ncols);  //col id

      bool
      is_top = (ix == GHOST_CELL_PADDING),
      is_btm = (ix == nrows - GHOST_CELL_PADDING - 1),
      is_lt = (iy == GHOST_CELL_PADDING),
      is_rt = (iy == ncols - GHOST_CELL_PADDING - 1);
      is_top = (ix == 1),
      is_btm = (ix == nrows - 2),
      is_lt = (iy == 1),
      is_rt = (iy == ncols - 2);
			
			//if (!(rank == 0 && is_top) || !is_lt || !is_rt || !(rank == total_process - 1 && is_btm))
		if (!((rank == 0 && is_top) || is_lt || is_rt || (rank == total_process - 1 && is_btm)))
@@ -1388,4 +1406,3 @@ namespace Kernels
}

#endif
 No newline at end of file
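
The kernels above now take optional row bounds with a -1 sentinel for ihi; a minimal,
self-contained illustration of the pattern (the names below are illustrative, not the
real kernel signatures):

    #include <vector>

    constexpr int G = 1;  // stands in for GHOST_CELL_PADDING

    // Illustrative only: mirrors the ilo/ihi sentinel-default pattern of the kernels.
    template <typename T>
    void apply_interior(int nrows, int ncols, std::vector<T>& field,
                        int ilo = G, int ihi = -1)
    {
      if (ihi < 0) ihi = nrows - G;             // default: full interior in rows
      for (int ix = ilo; ix < ihi; ++ix)        // rows restricted to [ilo, ihi)
        for (int iy = G; iy < ncols - G; ++iy)  // columns always exclude ghost cells
          field[ix * ncols + iy] += T(0);       // placeholder update
    }

    // apply_interior(rows, cols, h);                // whole interior (defaults)
    // apply_interior(rows, cols, h, G, rows - G);   // the constant bounds used here
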
+233 −212
@@ -191,8 +191,15 @@ namespace Triton
    
/** @brief This function is used to compute next state. All main computation is done inside this function.
*
*  @param ilo Lower row bound (inclusive) for overlap tiling
*  @param ihi Upper row bound (exclusive) for overlap tiling
*/
    void compute_new_state();
    void compute_new_state(int ilo, int ihi);

/** @brief This function performs the halo exchange after a tile of substeps completes.
*
*/
    void perform_halo_exchange();

/** @brief This function is used to get observation data to host. 
*
@@ -1979,10 +1986,8 @@ namespace Triton

  MPI_Barrier(ENSIFY_COMM_WORLD);


  st.start(SIMULATION_TIME);

		
  T xll = dem.get_xll_corner();
  T yll = dem.get_yll_corner();
  T cellsize = dem.get_cell_size();
@@ -1997,7 +2002,6 @@ namespace Triton
    out.write_observation_data(host_obs_h, host_obs_qx, host_obs_qy, simtime, arglist.print_option);
  }


  global_dt = arglist.time_step;
  average_dt = 0.0;
  local_dt = arglist.time_step;
@@ -2011,44 +2015,63 @@ namespace Triton
    print_id=simtime/arglist.print_interval;
  }

  // Validation for overlap tiling
  if (arglist.overlap_tiling && GHOST_CELL_PADDING < 1) {
    std::cerr << ERROR "overlap_tiling requires GHOST_CELL_PADDING >= 1" << std::endl;
    exit(EXIT_FAILURE);
  }

  if (arglist.overlap_tiling && GHOST_CELL_PADDING == 1 && rank == 0) {
    std::cerr << WARN "overlap_tiling with GHOST_CELL_PADDING=1: m=1 (no communication reduction)" << std::endl;
  }

  // Domain bounds (constant for all substeps)
  const int ilo = GHOST_CELL_PADDING;
  const int ihi = rows - GHOST_CELL_PADDING;

  // Substep counter for halo exchange scheduling
  int substep_count = 0;

  while (simtime < arglist.sim_duration)
  {
      it_count++;
      it_count_average++;

    // Compute dt EVERY substep for b4b correctness
    if (!arglist.time_increment_fixed)
    {
      compute_local_dt();
      compute_global_dt(print_id);
    }
      compute_new_state();

    it_count++;
    it_count_average++;

    // Call unified compute_new_state with CONSTANT bounds
    compute_new_state(ilo, ihi);

    simtime += global_dt;
    average_dt += global_dt;
    substep_count++;

	if(it_count%arglist.it_print == 0){
		if(rank==0){
			std::cerr << INFO " Time: " << simtime << "\tdt: " << average_dt/it_count_average << "\tit: " << it_count << std::endl;
    // Status print
    if (it_count % arglist.it_print == 0 && rank == 0) {
      std::cerr << INFO " Time: " << simtime << "\tdt: " << average_dt/it_count_average
                << "\tit: " << it_count << std::endl;
      it_count_average = 0;
      average_dt = 0.0;
    }
	}


    // Observation output
    if (arglist.time_series_flag && simtime - last_print_obs_time >= arglist.print_observation) {
    	// variable to print observation data files  
      last_print_obs_time += arglist.print_observation;
		//transfer only the observation data points to the host
      st.start(COMPUTE_TIME);
      get_observation_data_to_host();
      st.stop(COMPUTE_TIME);

      st.start(IO_TIME);
      out.write_observation_data(host_obs_h, host_obs_qx, host_obs_qy, simtime, arglist.print_option);
		//out.write_observation_data(host_vec[OBSH], host_vec[OBSQX], host_vec[OBSQY], simtime, arglist.print_option);
      st.stop(IO_TIME);
    }

    // File output
    if (simtime >= arglist.print_interval * (print_id + 1))
    {
      print_id++;
@@ -2088,13 +2111,18 @@ namespace Triton
        new_domain_decomposition();
        st.stop(RESIZE_TIME);
      }

    }

    // Halo exchange: every step for baseline, every GHOST_CELL_PADDING steps for overlap tiling
    bool do_halo = !arglist.overlap_tiling || (substep_count >= GHOST_CELL_PADDING);
    if (do_halo && size > 1) {
      perform_halo_exchange();
      substep_count = 0;  // Reset counter after halo exchange
    }
  }
    //write the last state in the observation data files

  // Final observation write
  if (arglist.time_series_flag) {  
	//transfer only the observation data points to the host
    st.start(COMPUTE_TIME);
    get_observation_data_to_host();
    st.stop(COMPUTE_TIME);
@@ -2104,7 +2132,6 @@ namespace Triton
    st.stop(IO_TIME);
  }


  st.stop(SIMULATION_TIME);
  st.stop(TOTAL_TIME);
  
@@ -2114,8 +2141,6 @@ namespace Triton
    std::cerr << INFO " Time: " << simtime << "\tdt: " << average_dt/it_count_average << "\tit: " << it_count << std::endl;
    std::cerr << OK "Simulation ends" << std::endl;
  }


}


@@ -2187,21 +2212,25 @@ namespace Triton


  template<typename T>
  void triton<T>::compute_new_state()
void triton<T>::compute_new_state(int ilo, int ihi)
{
  st.start(COMPUTE_TIME);

  Kernels::flux_x(rows*cols, rows, cols, cell_size, global_dt,
  device_vec[H], device_vec[QX], device_vec[QY], device_vec[DEM], device_vec[SQRTH],
    device_vec[RHSH0], device_vec[RHSH1], device_vec[RHSQX0], device_vec[RHSQX1], device_vec[RHSQY0], device_vec[RHSQY1], arglist.hextra);
  device_vec[RHSH0], device_vec[RHSH1], device_vec[RHSQX0], device_vec[RHSQX1], device_vec[RHSQY0], device_vec[RHSQY1],
  arglist.hextra, ilo, ihi);

  // Use same bounds for flux_y as flux_x
  Kernels::flux_y(rows*cols, rows, cols, cell_size, global_dt,
  device_vec[H], device_vec[QX], device_vec[QY], device_vec[DEM], device_vec[SQRTH],
    device_vec[RHSH0], device_vec[RHSH1], device_vec[RHSQX0], device_vec[RHSQX1], device_vec[RHSQY0], device_vec[RHSQY1], arglist.hextra);
  device_vec[RHSH0], device_vec[RHSH1], device_vec[RHSQX0], device_vec[RHSQX1], device_vec[RHSQY0], device_vec[RHSQY1],
  arglist.hextra, ilo, ihi);

  Kernels::update_cells(rows*cols, rows, cols, global_dt,
  device_vec[H], device_vec[QX], device_vec[QY], device_vec[DEM], device_vec[NMAN],
    device_vec[RHSH0], device_vec[RHSH1], device_vec[RHSQX0], device_vec[RHSQX1], device_vec[RHSQY0], device_vec[RHSQY1], arglist.hextra);
  device_vec[RHSH0], device_vec[RHSH1], device_vec[RHSQX0], device_vec[RHSQX1], device_vec[RHSQY0], device_vec[RHSQY1],
  arglist.hextra, ilo, ihi);

  if (arglist.num_runoffs > 0)
  {
@@ -2244,46 +2273,41 @@ namespace Triton
    cell_size, global_dt, simtime, idx_low, idx_high, device_vec[H], device_vec_int[SRCP]);
  }


  if(arglist.open_boundaries){

    Kernels::copy_info_to_exterior_boundaries_west_east(2*rows*GHOST_CELL_PADDING, rows, cols, device_vec[H], device_vec[QX], device_vec[QY]);
    Kernels::copy_info_to_exterior_boundaries_north_south(cols*GHOST_CELL_PADDING, rows, cols, device_vec[H], device_vec[QX], device_vec[QY],rank, size);

  }

  if (num_of_extbc > 0 && num_extbc_cells > 0)
  {
    Kernels::compute_extbc_values(num_extbc_cells, rows, cols, global_dt, device_vec[H], device_vec[QX], device_vec[QY], device_vec[DEM], device_vec[NMAN], device_vec_int[BCRELATIVEINDEX], device_vec_int[BCTYPE], device_vec_int[BCINDEXSTART], device_vec_int[BCNROWSVARS], device_vec[EXTBCV1], device_vec[EXTBCV2], simtime, rank, size);

  }

    Kernels::wet_dry(rows*cols, rows, cols, global_dt, device_vec[H], device_vec[QX], device_vec[QY], device_vec[DEM], device_vec[MAXH], arglist.hextra,size);
  Kernels::wet_dry(rows*cols, rows, cols, global_dt, device_vec[H], device_vec[QX], device_vec[QY], device_vec[DEM], device_vec[MAXH], arglist.hextra, size, ilo, ihi);

  st.stop(COMPUTE_TIME);
}


    if (size > 1)
template<typename T>
void triton<T>::perform_halo_exchange()
{
      Kernels::halo_copy_from_gpu(2 * cols*GHOST_CELL_PADDING, rows, cols, device_vec[H], device_vec[QX], device_vec[QY], device_vec[HALO]);
  if (size <= 1) return;

      if (arglist.gpu_direct_flag)
      {
  Kernels::halo_copy_from_gpu(2 * cols*GHOST_CELL_PADDING, rows, cols,
      device_vec[H], device_vec[QX], device_vec[QY], device_vec[HALO]);

  if (arglist.gpu_direct_flag) {
    gpuStreamSynchronize(streams);
        st.stop(COMPUTE_TIME);

    st.start(BALANCING_MPI_TIME);
    st.start(MPI_TIME);
    MpiUtils::exchange(device_vec[HALO], 12*GHOST_CELL_PADDING, cols, rank, size, USE_HALO);
    st.stop(MPI_TIME);
    st.stop(BALANCING_MPI_TIME);

        st.start(COMPUTE_TIME);
      }
      else
      {
  } else {
    gpuMemcpyAsync(host_vec[HALO], device_vec[HALO], nbytes_halo, gpuMemcpyDeviceToHost, streams);
    gpuStreamSynchronize(streams);
        st.stop(COMPUTE_TIME);

    st.start(BALANCING_MPI_TIME);
    st.start(MPI_TIME);
@@ -2291,20 +2315,17 @@ namespace Triton
    st.stop(MPI_TIME);
    st.stop(BALANCING_MPI_TIME);

        st.start(COMPUTE_TIME);
    gpuMemcpyAsync(device_vec[HALO], host_vec[HALO], nbytes_halo, gpuMemcpyHostToDevice, streams);
  }

      Kernels::halo_copy_to_gpu(2 * cols*GHOST_CELL_PADDING, rows, cols, device_vec[H], device_vec[QX], device_vec[QY], device_vec[HALO]);

      Kernels::wet_dry_qy_halo(2 * cols*GHOST_CELL_PADDING, rows, cols, device_vec[H], device_vec[QY], device_vec[DEM], arglist.hextra);
  Kernels::halo_copy_to_gpu(2 * cols*GHOST_CELL_PADDING, rows, cols,
      device_vec[H], device_vec[QX], device_vec[QY], device_vec[HALO]);

  Kernels::wet_dry_qy_halo(2 * cols*GHOST_CELL_PADDING, rows, cols,
      device_vec[H], device_vec[QY], device_vec[DEM], arglist.hextra);
}


    st.stop(COMPUTE_TIME);
  }

	template<typename T>
	void triton<T>::get_observation_data_to_host()
	{