Unverified Commit 628f0d33 authored by Eisenhauer, Greg's avatar Eisenhauer, Greg Committed by GitHub
Browse files

Merge pull request #5011 from eisenhauer/mercury-dataplane

Add Mercury data plane for SST
parents a616d288 7b46f489
Loading
Loading
Loading
Loading
+3 −0
Original line number Diff line number Diff line
@@ -488,6 +488,9 @@ if (ADIOS2_HAVE_SST)
  # Each enabled SST data plane appends its name, preceded by a space, to
  # HPCDataPlaneList.
  if (ADIOS2_SST_HAVE_UCX)
     string (APPEND HPCDataPlaneList " UCX")
  endif()
  if (ADIOS2_SST_HAVE_MERCURY)
     string (APPEND HPCDataPlaneList " mercury")
  endif()
  if (ADIOS2_HAVE_MPI)
     string (APPEND HPCDataPlaneList " MPI")
  endif()
+13 −0
Original line number Diff line number Diff line
@@ -579,6 +579,19 @@ if(ADIOS2_USE_SST AND NOT WIN32)
    set(ADIOS2_SST_HAVE_UCX TRUE)
    set(ADIOS2_HAVE_UCX TRUE)
  endif()

  # Mercury
  # AUTO probes quietly; any other true value makes both packages mandatory.
  # A false value skips the search entirely.
  set(_adios2_mercury_find_mode)
  if(ADIOS2_USE_Mercury STREQUAL AUTO)
    set(_adios2_mercury_find_mode QUIET)
  elseif(ADIOS2_USE_Mercury)
    set(_adios2_mercury_find_mode REQUIRED)
  endif()
  if(_adios2_mercury_find_mode)
    find_package(MERCURY 2.0.0 ${_adios2_mercury_find_mode})
    find_package(MARGO ${_adios2_mercury_find_mode})
  endif()
  unset(_adios2_mercury_find_mode)

  # Enable the Mercury data plane only when both packages were found.
  if(MERCURY_FOUND AND MARGO_FOUND)
    set(ADIOS2_SST_HAVE_MERCURY TRUE)
    set(ADIOS2_HAVE_MERCURY TRUE)
  endif()
endif()

# DAOS

cmake/FindMARGO.cmake

0 → 100644
+66 −0
Original line number Diff line number Diff line
# SPDX-FileCopyrightText: 2026 Oak Ridge National Laboratory and Contributors
#
# SPDX-License-Identifier: Apache-2.0

######################################################
# - Try to find Margo library
# Once done this will define
#  MARGO_FOUND - System has Margo
#  MARGO_INCLUDE_DIRS - The Margo include directories
#  MARGO_LIBRARIES - The libraries needed to use Margo
#  margo::margo - Imported target for Margo
#
######################################################

# Locate Margo through pkg-config.
#
# Inputs:
#   MARGO_ROOT (CMake variable, else environment variable) - optional install
#     prefix; it is temporarily prepended to CMAKE_PREFIX_PATH for the
#     pkg_check_modules() call below.
#   BUILD_SHARED_LIBS - selects the shared or the _STATIC pkg-config results.
# Outputs (only when the "margo" module is found):
#   MARGO_INCLUDE_DIRS, MARGO_LIBRARIES, MARGO_DEFINITIONS

# Honor find_package(MARGO QUIET): do not print pkg-config chatter then.
if(MARGO_FIND_QUIETLY)
  set(_MARGO_QUIET QUIET)
else()
  set(_MARGO_QUIET)
endif()

find_package(PkgConfig ${_MARGO_QUIET})
if(PKG_CONFIG_FOUND)
  # Save the caller's settings so they can be restored after the lookup.
  set(_MARGO_CMAKE_PREFIX_PATH ${CMAKE_PREFIX_PATH})
  if(DEFINED PKG_CONFIG_USE_CMAKE_PREFIX_PATH)
    set(_MARGO_PKG_USE_PREFIX_PATH "${PKG_CONFIG_USE_CMAKE_PREFIX_PATH}")
  endif()

  if(MARGO_ROOT)
    list(INSERT CMAKE_PREFIX_PATH 0 "${MARGO_ROOT}")
  elseif(NOT "$ENV{MARGO_ROOT}" STREQUAL "")
    # The environment variable must be expanded with $ENV{...}; a bare
    # ENV{...} is just a literal string in a STREQUAL comparison.
    list(INSERT CMAKE_PREFIX_PATH 0 "$ENV{MARGO_ROOT}")
  endif()
  # Variable read by FindPkgConfig to add CMAKE_PREFIX_PATH entries to the
  # pkg-config search path.
  set(PKG_CONFIG_USE_CMAKE_PREFIX_PATH ON)

  pkg_check_modules(PC_MARGO ${_MARGO_QUIET} margo)

  # Restore the caller's settings.
  set(CMAKE_PREFIX_PATH ${_MARGO_CMAKE_PREFIX_PATH})
  unset(_MARGO_CMAKE_PREFIX_PATH)
  if(DEFINED _MARGO_PKG_USE_PREFIX_PATH)
    set(PKG_CONFIG_USE_CMAKE_PREFIX_PATH "${_MARGO_PKG_USE_PREFIX_PATH}")
    unset(_MARGO_PKG_USE_PREFIX_PATH)
  else()
    unset(PKG_CONFIG_USE_CMAKE_PREFIX_PATH)
  endif()

  if(PC_MARGO_FOUND)
    # Pick the PC_MARGO_* or PC_MARGO_STATIC_* variable family.
    if(BUILD_SHARED_LIBS)
      set(_PC_TYPE)
    else()
      set(_PC_TYPE _STATIC)
    endif()
    set(MARGO_INCLUDE_DIRS ${PC_MARGO${_PC_TYPE}_INCLUDE_DIRS})
    set(MARGO_LIBRARIES ${PC_MARGO${_PC_TYPE}_LINK_LIBRARIES})
    set(MARGO_DEFINITIONS ${PC_MARGO${_PC_TYPE}_CFLAGS_OTHER})
    unset(_PC_TYPE)
  endif()
endif()
unset(_MARGO_QUIET)

# Set MARGO_FOUND from MARGO_LIBRARIES and report the result.
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(MARGO DEFAULT_MSG MARGO_LIBRARIES)

# Publish the result as the margo::margo imported target, created only once.
# Each usage requirement is set only when its source variable is non-empty.
if(MARGO_FOUND AND NOT TARGET margo::margo)
  add_library(margo::margo INTERFACE IMPORTED)

  if(MARGO_INCLUDE_DIRS)
    set_property(TARGET margo::margo PROPERTY
      INTERFACE_INCLUDE_DIRECTORIES "${MARGO_INCLUDE_DIRS}")
  endif()

  if(MARGO_DEFINITIONS)
    set_property(TARGET margo::margo PROPERTY
      INTERFACE_COMPILE_OPTIONS "${MARGO_DEFINITIONS}")
  endif()

  if(MARGO_LIBRARIES)
    set_property(TARGET margo::margo PROPERTY
      INTERFACE_LINK_LIBRARIES "${MARGO_LIBRARIES}")
  endif()
endif()

mark_as_advanced(MARGO_INCLUDE_DIRS MARGO_LIBRARIES)
+74 −0
Original line number Diff line number Diff line
# SPDX-FileCopyrightText: 2026 Oak Ridge National Laboratory and Contributors
#
# SPDX-License-Identifier: Apache-2.0

######################################################
# - Try to find Mercury RPC library
# Once done this will define
#  MERCURY_FOUND - System has Mercury
#  MERCURY_INCLUDE_DIRS - The Mercury include directories
#  MERCURY_LIBRARIES - The libraries needed to use Mercury
#  mercury::mercury - Imported target for Mercury
#
######################################################

# First attempt: Mercury's own CMake package configuration.  The version
# requested from find_package(MERCURY ...) is forwarded, and MERCURY_ROOT
# (CMake variable, then environment variable) is offered as a search hint.
find_package(mercury ${MERCURY_FIND_VERSION} QUIET CONFIG
  HINTS ${MERCURY_ROOT} $ENV{MERCURY_ROOT}
  PATH_SUFFIXES share/cmake/mercury lib/cmake/mercury
)

if(mercury_FOUND)
  # Config mode succeeded: translate its result into the MERCURY_* variables
  # and the mercury::mercury target documented at the top of this file.
  set(MERCURY_FOUND TRUE)

  if(TARGET mercury)
    # get_target_property() yields "<var>-NOTFOUND" when the property is
    # unset; publish an empty value instead of that sentinel.
    get_target_property(MERCURY_INCLUDE_DIRS mercury INTERFACE_INCLUDE_DIRECTORIES)
    if(NOT MERCURY_INCLUDE_DIRS)
      set(MERCURY_INCLUDE_DIRS "")
    endif()

    # The target's INTERFACE_LINK_LIBRARIES lists only what Mercury itself
    # links against, not the Mercury library.  Linking the target brings in
    # the library together with those dependencies, so the target name is
    # what users of MERCURY_LIBRARIES need.
    set(MERCURY_LIBRARIES mercury)

    # Provide the namespaced name through a thin INTERFACE wrapper rather
    # than an ALIAS: aliasing a non-GLOBAL imported target is only supported
    # by CMake >= 3.18.
    if(NOT TARGET mercury::mercury)
      add_library(mercury::mercury INTERFACE IMPORTED)
      set_target_properties(mercury::mercury PROPERTIES
        INTERFACE_LINK_LIBRARIES mercury
      )
    endif()
  endif()
else()
  # Fallback to manual search for the header and the library.
  find_path(MERCURY_INCLUDE_DIR
    NAMES mercury.h
    HINTS
      ${MERCURY_ROOT}
      $ENV{MERCURY_ROOT}
    PATH_SUFFIXES include
  )

  find_library(MERCURY_LIBRARY
    NAMES mercury
    HINTS
      ${MERCURY_ROOT}
      $ENV{MERCURY_ROOT}
    PATH_SUFFIXES lib lib64
  )

  set(MERCURY_INCLUDE_DIRS ${MERCURY_INCLUDE_DIR})
  set(MERCURY_LIBRARIES ${MERCURY_LIBRARY})

  # NOTE(review): nothing in this fallback sets MERCURY_VERSION, so a
  # requested minimum version presumably cannot be verified here - confirm.
  include(FindPackageHandleStandardArgs)
  find_package_handle_standard_args(MERCURY
    REQUIRED_VARS MERCURY_LIBRARY MERCURY_INCLUDE_DIR
    VERSION_VAR MERCURY_VERSION
  )

  # Wrap the located header directory and library in an imported target.
  if(MERCURY_FOUND AND NOT TARGET mercury::mercury)
    add_library(mercury::mercury INTERFACE IMPORTED)
    set_target_properties(mercury::mercury PROPERTIES
      INTERFACE_INCLUDE_DIRECTORIES "${MERCURY_INCLUDE_DIRS}"
      INTERFACE_LINK_LIBRARIES "${MERCURY_LIBRARIES}"
    )
  endif()
endif()

mark_as_advanced(MERCURY_INCLUDE_DIR MERCURY_LIBRARY)
+35 −14
Original line number Diff line number Diff line
@@ -157,8 +157,8 @@ the underlying network communication mechanism to use for exchanging
data in SST.  Generally this is chosen by SST based upon what is
available on the current platform.  However, specifying this engine
parameter allows overriding SST's choice.  Current allowed values are
**"UCX"**, **"MPI"**, **"RDMA"**, and **"WAN"**.  (**ib** and **fabric** are accepted as
equivalent to **RDMA** and **evpath** is equivalent to **WAN**.)
**"UCX"**, **"MPI"**, **"RDMA"**, **"Mercury"**, and **"WAN"**.  (**ib** and **fabric**
are accepted as equivalent to **RDMA** and **evpath** is equivalent to **WAN**.)
Generally both the reader and writer should be using the same network
transport, and the network transport chosen may be dictated by the
situation.  For example, the RDMA transport generally operates only
@@ -176,7 +176,19 @@ must be a data transport known to EVPath, such as **"sockets"**,
be using the same EVPath-level data transport.  This value is
interpreted by both SST Writer and Reader engines.

8. ``ControlTransport``: Default **tcp**.  This string value specifies
8. ``MercuryProtocol``: Default **"tcp"**.  If the SST **DataTransport**
parameter is **"Mercury"**, this string value specifies the underlying
Mercury NA (Network Abstraction) plugin and protocol string to use, in
the form that Mercury accepts.  Typical values are **"cxi://"**
(Slingshot/HPE-Cray interconnects, as found on systems such as Frontier and Perlmutter),
**"ofi+tcp"** (libfabric over TCP), **"ofi+verbs"** (InfiniBand via
libfabric), **"bmi+tcp"**, and **"na+sm"** (shared memory). If this
parameter is not set, SST falls back to the ``SST_MERCURY_PROTOCOL``
environment variable, and then to **"tcp"**.  Generally both the reader
and writer should use the same protocol.  This value is interpreted by
both SST Writer and Reader engines.

9. ``ControlTransport``: Default **tcp**.  This string value specifies
the underlying network communication mechanism to use for performing
control operations in SST.  SST can be configured to standard TCP
sockets, which are very reliable and efficient, but which are limited
@@ -188,7 +200,7 @@ equivalent to **scalable**. Generally both the reader and writer
should be using the same control transport.  This value is interpreted
by both SST Writer and Reader engines.

9. ``NetworkInterface``: Default **NULL**.  In situations in which
10. ``NetworkInterface``: Default **NULL**.  In situations in which
there are multiple possible network interfaces available to SST, this
string value specifies which should be used to generate SST's contact
information for writers.  Generally this should *NOT* be specified
@@ -201,14 +213,14 @@ will result in SST generating contact information that uses the
network address associated with the loopback interface (127.0.0.1).
This value is interpreted only by the SST Writer engine.

10. ``ControlInterface``: Default **NULL**.  This value is similar to the
11. ``ControlInterface``: Default **NULL**.  This value is similar to the
NetworkInterface parameter, but only applies to the SST layer which does
messaging for control (open, close, flow and timestep management, but not
actual data transfer).  Generally the NetworkInterface parameter can be used
to control this, but that also applies to the Data Plane.  Use
ControlInterface in the event of conflicting specifications.

11. ``DataInterface``: Default **NULL**.  This value is similar to the
12. ``DataInterface``: Default **NULL**.  This value is similar to the
NetworkInterface parameter, but only applies to the SST layer which does
messaging for data transfer, not control (open, close, flow and timestep
management).  Generally the NetworkInterface parameter can be used to
@@ -216,7 +228,7 @@ control this, but that also applies to the Control Plane. Use DataInterface
in the event of conflicting specifications.  In the case of the RDMA data
plane, this parameter controls the libfabric interface choice.

12. ``FirstTimestepPrecious``: Default **FALSE**.
13. ``FirstTimestepPrecious``: Default **FALSE**.
FirstTimestepPrecious is a boolean parameter that affects the queueing
of the first timestep presented to the SST Writer engine. If
FirstTimestepPrecious is **TRUE**, then the first timestep is
@@ -232,7 +244,7 @@ other reader-side operations (like requesting the LatestAvailable
timestep in Engine parameters) might still cause the timestep to be skipped.
This value is interpreted only by the SST Writer engine.

13. ``AlwaysProvideLatestTimestep``: Default **FALSE**.
14. ``AlwaysProvideLatestTimestep``: Default **FALSE**.
AlwaysProvideLatestTimestep is a boolean parameter that affects what
of the available timesteps will be provided to the reader engine.  If
AlwaysProvideLatestTimestep is **TRUE**, then if there are multiple
@@ -240,14 +252,14 @@ timesteps available to the reader, older timesteps will be skipped and
the reader will see only the newest available upon BeginStep.
This value is interpreted only by the SST Reader engine.

14. ``OpenTimeoutSecs``: Default **60**.  OpenTimeoutSecs is an integer
15. ``OpenTimeoutSecs``: Default **60**.  OpenTimeoutSecs is an integer
parameter that specifies the number of seconds SST is to wait for a peer
connection on Open().  Currently this is only implemented on the Reader side
of SST, and is a timeout for locating the contact information file created
by Writer-side Open, not for completing the entire Open() handshake.
Currently this value is interpreted only by the SST Reader engine.

15. ``SpeculativePreloadMode``: Default **AUTO**.  In some
16. ``SpeculativePreloadMode``: Default **AUTO**.  In some
circumstances, SST eagerly sends all data from writers to every
reader without first waiting for read requests.  Generally this
improves performance if every reader needs all the data, but can be
@@ -260,14 +272,14 @@ is less than or equal to the value of the ``SpecAutoNodeThreshold``
engine parameter (Default value 1), eager sending is initiated.
Currently this value is interpreted only by the SST Reader engine.

16.  ``SpecAutoNodeThreshold``:  Default **1**.  If the size of the
17.  ``SpecAutoNodeThreshold``:  Default **1**.  If the size of the
reader cohort is less than or equal to this value *and* the
``SpeculativePreloadMode`` parameter is **AUTO**, SST will initiate
eager data sending of all data from each writer to all readers.
Currently this value is interpreted only by the SST Reader engine.


17. ``StepDistributionMode``: Default **"AllToAll"**.  This value
18. ``StepDistributionMode``: Default **"AllToAll"**.  This value
controls how steps are distributed, particularly when there are
multiple readers.  By default, the value is **"AllToAll"**, which
means that all timesteps are to be delivered to all readers (subject
@@ -288,8 +300,9 @@ BeginStep timeouts) and writer-side rules (like queue limit behavior) apply.
  QueueLimit                    integer               **0** (no queue limits)
  QueueFullPolicy               string                **Block**, Discard
  ReserveQueueLimit             integer               **0** (no queue limits)
  DataTransport                 string                **default varies by platform**, UCX, MPI, RDMA, WAN
  DataTransport                 string                **default varies by platform**, UCX, MPI, RDMA, Mercury, WAN
  WANDataTransport              string                **sockets**, enet, ib
  MercuryProtocol               string                **tcp**, cxi://, ofi+tcp, ofi+verbs, bmi+tcp, na+sm
  ControlTransport              string                **TCP**, Scalable
  MarshalMethod                 string                **BP5**, BP, FFS
  NetworkInterface              string                **NULL**
@@ -325,3 +338,11 @@ Additionally to the above controls, finetuning for technical parameters of the d
    * ``SST_UCX_PROGRESS_THREAD``: Some setups of UCX may require progress threads. Unlike in the RDMA data plane, progress threads are not opened automatically, but must be requested by defining this environment variable as either ``1``, ``on`` or ``yes``. Any other value as well as no definition of the environment variable at all will not create progress threads.

    * Tip: For restricting UCX to communication with shared memory only, set ``UCX_TLS=shm``. Progress threads must be used on the writer and reader side. It might be necessary to additionally set ``UCX_POSIX_USE_PROC_LINK=n`` on some systems.

* By the **Mercury data plane**:

    * ``SST_MERCURY_PROTOCOL``: Fallback for the ``MercuryProtocol`` engine parameter. If ``MercuryProtocol`` is not set on the engine, this environment variable is consulted before falling back to the default of ``tcp``. Useful when the same binary needs to run on different interconnects without rebuilding or re-parameterizing.

    * On Slingshot/CXI systems (e.g. Frontier, Perlmutter), the Mercury data plane automatically supplies an OFI auth key to Mercury's NA layer when ``SLINGSHOT_VNIS`` is defined in the environment (set by the batch system). No user action is required; specifying ``MercuryProtocol=cxi://`` (or the env-var fallback) is sufficient. Without the auth key, RDMA bulk transfers fail with ``EXDEV``.

    * Platform guidance: use ``cxi://`` on HPE-Cray Slingshot systems, ``ofi+verbs`` or ``ofi+tcp`` on InfiniBand / Ethernet clusters, and ``na+sm`` for single-node shared-memory testing. Generally the same protocol string should be used on both writer and reader.
 No newline at end of file
Loading