Commit 0c5883bb authored by qbisi's avatar qbisi
Browse files

mpiCheckPhaseHook: add parameters to bypass errors in sandbox

parent f8f18110
Loading
Loading
Loading
Loading
+4 −0
Original line number Diff line number Diff line
@@ -2,4 +2,8 @@

# Setup hook definition: calls makeSetupHook with the attribute set below and
# the script ./mpi-check-hook.sh.
makeSetupHook {
  name = "mpi-checkPhase-hook";

  # Declares a `topology` substitution that points at the bundled
  # ./topology.xml. The hook script refers to it through the "@topology@"
  # placeholder when exporting PRTE_MCA_hwloc_use_topo_file.
  # NOTE(review): presumably the placeholder is replaced with the file's store
  # path when the hook is built -- confirm against makeSetupHook.
  substitutions = {
    topology = ./topology.xml;
  };
} ./mpi-check-hook.sh
+11 −0
Original line number Diff line number Diff line
@@ -44,6 +44,17 @@ setupMpiCheck() {
      # Disable CPU pinning
      export OMPI_MCA_hwloc_base_binding_policy=none
      export PRTE_MCA_hwloc_default_binding_policy=none

      # OpenMPI gets confused by the sandbox environment and spews errors like these (to both stdout and stderr):
      #     [hwloc/linux] failed to find sysfs cpu topology directory, aborting linux discovery.
      #     [1729458724.473282] [localhost:78   :0]       tcp_iface.c:893  UCX  ERROR scandir(/sys/class/net) failed: No such file or directory
      # These messages contaminate the test output, which makes diff-based tests fail.
      # The solution is to use a preset CPU topology file and to disable the ucx model.

      # Disable sysfs cpu topology directory discovery.
      export PRTE_MCA_hwloc_use_topo_file="@topology@"
      # Use the network model ob1 instead of ucx.
      export OMPI_MCA_pml=ob1
      ;;
    MPICH)
      # Fix to make mpich run in a sandbox
+10 −0
Original line number Diff line number Diff line
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE topology SYSTEM "hwloc2.dtd">
<topology version="2.0">
  <object type="Machine" os_index="0" cpuset="0x00000001" complete_cpuset="0x00000001" allowed_cpuset="0x00000001" nodeset="0x00000001" complete_nodeset="0x00000001" allowed_nodeset="0x00000001" gp_index="1">
    <object type="Core" cpuset="0x00000001" complete_cpuset="0x00000001" nodeset="0x00000001" complete_nodeset="0x00000001" gp_index="2">
      <object type="NUMANode" os_index="0" cpuset="0x00000001" complete_cpuset="0x00000001" nodeset="0x00000001" complete_nodeset="0x00000001" gp_index="4"/>
      <object type="PU" os_index="0" cpuset="0x00000001" complete_cpuset="0x00000001" nodeset="0x00000001" complete_nodeset="0x00000001" gp_index="3"/>
    </object>
  </object>
</topology>