Unverified Commit c4875a44 authored by Markus Kowalewski's avatar Markus Kowalewski Committed by GitHub
Browse files

mpiCheckPhaseHook: add parameters to bypass errors in sandbox (#350112)

parents 968e5b2a 3e08945b
Loading
Loading
Loading
Loading
+2 −2
Original line number Diff line number Diff line
@@ -195,7 +195,7 @@ stdenv.mkDerivation rec {
  doCheck = false;

  doInstallCheck = true;
  nativeCheckInputs = [ mpiCheckPhaseHook ];
  nativeInstallCheckInputs = [ mpiCheckPhaseHook ];
  installCheckPhase = ''
    runHook preInstallCheck

@@ -211,7 +211,7 @@ stdenv.mkDerivation rec {
  meta = with lib; {
    description = "Open Source High-Performance Computational Chemistry";
    mainProgram = "nwchem";
    platforms = [ "x86_64-linux" ];
    platforms = [ "x86_64-linux" "aarch64-linux" ];
    maintainers = with maintainers; [ sheepforce markuskowa ];
    homepage = "https://nwchemgit.github.io";
    license = licenses.ecl20;
+4 −0
Original line number Diff line number Diff line
@@ -2,4 +2,8 @@

makeSetupHook {
  name = "mpi-checkPhase-hook";

  substitutions = {
    topology = ./topology.xml;
  };
} ./mpi-check-hook.sh
+11 −0
Original line number Diff line number Diff line
@@ -44,6 +44,17 @@ setupMpiCheck() {
      # Disable CPU pinning
      export OMPI_MCA_hwloc_base_binding_policy=none
      export PRTE_MCA_hwloc_default_binding_policy=none

      # OpenMPI get confused by the sandbox environment and spew errors like this (both to stdout and stderr):
      #     [hwloc/linux] failed to find sysfs cpu topology directory, aborting linux discovery.
      #     [1729458724.473282] [localhost:78   :0]       tcp_iface.c:893  UCX  ERROR scandir(/sys/class/net) failed: No such file or directory
      # These messages contaminate test output, which makes the difftest to fail.
      # The solution is to use a preset cpu topology file and disable ucx model.

      # Disable sysfs cpu topology directory discovery.
      export PRTE_MCA_hwloc_use_topo_file="@topology@"
      # Use the network model ob1 instead of ucx.
      export OMPI_MCA_pml=ob1
      ;;
    MPICH)
      # Fix to make mpich run in a sandbox
+10 −0
Original line number Diff line number Diff line
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE topology SYSTEM "hwloc2.dtd">
<topology version="2.0">
  <object type="Machine" os_index="0" cpuset="0x00000001" complete_cpuset="0x00000001" allowed_cpuset="0x00000001" nodeset="0x00000001" complete_nodeset="0x00000001" allowed_nodeset="0x00000001" gp_index="1">
    <object type="Core" cpuset="0x00000001" complete_cpuset="0x00000001" nodeset="0x00000001" complete_nodeset="0x00000001" gp_index="2">
      <object type="NUMANode" os_index="0" cpuset="0x00000001" complete_cpuset="0x00000001" nodeset="0x00000001" complete_nodeset="0x00000001" gp_index="4"/>
      <object type="PU" os_index="0" cpuset="0x00000001" complete_cpuset="0x00000001" nodeset="0x00000001" complete_nodeset="0x00000001" gp_index="3"/>
    </object>
  </object>
</topology>
+0 −112
Original line number Diff line number Diff line
diff --git a/src/snes/tutorials/makefile b/src/snes/tutorials/makefile
index fa15faad39e..7670e80931e 100644
--- a/src/snes/tutorials/makefile
+++ b/src/snes/tutorials/makefile
@@ -13,6 +13,7 @@ ex55: ex55.o ex55k.o
 #  these tests are used by the makefile in PETSC_DIR for basic tests of the install and should not be removed
 testex5f: ex5f.PETSc
 	-@${MPIEXEC} -n 1 ${MPIEXEC_TAIL} ./ex5f -snes_rtol 1e-4 > ex5f_1.tmp 2>&1; \
+        sed -i '/hwloc\/linux/d ; /ERROR scandir(\/sys\/class\/net) failed/d ; /ERROR opendir(\/sys\/class\/net) failed/d' ex5f_1.tmp; \
         if (${DIFF} output/ex5f_1.testout ex5f_1.tmp > /dev/null 2>&1) then \
           echo "Fortran example src/snes/tutorials/ex5f run successfully with 1 MPI process"; \
         else \
@@ -25,6 +26,7 @@ testex5f: ex5f.PETSc
         ${MAKE} PETSC_ARCH=${PETSC_ARCH} PETSC_DIR=${PETSC_DIR} ex5f.rm;
 testex19: ex19.PETSc
 	-@${MPIEXEC} -n 1 ${MPIEXEC_TAIL} ./ex19 -da_refine 3 -pc_type mg -ksp_type fgmres  > ex19_1.tmp 2>&1; \
+        sed -i '/hwloc\/linux/d ; /ERROR scandir(\/sys\/class\/net) failed/d ; /ERROR opendir(\/sys\/class\/net) failed/d' ex19_1.tmp; \
         if (${DIFF} output/ex19_1.testout ex19_1.tmp > /dev/null 2>&1) then \
           echo "C/C++ example src/snes/tutorials/ex19 run successfully with 1 MPI process"; \
         else \
@@ -36,6 +38,7 @@ testex19: ex19.PETSc
         ${RM} -f ex19_1.tmp;
 testex19_mpi:
 	-@${MPIEXEC} -n 2 ${MPIEXEC_TAIL} ./ex19 -da_refine 3 -pc_type mg -ksp_type fgmres  > ex19_1.tmp 2>&1; \
+        sed -i '/hwloc\/linux/d ; /ERROR scandir(\/sys\/class\/net) failed/d ; /ERROR opendir(\/sys\/class\/net) failed/d' ex19_1.tmp; \
         if (${DIFF} output/ex19_1.testout ex19_1.tmp > /dev/null 2>&1) then \
           echo "C/C++ example src/snes/tutorials/ex19 run successfully with 2 MPI processes"; \
         else \
@@ -48,6 +51,7 @@ testex19_mpi:
 #use unpreconditioned norm because HYPRE device installations use different AMG parameters
 runex19_hypre:
 	-@${MPIEXEC} -n 2 ${MPIEXEC_TAIL} ./ex19 -da_refine 3 -snes_monitor_short -ksp_norm_type unpreconditioned -pc_type hypre > ex19_1.tmp 2>&1; \
+        sed -i '/hwloc\/linux/d ; /ERROR scandir(\/sys\/class\/net) failed/d ; /ERROR opendir(\/sys\/class\/net) failed/d' ex19_1.tmp; \
           if (${DIFF} output/ex19_hypre.out ex19_1.tmp) then \
             echo "C/C++ example src/snes/tutorials/ex19 run successfully with HYPRE"; \
           else  \
@@ -57,6 +61,7 @@ runex19_hypre:
           ${RM} -f ex19_1.tmp
 runex19_hypre_cuda:
 	-@${MPIEXEC} -n 2 ${MPIEXEC_TAIL} ./ex19 -dm_vec_type cuda -dm_mat_type aijcusparse -da_refine 3 -snes_monitor_short -ksp_norm_type unpreconditioned -pc_type hypre > ex19_1.tmp 2>&1; \
+        sed -i '/hwloc\/linux/d ; /ERROR scandir(\/sys\/class\/net) failed/d ; /ERROR opendir(\/sys\/class\/net) failed/d' ex19_1.tmp; \
 	   if (${DIFF} output/ex19_hypre.out ex19_1.tmp) then \
            echo "C/C++ example src/snes/tutorials/ex19 run successfully with HYPRE/CUDA"; \
            else  \
@@ -66,6 +71,7 @@ runex19_hypre_cuda:
 	   ${RM} -f ex19_1.tmp
 runex19_hypre_hip:
 	-@${MPIEXEC} -n 2 ${MPIEXEC_TAIL} ./ex19 -dm_vec_type hip -da_refine 3 -snes_monitor_short -ksp_norm_type unpreconditioned -pc_type hypre > ex19_1.tmp 2>&1; \
+        sed -i '/hwloc\/linux/d ; /ERROR scandir(\/sys\/class\/net) failed/d ; /ERROR opendir(\/sys\/class\/net) failed/d' ex19_1.tmp; \
 	   if (${DIFF} output/ex19_hypre.out ex19_1.tmp) then \
            echo "C/C++ example src/snes/tutorials/ex19 run successfully with HYPRE/HIP"; \
            else \
@@ -75,6 +81,7 @@ runex19_hypre_hip:
 	   ${RM} -f ex19_1.tmp
 runex19_cuda:
 	-@${MPIEXEC} -n 1 ${MPIEXEC_TAIL} ./ex19 -snes_monitor -dm_mat_type seqaijcusparse -dm_vec_type seqcuda -pc_type gamg -ksp_monitor -mg_levels_ksp_max_it 1 > ex19_1.tmp 2>&1; \
+        sed -i '/hwloc\/linux/d ; /ERROR scandir(\/sys\/class\/net) failed/d ; /ERROR opendir(\/sys\/class\/net) failed/d' ex19_1.tmp; \
 	   if (${DIFF} output/ex19_cuda_1.out ex19_1.tmp) then \
            echo "C/C++ example src/snes/tutorials/ex19 run successfully with CUDA"; \
            else  \
@@ -84,6 +91,7 @@ runex19_cuda:
 	   ${RM} -f ex19_1.tmp
 runex19_ml:
 	-@${MPIEXEC} -n 2 ${MPIEXEC_TAIL} ./ex19 -da_refine 3 -snes_monitor_short -pc_type ml > ex19_1.tmp 2>&1; \
+        sed -i '/hwloc\/linux/d ; /ERROR scandir(\/sys\/class\/net) failed/d ; /ERROR opendir(\/sys\/class\/net) failed/d' ex19_1.tmp; \
 	   if (${DIFF} output/ex19_ml.out ex19_1.tmp) then  \
            echo "C/C++ example src/snes/tutorials/ex19 run successfully with ML"; \
            else \
@@ -93,6 +101,7 @@ runex19_ml:
            ${RM} -f ex19_1.tmp
 runex19_fieldsplit_mumps:
 	-@${MPIEXEC} -n 2 ${MPIEXEC_TAIL} ./ex19 -pc_type fieldsplit -pc_fieldsplit_block_size 4 -pc_fieldsplit_type SCHUR -pc_fieldsplit_0_fields 0,1,2 -pc_fieldsplit_1_fields 3 -fieldsplit_0_pc_type lu -fieldsplit_1_pc_type lu -snes_monitor_short -ksp_monitor_short  -fieldsplit_0_pc_factor_mat_solver_type mumps -fieldsplit_1_pc_factor_mat_solver_type mumps > ex19_6.tmp 2>&1; \
+        sed -i '/hwloc\/linux/d ; /ERROR scandir(\/sys\/class\/net) failed/d ; /ERROR opendir(\/sys\/class\/net) failed/d' ex19_6.tmp; \
 	   if (${DIFF} output/ex19_fieldsplit_5.out ex19_6.tmp) then  \
            echo "C/C++ example src/snes/tutorials/ex19 run successfully with MUMPS"; \
            else  \
@@ -102,6 +111,7 @@ runex19_fieldsplit_mumps:
            ${RM} -f ex19_6.tmp
 runex19_superlu_dist:
 	-@${MPIEXEC} -n 1 ${MPIEXEC_TAIL} ./ex19 -da_grid_x 20 -da_grid_y 20 -pc_type lu -pc_factor_mat_solver_type superlu_dist > ex19.tmp 2>&1; \
+        sed -i '/hwloc\/linux/d ; /ERROR scandir(\/sys\/class\/net) failed/d ; /ERROR opendir(\/sys\/class\/net) failed/d' ex19.tmp; \
 	   if (${DIFF} output/ex19_superlu.out ex19.tmp) then \
            echo "C/C++ example src/snes/tutorials/ex19 run successfully with SuperLU_DIST"; \
            else  \
@@ -111,6 +121,7 @@ runex19_superlu_dist:
 	   ${RM} -f ex19.tmp
 runex19_suitesparse:
 	-@${MPIEXEC} -n 1 ${MPIEXEC_TAIL} ./ex19 -da_refine 3 -snes_monitor_short -pc_type lu -pc_factor_mat_solver_type umfpack > ex19_1.tmp 2>&1; \
+        sed -i '/hwloc\/linux/d ; /ERROR scandir(\/sys\/class\/net) failed/d ; /ERROR opendir(\/sys\/class\/net) failed/d' ex19_1.tmp; \
 	   if (${DIFF} output/ex19_suitesparse.out ex19_1.tmp) then \
            echo "C/C++ example src/snes/tutorials/ex19 run successfully with SuiteSparse"; \
            else \
@@ -120,6 +131,7 @@ runex19_suitesparse:
 	   ${RM} -f ex19_1.tmp
 runex3k_kokkos: ex3k.PETSc
 	-@OMP_PROC_BIND=false ${MPIEXEC} -n 2 ${MPIEXEC_TAIL} ./ex3k -view_initial -dm_vec_type kokkos -dm_mat_type aijkokkos -use_gpu_aware_mpi 0 -snes_monitor > ex3k_1.tmp 2>&1 ;\
+        sed -i '/hwloc\/linux/d ; /ERROR scandir(\/sys\/class\/net) failed/d ; /ERROR opendir(\/sys\/class\/net) failed/d' ex3k_1.tmp; \
 	if (${DIFF} output/ex3k_1.out ex3k_1.tmp) then \
           echo "C/C++ example src/snes/tutorials/ex3k run successfully with Kokkos Kernels"; \
         else \
diff --git a/src/vec/vec/tests/makefile b/src/vec/vec/tests/makefile
index d1f047820ec..aab400535dd 100644
--- a/src/vec/vec/tests/makefile
+++ b/src/vec/vec/tests/makefile
@@ -5,6 +5,7 @@ include ${PETSC_DIR}/lib/petsc/conf/rules
 
 runex47: ex47.PETSc
 	-@H5OUT=`mktemp -t petsc.h5.XXXXXX`; ${MPIEXEC} -n 1 ${MPIEXEC_TAIL} ./ex47 -filename $${H5OUT} > ex47_1.tmp 2>&1; \
+       sed -i '/hwloc\/linux/d ; /ERROR scandir(\/sys\/class\/net) failed/d ; /ERROR opendir(\/sys\/class\/net) failed/d' ex47_1.tmp; \
 	   if (${DIFF} output/ex47_1.out ex47_1.tmp) then \
              echo "C/C++ example src/vec/vec/tests/ex47 run successfully with HDF5"; \
            else \
Loading