Unverified Commit 2b1f3244 authored by Markus Kowalewski's avatar Markus Kowalewski Committed by GitHub
Browse files

nixos/slurm: update test, make more reliable (#432787)

parents f3494061 122e2919
Loading
Loading
Loading
Loading
+22 −17
Original line number Diff line number Diff line
@@ -49,6 +49,16 @@ let
      mkdir -p $out/bin
      ${lib.getDev pkgs.mpi}/bin/mpicc ${mpitestC} -o $out/bin/mpitest
    '';

  sbatchOutput = "/tmp/shared/sbatch.log";
  sbatchScript = pkgs.writeText "sbatchScript" ''
    #!${pkgs.runtimeShell}
    #SBATCH --nodes 1
    #SBATCH --ntasks 1
    #SBATCH --output ${sbatchOutput}

    echo "sbatch success"
  '';
in
{
  name = "slurm";
@@ -127,43 +137,38 @@ in
    };

  testScript = ''
    start_all()

    # Make sure DBD is up after DB initialzation
    with subtest("can_start_slurmdbd"):
        dbd.succeed("systemctl restart slurmdbd")
        dbd.wait_for_unit("slurmdbd.service")
        dbd.wait_for_open_port(6819)

    # there needs to be an entry for the current
    # cluster in the database before slurmctld is restarted
    with subtest("add_account"):
        control.succeed("sacctmgr -i add cluster default")
        # check for cluster entry
        control.succeed("sacctmgr list cluster | awk '{ print $1 }' | grep default")

    with subtest("can_start_slurmctld"):
        control.succeed("systemctl restart slurmctld")
    with subtest("cluster_is_initialized"):
        control.wait_for_unit("multi-user.target")
        control.wait_for_unit("slurmctld.service")
        control.wait_until_succeeds("sacctmgr list cluster | awk '{ print $1 }' | grep default")

    start_all()

    with subtest("can_start_slurmd"):
        for node in [node1, node2, node3]:
            node.succeed("systemctl restart slurmd.service")
            node.wait_for_unit("slurmd")

    # Test that the cluster works and can distribute jobs;
    submit.wait_for_unit("multi-user.target")

    with subtest("run_distributed_command"):
        # Run `hostname` on 3 nodes of the partition (so on all the 3 nodes).
        # The output must contain the 3 different names
        submit.succeed("srun -N 3 hostname | sort | uniq | wc -l | xargs test 3 -eq")

        with subtest("check_slurm_dbd"):
        with subtest("check_slurm_dbd_job"):
            # find the srun job from above in the database
            control.succeed("sleep 5")
            control.succeed("sacct | grep hostname")
            control.wait_until_succeeds("sacct | grep hostname")

    with subtest("run_PMIx_mpitest"):
        submit.succeed("srun -N 3 --mpi=pmix mpitest | grep size=3")

    with subtest("run_sbatch"):
        submit.succeed("sbatch --wait ${sbatchScript}")
        submit.succeed("grep 'sbatch success' ${sbatchOutput}")
  '';
}