Loading nixos/tests/slurm.nix +22 −17 Original line number Diff line number Diff line Loading @@ -49,6 +49,16 @@ let mkdir -p $out/bin ${lib.getDev pkgs.mpi}/bin/mpicc ${mpitestC} -o $out/bin/mpitest ''; sbatchOutput = "/tmp/shared/sbatch.log"; sbatchScript = pkgs.writeText "sbatchScript" '' #!${pkgs.runtimeShell} #SBATCH --nodes 1 #SBATCH --ntasks 1 #SBATCH --output ${sbatchOutput} echo "sbatch success" ''; in { name = "slurm"; Loading Loading @@ -127,43 +137,38 @@ in }; testScript = '' start_all() # Make sure DBD is up after DB initialzation with subtest("can_start_slurmdbd"): dbd.succeed("systemctl restart slurmdbd") dbd.wait_for_unit("slurmdbd.service") dbd.wait_for_open_port(6819) # there needs to be an entry for the current # cluster in the database before slurmctld is restarted with subtest("add_account"): control.succeed("sacctmgr -i add cluster default") # check for cluster entry control.succeed("sacctmgr list cluster | awk '{ print $1 }' | grep default") with subtest("can_start_slurmctld"): control.succeed("systemctl restart slurmctld") with subtest("cluster_is_initialized"): control.wait_for_unit("multi-user.target") control.wait_for_unit("slurmctld.service") control.wait_until_succeeds("sacctmgr list cluster | awk '{ print $1 }' | grep default") start_all() with subtest("can_start_slurmd"): for node in [node1, node2, node3]: node.succeed("systemctl restart slurmd.service") node.wait_for_unit("slurmd") # Test that the cluster works and can distribute jobs; submit.wait_for_unit("multi-user.target") with subtest("run_distributed_command"): # Run `hostname` on 3 nodes of the partition (so on all the 3 nodes). # The output must contain the 3 different names submit.succeed("srun -N 3 hostname | sort | uniq | wc -l | xargs test 3 -eq") with subtest("check_slurm_dbd"): with subtest("check_slurm_dbd_job"): # find the srun job from above in the database control.succeed("sleep 5") control.succeed("sacct | grep hostname") control.wait_until_succeeds("sacct | grep hostname") with subtest("run_PMIx_mpitest"): submit.succeed("srun -N 3 --mpi=pmix mpitest | grep size=3") with subtest("run_sbatch"): submit.succeed("sbatch --wait ${sbatchScript}") submit.succeed("grep 'sbatch success' ${sbatchOutput}") ''; } Loading
nixos/tests/slurm.nix +22 −17 Original line number Diff line number Diff line Loading @@ -49,6 +49,16 @@ let mkdir -p $out/bin ${lib.getDev pkgs.mpi}/bin/mpicc ${mpitestC} -o $out/bin/mpitest ''; sbatchOutput = "/tmp/shared/sbatch.log"; sbatchScript = pkgs.writeText "sbatchScript" '' #!${pkgs.runtimeShell} #SBATCH --nodes 1 #SBATCH --ntasks 1 #SBATCH --output ${sbatchOutput} echo "sbatch success" ''; in { name = "slurm"; Loading Loading @@ -127,43 +137,38 @@ in }; testScript = '' start_all() # Make sure DBD is up after DB initialzation with subtest("can_start_slurmdbd"): dbd.succeed("systemctl restart slurmdbd") dbd.wait_for_unit("slurmdbd.service") dbd.wait_for_open_port(6819) # there needs to be an entry for the current # cluster in the database before slurmctld is restarted with subtest("add_account"): control.succeed("sacctmgr -i add cluster default") # check for cluster entry control.succeed("sacctmgr list cluster | awk '{ print $1 }' | grep default") with subtest("can_start_slurmctld"): control.succeed("systemctl restart slurmctld") with subtest("cluster_is_initialized"): control.wait_for_unit("multi-user.target") control.wait_for_unit("slurmctld.service") control.wait_until_succeeds("sacctmgr list cluster | awk '{ print $1 }' | grep default") start_all() with subtest("can_start_slurmd"): for node in [node1, node2, node3]: node.succeed("systemctl restart slurmd.service") node.wait_for_unit("slurmd") # Test that the cluster works and can distribute jobs; submit.wait_for_unit("multi-user.target") with subtest("run_distributed_command"): # Run `hostname` on 3 nodes of the partition (so on all the 3 nodes). # The output must contain the 3 different names submit.succeed("srun -N 3 hostname | sort | uniq | wc -l | xargs test 3 -eq") with subtest("check_slurm_dbd"): with subtest("check_slurm_dbd_job"): # find the srun job from above in the database control.succeed("sleep 5") control.succeed("sacct | grep hostname") control.wait_until_succeeds("sacct | grep hostname") with subtest("run_PMIx_mpitest"): submit.succeed("srun -N 3 --mpi=pmix mpitest | grep size=3") with subtest("run_sbatch"): submit.succeed("sbatch --wait ${sbatchScript}") submit.succeed("grep 'sbatch success' ${sbatchOutput}") ''; }