From 334fb61c65ab19ab4f341d53bbcb850cf029f16b Mon Sep 17 00:00:00 2001 From: Wes Brewer Date: Fri, 5 Sep 2025 10:52:47 -0400 Subject: [PATCH 1/3] Rename run-multi-part subcommand to run-parts. Throw warning when running with single partition. --- README.md | 31 ++++++++++++++----------------- main.py | 4 ++-- raps/run_sim.py | 18 +++++++++++++----- 3 files changed, 29 insertions(+), 24 deletions(-) diff --git a/README.md b/README.md index eeb5a5d..e49e08b 100644 --- a/README.md +++ b/README.md @@ -62,21 +62,20 @@ For MIT Supercloud python -m raps.dataloaders.mit_supercloud.cli download --start 2021-05-21T13:00 --end 2021-05-21T14:00 # Load data and run simulation - will save data as part-cpu.npz and part-gpu.npz files - raps run-multi-part -x 'mit_supercloud/*' -f $DPATH --system mit_supercloud \ - --start 2021-05-21T13:00 --end 2021-05-21T14:00 + raps run-parts -x mit_supercloud -f $DPATH --system mit_supercloud --start 2021-05-21T13:00 --end 2021-05-21T14:00 # Note: if no start, end dates provided will default to run 24 hours between # 2021-05-21T00:00 to 2021-05-22T00:00 set by defaults in raps/dataloaders/mit_supercloud/utils.py # Re-run simulation using npz files (much faster load) - raps run-multi-part -x mit_supercloud/* -f part-*.npz --system mit_supercloud + raps run-parts -x mit_supercloud -f part-*.npz --system mit_supercloud # Synthetic tests for verification studies: - raps run-multi-part -x 'mit_supercloud/*' -w multitenant + raps run-parts -x mit_supercloud -w multitenant For Lumi - # Synthetic test for lumi multi-part-sim: - raps run-multi-part -x lumi/* + # Synthetic test for Lumi: + raps run-parts -x lumi ## Perform Network Simulation @@ -93,7 +92,6 @@ given instead of the parquet files for more quickly running subsequent simulatio raps run -f jobs_2024-02-20_12-20-39.npz - ## Cooling models We provide several cooling models in the repo https://code.ornl.gov/exadigit/POWER9CSM @@ -111,23 +109,21 @@ use `--cooling` or `-c` argument. 
e.g., ## Support for multiple system partitions -Multi-partition systems are supported by running the `multi-part-sim.py` script, where a list of configurations can be specified using the `-x` flag as follows: +Multi-partition systems are supported by running the `raps run-parts ...` command, where a list of partitions can be specified using the `-x` flag as follows: - raps run-multi-part -x setonix/part-cpu setonix/part-gpu + raps run-parts -x setonix/part-cpu setonix/part-gpu or simply: - raps run-multi-part -x setonix/* # bash - - raps run-multi-part -x 'setonix/*' # zsh + raps run-parts -x setonix This will simulate synthetic workloads on two partitions as defined in `config/setonix-cpu` and `config/setonix-gpu`. To replay telemetry workloads from another system, e.g., Marconi100's PM100 dataset, first create a .npz snapshot of the telemetry data, e.g., - raps run-multi-part --system marconi100 -f /path/to/marconi100/job_table.parquet + raps run-parts --system marconi100 -f /path/to/marconi100/job_table.parquet -This will dump a .npz file with a randomized name, e.g. ac23db.npz. Let's rename this file to pm100.npz for clarity. Note: can control-C when the simulation starts. Now, this pm100.npz file can be used with `multi-part-sim.py` as follows: +This will dump a .npz file with a randomized name, e.g. ac23db.npz. Let's rename this file to pm100.npz for clarity. Note: can control-C when the simulation starts. Now, this pm100.npz file can be used as follows: - raps run-multi-part -x setonix/* -f pm100.npz --arrival poisson --scale 192 + raps run-parts -x setonix -f pm100.npz --arrival poisson --scale 192 ## Modifications to telemetry replay There are three ways to modify replaying of telemetry data: 1. `--arrival`. Changing the arrival time distribution - replay cases will default to `--arrival prescribed`, where the jobs will be submitted exactly as they were submitted on the physical machine. 
This can be changed to `--arrival poisson` to change when the jobs arrive, which is especially useful in cases where there may be gaps in time, e.g., when the system goes down for several days, or the system is is underutilized. python main.py -f $DPATH/slurm/joblive/$DATEDIR,$DPATH/jobprofile/$DATEDIR --arrival poisson -2. `--policy`. Changing the way the jobs are scheduled. The `--policy` flag will be set by default to `replay` in cases where a telemetry file is provided, in which case the jobs will be scheduled according to the start times provided. Changing the `--policy` to `fcfs` or `backfill` will use the internal scheduler. -python main.py -f $DPATH/slurm/joblive/$DATEDIR,$DPATH/jobprofile/$DATEDIR --policy fcfs --backfill firstfit -t 12h +2. `--policy`. Changing the way the jobs are scheduled. The `--policy` flag will be set by default to `replay` in cases where a telemetry file is provided, in which case the jobs will be scheduled according to the start times provided. Changing the `--policy` to `fcfs` or `backfill` will use the internal scheduler, e.g.: + + python main.py -f $DPATH/slurm/joblive/$DATEDIR,$DPATH/jobprofile/$DATEDIR --policy fcfs --backfill firstfit -t 12h 3. `--scale`. Changing the scale of each job in the telemetry data. The `--scale` flag will specify the maximum number of nodes for each job (generally set this to the max number of nodes of the smallest partition), and randomly select the number of nodes for each job from one to max nodes. This flag is useful when replaying telemetry from a larger system onto a smaller system. 
diff --git a/main.py b/main.py index 1e52395..7c38960 100644 --- a/main.py +++ b/main.py @@ -3,7 +3,7 @@ ExaDigiT Resource Allocator & Power Simulator (RAPS) """ import argparse from raps.helpers import check_python_version -from raps.run_sim import run_sim_add_parser, run_multi_part_sim_add_parser, show_add_parser +from raps.run_sim import run_sim_add_parser, run_parts_sim_add_parser, show_add_parser from raps.workload import run_workload_add_parser from raps.telemetry import run_telemetry_add_parser @@ -20,7 +20,7 @@ def main(cli_args: list[str] | None = None): subparsers = parser.add_subparsers(required=True) run_sim_add_parser(subparsers) - run_multi_part_sim_add_parser(subparsers) + run_parts_sim_add_parser(subparsers) show_add_parser(subparsers) run_workload_add_parser(subparsers) run_telemetry_add_parser(subparsers) diff --git a/raps/run_sim.py b/raps/run_sim.py index 402acea..5afd6f1 100644 --- a/raps/run_sim.py +++ b/raps/run_sim.py @@ -7,6 +7,7 @@ import json import pandas as pd import sys import yaml +import warnings from pathlib import Path from raps.ui import LayoutManager from raps.plotting import Plotter @@ -73,7 +74,7 @@ def run_sim(sim_config: SimConfig): if sim_config.verbose or sim_config.debug: print(f"SimConfig: {sim_config.model_dump_json(indent=4)}") if len(sim_config.system_configs) > 1: - print("Use run-multi-part to run multi-partition simulations") + print("Use run-parts to run multi-partition simulations") sys.exit(1) engine, workload_data, time_delta = Engine.from_sim_config(sim_config) @@ -221,8 +222,8 @@ def run_sim(sim_config: SimConfig): print("Output directory is: ", out) # If output is enabled, the user wants this information as last output -def run_multi_part_sim_add_parser(subparsers: SubParsers): - parser = subparsers.add_parser("run-multi-part", description=""" +def run_parts_sim_add_parser(subparsers: SubParsers): + parser = subparsers.add_parser("run-parts", description=""" Simulates multi-partition (heterogeneous) systems. 
Supports replaying telemetry or generating synthetic workloads across CPU-only, GPU, and mixed partitions. Initializes per-partition power, FLOPS, and scheduling models, then advances simulations in lockstep. @@ -237,11 +238,18 @@ def run_multi_part_sim_add_parser(subparsers: SubParsers): "cli_shortcuts": shortcuts, }) parser.set_defaults( - impl=lambda args: run_multi_part_sim(model_validate(args, read_yaml(args.config_file))) + impl=lambda args: run_parts_sim(model_validate(args, read_yaml(args.config_file))) ) -def run_multi_part_sim(sim_config: SimConfig): +def run_parts_sim(sim_config: SimConfig): + + if len(sim_config.system_configs) == 1: + warnings.warn( + "run_parts_sim is usually for multiple partitions. Did you mean to run with one?", + UserWarning + ) + multi_engine, workload_results, timestep_start, timestep_end, time_delta = \ MultiPartEngine.from_sim_config(sim_config) -- GitLab From c8d4d84f0510443efe3bf5585e2f30c88875a0a4 Mon Sep 17 00:00:00 2001 From: Wes Brewer Date: Fri, 5 Sep 2025 11:02:32 -0400 Subject: [PATCH 2/3] Update tests to use `run-parts` instead of `run-multi-part` --- tests/smoke.py | 2 +- tests/systems/test_multi_part_sim_basic_run.py | 2 +- tests/systems/test_multi_part_sim_network_run.py | 2 +- tests/systems/test_multi_part_sim_withdata_run.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/smoke.py b/tests/smoke.py index 7548de3..946f6db 100644 --- a/tests/smoke.py +++ b/tests/smoke.py @@ -54,7 +54,7 @@ def synthetic_workload_tests(): def hetero_tests(): """Run heterogeneous workload tests.""" print("Starting heterogeneous workload tests...") - run_command(f"python main.py run-multi-part -x setonix/part-cpu setonix/part-gpu -t {DEFAULT_TIME}") + run_command(f"python main.py run-parts -x setonix/part-cpu setonix/part-gpu -t {DEFAULT_TIME}") def main(): diff --git a/tests/systems/test_multi_part_sim_basic_run.py b/tests/systems/test_multi_part_sim_basic_run.py index 3ea2a9c..9351fd6 100644 --- 
a/tests/systems/test_multi_part_sim_basic_run.py +++ b/tests/systems/test_multi_part_sim_basic_run.py @@ -18,7 +18,7 @@ def test_multi_part_sim_basic_run(system, system_config): os.chdir(PROJECT_ROOT) result = subprocess.run([ - "python", "main.py", "run-multi-part", + "python", "main.py", "run-parts", "--time", "1h", "-x", f"{system}/*", ], capture_output=True, text=True, stdin=subprocess.DEVNULL) diff --git a/tests/systems/test_multi_part_sim_network_run.py b/tests/systems/test_multi_part_sim_network_run.py index aa90cca..c556014 100644 --- a/tests/systems/test_multi_part_sim_network_run.py +++ b/tests/systems/test_multi_part_sim_network_run.py @@ -19,7 +19,7 @@ def test_multi_part_sim_network_run(system, system_config, sim_output): os.chdir(PROJECT_ROOT) result = subprocess.run([ - "python", "main.py", "run-multi-part", + "python", "main.py", "run-parts", "--time", "1h", "-x", f"{system}/*", "--net", diff --git a/tests/systems/test_multi_part_sim_withdata_run.py b/tests/systems/test_multi_part_sim_withdata_run.py index 9694969..f38cf8e 100644 --- a/tests/systems/test_multi_part_sim_withdata_run.py +++ b/tests/systems/test_multi_part_sim_withdata_run.py @@ -20,7 +20,7 @@ def test_multi_part_sim_withdata_run(system, system_config, system_files): os.chdir(PROJECT_ROOT) result = subprocess.run([ - "python", "main.py", "run-multi-part", + "python", "main.py", "run-parts", "--time", "1h", "-x", f"{system}/*", "-f", ','.join(system_files), -- GitLab From 85438eb0767134d0b1a61e0f51f74ab90f1310f7 Mon Sep 17 00:00:00 2001 From: Wes Brewer Date: Fri, 5 Sep 2025 11:08:10 -0400 Subject: [PATCH 3/3] Update README.md for how to run tests for multi-partition systems --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index e49e08b..bccaec1 100644 --- a/README.md +++ b/README.md @@ -179,6 +179,12 @@ RAPS_DATA_DIR=/opt/data pytest -n auto -x By default, tests are parallelized with `pytest-xdist` (`-n auto`) to speed up execution. 
The `-x` flag stops execution after the first failure. Add `-v` to run in verbose mode. +### Run tests on multi-partition systems + +```bash +pytest -v -k "multi_part_sim" +``` + ### Run only network-related tests ```bash -- GitLab