diff --git a/README.md b/README.md index eeb5a5dc389ed1e403512804c19ca6e0a00bac17..bccaec147779664d9910348fab2ebbb23dd33b2f 100644 --- a/README.md +++ b/README.md @@ -62,21 +62,20 @@ For MIT Supercloud python -m raps.dataloaders.mit_supercloud.cli download --start 2021-05-21T13:00 --end 2021-05-21T14:00 # Load data and run simulation - will save data as part-cpu.npz and part-gpu.npz files - raps run-multi-part -x 'mit_supercloud/*' -f $DPATH --system mit_supercloud \ - --start 2021-05-21T13:00 --end 2021-05-21T14:00 + raps run-parts -x mit_supercloud -f $DPATH --system mit_supercloud --start 2021-05-21T13:00 --end 2021-05-21T14:00 # Note: if no start, end dates provided will default to run 24 hours between # 2021-05-21T00:00 to 2021-05-22T00:00 set by defaults in raps/dataloaders/mit_supercloud/utils.py # Re-run simulation using npz files (much faster load) - raps run-multi-part -x mit_supercloud/* -f part-*.npz --system mit_supercloud + raps run-parts -x mit_supercloud -f part-*.npz --system mit_supercloud # Synthetic tests for verification studies: - raps run-multi-part -x 'mit_supercloud/*' -w multitenant + raps run-parts -x mit_supercloud -w multitenant For Lumi - # Synthetic test for lumi multi-part-sim: - raps run-multi-part -x lumi/* + # Synthetic test for Lumi: + raps run-parts -x lumi ## Perform Network Simulation @@ -93,7 +92,6 @@ given instead of the parquet files for more quickly running subsequent simulatio raps run -f jobs_2024-02-20_12-20-39.npz - ## Cooling models We provide several cooling models in the repo https://code.ornl.gov/exadigit/POWER9CSM @@ -111,23 +109,21 @@ use `--cooling` or `-c` argument. 
e.g., ## Support for multiple system partitions -Multi-partition systems are supported by running the `multi-part-sim.py` script, where a list of configurations can be specified using the `-x` flag as follows: +Multi-partition systems are supported by running the `raps run-parts ...` command, where a list of partitions can be specified using the `-x` flag as follows: - raps run-multi-part -x setonix/part-cpu setonix/part-gpu + raps run-parts -x setonix/part-cpu setonix/part-gpu or simply: - raps run-multi-part -x setonix/* # bash - - raps run-multi-part -x 'setonix/*' # zsh + raps run-parts -x setonix This will simulate synthetic workloads on two partitions as defined in `config/setonix-cpu` and `config/setonix-gpu`. To replay telemetry workloads from another system, e.g., Marconi100's PM100 dataset, first create a .npz snapshot of the telemetry data, e.g., - raps run-multi-part --system marconi100 -f /path/to/marconi100/job_table.parquet + raps run-parts --system marconi100 -f /path/to/marconi100/job_table.parquet -This will dump a .npz file with a randomized name, e.g. ac23db.npz. Let's rename this file to pm100.npz for clarity. Note: can control-C when the simulation starts. Now, this pm100.npz file can be used with `multi-part-sim.py` as follows: +This will dump a .npz file with a randomized name, e.g. ac23db.npz. Let's rename this file to pm100.npz for clarity. Note: can control-C when the simulation starts. Now, this pm100.npz file can be used as follows: - raps run-multi-part -x setonix/* -f pm100.npz --arrival poisson --scale 192 + raps run-parts -x setonix -f pm100.npz --arrival poisson --scale 192 ## Modifications to telemetry replay @@ -135,9 +131,10 @@ There are three ways to modify replaying of telemetry data: 1. `--arrival`. Changing the arrival time distribution - replay cases will default to `--arrival prescribed`, where the jobs will be submitted exactly as they were submitted on the physical machine.
This can be changed to `--arrival poisson` to change when the jobs arrive, which is especially useful in cases where there may be gaps in time, e.g., when the system goes down for several days, or the system is is underutilized. python main.py -f $DPATH/slurm/joblive/$DATEDIR,$DPATH/jobprofile/$DATEDIR --arrival poisson -2. `--policy`. Changing the way the jobs are scheduled. The `--policy` flag will be set by default to `replay` in cases where a telemetry file is provided, in which case the jobs will be scheduled according to the start times provided. Changing the `--policy` to `fcfs` or `backfill` will use the internal scheduler. -python main.py -f $DPATH/slurm/joblive/$DATEDIR,$DPATH/jobprofile/$DATEDIR --policy fcfs --backfill firstfit -t 12h +2. `--policy`. Changing the way the jobs are scheduled. The `--policy` flag will be set by default to `replay` in cases where a telemetry file is provided, in which case the jobs will be scheduled according to the start times provided. Changing the `--policy` to `fcfs` or `backfill` will use the internal scheduler, e.g.: + + python main.py -f $DPATH/slurm/joblive/$DATEDIR,$DPATH/jobprofile/$DATEDIR --policy fcfs --backfill firstfit -t 12h 3. `--scale`. Changing the scale of each job in the telemetry data. The `--scale` flag will specify the maximum number of nodes for each job (generally set this to the max number of nodes of the smallest partition), and randomly select the number of nodes for each job from one to max nodes. This flag is useful when replaying telemetry from a larger system onto a smaller system. @@ -182,6 +179,12 @@ RAPS_DATA_DIR=/opt/data pytest -n auto -x By default, tests are parallelized with `pytest-xdist` (`-n auto`) to speed up execution. The `-x` flag stops execution after the first failure. Add `-v` to run in verbose mode. 
+### Run tests on multi-partition systems + +```bash +pytest -v -k "multi_part_sim" +``` + ### Run only network-related tests ```bash diff --git a/main.py b/main.py index 1e52395e1bc12f703700dffce8303883209568d6..7c38960d4ae576c2bbb1abcf1c59b2fb7b4d6c44 100644 --- a/main.py +++ b/main.py @@ -3,7 +3,7 @@ ExaDigiT Resource Allocator & Power Simulator (RAPS) """ import argparse from raps.helpers import check_python_version -from raps.run_sim import run_sim_add_parser, run_multi_part_sim_add_parser, show_add_parser +from raps.run_sim import run_sim_add_parser, run_parts_sim_add_parser, show_add_parser from raps.workload import run_workload_add_parser from raps.telemetry import run_telemetry_add_parser @@ -20,7 +20,7 @@ def main(cli_args: list[str] | None = None): subparsers = parser.add_subparsers(required=True) run_sim_add_parser(subparsers) - run_multi_part_sim_add_parser(subparsers) + run_parts_sim_add_parser(subparsers) show_add_parser(subparsers) run_workload_add_parser(subparsers) run_telemetry_add_parser(subparsers) diff --git a/raps/run_sim.py b/raps/run_sim.py index 402aceaff3c7877647f31c7bf8ce8783174aa1f6..5afd6f1823bcbe47f7343f7df041b728a7229644 100644 --- a/raps/run_sim.py +++ b/raps/run_sim.py @@ -7,6 +7,7 @@ import json import pandas as pd import sys import yaml +import warnings from pathlib import Path from raps.ui import LayoutManager from raps.plotting import Plotter @@ -73,7 +74,7 @@ def run_sim(sim_config: SimConfig): if sim_config.verbose or sim_config.debug: print(f"SimConfig: {sim_config.model_dump_json(indent=4)}") if len(sim_config.system_configs) > 1: - print("Use run-multi-part to run multi-partition simulations") + print("Use run-parts to run multi-partition simulations") sys.exit(1) engine, workload_data, time_delta = Engine.from_sim_config(sim_config) @@ -221,8 +222,8 @@ def run_sim(sim_config: SimConfig): print("Output directory is: ", out) # If output is enabled, the user wants this information as last output -def 
run_multi_part_sim_add_parser(subparsers: SubParsers): - parser = subparsers.add_parser("run-multi-part", description=""" +def run_parts_sim_add_parser(subparsers: SubParsers): + parser = subparsers.add_parser("run-parts", description=""" Simulates multi-partition (heterogeneous) systems. Supports replaying telemetry or generating synthetic workloads across CPU-only, GPU, and mixed partitions. Initializes per-partition power, FLOPS, and scheduling models, then advances simulations in lockstep. @@ -237,11 +238,18 @@ def run_multi_part_sim_add_parser(subparsers: SubParsers): "cli_shortcuts": shortcuts, }) parser.set_defaults( - impl=lambda args: run_multi_part_sim(model_validate(args, read_yaml(args.config_file))) + impl=lambda args: run_parts_sim(model_validate(args, read_yaml(args.config_file))) ) -def run_multi_part_sim(sim_config: SimConfig): +def run_parts_sim(sim_config: SimConfig): + + if len(sim_config.system_configs) == 1: + warnings.warn( + "run_parts_sim is usually for multiple partitions. 
Did you mean to run with one?", + UserWarning + ) + multi_engine, workload_results, timestep_start, timestep_end, time_delta = \ MultiPartEngine.from_sim_config(sim_config) diff --git a/tests/smoke.py b/tests/smoke.py index 7548de3982c2ed2c9464aa72ed5420eaeb3fbadd..946f6db94a7bfd8f0f382472e098ef624c817f48 100644 --- a/tests/smoke.py +++ b/tests/smoke.py @@ -54,7 +54,7 @@ def synthetic_workload_tests(): def hetero_tests(): """Run heterogeneous workload tests.""" print("Starting heterogeneous workload tests...") - run_command(f"python main.py run-multi-part -x setonix/part-cpu setonix/part-gpu -t {DEFAULT_TIME}") + run_command(f"python main.py run-parts -x setonix/part-cpu setonix/part-gpu -t {DEFAULT_TIME}") def main(): diff --git a/tests/systems/test_multi_part_sim_basic_run.py b/tests/systems/test_multi_part_sim_basic_run.py index 3ea2a9caa1b6d5b156524a8c6f9b4c2666869382..9351fd6bc83008ca3a8dc9c977af9a0aadf15972 100644 --- a/tests/systems/test_multi_part_sim_basic_run.py +++ b/tests/systems/test_multi_part_sim_basic_run.py @@ -18,7 +18,7 @@ def test_multi_part_sim_basic_run(system, system_config): os.chdir(PROJECT_ROOT) result = subprocess.run([ - "python", "main.py", "run-multi-part", + "python", "main.py", "run-parts", "--time", "1h", "-x", f"{system}/*", ], capture_output=True, text=True, stdin=subprocess.DEVNULL) diff --git a/tests/systems/test_multi_part_sim_network_run.py b/tests/systems/test_multi_part_sim_network_run.py index aa90cca0180db1cbfb4c5b903fbc3bd141670e21..c5560148abb9d74bde8de7b44ddd6a1ca099cd58 100644 --- a/tests/systems/test_multi_part_sim_network_run.py +++ b/tests/systems/test_multi_part_sim_network_run.py @@ -19,7 +19,7 @@ def test_multi_part_sim_network_run(system, system_config, sim_output): os.chdir(PROJECT_ROOT) result = subprocess.run([ - "python", "main.py", "run-multi-part", + "python", "main.py", "run-parts", "--time", "1h", "-x", f"{system}/*", "--net", diff --git a/tests/systems/test_multi_part_sim_withdata_run.py 
b/tests/systems/test_multi_part_sim_withdata_run.py index 969496916b0d21b50e3a8c2286c6435315cd96a8..f38cf8ee35fd2d2188e381974eb179d21b146633 100644 --- a/tests/systems/test_multi_part_sim_withdata_run.py +++ b/tests/systems/test_multi_part_sim_withdata_run.py @@ -20,7 +20,7 @@ def test_multi_part_sim_withdata_run(system, system_config, system_files): os.chdir(PROJECT_ROOT) result = subprocess.run([ - "python", "main.py", "run-multi-part", + "python", "main.py", "run-parts", "--time", "1h", "-x", f"{system}/*", "-f", ','.join(system_files),