From 0a3daab7f99a7bedd0dbbdc63f4451c01eb33b72 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Wed, 20 Aug 2025 18:37:51 -0400 Subject: [PATCH 01/23] Refactoring to multi-partition configs --- multi-part-sim-mpi.py | 18 ++------- multi-part-sim.py | 15 +++---- raps/config.py | 80 +++++++++++++++++++++++++++++++------ tests/test_system_config.py | 16 +++++++- 4 files changed, 91 insertions(+), 38 deletions(-) diff --git a/multi-part-sim-mpi.py b/multi-part-sim-mpi.py index fed038e..f57b0e6 100644 --- a/multi-part-sim-mpi.py +++ b/multi-part-sim-mpi.py @@ -15,7 +15,7 @@ from raps.power import PowerManager, compute_node_power from raps.flops import FLOPSManager from raps.engine import Engine from raps.ui import LayoutManager -from raps.config import get_system_config, CONFIG_PATH +from raps.config import get_partition_configs from args import args import random import os @@ -29,20 +29,10 @@ def main(): rank = comm.Get_rank() size = comm.Get_size() - # 1) Expand “partitions” (on rank 0) if the user used a glob: - if rank == 0: - partition_names = args.partitions - if '*' in partition_names[0]: - paths = glob.glob(os.path.join(CONFIG_PATH, partition_names[0])) - partition_names = [os.path.join(*p.split(os.sep)[-2:]) for p in paths] - else: - partition_names = None - - # 2) Broadcast the final list of partition_names to everyone - partition_names = comm.bcast(partition_names, root=0) - # 3) Load configs for every partition (all ranks do this) - configs = [get_system_config(p).get_legacy() for p in partition_names] + multi_config = get_partition_configs(args.partitions) + partition_names = multi_config.partition_names + configs = [c.get_legacy() for c in multi_config.partitions] args_dicts = [{**vars(args), 'config': cfg} for cfg in configs] # 4) Each rank decides which partition‐indices it owns (round-robin): diff --git a/multi-part-sim.py b/multi-part-sim.py index 350462e..0905671 100644 --- a/multi-part-sim.py +++ b/multi-part-sim.py @@ -16,7 +16,7 @@ from raps.power import PowerManager, compute_node_power from raps.flops import FLOPSManager from raps.engine import Engine from raps.ui import LayoutManager -from raps.config import get_system_config, CONFIG_PATH +from raps.config import get_partition_configs from raps.args import args import random import os @@ -26,16 +26,11 @@ check_python_version() # Load configurations for each partition -partition_names = args.partitions +multi_config = get_partition_configs(args.partitions) +partition_names = multi_config.partition_names +configs = [c.get_legacy() for c in multi_config.partitions] +args.system = multi_config.system_name -print(args.partitions) -if '*' in args.partitions[0]: - paths = glob.glob(os.path.join(CONFIG_PATH, args.partitions[0].replace("'", ""))) - partition_names = [os.path.join(*p.split(os.sep)[-2:]) for p in paths] - - args.system = partition_names[0].split(os.sep)[0] - -configs = [get_system_config(partition).get_legacy() for partition in partition_names] args_dicts = [ {**vars(args), 'config': config, 'partition': partition_names[i]} for i, config in enumerate(configs) diff --git a/raps/config.py b/raps/config.py index 4f9f709..57ae4e9 100644 --- a/raps/config.py +++ b/raps/config.py @@ -1,8 +1,8 @@ -import os, functools -from typing import Any, Literal +import os, functools, glob, fnmatch, re +from typing import Any, Literal, Annotated as A from pathlib import Path import yaml -from pydantic import BaseModel, computed_field, model_validator +from pydantic import BaseModel, computed_field, model_validator, field_validator ROOT_DIR = Path(__file__).parent.parent CONFIG_PATH = Path(os.environ.get("RAPS_CONFIG", ROOT_DIR / 'config')).resolve() @@ -156,6 +156,8 @@ class SystemNetworkConfig(BaseModel): class SystemConfig(BaseModel): system_name: str + """ Name of the system, defaults to the yaml file name """ + system: SystemSystemConfig power: SystemPowerConfig scheduler: SystemSchedulerConfig @@ -189,6 +191,21 @@ class SystemConfig(BaseModel): config_dict['config'] = self return config_dict +class MultiPartitionSystemConfig(BaseModel): + system_name: str + partitions: list[SystemConfig] + + @field_validator("partitions") + def _validate_partitions(cls, partitions: list[SystemConfig]): + partition_names = [c.system_name for c in partitions] + if len(set(partition_names)) != len(partition_names): + raise ValueError(f"Duplicate system names: {','.join(partition_names)}") + return partitions + + @property + def partition_names(self): + return [c.system_name for c in self.partitions] + @functools.cache def list_systems() -> list[str]: @@ -206,18 +223,55 @@ def get_system_config(system: str) -> SystemConfig: system can either be a path to a custom .yaml file, or the name of one of the pre-configured systems defined in RAPS_CONFIG. """ - config_path = Path(system.removesuffix(".yaml") + ".yaml") - if config_path.exists() or config_path.is_absolute(): - system_name = config_path.resolve() - else: # assume it's a pre-configured system - system_name = system.removesuffix(".yaml") - config_path = CONFIG_PATH / config_path + if system in list_systems(): + config_path = CONFIG_PATH / f"{system}.yaml" + system_name = system + else: + config_path = Path(system).resolve() + system_name = config_path.stem + if not config_path.is_file(): - raise FileNotFoundError( - f'"{system}" not found. Known systems are: {list_systems()}' - ) + raise FileNotFoundError(f'"{system}" not found. Valid systems are: {list_systems()}') config = { - "system_name": system_name, + "system_name": system_name, # You can override system_name in the yaml as well **yaml.safe_load(config_path.read_text()), } return SystemConfig.model_validate(config) + + +def get_partition_configs(partitions: list[str]) -> MultiPartitionSystemConfig: + """ + Resolves multiple partition config files. Can pass globs, or directories to include all yaml + files under the directory. + """ + systems = list_systems() + multi_partition_systems = set(s.split("/")[0] for s in systems if "/" in s) + combined_system_name = [] + + parsed_configs: list[SystemConfig] = [] + for pat in partitions: + if pat in multi_partition_systems: + matched_systems = fnmatch.filter(systems, f"{pat}/*") + combined_system_name.append(pat) + elif fnmatch.filter(systems, pat): + matched_systems = fnmatch.filter(systems, pat) + combined_system_name.extend(s.split("/")[0] for s in matched_systems) + elif Path(pat).is_dir(): + matched_systems = Path(pat).glob("*.yaml") + combined_system_name.append(Path(pat).name) + else: + matched_systems = glob.glob(pat) + combined_system_name.extend(Path(s).stem for s in matched_systems) + + if not matched_systems: + raise FileNotFoundError(f'No config files match "{pat}"') + parsed_configs.extend(get_system_config(s) for s in sorted(matched_systems)) + + if len(parsed_configs) == 1: + combined_system_name = parsed_configs[0].system_name + else: + combined_system_name = "+".join(dict.fromkeys(combined_system_name)) # dedup, keep order + return MultiPartitionSystemConfig( + system_name = combined_system_name, + partitions = parsed_configs, + ) diff --git a/tests/test_system_config.py b/tests/test_system_config.py index 74280df..9074954 100644 --- a/tests/test_system_config.py +++ b/tests/test_system_config.py @@ -1,5 +1,5 @@ import pytest -from raps.config import list_systems, get_system_config +from raps.config import list_systems, get_system_config, CONFIG_PATH, get_partition_configs @pytest.mark.parametrize("system_name", list_systems()) def test_configs(system_name): @@ -8,3 +8,17 @@ def test_configs(system_name): assert config.system_name == system_name assert config.get_legacy()['system_name'] == system_name assert config.get_legacy()['config'] == config + + +@pytest.mark.parametrize("input,expected_name,expected_configs", [ + (["lumi"], "lumi", ["lumi/lumi-c", "lumi/lumi-g"]), + (["lumi/*"], "lumi", ["lumi/lumi-c", "lumi/lumi-g"]), + (["frontier", "summit"], "frontier+summit", ["frontier", "summit"]), + # test passing arbitrary paths + ([str(CONFIG_PATH / "lumi")], "lumi", ["lumi-c", "lumi-g"]), + ([str(CONFIG_PATH / "lumi/lumi-*")], "lumi-c+lumi-g", ["lumi-c", "lumi-g"]), +]) +def test_get_partition_configs(input, expected_name, expected_configs): + result = get_partition_configs(input) + assert result.system_name == expected_name + assert result.partition_names == expected_configs -- GitLab From 628be337afb25d698702c2d6d334168601588baf Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Thu, 21 Aug 2025 07:54:44 -0400 Subject: [PATCH 02/23] Update minimum python to 3.12 --- README.md | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index f46e23a..eaee18e 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ Instructions for setup and usage are given below. An online documentation of Exa ## Setup environment -Note: Requires python3.11 or greater. +Note: Requires python3.12 or greater. pip install -e . diff --git a/pyproject.toml b/pyproject.toml index d46a621..b7fbb99 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ build-backend = "hatchling.build" [project] name = "raps" version = "0.0.1" -requires-python = ">=3.11" +requires-python = ">=3.12" description = "RAPS" readme = "README.md" # license = {file = "LICENSE.txt"} -- GitLab From 07615bf922422ff0fafec8b04b9a2806f1e11991 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Thu, 21 Aug 2025 13:01:54 -0400 Subject: [PATCH 03/23] Move test --- tests/{ => unit}/test_system_config.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/{ => unit}/test_system_config.py (100%) diff --git a/tests/test_system_config.py b/tests/unit/test_system_config.py similarity index 100% rename from tests/test_system_config.py rename to tests/unit/test_system_config.py -- GitLab From f86db4ea0730d3644ae3d29d7d4f2441fba6ba09 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Thu, 21 Aug 2025 10:55:06 -0400 Subject: [PATCH 04/23] Parse args using pydantic --- README.md | 14 +- multi-part-sim-mpi.py | 6 +- multi-part-sim.py | 8 +- raps/args.py | 576 +++++++++--------- raps/dataloaders/adastraMI250.py | 2 +- raps/dataloaders/frontier.py | 4 +- raps/dataloaders/lassen.py | 8 +- raps/dataloaders/marconi100.py | 2 +- raps/telemetry.py | 6 +- raps/utils.py | 134 ++-- raps/workload.py | 75 --- scripts/marconi100-day51.sh | 8 +- scripts/meta_run.sh | 2 +- tests/systems/test_main_network_run.py | 2 +- .../systems/test_main_network_withdata_run.py | 2 +- tests/systems/test_main_time_delta_run.py | 7 +- .../test_main_time_delta_sub_second_run.py | 11 +- tests/systems/test_main_time_ff_delta_run.py | 2 +- .../systems/test_multi_part_sim_basic_run.py | 1 - .../test_multi_part_sim_network_run.py | 3 +- tests/unit/test_utils.py | 39 ++ 21 files changed, 441 insertions(+), 471 deletions(-) create mode 100644 tests/unit/test_utils.py diff --git a/README.md b/README.md index eaee18e..665a64a 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ Note: Requires python3.12 or greater. # Frontier DATEDIR="date=2024-01-18" DPATH=~/data/frontier-sample-2024-01-18 - python main.py -f $DPATH/slurm/joblive/$DATEDIR $DPATH/jobprofile/$DATEDIR + python main.py -f $DPATH/slurm/joblive/$DATEDIR,$DPATH/jobprofile/$DATEDIR ## Open Telemetry dataset @@ -46,7 +46,7 @@ For Adastra MI250 supercomputer, download 'AdastaJobsMI250_15days.parquet' from For Google cluster trace v2 - python main.py --system gcloudv2 -f ~/data/gcloud/v2/google_cluster_data_2011_sample -ff 600 + python main.py --system gcloudv2 -f ~/data/gcloud/v2/google_cluster_data_2011_sample --ff 600 # analyze dataset python -m raps.telemetry --system gcloudv2 -f ~/data/gcloud/v2/google_cluster_data_2011_sample -v @@ -83,7 +83,7 @@ For Lumi Lassen is one of the few datasets that has networking data. See `raps/dataloaders/lassen.py` for how to get the datasets. To run a network simulation, use the following command: - python main.py -f ~/data/lassen/Lassen-Supercomputer-Job-Dataset --system lassen --policy fcfs --backfill firstfit -ff 365d -t 12h --arrival poisson -net + python main.py -f ~/data/lassen/Lassen-Supercomputer-Job-Dataset --system lassen --policy fcfs --backfill firstfit --ff 365d -t 12h --arrival poisson --net ## Snapshot of extracted workload data @@ -140,10 +140,10 @@ This will dump a .npz file with a randomized name, e.g. ac23db.npz. Let's rename There are three ways to modify replaying of telemetry data: 1. `--arrival`. Changing the arrival time distribution - replay cases will default to `--arrival prescribed`, where the jobs will be submitted exactly as they were submitted on the physical machine. This can be changed to `--arrival poisson` to change when the jobs arrive, which is especially useful in cases where there may be gaps in time, e.g., when the system goes down for several days, or the system is is underutilized. -python main.py -f $DPATH/slurm/joblive/$DATEDIR $DPATH/jobprofile/$DATEDIR --arrival poisson +python main.py -f $DPATH/slurm/joblive/$DATEDIR,$DPATH/jobprofile/$DATEDIR --arrival poisson 2. `--policy`. Changing the way the jobs are scheduled. The `--policy` flag will be set by default to `replay` in cases where a telemetry file is provided, in which case the jobs will be scheduled according to the start times provided. Changing the `--policy` to `fcfs` or `backfill` will use the internal scheduler. -python main.py -f $DPATH/slurm/joblive/$DATEDIR $DPATH/jobprofile/$DATEDIR --policy fcfs --backfill firstfit -t 12h +python main.py -f $DPATH/slurm/joblive/$DATEDIR,$DPATH/jobprofile/$DATEDIR --policy fcfs --backfill firstfit -t 12h 3. `--scale`. Changing the scale of each job in the telemetry data. The `--scale` flag will specify the maximum number of nodes for each job (generally set this to the max number of nodes of the smallest partition), and randomly select the number of nodes for each job from one to max nodes. This flag is useful when replaying telemetry from a larger system onto a smaller system. @@ -151,11 +151,11 @@ python main.py -f $DPATH/slurm/joblive/$DATEDIR $DPATH/jobprofile/$DATEDIR --pol ## Job-level power output example for replay of single job - python main.py -f $DPATH/slurm/joblive/$DATEDIR $DPATH/jobprofile/$DATEDIR --jid 1234567 -o + python main.py -f $DPATH/slurm/joblive/$DATEDIR,$DPATH/jobprofile/$DATEDIR --jid 1234567 -o ## Compute stats on telemetry data, e.g., average job arrival time - python -m raps.telemetry -f $DPATH/slurm/joblive/$DATEDIR $DPATH/jobprofile/$DATEDIR + python -m raps.telemetry -f $DPATH/slurm/joblive/$DATEDIR,$DPATH/jobprofile/$DATEDIR ## Build and run Docker container diff --git a/multi-part-sim-mpi.py b/multi-part-sim-mpi.py index f57b0e6..4d65b96 100644 --- a/multi-part-sim-mpi.py +++ b/multi-part-sim-mpi.py @@ -8,7 +8,7 @@ stats for heterogeneous systems (e.g., LUMI, Setonix, Adastra). from tqdm import tqdm from mpi4py import MPI -from raps.utils import convert_to_seconds, next_arrival +from raps.utils import next_arrival from raps.workload import Workload from raps.telemetry import Telemetry from raps.power import PowerManager, compute_node_power @@ -112,12 +112,12 @@ def main(): # 9) Compute timestep_start / timestep_end (all ranks agree): if args.fastforward: - fastforward = convert_to_seconds(args.fastforward) + fastforward = args.fastforward else: fastforward = 0 if args.time: - timesteps = convert_to_seconds(args.time) + timesteps = args.time else: timesteps = 88200 # default 24 hours diff --git a/multi-part-sim.py b/multi-part-sim.py index 0905671..148a702 100644 --- a/multi-part-sim.py +++ b/multi-part-sim.py @@ -9,7 +9,7 @@ statistics for systems such as MIT Supercloud, Setonix, Adastra, and LUMI. from tqdm import tqdm from raps.stats import get_engine_stats, get_job_stats, get_scheduler_stats, get_network_stats -from raps.utils import convert_to_seconds, next_arrival +from raps.utils import next_arrival from raps.workload import Workload from raps.telemetry import Telemetry from raps.power import PowerManager, compute_node_power @@ -118,11 +118,11 @@ for i, (config, ad) in enumerate(zip(configs, args_dicts)): # Set simulation timesteps if args.fastforward: - fastfoward = convert_to_seconds(args.fastforward) + fastfoward = args.fastforward else: fastforward = 0 if args.time: - timesteps = convert_to_seconds(args.time) + timesteps = args.time else: timesteps = 88200 # Default to 24 hours @@ -130,7 +130,7 @@ timestep_start = fastforward timestep_end = timestep_start + timesteps if args.time_delta: - time_delta = convert_to_seconds(args.time_delta) + time_delta = args.time_delta else: time_delta = config['TRACE_QUANTA'] diff --git a/raps/args.py b/raps/args.py index 9c4142b..5a4b06a 100644 --- a/raps/args.py +++ b/raps/args.py @@ -2,309 +2,283 @@ import argparse import os import sys import yaml +from datetime import timedelta +from typing import Literal, Annotated as A from raps.schedulers.default import PolicyType, BackfillType -from raps.workload import add_workload_to_parser, check_workload_args -from raps.utils import convert_to_seconds - - -def load_config(path): - if path and os.path.exists(path): - with open(path, "r") as f: - return yaml.safe_load(f) or {} - return {} - - -def _expand_path(p): - if isinstance(p, str): - # expand ~ and $VARS - return os.path.expanduser(os.path.expandvars(p)) - return p - - -def apply_config_to_args(cfg, args): - # Merge supported sections or top-level keys - merged = {} - for k, v in (cfg or {}).items(): - if isinstance(v, dict) and k in { - "shared", "simulate", "telemetry", "scheduler", "output" - }: - merged.update(v) - else: - # Enter the commandline argument, but _underscores as the -dashes - # are replaced when reading from the commandline, but not in the yaml. - merged[k.replace('-', '_')] = v - - # Apply to argparse namespace - for k, v in merged.items(): - setattr(args, k, v) - - # Coerce certain keys to lists if YAML provided strings - list_keys = { - "cluster_var", "output_vars", "input_vars", "partitions", "plot" - } - for key in list_keys: - if hasattr(args, key): - val = getattr(args, key) - if isinstance(val, str): - setattr(args, key, [val]) - - # Expand paths (tilde + env vars) - for key in ("path", "output_dir", "plot_dir", "config_file"): - if hasattr(args, key): - setattr(args, key, _expand_path(getattr(args, key))) - - # Normalize enums if provided as strings in YAML - if getattr(args, "policy", None): - try: - # Accept exact values or case-insensitive - val = str(args.policy) - opts = {p.value.lower(): p.value for p in PolicyType} - if val.lower() in opts: - args.policy = opts[val.lower()] - except Exception: - pass - - if getattr(args, "backfill", None): - try: - val = str(args.backfill) - opts = {b.value.lower(): b.value for b in BackfillType} - if val.lower() in opts: - args.backfill = opts[val.lower()] - except Exception: - pass - - -parser = argparse.ArgumentParser( - description="Resource Allocator & Power Simulator (RAPS)", - formatter_class=argparse.ArgumentDefaultsHelpFormatter, -) -parser.add_argument( - "config_file", nargs="?", default=None, - help="YAML config file; overrides defaults/flags." -) - -# System configurations -parser.add_argument("--system", type=str, default="frontier", - help="System config to use") -parser.add_argument( - "-x", "--partitions", nargs="+", default=None, - help="List of machine configurations, e.g., -x setonix-cpu setonix-gpu" -) -parser.add_argument("-c", "--cooling", action="store_true", - help="Include FMU cooling model") -parser.add_argument("-net", "--simulate-network", default=False, - action="store_true", help="Include Network model") -parser.add_argument("--noui", default=False, action="store_true", - help="Run without UI") - -# Simulation runtime options -parser.add_argument("-ff", "--fastforward", type=str, default=None, - help="Fast-forward by time amount (uses same units as -t)") -parser.add_argument("-t", "--time", type=str, default=None, - help="Length of time to simulate, e.g., 123, 27m, 3h, 7d") -parser.add_argument("--time-delta", type=str, default="1s", - help="Step size, e.g., 15s, 1m, 1h, 1ms (default: 1s)") -parser.add_argument("-d", "--debug", action="store_true", - help="Enable debug mode and disable rich layout") -parser.add_argument("-n", "--numjobs", type=int, default=100, - help="Number of jobs to schedule") -parser.add_argument("-v", "--verbose", action="store_true", - help="Enable verbose output") -parser.add_argument("--start", type=str, default="2021-05-21T13:00", - help="ISO8601 start of simulation") -parser.add_argument("--end", type=str, default="2021-05-21T14:00", - help="ISO8601 end of simulation") -parser.add_argument("--seed", action="store_true", - help="Set RNG seed for deterministic simulation") -parser.add_argument( - "-u", "--uncertainties", action="store_true", - help=("Use float-with-uncertainties (much slower).") +from raps.utils import ( + parse_time_unit, convert_to_time_unit, infer_time_unit, ExpandedPath, + parse_args_pydantic, ) -# UI -ui_layout_choices = ["layout1", "layout2"] -parser.add_argument("--layout", type=str, choices=ui_layout_choices, - default=ui_layout_choices[0], help="UI layout") - -# Output -parser.add_argument('-o', '--output', type=str, nargs="?", - const="", # Used if -o is given without a value - default=None, # Used if -o is not provided at all - help=("Output power, cooling, and loss models for later " - "analysis. Argumment specifies name.") - ) -plot_choices = ["power", "loss", "pue", "temp", "util"] -parser.add_argument("-p", "--plot", nargs="+", choices=plot_choices, - help="Plots to generate") -img_choices = ["png", "svg", "jpg", "pdf", "eps"] -parser.add_argument("--imtype", type=str, choices=img_choices, - default=img_choices[0], help="Plot image type") - -# Telemetry -parser.add_argument( - "-f", "--replay", nargs="+", type=str, - help=("Either: path/to/joblive path/to/jobprofile OR filename.npz " - "(overrides --workload)") -) -parser.add_argument("-e", "--encrypt", action="store_true", - help="Encrypt sensitive data in telemetry") -parser.add_argument("--validate", action="store_true", - help="Use node power instead of CPU/GPU utilizations") -parser.add_argument("--jid", type=str, default="*", - help="Replay job id") -parser.add_argument("--scale", type=int, default=0, - help=("Scale telemetry to a smaller target system, " - "e.g., --scale 192")) -parser.add_argument("--live", action="store_true", - help="Grab data from live system.") - - -# Synthetic workloads -parser = add_workload_to_parser(parser) - -# Scheduling -sched_choices = ["default", "scheduleflow", "nrel", "anl", "flux", - "experimental", "multitenant"] -parser.add_argument("--scheduler", type=str, choices=sched_choices, - default=sched_choices[0], help="Scheduler name") -parser.add_argument("--policy", type=str, default=None, - help=f"Schedule policy: {[p.value for p in PolicyType]}") -parser.add_argument("--backfill", type=str, default=None, - help=f"Backfill policy: {[b.value for b in BackfillType]}") - -# Arrival -arr_choices = ["prescribed", "poisson"] -parser.add_argument("--arrival", default=arr_choices[0], type=str, - choices=arr_choices, - help=("Modify arrival distribution (poisson) or use " - "original submit times (prescribed)")) -parser.add_argument("--job-arrival-time", type=int, - help=("Poisson arrival (seconds). Overrides " - "config/*/scheduler.json")) -parser.add_argument("--job-arrival-rate", type=float, - help="Modify Poisson rate (default 1)") - -# Accounts -parser.add_argument("--accounts", action="store_true", - help="Track accounts") -parser.add_argument("--accounts-json", type=str, - help="Accounts JSON from previous run") - -# Downtime -parser.add_argument("--downtime-first", type=str, default=None, - help="First downtime, e.g., after 123, 27m, 3h, 7d") -parser.add_argument("--downtime-interval", type=str, default=None, - help="Interval between downtimes, e.g., every 123, 27m, 3h, 7d") -parser.add_argument("--downtime-length", type=str, default=None, - help="Downtime length, e.g., 123, 27m, 3h, 7d") - -# Continous Job Generation -parser.add_argument("--continuous-job-generation", action="store_true", - help="Activate continuous job generation.") -parser.add_argument("--maxqueue", type=int, default=50, - help="Specify the max queue length for continuous job generation.") - - -def post_process_args(args): - if args.time_delta: - tdelta_raw, tdelta_down = convert_to_seconds(args.time_delta) - else: - tdelta_raw, tdelta_down = None, 1 - - if args.time: - time_raw, time_down = convert_to_seconds(args.time) - else: - time_raw, time_down = None, 1 - - if args.fastforward: - ff_raw, ff_down = convert_to_seconds(args.fastforward) - else: - ff_raw, ff_down = None, 1 - - if args.downtime_first: - dtf_raw, dtf_down = convert_to_seconds(args.downtime_first) - if args.downtime_interval: - dti_raw, dti_down = convert_to_seconds(args.downtime_interval) - if args.downtime_length: - dtl_raw, dtl_down = convert_to_seconds(args.downtime_length) - - max_down = max(tdelta_down, time_down, ff_down) - args.downscale = max_down - - if args.time_delta: - args.time_delta = int((tdelta_raw / tdelta_down) * max_down) - if args.time: - args.time = int((time_raw / time_down) * max_down) - if args.fastforward: - args.fastforward = int((ff_raw / ff_down) * max_down) - - if args.downtime_first: - args.downtime_first = int((dtf_raw / dtf_down) * max_down) - if args.downtime_interval: - args.downtime_interval = int((dti_raw / dti_down) * max_down) - if args.downtime_length: - args.downtime_length = int((dtl_raw / dtl_down) * max_down) - - return args - - -# ---- Parse + YAML merge ---- -args = parser.parse_args() - -# Config file existence check -if args.config_file and not os.path.isfile(args.config_file): - print(f"Error: '{args.config_file}' not found.", file=sys.stderr) - sys.exit(1) - -cfg = load_config(args.config_file) - -apply_config_to_args(cfg, args) - -# Optional: format fileprefix after config merge (if provided by workload parser) -if hasattr(args, "fileprefix") and isinstance(args.fileprefix, str): - try: - args.fileprefix = args.fileprefix.format(**vars(args)) - except KeyError as e: - print(f"Warning: missing placeholder {e} in fileprefix; skipping.") - -# Expand paths inside list fields (e.g., replay) -if getattr(args, "replay", None): - if isinstance(args.replay, str): - args.replay = [args.replay] - args.replay = [_expand_path(p) for p in args.replay] - -# Prefer replay if both replay and workload got set -if getattr(args, "replay", None) and getattr(args, "workload", None): - print("Info: --replay provided; ignoring --workload.", file=sys.stderr) - print("Info: --replay provided; ignoring --workload.", file=sys.stderr) - args.workload = None - -# Enforce valid policy/backfill values (after normalization in apply_config_to_args) -if getattr(args, "policy", None): - _valid_policies = {p.value for p in PolicyType} - if args.policy not in _valid_policies: - sys.exit(f"Error: Unknown policy '{args.policy}'. " - f"Valid: {sorted(_valid_policies)}") -if getattr(args, "backfill", None): - _valid_backfills = {b.value for b in BackfillType} - if args.backfill not in _valid_backfills: - sys.exit(f"Error: Unknown backfill '{args.backfill}'. " - f"Valid: {sorted(_valid_backfills)}") - -# Multi-partition guard for single-part driver (check merged args incl. CLI) -if os.path.basename(sys.argv[0]) == "main.py": - _parts = args.partitions or [] - if isinstance(_parts, str): - _parts = [_parts] - if len(_parts) > 1: - sys.exit("Error: Use multi-part-sim.py for multi-partition runs.") - -# Validate workload args before time conversions -check_workload_args(args) - -# Convert time-like args and compute downscale -args = post_process_args(args) - -# Expose dict form +from pydantic import model_validator, computed_field +from pydantic_settings import BaseSettings, SettingsConfigDict + +Distribution = Literal['uniform', 'weibull', 'normal'] + +class SimConfig(BaseSettings): + system: str|None = None + """ System config to use """ + partitions: list[str] = [] + """ List of multiple system configurations for a multi-partition run. Can contain wildcards """ + + cooling: bool = False + """ Include the FMU cooling model """ + simulate_network: bool = False + """ Include network model """ + + # Simulation runtime options + fastforward: int|None = None + """ + Fast-forward by time amount (unit specified by `time_unit`, default seconds). + Can pass a string like 15s, 1m, 1h + """ + time: int|None = None + """ + Length of time to simulate (unit specified by `time_unit`, default seconds). + Can pass a string like 123, 27m, 3h, 7d + """ + time_delta: int = 1 + """ + Step size (unit specified by `time_unit`, default seconds). + Can pass a string like 15s, 1m, 1h, 1ms + """ + time_unit: timedelta + """ + Units all time delta ints are measured in (default seconds) + """ + + @computed_field + @property + def downscale(self) -> int: + return int(timedelta(seconds = 1) / self.time_unit) + + start: str = "2021-05-21T13:00" + """ ISO8601 start of simulation """ + end: str = "2021-05-21T14:00" + """ ISO8601 end of simulation """ + + numjobs: int = 100 + """ Number of jobs to schedule """ + + uncertainties: bool = False + """ Use float-with-uncertainties (much slower) """ + + seed: bool = False + """ Set RNG seed for deterministic simulation """ + output: ExpandedPath|None = None # TODO this no longer supports empty + """ Output power, cooling, and loss models for later analysis. Argument specifies name. """ + + debug: bool = False + """ Enable debug mode and disable rich layout """ + noui: bool = False + """ Run without UI """ + verbose: bool = False + """ Enable verbose output """ + layout: Literal["layout1", "layout2"] = "layout1" + """ UI layout """ + plot: list[Literal["power", "loss", "pue", "temp", "util"]]|None = None + """ Plots to generate """ + + imtype: Literal["png", "svg", "jpg", "pdf", "eps"] = "png" + """ Plot image type """ + + replay: list[ExpandedPath]|None = None + """ Either: path/to/joblive path/to/jobprofile OR filename.npz """ + + encrypt: bool = False + """ Encrypt sensitive data in telemetry """ + + power_scope: Literal['node', 'chip'] = "chip" + """ node mode will use node power instead of CPU/GPU utilizations """ + + jid: str = "*" + """ Replay job id """ + + scale: int = 0 + """ Scale telemetry to a smaller target system, --scale 192 """ + + live: bool = False + """ Grab data from live system. """ + + # Workload arguments (TODO split into separate model) + workload: Literal['random', 'benchmark', 'peak', 'idle', 'synthetic', 'multitenant']|None = None + + """ Type of synthetic workload """ + multimodal: list[float] = [1.0] + """ + Percentage to draw from each distribution (list of floats). e.g. '0.2 0.8' percentages apply + in order to the list of the --distribution argument list. + """ + # Jobsize + jobsize_distribution: list[Distribution]|None = None + """ Distribution type """ + jobsize_normal_mean: float|None = None + """ Mean (mu) for Normal distribution """ + jobsize_normal_stddev: float|None = None + """ Standard deviation (sigma) for Normal distribution """ + jobsize_weibull_shape: float|None = None + """ Jobsize shape of weibull """ + jobsize_weibull_scale: float|None = None + """ Jobsize scale of weibull """ + jobsize_is_of_degree: int|None = None + """ Draw jobsizes from distribution of degree N (squared,cubed). """ + jobsize_is_power_of: int|None = None + """ Draw jobsizes from distribution of power of N (2->2^x,3->3^x). """ + + # Walltime + walltime_distribution: list[Distribution]|None = None + """ Distribution type """ + walltime_normal_mean: float|None = None + """ Walltime mean (mu) for Normal distribution """ + walltime_normal_stddev: float|None = None + """ Walltime standard deviation (sigma) for Normal distribution """ + walltime_weibull_shape: float|None = None + """ Walltime shape of weibull """ + walltime_weibull_scale: float|None = None + """ Walltime scale of weibull """ + # Utilizations (TODO should probably make a reusable "Distribution" submodel) + cpuutil_distribution: list[Distribution] = ['uniform'] + """ Distribution type """ + cpuutil_normal_mean: float|None = None + """ Walltime mean (mu) for Normal distribution """ + cpuutil_normal_stddev: float|None = None + """ Walltime standard deviation (sigma) for Normal distribution """ + cpuutil_weibull_shape: float|None = None + """ Walltime shape of weibull """ + cpuutil_weibull_scale: float|None = None + """ Walltime scale of weibull """ + gpuutil_distribution: list[Distribution] = ['uniform'] + """ Distribution type """ + gpuutil_normal_mean: float|None = None + """ Walltime mean (mu) for Normal distribution """ + gpuutil_normal_stddev: float|None = None + """ Walltime standard deviation (sigma) for Normal distribution """ + gpuutil_weibull_shape: float|None = None + """ Walltime shape of weibull """ + gpuutil_weibull_scale: float|None = None + """ Walltime scale of weibull """ + gantt_nodes: bool = False + """ Print Gannt with nodes required as line thickness (default false) """ + + # Synthetic workloads + scheduler: Literal[ + "default", "scheduleflow", "nrel", "anl", "flux", "experimental", "multitenant", + ] = "default" + """ Scheduler name """ + policy: PolicyType|None = None + """ Schedule policy """ + backfill: BackfillType|None = None + """ Backfill policy """ + + # Arrival + arrival: Literal["prescribed", "poisson"] = "prescribed" + """ Modify arrival distribution (poisson) or use original submit times (prescribed) """ + job_arrival_time: int|None = None + """ Poisson arrival (seconds). Overrides system config scheduler.job_arrival_time """ + job_arrival_rate: float|None = None # TODO define default here + """ Modify Poisson rate (default 1) """ + + # Accounts + accounts: bool = False + accounts_json: ExpandedPath|None = None + """ Path to accounts JSON file from previous run """ + + # Downtime + downtime_first: int|None = None + """ + First downtime (unit specified by `time_unit`, default seconds). + Can pass a string like 27m, 3h, 7d + """ + downtime_interval: str|None = None + """ + Interval between downtimes (unit specified by `time_unit`, default seconds). + Can pass a string like 123, 27m, 3h, 7d + """ + downtime_length: str|None = None + """ + Downtime length (unit specified by `time_unit`, default seconds). + Can pass a string like 123, 27m, 3h, 7d + """ + + # Continous Job Generation + continuous_job_generation: bool = False + """ Activate continuous job generation """ + maxqueue: int = 50 + """ Specify the max queue length for continuous job generation """ + + @model_validator(mode = "before") + def _parse_times(cls, data): + time_fields = [ + "time_delta", "time", "fastforward", + "downtime_first", "downtime_interval", "downtime_length", + ] + + if data.get('time_unit') is not None: + time_unit = parse_time_unit(data['time_unit']) + else: + time_unit = min( + [infer_time_unit(data[f]) for f in time_fields if data.get(f)], + default = timedelta(seconds = 1) + ) + + data['time_unit'] = time_unit + for field in time_fields: + if data.get(field) is not None: + data[field] = convert_to_time_unit(data[field], time_unit) + + return data + + @model_validator(mode = "after") + def _validate(self): + if self.system and self.partitions: + raise ValueError("system and partitions are mutually exclusive") + elif not self.system and not self.partitions: + self.system = "frontier" + + if not self.replay and not self.workload: + self.workload = "random" + + if self.jobsize_is_power_of is not None and self.jobsize_is_of_degree is not None: + raise ValueError("jobsize_is_power_of and jobsize_is_of_degree are mutually exclusive") + + return self + + model_config = SettingsConfigDict( + cli_implicit_flags=True, + cli_kebab_case=True, + cli_shortcuts={ + "partitions": "x", + "cooling": "c", + "simulate-network": "net", + "fastforward": "ff", + "time": "t", + "debug": "d", + "numjobs": "n", + "verbose": "v", + "output": "o", + "uncertainties": "u", + "plot": "p", + "replay": "f", + "workload": "w", + }, + ) + +def parse_args(cli_args = None) -> tuple[SimConfig, argparse.Namespace]: + parser = argparse.ArgumentParser( + description="Resource Allocator & Power Simulator (RAPS)", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + # parser.add_argument( + # "config_file", nargs="?", default=None, + # help = "YAML sim config file, can be used to configure an experiment instead of using CLI flags", + # ) + sim_config, raw_args = parse_args_pydantic(parser, SimConfig) + + args_dict = sim_config.model_dump(mode = "json") + # Temporary backwards compatibility addition + args_dict['validate'] = args_dict["power_scope"] == "node" + return sim_config, argparse.Namespace(**args_dict) + +sim_config, args = parse_args() args_dict = vars(args) diff --git a/raps/dataloaders/adastraMI250.py b/raps/dataloaders/adastraMI250.py index f0f576b..90201c8 100644 --- a/raps/dataloaders/adastraMI250.py +++ b/raps/dataloaders/adastraMI250.py @@ -12,7 +12,7 @@ python main.py -f /path/to/AdastaJobsMI250_15days.parquet --system adastraMI250 --policy priority --backfill easy # to fast-forward 60 days and replay for 1 day - python main.py -f /path/to/AdastaJobsMI250_15days.parquet --system adastraMI250 -ff 60d -t 1d + python main.py -f /path/to/AdastaJobsMI250_15days.parquet --system adastraMI250 --ff 60d -t 1d # to analyze dataset python -m raps.telemetry -f /path/to/AdastaJobsMI250_15days.parquet --system adastraMI250 -v diff --git a/raps/dataloaders/frontier.py b/raps/dataloaders/frontier.py index 506dab9..3154992 100644 --- a/raps/dataloaders/frontier.py +++ b/raps/dataloaders/frontier.py @@ -4,10 +4,10 @@ # To simulate DATEDIR="date=2024-01-18" DPATH=/path/to/data - python main.py -f $DPATH/slurm/joblive/$DATEDIR $DPATH/jobprofile/$DATEDIR + python main.py -f $DPATH/slurm/joblive/$DATEDIR,$DPATH/jobprofile/$DATEDIR # To analyze the data - python -m raps.telemetry -f $DPATH/slurm/joblive/$DATEDIR $DPATH/jobprofile/$DATEDIR + python -m raps.telemetry -f $DPATH/slurm/joblive/$DATEDIR,$DPATH/jobprofile/$DATEDIR """ import ast import time diff --git a/raps/dataloaders/lassen.py b/raps/dataloaders/lassen.py index f777f79..c9aae0d 100644 --- a/raps/dataloaders/lassen.py +++ b/raps/dataloaders/lassen.py @@ -23,7 +23,7 @@ Usage Instructions: python main.py -f /path/to/LAST/Lassen-Supercomputer-Job-Dataset --system lassen --arrival poisson # to fast-forward 365 days and replay for 1 day. This region day has 2250 jobs with 1650 jobs executed. - python main.py -f /path/to/LAST/Lassen-Supercomputer-Job-Dataset --system lassen -ff 365d -t 1d + python main.py -f /path/to/LAST/Lassen-Supercomputer-Job-Dataset --system lassen --ff 365d -t 1d # For the network replay this command gives suiteable snapshots: python main.py -f /path/to/LAST/Lassen-Supercomputer-Job-Dataset --system lassen --policy fcfs --backfill firstfit -t 12h --arrival poisson # noqa @@ -38,7 +38,7 @@ from tqdm import tqdm from datetime import timedelta from ..job import job_dict, Job -from ..utils import power_to_utilization, next_arrival_byconfkwargs, convert_to_seconds +from ..utils import power_to_utilization, next_arrival_byconfkwargs, parse_td def load_data(path, **kwargs): @@ -80,7 +80,7 @@ def load_data_from_df(allocation_df, node_df, step_df, **kwargs): time_to_simulate = 31536000 # a year time_to_simulate_timedelta = timedelta(seconds=time_to_simulate) # timedelta else: - time_to_simulate_timedelta = timedelta(seconds=convert_to_seconds(time_to_simulate)) # timedelta + time_to_simulate_timedelta = parse_td(time_to_simulate) # timedelta telemetry_start_timestamp = allocation_df['begin_timestamp'].min() telemetry_start_time = 0 @@ -190,7 +190,7 @@ def load_data_from_df(allocation_df, node_df, step_df, **kwargs): ib_tx_per_node = total_ib_tx / n # average bytes per node ib_rx_per_node = total_ib_rx / n # average bytes per node - # net_tx, net_rx = [],[] # generate_network_sequences generates errors (e.g. -ff 800d -t 1d ) + # net_tx, net_rx = [],[] # generate_network_sequences generates errors (e.g. --ff 800d -t 1d ) # net_tx, net_rx = generate_network_sequences(ib_tx, ib_rx, samples, lambda_poisson=0.3) net_tx, net_rx = throughput_traces(ib_tx_per_node, ib_rx_per_node, samples) diff --git a/raps/dataloaders/marconi100.py b/raps/dataloaders/marconi100.py index 6222c7c..fef8ec0 100644 --- a/raps/dataloaders/marconi100.py +++ b/raps/dataloaders/marconi100.py @@ -16,7 +16,7 @@ python main.py -f /path/to/job_table.parquet --system marconi100 --policy priority --backfill firstfit # to fast-forward 60 days and replay for 1 day - python main.py -f /path/to/job_table.parquet --system marconi100 -ff 60d -t 1d + python main.py -f /path/to/job_table.parquet --system marconi100 --ff 60d -t 1d # to analyze dataset python -m raps.telemetry -f /path/to/job_table.parquet --system marconi100 -v diff --git a/raps/telemetry.py b/raps/telemetry.py index e78a40a..77e5112 100644 --- a/raps/telemetry.py +++ b/raps/telemetry.py @@ -57,7 +57,7 @@ from raps.plotting import ( plot_nodes_gantt, plot_network_histogram ) -from raps.utils import next_arrival_byconfargs, create_casename, convert_to_seconds +from raps.utils import next_arrival_byconfargs, create_casename, convert_to_time_unit # from raps.args import args, args_dict @@ -259,7 +259,7 @@ class Telemetry: print(f"File was generated with:" f"\n--system {args_from_file.system} ") if hasattr(args_from_file, 'fastforward'): - print(f"-ff {args_from_file.fastforward} ") + print(f"--ff {args_from_file.fastforward} ") if hasattr(args_from_file, 'time'): print(f"-t {args_from_file.time}") print(f"All Args:\n{args_from_file}" @@ -308,7 +308,7 @@ class Telemetry: timestep_end=timestep_end, args=args, filename=self.dirname) if args.time: - timestep_end = timestep_start + convert_to_seconds(args.time) + timestep_end = timestep_start + convert_to_time_unit(args.time) elif not timestep_end: timestep_end = int(max(job.wall_time + job.start_time for job in jobs)) + 1 diff --git a/raps/utils.py b/raps/utils.py index bbac74d..a9752aa 100644 --- a/raps/utils.py +++ b/raps/utils.py @@ -8,16 +8,21 @@ generating random numbers, summarizing and expanding ranges, determining job sta from datetime import timedelta from enum import Enum - import os import hashlib import math +import re import numpy as np import pandas as pd import random import sys import uuid import json +import argparse + +from typing import Annotated as A, TypeVar +from pydantic import TypeAdapter, AfterValidator +from pydantic_settings import BaseSettings, CliApp, CliSettingsSource from raps.job import Job @@ -454,54 +459,63 @@ def next_arrival(lambda_rate, reset=False, start_time=0): -math.log(1.0 - random.random()) / lambda_rate return next_arrival.next_time - -def convert_to_seconds(time_str): - if isinstance(time_str, (int, float)): - return time_str # this happens.... - # Define the conversion factors - time_factors = { - 'd': 86400, # 1 day = 86400 seconds - 'h': 3600, # 1 hour = 3600 seconds - 'm': 60, # 1 minute = 60 seconds - 's': 1, # 1 second = 1 second - '': 1 # empty string = 1 second - } - downscale_factors = { - 'ms': 1000, - 'cs': 100, - 'ds': 10 - } - - # Check if the input string ends with a unit or is purely numeric - # and extract the numeric part and the time unit - if time_str[-1].isdigit(): - unit = '' - num_str = time_str[:] - else: - if time_str[-2].isdigit(): - unit = time_str[-1] - num_str = time_str[:-1] - else: - unit = time_str[-2:] - num_str = time_str[:-2] - - index = num_str.find(".") # convert int or float string - if index != -1: - num = float(num_str) - raise ValueError(f"Float not supported at this time: {num}{unit}") - - else: - num = int(num_str) - - # Convert to seconds using the conversion factors - if unit in time_factors: - return num * time_factors[unit], 1 - elif unit in downscale_factors: - downscale = downscale_factors[unit] - return num, downscale - else: - raise ValueError(f"Unknown time unit: {unit}") - +TIME_UNITS = { + 'd': timedelta(days=1), + 'h': timedelta(hours=1), + 'm': timedelta(minutes=1), + 's': timedelta(seconds=1), + 'ds': timedelta(milliseconds=100), + 'cs': timedelta(milliseconds=10), + 'ms': timedelta(milliseconds=1), +} + +def parse_time_unit(unit) -> timedelta: + parsed_unit = unit + if isinstance(unit, str): + parsed_unit = TIME_UNITS.get(unit) + if not isinstance(parsed_unit, timedelta): + raise ValueError(f"Invalid time unit {unit}") + if parsed_unit not in TIME_UNITS.values() or parsed_unit > TIME_UNITS['s']: + raise ValueError("Only time units of s, ds, cs, and ms are supported") + return parsed_unit + +def parse_td(td) -> timedelta: + """ Parse into a timedelta. """ + if TypeAdapter(int).validator.isinstance_python(td): + return timedelta(seconds = TypeAdapter(int).validate_python(td)) + if TypeAdapter(timedelta).validator.isinstance_python(td): + return TypeAdapter(timedelta).validate_python(td) + if isinstance(td, str): + re_match = re.fullmatch(r"(\d+)\s*(\w+)", td.strip()) + if re_match and re_match[2] in TIME_UNITS: + num_str, unit_str = re_match.groups() + return int(num_str) * TIME_UNITS[unit_str] + raise ValueError(f"Invalid timedelta: {td}") + +def convert_to_time_unit(td, unit: str|timedelta = 's'): + """ + Converts to integer number of time unit + Throws if the given time is less than the unit + """ + num = parse_td(td) / parse_time_unit(unit) + if (num != 0 and num < 1) or not num.is_integer(): + raise ValueError(f"{td} is not divisible by time unit {unit}") + return int(num) + +def infer_time_unit(td) -> timedelta: + """ Infers the time unit the user meant for the input string """ + parsed_td = parse_td(td) + time_unit = None + if isinstance(td, str): # infer unit from string, e.g. 1s or 200ms + re_match = re.fullmatch(r"(\d+)\s*(\w+)", td.strip()) + if re_match and re_match[2] in TIME_UNITS: + time_unit = TIME_UNITS[re_match[2]] + if not time_unit: + for unit in sorted(TIME_UNITS.values(), reverse=True): + if (parsed_td % unit).total_seconds() == 0: + time_unit = unit + break + return min(TIME_UNITS['s'], time_unit or TIME_UNITS['s']) def encrypt(name): """Encrypts a given name using SHA-256 and returns the hexadecimal digest.""" @@ -604,3 +618,25 @@ class ValueComparableEnum(Enum): def __hash__(self): # required if you override __eq__ return hash(self.value) + + +ExpandedPath = A[str, AfterValidator(lambda v: os.path.expanduser(os.path.expandvars(v)))] +""" Type that that expands ~ and environment variables in a path string """ + +T = TypeVar("T", bound = BaseSettings) +def parse_args_pydantic( + parser: argparse.ArgumentParser, model_cls: type[T], args: list[str]|None = None, +) -> tuple[T, argparse.Namespace]: + """ + Add arguments to the parser from the settings model, and then returns the + raw argparse args object, and the parsed model. + + You can configure Pydantic to implicitly parse from sys.argv but we want + a bit more control over the parser. + """ + if args is None: + args = sys.argv[1:] + cli_settings_source = CliSettingsSource(model_cls, root_parser=parser) + parsed_args = parser.parse_args(args) + model = CliApp.run(model_cls, cli_args=parsed_args, cli_settings_source=cli_settings_source) + return model, parsed_args diff --git a/raps/workload.py b/raps/workload.py index 1e6b562..6ca0c8c 100644 --- a/raps/workload.py +++ b/raps/workload.py @@ -799,81 +799,6 @@ def plot_job_hist(jobs, config=None, dist_split=None, gantt_nodes=False): plt.show() - -def add_workload_to_parser(parser): - - choices = ['random', 'benchmark', 'peak', 'idle', 'synthetic', 'multitenant'] - parser.add_argument('-w', '--workload', type=str, choices=choices, - default=choices[0], help='Type of synthetic workload') - - parser.add_argument("--multimodal", default=[1.0], type=float, nargs="+", - help="Percentage to draw from each distribution " - "(list of floats)e.g. '0.2 0.8' percentages apply" - " in order to the list of the --distribution argument list.") - - # Jobsize: - parser.add_argument("--jobsize-distribution", type=str, nargs="+", - choices=['uniform', 'weibull', 'normal'], default=None, help='Distribution type') - - parser.add_argument("--jobsize-normal-mean", type=float, required=False, help="Mean (mu) for Normal distribution") - parser.add_argument("--jobsize-normal-stddev", type=float, required=False, - help="Standard deviation (sigma) for Normal distribution") - - parser.add_argument("--jobsize-weibull-shape", type=float, required=False, help="Jobsize shape of weibull") - parser.add_argument("--jobsize-weibull-scale", type=float, required=False, help="Jobsize scale of weibull") - - parser.add_argument("--jobsize-is-of-degree", default=None, type=int, required=False, - help="Draw jobsizes from distribution of degree N (squared,cubed).") - parser.add_argument("--jobsize-is-power-of", default=None, type=int, required=False, - help="Draw jobsizes from distribution of power of N (2->2^x,3->3^x).") - - # Walltime: - parser.add_argument("--walltime-distribution", type=str, nargs="+", - choices=['uniform', 'weibull', 'normal'], default=None, help='Distribution type') - - parser.add_argument("--walltime-normal-mean", type=float, required=False, - help="Walltime mean (mu) for Normal distribution") - parser.add_argument("--walltime-normal-stddev", type=float, required=False, - help="Walltime standard deviation (sigma) for Normal distribution") - - parser.add_argument("--walltime-weibull-shape", type=float, required=False, help="Walltime shape of weibull") - parser.add_argument("--walltime-weibull-scale", type=float, required=False, help="Walltime scale of weibull") - - # Utilizations - parser.add_argument("--cpuutil-distribution", type=str, nargs="+", - choices=['uniform', 'weibull', 'normal'], default=['uniform'], help='Distribution type') - - parser.add_argument("--cpuutil-normal-mean", type=float, required=False, - help="Walltime mean (mu) for Normal distribution") - parser.add_argument("--cpuutil-normal-stddev", type=float, required=False, - help="Walltime standard deviation (sigma) for Normal distribution") - - parser.add_argument("--cpuutil-weibull-shape", type=float, required=False, help="Walltime shape of weibull") - parser.add_argument("--cpuutil-weibull-scale", type=float, required=False, help="Walltime scale of weibull") - - parser.add_argument("--gpuutil-distribution", type=str, nargs="+", - choices=['uniform', 'weibull', 'normal'], default=['uniform'], help='Distribution type') - - parser.add_argument("--gpuutil-normal-mean", type=float, required=False, - help="Walltime mean (mu) for Normal distribution") - parser.add_argument("--gpuutil-normal-stddev", type=float, required=False, - help="Walltime standard deviation (sigma) for Normal distribution") - - parser.add_argument("--gpuutil-weibull-shape", type=float, required=False, help="Walltime shape of weibull") - parser.add_argument("--gpuutil-weibull-scale", type=float, required=False, help="Walltime scale of weibull") - - parser.add_argument("--gantt-nodes", default=False, action='store_true', required=False, - help="Print Gannt with nodes required as line thickness (default false)") - - return parser - - -def check_workload_args(args): - if (args.jobsize_is_power_of is not None and args.jobsize_is_of_degree is not None): - print("Choose either --jobsize-is-power-of or --jobsize-is-of-degree! Not both.") - exit(1) - - def run_workload(): from raps.args import args, args_dict from raps.config import get_system_config diff --git a/scripts/marconi100-day51.sh b/scripts/marconi100-day51.sh index ae801a5..01da9a2 100644 --- a/scripts/marconi100-day51.sh +++ b/scripts/marconi100-day51.sh @@ -1,4 +1,4 @@ -python main.py -f ~/data/marconi100/job_table.parquet --system marconi100 -ff 4381000 -t 61000 -o --policy replay -python main.py -f ~/data/marconi100/job_table.parquet --system marconi100 -ff 4381000 -t 61000 -o --policy fcfs -python main.py -f ~/data/marconi100/job_table.parquet --system marconi100 -ff 4381000 -t 61000 -o --policy fcfs --backfill easy -python main.py -f ~/data/marconi100/job_table.parquet --system marconi100 -ff 4381000 -t 61000 -o --policy priority --backfill firstfit +python main.py -f ~/data/marconi100/job_table.parquet --system marconi100 --ff 4381000 -t 61000 -o --policy replay +python main.py -f ~/data/marconi100/job_table.parquet --system marconi100 --ff 4381000 -t 61000 -o --policy fcfs +python main.py -f ~/data/marconi100/job_table.parquet --system marconi100 --ff 4381000 -t 61000 -o --policy fcfs --backfill easy +python main.py -f ~/data/marconi100/job_table.parquet --system marconi100 --ff 4381000 -t 61000 -o --policy priority --backfill firstfit diff --git a/scripts/meta_run.sh b/scripts/meta_run.sh index 41f4831..0c60596 100755 --- a/scripts/meta_run.sh +++ b/scripts/meta_run.sh @@ -17,7 +17,7 @@ while [ $current_sec -le $end_sec ]; do DATEDIRS="date=$DATEDIR" # Construct the command with the formatted date - command="python main.py -d -o --plot power loss -f $DPATH/slurm/joblive/$DATEDIRS $DPATH/jobprofile/jobprofile/$DATEDIRS >& $DATEDIRS.out &" + command="python main.py -d -o --plot power loss -f $DPATH/slurm/joblive/$DATEDIRS,$DPATH/jobprofile/jobprofile/$DATEDIRS >& $DATEDIRS.out &" sleep 10 # Execute the command diff --git a/tests/systems/test_main_network_run.py b/tests/systems/test_main_network_run.py index f40cc8f..8b80d5d 100644 --- a/tests/systems/test_main_network_run.py +++ b/tests/systems/test_main_network_run.py @@ -24,7 +24,7 @@ def test_main_network_run(system, system_config, random_id): "python", "main.py", "--time", "1m", "--system", system, - "-net", + "--net", "-o", random_id ], capture_output=True, text=True, stdin=subprocess.DEVNULL) assert result.returncode == 0, f"Failed on {system}: {result.stderr}" diff --git a/tests/systems/test_main_network_withdata_run.py b/tests/systems/test_main_network_withdata_run.py index c478e21..82c30de 100644 --- a/tests/systems/test_main_network_withdata_run.py +++ b/tests/systems/test_main_network_withdata_run.py @@ -31,7 +31,7 @@ def test_main_run(system, system_config, system_file, random_id): "--time", "1m", "--system", system, "-f", *file_list, - "-net", + "--net", "-o", random_id ], capture_output=True, text=True, stdin=subprocess.DEVNULL) assert result.returncode == 0, f"Failed on {system}: {result.stderr}" diff --git a/tests/systems/test_main_time_delta_run.py b/tests/systems/test_main_time_delta_run.py index f86caa7..9cb87a2 100644 --- a/tests/systems/test_main_time_delta_run.py +++ b/tests/systems/test_main_time_delta_run.py @@ -3,8 +3,7 @@ import subprocess import gc import pytest from tests.util import PROJECT_ROOT -from raps.utils import convert_seconds_to_hhmmss -from raps.utils import convert_to_seconds +from raps.utils import convert_to_time_unit, convert_seconds_to_hhmmss pytestmark = [ @@ -38,8 +37,8 @@ def test_main_time_delta_run(system, system_config, time_arg, tdelta_arg, random "-o", random_id ], capture_output=True, text=True, stdin=subprocess.DEVNULL) assert result.returncode == 0, f"Failed on {system}: {result.stderr}" - time, downscale = convert_to_seconds(time_arg) - assert f"Time Simulated: {convert_seconds_to_hhmmss(time // downscale)}" in result.stdout + time = convert_to_time_unit(time_arg) + assert f"Time Simulated: {convert_seconds_to_hhmmss(time)}" in result.stdout subprocess.run( f"rm {random_id}.npz && rm -fr simulation_results/{random_id}", diff --git a/tests/systems/test_main_time_delta_sub_second_run.py b/tests/systems/test_main_time_delta_sub_second_run.py index 5542549..a73be33 100644 --- a/tests/systems/test_main_time_delta_sub_second_run.py +++ b/tests/systems/test_main_time_delta_sub_second_run.py @@ -3,8 +3,7 @@ import subprocess import gc import pytest from tests.util import PROJECT_ROOT -from raps.utils import convert_seconds_to_hhmmss -from raps.utils import convert_to_seconds +from raps.utils import convert_seconds_to_hhmmss, convert_to_time_unit pytestmark = [ @@ -39,10 +38,10 @@ def test_main_time_delta_run(system, system_config, time_arg, tdelta_arg, random "-o", random_id ], capture_output=True, text=True, stdin=subprocess.DEVNULL) assert result.returncode == 0, f"Failed on {system}: {result.stderr}" - time, downscale = convert_to_seconds(time_arg) - td, td_ds = convert_to_seconds(tdelta_arg) - #assert f"Time Simulated: {convert_seconds_to_hhmmss(int((time / td_ds) * downscale))}" in result.stdout - assert f"Time Simulated: {convert_seconds_to_hhmmss(time / downscale)}" in result.stdout + time = convert_to_time_unit(time_arg) + td = convert_to_time_unit(tdelta_arg) + #assert f"Time Simulated: {convert_seconds_to_hhmmss(int(time / td_ds))}" in result.stdout + assert f"Time Simulated: {convert_seconds_to_hhmmss(time)}" in result.stdout subprocess.run( f"rm {random_id}.npz && rm -fr simulation_results/{random_id}", diff --git a/tests/systems/test_main_time_ff_delta_run.py b/tests/systems/test_main_time_ff_delta_run.py index 3e46dda..d3ef963 100644 --- a/tests/systems/test_main_time_ff_delta_run.py +++ b/tests/systems/test_main_time_ff_delta_run.py @@ -30,7 +30,7 @@ def test_main_time_delta_run(system, system_config, time_arg, tdelta_arg, result = subprocess.run([ "python", "main.py", "-t", time_arg, - "-ff", ff_arg, + "--ff", ff_arg, "--time-delta", tdelta_arg, "--system", system, #--"-f", system_file, diff --git a/tests/systems/test_multi_part_sim_basic_run.py b/tests/systems/test_multi_part_sim_basic_run.py index 24a671e..9b9db13 100644 --- a/tests/systems/test_multi_part_sim_basic_run.py +++ b/tests/systems/test_multi_part_sim_basic_run.py @@ -20,7 +20,6 @@ def test_multi_part_sim_run(system, system_config): result = subprocess.run([ "python", "multi-part-sim.py", "--time", "1h", - "--system", system, "-x", f"{system}/*", #"--noui" ], capture_output=True, text=True, stdin=subprocess.DEVNULL) diff --git a/tests/systems/test_multi_part_sim_network_run.py b/tests/systems/test_multi_part_sim_network_run.py index 643b97a..1871725 100644 --- a/tests/systems/test_multi_part_sim_network_run.py +++ b/tests/systems/test_multi_part_sim_network_run.py @@ -23,9 +23,8 @@ def test_multi_part_sim_run(system, system_config, random_id): result = subprocess.run([ "python", "multi-part-sim.py", "--time", "1h", - "--system", system, "-x", f"{system}/*", - "-net", + "--net", #"--noui" ], capture_output=True, text=True, stdin=subprocess.DEVNULL) assert result.returncode == 0, f"Failed on {system}: {result.stderr}" diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py new file mode 100644 index 0000000..50e879a --- /dev/null +++ b/tests/unit/test_utils.py @@ -0,0 +1,39 @@ +import pytest +from datetime import timedelta +from raps.utils import parse_td, convert_to_time_unit, infer_time_unit, TIME_UNITS + +@pytest.mark.parametrize("input,expected", [ + ("1", timedelta(seconds = 1)), + ("1m", timedelta(minutes = 1)), + (timedelta(minutes = 1), timedelta(minutes = 1)), + (2, timedelta(seconds = 2)), + ("PT2S", timedelta(seconds = 2)), +]) +def test_parse_td(input, expected): + assert parse_td(input) == expected + +def test_parse_td_error(): + with pytest.raises(ValueError): + parse_td("1x") + +@pytest.mark.parametrize("input,unit,expected", [ + ("1s", 's', 1), + ("1m", 's', 60), + (0, 'ms', 0), + (timedelta(seconds=6), 'ms', 6000), +]) +def test_convert_to_time_unit(input, unit, expected): + assert convert_to_time_unit(input, unit) == expected + + +@pytest.mark.parametrize("input,expected", [ + ("1s", 's'), + ("1000ms", 'ms'), + (0, 's'), + (timedelta(seconds=6), 's'), + (timedelta(days=6), 's'), + (timedelta(milliseconds=6), 'ms'), + (timedelta(milliseconds=60), 'cs'), +]) +def test_infer_time_unit(input, expected): + assert infer_time_unit(input) == TIME_UNITS[expected] -- GitLab From 2c3d6110eea4b4460064cdfb6a575c34efcc6735 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Fri, 22 Aug 2025 14:21:57 -0400 Subject: [PATCH 05/23] Support config_file for sim --- raps/args.py | 49 +++++++++++++++++++++++++++++------------------- raps/utils.py | 52 ++++++++++++++++++++++++++++++++++----------------- 2 files changed, 65 insertions(+), 36 deletions(-) diff --git a/raps/args.py b/raps/args.py index 5a4b06a..8603d6b 100644 --- a/raps/args.py +++ b/raps/args.py @@ -3,20 +3,21 @@ import os import sys import yaml from datetime import timedelta +from pathlib import Path from typing import Literal, Annotated as A from raps.schedulers.default import PolicyType, BackfillType from raps.utils import ( parse_time_unit, convert_to_time_unit, infer_time_unit, ExpandedPath, - parse_args_pydantic, + pydantic_add_args, ) -from pydantic import model_validator, computed_field -from pydantic_settings import BaseSettings, SettingsConfigDict +from pydantic import BaseModel, model_validator, computed_field +from pydantic_settings import SettingsConfigDict Distribution = Literal['uniform', 'weibull', 'normal'] -class SimConfig(BaseSettings): +class SimConfig(BaseModel): system: str|None = None """ System config to use """ partitions: list[str] = [] @@ -244,10 +245,21 @@ class SimConfig(BaseSettings): return self - model_config = SettingsConfigDict( - cli_implicit_flags=True, - cli_kebab_case=True, - cli_shortcuts={ + +def parse_args(cli_args = None) -> tuple[SimConfig, argparse.Namespace]: + parser = argparse.ArgumentParser( + description="Resource Allocator & Power Simulator (RAPS)", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument( + "config_file", nargs="?", default=None, + help = 'YAML sim config file, can be used to configure an experiment instead of using CLI flags. Pass "-" to read from stdin.', + ) + + model_validate_args = pydantic_add_args(parser, SimConfig, model_config = SettingsConfigDict( + cli_implicit_flags = True, + cli_kebab_case = True, + cli_shortcuts = { "partitions": "x", "cooling": "c", "simulate-network": "net", @@ -262,18 +274,17 @@ class SimConfig(BaseSettings): "replay": "f", "workload": "w", }, - ) + )) -def parse_args(cli_args = None) -> tuple[SimConfig, argparse.Namespace]: - parser = argparse.ArgumentParser( - description="Resource Allocator & Power Simulator (RAPS)", - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - ) - # parser.add_argument( - # "config_file", nargs="?", default=None, - # help = "YAML sim config file, can be used to configure an experiment instead of using CLI flags", - # ) - sim_config, raw_args = parse_args_pydantic(parser, SimConfig) + args = parser.parse_args(cli_args) + if args.config_file == "-": + config_file_data = yaml.safe_load(sys.stdin.read()) + elif args.config_file: + config_file_data = yaml.safe_load(Path(args.config_file).read_text()) + else: + config_file_data = {} + + sim_config = model_validate_args(args, config_file_data) args_dict = sim_config.model_dump(mode = "json") # Temporary backwards compatibility addition diff --git a/raps/utils.py b/raps/utils.py index a9752aa..f31c80f 100644 --- a/raps/utils.py +++ b/raps/utils.py @@ -20,9 +20,9 @@ import uuid import json import argparse -from typing import Annotated as A, TypeVar -from pydantic import TypeAdapter, AfterValidator -from pydantic_settings import BaseSettings, CliApp, CliSettingsSource +from typing import Annotated as A, TypeVar, Callable +from pydantic import BaseModel, TypeAdapter, AfterValidator +from pydantic_settings import BaseSettings, SettingsConfigDict, CliApp, CliSettingsSource from raps.job import Job @@ -623,20 +623,38 @@ class ValueComparableEnum(Enum): ExpandedPath = A[str, AfterValidator(lambda v: os.path.expanduser(os.path.expandvars(v)))] """ Type that that expands ~ and environment variables in a path string """ -T = TypeVar("T", bound = BaseSettings) -def parse_args_pydantic( - parser: argparse.ArgumentParser, model_cls: type[T], args: list[str]|None = None, -) -> tuple[T, argparse.Namespace]: + +T = TypeVar("T", bound = BaseModel) +def pydantic_add_args( + parser: argparse.ArgumentParser, model_cls: type[T], + model_config: SettingsConfigDict|None = None, +) -> Callable[[argparse.Namespace, dict|None], T]: """ - Add arguments to the parser from the settings model, and then returns the - raw argparse args object, and the parsed model. + Add arguments to the parser from the model. Returns a function that can be used to parse the + model from the argparse args. - You can configure Pydantic to implicitly parse from sys.argv but we want - a bit more control over the parser. + Normally you'd just configure Pydantic to just automatically create a BaseSettings object from + sys.argv and/or env variables. But we want a bit more control over the cli parser, and to use + the SimConfig model as a regular non-settings model in the simulation server. So here we do + some hacks to apply the args manually. """ - if args is None: - args = sys.argv[1:] - cli_settings_source = CliSettingsSource(model_cls, root_parser=parser) - parsed_args = parser.parse_args(args) - model = CliApp.run(model_cls, cli_args=parsed_args, cli_settings_source=cli_settings_source) - return model, parsed_args + model_config_dict = SettingsConfigDict({ + **(model_config or {}), + "cli_parse_args": False, # Don't automatically parse args + }) + class SettingsModel(model_cls, BaseSettings): + @classmethod + def settings_customise_sources(cls, settings_cls, init_settings, *args): + return (init_settings,) # Don't load from env vars or anything else + + model_config = model_config_dict + + cli_settings_source = CliSettingsSource(SettingsModel, root_parser = parser) + + def model_validate_args(args: argparse.Namespace, data: dict|None = None): + return CliApp.run(model_cls, + cli_args = args, + cli_settings_source = cli_settings_source, + **(data or {}), + ) + return model_validate_args -- GitLab From 13928f5f35cf26c1c5a062a9c8a03512c2a7eb45 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Fri, 22 Aug 2025 14:28:33 -0400 Subject: [PATCH 06/23] Rename files --- main.py | 4 ++-- multi-part-sim-mpi.py | 4 ++-- multi-part-sim.py | 4 ++-- raps/downtime.py | 2 +- raps/{args.py => sim_config.py} | 0 raps/{config.py => system_config.py} | 0 raps/telemetry.py | 6 +++--- raps/workload.py | 4 ++-- tests/unit/test_system_config.py | 2 +- 9 files changed, 13 insertions(+), 13 deletions(-) rename raps/{args.py => sim_config.py} (100%) rename raps/{config.py => system_config.py} (100%) diff --git a/main.py b/main.py index 9655b1b..1425f6b 100644 --- a/main.py +++ b/main.py @@ -16,7 +16,7 @@ import math # from raps.helpers import check_python_version # -from raps.config import get_system_config +from raps.system_config import get_system_config from raps.constants import OUTPUT_PATH, SEED from raps.cooling import ThermoFluidsModel from raps.ui import LayoutManager @@ -45,7 +45,7 @@ from raps.stats import ( print_formatted_report ) -from raps.args import args, args_dict +from raps.sim_config import args, args_dict, sim_config check_python_version() diff --git a/multi-part-sim-mpi.py b/multi-part-sim-mpi.py index 4d65b96..820bd09 100644 --- a/multi-part-sim-mpi.py +++ b/multi-part-sim-mpi.py @@ -15,8 +15,8 @@ from raps.power import PowerManager, compute_node_power from raps.flops import FLOPSManager from raps.engine import Engine from raps.ui import LayoutManager -from raps.config import get_partition_configs -from args import args +from raps.system_config import get_partition_configs +from raps.sim_config import args, sim_config import random import os import glob diff --git a/multi-part-sim.py b/multi-part-sim.py index 148a702..bb61ba5 100644 --- a/multi-part-sim.py +++ b/multi-part-sim.py @@ -16,8 +16,8 @@ from raps.power import PowerManager, compute_node_power from raps.flops import FLOPSManager from raps.engine import Engine from raps.ui import LayoutManager -from raps.config import get_partition_configs -from raps.args import args +from raps.system_config import get_partition_configs +from raps.sim_config import args, sim_config import random import os import glob diff --git a/raps/downtime.py b/raps/downtime.py index 7c5bf1f..97c9139 100644 --- a/raps/downtime.py +++ b/raps/downtime.py @@ -1,7 +1,7 @@ from __future__ import annotations from typing import TYPE_CHECKING from raps.job import JobState -from raps.args import args +from raps.sim_config import args, sim_config import numpy as np diff --git a/raps/args.py b/raps/sim_config.py similarity index 100% rename from raps/args.py rename to raps/sim_config.py diff --git a/raps/config.py b/raps/system_config.py similarity index 100% rename from raps/config.py rename to raps/system_config.py diff --git a/raps/telemetry.py b/raps/telemetry.py index 77e5112..c6815cd 100644 --- a/raps/telemetry.py +++ b/raps/telemetry.py @@ -18,7 +18,7 @@ from types import ModuleType if __name__ == "__main__": - # from raps.args import args,args_dict + # from raps.sim_config import args, args_dict parser = argparse.ArgumentParser(description='Telemetry data validator') parser.add_argument('--jid', type=str, default='*', help='Replay job id') parser.add_argument('-f', '--replay', nargs='+', type=str, @@ -49,7 +49,7 @@ import pandas as pd from tqdm import tqdm # from rich.progress import track -from raps.config import get_system_config +from raps.system_config import get_system_config from raps.job import Job, job_dict import matplotlib.pyplot as plt from raps.plotting import ( @@ -58,7 +58,7 @@ from raps.plotting import ( plot_network_histogram ) from raps.utils import next_arrival_byconfargs, create_casename, convert_to_time_unit -# from raps.args import args, args_dict +# from raps.sim_config import args, args_dict class Telemetry: diff --git a/raps/workload.py b/raps/workload.py index 6ca0c8c..5b9a7fb 100644 --- a/raps/workload.py +++ b/raps/workload.py @@ -800,8 +800,8 @@ def plot_job_hist(jobs, config=None, dist_split=None, gantt_nodes=False): plt.show() def run_workload(): - from raps.args import args, args_dict - from raps.config import get_system_config + from raps.sim_config import args, args_dict, sim_config + from raps.system_config import get_system_config config = get_system_config(args.system).get_legacy() if args.replay: td = Telemetry(**args_dict) diff --git a/tests/unit/test_system_config.py b/tests/unit/test_system_config.py index 9074954..40ac0fe 100644 --- a/tests/unit/test_system_config.py +++ b/tests/unit/test_system_config.py @@ -1,5 +1,5 @@ import pytest -from raps.config import list_systems, get_system_config, CONFIG_PATH, get_partition_configs +from raps.system_config import list_systems, get_system_config, CONFIG_PATH, get_partition_configs @pytest.mark.parametrize("system_name", list_systems()) def test_configs(system_name): -- GitLab From 151887c120c8d84f49c710be0f41cac4aebaa781 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Fri, 22 Aug 2025 14:36:38 -0400 Subject: [PATCH 07/23] Add methods to get "legacy" formats --- raps/sim_config.py | 35 +++++++++++++++++++++++++---------- raps/system_config.py | 4 ++-- 2 files changed, 27 insertions(+), 12 deletions(-) diff --git a/raps/sim_config.py b/raps/sim_config.py index 8603d6b..4e70402 100644 --- a/raps/sim_config.py +++ b/raps/sim_config.py @@ -245,10 +245,29 @@ class SimConfig(BaseModel): return self - -def parse_args(cli_args = None) -> tuple[SimConfig, argparse.Namespace]: + def get_legacy_args(self): + """ + Return as an argparse.Namespace object for backwards compatability + """ + return argparse.Namespace(**self.get_legacy_args_dict()) + + def get_legacy_args_dict(self): + """ + Return as a dict object. This is for backwards compatibility with the rest of RAPS code so + we can migrate to the new config gradually. The dict also has a "model" key that contains + the SimConfig object itself. + """ + args_dict = self.model_dump(mode = "json") + # validate has been renamed to power_scope + args_dict['validate'] = args_dict["power_scope"] == "node" + args_dict['model'] = self + return args_dict + + +def parse_args(cli_args = None) -> SimConfig: parser = argparse.ArgumentParser( description="Resource Allocator & Power Simulator (RAPS)", + allow_abbrev=False, formatter_class=argparse.ArgumentDefaultsHelpFormatter, ) parser.add_argument( @@ -284,12 +303,8 @@ def parse_args(cli_args = None) -> tuple[SimConfig, argparse.Namespace]: else: config_file_data = {} - sim_config = model_validate_args(args, config_file_data) - - args_dict = sim_config.model_dump(mode = "json") - # Temporary backwards compatibility addition - args_dict['validate'] = args_dict["power_scope"] == "node" - return sim_config, argparse.Namespace(**args_dict) + return model_validate_args(args, config_file_data) -sim_config, args = parse_args() -args_dict = vars(args) +sim_config = parse_args() +args = sim_config.get_legacy_args() +args_dict = sim_config.get_legacy_args_dict() diff --git a/raps/system_config.py b/raps/system_config.py index 57ae4e9..16d67cb 100644 --- a/raps/system_config.py +++ b/raps/system_config.py @@ -169,7 +169,7 @@ class SystemConfig(BaseModel): """ Return the system config as a flattened, uppercased dict. This is for backwards compatibility with the rest of RAPS code so we can migrate to the new config format - gradually. The dict also as a "config" key that contains the SystemConfig object itself. + gradually. The dict also as a "model" key that contains the SystemConfig object itself. """ renames = { # fields that need to be renamed to something other than just .upper() "system_name": "system_name", @@ -188,7 +188,7 @@ class SystemConfig(BaseModel): config_dict[k] = v # rename keys config_dict = {renames.get(k, k.upper()): v for k, v in config_dict.items()} - config_dict['config'] = self + config_dict['model'] = self return config_dict class MultiPartitionSystemConfig(BaseModel): -- GitLab From da8e36152c7065dec122a827af8d30402d061f22 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Fri, 22 Aug 2025 15:28:23 -0400 Subject: [PATCH 08/23] Use Path objects --- raps/sim_config.py | 9 +++++++++ raps/utils.py | 4 ++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/raps/sim_config.py b/raps/sim_config.py index 4e70402..fa0fcbf 100644 --- a/raps/sim_config.py +++ b/raps/sim_config.py @@ -260,6 +260,15 @@ class SimConfig(BaseModel): args_dict = self.model_dump(mode = "json") # validate has been renamed to power_scope args_dict['validate'] = args_dict["power_scope"] == "node" + + # Convert Path objects to str + if args_dict['output']: + args_dict['output'] = str(args_dict['output']) + if args_dict['replay']: + args_dict['replay'] = [str(p) for p in args_dict['replay']] + if args_dict['accounts_json']: + args_dict['accounts_json'] = str(args_dict['accounts_json']) + args_dict['model'] = self return args_dict diff --git a/raps/utils.py b/raps/utils.py index f31c80f..61ef801 100644 --- a/raps/utils.py +++ b/raps/utils.py @@ -19,7 +19,7 @@ import sys import uuid import json import argparse - +from pathlib import Path from typing import Annotated as A, TypeVar, Callable from pydantic import BaseModel, TypeAdapter, AfterValidator from pydantic_settings import BaseSettings, SettingsConfigDict, CliApp, CliSettingsSource @@ -620,7 +620,7 @@ class ValueComparableEnum(Enum): return hash(self.value) -ExpandedPath = A[str, AfterValidator(lambda v: os.path.expanduser(os.path.expandvars(v)))] +ExpandedPath = A[Path, AfterValidator(lambda v: Path(v).expanduser().resolve())] """ Type that that expands ~ and environment variables in a path string """ -- GitLab From 27bd9a035efcb445c655b9d0c3a6882b7a2db49e Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Fri, 22 Aug 2025 15:42:12 -0400 Subject: [PATCH 09/23] Add raps_config --- raps/raps_config.py | 33 +++++++++++++++++++++++++++++++++ raps/system_config.py | 12 +++++------- 2 files changed, 38 insertions(+), 7 deletions(-) create mode 100644 raps/raps_config.py diff --git a/raps/raps_config.py b/raps/raps_config.py new file mode 100644 index 0000000..17b5ce7 --- /dev/null +++ b/raps/raps_config.py @@ -0,0 +1,33 @@ +from pathlib import Path +from raps.utils import ExpandedPath +from pydantic_settings import BaseSettings, SettingsConfigDict, YamlConfigSettingsSource +ROOT_DIR = Path(__file__).parent.parent + +class RapsConfig(BaseSettings): + """ + General settings for raps. Pydantic will automatically populate this model from env vars or a + .env file. + """ + # TODO I think we should move more of general/ui related settings from SimConfig into here. + # We'll be using SimConfig in the simulation server and those settings aren't applicable there, + # so it makes sense to keep SimConfig scoped to the logical operation of the sim. + + system_config_dir: ExpandedPath = ROOT_DIR / 'config' + """ Directory containing system configuration files """ + + model_config = SettingsConfigDict( + yaml_file = "raps_config.yaml", + env_prefix = 'raps_', + env_nested_delimiter = '__', + nested_model_default_partial_update = True, + ) + + # Customize setting sources, we'll use yaml config file instead of the default .env + @classmethod + def settings_customise_sources( + cls, settings_cls, + init_settings, env_settings, dotenv_settings, file_secret_settings, + ): + return (init_settings, env_settings, YamlConfigSettingsSource(settings_cls),) + +raps_config = RapsConfig() diff --git a/raps/system_config.py b/raps/system_config.py index 16d67cb..deeafd9 100644 --- a/raps/system_config.py +++ b/raps/system_config.py @@ -1,11 +1,9 @@ -import os, functools, glob, fnmatch, re +import functools, glob, fnmatch from typing import Any, Literal, Annotated as A from pathlib import Path import yaml from pydantic import BaseModel, computed_field, model_validator, field_validator - -ROOT_DIR = Path(__file__).parent.parent -CONFIG_PATH = Path(os.environ.get("RAPS_CONFIG", ROOT_DIR / 'config')).resolve() +from raps.raps_config import raps_config # Define Pydantic models for the config to handle parsing and validation @@ -211,8 +209,8 @@ class MultiPartitionSystemConfig(BaseModel): def list_systems() -> list[str]: """ Lists all available systems """ return sorted([ - str(p.relative_to(CONFIG_PATH)).removesuffix(".yaml") - for p in CONFIG_PATH.rglob("*.yaml") + str(p.relative_to(raps_config.system_config_dir)).removesuffix(".yaml") + for p in raps_config.system_config_dir.rglob("*.yaml") ]) @@ -224,7 +222,7 @@ def get_system_config(system: str) -> SystemConfig: systems defined in RAPS_CONFIG. """ if system in list_systems(): - config_path = CONFIG_PATH / f"{system}.yaml" + config_path = raps_config.system_config_dir / f"{system}.yaml" system_name = system else: config_path = Path(system).resolve() -- GitLab From 768de19f173f37ea6dc39748d3fa7f19007a2ab2 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Fri, 22 Aug 2025 16:08:22 -0400 Subject: [PATCH 10/23] Fix test_system_config --- tests/unit/test_system_config.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/unit/test_system_config.py b/tests/unit/test_system_config.py index 40ac0fe..7b23c82 100644 --- a/tests/unit/test_system_config.py +++ b/tests/unit/test_system_config.py @@ -1,5 +1,6 @@ import pytest -from raps.system_config import list_systems, get_system_config, CONFIG_PATH, get_partition_configs +from raps.raps_config import raps_config +from raps.system_config import list_systems, get_system_config, get_partition_configs @pytest.mark.parametrize("system_name", list_systems()) def test_configs(system_name): @@ -15,8 +16,8 @@ def test_configs(system_name): (["lumi/*"], "lumi", ["lumi/lumi-c", "lumi/lumi-g"]), (["frontier", "summit"], "frontier+summit", ["frontier", "summit"]), # test passing arbitrary paths - ([str(CONFIG_PATH / "lumi")], "lumi", ["lumi-c", "lumi-g"]), - ([str(CONFIG_PATH / "lumi/lumi-*")], "lumi-c+lumi-g", ["lumi-c", "lumi-g"]), + ([str(raps_config.system_config_dir / "lumi")], "lumi", ["lumi-c", "lumi-g"]), + ([str(raps_config.system_config_dir / "lumi/lumi-*")], "lumi-c+lumi-g", ["lumi-c", "lumi-g"]), ]) def test_get_partition_configs(input, expected_name, expected_configs): result = get_partition_configs(input) -- GitLab From a5de476a389780de20bfd2622d6efe1193fa8e02 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Fri, 22 Aug 2025 16:27:31 -0400 Subject: [PATCH 11/23] Remove todo comment --- raps/sim_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/raps/sim_config.py b/raps/sim_config.py index fa0fcbf..0a9d9e6 100644 --- a/raps/sim_config.py +++ b/raps/sim_config.py @@ -67,7 +67,7 @@ class SimConfig(BaseModel): seed: bool = False """ Set RNG seed for deterministic simulation """ - output: ExpandedPath|None = None # TODO this no longer supports empty + output: ExpandedPath|None = None """ Output power, cooling, and loss models for later analysis. Argument specifies name. """ debug: bool = False -- GitLab From 6fdcdcc46b9a532510594e6dbe5b27db7277c266 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Fri, 22 Aug 2025 17:14:06 -0400 Subject: [PATCH 12/23] Fixes to pydantic_add_args --- raps/utils.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/raps/utils.py b/raps/utils.py index 61ef801..915ee92 100644 --- a/raps/utils.py +++ b/raps/utils.py @@ -644,7 +644,9 @@ def pydantic_add_args( }) class SettingsModel(model_cls, BaseSettings): @classmethod - def settings_customise_sources(cls, settings_cls, init_settings, *args): + def settings_customise_sources(cls, settings_cls, + init_settings, env_settings, dotenv_settings, file_secret_settings, + ): return (init_settings,) # Don't load from env vars or anything else model_config = model_config_dict @@ -652,9 +654,11 @@ def pydantic_add_args( cli_settings_source = CliSettingsSource(SettingsModel, root_parser = parser) def model_validate_args(args: argparse.Namespace, data: dict|None = None): - return CliApp.run(model_cls, + model = CliApp.run(SettingsModel, cli_args = args, cli_settings_source = cli_settings_source, **(data or {}), ) + # Recreate model so we don't return the SettingsModel subclass + return model_cls.model_validate(model.model_dump()) return model_validate_args -- GitLab From 82581871c2e489bf596f80fa4bf3f4a769751c05 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Fri, 22 Aug 2025 17:28:55 -0400 Subject: [PATCH 13/23] Fix tests --- tests/systems/test_main_time_delta_sub_second_run.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/systems/test_main_time_delta_sub_second_run.py b/tests/systems/test_main_time_delta_sub_second_run.py index a73be33..7845670 100644 --- a/tests/systems/test_main_time_delta_sub_second_run.py +++ b/tests/systems/test_main_time_delta_sub_second_run.py @@ -3,7 +3,7 @@ import subprocess import gc import pytest from tests.util import PROJECT_ROOT -from raps.utils import convert_seconds_to_hhmmss, convert_to_time_unit +from raps.utils import convert_seconds_to_hhmmss, convert_to_time_unit, infer_time_unit pytestmark = [ @@ -39,7 +39,7 @@ def test_main_time_delta_run(system, system_config, time_arg, tdelta_arg, random ], capture_output=True, text=True, stdin=subprocess.DEVNULL) assert result.returncode == 0, f"Failed on {system}: {result.stderr}" time = convert_to_time_unit(time_arg) - td = convert_to_time_unit(tdelta_arg) + td = infer_time_unit(tdelta_arg) #assert f"Time Simulated: {convert_seconds_to_hhmmss(int(time / td_ds))}" in result.stdout assert f"Time Simulated: {convert_seconds_to_hhmmss(time)}" in result.stdout -- GitLab From 82fe091c532427696487b3b649b312aaccd310a5 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Mon, 25 Aug 2025 08:30:45 -0400 Subject: [PATCH 14/23] Update docstring --- raps/system_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/raps/system_config.py b/raps/system_config.py index deeafd9..9c1e82f 100644 --- a/raps/system_config.py +++ b/raps/system_config.py @@ -219,7 +219,7 @@ def get_system_config(system: str) -> SystemConfig: """ Returns the system config as a Pydantic object. system can either be a path to a custom .yaml file, or the name of one of the pre-configured - systems defined in RAPS_CONFIG. + systems defined in RAPS_SYSTEM_CONFIG_DIR. """ if system in list_systems(): config_path = raps_config.system_config_dir / f"{system}.yaml" -- GitLab From e8408b04ee42d62af7370aa2c9d923fd3d6be85f Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Mon, 25 Aug 2025 08:31:58 -0400 Subject: [PATCH 15/23] Rename model key to avoid key conflicts --- raps/sim_config.py | 6 +++--- raps/system_config.py | 5 +++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/raps/sim_config.py b/raps/sim_config.py index 0a9d9e6..201371c 100644 --- a/raps/sim_config.py +++ b/raps/sim_config.py @@ -254,8 +254,8 @@ class SimConfig(BaseModel): def get_legacy_args_dict(self): """ Return as a dict object. This is for backwards compatibility with the rest of RAPS code so - we can migrate to the new config gradually. The dict also has a "model" key that contains - the SimConfig object itself. + we can migrate to the new config gradually. The dict also has a "sim_config" key that + contains the SimConfig object itself. """ args_dict = self.model_dump(mode = "json") # validate has been renamed to power_scope @@ -269,7 +269,7 @@ class SimConfig(BaseModel): if args_dict['accounts_json']: args_dict['accounts_json'] = str(args_dict['accounts_json']) - args_dict['model'] = self + args_dict['sim_config'] = self return args_dict diff --git a/raps/system_config.py b/raps/system_config.py index 9c1e82f..8330e80 100644 --- a/raps/system_config.py +++ b/raps/system_config.py @@ -167,7 +167,8 @@ class SystemConfig(BaseModel): """ Return the system config as a flattened, uppercased dict. This is for backwards compatibility with the rest of RAPS code so we can migrate to the new config format - gradually. The dict also as a "model" key that contains the SystemConfig object itself. + gradually. The dict also as a "system_config" key that contains the SystemConfig object + itself. """ renames = { # fields that need to be renamed to something other than just .upper() "system_name": "system_name", @@ -186,7 +187,7 @@ class SystemConfig(BaseModel): config_dict[k] = v # rename keys config_dict = {renames.get(k, k.upper()): v for k, v in config_dict.items()} - config_dict['model'] = self + config_dict['system_config'] = self return config_dict class MultiPartitionSystemConfig(BaseModel): -- GitLab From 51ae34e1f5dc5fb4e43a45bba3ddf4e889a632c9 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Mon, 25 Aug 2025 08:40:33 -0400 Subject: [PATCH 16/23] Fix more tests --- tests/unit/test_system_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/test_system_config.py b/tests/unit/test_system_config.py index 7b23c82..b02f154 100644 --- a/tests/unit/test_system_config.py +++ b/tests/unit/test_system_config.py @@ -8,7 +8,7 @@ def test_configs(system_name): config = get_system_config(system_name) assert config.system_name == system_name assert config.get_legacy()['system_name'] == system_name - assert config.get_legacy()['config'] == config + assert config.get_legacy()['system_config'] == config @pytest.mark.parametrize("input,expected_name,expected_configs", [ -- GitLab From 968abbf3f8f6a5570853d88ee77bfa36ede21330 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Mon, 25 Aug 2025 08:55:14 -0400 Subject: [PATCH 17/23] Print out yaml when running sim_config directly --- raps/sim_config.py | 5 ++++- raps/utils.py | 27 ++++++++++++++++++++++++++- 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/raps/sim_config.py b/raps/sim_config.py index 201371c..4a64849 100644 --- a/raps/sim_config.py +++ b/raps/sim_config.py @@ -9,7 +9,7 @@ from raps.schedulers.default import PolicyType, BackfillType from raps.utils import ( parse_time_unit, convert_to_time_unit, infer_time_unit, ExpandedPath, - pydantic_add_args, + pydantic_add_args, yaml_dump, ) from pydantic import BaseModel, model_validator, computed_field @@ -317,3 +317,6 @@ def parse_args(cli_args = None) -> SimConfig: sim_config = parse_args() args = sim_config.get_legacy_args() args_dict = sim_config.get_legacy_args_dict() + +if __name__ == "__main__": + print(yaml_dump(sim_config.model_dump(mode="json"))) diff --git a/raps/utils.py b/raps/utils.py index 915ee92..937be4a 100644 --- a/raps/utils.py +++ b/raps/utils.py @@ -23,7 +23,7 @@ from pathlib import Path from typing import Annotated as A, TypeVar, Callable from pydantic import BaseModel, TypeAdapter, AfterValidator from pydantic_settings import BaseSettings, SettingsConfigDict, CliApp, CliSettingsSource - +import yaml from raps.job import Job @@ -662,3 +662,28 @@ def pydantic_add_args( # Recreate model so we don't return the SettingsModel subclass return model_cls.model_validate(model.model_dump()) return model_validate_args + +def yaml_dump(data): + """ Dumps yaml with pretty formatting """ + class IndentDumper(yaml.Dumper): + def represent_data(self, data): + # Quote all strings with special characters to avoid confusion + if ( + isinstance(data, str) and + (not re.fullmatch(r"[\w-]+", data) or data.isdigit()) and + not "\n" in data + ): + return self.represent_scalar('tag:yaml.org,2002:str', data, style='"') + return super(IndentDumper, self).represent_data(data) + + def increase_indent(self, flow=False, indentless=False): + # Indent lists + return super(IndentDumper, self).increase_indent(flow, False) + + return yaml.dump( + data, + Dumper = IndentDumper, + sort_keys = False, + indent = 2, + allow_unicode = True, + ) -- GitLab From cd64d6cb9f191ff28259a8535abef2aacaac787b Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Mon, 25 Aug 2025 09:01:26 -0400 Subject: [PATCH 18/23] Improve parse_time_unit --- raps/utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/raps/utils.py b/raps/utils.py index 937be4a..f70f3bf 100644 --- a/raps/utils.py +++ b/raps/utils.py @@ -471,7 +471,9 @@ TIME_UNITS = { def parse_time_unit(unit) -> timedelta: parsed_unit = unit - if isinstance(unit, str): + if TypeAdapter(timedelta).validator.isinstance_python(unit): + parsed_unit = TypeAdapter(timedelta).validate_python(unit) + elif isinstance(unit, str): parsed_unit = TIME_UNITS.get(unit) if not isinstance(parsed_unit, timedelta): raise ValueError(f"Invalid time unit {unit}") -- GitLab From b2d3b49ba119a4103fbc6e5cf7f59a849388f5d5 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Mon, 25 Aug 2025 09:19:48 -0400 Subject: [PATCH 19/23] Fix time parsing Serializing then re-parsing would change the time values because it would re-multiple the numbers by time_unit. --- raps/sim_config.py | 9 +++++++-- raps/utils.py | 9 +++++---- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/raps/sim_config.py b/raps/sim_config.py index 4a64849..156c2d4 100644 --- a/raps/sim_config.py +++ b/raps/sim_config.py @@ -9,7 +9,7 @@ from raps.schedulers.default import PolicyType, BackfillType from raps.utils import ( parse_time_unit, convert_to_time_unit, infer_time_unit, ExpandedPath, - pydantic_add_args, yaml_dump, + pydantic_add_args, yaml_dump, parse_td, ) from pydantic import BaseModel, model_validator, computed_field @@ -217,16 +217,21 @@ class SimConfig(BaseModel): if data.get('time_unit') is not None: time_unit = parse_time_unit(data['time_unit']) + input_time_unit = time_unit else: time_unit = min( [infer_time_unit(data[f]) for f in time_fields if data.get(f)], default = timedelta(seconds = 1) ) + # When "inferring" time unit interpret raw numbers as seconds. + # E.g. `-t 10 --time-delta 1ds` should be `-t 10s --time-delta 1ds` + input_time_unit = timedelta(seconds=1) data['time_unit'] = time_unit for field in time_fields: if data.get(field) is not None: - data[field] = convert_to_time_unit(data[field], time_unit) + td = parse_td(data[field], input_time_unit) + data[field] = convert_to_time_unit(td, time_unit) return data diff --git a/raps/utils.py b/raps/utils.py index f70f3bf..98a6e58 100644 --- a/raps/utils.py +++ b/raps/utils.py @@ -481,10 +481,11 @@ def parse_time_unit(unit) -> timedelta: raise ValueError("Only time units of s, ds, cs, and ms are supported") return parsed_unit -def parse_td(td) -> timedelta: - """ Parse into a timedelta. """ +def parse_td(td, unit: str|timedelta = 's') -> timedelta: + """ Parse into a timedelta. Pass unit to interpret raw numbers as (default seconds) """ + unit = parse_time_unit(unit) if TypeAdapter(int).validator.isinstance_python(td): - return timedelta(seconds = TypeAdapter(int).validate_python(td)) + return unit * TypeAdapter(int).validate_python(td) if TypeAdapter(timedelta).validator.isinstance_python(td): return TypeAdapter(timedelta).validate_python(td) if isinstance(td, str): @@ -499,7 +500,7 @@ def convert_to_time_unit(td, unit: str|timedelta = 's'): Converts to integer number of time unit Throws if the given time is less than the unit """ - num = parse_td(td) / parse_time_unit(unit) + num = parse_td(td, unit) / parse_time_unit(unit) if (num != 0 and num < 1) or not num.is_integer(): raise ValueError(f"{td} is not divisible by time unit {unit}") return int(num) -- GitLab From fe1b5209823d35ea9b7cbbe80615bb2a553ebbf8 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Mon, 25 Aug 2025 10:02:32 -0400 Subject: [PATCH 20/23] Formatting --- .flake8 | 2 +- main.py | 2 +- multi-part-sim-mpi.py | 4 +- multi-part-sim.py | 3 +- raps/dataloaders/frontier.py | 1 - raps/raps_config.py | 10 ++- raps/sim_config.py | 104 ++++++++++++----------- raps/system_config.py | 77 ++++++++++------- raps/utils.py | 52 +++++++----- raps/workload.py | 5 +- tests/systems/test_workload_synthetic.py | 1 - tests/test_main.py | 1 - tests/unit/test_system_config.py | 1 + tests/unit/test_utils.py | 13 +-- 14 files changed, 150 insertions(+), 126 deletions(-) diff --git a/.flake8 b/.flake8 index bd48511..ce4ab0a 100644 --- a/.flake8 +++ b/.flake8 @@ -1,3 +1,3 @@ [flake8] -exclude = .git, __pycache__, venv*, simulation_results, third_party +exclude = .git, __pycache__, venv*, simulation_results, third_party, models max-line-length = 120 diff --git a/main.py b/main.py index 1425f6b..c3ba946 100644 --- a/main.py +++ b/main.py @@ -45,7 +45,7 @@ from raps.stats import ( print_formatted_report ) -from raps.sim_config import args, args_dict, sim_config +from raps.sim_config import args, args_dict check_python_version() diff --git a/multi-part-sim-mpi.py b/multi-part-sim-mpi.py index 820bd09..eabb19b 100644 --- a/multi-part-sim-mpi.py +++ b/multi-part-sim-mpi.py @@ -16,10 +16,8 @@ from raps.flops import FLOPSManager from raps.engine import Engine from raps.ui import LayoutManager from raps.system_config import get_partition_configs -from raps.sim_config import args, sim_config +from raps.sim_config import args import random -import os -import glob from raps.helpers import check_python_version check_python_version() diff --git a/multi-part-sim.py b/multi-part-sim.py index bb61ba5..587dffb 100644 --- a/multi-part-sim.py +++ b/multi-part-sim.py @@ -17,10 +17,9 @@ from raps.flops import FLOPSManager from raps.engine import Engine from raps.ui import LayoutManager from raps.system_config import get_partition_configs -from raps.sim_config import args, sim_config +from raps.sim_config import args import random import os -import glob from raps.helpers import check_python_version check_python_version() diff --git a/raps/dataloaders/frontier.py b/raps/dataloaders/frontier.py index 3154992..8491617 100644 --- a/raps/dataloaders/frontier.py +++ b/raps/dataloaders/frontier.py @@ -9,7 +9,6 @@ # To analyze the data python -m raps.telemetry -f $DPATH/slurm/joblive/$DATEDIR,$DPATH/jobprofile/$DATEDIR """ -import ast import time import numpy as np import pandas as pd diff --git a/raps/raps_config.py b/raps/raps_config.py index 17b5ce7..6eddca8 100644 --- a/raps/raps_config.py +++ b/raps/raps_config.py @@ -3,6 +3,7 @@ from raps.utils import ExpandedPath from pydantic_settings import BaseSettings, SettingsConfigDict, YamlConfigSettingsSource ROOT_DIR = Path(__file__).parent.parent + class RapsConfig(BaseSettings): """ General settings for raps. Pydantic will automatically populate this model from env vars or a @@ -16,10 +17,10 @@ class RapsConfig(BaseSettings): """ Directory containing system configuration files """ model_config = SettingsConfigDict( - yaml_file = "raps_config.yaml", - env_prefix = 'raps_', - env_nested_delimiter = '__', - nested_model_default_partial_update = True, + yaml_file="raps_config.yaml", + env_prefix='raps_', + env_nested_delimiter='__', + nested_model_default_partial_update=True, ) # Customize setting sources, we'll use yaml config file instead of the default .env @@ -30,4 +31,5 @@ class RapsConfig(BaseSettings): ): return (init_settings, env_settings, YamlConfigSettingsSource(settings_cls),) + raps_config = RapsConfig() diff --git a/raps/sim_config.py b/raps/sim_config.py index 156c2d4..127cec3 100644 --- a/raps/sim_config.py +++ b/raps/sim_config.py @@ -1,10 +1,9 @@ import argparse -import os import sys import yaml from datetime import timedelta from pathlib import Path -from typing import Literal, Annotated as A +from typing import Literal from raps.schedulers.default import PolicyType, BackfillType from raps.utils import ( @@ -17,8 +16,9 @@ from pydantic_settings import SettingsConfigDict Distribution = Literal['uniform', 'weibull', 'normal'] + class SimConfig(BaseModel): - system: str|None = None + system: str | None = None """ System config to use """ partitions: list[str] = [] """ List of multiple system configurations for a multi-partition run. Can contain wildcards """ @@ -29,12 +29,12 @@ class SimConfig(BaseModel): """ Include network model """ # Simulation runtime options - fastforward: int|None = None + fastforward: int | None = None """ Fast-forward by time amount (unit specified by `time_unit`, default seconds). Can pass a string like 15s, 1m, 1h """ - time: int|None = None + time: int | None = None """ Length of time to simulate (unit specified by `time_unit`, default seconds). Can pass a string like 123, 27m, 3h, 7d @@ -52,7 +52,7 @@ class SimConfig(BaseModel): @computed_field @property def downscale(self) -> int: - return int(timedelta(seconds = 1) / self.time_unit) + return int(timedelta(seconds=1) / self.time_unit) start: str = "2021-05-21T13:00" """ ISO8601 start of simulation """ @@ -67,7 +67,7 @@ class SimConfig(BaseModel): seed: bool = False """ Set RNG seed for deterministic simulation """ - output: ExpandedPath|None = None + output: ExpandedPath | None = None """ Output power, cooling, and loss models for later analysis. Argument specifies name. """ debug: bool = False @@ -78,13 +78,13 @@ class SimConfig(BaseModel): """ Enable verbose output """ layout: Literal["layout1", "layout2"] = "layout1" """ UI layout """ - plot: list[Literal["power", "loss", "pue", "temp", "util"]]|None = None + plot: list[Literal["power", "loss", "pue", "temp", "util"]] | None = None """ Plots to generate """ imtype: Literal["png", "svg", "jpg", "pdf", "eps"] = "png" """ Plot image type """ - replay: list[ExpandedPath]|None = None + replay: list[ExpandedPath] | None = None """ Either: path/to/joblive path/to/jobprofile OR filename.npz """ encrypt: bool = False @@ -103,7 +103,7 @@ class SimConfig(BaseModel): """ Grab data from live system. """ # Workload arguments (TODO split into separate model) - workload: Literal['random', 'benchmark', 'peak', 'idle', 'synthetic', 'multitenant']|None = None + workload: Literal['random', 'benchmark', 'peak', 'idle', 'synthetic', 'multitenant'] | None = None """ Type of synthetic workload """ multimodal: list[float] = [1.0] @@ -112,52 +112,52 @@ class SimConfig(BaseModel): in order to the list of the --distribution argument list. """ # Jobsize - jobsize_distribution: list[Distribution]|None = None + jobsize_distribution: list[Distribution] | None = None """ Distribution type """ - jobsize_normal_mean: float|None = None + jobsize_normal_mean: float | None = None """ Mean (mu) for Normal distribution """ - jobsize_normal_stddev: float|None = None + jobsize_normal_stddev: float | None = None """ Standard deviation (sigma) for Normal distribution """ - jobsize_weibull_shape: float|None = None + jobsize_weibull_shape: float | None = None """ Jobsize shape of weibull """ - jobsize_weibull_scale: float|None = None + jobsize_weibull_scale: float | None = None """ Jobsize scale of weibull """ - jobsize_is_of_degree: int|None = None + jobsize_is_of_degree: int | None = None """ Draw jobsizes from distribution of degree N (squared,cubed). """ - jobsize_is_power_of: int|None = None + jobsize_is_power_of: int | None = None """ Draw jobsizes from distribution of power of N (2->2^x,3->3^x). """ # Walltime - walltime_distribution: list[Distribution]|None = None - """ Distribution type """ - walltime_normal_mean: float|None = None + walltime_distribution: list[Distribution] | None = None + """ Distribution type """ + walltime_normal_mean: float | None = None """ Walltime mean (mu) for Normal distribution """ - walltime_normal_stddev: float|None = None + walltime_normal_stddev: float | None = None """ Walltime standard deviation (sigma) for Normal distribution """ - walltime_weibull_shape: float|None = None + walltime_weibull_shape: float | None = None """ Walltime shape of weibull """ - walltime_weibull_scale: float|None = None + walltime_weibull_scale: float | None = None """ Walltime scale of weibull """ # Utilizations (TODO should probably make a reusable "Distribution" submodel) cpuutil_distribution: list[Distribution] = ['uniform'] """ Distribution type """ - cpuutil_normal_mean: float|None = None + cpuutil_normal_mean: float | None = None """ Walltime mean (mu) for Normal distribution """ - cpuutil_normal_stddev: float|None = None + cpuutil_normal_stddev: float | None = None """ Walltime standard deviation (sigma) for Normal distribution """ - cpuutil_weibull_shape: float|None = None + cpuutil_weibull_shape: float | None = None """ Walltime shape of weibull """ - cpuutil_weibull_scale: float|None = None + cpuutil_weibull_scale: float | None = None """ Walltime scale of weibull """ gpuutil_distribution: list[Distribution] = ['uniform'] """ Distribution type """ - gpuutil_normal_mean: float|None = None + gpuutil_normal_mean: float | None = None """ Walltime mean (mu) for Normal distribution """ - gpuutil_normal_stddev: float|None = None + gpuutil_normal_stddev: float | None = None """ Walltime standard deviation (sigma) for Normal distribution """ - gpuutil_weibull_shape: float|None = None + gpuutil_weibull_shape: float | None = None """ Walltime shape of weibull """ - gpuutil_weibull_scale: float|None = None + gpuutil_weibull_scale: float | None = None """ Walltime scale of weibull """ gantt_nodes: bool = False """ Print Gannt with nodes required as line thickness (default false) """ @@ -167,36 +167,36 @@ class SimConfig(BaseModel): "default", "scheduleflow", "nrel", "anl", "flux", "experimental", "multitenant", ] = "default" """ Scheduler name """ - policy: PolicyType|None = None + policy: PolicyType | None = None """ Schedule policy """ - backfill: BackfillType|None = None + backfill: BackfillType | None = None """ Backfill policy """ # Arrival arrival: Literal["prescribed", "poisson"] = "prescribed" """ Modify arrival distribution (poisson) or use original submit times (prescribed) """ - job_arrival_time: int|None = None + job_arrival_time: int | None = None """ Poisson arrival (seconds). Overrides system config scheduler.job_arrival_time """ - job_arrival_rate: float|None = None # TODO define default here + job_arrival_rate: float | None = None # TODO define default here """ Modify Poisson rate (default 1) """ # Accounts accounts: bool = False - accounts_json: ExpandedPath|None = None + accounts_json: ExpandedPath | None = None """ Path to accounts JSON file from previous run """ # Downtime - downtime_first: int|None = None + downtime_first: int | None = None """ First downtime (unit specified by `time_unit`, default seconds). Can pass a string like 27m, 3h, 7d """ - downtime_interval: str|None = None + downtime_interval: str | None = None """ Interval between downtimes (unit specified by `time_unit`, default seconds). Can pass a string like 123, 27m, 3h, 7d """ - downtime_length: str|None = None + downtime_length: str | None = None """ Downtime length (unit specified by `time_unit`, default seconds). Can pass a string like 123, 27m, 3h, 7d @@ -208,7 +208,7 @@ class SimConfig(BaseModel): maxqueue: int = 50 """ Specify the max queue length for continuous job generation """ - @model_validator(mode = "before") + @model_validator(mode="before") def _parse_times(cls, data): time_fields = [ "time_delta", "time", "fastforward", @@ -221,7 +221,7 @@ class SimConfig(BaseModel): else: time_unit = min( [infer_time_unit(data[f]) for f in time_fields if data.get(f)], - default = timedelta(seconds = 1) + default=timedelta(seconds=1) ) # When "inferring" time unit interpret raw numbers as seconds. # E.g. `-t 10 --time-delta 1ds` should be `-t 10s --time-delta 1ds` @@ -235,13 +235,13 @@ class SimConfig(BaseModel): return data - @model_validator(mode = "after") + @model_validator(mode="after") def _validate(self): if self.system and self.partitions: raise ValueError("system and partitions are mutually exclusive") elif not self.system and not self.partitions: self.system = "frontier" - + if not self.replay and not self.workload: self.workload = "random" @@ -262,7 +262,7 @@ class SimConfig(BaseModel): we can migrate to the new config gradually. The dict also has a "sim_config" key that contains the SimConfig object itself. """ - args_dict = self.model_dump(mode = "json") + args_dict = self.model_dump(mode="json") # validate has been renamed to power_scope args_dict['validate'] = args_dict["power_scope"] == "node" @@ -278,7 +278,7 @@ class SimConfig(BaseModel): return args_dict -def parse_args(cli_args = None) -> SimConfig: +def parse_args(cli_args=None) -> SimConfig: parser = argparse.ArgumentParser( description="Resource Allocator & Power Simulator (RAPS)", allow_abbrev=False, @@ -286,13 +286,16 @@ def parse_args(cli_args = None) -> SimConfig: ) parser.add_argument( "config_file", nargs="?", default=None, - help = 'YAML sim config file, can be used to configure an experiment instead of using CLI flags. Pass "-" to read from stdin.', + help=( + 'YAML sim config file, can be used to configure an experiment instead of using CLI ' + + 'flags. Pass "-" to read from stdin.' + ) ) - model_validate_args = pydantic_add_args(parser, SimConfig, model_config = SettingsConfigDict( - cli_implicit_flags = True, - cli_kebab_case = True, - cli_shortcuts = { + model_validate_args = pydantic_add_args(parser, SimConfig, model_config=SettingsConfigDict( + cli_implicit_flags=True, + cli_kebab_case=True, + cli_shortcuts={ "partitions": "x", "cooling": "c", "simulate-network": "net", @@ -319,6 +322,7 @@ def parse_args(cli_args = None) -> SimConfig: return model_validate_args(args, config_file_data) + sim_config = parse_args() args = sim_config.get_legacy_args() args_dict = sim_config.get_legacy_args_dict() diff --git a/raps/system_config.py b/raps/system_config.py index 8330e80..caa7eca 100644 --- a/raps/system_config.py +++ b/raps/system_config.py @@ -1,5 +1,7 @@ -import functools, glob, fnmatch -from typing import Any, Literal, Annotated as A +import functools +import glob +import fnmatch +from typing import Any, Literal from pathlib import Path import yaml from pydantic import BaseModel, computed_field, model_validator, field_validator @@ -7,6 +9,7 @@ from raps.raps_config import raps_config # Define Pydantic models for the config to handle parsing and validation + class SystemSystemConfig(BaseModel): num_cdus: int racks_per_cdu: int @@ -25,8 +28,8 @@ class SystemSystemConfig(BaseModel): gpu_peak_flops: float cpu_fp_ratio: float gpu_fp_ratio: float - threads_per_core: int|None = None - cores_per_cpu: int|None = None + threads_per_core: int | None = None + cores_per_cpu: int | None = None @model_validator(mode='after') def _update_down_nodes(self): @@ -74,15 +77,16 @@ class SystemSystemConfig(BaseModel): def available_nodes(self) -> int: return self.total_nodes - len(self.down_nodes) + class SystemPowerConfig(BaseModel): power_gpu_idle: float power_gpu_max: float power_cpu_idle: float power_cpu_max: float power_mem: float - power_nic: float|None = None - power_nic_idle: float|None = None - power_nic_max: float|None = None + power_nic: float | None = None + power_nic_idle: float | None = None + power_nic_max: float | None = None power_nvme: float power_switch: float power_cdu: float @@ -94,6 +98,7 @@ class SystemPowerConfig(BaseModel): rectifier_efficiency: float power_cost: float + class SystemUqConfig(BaseModel): power_gpu_uncertainty: float power_cpu_uncertainty: float @@ -105,8 +110,10 @@ class SystemUqConfig(BaseModel): power_switch_uncertainty: float rectifier_power_uncertainty: float + JobEndStates = Literal["COMPLETED", "FAILED", "CANCELLED", "TIMEOUT", "NODE_FAIL"] + class SystemSchedulerConfig(BaseModel): job_arrival_time: int mtbf: int @@ -118,11 +125,12 @@ class SystemSchedulerConfig(BaseModel): job_end_probs: dict[JobEndStates, float] multitenant: bool = False + class SystemCoolingConfig(BaseModel): cooling_efficiency: float wet_bulb_temp: float - zip_code: str|None = None - country_code: str|None = None + zip_code: str | None = None + country_code: str | None = None fmu_path: str fmu_column_mapping: dict[str, str] w_htwps_key: str @@ -130,27 +138,29 @@ class SystemCoolingConfig(BaseModel): w_cts_key: str temperature_keys: list[str] + class SystemNetworkConfig(BaseModel): topology: Literal["fat-tree", "dragonfly", "torus3d"] network_max_bw: float - latency: float|None = None + latency: float | None = None - fattree_k: int|None = None + fattree_k: int | None = None - dragonfly_d: int|None = None - dragonfly_a: int|None = None - dragonfly_p: int|None = None + dragonfly_d: int | None = None + dragonfly_a: int | None = None + dragonfly_p: int | None = None - torus_x: int|None = None - torus_y: int|None = None - torus_z: int|None = None - torus_wrap: bool|None = None - torus_link_bw: float|None = None - torus_routing: str|None = None + torus_x: int | None = None + torus_y: int | None = None + torus_z: int | None = None + torus_wrap: bool | None = None + torus_link_bw: float | None = None + torus_routing: str | None = None + + hosts_per_router: int | None = None + latency_per_hop: float | None = None + node_coords_csv: str | None = None - hosts_per_router: int|None = None - latency_per_hop: float|None = None - node_coords_csv: str|None = None class SystemConfig(BaseModel): system_name: str @@ -159,9 +169,9 @@ class SystemConfig(BaseModel): system: SystemSystemConfig power: SystemPowerConfig scheduler: SystemSchedulerConfig - uq: SystemUqConfig|None = None - cooling: SystemCoolingConfig|None = None - network: SystemNetworkConfig|None = None + uq: SystemUqConfig | None = None + cooling: SystemCoolingConfig | None = None + network: SystemNetworkConfig | None = None def get_legacy(self) -> dict[str, Any]: """ @@ -170,17 +180,17 @@ class SystemConfig(BaseModel): gradually. The dict also as a "system_config" key that contains the SystemConfig object itself. """ - renames = { # fields that need to be renamed to something other than just .upper() + renames = { # fields that need to be renamed to something other than just .upper() "system_name": "system_name", "w_htwps_key": "W_HTWPs_KEY", "w_ctwps_key": "W_CTWPs_KEY", "w_cts_key": "W_CTs_KEY", "multitenant": "multitenant", } - dump = self.model_dump(mode = "json", exclude_none = True) + dump = self.model_dump(mode="json", exclude_none=True) config_dict: dict[str, Any] = {} - for k, v in dump.items(): # flatten + for k, v in dump.items(): # flatten if isinstance(v, dict): config_dict.update(v) else: @@ -190,6 +200,7 @@ class SystemConfig(BaseModel): config_dict['system_config'] = self return config_dict + class MultiPartitionSystemConfig(BaseModel): system_name: str partitions: list[SystemConfig] @@ -232,7 +243,7 @@ def get_system_config(system: str) -> SystemConfig: if not config_path.is_file(): raise FileNotFoundError(f'"{system}" not found. Valid systems are: {list_systems()}') config = { - "system_name": system_name, # You can override system_name in the yaml as well + "system_name": system_name, # You can override system_name in the yaml as well **yaml.safe_load(config_path.read_text()), } return SystemConfig.model_validate(config) @@ -269,8 +280,8 @@ def get_partition_configs(partitions: list[str]) -> MultiPartitionSystemConfig: if len(parsed_configs) == 1: combined_system_name = parsed_configs[0].system_name else: - combined_system_name = "+".join(dict.fromkeys(combined_system_name)) # dedup, keep order + combined_system_name = "+".join(dict.fromkeys(combined_system_name)) # dedup, keep order return MultiPartitionSystemConfig( - system_name = combined_system_name, - partitions = parsed_configs, + system_name=combined_system_name, + partitions=parsed_configs, ) diff --git a/raps/utils.py b/raps/utils.py index 98a6e58..f54cc71 100644 --- a/raps/utils.py +++ b/raps/utils.py @@ -459,6 +459,7 @@ def next_arrival(lambda_rate, reset=False, start_time=0): -math.log(1.0 - random.random()) / lambda_rate return next_arrival.next_time + TIME_UNITS = { 'd': timedelta(days=1), 'h': timedelta(hours=1), @@ -469,6 +470,7 @@ TIME_UNITS = { 'ms': timedelta(milliseconds=1), } + def parse_time_unit(unit) -> timedelta: parsed_unit = unit if TypeAdapter(timedelta).validator.isinstance_python(unit): @@ -481,7 +483,8 @@ def parse_time_unit(unit) -> timedelta: raise ValueError("Only time units of s, ds, cs, and ms are supported") return parsed_unit -def parse_td(td, unit: str|timedelta = 's') -> timedelta: + +def parse_td(td, unit: str | timedelta = 's') -> timedelta: """ Parse into a timedelta. Pass unit to interpret raw numbers as (default seconds) """ unit = parse_time_unit(unit) if TypeAdapter(int).validator.isinstance_python(td): @@ -495,7 +498,8 @@ def parse_td(td, unit: str|timedelta = 's') -> timedelta: return int(num_str) * TIME_UNITS[unit_str] raise ValueError(f"Invalid timedelta: {td}") -def convert_to_time_unit(td, unit: str|timedelta = 's'): + +def convert_to_time_unit(td, unit: str | timedelta = 's'): """ Converts to integer number of time unit Throws if the given time is less than the unit @@ -505,11 +509,12 @@ def convert_to_time_unit(td, unit: str|timedelta = 's'): raise ValueError(f"{td} is not divisible by time unit {unit}") return int(num) + def infer_time_unit(td) -> timedelta: """ Infers the time unit the user meant for the input string """ parsed_td = parse_td(td) time_unit = None - if isinstance(td, str): # infer unit from string, e.g. 1s or 200ms + if isinstance(td, str): # infer unit from string, e.g. 1s or 200ms re_match = re.fullmatch(r"(\d+)\s*(\w+)", td.strip()) if re_match and re_match[2] in TIME_UNITS: time_unit = TIME_UNITS[re_match[2]] @@ -520,6 +525,7 @@ def infer_time_unit(td) -> timedelta: break return min(TIME_UNITS['s'], time_unit or TIME_UNITS['s']) + def encrypt(name): """Encrypts a given name using SHA-256 and returns the hexadecimal digest.""" encoded_name = name.encode() @@ -627,11 +633,13 @@ ExpandedPath = A[Path, AfterValidator(lambda v: Path(v).expanduser().resolve())] """ Type that that expands ~ and environment variables in a path string """ -T = TypeVar("T", bound = BaseModel) +T = TypeVar("T", bound=BaseModel) + + def pydantic_add_args( parser: argparse.ArgumentParser, model_cls: type[T], - model_config: SettingsConfigDict|None = None, -) -> Callable[[argparse.Namespace, dict|None], T]: + model_config: SettingsConfigDict | None = None, +) -> Callable[[argparse.Namespace, dict | None], T]: """ Add arguments to the parser from the model. Returns a function that can be used to parse the model from the argparse args. @@ -643,29 +651,31 @@ def pydantic_add_args( """ model_config_dict = SettingsConfigDict({ **(model_config or {}), - "cli_parse_args": False, # Don't automatically parse args + "cli_parse_args": False, # Don't automatically parse args }) + class SettingsModel(model_cls, BaseSettings): @classmethod def settings_customise_sources(cls, settings_cls, - init_settings, env_settings, dotenv_settings, file_secret_settings, - ): - return (init_settings,) # Don't load from env vars or anything else + init_settings, env_settings, dotenv_settings, file_secret_settings, + ): + return (init_settings,) # Don't load from env vars or anything else model_config = model_config_dict - cli_settings_source = CliSettingsSource(SettingsModel, root_parser = parser) + cli_settings_source = CliSettingsSource(SettingsModel, root_parser=parser) - def model_validate_args(args: argparse.Namespace, data: dict|None = None): + def model_validate_args(args: argparse.Namespace, data: dict | None = None): model = CliApp.run(SettingsModel, - cli_args = args, - cli_settings_source = cli_settings_source, - **(data or {}), - ) + cli_args=args, + cli_settings_source=cli_settings_source, + **(data or {}), + ) # Recreate model so we don't return the SettingsModel subclass return model_cls.model_validate(model.model_dump()) return model_validate_args + def yaml_dump(data): """ Dumps yaml with pretty formatting """ class IndentDumper(yaml.Dumper): @@ -674,7 +684,7 @@ def yaml_dump(data): if ( isinstance(data, str) and (not re.fullmatch(r"[\w-]+", data) or data.isdigit()) and - not "\n" in data + "\n" not in data ): return self.represent_scalar('tag:yaml.org,2002:str', data, style='"') return super(IndentDumper, self).represent_data(data) @@ -685,8 +695,8 @@ def yaml_dump(data): return yaml.dump( data, - Dumper = IndentDumper, - sort_keys = False, - indent = 2, - allow_unicode = True, + Dumper=IndentDumper, + sort_keys=False, + indent=2, + allow_unicode=True, ) diff --git a/raps/workload.py b/raps/workload.py index 5b9a7fb..151e2c3 100644 --- a/raps/workload.py +++ b/raps/workload.py @@ -711,7 +711,6 @@ def plot_job_hist(jobs, config=None, dist_split=None, gantt_nodes=False): axs[0][1].yaxis.tick_right() else: axs[0][1].set_yticks([]) - pass axs[0][1].hist(cpu_util, bins=100, orientation='vertical', zorder=1, density=True, color='tab:cyan') axs[0][1].axvline(np.mean(cpu_util), color='r', linewidth=1, zorder=3) axs[0][1].set(xlim=[0, config['CPUS_PER_NODE']]) @@ -799,8 +798,9 @@ def plot_job_hist(jobs, config=None, dist_split=None, gantt_nodes=False): plt.show() + def run_workload(): - from raps.sim_config import args, args_dict, sim_config + from raps.sim_config import args, args_dict from raps.system_config import get_system_config config = get_system_config(args.system).get_legacy() if args.replay: @@ -970,7 +970,6 @@ def continuous_job_generation(*, engine, timestep, jobs): if len(engine.queue) <= engine.continuous_workload.args.maxqueue: new_jobs = engine.continuous_workload.generate_jobs() jobs.extend(new_jobs) - pass if __name__ == "__main__": diff --git a/tests/systems/test_workload_synthetic.py b/tests/systems/test_workload_synthetic.py index 2e4c4a1..dd5f8cf 100644 --- a/tests/systems/test_workload_synthetic.py +++ b/tests/systems/test_workload_synthetic.py @@ -1,7 +1,6 @@ import subprocess import gc import pytest -import shlex pytestmark = [ diff --git a/tests/test_main.py b/tests/test_main.py index 4b98263..76f48a3 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -1,4 +1,3 @@ -from tests.smoke import main import subprocess import os diff --git a/tests/unit/test_system_config.py b/tests/unit/test_system_config.py index b02f154..e32b99f 100644 --- a/tests/unit/test_system_config.py +++ b/tests/unit/test_system_config.py @@ -2,6 +2,7 @@ import pytest from raps.raps_config import raps_config from raps.system_config import list_systems, get_system_config, get_partition_configs + @pytest.mark.parametrize("system_name", list_systems()) def test_configs(system_name): # Very basic test that all system configs are valid diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index 50e879a..edf06fe 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -2,20 +2,23 @@ import pytest from datetime import timedelta from raps.utils import parse_td, convert_to_time_unit, infer_time_unit, TIME_UNITS + @pytest.mark.parametrize("input,expected", [ - ("1", timedelta(seconds = 1)), - ("1m", timedelta(minutes = 1)), - (timedelta(minutes = 1), timedelta(minutes = 1)), - (2, timedelta(seconds = 2)), - ("PT2S", timedelta(seconds = 2)), + ("1", timedelta(seconds=1)), + ("1m", timedelta(minutes=1)), + (timedelta(minutes=1), timedelta(minutes=1)), + (2, timedelta(seconds=2)), + ("PT2S", timedelta(seconds=2)), ]) def test_parse_td(input, expected): assert parse_td(input) == expected + def test_parse_td_error(): with pytest.raises(ValueError): parse_td("1x") + @pytest.mark.parametrize("input,unit,expected", [ ("1s", 's', 1), ("1m", 's', 60), -- GitLab From 91557455086503e0d941c7301f31df10357e7e3b Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Mon, 25 Aug 2025 11:53:51 -0400 Subject: [PATCH 21/23] Consistent ordering in get_partition_configs --- raps/system_config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/raps/system_config.py b/raps/system_config.py index caa7eca..e458c68 100644 --- a/raps/system_config.py +++ b/raps/system_config.py @@ -267,10 +267,10 @@ def get_partition_configs(partitions: list[str]) -> MultiPartitionSystemConfig: matched_systems = fnmatch.filter(systems, pat) combined_system_name.extend(s.split("/")[0] for s in matched_systems) elif Path(pat).is_dir(): - matched_systems = Path(pat).glob("*.yaml") + matched_systems = sorted(Path(pat).glob("*.yaml")) combined_system_name.append(Path(pat).name) else: - matched_systems = glob.glob(pat) + matched_systems = sorted(glob.glob(pat)) combined_system_name.extend(Path(s).stem for s in matched_systems) if not matched_systems: -- GitLab From 15fcce61e818b1fe33d878c81b9cf532d0c56d6f Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Mon, 25 Aug 2025 12:08:41 -0400 Subject: [PATCH 22/23] Fix more tests --- tests/systems/test_main_time_delta_sub_second_run.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/systems/test_main_time_delta_sub_second_run.py b/tests/systems/test_main_time_delta_sub_second_run.py index 7845670..459d295 100644 --- a/tests/systems/test_main_time_delta_sub_second_run.py +++ b/tests/systems/test_main_time_delta_sub_second_run.py @@ -3,7 +3,7 @@ import subprocess import gc import pytest from tests.util import PROJECT_ROOT -from raps.utils import convert_seconds_to_hhmmss, convert_to_time_unit, infer_time_unit +from raps.utils import convert_seconds_to_hhmmss, parse_td pytestmark = [ @@ -38,9 +38,7 @@ def test_main_time_delta_run(system, system_config, time_arg, tdelta_arg, random "-o", random_id ], capture_output=True, text=True, stdin=subprocess.DEVNULL) assert result.returncode == 0, f"Failed on {system}: {result.stderr}" - time = convert_to_time_unit(time_arg) - td = infer_time_unit(tdelta_arg) - #assert f"Time Simulated: {convert_seconds_to_hhmmss(int(time / td_ds))}" in result.stdout + time = parse_td(time_arg).seconds assert f"Time Simulated: {convert_seconds_to_hhmmss(time)}" in result.stdout subprocess.run( -- GitLab From 5a361fb95a25c99b698da6c516217cb7c1b5d481 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Mon, 25 Aug 2025 13:33:16 -0400 Subject: [PATCH 23/23] Fix test --- tests/systems/test_main_withdata_run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/systems/test_main_withdata_run.py b/tests/systems/test_main_withdata_run.py index f2f5f7c..928d149 100644 --- a/tests/systems/test_main_withdata_run.py +++ b/tests/systems/test_main_withdata_run.py @@ -28,7 +28,7 @@ def test_main_withdata_run(system, system_config, system_file, random_id): "python", "main.py", "--time", "1m", "--system", system, - "-f", *file_list, + "-f", ','.join(file_list), "-o", random_id ], capture_output=True, text=True, stdin=subprocess.DEVNULL) assert result.returncode == 0, f"Failed on {system}: {result.stderr}" -- GitLab