From e87de95fcd452e07a6bed4ef595b64980eb61e6e Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Thu, 4 Sep 2025 16:09:50 -0400 Subject: [PATCH 01/21] Add back random output dir --- raps/engine.py | 2 +- raps/run_sim.py | 8 +++++--- raps/sim_config.py | 15 ++++++++++++++- raps/workload.py | 5 +++-- 4 files changed, 23 insertions(+), 7 deletions(-) diff --git a/raps/engine.py b/raps/engine.py index 23a2605..90b84b0 100644 --- a/raps/engine.py +++ b/raps/engine.py @@ -161,7 +161,7 @@ class Engine: self.flops_manager = flops_manager self.debug = sim_config.debug self.continuous_workload = continuous_workload - self.output = sim_config.output + self.output = sim_config.get_output() self.replay = sim_config.replay self.downscale = sim_config.downscale # Factor to downscale the 1s timesteps (power of 10) self.simulate_network = sim_config.simulate_network diff --git a/raps/run_sim.py b/raps/run_sim.py index 5afd6f1..6c39a2f 100644 --- a/raps/run_sim.py +++ b/raps/run_sim.py @@ -44,6 +44,7 @@ shortcuts = { "numjobs": "n", "verbose": "v", "output": "o", + "random-output": "O", "uncertainties": "u", "plot": "p", "replay": "f", @@ -79,7 +80,7 @@ def run_sim(sim_config: SimConfig): engine, workload_data, time_delta = Engine.from_sim_config(sim_config) - out = sim_config.output + out = sim_config.get_output() if out: out.mkdir(parents=True) engine.telemetry.save_snapshot( @@ -253,10 +254,11 @@ def run_parts_sim(sim_config: SimConfig): multi_engine, workload_results, timestep_start, timestep_end, time_delta = \ MultiPartEngine.from_sim_config(sim_config) - if sim_config.output: + out = sim_config.get_output() + if out: for part, engine in multi_engine.engines.items(): engine.telemetry.save_snapshot( - dest=str(sim_config.output / part.split('/')[-1]), + dest=str(out / part.split('/')[-1]), result=workload_results[part], args=sim_config, ) diff --git a/raps/sim_config.py b/raps/sim_config.py index 5cdd09c..f18c730 100644 --- a/raps/sim_config.py +++ b/raps/sim_config.py @@ -1,10 
+1,11 @@ import argparse +from pathlib import Path from functools import cached_property from datetime import timedelta from typing import Literal from raps.schedulers.default import PolicyType, BackfillType from raps.utils import ( - parse_time_unit, convert_to_time_unit, infer_time_unit, ExpandedPath, parse_td, + parse_time_unit, convert_to_time_unit, infer_time_unit, ExpandedPath, parse_td, create_casename, ) from raps.system_config import SystemConfig, get_partition_configs from pydantic import BaseModel, model_validator @@ -62,8 +63,17 @@ class SimConfig(BaseModel): seed: int | None = None """ Set RNG seed for deterministic simulation """ + output: ExpandedPath | None = None """ Output power, cooling, and loss models for later analysis. Argument specifies name. """ + random_output: bool = False + """ If True, output to a randomly named directory. Mutually exclusive with output """ + + _random_output: Path | None = None + def get_output(self): + if self.random_output and not self._random_output: + self._random_output = Path(create_casename("out-")).resolve() + return self._random_output or self.output debug: bool = False """ Enable debug mode and disable rich layout """ @@ -291,6 +301,9 @@ class SimConfig(BaseModel): raise ValueError(f"policy {self.backfill} not implemented by {self.scheduler}. 
" f"Valid selections: {sorted(valid_backfilltypes)}") + if self.random_output and self.output: + raise ValueError("--random-output and --output are mutually exclusive") + return self @property diff --git a/raps/workload.py b/raps/workload.py index 6fb3c3b..57b536d 100644 --- a/raps/workload.py +++ b/raps/workload.py @@ -987,10 +987,11 @@ def run_workload(sim_config: SimConfig): dist_split=sim_config.multimodal, gantt_nodes=sim_config.gantt_nodes) - if sim_config.output: + out = sim_config.get_output() + if out: timestep_start = min([x.submit_time for x in jobs]) timestep_end = math.ceil(max([x.submit_time for x in jobs]) + max([x.expected_run_time for x in jobs])) - filename = create_file_indexed('wl', create=False, ending="npz").split(".npz")[0] + filename = create_file_indexed('wl', path = str(out), create=False, ending="npz").split(".npz")[0] # savez_compressed add npz itself, but create_file_indexed needs to check for .npz to find existing files np.savez_compressed(filename, jobs=jobs, timestep_start=timestep_start, timestep_end=timestep_end, args=args) print(filename + ".npz") # To std-out to show which npz was created. 
-- GitLab From 5b83b1867a326f65f7ae3576f8a2f8f347bb12ba Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Thu, 4 Sep 2025 16:32:38 -0400 Subject: [PATCH 02/21] Set pydantic config so CLI help includes docstrings --- raps/sim_config.py | 7 ++++--- raps/system_config.py | 19 ++++++++++--------- raps/telemetry.py | 6 +++--- raps/utils.py | 9 ++++++++- 4 files changed, 25 insertions(+), 16 deletions(-) diff --git a/raps/sim_config.py b/raps/sim_config.py index f18c730..300ac94 100644 --- a/raps/sim_config.py +++ b/raps/sim_config.py @@ -3,18 +3,19 @@ from pathlib import Path from functools import cached_property from datetime import timedelta from typing import Literal +import importlib from raps.schedulers.default import PolicyType, BackfillType from raps.utils import ( parse_time_unit, convert_to_time_unit, infer_time_unit, ExpandedPath, parse_td, create_casename, + RAPSBaseModel, ) from raps.system_config import SystemConfig, get_partition_configs -from pydantic import BaseModel, model_validator -import importlib +from pydantic import model_validator Distribution = Literal['uniform', 'weibull', 'normal'] -class SimConfig(BaseModel): +class SimConfig(RAPSBaseModel): system: str | None = None """ System config to use """ partitions: list[str] = [] diff --git a/raps/system_config.py b/raps/system_config.py index 726c086..adbf470 100644 --- a/raps/system_config.py +++ b/raps/system_config.py @@ -5,13 +5,14 @@ from typing import Any, Literal from pathlib import Path from functools import cached_property import yaml -from pydantic import BaseModel, computed_field, model_validator, field_validator +from pydantic import computed_field, model_validator, field_validator +from raps.utils import RAPSBaseModel from raps.raps_config import raps_config # Define Pydantic models for the config to handle parsing and validation -class SystemSystemConfig(BaseModel): +class SystemSystemConfig(RAPSBaseModel): num_cdus: int racks_per_cdu: int nodes_per_rack: int @@ -79,7 +80,7 @@ class 
SystemSystemConfig(BaseModel): return self.total_nodes - len(self.down_nodes) -class SystemPowerConfig(BaseModel): +class SystemPowerConfig(RAPSBaseModel): power_gpu_idle: float power_gpu_max: float power_cpu_idle: float @@ -100,7 +101,7 @@ class SystemPowerConfig(BaseModel): power_cost: float -class SystemUqConfig(BaseModel): +class SystemUqConfig(RAPSBaseModel): power_gpu_uncertainty: float power_cpu_uncertainty: float power_mem_uncertainty: float @@ -115,7 +116,7 @@ class SystemUqConfig(BaseModel): JobEndStates = Literal["COMPLETED", "FAILED", "CANCELLED", "TIMEOUT", "NODE_FAIL"] -class SystemSchedulerConfig(BaseModel): +class SystemSchedulerConfig(RAPSBaseModel): job_arrival_time: int mtbf: int trace_quanta: int @@ -127,7 +128,7 @@ class SystemSchedulerConfig(BaseModel): multitenant: bool = False -class SystemCoolingConfig(BaseModel): +class SystemCoolingConfig(RAPSBaseModel): cooling_efficiency: float wet_bulb_temp: float zip_code: str | None = None @@ -140,7 +141,7 @@ class SystemCoolingConfig(BaseModel): temperature_keys: list[str] -class SystemNetworkConfig(BaseModel): +class SystemNetworkConfig(RAPSBaseModel): topology: Literal["capacity", "fat-tree", "dragonfly", "torus3d"] network_max_bw: float latency: float | None = None @@ -163,7 +164,7 @@ class SystemNetworkConfig(BaseModel): node_coords_csv: str | None = None -class SystemConfig(BaseModel): +class SystemConfig(RAPSBaseModel): system_name: str """ Name of the system, defaults to the yaml file name """ @@ -202,7 +203,7 @@ class SystemConfig(BaseModel): return config_dict -class MultiPartitionSystemConfig(BaseModel): +class MultiPartitionSystemConfig(RAPSBaseModel): system_name: str partitions: list[SystemConfig] diff --git a/raps/telemetry.py b/raps/telemetry.py index 340b9ae..63ee158 100644 --- a/raps/telemetry.py +++ b/raps/telemetry.py @@ -15,7 +15,7 @@ from types import ModuleType import importlib import numpy as np import pandas as pd -from pydantic import BaseModel, model_validator +from 
pydantic import model_validator # from rich.progress import track from raps.sim_config import SimConfig @@ -28,12 +28,12 @@ from raps.plotting import ( plot_network_histogram ) from raps.utils import ( - next_arrival_byconfargs, pydantic_add_args, SubParsers, ExpandedPath, WorkloadData, + next_arrival_byconfargs, pydantic_add_args, SubParsers, ExpandedPath, WorkloadData, RAPSBaseModel, ) # TODO: should reuse this model in SimConfig -class TelemetryArgs(BaseModel): +class TelemetryArgs(RAPSBaseModel): jid: str = '*' """ Replay job id """ replay: list[ExpandedPath] | None = None diff --git a/raps/utils.py b/raps/utils.py index 323ac8a..6042a45 100644 --- a/raps/utils.py +++ b/raps/utils.py @@ -639,6 +639,13 @@ SmartTimedelta = A[timedelta, AfterValidator(parse_td)] T = TypeVar("T", bound=BaseModel) +class RAPSBaseModel(BaseModel): + """ Base Pydantic model with shared config """ + model_config = ConfigDict( + use_attribute_docstrings=True, + ) + + def pydantic_add_args( parser: argparse.ArgumentParser, model_cls: type[T], model_config: SettingsConfigDict | None = None, @@ -711,7 +718,7 @@ def yaml_dump(data): ) -class WorkloadData(BaseModel): +class WorkloadData(RAPSBaseModel): """ Represents a workload, a list of jobs with some metadata. Returned by dataloaders load_data() function, and by Workload.generate_jobs(). 
-- GitLab From f82ed3a5253a0f4c81f84e7ab4bafb352a673db1 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Fri, 5 Sep 2025 10:30:58 -0400 Subject: [PATCH 03/21] Better error reporting --- raps/utils.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/raps/utils.py b/raps/utils.py index 6042a45..dbb5ea3 100644 --- a/raps/utils.py +++ b/raps/utils.py @@ -21,7 +21,7 @@ import json import argparse from pathlib import Path from typing import Annotated as A, TypeVar, Callable, TypeAlias -from pydantic import BaseModel, TypeAdapter, AfterValidator, ConfigDict, AwareDatetime +from pydantic import BaseModel, TypeAdapter, AfterValidator, ConfigDict, AwareDatetime, ValidationError from pydantic_settings import BaseSettings, SettingsConfigDict, CliApp, CliSettingsSource import yaml from raps.job import Job @@ -662,6 +662,7 @@ def pydantic_add_args( model_config_dict = SettingsConfigDict({ "cli_implicit_flags": True, "cli_kebab_case": True, + "title": model_cls.__name__, **(model_config or {}), "cli_parse_args": False, # Don't automatically parse args }) @@ -678,13 +679,17 @@ def pydantic_add_args( cli_settings_source = CliSettingsSource(SettingsModel, root_parser=parser) def model_validate_args(args: argparse.Namespace, data: dict | None = None): - model = CliApp.run(SettingsModel, - cli_args=args, - cli_settings_source=cli_settings_source, - **(data or {}), - ) - # Recreate model so we don't return the SettingsModel subclass - return model_cls.model_validate(model.model_dump()) + try: + model = CliApp.run(SettingsModel, + cli_args=args, + cli_settings_source=cli_settings_source, + **(data or {}), + ) + # Recreate model so we don't return the SettingsModel subclass + return model_cls.model_validate(model.model_dump()) + except ValidationError as err: + print(err) + sys.exit(1) return model_validate_args -- GitLab From 623dae8f415a37a259ff33d06b5d536e097d5c44 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Fri, 5 Sep 2025 11:10:10 -0400 
Subject: [PATCH 04/21] Set system in legacy args --- raps/engine.py | 2 -- raps/sim_config.py | 1 + 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/raps/engine.py b/raps/engine.py index 90b84b0..d51905d 100644 --- a/raps/engine.py +++ b/raps/engine.py @@ -232,8 +232,6 @@ class Engine: sim_config_args = sim_config.get_legacy_args() sim_config_dict = sim_config.get_legacy_args_dict() sim_config_dict['config'] = system_config_dict - if partition: - sim_config_dict["system"] = sim_config.system_name if sim_config.seed: random.seed(sim_config.seed) diff --git a/raps/sim_config.py b/raps/sim_config.py index 300ac94..f1cb6ea 100644 --- a/raps/sim_config.py +++ b/raps/sim_config.py @@ -340,6 +340,7 @@ class SimConfig(RAPSBaseModel): contains the SimConfig object itself. """ args_dict = self.model_dump(mode="json") + args_dict['system'] = self.system_name # validate has been renamed to power_scope args_dict['validate'] = args_dict["power_scope"] == "node" args_dict['downscale'] = self.downscale -- GitLab From 25b406c821b2620fae2daad9bb293b8b2490feb0 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Fri, 5 Sep 2025 11:10:52 -0400 Subject: [PATCH 05/21] Add base to SystemConfig --- raps/system_config.py | 24 +++++++++++++++++++++--- raps/utils.py | 10 ++++++++++ 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/raps/system_config.py b/raps/system_config.py index adbf470..c85867b 100644 --- a/raps/system_config.py +++ b/raps/system_config.py @@ -6,7 +6,7 @@ from pathlib import Path from functools import cached_property import yaml from pydantic import computed_field, model_validator, field_validator -from raps.utils import RAPSBaseModel +from raps.utils import RAPSBaseModel, deep_merge from raps.raps_config import raps_config # Define Pydantic models for the config to handle parsing and validation @@ -168,6 +168,12 @@ class SystemConfig(RAPSBaseModel): system_name: str """ Name of the system, defaults to the yaml file name """ + base: str | None = None 
+ """ + Optional, name or path to another SystemConfig to "inherit" from. Lets you make small modifications + to an existing system without having to copy the whole config. + """ + system: SystemSystemConfig power: SystemPowerConfig scheduler: SystemSchedulerConfig @@ -175,6 +181,13 @@ class SystemConfig(RAPSBaseModel): cooling: SystemCoolingConfig | None = None network: SystemNetworkConfig | None = None + @model_validator(mode="before") + def _load_base(cls, data): + if data.get("base"): + base = get_system_config(data['base']) + data = deep_merge(base.model_dump(mode='json'), data) + return data + def get_legacy(self) -> dict[str, Any]: """ Return the system config as a flattened, uppercased dict. This is for backwards @@ -248,10 +261,12 @@ def get_system_config(system: str) -> SystemConfig: "system_name": system_name, # You can override system_name in the yaml as well **yaml.safe_load(config_path.read_text()), } + if str(config.get('base', '')).endswith(".yaml"): + config['base'] = config_path.parent / str(config['base']) # path relative to yaml return SystemConfig.model_validate(config) -def get_partition_configs(partitions: list[str]) -> MultiPartitionSystemConfig: +def get_partition_configs(partitions: list[str|SystemConfig]) -> MultiPartitionSystemConfig: """ Resolves multiple partition config files. Can pass globs, or directories to include all yaml files under the directory. 
@@ -262,7 +277,10 @@ def get_partition_configs(partitions: list[str]) -> MultiPartitionSystemConfig: parsed_configs: list[SystemConfig] = [] for pat in partitions: - if pat in multi_partition_systems: + if isinstance(pat, SystemConfig): + parsed_configs.append(pat) + combined_system_name.append(pat.system_name) + elif pat in multi_partition_systems: matched_systems = fnmatch.filter(systems, f"{pat}/*") combined_system_name.append(pat) elif fnmatch.filter(systems, pat): diff --git a/raps/utils.py b/raps/utils.py index dbb5ea3..ef2f7ef 100644 --- a/raps/utils.py +++ b/raps/utils.py @@ -27,6 +27,16 @@ import yaml from raps.job import Job +def deep_merge(a: dict, b: dict): + a = {**a} + for key in b.keys(): + if key in a and isinstance(a[key], dict) and isinstance(b[key], dict): + a[key] = deep_merge(a[key], b[key]) + else: + a[key] = b[key] + return a + + def sum_values(values): return sum(x[1] for x in values) if values else 0 -- GitLab From 4d075aa43c171af7ca3d0dc8476b029ded854617 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Fri, 5 Sep 2025 12:46:50 -0400 Subject: [PATCH 06/21] Add ability to override system settings --- raps/sim_config.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/raps/sim_config.py b/raps/sim_config.py index f1cb6ea..cfdde03 100644 --- a/raps/sim_config.py +++ b/raps/sim_config.py @@ -16,9 +16,12 @@ Distribution = Literal['uniform', 'weibull', 'normal'] class SimConfig(RAPSBaseModel): - system: str | None = None - """ System config to use """ - partitions: list[str] = [] + system: SystemConfig | str | None = None + """ + System config to use. Either the name of one of the redefined systems, or specify the full + system. You can modify a system with system.base, e.g. `--system.base frontier --system.cooling.fmu-path my.fmu` + """ + partitions: list[SystemConfig|str] = [] """ List of multiple system configurations for a multi-partition run. 
Can contain wildcards """ cooling: bool = False -- GitLab From be3cfb520ee694b41b26294d56d6e7c6c9957b46 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Fri, 5 Sep 2025 13:09:28 -0400 Subject: [PATCH 07/21] Guard _load_base against non-dict input --- raps/system_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/raps/system_config.py b/raps/system_config.py index c85867b..b118eef 100644 --- a/raps/system_config.py +++ b/raps/system_config.py @@ -183,7 +183,7 @@ class SystemConfig(RAPSBaseModel): @model_validator(mode="before") def _load_base(cls, data): - if data.get("base"): + if isinstance(data, dict) and data.get("base"): base = get_system_config(data['base']) data = deep_merge(base.model_dump(mode='json'), data) return data -- GitLab From 9f601385a7d2315aba4b9d83bcd4758bede7e3d9 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Fri, 5 Sep 2025 13:50:02 -0400 Subject: [PATCH 08/21] Don't use computed_field in SystemConfig computed_field is always serialized, which causes some complications when saving SimConfigs and SystemConfigs. 
Instead add them to the legacy dict explicitly --- raps/system_config.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/raps/system_config.py b/raps/system_config.py index b118eef..679b3fd 100644 --- a/raps/system_config.py +++ b/raps/system_config.py @@ -5,7 +5,7 @@ from typing import Any, Literal from pathlib import Path from functools import cached_property import yaml -from pydantic import computed_field, model_validator, field_validator +from pydantic import model_validator, field_validator from raps.utils import RAPSBaseModel, deep_merge from raps.raps_config import raps_config @@ -42,27 +42,22 @@ class SystemSystemConfig(RAPSBaseModel): self.down_nodes = sorted(set(self.down_nodes)) return self - @computed_field @cached_property def num_racks(self) -> int: return self.num_cdus * self.racks_per_cdu - len(self.missing_racks) - @computed_field @cached_property def sc_shape(self) -> list[int]: return [self.num_cdus, self.racks_per_cdu, self.nodes_per_rack] - @computed_field @cached_property def total_nodes(self) -> int: return self.num_cdus * self.racks_per_cdu * self.nodes_per_rack - @computed_field @cached_property def blades_per_chassis(self) -> int: return int(self.nodes_per_rack / self.chassis_per_rack / self.nodes_per_blade) - @computed_field @cached_property def power_df_header(self) -> list[str]: power_df_header = ["CDU"] @@ -74,7 +69,6 @@ class SystemSystemConfig(RAPSBaseModel): power_df_header.append("Loss") return power_df_header - @computed_field @cached_property def available_nodes(self) -> int: return self.total_nodes - len(self.down_nodes) @@ -195,6 +189,8 @@ class SystemConfig(RAPSBaseModel): gradually. The dict also as a "system_config" key that contains the SystemConfig object itself. 
""" + dump = self.model_dump(mode="json", exclude_none=True) + renames = { # fields that need to be renamed to something other than just .upper() "system_name": "system_name", "w_htwps_key": "W_HTWPs_KEY", @@ -202,7 +198,6 @@ class SystemConfig(RAPSBaseModel): "w_cts_key": "W_CTs_KEY", "multitenant": "multitenant", } - dump = self.model_dump(mode="json", exclude_none=True) config_dict: dict[str, Any] = {} for k, v in dump.items(): # flatten @@ -210,6 +205,13 @@ class SystemConfig(RAPSBaseModel): config_dict.update(v) else: config_dict[k] = v + config_dict["num_racks"] = self.system.num_racks + config_dict["sc_shape"] = self.system.sc_shape + config_dict["total_nodes"] = self.system.total_nodes + config_dict["blades_per_chassis"] = self.system.blades_per_chassis + config_dict["power_df_header"] = self.system.power_df_header + config_dict["available_nodes"] = self.system.available_nodes + # rename keys config_dict = {renames.get(k, k.upper()): v for k, v in config_dict.items()} config_dict['system_config'] = self -- GitLab From 8745bf7be63786ca647ca28346d1650a072b69f2 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Fri, 5 Sep 2025 16:42:36 -0400 Subject: [PATCH 09/21] Only output fields not in base --- raps/system_config.py | 14 ++++++++++++++ raps/utils.py | 17 +++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/raps/system_config.py b/raps/system_config.py index 679b3fd..928385f 100644 --- a/raps/system_config.py +++ b/raps/system_config.py @@ -7,6 +7,11 @@ from functools import cached_property import yaml from pydantic import model_validator, field_validator from raps.utils import RAPSBaseModel, deep_merge +from pydantic import ( + model_validator, field_validator, model_serializer, SerializationInfo, + SerializerFunctionWrapHandler, +) +from raps.utils import RAPSBaseModel, deep_merge, deep_subtract_dicts from raps.raps_config import raps_config # Define Pydantic models for the config to handle parsing and validation @@ -182,6 +187,15 @@ class 
SystemConfig(RAPSBaseModel): data = deep_merge(base.model_dump(mode='json'), data) return data + @model_serializer(mode='wrap') + def model_serializer(self, handler: SerializerFunctionWrapHandler, info: SerializationInfo): + # don't include the base system data in the output + if self.base and (info.exclude_defaults or info.exclude_unset): + base = get_system_config(self.base) + return deep_subtract_dicts(handler(self), handler(base)) + else: + return handler(self) + def get_legacy(self) -> dict[str, Any]: """ Return the system config as a flattened, uppercased dict. This is for backwards diff --git a/raps/utils.py b/raps/utils.py index ef2f7ef..afc76cf 100644 --- a/raps/utils.py +++ b/raps/utils.py @@ -37,6 +37,23 @@ def deep_merge(a: dict, b: dict): return a +def deep_subtract_dicts(a: dict, b: dict): + """ + Remove all fields from a that are already in b, such that + deep_merge(deep_subtract_dicts(a, b), b) == a + a should contain a superset of b's keys. + """ + a = {**a} + for key in b.keys(): + if key in a: + if a[key] == b[key]: + a.pop(key) + elif isinstance(a[key], dict) and isinstance(b[key], dict): + a[key] = deep_subtract_dicts(a[key], b[key]) + # otherwise keep key in a as is + return a + + def sum_values(values): return sum(x[1] for x in values) if values else 0 -- GitLab From d79b8a998eb8b8821964fbeccf1eb525c2c41f5e Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Fri, 5 Sep 2025 16:46:18 -0400 Subject: [PATCH 10/21] Formatting --- raps/dataloaders/mit_supercloud/loader.py | 3 +-- raps/sim_config.py | 3 ++- raps/system_config.py | 8 +++----- raps/utils.py | 8 ++++---- 4 files changed, 10 insertions(+), 12 deletions(-) diff --git a/raps/dataloaders/mit_supercloud/loader.py b/raps/dataloaders/mit_supercloud/loader.py index 2c8dbc1..ab68eb7 100644 --- a/raps/dataloaders/mit_supercloud/loader.py +++ b/raps/dataloaders/mit_supercloud/loader.py @@ -119,7 +119,7 @@ from collections import Counter from datetime import datetime, timezone from raps.job import 
job_dict, Job -from raps.utils import summarize_ranges, next_arrival, WorkloadData +from raps.utils import summarize_ranges, WorkloadData from .utils import proc_cpu_series, proc_gpu_series, to_epoch from .utils import DEFAULT_START, DEFAULT_END @@ -211,7 +211,6 @@ def load_data(local_dataset_path, **kwargs): """ debug = kwargs.get("debug") config = kwargs.get("config") - arrival = kwargs.get("arrival") NL_PATH = os.path.dirname(__file__) skip_counts = Counter() diff --git a/raps/sim_config.py b/raps/sim_config.py index cfdde03..7be5258 100644 --- a/raps/sim_config.py +++ b/raps/sim_config.py @@ -21,7 +21,7 @@ class SimConfig(RAPSBaseModel): System config to use. Either the name of one of the redefined systems, or specify the full system. You can modify a system with system.base, e.g. `--system.base frontier --system.cooling.fmu-path my.fmu` """ - partitions: list[SystemConfig|str] = [] + partitions: list[SystemConfig | str] = [] """ List of multiple system configurations for a multi-partition run. Can contain wildcards """ cooling: bool = False @@ -74,6 +74,7 @@ class SimConfig(RAPSBaseModel): """ If True, output to a randomly named directory. 
Mutually exclusive with output """ _random_output: Path | None = None + def get_output(self): if self.random_output and not self._random_output: self._random_output = Path(create_casename("out-")).resolve() diff --git a/raps/system_config.py b/raps/system_config.py index 928385f..663e789 100644 --- a/raps/system_config.py +++ b/raps/system_config.py @@ -5,8 +5,6 @@ from typing import Any, Literal from pathlib import Path from functools import cached_property import yaml -from pydantic import model_validator, field_validator -from raps.utils import RAPSBaseModel, deep_merge from pydantic import ( model_validator, field_validator, model_serializer, SerializationInfo, SerializerFunctionWrapHandler, @@ -167,7 +165,7 @@ class SystemConfig(RAPSBaseModel): system_name: str """ Name of the system, defaults to the yaml file name """ - base: str | None = None + base: str | None = None """ Optional, name or path to another SystemConfig to "inherit" from. Lets you make small modifications to an existing system without having to copy the whole config. @@ -278,11 +276,11 @@ def get_system_config(system: str) -> SystemConfig: **yaml.safe_load(config_path.read_text()), } if str(config.get('base', '')).endswith(".yaml"): - config['base'] = config_path.parent / str(config['base']) # path relative to yaml + config['base'] = config_path.parent / str(config['base']) # path relative to yaml return SystemConfig.model_validate(config) -def get_partition_configs(partitions: list[str|SystemConfig]) -> MultiPartitionSystemConfig: +def get_partition_configs(partitions: list[str | SystemConfig]) -> MultiPartitionSystemConfig: """ Resolves multiple partition config files. Can pass globs, or directories to include all yaml files under the directory. 
diff --git a/raps/utils.py b/raps/utils.py index afc76cf..ab02a2a 100644 --- a/raps/utils.py +++ b/raps/utils.py @@ -708,10 +708,10 @@ def pydantic_add_args( def model_validate_args(args: argparse.Namespace, data: dict | None = None): try: model = CliApp.run(SettingsModel, - cli_args=args, - cli_settings_source=cli_settings_source, - **(data or {}), - ) + cli_args=args, + cli_settings_source=cli_settings_source, + **(data or {}), + ) # Recreate model so we don't return the SettingsModel subclass return model_cls.model_validate(model.model_dump()) except ValidationError as err: -- GitLab From e45d9666d53c6e2b41314e5318e185648438ef90 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Fri, 5 Sep 2025 16:51:24 -0400 Subject: [PATCH 11/21] Output files by default --- raps/engine.py | 1 - raps/run_sim.py | 1 - raps/sim_config.py | 26 +++++++++++++++----------- 3 files changed, 15 insertions(+), 13 deletions(-) diff --git a/raps/engine.py b/raps/engine.py index d51905d..3c83c84 100644 --- a/raps/engine.py +++ b/raps/engine.py @@ -161,7 +161,6 @@ class Engine: self.flops_manager = flops_manager self.debug = sim_config.debug self.continuous_workload = continuous_workload - self.output = sim_config.get_output() self.replay = sim_config.replay self.downscale = sim_config.downscale # Factor to downscale the 1s timesteps (power of 10) self.simulate_network = sim_config.simulate_network diff --git a/raps/run_sim.py b/raps/run_sim.py index 6c39a2f..9846eed 100644 --- a/raps/run_sim.py +++ b/raps/run_sim.py @@ -44,7 +44,6 @@ shortcuts = { "numjobs": "n", "verbose": "v", "output": "o", - "random-output": "O", "uncertainties": "u", "plot": "p", "replay": "f", diff --git a/raps/sim_config.py b/raps/sim_config.py index 7be5258..2a31724 100644 --- a/raps/sim_config.py +++ b/raps/sim_config.py @@ -68,17 +68,24 @@ class SimConfig(RAPSBaseModel): seed: int | None = None """ Set RNG seed for deterministic simulation """ - output: ExpandedPath | None = None - """ Output power, cooling, and 
loss models for later analysis. Argument specifies name. """ - random_output: bool = False - """ If True, output to a randomly named directory. Mutually exclusive with output """ + output: ExpandedPath | Literal['none'] | None = None + """ + Where to output power, cooling, and loss models for later analysis. + If omitted it will output to raps-output- by default. + Set to "none" to disable file output entirely. + """ _random_output: Path | None = None - def get_output(self): - if self.random_output and not self._random_output: - self._random_output = Path(create_casename("out-")).resolve() - return self._random_output or self.output + def get_output(self) -> Path | None: + if self.output is None: # by default, output to a random directory + if not self._random_output: + self._random_output = Path(create_casename("raps-output-")).resolve() + return self._random_output + elif self.output == "none": # allow explicitly disabling output with "none" + return None + else: + return self.output # return user defined output path debug: bool = False """ Enable debug mode and disable rich layout """ @@ -306,9 +313,6 @@ class SimConfig(RAPSBaseModel): raise ValueError(f"policy {self.backfill} not implemented by {self.scheduler}. 
" f"Valid selections: {sorted(valid_backfilltypes)}") - if self.random_output and self.output: - raise ValueError("--random-output and --output are mutually exclusive") - return self @property -- GitLab From 7a567ae9813a7372bba72ee908bc47de5d4aa946 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Fri, 5 Sep 2025 17:32:44 -0400 Subject: [PATCH 12/21] Output sim config --- raps/run_sim.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/raps/run_sim.py b/raps/run_sim.py index 9846eed..93a2cf8 100644 --- a/raps/run_sim.py +++ b/raps/run_sim.py @@ -87,6 +87,9 @@ def run_sim(sim_config: SimConfig): result=workload_data, args=sim_config, ) + config_yaml = yaml_dump(sim_config.model_dump(mode="json", exclude_defaults=True)) + (out / 'sim_config.yaml').write_text(config_yaml) + jobs = workload_data.jobs timestep_start, timestep_end = workload_data.telemetry_start, workload_data.telemetry_end total_timesteps = timestep_end - timestep_start @@ -243,7 +246,6 @@ def run_parts_sim_add_parser(subparsers: SubParsers): def run_parts_sim(sim_config: SimConfig): - if len(sim_config.system_configs) == 1: warnings.warn( "run_parts_sim is usually for multiple partitions. 
Did you mean to run with one?", @@ -261,6 +263,9 @@ def run_parts_sim(sim_config: SimConfig): result=workload_results[part], args=sim_config, ) + config_yaml = yaml_dump(sim_config.model_dump(mode="json", exclude_defaults=True)) + (out / 'sim_config.yaml').write_text(config_yaml) + jobs = {p: w.jobs for p, w in workload_results.items()} ui_update_freq = sim_config.system_configs[0].scheduler.ui_update_freq -- GitLab From 5473162c6622cb0083aa2c6aff7c7e76ee2c5b58 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Fri, 5 Sep 2025 17:47:45 -0400 Subject: [PATCH 13/21] Pass SystemConfig through in get_system_config --- raps/system_config.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/raps/system_config.py b/raps/system_config.py index 663e789..e621d4b 100644 --- a/raps/system_config.py +++ b/raps/system_config.py @@ -255,13 +255,15 @@ def list_systems() -> list[str]: ]) -@functools.cache -def get_system_config(system: str) -> SystemConfig: +def get_system_config(system: str|SystemConfig) -> SystemConfig: """ Returns the system config as a Pydantic object. system can either be a path to a custom .yaml file, or the name of one of the pre-configured systems defined in RAPS_SYSTEM_CONFIG_DIR. 
""" + if isinstance(system, SystemConfig): # Just pass system through if its already parsed + return system + if system in list_systems(): config_path = raps_config.system_config_dir / f"{system}.yaml" system_name = system -- GitLab From 4a80ebcba640a88b2da21036a662d08263c69dbf Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Fri, 5 Sep 2025 17:56:46 -0400 Subject: [PATCH 14/21] Separate configs for single and multi run --- raps/engine.py | 6 ++-- raps/multi_part_engine.py | 4 +-- raps/run_sim.py | 16 +++++------ raps/sim_config.py | 59 ++++++++++++++++++++++++--------------- raps/workload.py | 6 ++-- 5 files changed, 52 insertions(+), 39 deletions(-) diff --git a/raps/engine.py b/raps/engine.py index 3c83c84..3d27848 100644 --- a/raps/engine.py +++ b/raps/engine.py @@ -38,7 +38,7 @@ from raps.workload import Workload, continuous_job_generation from raps.account import Accounts from raps.downtime import Downtime from raps.weather import Weather -from raps.sim_config import SimConfig +from raps.sim_config import SingleSimConfig from raps.system_config import SystemConfig from bisect import bisect_right @@ -135,7 +135,7 @@ class Engine: # Workload class to generate from for continuous generation continuous_workload: Workload | None = None, accounts=None, - sim_config: SimConfig, + sim_config: SingleSimConfig, system_config: SystemConfig, ): self.config = system_config.get_legacy() @@ -212,7 +212,7 @@ class Engine: self.network_model = None @staticmethod - def from_sim_config(sim_config: SimConfig, partition: str | None = None): + def from_sim_config(sim_config: SingleSimConfig, partition: str | None = None): if partition: system_config_by_name = {s.system_name: s for s in sim_config.system_configs} system_config = system_config_by_name.get(partition) diff --git a/raps/multi_part_engine.py b/raps/multi_part_engine.py index 944ced9..57e3e27 100644 --- a/raps/multi_part_engine.py +++ b/raps/multi_part_engine.py @@ -1,6 +1,6 @@ from collections.abc import Iterable from 
raps.engine import Engine, TickData -from raps.sim_config import SimConfig +from raps.sim_config import MultiPartSimConfig from raps.utils import WorkloadData @@ -11,7 +11,7 @@ class MultiPartEngine: self.jobs = jobs @staticmethod - def from_sim_config(sim_config: SimConfig): + def from_sim_config(sim_config: MultiPartSimConfig): if sim_config.replay: root_systems = set(s.system_name.split("/")[0] for s in sim_config.system_configs) # TODO should consider how to pass separate replay values for separate systems diff --git a/raps/run_sim.py b/raps/run_sim.py index 93a2cf8..06af0b4 100644 --- a/raps/run_sim.py +++ b/raps/run_sim.py @@ -22,7 +22,7 @@ from raps.stats import ( print_formatted_report ) -from raps.sim_config import SimConfig +from raps.sim_config import SingleSimConfig, MultiPartSimConfig def read_yaml(config_file: str): @@ -62,7 +62,7 @@ def run_sim_add_parser(subparsers: SubParsers): YAML sim config file, can be used to configure an experiment instead of using CLI flags. Pass "-" to read from stdin. """) - model_validate = pydantic_add_args(parser, SimConfig, model_config={ + model_validate = pydantic_add_args(parser, SingleSimConfig, model_config={ "cli_shortcuts": shortcuts, }) parser.set_defaults( @@ -70,9 +70,9 @@ def run_sim_add_parser(subparsers: SubParsers): ) -def run_sim(sim_config: SimConfig): +def run_sim(sim_config: SingleSimConfig): if sim_config.verbose or sim_config.debug: - print(f"SimConfig: {sim_config.model_dump_json(indent=4)}") + print(f"SingleSimConfig: {sim_config.model_dump_json(indent=4)}") if len(sim_config.system_configs) > 1: print("Use run-parts to run multi-partition simulations") sys.exit(1) @@ -237,7 +237,7 @@ def run_parts_sim_add_parser(subparsers: SubParsers): YAML sim config file, can be used to configure an experiment instead of using CLI flags. Pass "-" to read from stdin. 
""") - model_validate = pydantic_add_args(parser, SimConfig, model_config={ + model_validate = pydantic_add_args(parser, MultiPartSimConfig, model_config={ "cli_shortcuts": shortcuts, }) parser.set_defaults( @@ -245,7 +245,7 @@ def run_parts_sim_add_parser(subparsers: SubParsers): ) -def run_parts_sim(sim_config: SimConfig): +def run_parts_sim(sim_config: MultiPartSimConfig): if len(sim_config.system_configs) == 1: warnings.warn( "run_parts_sim is usually for multiple partitions. Did you mean to run with one?", @@ -323,7 +323,7 @@ def show_add_parser(subparsers: SubParsers): parser.add_argument("--show-defaults", default=False, help=""" If true, include defaults in the output YAML """) - model_validate = pydantic_add_args(parser, SimConfig, model_config={ + model_validate = pydantic_add_args(parser, SingleSimConfig, model_config={ "cli_shortcuts": shortcuts, }) @@ -334,6 +334,6 @@ def show_add_parser(subparsers: SubParsers): parser.set_defaults(impl=impl) -def show(sim_config: SimConfig, show_defaults=False): +def show(sim_config: SingleSimConfig, show_defaults=False): data = sim_config.model_dump(mode="json", exclude_defaults=not show_defaults) print(yaml_dump(data), end="") diff --git a/raps/sim_config.py b/raps/sim_config.py index 2a31724..6aec463 100644 --- a/raps/sim_config.py +++ b/raps/sim_config.py @@ -1,4 +1,5 @@ import argparse +import abc from pathlib import Path from functools import cached_property from datetime import timedelta @@ -9,21 +10,13 @@ from raps.utils import ( parse_time_unit, convert_to_time_unit, infer_time_unit, ExpandedPath, parse_td, create_casename, RAPSBaseModel, ) -from raps.system_config import SystemConfig, get_partition_configs +from raps.system_config import SystemConfig, get_partition_configs, get_system_config from pydantic import model_validator Distribution = Literal['uniform', 'weibull', 'normal'] -class SimConfig(RAPSBaseModel): - system: SystemConfig | str | None = None - """ - System config to use. 
Either the name of one of the redefined systems, or specify the full - system. You can modify a system with system.base, e.g. `--system.base frontier --system.cooling.fmu-path my.fmu` - """ - partitions: list[SystemConfig | str] = [] - """ List of multiple system configurations for a multi-partition run. Can contain wildcards """ - +class SimConfig(RAPSBaseModel, abc.ABC): cooling: bool = False """ Include the FMU cooling model """ simulate_network: bool = False @@ -264,12 +257,6 @@ class SimConfig(RAPSBaseModel): @model_validator(mode="after") def _validate_after(self): - # This is called after Pydantic has parsed everything into the model - if self.system and self.partitions: - raise ValueError("system and partitions are mutually exclusive") - elif not self.system and not self.partitions: - self.system = "frontier" - if not self.replay and not self.workload: self.workload = "random" @@ -316,24 +303,22 @@ class SimConfig(RAPSBaseModel): return self @property + @abc.abstractmethod def system_name(self) -> str: """ Name of the system. - Note, this is different than system, as system can be a file or None if partition is set. + Note, this is different than system, as system can be a file, or there can be multiple systems """ - return self._multi_partition_system_config.system_name + pass @property + @abc.abstractmethod def system_configs(self) -> list[SystemConfig]: """ Return the SystemConfigs for the selected systems. Will be a single element array unless multiple `partitions` are selected. 
""" - return self._multi_partition_system_config.partitions - - @cached_property - def _multi_partition_system_config(self): - return get_partition_configs(self.partitions if self.partitions else [self.system]) + pass def get_legacy_args(self): """ @@ -363,3 +348,31 @@ class SimConfig(RAPSBaseModel): args_dict['sim_config'] = self return args_dict + + +class SingleSimConfig(SimConfig, abc.ABC): + system: SystemConfig | str = "frontier" + + @property + def system_name(self) -> str: + return self.system_configs[0].system_name + + @cached_property + def system_configs(self) -> list[SystemConfig]: + return [get_system_config(self.system)] + + +class MultiPartSimConfig(SimConfig): + partitions: list[SystemConfig | str] + + @property + def system_name(self) -> str: + return self._multi_partition_system_config.system_name + + @property + def system_configs(self) -> list[SystemConfig]: + return self._multi_partition_system_config.partitions + + @cached_property + def _multi_partition_system_config(self): + return get_partition_configs(self.partitions) diff --git a/raps/workload.py b/raps/workload.py index 57b536d..78d8d2a 100644 --- a/raps/workload.py +++ b/raps/workload.py @@ -40,7 +40,7 @@ import matplotlib.pyplot as plt from raps.telemetry import Telemetry from raps.job import job_dict, Job from raps.utils import create_file_indexed, SubParsers, pydantic_add_args -from raps.sim_config import SimConfig +from raps.sim_config import SingleSimConfig JOB_NAMES = ["LAMMPS", "GROMACS", "VASP", "Quantum ESPRESSO", "NAMD", @@ -965,13 +965,13 @@ def run_workload_add_parser(subparsers: SubParsers): YAML sim config file, can be used to configure an experiment instead of using CLI flags. Pass "-" to read from stdin. 
""") - model_validate = pydantic_add_args(parser, SimConfig, model_config={ + model_validate = pydantic_add_args(parser, SingleSimConfig, model_config={ "cli_shortcuts": shortcuts, }) parser.set_defaults(impl=lambda args: run_workload(model_validate(args, {}))) -def run_workload(sim_config: SimConfig): +def run_workload(sim_config: SingleSimConfig): args = sim_config.get_legacy_args() args_dict = sim_config.get_legacy_args() config = sim_config.system_configs[0].get_legacy() -- GitLab From a9fade70cf5deb32e425e78214abb9bbb4b99b3f Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Fri, 5 Sep 2025 18:31:01 -0400 Subject: [PATCH 15/21] Fix tests --- raps/run_sim.py | 3 ++- tests/smoke.py | 2 +- tests/systems/test_engine.py | 4 ++-- tests/systems/test_main_basic_run.py | 2 +- tests/systems/test_main_time_delta_sub_second_run.py | 8 +------- tests/systems/test_multi_part_sim_basic_run.py | 3 ++- tests/systems/test_multi_part_sim_withdata_run.py | 3 ++- tests/test_main.py | 11 ++++++----- 8 files changed, 17 insertions(+), 19 deletions(-) diff --git a/raps/run_sim.py b/raps/run_sim.py index 06af0b4..ce89529 100644 --- a/raps/run_sim.py +++ b/raps/run_sim.py @@ -83,7 +83,7 @@ def run_sim(sim_config: SingleSimConfig): if out: out.mkdir(parents=True) engine.telemetry.save_snapshot( - dest=str(out), + dest=str(out / 'snapshot.npz'), result=workload_data, args=sim_config, ) @@ -257,6 +257,7 @@ def run_parts_sim(sim_config: MultiPartSimConfig): out = sim_config.get_output() if out: + out.mkdir(parents=True) for part, engine in multi_engine.engines.items(): engine.telemetry.save_snapshot( dest=str(out / part.split('/')[-1]), diff --git a/tests/smoke.py b/tests/smoke.py index 946f6db..a2ea598 100644 --- a/tests/smoke.py +++ b/tests/smoke.py @@ -32,7 +32,7 @@ def run_command(command): def build_command(system, file_paths, additional_args=""): """Build the command string for the given system and file paths.""" full_paths = " ".join([os.path.join(DATAPATH, path) for path in 
file_paths.split()]) - return f"python main.py run --system {system} -f {full_paths} -t {DEFAULT_TIME} {additional_args}".strip() + return f"python main.py run --system {system} -f {full_paths} -t {DEFAULT_TIME} -o none {additional_args}".strip() def execute_system_tests(systems): diff --git a/tests/systems/test_engine.py b/tests/systems/test_engine.py index e483b18..0404e89 100644 --- a/tests/systems/test_engine.py +++ b/tests/systems/test_engine.py @@ -1,6 +1,6 @@ import pytest from raps.engine import Engine -from raps.sim_config import SimConfig +from raps.sim_config import SingleSimConfig from raps.stats import ( get_engine_stats, # get_job_stats, @@ -18,7 +18,7 @@ def test_engine(system, system_config, sim_output): if not system_config.get("main", False): pytest.skip(f"{system} does not support basic main run.") - sim_config = SimConfig.model_validate({ + sim_config = SingleSimConfig.model_validate({ "system": system, "time": "2m", }) diff --git a/tests/systems/test_main_basic_run.py b/tests/systems/test_main_basic_run.py index 0cc9b69..37661f3 100644 --- a/tests/systems/test_main_basic_run.py +++ b/tests/systems/test_main_basic_run.py @@ -19,6 +19,6 @@ def test_main_basic_run(system, system_config, sim_output): "python", "main.py", "run", "--time", "1m", "--system", system, - "-o", sim_output + "-o", sim_output, ], capture_output=True, text=True, stdin=subprocess.DEVNULL) assert result.returncode == 0, f"Failed on {system}: {result.stderr}" diff --git a/tests/systems/test_main_time_delta_sub_second_run.py b/tests/systems/test_main_time_delta_sub_second_run.py index 55c0e3c..db80105 100644 --- a/tests/systems/test_main_time_delta_sub_second_run.py +++ b/tests/systems/test_main_time_delta_sub_second_run.py @@ -34,17 +34,11 @@ def test_main_time_delta_sub_second_run(system, system_config, time_arg, tdelta_ "--time-delta", tdelta_arg, "--system", system, "--noui", - "-o", sim_output + "-o", sim_output, ], capture_output=True, text=True, stdin=subprocess.DEVNULL) 
assert result.returncode == 0, f"Failed on {system}: {result.stderr}" time = parse_td(time_arg).seconds assert f"Time Simulated: {convert_seconds_to_hhmmss(time)}" in result.stdout - subprocess.run( - f"rm {sim_output}.npz && rm -fr simulation_results/{sim_output}", - shell=True, - check=True - ) - del result gc.collect() diff --git a/tests/systems/test_multi_part_sim_basic_run.py b/tests/systems/test_multi_part_sim_basic_run.py index 9351fd6..0edcc90 100644 --- a/tests/systems/test_multi_part_sim_basic_run.py +++ b/tests/systems/test_multi_part_sim_basic_run.py @@ -11,7 +11,7 @@ pytestmark = [ ] -def test_multi_part_sim_basic_run(system, system_config): +def test_multi_part_sim_basic_run(system, system_config, sim_output): if not system_config.get("multi-part-sim", False): pytest.skip(f"{system} does not support basic multi-part-sim run.") @@ -21,6 +21,7 @@ def test_multi_part_sim_basic_run(system, system_config): "python", "main.py", "run-parts", "--time", "1h", "-x", f"{system}/*", + "-o", sim_output, ], capture_output=True, text=True, stdin=subprocess.DEVNULL) assert result.returncode == 0, f"Failed on {system}: {result.stderr}" del result diff --git a/tests/systems/test_multi_part_sim_withdata_run.py b/tests/systems/test_multi_part_sim_withdata_run.py index f38cf8e..ab62f93 100644 --- a/tests/systems/test_multi_part_sim_withdata_run.py +++ b/tests/systems/test_multi_part_sim_withdata_run.py @@ -12,7 +12,7 @@ pytestmark = [ ] -def test_multi_part_sim_withdata_run(system, system_config, system_files): +def test_multi_part_sim_withdata_run(system, system_config, system_files, sim_output): if not system_config.get("multi-part-sim", False): pytest.skip(f"{system} does not support basic multi-part-sim run even without data.") if not system_config.get("withdata", False): @@ -24,5 +24,6 @@ def test_multi_part_sim_withdata_run(system, system_config, system_files): "--time", "1h", "-x", f"{system}/*", "-f", ','.join(system_files), + "-o", sim_output, ], 
capture_output=True, text=True, stdin=subprocess.DEVNULL) assert result.returncode == 0, f"Failed on {system}: {result.stderr}" diff --git a/tests/test_main.py b/tests/test_main.py index 5c08182..0c31dd0 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -10,11 +10,12 @@ PROJECT_ROOT = Path(__file__).resolve().parent.parent # adjust if needed @pytest.mark.order(1) -def test_main_withui(): +def test_main_withui(sim_output): os.chdir(PROJECT_ROOT) result = subprocess.run([ "python", "main.py", "run", "--time", "1h", + "-o", sim_output, ], capture_output=True, text=True ) @@ -22,12 +23,12 @@ def test_main_withui(): @pytest.mark.order(2) -def test_main_noui(): +def test_main_noui(sim_output): os.chdir(PROJECT_ROOT) result = subprocess.run([ "python", "main.py", "run", "--time", "1h", - "--noui" + "--noui", "-o", sim_output, ], capture_output=True, text=True ) @@ -36,10 +37,10 @@ def test_main_noui(): @pytest.mark.long @pytest.mark.order(3) -def test_main_long(): +def test_main_long(sim_output): os.chdir(PROJECT_ROOT) result = subprocess.run([ - "python", "main.py", "run", + "python", "main.py", "run", "-o", sim_output, ], capture_output=True, text=True ) -- GitLab From 44c044eab39db223bec84be8ce38f9f67b57d866 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Fri, 5 Sep 2025 20:35:29 -0400 Subject: [PATCH 16/21] Formatting --- raps/system_config.py | 4 ++-- raps/workload.py | 2 +- tests/systems/test_main_network_withdata_run.py | 2 +- tests/systems/test_main_withdata_run.py | 2 +- tests/systems/test_multi_part_sim_withdata_run.py | 3 +-- tests/systems/test_telemetry_withdata_run.py | 2 +- 6 files changed, 7 insertions(+), 8 deletions(-) diff --git a/raps/system_config.py b/raps/system_config.py index e621d4b..198f250 100644 --- a/raps/system_config.py +++ b/raps/system_config.py @@ -255,13 +255,13 @@ def list_systems() -> list[str]: ]) -def get_system_config(system: str|SystemConfig) -> SystemConfig: +def get_system_config(system: str | SystemConfig) -> 
SystemConfig: """ Returns the system config as a Pydantic object. system can either be a path to a custom .yaml file, or the name of one of the pre-configured systems defined in RAPS_SYSTEM_CONFIG_DIR. """ - if isinstance(system, SystemConfig): # Just pass system through if its already parsed + if isinstance(system, SystemConfig): # Just pass system through if its already parsed return system if system in list_systems(): diff --git a/raps/workload.py b/raps/workload.py index 78d8d2a..2a630b2 100644 --- a/raps/workload.py +++ b/raps/workload.py @@ -991,7 +991,7 @@ def run_workload(sim_config: SingleSimConfig): if out: timestep_start = min([x.submit_time for x in jobs]) timestep_end = math.ceil(max([x.submit_time for x in jobs]) + max([x.expected_run_time for x in jobs])) - filename = create_file_indexed('wl', path = str(out), create=False, ending="npz").split(".npz")[0] + filename = create_file_indexed('wl', path=str(out), create=False, ending="npz").split(".npz")[0] # savez_compressed add npz itself, but create_file_indexed needs to check for .npz to find existing files np.savez_compressed(filename, jobs=jobs, timestep_start=timestep_start, timestep_end=timestep_end, args=args) print(filename + ".npz") # To std-out to show which npz was created. 
diff --git a/tests/systems/test_main_network_withdata_run.py b/tests/systems/test_main_network_withdata_run.py index 58d14f9..1cbeae8 100644 --- a/tests/systems/test_main_network_withdata_run.py +++ b/tests/systems/test_main_network_withdata_run.py @@ -1,7 +1,7 @@ import os import subprocess import pytest -from tests.util import PROJECT_ROOT, DATA_PATH +from tests.util import PROJECT_ROOT pytestmark = [ diff --git a/tests/systems/test_main_withdata_run.py b/tests/systems/test_main_withdata_run.py index eb996a3..3539db9 100644 --- a/tests/systems/test_main_withdata_run.py +++ b/tests/systems/test_main_withdata_run.py @@ -1,7 +1,7 @@ import os import subprocess import pytest -from tests.util import PROJECT_ROOT, DATA_PATH +from tests.util import PROJECT_ROOT pytestmark = [ diff --git a/tests/systems/test_multi_part_sim_withdata_run.py b/tests/systems/test_multi_part_sim_withdata_run.py index ab62f93..538726c 100644 --- a/tests/systems/test_multi_part_sim_withdata_run.py +++ b/tests/systems/test_multi_part_sim_withdata_run.py @@ -1,8 +1,7 @@ import os import subprocess -import gc import pytest -from tests.util import PROJECT_ROOT, DATA_PATH +from tests.util import PROJECT_ROOT pytestmark = [ diff --git a/tests/systems/test_telemetry_withdata_run.py b/tests/systems/test_telemetry_withdata_run.py index 2729c7c..43a218b 100644 --- a/tests/systems/test_telemetry_withdata_run.py +++ b/tests/systems/test_telemetry_withdata_run.py @@ -1,7 +1,7 @@ import os import subprocess import pytest -from tests.util import PROJECT_ROOT, DATA_PATH +from tests.util import PROJECT_ROOT pytestmark = [ -- GitLab From c11a7088687b15cf359cd2c37664d9247484f087 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Fri, 5 Sep 2025 20:41:03 -0400 Subject: [PATCH 17/21] More test fixes --- tests/test_main.py | 32 +++++++++++++------------------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/tests/test_main.py b/tests/test_main.py index 0c31dd0..4b09fa0 100644 --- a/tests/test_main.py 
+++ b/tests/test_main.py @@ -10,38 +10,32 @@ PROJECT_ROOT = Path(__file__).resolve().parent.parent # adjust if needed @pytest.mark.order(1) -def test_main_withui(sim_output): +def test_main_withui(): os.chdir(PROJECT_ROOT) result = subprocess.run([ - "python", "main.py", "run", - "--time", "1h", - "-o", sim_output, - ], capture_output=True, - text=True - ) + "python", "main.py", "run", + "--time", "1h", + "-o", 'none', + ], capture_output=True, text=True) assert result.returncode == 0 @pytest.mark.order(2) -def test_main_noui(sim_output): +def test_main_noui(): os.chdir(PROJECT_ROOT) result = subprocess.run([ - "python", "main.py", "run", - "--time", "1h", - "--noui", "-o", sim_output, - ], capture_output=True, - text=True - ) + "python", "main.py", "run", + "--time", "1h", + "--noui", "-o", 'none', + ], capture_output=True, text=True) assert result.returncode == 0 @pytest.mark.long @pytest.mark.order(3) -def test_main_long(sim_output): +def test_main_long(): os.chdir(PROJECT_ROOT) result = subprocess.run([ - "python", "main.py", "run", "-o", sim_output, - ], capture_output=True, - text=True - ) + "python", "main.py", "run", "-o", 'none', + ], capture_output=True, text=True) assert result.returncode == 0 -- GitLab From 0ec59e9e29e0c4c78820b3c5199ebb7503b47987 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Sat, 6 Sep 2025 11:56:12 -0400 Subject: [PATCH 18/21] Fix to SystemConfig base path resolution --- raps/system_config.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/raps/system_config.py b/raps/system_config.py index 198f250..bd405be 100644 --- a/raps/system_config.py +++ b/raps/system_config.py @@ -277,8 +277,9 @@ def get_system_config(system: str | SystemConfig) -> SystemConfig: "system_name": system_name, # You can override system_name in the yaml as well **yaml.safe_load(config_path.read_text()), } - if str(config.get('base', '')).endswith(".yaml"): - config['base'] = config_path.parent / str(config['base']) # path relative to yaml 
+ base = str(config.get('base', '')) + if base.endswith(".yaml"): + config['base'] = str(config_path.parent / base) # path relative to yaml return SystemConfig.model_validate(config) -- GitLab From 7415b66910d5a14efc8e387938d74374cf5ad168 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Mon, 8 Sep 2025 13:44:24 -0400 Subject: [PATCH 19/21] Fix lassen dataloader --- raps/dataloaders/lassen.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/raps/dataloaders/lassen.py b/raps/dataloaders/lassen.py index fd0e364..bc57a7c 100644 --- a/raps/dataloaders/lassen.py +++ b/raps/dataloaders/lassen.py @@ -245,7 +245,8 @@ def load_data_from_df(allocation_df, node_df, step_df, **kwargs): return WorkloadData( jobs=job_list, telemetry_start=telemetry_start_time, telemetry_end=telemetry_end_time, - start_date=telemetry_start_timestamp, + # TODO: Confirm whether lassen timestamps are UTC or PDT + start_date=telemetry_start_timestamp.tz_localize("UTC"), ) -- GitLab From 2b3625ab6634fc5e7f0dfbf795aa8a90e78cd051 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Tue, 9 Sep 2025 10:49:22 -0400 Subject: [PATCH 20/21] Use base SimConfig in Engine --- raps/engine.py | 11 ++++------- raps/sim_config.py | 6 ++++++ 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/raps/engine.py b/raps/engine.py index 3d27848..c569bd0 100644 --- a/raps/engine.py +++ b/raps/engine.py @@ -38,7 +38,7 @@ from raps.workload import Workload, continuous_job_generation from raps.account import Accounts from raps.downtime import Downtime from raps.weather import Weather -from raps.sim_config import SingleSimConfig +from raps.sim_config import SimConfig from raps.system_config import SystemConfig from bisect import bisect_right @@ -135,7 +135,7 @@ class Engine: # Workload class to generate from for continuous generation continuous_workload: Workload | None = None, accounts=None, - sim_config: SingleSimConfig, + sim_config: SimConfig, system_config: SystemConfig, ): self.config = 
system_config.get_legacy() @@ -212,12 +212,9 @@ class Engine: self.network_model = None @staticmethod - def from_sim_config(sim_config: SingleSimConfig, partition: str | None = None): + def from_sim_config(sim_config: SimConfig, partition: str | None = None): if partition: - system_config_by_name = {s.system_name: s for s in sim_config.system_configs} - system_config = system_config_by_name.get(partition) - if not system_config: - raise ValueError(f"Partition {partition} isn't in SimConfig") + system_config = sim_config.get_system_config_by_name(partition) elif len(sim_config.system_configs) > 1: raise ValueError( "Engine can only run single-partition simulations. Use MultiPartEngine for " + diff --git a/raps/sim_config.py b/raps/sim_config.py index 6aec463..0cbe587 100644 --- a/raps/sim_config.py +++ b/raps/sim_config.py @@ -320,6 +320,12 @@ class SimConfig(RAPSBaseModel, abc.ABC): """ pass + def get_system_config_by_name(self, name: str) -> SystemConfig: + for s in self.system_configs: + if s.system_name == name: + return s + raise ValueError(f"Partition {name} isn't in SimConfig") + def get_legacy_args(self): """ Return as an argparse.Namespace object for backwards compatability -- GitLab From 49539856ad4b19de5e3270ee955d9d91eb5ba08c Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Tue, 9 Sep 2025 11:09:24 -0400 Subject: [PATCH 21/21] Improve help strings --- raps/sim_config.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/raps/sim_config.py b/raps/sim_config.py index 0cbe587..a73cd3e 100644 --- a/raps/sim_config.py +++ b/raps/sim_config.py @@ -358,6 +358,11 @@ class SimConfig(RAPSBaseModel, abc.ABC): class SingleSimConfig(SimConfig, abc.ABC): system: SystemConfig | str = "frontier" + """ + Name of the system to simulate, e.g. "frontier". Can also be a path to a yaml file containing + the SystemConfig. You can also make modifications to the SystemConfig on the CLI using + `--system.base`, e.g. 
`--system.base frontier --system.cooling.fmu-path path/to/my.fmu` + """ @property def system_name(self) -> str: @@ -370,6 +375,10 @@ class SingleSimConfig(SimConfig, abc.ABC): class MultiPartSimConfig(SimConfig): partitions: list[SystemConfig | str] + """ + List of multiple systems/partitions to run. Can be names of preconfigured systems, or paths + to custom SystemConfig yaml files. + """ @property def system_name(self) -> str: -- GitLab