diff --git a/raps/dataloaders/lassen.py b/raps/dataloaders/lassen.py index fd0e364314a4f11354c72bd19ffdb4f73118995d..bc57a7c0d8d959a3c2157444b253b586e9d56e1f 100644 --- a/raps/dataloaders/lassen.py +++ b/raps/dataloaders/lassen.py @@ -245,7 +245,8 @@ def load_data_from_df(allocation_df, node_df, step_df, **kwargs): return WorkloadData( jobs=job_list, telemetry_start=telemetry_start_time, telemetry_end=telemetry_end_time, - start_date=telemetry_start_timestamp, + # TODO: Confirm whether lassen timestamps are UTC or PDT + start_date=telemetry_start_timestamp.tz_localize("UTC"), ) diff --git a/raps/dataloaders/mit_supercloud/loader.py b/raps/dataloaders/mit_supercloud/loader.py index 2c8dbc18a7669f41abb3d2b63b89068407f80783..ab68eb7c941bd4677eddd95a98575948e39a1693 100644 --- a/raps/dataloaders/mit_supercloud/loader.py +++ b/raps/dataloaders/mit_supercloud/loader.py @@ -119,7 +119,7 @@ from collections import Counter from datetime import datetime, timezone from raps.job import job_dict, Job -from raps.utils import summarize_ranges, next_arrival, WorkloadData +from raps.utils import summarize_ranges, WorkloadData from .utils import proc_cpu_series, proc_gpu_series, to_epoch from .utils import DEFAULT_START, DEFAULT_END @@ -211,7 +211,6 @@ def load_data(local_dataset_path, **kwargs): """ debug = kwargs.get("debug") config = kwargs.get("config") - arrival = kwargs.get("arrival") NL_PATH = os.path.dirname(__file__) skip_counts = Counter() diff --git a/raps/engine.py b/raps/engine.py index 23a2605733ccaddae240ab2987ed525bd669c4ca..c569bd0d7903f7d974e783a8d7657762e40c653f 100644 --- a/raps/engine.py +++ b/raps/engine.py @@ -161,7 +161,6 @@ class Engine: self.flops_manager = flops_manager self.debug = sim_config.debug self.continuous_workload = continuous_workload - self.output = sim_config.output self.replay = sim_config.replay self.downscale = sim_config.downscale # Factor to downscale the 1s timesteps (power of 10) self.simulate_network = sim_config.simulate_network @@ -215,10 +214,7 @@ class Engine: @staticmethod def from_sim_config(sim_config: SimConfig, partition: str | None = None): if partition: - system_config_by_name = {s.system_name: s for s in sim_config.system_configs} - system_config = system_config_by_name.get(partition) - if not system_config: - raise ValueError(f"Partition {partition} isn't in SimConfig") + system_config = sim_config.get_system_config_by_name(partition) elif len(sim_config.system_configs) > 1: raise ValueError( "Engine can only run single-partition simulations. Use MultiPartEngine for " + @@ -232,8 +228,6 @@ class Engine: sim_config_args = sim_config.get_legacy_args() sim_config_dict = sim_config.get_legacy_args_dict() sim_config_dict['config'] = system_config_dict - if partition: - sim_config_dict["system"] = sim_config.system_name if sim_config.seed: random.seed(sim_config.seed) diff --git a/raps/multi_part_engine.py b/raps/multi_part_engine.py index 944ced9a31758c843e0d168a4c62a4144a0dbdac..57e3e27d28e44b9eb72dec37d327360cea275a85 100644 --- a/raps/multi_part_engine.py +++ b/raps/multi_part_engine.py @@ -1,6 +1,6 @@ from collections.abc import Iterable from raps.engine import Engine, TickData -from raps.sim_config import SimConfig +from raps.sim_config import MultiPartSimConfig from raps.utils import WorkloadData @@ -11,7 +11,7 @@ class MultiPartEngine: self.jobs = jobs @staticmethod - def from_sim_config(sim_config: SimConfig): + def from_sim_config(sim_config: MultiPartSimConfig): if sim_config.replay: root_systems = set(s.system_name.split("/")[0] for s in sim_config.system_configs) # TODO should consider how to pass separate replay values for separate systems diff --git a/raps/run_sim.py b/raps/run_sim.py index 5afd6f1823bcbe47f7343f7df041b728a7229644..ce89529a7fc925f240a4cc4c513927aa25ecfc8c 100644 --- a/raps/run_sim.py +++ b/raps/run_sim.py @@ -22,7 +22,7 @@ from raps.stats import ( print_formatted_report ) -from raps.sim_config import SimConfig +from raps.sim_config import SingleSimConfig, MultiPartSimConfig def read_yaml(config_file: str): @@ -62,7 +62,7 @@ def run_sim_add_parser(subparsers: SubParsers): YAML sim config file, can be used to configure an experiment instead of using CLI flags. Pass "-" to read from stdin. """) - model_validate = pydantic_add_args(parser, SimConfig, model_config={ + model_validate = pydantic_add_args(parser, SingleSimConfig, model_config={ "cli_shortcuts": shortcuts, }) parser.set_defaults( @@ -70,23 +70,26 @@ def run_sim_add_parser(subparsers: SubParsers): ) -def run_sim(sim_config: SimConfig): +def run_sim(sim_config: SingleSimConfig): if sim_config.verbose or sim_config.debug: - print(f"SimConfig: {sim_config.model_dump_json(indent=4)}") + print(f"SingleSimConfig: {sim_config.model_dump_json(indent=4)}") if len(sim_config.system_configs) > 1: print("Use run-parts to run multi-partition simulations") sys.exit(1) engine, workload_data, time_delta = Engine.from_sim_config(sim_config) - out = sim_config.output + out = sim_config.get_output() if out: out.mkdir(parents=True) engine.telemetry.save_snapshot( - dest=str(out), + dest=str(out / 'snapshot.npz'), result=workload_data, args=sim_config, ) + config_yaml = yaml_dump(sim_config.model_dump(mode="json", exclude_defaults=True)) + (out / 'sim_config.yaml').write_text(config_yaml) + jobs = workload_data.jobs timestep_start, timestep_end = workload_data.telemetry_start, workload_data.telemetry_end total_timesteps = timestep_end - timestep_start @@ -234,7 +237,7 @@ def run_parts_sim_add_parser(subparsers: SubParsers): YAML sim config file, can be used to configure an experiment instead of using CLI flags. Pass "-" to read from stdin. """) - model_validate = pydantic_add_args(parser, SimConfig, model_config={ + model_validate = pydantic_add_args(parser, MultiPartSimConfig, model_config={ "cli_shortcuts": shortcuts, }) parser.set_defaults( @@ -242,8 +245,7 @@ def run_parts_sim_add_parser(subparsers: SubParsers): ) -def run_parts_sim(sim_config: SimConfig): - +def run_parts_sim(sim_config: MultiPartSimConfig): if len(sim_config.system_configs) == 1: warnings.warn( "run_parts_sim is usually for multiple partitions. Did you mean to run with one?", @@ -253,13 +255,18 @@ def run_parts_sim(sim_config: SimConfig): multi_engine, workload_results, timestep_start, timestep_end, time_delta = \ MultiPartEngine.from_sim_config(sim_config) - if sim_config.output: + out = sim_config.get_output() + if out: + out.mkdir(parents=True) for part, engine in multi_engine.engines.items(): engine.telemetry.save_snapshot( - dest=str(sim_config.output / part.split('/')[-1]), + dest=str(out / part.split('/')[-1]), result=workload_results[part], args=sim_config, ) + config_yaml = yaml_dump(sim_config.model_dump(mode="json", exclude_defaults=True)) + (out / 'sim_config.yaml').write_text(config_yaml) + jobs = {p: w.jobs for p, w in workload_results.items()} ui_update_freq = sim_config.system_configs[0].scheduler.ui_update_freq @@ -317,7 +324,7 @@ def show_add_parser(subparsers: SubParsers): parser.add_argument("--show-defaults", default=False, help=""" If true, include defaults in the output YAML """) - model_validate = pydantic_add_args(parser, SimConfig, model_config={ + model_validate = pydantic_add_args(parser, SingleSimConfig, model_config={ "cli_shortcuts": shortcuts, }) @@ -328,6 +335,6 @@ def show_add_parser(subparsers: SubParsers): parser.set_defaults(impl=impl) -def show(sim_config: SimConfig, show_defaults=False): +def show(sim_config: SingleSimConfig, show_defaults=False): data = sim_config.model_dump(mode="json", exclude_defaults=not show_defaults) print(yaml_dump(data), end="") diff --git a/raps/sim_config.py b/raps/sim_config.py index 5cdd09c426350a4540dab4b15b335ad4ab2fedfc..a73cd3e29a577e8828d63157a02bf209aff335fd 100644 --- a/raps/sim_config.py +++ b/raps/sim_config.py @@ -1,24 +1,22 @@ import argparse +import abc +from pathlib import Path from functools import cached_property from datetime import timedelta from typing import Literal +import importlib from raps.schedulers.default import PolicyType, BackfillType from raps.utils import ( - parse_time_unit, convert_to_time_unit, infer_time_unit, ExpandedPath, parse_td, + parse_time_unit, convert_to_time_unit, infer_time_unit, ExpandedPath, parse_td, create_casename, + RAPSBaseModel, ) -from raps.system_config import SystemConfig, get_partition_configs -from pydantic import BaseModel, model_validator -import importlib +from raps.system_config import SystemConfig, get_partition_configs, get_system_config +from pydantic import model_validator Distribution = Literal['uniform', 'weibull', 'normal'] -class SimConfig(BaseModel): - system: str | None = None - """ System config to use """ - partitions: list[str] = [] - """ List of multiple system configurations for a multi-partition run. Can contain wildcards """ - +class SimConfig(RAPSBaseModel, abc.ABC): cooling: bool = False """ Include the FMU cooling model """ simulate_network: bool = False @@ -62,8 +60,25 @@ class SimConfig(BaseModel): seed: int | None = None """ Set RNG seed for deterministic simulation """ - output: ExpandedPath | None = None - """ Output power, cooling, and loss models for later analysis. Argument specifies name. """ + + output: ExpandedPath | Literal['none'] | None = None + """ + Where to output power, cooling, and loss models for later analysis. + If omitted it will output to raps-output- by default. + Set to "none" to disable file output entirely. + """ + + _random_output: Path | None = None + + def get_output(self) -> Path | None: + if self.output is None: # by default, output to a random directory + if not self._random_output: + self._random_output = Path(create_casename("raps-output-")).resolve() + return self._random_output + elif self.output == "none": # allow explicitly disabling output with "none" + return None + else: + return self.output # return user defined output path debug: bool = False """ Enable debug mode and disable rich layout """ @@ -242,12 +257,6 @@ class SimConfig(BaseModel): @model_validator(mode="after") def _validate_after(self): - # This is called after Pydantic has parsed everything into the model - if self.system and self.partitions: - raise ValueError("system and partitions are mutually exclusive") - elif not self.system and not self.partitions: - self.system = "frontier" - if not self.replay and not self.workload: self.workload = "random" @@ -294,24 +303,28 @@ class SimConfig(BaseModel): return self @property + @abc.abstractmethod def system_name(self) -> str: """ Name of the system. - Note, this is different than system, as system can be a file or None if partition is set. + Note, this is different than system, as system can be a file, or there can be multiple systems """ - return self._multi_partition_system_config.system_name + pass @property + @abc.abstractmethod def system_configs(self) -> list[SystemConfig]: """ Return the SystemConfigs for the selected systems. Will be a single element array unless multiple `partitions` are selected. """ - return self._multi_partition_system_config.partitions + pass - @cached_property - def _multi_partition_system_config(self): - return get_partition_configs(self.partitions if self.partitions else [self.system]) + def get_system_config_by_name(self, name: str) -> SystemConfig: + for s in self.system_configs: + if s.system_name == name: + return s + raise ValueError(f"Partition {name} isn't in SimConfig") def get_legacy_args(self): """ @@ -326,6 +339,7 @@ class SimConfig(BaseModel): contains the SimConfig object itself. """ args_dict = self.model_dump(mode="json") + args_dict['system'] = self.system_name # validate has been renamed to power_scope args_dict['validate'] = args_dict["power_scope"] == "node" args_dict['downscale'] = self.downscale @@ -340,3 +354,40 @@ class SimConfig(BaseModel): args_dict['sim_config'] = self return args_dict + + +class SingleSimConfig(SimConfig, abc.ABC): + system: SystemConfig | str = "frontier" + """ + Name of the system to simulate, e.g "frontier". Can also be a path to a yaml file containing + the SystemConfig. You can also make modificiations to the SystemConfig on the CLI using + `--system.base`, e.g. `--system.base frontier --system.cooling.fmu-path path/to/my.fmu` + """ + + @property + def system_name(self) -> str: + return self.system_configs[0].system_name + + @cached_property + def system_configs(self) -> list[SystemConfig]: + return [get_system_config(self.system)] + + +class MultiPartSimConfig(SimConfig): + partitions: list[SystemConfig | str] + """ + List of multiple systems/partitions to run. Can be names of preconfigured systems, or paths + to custom SystemConfig yaml files. + """ + + @property + def system_name(self) -> str: + return self._multi_partition_system_config.system_name + + @property + def system_configs(self) -> list[SystemConfig]: + return self._multi_partition_system_config.partitions + + @cached_property + def _multi_partition_system_config(self): + return get_partition_configs(self.partitions) diff --git a/raps/system_config.py b/raps/system_config.py index 726c08642fbdfacba8b7244b084f7a17353bbc44..bd405beab618a846adf7d1474ea985f003ee3424 100644 --- a/raps/system_config.py +++ b/raps/system_config.py @@ -5,13 +5,17 @@ from typing import Any, Literal from pathlib import Path from functools import cached_property import yaml -from pydantic import BaseModel, computed_field, model_validator, field_validator +from pydantic import ( + model_validator, field_validator, model_serializer, SerializationInfo, + SerializerFunctionWrapHandler, +) +from raps.utils import RAPSBaseModel, deep_merge, deep_subtract_dicts from raps.raps_config import raps_config # Define Pydantic models for the config to handle parsing and validation -class SystemSystemConfig(BaseModel): +class SystemSystemConfig(RAPSBaseModel): num_cdus: int racks_per_cdu: int nodes_per_rack: int @@ -41,27 +45,22 @@ class SystemSystemConfig(BaseModel): self.down_nodes = sorted(set(self.down_nodes)) return self - @computed_field @cached_property def num_racks(self) -> int: return self.num_cdus * self.racks_per_cdu - len(self.missing_racks) - @computed_field @cached_property def sc_shape(self) -> list[int]: return [self.num_cdus, self.racks_per_cdu, self.nodes_per_rack] - @computed_field @cached_property def total_nodes(self) -> int: return self.num_cdus * self.racks_per_cdu * self.nodes_per_rack - @computed_field @cached_property def blades_per_chassis(self) -> int: return int(self.nodes_per_rack / self.chassis_per_rack / self.nodes_per_blade) - @computed_field @cached_property def power_df_header(self) -> list[str]: power_df_header = ["CDU"] @@ -73,13 +72,12 @@ class SystemSystemConfig(BaseModel): power_df_header.append("Loss") return power_df_header - @computed_field @cached_property def available_nodes(self) -> int: return self.total_nodes - len(self.down_nodes) -class SystemPowerConfig(BaseModel): +class SystemPowerConfig(RAPSBaseModel): power_gpu_idle: float power_gpu_max: float power_cpu_idle: float @@ -100,7 +98,7 @@ class SystemPowerConfig(BaseModel): power_cost: float -class SystemUqConfig(BaseModel): +class SystemUqConfig(RAPSBaseModel): power_gpu_uncertainty: float power_cpu_uncertainty: float power_mem_uncertainty: float @@ -115,7 +113,7 @@ class SystemUqConfig(BaseModel): JobEndStates = Literal["COMPLETED", "FAILED", "CANCELLED", "TIMEOUT", "NODE_FAIL"] -class SystemSchedulerConfig(BaseModel): +class SystemSchedulerConfig(RAPSBaseModel): job_arrival_time: int mtbf: int trace_quanta: int @@ -127,7 +125,7 @@ class SystemSchedulerConfig(BaseModel): multitenant: bool = False -class SystemCoolingConfig(BaseModel): +class SystemCoolingConfig(RAPSBaseModel): cooling_efficiency: float wet_bulb_temp: float zip_code: str | None = None @@ -140,7 +138,7 @@ class SystemCoolingConfig(BaseModel): temperature_keys: list[str] -class SystemNetworkConfig(BaseModel): +class SystemNetworkConfig(RAPSBaseModel): topology: Literal["capacity", "fat-tree", "dragonfly", "torus3d"] network_max_bw: float latency: float | None = None @@ -163,10 +161,16 @@ class SystemNetworkConfig(BaseModel): node_coords_csv: str | None = None -class SystemConfig(BaseModel): +class SystemConfig(RAPSBaseModel): system_name: str """ Name of the system, defaults to the yaml file name """ + base: str | None = None + """ + Optional, name or path to another SystemConfig to "inherit" from. Lets you make small modifications + to an existing system without having to copy the whole config. + """ + system: SystemSystemConfig power: SystemPowerConfig scheduler: SystemSchedulerConfig @@ -174,6 +178,22 @@ class SystemConfig(BaseModel): cooling: SystemCoolingConfig | None = None network: SystemNetworkConfig | None = None + @model_validator(mode="before") + def _load_base(cls, data): + if isinstance(data, dict) and data.get("base"): + base = get_system_config(data['base']) + data = deep_merge(base.model_dump(mode='json'), data) + return data + + @model_serializer(mode='wrap') + def model_serializer(self, handler: SerializerFunctionWrapHandler, info: SerializationInfo): + # don't include the base system data in the output + if self.base and (info.exclude_defaults or info.exclude_unset): + base = get_system_config(self.base) + return deep_subtract_dicts(handler(self), handler(base)) + else: + return handler(self) + def get_legacy(self) -> dict[str, Any]: """ Return the system config as a flattened, uppercased dict. This is for backwards @@ -181,6 +201,8 @@ class SystemConfig(BaseModel): gradually. The dict also as a "system_config" key that contains the SystemConfig object itself. """ + dump = self.model_dump(mode="json", exclude_none=True) + renames = { # fields that need to be renamed to something other than just .upper() "system_name": "system_name", "w_htwps_key": "W_HTWPs_KEY", @@ -188,7 +210,6 @@ class SystemConfig(BaseModel): "w_cts_key": "W_CTs_KEY", "multitenant": "multitenant", } - dump = self.model_dump(mode="json", exclude_none=True) config_dict: dict[str, Any] = {} for k, v in dump.items(): # flatten @@ -196,13 +217,20 @@ class SystemConfig(BaseModel): config_dict.update(v) else: config_dict[k] = v + config_dict["num_racks"] = self.system.num_racks + config_dict["sc_shape"] = self.system.sc_shape + config_dict["total_nodes"] = self.system.total_nodes + config_dict["blades_per_chassis"] = self.system.blades_per_chassis + config_dict["power_df_header"] = self.system.power_df_header + config_dict["available_nodes"] = self.system.available_nodes + # rename keys config_dict = {renames.get(k, k.upper()): v for k, v in config_dict.items()} config_dict['system_config'] = self return config_dict -class MultiPartitionSystemConfig(BaseModel): +class MultiPartitionSystemConfig(RAPSBaseModel): system_name: str partitions: list[SystemConfig] @@ -227,13 +255,15 @@ def list_systems() -> list[str]: ]) -@functools.cache -def get_system_config(system: str) -> SystemConfig: +def get_system_config(system: str | SystemConfig) -> SystemConfig: """ Returns the system config as a Pydantic object. system can either be a path to a custom .yaml file, or the name of one of the pre-configured systems defined in RAPS_SYSTEM_CONFIG_DIR. """ + if isinstance(system, SystemConfig): # Just pass system through if its already parsed + return system + if system in list_systems(): config_path = raps_config.system_config_dir / f"{system}.yaml" system_name = system @@ -247,10 +277,13 @@ def get_system_config(system: str) -> SystemConfig: "system_name": system_name, # You can override system_name in the yaml as well **yaml.safe_load(config_path.read_text()), } + base = str(config.get('base', '')) + if base.endswith(".yaml"): + config['base'] = str(config_path.parent / base) # path relative to yaml return SystemConfig.model_validate(config) -def get_partition_configs(partitions: list[str]) -> MultiPartitionSystemConfig: +def get_partition_configs(partitions: list[str | SystemConfig]) -> MultiPartitionSystemConfig: """ Resolves multiple partition config files. Can pass globs, or directories to include all yaml files under the directory. @@ -261,7 +294,10 @@ def get_partition_configs(partitions: list[str]) -> MultiPartitionSystemConfig: parsed_configs: list[SystemConfig] = [] for pat in partitions: - if pat in multi_partition_systems: + if isinstance(pat, SystemConfig): + parsed_configs.append(pat) + combined_system_name.append(pat.system_name) + elif pat in multi_partition_systems: matched_systems = fnmatch.filter(systems, f"{pat}/*") combined_system_name.append(pat) elif fnmatch.filter(systems, pat): diff --git a/raps/telemetry.py b/raps/telemetry.py index 340b9ae5e0cdceddbd5288f4f262fc81b824bcb5..63ee158f0a69f658fe9af03c6ea1f31e3a83c34a 100644 --- a/raps/telemetry.py +++ b/raps/telemetry.py @@ -15,7 +15,7 @@ from types import ModuleType import importlib import numpy as np import pandas as pd -from pydantic import BaseModel, model_validator +from pydantic import model_validator # from rich.progress import track from raps.sim_config import SimConfig @@ -28,12 +28,12 @@ from raps.plotting import ( plot_network_histogram ) from raps.utils import ( - next_arrival_byconfargs, pydantic_add_args, SubParsers, ExpandedPath, WorkloadData, + next_arrival_byconfargs, pydantic_add_args, SubParsers, ExpandedPath, WorkloadData, RAPSBaseModel, ) # TODO: should reuse this model in SimConfig -class TelemetryArgs(BaseModel): +class TelemetryArgs(RAPSBaseModel): jid: str = '*' """ Replay job id """ replay: list[ExpandedPath] | None = None diff --git a/raps/utils.py b/raps/utils.py index 323ac8af423094ff421ff009c230fb0056997f25..ab02a2ae1691fee09cf16ffb15c1e6429d46a711 100644 --- a/raps/utils.py +++ b/raps/utils.py @@ -21,12 +21,39 @@ import json import argparse from pathlib import Path from typing import Annotated as A, TypeVar, Callable, TypeAlias -from pydantic import BaseModel, TypeAdapter, AfterValidator, ConfigDict, AwareDatetime +from pydantic import BaseModel, TypeAdapter, AfterValidator, ConfigDict, AwareDatetime, ValidationError from pydantic_settings import BaseSettings, SettingsConfigDict, CliApp, CliSettingsSource import yaml from raps.job import Job +def deep_merge(a: dict, b: dict): + a = {**a} + for key in b.keys(): + if key in a and isinstance(a[key], dict) and isinstance(b[key], dict): + a[key] = deep_merge(a[key], b[key]) + else: + a[key] = b[key] + return a + + +def deep_subtract_dicts(a: dict, b: dict): + """ + Remove all fields from a that are already in b, such that + deep_merge(deep_subtract_dicts(a, b), b) == a + a should contain a superset of b's keys. + """ + a = {**a} + for key in b.keys(): + if key in a: + if a[key] == b[key]: + a.pop(key) + elif isinstance(a[key], dict) and isinstance(b[key], dict): + a[key] = deep_subtract_dicts(a[key], b[key]) + # otherwise keep key in a as is + return a + + def sum_values(values): return sum(x[1] for x in values) if values else 0 @@ -639,6 +666,13 @@ SmartTimedelta = A[timedelta, AfterValidator(parse_td)] T = TypeVar("T", bound=BaseModel) +class RAPSBaseModel(BaseModel): + """ Base Pydantic model with shared config """ + model_config = ConfigDict( + use_attribute_docstrings=True, + ) + + def pydantic_add_args( parser: argparse.ArgumentParser, model_cls: type[T], model_config: SettingsConfigDict | None = None, @@ -655,6 +689,7 @@ def pydantic_add_args( model_config_dict = SettingsConfigDict({ "cli_implicit_flags": True, "cli_kebab_case": True, + "title": model_cls.__name__, **(model_config or {}), "cli_parse_args": False, # Don't automatically parse args }) @@ -671,13 +706,17 @@ def pydantic_add_args( cli_settings_source = CliSettingsSource(SettingsModel, root_parser=parser) def model_validate_args(args: argparse.Namespace, data: dict | None = None): - model = CliApp.run(SettingsModel, - cli_args=args, - cli_settings_source=cli_settings_source, - **(data or {}), - ) - # Recreate model so we don't return the SettingsModel subclass - return model_cls.model_validate(model.model_dump()) + try: + model = CliApp.run(SettingsModel, + cli_args=args, + cli_settings_source=cli_settings_source, + **(data or {}), + ) + # Recreate model so we don't return the SettingsModel subclass + return model_cls.model_validate(model.model_dump()) + except ValidationError as err: + print(err) + sys.exit(1) return model_validate_args @@ -711,7 +750,7 @@ def yaml_dump(data): ) -class WorkloadData(BaseModel): +class WorkloadData(RAPSBaseModel): """ Represents a workload, a list of jobs with some metadata. Returned by dataloaders load_data() function, and by Workload.generate_jobs(). diff --git a/raps/workload.py b/raps/workload.py index 6fb3c3b1896b5780fdbb4b677180385878a55053..2a630b28ce07520313d599151cc15490bf38ce1d 100644 --- a/raps/workload.py +++ b/raps/workload.py @@ -40,7 +40,7 @@ import matplotlib.pyplot as plt from raps.telemetry import Telemetry from raps.job import job_dict, Job from raps.utils import create_file_indexed, SubParsers, pydantic_add_args -from raps.sim_config import SimConfig +from raps.sim_config import SingleSimConfig JOB_NAMES = ["LAMMPS", "GROMACS", "VASP", "Quantum ESPRESSO", "NAMD", @@ -965,13 +965,13 @@ def run_workload_add_parser(subparsers: SubParsers): YAML sim config file, can be used to configure an experiment instead of using CLI flags. Pass "-" to read from stdin. """) - model_validate = pydantic_add_args(parser, SimConfig, model_config={ + model_validate = pydantic_add_args(parser, SingleSimConfig, model_config={ "cli_shortcuts": shortcuts, }) parser.set_defaults(impl=lambda args: run_workload(model_validate(args, {}))) -def run_workload(sim_config: SimConfig): +def run_workload(sim_config: SingleSimConfig): args = sim_config.get_legacy_args() args_dict = sim_config.get_legacy_args() config = sim_config.system_configs[0].get_legacy() @@ -987,10 +987,11 @@ def run_workload(sim_config: SimConfig): dist_split=sim_config.multimodal, gantt_nodes=sim_config.gantt_nodes) - if sim_config.output: + out = sim_config.get_output() + if out: timestep_start = min([x.submit_time for x in jobs]) timestep_end = math.ceil(max([x.submit_time for x in jobs]) + max([x.expected_run_time for x in jobs])) - filename = create_file_indexed('wl', create=False, ending="npz").split(".npz")[0] + filename = create_file_indexed('wl', path=str(out), create=False, ending="npz").split(".npz")[0] # savez_compressed add npz itself, but create_file_indexed needs to check for .npz to find existing files np.savez_compressed(filename, jobs=jobs, timestep_start=timestep_start, timestep_end=timestep_end, args=args) print(filename + ".npz") # To std-out to show which npz was created. diff --git a/tests/smoke.py b/tests/smoke.py index 946f6db94a7bfd8f0f382472e098ef624c817f48..a2ea598f39d69a812f7180cf3cdc41a0b66939f8 100644 --- a/tests/smoke.py +++ b/tests/smoke.py @@ -32,7 +32,7 @@ def run_command(command): def build_command(system, file_paths, additional_args=""): """Build the command string for the given system and file paths.""" full_paths = " ".join([os.path.join(DATAPATH, path) for path in file_paths.split()]) - return f"python main.py run --system {system} -f {full_paths} -t {DEFAULT_TIME} {additional_args}".strip() + return f"python main.py run --system {system} -f {full_paths} -t {DEFAULT_TIME} -o none {additional_args}".strip() def execute_system_tests(systems): diff --git a/tests/systems/test_engine.py b/tests/systems/test_engine.py index e483b18b15b7c7eecc84be180ad5cadbcd0aab37..0404e89c00c24fbffffa06447c9eb8d93e0adaa1 100644 --- a/tests/systems/test_engine.py +++ b/tests/systems/test_engine.py @@ -1,6 +1,6 @@ import pytest from raps.engine import Engine -from raps.sim_config import SimConfig +from raps.sim_config import SingleSimConfig from raps.stats import ( get_engine_stats, # get_job_stats, @@ -18,7 +18,7 @@ def test_engine(system, system_config, sim_output): if not system_config.get("main", False): pytest.skip(f"{system} does not support basic main run.") - sim_config = SimConfig.model_validate({ + sim_config = SingleSimConfig.model_validate({ "system": system, "time": "2m", }) diff --git a/tests/systems/test_main_basic_run.py b/tests/systems/test_main_basic_run.py index 0cc9b690ed20a6050c062fef63a8897cf15a9893..37661f30ca384e6d8a8d181d0793c5ae703b3478 100644 --- a/tests/systems/test_main_basic_run.py +++ b/tests/systems/test_main_basic_run.py @@ -19,6 +19,6 @@ def test_main_basic_run(system, system_config, sim_output): "python", "main.py", "run", "--time", "1m", "--system", system, - "-o", sim_output + "-o", sim_output, ], capture_output=True, text=True, stdin=subprocess.DEVNULL) assert result.returncode == 0, f"Failed on {system}: {result.stderr}" diff --git a/tests/systems/test_main_network_withdata_run.py b/tests/systems/test_main_network_withdata_run.py index 58d14f96c7ef575aabbad51141f54664ffe3aecc..1cbeae863542c3a48c32a77bdac6e6061fd7c5c9 100644 --- a/tests/systems/test_main_network_withdata_run.py +++ b/tests/systems/test_main_network_withdata_run.py @@ -1,7 +1,7 @@ import os import subprocess import pytest -from tests.util import PROJECT_ROOT, DATA_PATH +from tests.util import PROJECT_ROOT pytestmark = [ diff --git a/tests/systems/test_main_time_delta_sub_second_run.py b/tests/systems/test_main_time_delta_sub_second_run.py index 55c0e3ca2ad99551770b76406b1291deec480710..db80105777545f110d25b1bfbe71aabafef2f929 100644 --- a/tests/systems/test_main_time_delta_sub_second_run.py +++ b/tests/systems/test_main_time_delta_sub_second_run.py @@ -34,17 +34,11 @@ def test_main_time_delta_sub_second_run(system, system_config, time_arg, tdelta_ "--time-delta", tdelta_arg, "--system", system, "--noui", - "-o", sim_output + "-o", sim_output, ], capture_output=True, text=True, stdin=subprocess.DEVNULL) assert result.returncode == 0, f"Failed on {system}: {result.stderr}" time = parse_td(time_arg).seconds assert f"Time Simulated: {convert_seconds_to_hhmmss(time)}" in result.stdout - subprocess.run( - f"rm {sim_output}.npz && rm -fr simulation_results/{sim_output}", - shell=True, - check=True - ) - del result gc.collect() diff --git a/tests/systems/test_main_withdata_run.py b/tests/systems/test_main_withdata_run.py index eb996a346b6920eabc4db8934e3d69018a551b6c..3539db9acd77add3f651b79240bcf62c81c944ef 100644 --- a/tests/systems/test_main_withdata_run.py +++ b/tests/systems/test_main_withdata_run.py @@ -1,7 +1,7 @@ import os import subprocess import pytest -from tests.util import PROJECT_ROOT, DATA_PATH +from tests.util import PROJECT_ROOT pytestmark = [ diff --git a/tests/systems/test_multi_part_sim_basic_run.py b/tests/systems/test_multi_part_sim_basic_run.py index 9351fd6bc83008ca3a8dc9c977af9a0aadf15972..0edcc903b02330a93e37095a358d952cc8922344 100644 --- a/tests/systems/test_multi_part_sim_basic_run.py +++ b/tests/systems/test_multi_part_sim_basic_run.py @@ -11,7 +11,7 @@ pytestmark = [ ] -def test_multi_part_sim_basic_run(system, system_config): +def test_multi_part_sim_basic_run(system, system_config, sim_output): if not system_config.get("multi-part-sim", False): pytest.skip(f"{system} does not support basic multi-part-sim run.") @@ -21,6 +21,7 @@ def test_multi_part_sim_basic_run(system, system_config): "python", "main.py", "run-parts", "--time", "1h", "-x", f"{system}/*", + "-o", sim_output, ], capture_output=True, text=True, stdin=subprocess.DEVNULL) assert result.returncode == 0, f"Failed on {system}: {result.stderr}" del result diff --git a/tests/systems/test_multi_part_sim_withdata_run.py b/tests/systems/test_multi_part_sim_withdata_run.py index f38cf8ee35fd2d2188e381974eb179d21b146633..538726c4ece73c68179e7291a5b81b3063d25b16 100644 --- a/tests/systems/test_multi_part_sim_withdata_run.py +++ b/tests/systems/test_multi_part_sim_withdata_run.py @@ -1,8 +1,7 @@ import os import subprocess -import gc import pytest -from tests.util import PROJECT_ROOT, DATA_PATH +from tests.util import PROJECT_ROOT pytestmark = [ @@ -12,7 +11,7 @@ pytestmark = [ ] -def test_multi_part_sim_withdata_run(system, system_config, system_files): +def test_multi_part_sim_withdata_run(system, system_config, system_files, sim_output): if not system_config.get("multi-part-sim", False): pytest.skip(f"{system} does not support basic multi-part-sim run even without data.") if not system_config.get("withdata", False): @@ -24,5 +23,6 @@ def test_multi_part_sim_withdata_run(system, system_config, system_files): "--time", "1h", "-x", f"{system}/*", "-f", ','.join(system_files), + "-o", sim_output, ], capture_output=True, text=True, stdin=subprocess.DEVNULL) assert result.returncode == 0, f"Failed on {system}: {result.stderr}" diff --git a/tests/systems/test_telemetry_withdata_run.py b/tests/systems/test_telemetry_withdata_run.py index 2729c7c981be740460cff2bdc75bcfbff9caab76..43a218b045b226fba9d1ebfb469b196bcfe8d016 100644 --- a/tests/systems/test_telemetry_withdata_run.py +++ b/tests/systems/test_telemetry_withdata_run.py @@ -1,7 +1,7 @@ import os import subprocess import pytest -from tests.util import PROJECT_ROOT, DATA_PATH +from tests.util import PROJECT_ROOT pytestmark = [ diff --git a/tests/test_main.py b/tests/test_main.py index 5c08182e6300789d100a53306050b0ea8d8c5041..4b09fa0515abb09703afb74c2b0e0acd19c2ddbc 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -13,11 +13,10 @@ PROJECT_ROOT = Path(__file__).resolve().parent.parent # adjust if needed def test_main_withui(): os.chdir(PROJECT_ROOT) result = subprocess.run([ - "python", "main.py", "run", - "--time", "1h", - ], capture_output=True, - text=True - ) + "python", "main.py", "run", + "--time", "1h", + "-o", 'none', + ], capture_output=True, text=True) assert result.returncode == 0 @@ -25,12 +24,10 @@ def test_main_withui(): def test_main_noui(): os.chdir(PROJECT_ROOT) result = subprocess.run([ - "python", "main.py", "run", - "--time", "1h", - "--noui" - ], capture_output=True, - text=True - ) + "python", "main.py", "run", + "--time", "1h", + "--noui", "-o", 'none', + ], capture_output=True, text=True) assert result.returncode == 0 @@ -39,8 +36,6 @@ def test_main_noui(): def test_main_long(): os.chdir(PROJECT_ROOT) result = subprocess.run([ - "python", "main.py", "run", - ], capture_output=True, - text=True - ) + "python", "main.py", "run", "-o", 'none', + ], capture_output=True, text=True) assert result.returncode == 0