From dc8b46235c73e4bcfb9f03cb1470b0638e77658e Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Fri, 19 Sep 2025 09:44:10 -0400 Subject: [PATCH 01/77] README tweaks --- README.md | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 1a4c809..b6ded35 100644 --- a/README.md +++ b/README.md @@ -16,15 +16,6 @@ simulation you'll need to download FMU models into the `models` directory. You c `Simulator_olcf5_base.fmu` from https://code.ornl.gov/exadigit/fmu-models if you have access. (The FMU models aren't currently publicly available.) - -## Deploying -To deploy the server, run -```bash -./scripts/deploy.sh prod -``` - -This will build both the server and simulation docker images, and push them to Slate. - ## Running locally To run a local version of the server run ```bash @@ -38,6 +29,14 @@ If you want to run replay data locally, you'll need to download the datasets (se and then ingest them in Druid. After launching, you can access the Druid UI at http://localhost:8888 and submit druid ingests for the system you want. +## Deploying +To deploy the server, run +```bash +./scripts/deploy.sh prod +``` + +This will build both the server and simulation docker images, and push them to Slate. + ## API Docs You can view the API docs and the `openapi.json` with the API specification at https://exadigit.github.io/SimulationServer -- GitLab From 387512db7f2bec3d92e4bb95a4565237e0455cc6 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Fri, 19 Sep 2025 11:53:45 -0400 Subject: [PATCH 02/77] Update dependencies --- pyproject.toml | 39 ++++++++++++++------------------------- 1 file changed, 14 insertions(+), 25 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index f120dba..d99297e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,33 +14,22 @@ readme = "README.md" # license = {file = "LICENSE.txt"} dependencies = [ - "pydantic==2.9.2", - "pydantic-settings==2.5.2 ", - "loguru==0.7.2", - "SQLAlchemy==2.0.27", - "pydruid==0.6.6", - "PyYAML==6.0.1", - "kafka-python==2.0.2", - "python-snappy==0.6.1", - "jsonpath-ng==1.6.1", - "fastapi==0.115.2", - "gunicorn==23.0.0", - "uvicorn==0.32.0", - "sqlparse==0.5.1", - "kubernetes==29.0.0", - "matplotlib==3.7.2", - "numpy==1.23.5", - "rich==13.6.0", - "fmpy==0.3.19", - "pandas==2.0.3", - "scipy==1.10.1", - "pyarrow==15.0.1", - "uncertainties==3.2.1", - "ClusterShell==1.9.2", + "pydantic==2.11.9", + "pydantic-settings==2.10.1", + "loguru==0.7.3", + "SQLAlchemy==2.0.43", + "pydruid==0.6.9", + "kafka-python==2.2.15", + "python-snappy==0.7.3", + "jsonpath-ng==1.7.0", + "fastapi==0.116.2", + "uvicorn==0.35.0", + "sqlparse==0.5.3", + "kubernetes==33.1.0", "elasticsearch==7.13.4", "elasticsearch-dbapi==0.2.11", - "tqdm==4.66.5", - "requests==2.32.3", + "requests==2.32.5", + "raps@{root:uri}/simulation_server/simulation/raps", ] [project.optional-dependencies] -- GitLab From 95e1c08ad9ee5e91c12bd9d372bce1bab0d5fe99 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Fri, 19 Sep 2025 14:25:53 -0400 Subject: [PATCH 03/77] Update README --- README.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/README.md b/README.md index b6ded35..7bd92c5 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,13 @@ git submodule update --init --recursive to load the submodule. ## Downloading FMU models +The Frontier FMU models aren't currently publicly available. To run Frontier simulations with cooling enabled, use this +command to download them (if you have access to the fmu-models repo). 
+``` +cd simulation_server/simulation/raps +make fetch-fmu-models +``` + You can run the job and power simulation without downloading any FMU models. But to use the cooling simulation you'll need to download FMU models into the `models` directory. You can download `Simulator_olcf5_base.fmu` from https://code.ornl.gov/exadigit/fmu-models if you have access. (The -- GitLab From 35a00b51e1aa6b151425341242facd3265e11e08 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Fri, 19 Sep 2025 14:26:13 -0400 Subject: [PATCH 04/77] Remove models dir --- models/README.md | 2 -- 1 file changed, 2 deletions(-) delete mode 100644 models/README.md diff --git a/models/README.md b/models/README.md deleted file mode 100644 index c2e932c..0000000 --- a/models/README.md +++ /dev/null @@ -1,2 +0,0 @@ -# FMU Models -Place FMU models here. -- GitLab From 24f700c2d270288003e175dfdda38b37cd7fe609 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Fri, 19 Sep 2025 16:02:58 -0400 Subject: [PATCH 05/77] Move raps submodule --- .gitmodules | 4 ++-- README.md | 2 +- pyproject.toml | 2 +- raps | 1 + simulation_server/simulation/raps | 1 - 5 files changed, 5 insertions(+), 5 deletions(-) create mode 160000 raps delete mode 160000 simulation_server/simulation/raps diff --git a/.gitmodules b/.gitmodules index 1af7ae0..691b42d 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,4 +1,4 @@ -[submodule "simulation_server/simulation/raps"] - path = simulation_server/simulation/raps +[submodule "raps"] + path = raps url = https://github.com/ExaDigiT/RAPS.git branch = main diff --git a/README.md b/README.md index 7bd92c5..18f0d36 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ to load the submodule. The Frontier FMU models aren't currently publicly available. To run Frontier simulations with cooling enabled, use this command to download them (if you have access to the fmu-models repo). 
``` -cd simulation_server/simulation/raps +cd ./raps make fetch-fmu-models ``` diff --git a/pyproject.toml b/pyproject.toml index d99297e..95a409a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,7 +29,7 @@ dependencies = [ "elasticsearch==7.13.4", "elasticsearch-dbapi==0.2.11", "requests==2.32.5", - "raps@{root:uri}/simulation_server/simulation/raps", + "raps@{root:uri}/raps", ] [project.optional-dependencies] diff --git a/raps b/raps new file mode 160000 index 0000000..15025de --- /dev/null +++ b/raps @@ -0,0 +1 @@ +Subproject commit 15025ded86c20023544db626720828d27f7e5cb3 diff --git a/simulation_server/simulation/raps b/simulation_server/simulation/raps deleted file mode 160000 index 945f878..0000000 --- a/simulation_server/simulation/raps +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 945f878039b82207fca96182544536914809a302 -- GitLab From 90af00cf1f283ca613765ce059f8eaf6d2eca8ee Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Fri, 19 Sep 2025 16:46:23 -0400 Subject: [PATCH 06/77] Update dockerfile --- Dockerfile | 23 +++++++++++++++++++++++ Dockerfile.server | 25 ------------------------- Dockerfile.simulation | 24 ------------------------ 3 files changed, 23 insertions(+), 49 deletions(-) create mode 100644 Dockerfile delete mode 100644 Dockerfile.server delete mode 100644 Dockerfile.simulation diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..8138ebe --- /dev/null +++ b/Dockerfile @@ -0,0 +1,23 @@ +FROM python:3.12 + +RUN apt-get update \ + && apt-get install git libsnappy-dev \ + && rm -rf /var/lib/apt/lists/* + +RUN pip install --no-cache-dir --upgrade pip +RUN pip install --no-cache-dir hatch + +WORKDIR /app + +COPY pyproject.toml /app/ +COPY raps/ /app/raps/ +RUN hatch dep show requirements > /app/requirements.txt +RUN pip install --no-cache-dir -r /app/requirements.txt + +COPY druid_ingests /app/druid_ingests +COPY simulation_server /app/simulation_server +COPY README.md /app/ +RUN pip install --no-cache-dir -e . + +# CMD ["python", "-m", "simulation_server.simulation.main"] +# CMD ["python", "-m", "simulation_server.server.main"] diff --git a/Dockerfile.server b/Dockerfile.server deleted file mode 100644 index 55e5d2c..0000000 --- a/Dockerfile.server +++ /dev/null @@ -1,25 +0,0 @@ -FROM python:3.9 - -RUN apt-get update \ - && apt-get install -y libsnappy-dev \ - && rm -rf /var/lib/apt/lists/* - -RUN pip install --upgrade pip -RUN pip install hatch - -WORKDIR /app - -COPY pyproject.toml /app - -RUN hatch dep show requirements > /app/requirements.txt -# RUN hatch dep show requirements --feature=server >> /app/requirements.txt -RUN python3 -m pip install -r /app/requirements.txt -ENV RAPS_CONFIG=/app/simulation_server/simulation/raps/config - -COPY ["druid_ingests", "/app/druid_ingests/"] -COPY ["models", "/app/models"] -COPY ["simulation_server", "/app/simulation_server/"] -COPY ["README.md", "/app"] -RUN python3 -m pip install -e . 
- -CMD ["python3", "-m", "simulation_server.server.main"] diff --git a/Dockerfile.simulation b/Dockerfile.simulation deleted file mode 100644 index 0160fe6..0000000 --- a/Dockerfile.simulation +++ /dev/null @@ -1,24 +0,0 @@ -FROM ubuntu:22.04 - -RUN apt-get update \ - && apt-get install -y python3 python3-pip git libsnappy-dev \ - && rm -rf /var/lib/apt/lists/* - -RUN pip install --upgrade pip -RUN pip install hatch - -WORKDIR /app - -COPY pyproject.toml /app - -RUN hatch dep show requirements > /app/requirements.txt -# RUN hatch dep show requirements --feature=simulation >> /app/requirements.txt -RUN python3 -m pip install -r /app/requirements.txt -ENV RAPS_CONFIG=/app/simulation_server/simulation/raps/config - -COPY ["simulation_server", "/app/simulation_server/"] -COPY ["models", "/app/models"] -COPY ["README.md", "/app"] -RUN python3 -m pip install -e . - -CMD ["python3", "-m", "simulation_server.simulation.main"] -- GitLab From adf139b6d7bfdcd8036be27b8b0a431c26396108 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Mon, 22 Sep 2025 10:08:29 -0400 Subject: [PATCH 07/77] Better caching in Dockerfile --- Dockerfile | 15 ++++++++++++--- pyproject.toml | 8 -------- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/Dockerfile b/Dockerfile index 8138ebe..0e0d22b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -9,15 +9,24 @@ RUN pip install --no-cache-dir hatch WORKDIR /app -COPY pyproject.toml /app/ +# Install RAPS dependencies as first layer for caching +COPY raps/pyproject.toml /app/raps/ +RUN cd /app/raps && hatch dep show requirements > /app/raps/requirements.txt +RUN pip install --no-cache-dir -r /app/raps/requirements.txt + +# Install server dependencies (including raps) for caching COPY raps/ /app/raps/ +COPY pyproject.toml /app/ RUN hatch dep show requirements > /app/requirements.txt RUN pip install --no-cache-dir -r /app/requirements.txt -COPY druid_ingests /app/druid_ingests -COPY simulation_server /app/simulation_server +# Install simulation server COPY README.md /app/ +COPY druid_ingests/ /app/druid_ingests/ +COPY simulation_server/ /app/simulation_server/ RUN pip install --no-cache-dir -e . 
+# Re-install RAPS as editable (TODO: RAPS currently doesn't work in non-editable mode) +RUN pip install --no-cache-dir -e ./raps # CMD ["python", "-m", "simulation_server.simulation.main"] # CMD ["python", "-m", "simulation_server.server.main"] diff --git a/pyproject.toml b/pyproject.toml index 95a409a..edd5645 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,11 +36,3 @@ dependencies = [ dev = [ "pytest", ] - -# [project.scripts] -# spam-cli = "spam:main_cli" - -# [project.entry-points."spam.magical"] -# tomatoes = "spam:main_tomatoes" - - \ No newline at end of file -- GitLab From b1f6b85d7f1e88dc175ca8317e5a01bc75acf8bb Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Mon, 22 Sep 2025 10:49:29 -0400 Subject: [PATCH 08/77] Speed up Dockerfile --- Dockerfile | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/Dockerfile b/Dockerfile index 0e0d22b..b61b15a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,28 +5,27 @@ RUN apt-get update \ && rm -rf /var/lib/apt/lists/* RUN pip install --no-cache-dir --upgrade pip -RUN pip install --no-cache-dir hatch +RUN pip install --no-cache-dir uv +ENV UV_NO_CACHE=true WORKDIR /app # Install RAPS dependencies as first layer for caching COPY raps/pyproject.toml /app/raps/ -RUN cd /app/raps && hatch dep show requirements > /app/raps/requirements.txt -RUN pip install --no-cache-dir -r /app/raps/requirements.txt +RUN uv pip install --system -r /app/raps/pyproject.toml # Install server dependencies (including raps) for caching COPY raps/ /app/raps/ COPY pyproject.toml /app/ -RUN hatch dep show requirements > /app/requirements.txt -RUN pip install --no-cache-dir -r /app/requirements.txt +COPY README.md /app/ +RUN uv pip install --system -r /app/pyproject.toml # Install simulation server -COPY README.md /app/ COPY druid_ingests/ /app/druid_ingests/ COPY simulation_server/ /app/simulation_server/ -RUN pip install --no-cache-dir -e . +RUN uv pip install --system -e . 
# Re-install RAPS as editable (TODO: RAPS currently doesn't work in non-editable mode) -RUN pip install --no-cache-dir -e ./raps +RUN uv pip install --system -e ./raps # CMD ["python", "-m", "simulation_server.simulation.main"] # CMD ["python", "-m", "simulation_server.server.main"] -- GitLab From 40ffb3300a37b286df9ee9d6ef3d892bc9d00fac Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Tue, 23 Sep 2025 13:41:12 -0400 Subject: [PATCH 09/77] simulation loop mostly working --- simulation_server/simulation/simulation.py | 438 +++++++++------------ 1 file changed, 178 insertions(+), 260 deletions(-) diff --git a/simulation_server/simulation/simulation.py b/simulation_server/simulation/simulation.py index 60b00fd..9cbe956 100644 --- a/simulation_server/simulation/simulation.py +++ b/simulation_server/simulation/simulation.py @@ -1,37 +1,19 @@ from typing import NamedTuple from datetime import datetime, timedelta -from pathlib import Path -import random, math, functools -import numpy as np +import functools from loguru import logger -from .raps.raps.config import ConfigManager -from .raps.raps.cooling import ThermoFluidsModel -from .raps.raps.power import PowerManager -from .raps.raps.flops import FLOPSManager -from .raps.raps.scheduler import Scheduler -from .raps.raps.telemetry import Telemetry -from .raps.raps.workload import Workload -from ..models.sim import SimConfig, SimSystem +from raps import Engine +from ..models.sim import ServerSimConfig from ..models.output import ( JobStateEnum, SchedulerSimJob, SchedulerSimJobPowerHistory, SchedulerSimSystem, CoolingSimCDU, CoolingSimCEP, ) from ..util.misc import nest_dict from . import SimException -from .dataloaders import DATA_LOADERS +# from .dataloaders import DATA_LOADERS -PKG_PATH = Path(__file__).parent.parent.parent - - -def _offset_to_time(start, offset): - if offset is not None: - return start + timedelta(seconds=offset) - else: - return None - - -class SimOutput(NamedTuple): +class SimTickOutput(NamedTuple): timestamp: datetime scheduler_sim_system: list[SchedulerSimSystem] scheduler_sim_jobs: list[SchedulerSimJob] @@ -40,263 +22,199 @@ class SimOutput(NamedTuple): power_history: list[SchedulerSimJobPowerHistory] -def get_scheduler( - system: SimSystem, - down_nodes = [], cooling_enabled = False, replay = False, - schedule_policy = 'fcfs', -): - if cooling_enabled and system != "frontier": - raise SimException("Cooling sim only supported for frontier") - - raps_config = ConfigManager(system_name = system).get_config() - if "FMU_PATH" in raps_config: - raps_config['FMU_PATH'] = str(PKG_PATH / raps_config['FMU_PATH']) - - down_nodes = [*raps_config['DOWN_NODES'], *down_nodes] - - power_manager = PowerManager(**raps_config) - flops_manager = FLOPSManager(**raps_config) - if cooling_enabled: - cooling_model = ThermoFluidsModel(**raps_config) - cooling_model.initialize() - else: - cooling_model = None - - return Scheduler( - power_manager = power_manager, - flops_manager = flops_manager, - cooling_model = cooling_model, - debug = False, replay = replay, - schedule = schedule_policy, - config = raps_config, - ) - - def get_job_state_hash(job: SchedulerSimJob): """ Return string that can be used to check if any meaningful state changed """ return job.model_dump_json(exclude={"time_snapshot"}) -def run_simulation(sim_config: SimConfig): +def run_simulation(sim_config: ServerSimConfig): + # TODO: replay logic + engine = Engine(sim_config) + sample_scheduler_sim_system = timedelta(seconds = 1).total_seconds() # Sample CDU as fast as it is 
available sample_cooling = timedelta(seconds = 1).total_seconds() + def _offset_to_time(offset): + if offset is not None: + return engine.start + timedelta(seconds=offset - engine.timestep_start) + else: + return None + + # Memoized function to convert raps indexes into node names. + # Memo increases performance since it gets called on snapshots of the same job multiple times. + @functools.lru_cache(maxsize = 65_536) + def _parse_nodes(node_indexes: tuple[int]): + return [engine.telemetry.node_index_to_name(i) for i in node_indexes] + # Keep record of how many power history steps we've emitted for each job power_history_counts: dict[int, int] = {} prev_jobs: dict[str, str] = {} - if sim_config.scheduler.enabled: - if sim_config.scheduler.seed: - # TODO: This is globabl and should probably be done in RAPS - random.seed(sim_config.scheduler.seed) - np.random.seed(sim_config.scheduler.seed) - - timesteps = math.ceil((sim_config.end - sim_config.start).total_seconds()) - - sc = get_scheduler( - system = sim_config.system, - down_nodes = sim_config.scheduler.down_nodes, - cooling_enabled = sim_config.cooling.enabled, - replay = (sim_config.scheduler.jobs_mode == "replay"), - schedule_policy = sim_config.scheduler.schedule_policy, - ) - telemetry = Telemetry(system = sim_config.system, config = sc.config) - - # Memoized function to convert raps indexes into node names. - # Memo increases performance since it gets called on snapshots of the same job multiple times. - @functools.lru_cache(maxsize = 65_536) - def _parse_nodes(node_indexes: tuple[int]): - return [telemetry.node_index_to_name(i) for i in node_indexes] - - if sim_config.scheduler.jobs_mode == "random": - num_jobs = sim_config.scheduler.num_jobs if sim_config.scheduler.num_jobs is not None else 1000 - workload = Workload(**sc.config) - jobs = workload.random(num_jobs=num_jobs) - elif sim_config.scheduler.jobs_mode == "test": - workload = Workload(**sc.config) - jobs = workload.test() - elif sim_config.scheduler.jobs_mode == "replay": - if sim_config.system not in DATA_LOADERS: - raise SimException(f"Replay not supported for {sim_config.system}") - logger.info("Fetching telemetry data...") - jobs = DATA_LOADERS[sim_config.system](sim_config, sc.config) - if len(jobs) == 0: - raise SimException(f"No data for {sim_config.system} {sim_config.start.isoformat()} -> {sim_config.end.isoformat()}") - logger.info(f"Fetched {len(jobs)} jobs") - elif sim_config.scheduler.jobs_mode == "custom": - raise SimException("Custom not supported") - else: - raise SimException(f'Unknown jobs_mode "{sim_config.scheduler.jobs_mode}"') - - for data in sc.run_simulation(jobs, timesteps=timesteps): - timestamp: datetime = _offset_to_time(sim_config.start, data.current_time) - is_last_tick = (timestamp + timedelta(seconds=1) == sim_config.end) + for tick in engine.run_simulation(): + timestamp: datetime = _offset_to_time(tick.current_timestep) + unix_timestamp = int(timestamp.timestamp()) + is_last_tick = (timestamp + timedelta(seconds=1) >= sim_config.end) - unix_timestamp = int(timestamp.timestamp()) + scheduler_sim_system: list[SchedulerSimSystem] = [] + if unix_timestamp % sample_scheduler_sim_system == 0 or is_last_tick: + down_nodes = _parse_nodes(tuple(tick.down_nodes)) + stats = engine.get_stats() - scheduler_sim_system: list[SchedulerSimSystem] = [] - if unix_timestamp % sample_scheduler_sim_system == 0 or is_last_tick: - down_nodes = _parse_nodes(tuple(data.down_nodes)) - stats = sc.get_stats() - - scheduler_sim_system = 
[SchedulerSimSystem.model_validate(dict( + scheduler_sim_system = [SchedulerSimSystem.model_validate(dict( + timestamp = timestamp, + down_nodes = down_nodes, + # TODO: Update sc.get_stats to return more easily parsable data + num_samples = stats['engine']['num_samples'], + + jobs_completed = stats['job']['jobs_completed'], + jobs_running = len(stats['job']['jobs_still_running']), + jobs_pending = len(stats['job']['jobs_still_in_queue']), + + throughput = stats['job']['throughput'], + average_power = stats['engine']['average_power'] * 1_000_000, + min_loss = stats['engine']['min_loss'] * 1_000_000, + average_loss = stats['engine']['average_loss'] * 1_000_000, + max_loss = stats['engine']['max_loss'] * 1_000_000, + system_power_efficiency = stats['engine']['system_power_efficiency'], + total_energy_consumed = stats['engine']['total_energy_consumed'], + carbon_emissions = stats['engine']['carbon_emissions'], + total_cost = stats['engine']['total_cost'], + + p_flops = tick.p_flops, + g_flops_w = tick.g_flops_w, + system_util = tick.system_util, + ))] + + scheduler_sim_jobs: list[SchedulerSimJob] = [] + curr_jobs = {} + tick_jobs = tick.queue + tick.running + tick.completed + tick.killed + for job in tick_jobs: + time_end = _offset_to_time(job.end_time) + # end_time is set to its planned end once its scheduled. Set it to None for unfinished jobs here + if time_end is not None and (job.start_time is None or time_end > timestamp): + time_end = None + + parsed_job = SchedulerSimJob.model_validate(dict( + job_id = str(job.id), + name = job.name, + node_count = job.nodes_required, + time_snapshot = timestamp, + time_submission = _offset_to_time(job.submit_time), + time_limit = job.time_limit, + time_start = _offset_to_time(job.start_time), + time_end = time_end, + state_current = JobStateEnum(job.current_state.name), + nodes = _parse_nodes(tuple(job.scheduled_nodes)) if job.scheduled_nodes else None, + # How does the new job.power attribute work? Is it total_energy? + # Or just the current wattage? 
+ # power = job.power, + )) + job_state_hash = get_job_state_hash(parsed_job) + + # Output jobs if something other than time_snapshot changed + if is_last_tick or prev_jobs.get(parsed_job.job_id) != job_state_hash: + scheduler_sim_jobs.append(parsed_job) + curr_jobs[parsed_job.job_id] = job_state_hash + prev_jobs = curr_jobs + + power_history: list[SchedulerSimJobPowerHistory] = [] + for job in tick_jobs: + if job.id and power_history_counts.get(job.id, 0) < len(job.power_history): + power_history.append(SchedulerSimJobPowerHistory( timestamp = timestamp, - down_nodes = down_nodes, - # TODO: Update sc.get_stats to return more easily parsable data - num_samples = int(stats['num_samples']), - - jobs_completed = int(stats['jobs completed']), - jobs_running = len(stats['jobs still running']), - jobs_pending = len(stats['jobs still in queue']), - - throughput = float(stats['throughput'].split(' ')[0]), - average_power = float(stats['average power'].split(' ')[0]) * 1_000_000, - min_loss = float(stats['min loss'].split(' ')[0]) * 1_000_000, - average_loss = float(stats['average loss'].split(' ')[0]) * 1_000_000, - max_loss = float(stats['max loss'].split(' ')[0]) * 1_000_000, - system_power_efficiency = float(stats['system power efficiency']), - total_energy_consumed = float(stats['total energy consumed'].split(' ')[0]), - carbon_emissions = float(stats['carbon emissions'].split(' ')[0]), - total_cost = float(stats['total cost'].removeprefix("$")), - - p_flops = data.p_flops, - g_flops_w = data.g_flops_w, - system_util = data.system_util, - ))] - - scheduler_sim_jobs: list[SchedulerSimJob] = [] - curr_jobs = {} - data_jobs = data.completed + data.running + data.queue - for job in data_jobs: - # end_time is set to its planned end once its scheduled. Set it to None for unfinished jobs here - if job.start_time is not None: - time_end = _offset_to_time(sim_config.start, job.end_time) - else: - time_end = None - parsed_job = SchedulerSimJob.model_validate(dict( job_id = str(job.id), - name = job.name, - node_count = job.nodes_required, - time_snapshot = timestamp, - time_submission = _offset_to_time(sim_config.start, job.submit_time), - time_limit = job.wall_time, - time_start = _offset_to_time(sim_config.start, job.start_time), - time_end = time_end, - state_current = JobStateEnum(job.state.name), - nodes = _parse_nodes(tuple(job.scheduled_nodes)), - # How does the new job.power attribute work? Is it total_energy? - # Or just the current wattage? 
- # power = job.power, + power = job.power_history[-1], )) - job_state_hash = get_job_state_hash(parsed_job) - - # Output jobs if something other than time_snapshot changed - if is_last_tick or prev_jobs.get(parsed_job.job_id) != job_state_hash: - scheduler_sim_jobs.append(parsed_job) - curr_jobs[parsed_job.job_id] = job_state_hash - prev_jobs = curr_jobs - - power_history: list[SchedulerSimJobPowerHistory] = [] - for job in data_jobs: - if job.id and power_history_counts.get(job.id, 0) < len(job.power_history): - power_history.append(SchedulerSimJobPowerHistory( - timestamp = timestamp, - job_id = str(job.id), - power = job.power_history[-1], - )) - power_history_counts[job.id] = len(job.power_history) - - cooling_sim_cdus: list[CoolingSimCDU] = [] - cooling_sim_cep: list[CoolingSimCEP] = [] - - cooling_sim_cdu_map: dict[int, dict] = {} - if data.power_df is not None and (unix_timestamp % sample_cooling == 0 or is_last_tick): - for i, point in data.power_df.iterrows(): - cooling_sim_cdu_map[int(point['CDU'])] = dict( - rack_1_power = point['Rack 1'], - rack_2_power = point['Rack 2'], - rack_3_power = point['Rack 3'], - total_power = point['Sum'], - - rack_1_loss = point['Loss 1'], - rack_2_loss = point['Loss 2'], - rack_3_loss = point['Loss 3'], - total_loss = point['Loss'], - ) - - if data.fmu_outputs: - fmu_data = nest_dict({**data.fmu_outputs}) - - # CDU columns are output in the dict with keys like this: - # "simulator[1].datacenter[1].computeBlock[1].cdu[1].summary.m_flow_prim" - # "simulator[1].datacenter[1].computeBlock[1].cdu[1].summary.V_flow_prim_GPM" - # "simulator[1].datacenter[1].computeBlock[2].cdu[1].summary.m_flow_prim" - # "simulator[1].datacenter[1].computeBlock[2].cdu[1].summary.V_flow_prim_GPM" - # etc. - - cdus_data = fmu_data['simulator'][1]['datacenter'][1]['computeBlock'] - for cdu, cdu_data in cdus_data.items(): - cdu_data = cdu_data['cdu'][1]['summary'] - cooling_sim_cdu_map[cdu].update( - work_done_by_cdup = cdu_data['W_flow_CDUP_kW'], - rack_return_temp = cdu_data['T_sec_r_C'], - rack_supply_temp = cdu_data['T_sec_s_C'], - rack_supply_pressure = cdu_data['p_sec_s_psig'], - rack_return_pressure = cdu_data['p_sec_r_psig'], - rack_flowrate = cdu_data['V_flow_sec_GPM'], - facility_return_temp = cdu_data["T_prim_r_C"], - facility_supply_temp = cdu_data['T_prim_s_C'], - facility_supply_pressure = cdu_data['p_prim_s_psig'], - facility_return_pressure = cdu_data['p_prim_r_psig'], - facility_flowrate = cdu_data['V_flow_prim_GPM'], - ) - - cep_data = fmu_data['simulator'][1]['centralEnergyPlant'][1] - cooling_sim_cep = [CoolingSimCEP.model_validate(dict( - timestamp = timestamp, - htw_flowrate = cep_data['hotWaterLoop'][1]['summary']['V_flow_htw_GPM'], - ctw_flowrate = cep_data['coolingTowerLoop'][1]['summary']['V_flow_ctw_GPM'], - htw_return_pressure = cep_data['hotWaterLoop'][1]['summary']['p_fac_htw_r_psig'], - htw_supply_pressure = cep_data['hotWaterLoop'][1]['summary']['p_fac_htw_s_psig'], - ctw_return_pressure = cep_data['coolingTowerLoop'][1]['summary']['p_fac_ctw_r_psig'], - ctw_supply_pressure = cep_data['coolingTowerLoop'][1]['summary']['p_fac_ctw_s_psig'], - htw_return_temp = cep_data['hotWaterLoop'][1]['summary']['T_fac_htw_r_C'], - htw_supply_temp = cep_data['hotWaterLoop'][1]['summary']['T_fac_htw_s_C'], - ctw_return_temp = cep_data['coolingTowerLoop'][1]['summary']['T_fac_ctw_r_C'], - ctw_supply_temp = cep_data['coolingTowerLoop'][1]['summary']['T_fac_ctw_s_C'], - power_consumption_htwps = cep_data['hotWaterLoop'][1]['summary']['W_flow_HTWP_kW'], - 
power_consumption_ctwps = cep_data['coolingTowerLoop'][1]['summary']['W_flow_CTWP_kW'], - power_consumption_fan = cep_data['coolingTowerLoop'][1]['summary']['W_flow_CT_kW'], - htwp_speed = cep_data['hotWaterLoop'][1]['summary']['N_HTWP'], - nctwps_staged = cep_data['coolingTowerLoop'][1]['summary']['n_CTWPs'], - nhtwps_staged = cep_data['hotWaterLoop'][1]['summary']['n_HTWPs'], - pue_output = fmu_data['pue'], - nehxs_staged = cep_data['hotWaterLoop'][1]['summary']['n_EHXs'], - ncts_staged = cep_data['coolingTowerLoop'][1]['summary']['n_CTs'], - facility_return_temp = cep_data['hotWaterLoop'][1]['summary']['T_fac_htw_r_C'], - cdu_loop_bypass_flowrate = fmu_data['simulator'][1]['datacenter'][1]['summary']['V_flow_bypass_GPM'], - ))] - + power_history_counts[job.id] = len(job.power_history) + + cooling_sim_cdus: list[CoolingSimCDU] = [] + cooling_sim_cep: list[CoolingSimCEP] = [] + + cooling_sim_cdu_map: dict[int, dict] = {} + if tick.power_df is not None and (unix_timestamp % sample_cooling == 0 or is_last_tick): + for i, point in tick.power_df.iterrows(): + cooling_sim_cdu_map[int(point['CDU'])] = dict( + rack_1_power = point['Rack 1'], + rack_2_power = point['Rack 2'], + rack_3_power = point['Rack 3'], + total_power = point['Sum'], + + rack_1_loss = point['Loss 1'], + rack_2_loss = point['Loss 2'], + rack_3_loss = point['Loss 3'], + total_loss = point['Loss'], + ) - for cdu_index, cdu_data in cooling_sim_cdu_map.items(): - cdu_name = telemetry.cdu_index_to_name(cdu_index) - row, col = telemetry.cdu_pos(cdu_index) - cdu_data.update( - timestamp = timestamp, - name = cdu_name, - row = row, - col = col, + if tick.fmu_outputs: + # CDU columns are output in the dict with keys like this: + # "simulator[1].datacenter[1].computeBlock[1].cdu[1].summary.m_flow_prim" + # "simulator[1].datacenter[1].computeBlock[1].cdu[1].summary.V_flow_prim_GPM" + # "simulator[1].datacenter[1].computeBlock[2].cdu[1].summary.m_flow_prim" + # "simulator[1].datacenter[1].computeBlock[2].cdu[1].summary.V_flow_prim_GPM" + # nest_dict will un-flatten it + fmu_data = nest_dict({**tick.fmu_outputs}) + + cdus_data = fmu_data['simulator'][1]['datacenter'][1]['computeBlock'] + for cdu, cdu_data in cdus_data.items(): + cdu_data = cdu_data['cdu'][1]['summary'] + cooling_sim_cdu_map[cdu].update( + work_done_by_cdup = cdu_data['W_flow_CDUP_kW'], + rack_return_temp = cdu_data['T_sec_r_C'], + rack_supply_temp = cdu_data['T_sec_s_C'], + rack_supply_pressure = cdu_data['p_sec_s_psig'], + rack_return_pressure = cdu_data['p_sec_r_psig'], + rack_flowrate = cdu_data['V_flow_sec_GPM'], + facility_return_temp = cdu_data["T_prim_r_C"], + facility_supply_temp = cdu_data['T_prim_s_C'], + facility_supply_pressure = cdu_data['p_prim_s_psig'], + facility_return_pressure = cdu_data['p_prim_r_psig'], + facility_flowrate = cdu_data['V_flow_prim_GPM'], ) - cooling_sim_cdus.append(CoolingSimCDU.model_validate(cdu_data)) - yield SimOutput( + cep_data = fmu_data['simulator'][1]['centralEnergyPlant'][1] + cooling_sim_cep = [CoolingSimCEP.model_validate(dict( + timestamp = timestamp, + htw_flowrate = cep_data['hotWaterLoop'][1]['summary']['V_flow_htw_GPM'], + ctw_flowrate = cep_data['coolingTowerLoop'][1]['summary']['V_flow_ctw_GPM'], + htw_return_pressure = cep_data['hotWaterLoop'][1]['summary']['p_fac_htw_r_psig'], + htw_supply_pressure = cep_data['hotWaterLoop'][1]['summary']['p_fac_htw_s_psig'], + ctw_return_pressure = cep_data['coolingTowerLoop'][1]['summary']['p_fac_ctw_r_psig'], + ctw_supply_pressure = 
cep_data['coolingTowerLoop'][1]['summary']['p_fac_ctw_s_psig'], + htw_return_temp = cep_data['hotWaterLoop'][1]['summary']['T_fac_htw_r_C'], + htw_supply_temp = cep_data['hotWaterLoop'][1]['summary']['T_fac_htw_s_C'], + ctw_return_temp = cep_data['coolingTowerLoop'][1]['summary']['T_fac_ctw_r_C'], + ctw_supply_temp = cep_data['coolingTowerLoop'][1]['summary']['T_fac_ctw_s_C'], + power_consumption_htwps = cep_data['hotWaterLoop'][1]['summary']['W_flow_HTWP_kW'], + power_consumption_ctwps = cep_data['coolingTowerLoop'][1]['summary']['W_flow_CTWP_kW'], + power_consumption_fan = cep_data['coolingTowerLoop'][1]['summary']['W_flow_CT_kW'], + htwp_speed = cep_data['hotWaterLoop'][1]['summary']['N_HTWP'], + nctwps_staged = cep_data['coolingTowerLoop'][1]['summary']['n_CTWPs'], + nhtwps_staged = cep_data['hotWaterLoop'][1]['summary']['n_HTWPs'], + pue_output = fmu_data['pue'], + nehxs_staged = cep_data['hotWaterLoop'][1]['summary']['n_EHXs'], + ncts_staged = cep_data['coolingTowerLoop'][1]['summary']['n_CTs'], + facility_return_temp = cep_data['hotWaterLoop'][1]['summary']['T_fac_htw_r_C'], + cdu_loop_bypass_flowrate = fmu_data['simulator'][1]['datacenter'][1]['summary']['V_flow_bypass_GPM'], + ))] + + for cdu_index, cdu_data in cooling_sim_cdu_map.items(): + cdu_name = engine.telemetry.cdu_index_to_name(cdu_index) + row, col = engine.telemetry.cdu_pos(cdu_index) + cdu_data.update( timestamp = timestamp, - scheduler_sim_system = scheduler_sim_system, - scheduler_sim_jobs = scheduler_sim_jobs, - cooling_sim_cdus = cooling_sim_cdus, - cooling_sim_cep = cooling_sim_cep, - power_history = power_history, + name = cdu_name, + row = row, + col = col, ) - else: - raise SimException("No simulations specified") + cooling_sim_cdus.append(CoolingSimCDU.model_validate(cdu_data)) + + yield SimTickOutput( + timestamp = timestamp, + scheduler_sim_system = scheduler_sim_system, + scheduler_sim_jobs = scheduler_sim_jobs, + cooling_sim_cdus = cooling_sim_cdus, + cooling_sim_cep = cooling_sim_cep, + power_history = power_history, + ) + -- GitLab From 6078902ec7ccc021010eafe95c40482a0a1be4fd Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Tue, 23 Sep 2025 13:53:21 -0400 Subject: [PATCH 10/77] Improve simulation loop performance --- simulation_server/simulation/simulation.py | 34 ++++++++++++---------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/simulation_server/simulation/simulation.py b/simulation_server/simulation/simulation.py index 9cbe956..cf306cf 100644 --- a/simulation_server/simulation/simulation.py +++ b/simulation_server/simulation/simulation.py @@ -3,6 +3,7 @@ from datetime import datetime, timedelta import functools from loguru import logger from raps import Engine +from raps.stats import get_engine_stats, get_job_stats from ..models.sim import ServerSimConfig from ..models.output import ( JobStateEnum, SchedulerSimJob, SchedulerSimJobPowerHistory, SchedulerSimSystem, CoolingSimCDU, @@ -59,27 +60,28 @@ def run_simulation(sim_config: ServerSimConfig): scheduler_sim_system: list[SchedulerSimSystem] = [] if unix_timestamp % sample_scheduler_sim_system == 0 or is_last_tick: down_nodes = _parse_nodes(tuple(tick.down_nodes)) - stats = engine.get_stats() + engine_stats = get_engine_stats(engine, fast = True) + job_stats = get_job_stats(engine) scheduler_sim_system = [SchedulerSimSystem.model_validate(dict( timestamp = timestamp, down_nodes = down_nodes, # TODO: Update sc.get_stats to return more easily parsable data - num_samples = stats['engine']['num_samples'], - - jobs_completed = 
stats['job']['jobs_completed'], - jobs_running = len(stats['job']['jobs_still_running']), - jobs_pending = len(stats['job']['jobs_still_in_queue']), - - throughput = stats['job']['throughput'], - average_power = stats['engine']['average_power'] * 1_000_000, - min_loss = stats['engine']['min_loss'] * 1_000_000, - average_loss = stats['engine']['average_loss'] * 1_000_000, - max_loss = stats['engine']['max_loss'] * 1_000_000, - system_power_efficiency = stats['engine']['system_power_efficiency'], - total_energy_consumed = stats['engine']['total_energy_consumed'], - carbon_emissions = stats['engine']['carbon_emissions'], - total_cost = stats['engine']['total_cost'], + num_samples = engine_stats['num_samples'], + + jobs_completed = job_stats['jobs_completed'], + jobs_running = len(job_stats['jobs_still_running']), + jobs_pending = len(job_stats['jobs_still_in_queue']), + + throughput = job_stats['throughput'], + average_power = engine_stats['average_power'] * 1_000_000, + min_loss = engine_stats['min_loss'] * 1_000_000, + average_loss = engine_stats['average_loss'] * 1_000_000, + max_loss = engine_stats['max_loss'] * 1_000_000, + system_power_efficiency = engine_stats['system_power_efficiency'], + total_energy_consumed = engine_stats['total_energy_consumed'], + carbon_emissions = engine_stats['carbon_emissions'], + total_cost = engine_stats['total_cost'], p_flops = tick.p_flops, g_flops_w = tick.g_flops_w, -- GitLab From b0a216ea14982b7bce9bf0dcd91b4f38fbe88c31 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Tue, 23 Sep 2025 14:37:44 -0400 Subject: [PATCH 11/77] Update simulation driver script --- simulation_server/models/sim.py | 100 ++------------------------- simulation_server/simulation/main.py | 85 +++++++++++------------ 2 files changed, 46 insertions(+), 139 deletions(-) diff --git a/simulation_server/models/sim.py b/simulation_server/models/sim.py index 104ba4a..5a21bf4 100644 --- a/simulation_server/models/sim.py +++ b/simulation_server/models/sim.py @@ -1,11 +1,11 @@ from __future__ import annotations from typing import Optional, Literal, Annotated as A, get_args -from datetime import timedelta, datetime, timezone -import json, math -from pydantic import AwareDatetime, model_validator, field_validator, Field +import json +from pydantic import AwareDatetime, Field +from raps import SingleSimConfig +from raps.utils import AutoAwareDatetime from .base import BaseModel -from .job_state import JobStateEnum from ..util.misc import omit from ..util.api_queries import filter_params, sort_params @@ -99,92 +99,6 @@ SIM_SORT = sort_params(omit(SIM_API_FIELDS, ['progress', 'progress_date', 'confi ]) -class SimConfig(BaseModel): - start: AwareDatetime - end: AwareDatetime - - system: SimSystem - scheduler: A[SchedulerSimConfig, Field(default_factory = lambda: SchedulerSimConfig())] - cooling: A[CoolingSimConfig, Field(default_factory = lambda: CoolingSimConfig())] - - @model_validator(mode='after') - def validate_model(self): - if self.end <= self.start: - raise ValueError("Start must be less than end") - - if not any(m.enabled for m in [self.scheduler, self.cooling]): - raise ValueError("Must enable one simulation") - if self.cooling.enabled and not self.scheduler.enabled: - raise ValueError("Currently can't run cooling simulation without the scheduler") - return self - - @field_validator("start", mode="after") - @classmethod - def trunc_start(cls, v: datetime, info): - return v.fromtimestamp(math.floor(v.timestamp()), tz=timezone.utc) - - @field_validator("end", mode="after") - @classmethod - 
def trunc_end(cls, v: datetime, info): - return v.fromtimestamp(math.ceil(v.timestamp()), tz=timezone.utc) - - -class SchedulerSimConfig(BaseModel): - """ - Config for RAPS job simulation. - There are 3 main "modes" for how to run the jobs. - - replay: Replay data based on the real jobs run on Frontier during start/end - - custom: Pass your own set of jobs to submit in the simulation in `jobs` - - random: Run random jobs. You can pass `seed` and `num_jobs` to customize it. - """ - - enabled: bool = False - down_nodes: list[int] = [] # List of hostnames. TODO: allow parsing from xnames - - jobs_mode: Literal['replay', 'custom', 'random', 'test'] = 'random' - schedule_policy: Literal['fcfs', 'sjf', 'prq'] ='fcfs' - """" - Policy to use when scheduling jobs. - Replay mode will ignore this and use the real time jobs were scheduled unless you also set - reschedule to true. - """ - reschedule: bool = False - """ If true, will apply schedule_policy in replay mode """ - - jobs: Optional[list[SchedulerSimCustomJob]] = None - """ - The list of jobs. - Only applicable if jobs_mode is "custom" - """ - - seed: Optional[int] = None - """ - Random seed for consistent random job generation. - Only applicable if jobs_mode is "random" - """ - num_jobs: Optional[int] = None - """ - Number of random jobs to generate. - Only applicable if jobs_mode is "random" - """ - - -class SchedulerSimCustomJob(BaseModel): - # This is mostly a subset of the SchedulerSimJob - name: str - allocation_nodes: int - """ Number of nodes required """ - time_submission: AwareDatetime - time_limit: timedelta - - cpu_util: float - gpu_util: float - cpu_trace: list[float] - gpu_trace: list[float] - - end_state: JobStateEnum - """ Slurm state job will end in """ - - -class CoolingSimConfig(BaseModel): - enabled: bool = False +class ServerSimConfig(SingleSimConfig): + start: AutoAwareDatetime # make start required + """ Start of the simulation """ diff --git a/simulation_server/simulation/main.py b/simulation_server/simulation/main.py index 01c09dd..f80206c 100644 --- a/simulation_server/simulation/main.py +++ b/simulation_server/simulation/main.py @@ -1,30 +1,25 @@ """ A script to run the ExaDigiT simulation """ +from typing import Callable import argparse, os, json from pathlib import Path from datetime import datetime, timezone from loguru import logger import yaml -from ..models.sim import Sim, SimConfig +from ..models.sim import Sim, ServerSimConfig from .simulation import run_simulation from ..util.kafka import get_kafka_producer -def cli_run(config: SimConfig): - for i, data in enumerate(run_simulation(config)): - print(f"TICK {i}") - - -def background_job(sim: Sim): +def write_sim(sim: Sim, writer: Callable[[str, bytes], None]): sim = sim.model_copy() - kafka_producer = get_kafka_producer() def output_rows(topic, rows): for row in rows: value = json.dumps({"sim_id": sim.id, **row.model_dump(mode='json')}).encode() - kafka_producer.send(topic=topic, value=value) + writer(topic, value) logger.info(f"Starting simulation {sim.model_dump_json()}") - config = SimConfig.model_validate(sim.config) + config = ServerSimConfig.model_validate(sim.config) progress_date = sim.start try: @@ -40,59 +35,57 @@ def background_job(sim: Sim): sim.execution_end = datetime.now(timezone.utc) sim.error_messages = str(e) sim.progress_date = progress_date - kafka_producer.send("svc-event-exadigit-sim", value = sim.serialize_for_druid()) - kafka_producer.close() # Close and wait for messages to be sent + writer("svc-event-exadigit-sim", 
sim.serialize_for_druid()) logger.info(f"Simulation {sim.id} failed") raise e sim.state = "success" sim.execution_end = datetime.now(timezone.utc) sim.progress_date = sim.end - kafka_producer.send(topic = "svc-event-exadigit-sim", value = sim.serialize_for_druid()) - kafka_producer.close() # Close and wait for messages to be sent + writer("svc-event-exadigit-sim", sim.serialize_for_druid()) logger.info(f"Simulation {sim.id} finished") +def write_sim_to_kafka(sim: Sim): + kafka_producer = get_kafka_producer() + def writer(topic: str, value: bytes): + kafka_producer.send(topic=topic, value=value) + try: + write_sim(sim, writer=writer) + finally: + kafka_producer.close() + + +def write_sim_to_disk(sim: Sim, dest: str): + Path(dest).mkdir(exist_ok=True) + def writer(topic: str, value: bytes): + with open(Path(dest) / f"{topic}.jsonl", 'ab') as f: + f.write(value + b"\n") + write_sim(sim, writer=writer) + + if __name__ == "__main__": parser = argparse.ArgumentParser( description = __doc__.strip(), allow_abbrev = False, formatter_class = argparse.RawDescriptionHelpFormatter, ) - subparsers = parser.add_subparsers(required=True, dest="action") - parser_cli_run = subparsers.add_parser('run') - parser_cli_run.add_argument("--config", type=str, help="JSON config string") - parser_cli_run.add_argument("--config-file", type=Path, help="Path to a yaml or json file contain the config") - - parser_cli_run = subparsers.add_parser('background-job') - parser_cli_run.add_argument("--sim", type=str, help="JSON config string") + parser.add_argument("--sim", type=str, help="Sim json") + parser.add_argument("--dest", default=None) args = parser.parse_args() + if args.sim: + sim = args.sim + elif os.environ.get("SIM"): + sim = os.environ["SIM"] + else: + raise Exception("No configuration passed") + + sim = Sim.model_validate(yaml.safe_load(sim)) - if args.action == "run": - if args.config and args.config_file: - raise Exception("You can only specify either config or config-file") - - if args.config: - config = yaml.safe_load(args.config) - elif args.config_file: - config = yaml.safe_load(args.config_file.read_text()) - elif "SIM_CONFIG" in os.environ: - config = yaml.safe_load(os.environ["SIM_CONFIG"]) - elif "SIM_CONFIG_FILE" in os.environ: - config = yaml.safe_load(Path(os.environ["SIM_CONFIG_FILE"]).read_text()) - else: - raise Exception("No configuration passed") - config = SimConfig.model_validate(config) - cli_run(config) - elif args.action == "background-job": - if args.sim: - sim = yaml.safe_load(args.sim) - elif "SIM" in os.environ: - sim = yaml.safe_load(os.environ["SIM"]) - else: - raise Exception("No sim passed") - sim = Sim.model_validate(sim) - background_job(sim) + if args.dest: + write_sim_to_disk(sim, args.dest) + else: + write_sim_to_kafka(sim) -- GitLab From 8173b6f8fb81ff7d02456fef4b6670eb0c798f6d Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Tue, 23 Sep 2025 14:45:55 -0400 Subject: [PATCH 12/77] Remove gunicorn --- simulation_server/server/main.py | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/simulation_server/server/main.py b/simulation_server/server/main.py index bfb1d05..b86e476 100644 --- a/simulation_server/server/main.py +++ b/simulation_server/server/main.py @@ -65,7 +65,7 @@ async def lifespan(api: FastAPI): app = FastAPI( title = "ExaDigiT Simulation Server", - version = "0.1.0", + version = "1.0.0", # Simplify ids and names in generated clients a bit # NOTE: This means we need one tag defined (or inherited from the APIRouter object) on 
every route generate_unique_id_function = lambda route: f"{route.tags[0]}_{route.name}", @@ -117,15 +117,8 @@ app.include_router(router) if __name__ == "__main__": - if settings.debug_mode: - uvicorn.run(app, - host='0.0.0.0', - port=settings.http_port, - reload=False, - ) - else: - subprocess.run(["gunicorn", - "simulation_server.server.main:app", - "--bind", f"0.0.0.0:{settings.http_port}", - "--worker-class", "uvicorn.workers.UvicornWorker", - ], check=True) + uvicorn.run(app, + host='0.0.0.0', + port=settings.http_port, + reload=False, + ) -- GitLab From 6feb55c84ed40a7bc2a056cab6306a4f1da1942e Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Tue, 23 Sep 2025 15:00:29 -0400 Subject: [PATCH 13/77] Remove SimSystem --- simulation_server/models/sim.py | 6 +----- simulation_server/server/endpoints.py | 8 ++++---- simulation_server/server/service.py | 8 ++++---- 3 files changed, 9 insertions(+), 13 deletions(-) diff --git a/simulation_server/models/sim.py b/simulation_server/models/sim.py index 5a21bf4..d6eb861 100644 --- a/simulation_server/models/sim.py +++ b/simulation_server/models/sim.py @@ -10,10 +10,6 @@ from ..util.misc import omit from ..util.api_queries import filter_params, sort_params -SimSystem = Literal["frontier", "fugaku", "lassen", "marconi100"] -SIM_SYSTEMS: tuple[str] = get_args(SimSystem) - - class Sim(BaseModel): """ Represents a single simulation run """ @@ -23,7 +19,7 @@ class Sim(BaseModel): user: Optional[str] = None """ User who launched the simulation """ - system: SimSystem + system: str state: Optional[Literal['running', 'success', 'fail']] = None diff --git a/simulation_server/server/endpoints.py b/simulation_server/server/endpoints.py index cec31ff..e00d833 100644 --- a/simulation_server/server/endpoints.py +++ b/simulation_server/server/endpoints.py @@ -10,7 +10,7 @@ from ..models.output import ( CoolingSimCDU, COOLING_CDU_FILTERS, COOLING_CDU_FIELD_SELECTORS, CoolingSimCEP, COOLING_CEP_FIELD_SELECTORS, ) -from ..models.sim import Sim, SIM_FIELD_SELECTORS, SIM_FILTERS, SIM_SORT, SimConfig, SimSystem +from ..models.sim import Sim, SIM_FIELD_SELECTORS, SIM_FILTERS, SIM_SORT, ServerSimConfig from ..models.output import SystemInfo from ..util.api_queries import Granularity, granularity_params, Filters, Sort, get_selectors from .config import AppDeps @@ -27,10 +27,10 @@ GranularityDep = A[Granularity, Depends(granularity_params(default_granularity=t @router.post("/simulation/run", response_model=Sim) -def run(*, sim_config: A[SimConfig, Body()], deps: AppDeps): +def run(*, sim_config: A[ServerSimConfig, Body()], deps: AppDeps): """ Start running a simulation in the background. POST the configuration for the simulation. Returns - a Sim object containing an id you can use to query the results as they are generated. Foo + a Sim object containing an id you can use to query the results as they are generated. 
""" return run_simulation(sim_config, deps) @@ -213,5 +213,5 @@ def scheduler_system(*, @router.get("/system-info/{system}", response_model=SystemInfo) -def system_info(system: SimSystem): +def system_info(system: str): return get_system_info(system = system) diff --git a/simulation_server/server/service.py b/simulation_server/server/service.py index c71422b..d804a8a 100644 --- a/simulation_server/server/service.py +++ b/simulation_server/server/service.py @@ -4,7 +4,7 @@ import uuid, time, json, base64, os, sys, subprocess import sqlalchemy as sqla from loguru import logger from pydantic import ValidationError -from ..models.sim import Sim, SimConfig, SIM_FILTERS, SIM_FIELD_SELECTORS, SimSystem +from ..models.sim import Sim, ServerSimConfig, SIM_FILTERS, SIM_FIELD_SELECTORS from ..models.base import ResponseFormat from ..models.output import ( COOLING_CDU_API_FIELDS, COOLING_CDU_FIELD_SELECTORS, @@ -51,12 +51,12 @@ def wait_until_exists(stmt: sqla.Select, *, timeout: timedelta = timedelta(minut -def run_simulation(sim_config: SimConfig, deps: AppDeps): +def run_simulation(sim_config: ServerSimConfig, deps: AppDeps): sim = Sim( # Random sim id, use base32 to make it a bit shorter id = base64.b32encode(uuid.uuid4().bytes).decode().rstrip('=').lower(), user = "unknown", # TODO pull this from cookie/auth header - system = sim_config.system, + system = sim_config.system_configs[0].system_name, state = "running", start = sim_config.start, end = sim_config.end, @@ -696,7 +696,7 @@ def build_scheduler_sim_power_history_query(*, ) -def get_system_info(system: SimSystem): +def get_system_info(system: str): from ..simulation.simulation import get_scheduler sc = get_scheduler(system = system) return sc.get_gauge_limits() -- GitLab From a9ba5530771c42b3ce28e824551e0cb096d6b653 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Tue, 23 Sep 2025 15:20:53 -0400 Subject: [PATCH 14/77] Fixes to service.py --- simulation_server/server/service.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/simulation_server/server/service.py b/simulation_server/server/service.py index d804a8a..161ac91 100644 --- a/simulation_server/server/service.py +++ b/simulation_server/server/service.py @@ -1,9 +1,11 @@ from typing import Optional, Any from datetime import datetime, timedelta, timezone +import functools import uuid, time, json, base64, os, sys, subprocess import sqlalchemy as sqla from loguru import logger from pydantic import ValidationError +from fastapi import HTTPException from ..models.sim import Sim, ServerSimConfig, SIM_FILTERS, SIM_FIELD_SELECTORS from ..models.base import ResponseFormat from ..models.output import ( @@ -83,7 +85,7 @@ def run_simulation(sim_config: ServerSimConfig, deps: AppDeps): { "name": "main", "image": deps.settings.job_image, - "command": ['python3', "-m", "simulation_server.simulation.main", "background-job"], + "command": ['python3', "-m", "simulation_server.simulation.main"], "env": [ {"name": "SIM", "value": sim.model_dump_json()}, ], @@ -104,7 +106,7 @@ def run_simulation(sim_config: ServerSimConfig, deps: AppDeps): }) else: # Running locally, just use a subprocess proc = subprocess.Popen( - args = [sys.executable, "-m", "simulation_server.simulation.main", "background-job"], + args = [sys.executable, "-m", "simulation_server.simulation.main"], env = { "SIM": sim.model_dump_json(), **os.environ, @@ -696,7 +698,11 @@ def build_scheduler_sim_power_history_query(*, ) +@functools.cache def get_system_info(system: str): - from ..simulation.simulation 
import get_scheduler - sc = get_scheduler(system = system) - return sc.get_gauge_limits() + from raps.system_config import list_systems + from raps import Engine, SingleSimConfig + if system not in list_systems(): + raise HTTPException(status_code=404, detail=f"System {system} not found") + engine = Engine(SingleSimConfig(system = system)) + return engine.get_gauge_limits() -- GitLab From bf8494259967dc9015ee4bcb738232575806fedd Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Tue, 23 Sep 2025 15:22:30 -0400 Subject: [PATCH 15/77] Update launch_local.sh --- docker-compose.yml | 1 + scripts/launch_local.sh | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 706c516..796ed2d 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -175,6 +175,7 @@ services: simulation-server: image: exadigit-simulation-server container_name: simulation-server + command: ["python", "-m", "simulation_server.server.main"] ports: - "8080:8080" depends_on: diff --git a/scripts/launch_local.sh b/scripts/launch_local.sh index 871da5f..bf04f96 100755 --- a/scripts/launch_local.sh +++ b/scripts/launch_local.sh @@ -5,8 +5,7 @@ set -e # Exit if any commmand fails BASE_DIR=$(realpath $(dirname "${BASH_SOURCE[0]}")/..) cd "$BASE_DIR" -docker build -t exadigit-simulation-server:latest -f Dockerfile.server . -# docker build -t exadigit-simulation-server-simulation-job:latest -f Dockerfile.simulation . +docker build -t exadigit-simulation-server:latest -f Dockerfile . # trap 'docker compose down' SIGINT SIGTERM EXIT -- GitLab From 51a1a94cada4f70cfc1f3eaedf8e21c65ca223f4 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Wed, 24 Sep 2025 10:29:16 -0400 Subject: [PATCH 16/77] Add kafka admin client --- simulation_server/util/kafka.py | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/simulation_server/util/kafka.py b/simulation_server/util/kafka.py index af3e510..f6123c0 100644 --- a/simulation_server/util/kafka.py +++ b/simulation_server/util/kafka.py @@ -1,10 +1,9 @@ import os -from kafka import KafkaProducer, KafkaConsumer +from kafka import KafkaProducer, KafkaConsumer, KafkaAdminClient import functools -@functools.cache -def get_kafka_producer(**configs): +def _get_kafka_config(): env_configs = { # Pick-up credentials from the context 'bootstrap_servers': [os.environ['KAFKA_BOOTSTRAP']], @@ -14,18 +13,18 @@ def get_kafka_producer(**configs): 'security_protocol': os.environ.get('KAFKA_SECURITY_PROTOCOL'), } env_configs = {k: v for k, v in env_configs.items() if v is not None} - return KafkaProducer(**{**env_configs, **configs}) + return env_configs + +@functools.cache +def get_kafka_producer(**configs): + return KafkaProducer(**{**_get_kafka_config(), **configs}) @functools.cache def get_kafka_consumer(*topics, **configs): - env_configs = { - # Pick-up credentials from the context - 'bootstrap_servers': [os.environ['KAFKA_BOOTSTRAP']], - 'sasl_mechanism': os.environ.get('KAFKA_SASL_MECHANISM'), - 'sasl_plain_username': os.environ.get('KAFKA_SASL_USERNAME'), - 'sasl_plain_password': os.environ.get('KAFKA_SASL_PASSWORD'), - 'security_protocol': os.environ.get('KAFKA_SECURITY_PROTOCOL'), - } - env_configs = {k: v for k, v in env_configs.items() if v is not None} - return KafkaConsumer(*topics, **{**env_configs, **configs}) + return KafkaConsumer(*topics, **{**_get_kafka_config(), **configs}) + + +@functools.cache +def get_kafka_admin(**configs): + return KafkaAdminClient(**{**_get_kafka_config(), **configs}) -- 
GitLab From 7f6d90359be4ca900b3993d23a63988ba426cbd0 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Wed, 24 Sep 2025 10:31:30 -0400 Subject: [PATCH 17/77] Create topics on boot in dev mode --- simulation_server/server/main.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/simulation_server/server/main.py b/simulation_server/server/main.py index b86e476..aea77d9 100644 --- a/simulation_server/server/main.py +++ b/simulation_server/server/main.py @@ -1,6 +1,6 @@ """ A simple REST API for triggering and querying the results from the digital twin """ from pathlib import Path -import subprocess, asyncio, functools, os, json +import asyncio, functools, os, json from contextlib import asynccontextmanager from starlette.exceptions import HTTPException from starlette.requests import Request @@ -15,6 +15,8 @@ from loguru import logger from ..util.druid import submit_ingest from .service import cleanup_jobs from .config import AppSettings, get_app_settings, get_druid_engine, get_kafka_producer +from ..util.kafka import get_kafka_admin +from kafka.admin import NewTopic settings = AppSettings() @@ -47,11 +49,24 @@ async def lifespan(api: FastAPI): ) if settings.env == 'dev': + kafka_admin = get_kafka_admin() + kafka_admin.create_topics([ + NewTopic("svc-event-exadigit-sim", 1, 1), + NewTopic("svc-ts-exadigit-schedulersimsystem", 1, 1), + NewTopic("svc-event-exadigit-schedulersimjob", 1, 1), + NewTopic("svc-ts-exadigit-coolingsimcdu", 1, 1), + NewTopic("svc-ts-exadigit-coolingsimcep", 1, 1), + NewTopic("svc-ts-exadigit-jobpowerhistory", 1, 1), + ]) + druid_ingests_dir = Path(__file__).parent.parent.parent.resolve() / 'druid_ingests' ingests = [ - "cooling-sim-cdu", "cooling-sim-cep", "scheduler-job-power-history", - "scheduler-sim-job", "scheduler-sim-system", "sim", + "scheduler-sim-system", + "scheduler-sim-job", + "cooling-sim-cdu", + "cooling-sim-cep", + "scheduler-job-power-history", ] for ingest in ingests: -- GitLab From ee0eb3ddc1c44a28371af2a6f4acf8b09b569ba8 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Wed, 24 Sep 2025 11:06:51 -0400 Subject: [PATCH 18/77] Add back check for empty tables --- simulation_server/util/druid.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/simulation_server/util/druid.py b/simulation_server/util/druid.py index 2e49231..ef49a31 100644 --- a/simulation_server/util/druid.py +++ b/simulation_server/util/druid.py @@ -130,17 +130,17 @@ def execute_ignore_missing(conn, stmt) -> sqla.CursorResult: cursor. Note this may have unexpected results if you have joins/aggregations/etc that would have returned data with an empty table. 
""" - # try: - return conn.execute(stmt) - # except Exception as e: - # existing_tables = set(sqla.inspect(conn.engine).get_table_names()) - # stmt_tables = set([t.name for t in stmt.get_final_froms()]) - # missing_tables = stmt_tables - existing_tables - # if missing_tables: - # logger.info(f"table(s) {', '.join(stmt_tables)} missing, returning empty result") - # return conn.execute(sqla.text("SELECT 1 FROM (VALUES (1)) AS tbl(a) WHERE 1 != 1")) - # else: - # raise e + try: + return conn.execute(stmt) + except Exception as e: + existing_tables = set(sqla.inspect(conn.engine).get_table_names()) + stmt_tables = set([t.name for t in stmt.get_final_froms()]) + missing_tables = stmt_tables - existing_tables + if missing_tables: + logger.info(f"table(s) {', '.join(stmt_tables)} missing, returning empty result") + return conn.execute(sqla.text("SELECT 1 FROM (VALUES (1)) AS tbl(a) WHERE 1 != 1")) + else: + raise e def submit_ingest(ingest: dict): -- GitLab From db7f9f4cf5e692581219448652fb1aaafa240cbe Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Wed, 24 Sep 2025 12:14:45 -0400 Subject: [PATCH 19/77] Update druid images --- docker-compose.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 796ed2d..51b95e1 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -14,7 +14,7 @@ volumes: services: postgres: container_name: postgres - image: postgres:latest + image: postgres:17.6-trixie # ports: # - "5432:5432" volumes: @@ -33,7 +33,7 @@ services: - ZOO_MY_ID=1 druid-coordinator: - image: apache/druid:30.0.1 + image: apache/druid:34.0.0 container_name: druid-coordinator volumes: - druid_shared:/opt/shared @@ -54,7 +54,7 @@ services: retries: 10 druid-broker: - image: apache/druid:30.0.1 + image: apache/druid:34.0.0 container_name: druid-broker volumes: - broker_var:/opt/druid/var @@ -75,7 +75,7 @@ services: retries: 10 druid-historical: - image: apache/druid:30.0.1 + image: apache/druid:34.0.0 container_name: druid-historical volumes: - druid_shared:/opt/shared @@ -97,7 +97,7 @@ services: retries: 10 druid-middlemanager: - image: apache/druid:30.0.1 + image: apache/druid:34.0.0 container_name: druid-middlemanager volumes: - druid_shared:/opt/shared @@ -116,7 +116,7 @@ services: - druid-environment.txt druid-router: - image: apache/druid:30.0.1 + image: apache/druid:34.0.0 container_name: druid-router volumes: - router_var:/opt/druid/var -- GitLab From 361b1a98be62143cafd519d4942ffd07c4c85639 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Wed, 24 Sep 2025 12:26:28 -0400 Subject: [PATCH 20/77] Log progress --- simulation_server/simulation/main.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/simulation_server/simulation/main.py b/simulation_server/simulation/main.py index f80206c..8ecfbbb 100644 --- a/simulation_server/simulation/main.py +++ b/simulation_server/simulation/main.py @@ -30,6 +30,8 @@ def write_sim(sim: Sim, writer: Callable[[str, bytes], None]): output_rows("svc-ts-exadigit-coolingsimcep", data.cooling_sim_cep) output_rows("svc-ts-exadigit-jobpowerhistory", data.power_history) progress_date = data.timestamp + if data.timestamp.second == 0: + logger.info(f"progress: {data.timestamp.isoformat()} / {sim.end.isoformat()}") except BaseException as e: sim.state = "fail" sim.execution_end = datetime.now(timezone.utc) -- GitLab From fe2afa1c45e479e6bef0b25e92a04905e9204ee4 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Wed, 24 Sep 2025 13:28:05 -0400 Subject: [PATCH 21/77] Ignore cors for local dev --- 
docker-compose.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/docker-compose.yml b/docker-compose.yml index 51b95e1..8800939 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -196,5 +196,6 @@ services: # - EXADIGIT_ROOT_PATH - EXADIGIT_DEBUG_MODE=true # - EXADIGIT_JOB_IMAGE + - EXADIGIT_ALLOW_ORIGINS=["*"] - DRUID_SERVICE_URL=http://druid-router:8888 - KAFKA_BOOTSTRAP=kafka:9092 -- GitLab From b0270c0239ca7c83ef4d40426a6b4c0447a2d5dd Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Thu, 25 Sep 2025 17:10:28 -0400 Subject: [PATCH 22/77] Force no output --- simulation_server/models/sim.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/simulation_server/models/sim.py b/simulation_server/models/sim.py index d6eb861..dcff7df 100644 --- a/simulation_server/models/sim.py +++ b/simulation_server/models/sim.py @@ -1,7 +1,7 @@ from __future__ import annotations -from typing import Optional, Literal, Annotated as A, get_args +from typing import Optional, Literal, Annotated as A import json -from pydantic import AwareDatetime, Field +from pydantic import AwareDatetime, Field, model_validator from raps import SingleSimConfig from raps.utils import AutoAwareDatetime @@ -98,3 +98,10 @@ SIM_SORT = sort_params(omit(SIM_API_FIELDS, ['progress', 'progress_date', 'confi class ServerSimConfig(SingleSimConfig): start: AutoAwareDatetime # make start required """ Start of the simulation """ + + @model_validator(mode = "after") + def _validate_server_sim_config(self): + # Force these options regardless of input + self.noui = True + self.output = "none" + return self -- GitLab From 53d2d606ff663a0996fe99798743252690d490ce Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Fri, 26 Sep 2025 14:26:05 -0400 Subject: [PATCH 23/77] Set base_path in ServerSimConfig --- simulation_server/models/sim.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/simulation_server/models/sim.py b/simulation_server/models/sim.py index dcff7df..1ace96a 100644 --- a/simulation_server/models/sim.py +++ b/simulation_server/models/sim.py @@ -1,6 +1,7 @@ from __future__ import annotations from typing import Optional, Literal, Annotated as A import json +from pathlib import Path from pydantic import AwareDatetime, Field, model_validator from raps import SingleSimConfig from raps.utils import AutoAwareDatetime @@ -99,6 +100,18 @@ class ServerSimConfig(SingleSimConfig): start: AutoAwareDatetime # make start required """ Start of the simulation """ + def __init__(self, /, **data): + # Override context to set base_path + RAPS_PATH = (Path(__file__) / '../../../raps').resolve() + self.__pydantic_validator__.validate_python( + data, + self_instance=self, + context={ + "base_path": RAPS_PATH, + "force_under_base_path": True, + } + ) + @model_validator(mode = "after") def _validate_server_sim_config(self): # Force these options regardless of input -- GitLab From 74398e868f69bcf961edf757bb199a07e174f3a6 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Wed, 1 Oct 2025 10:23:48 -0400 Subject: [PATCH 24/77] Update RAPS --- raps | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/raps b/raps index 15025de..0477cd4 160000 --- a/raps +++ b/raps @@ -1 +1 @@ -Subproject commit 15025ded86c20023544db626720828d27f7e5cb3 +Subproject commit 0477cd4deea22b1397598eb4b0677151616f5807 -- GitLab From eab0a964c71dbe7c219a6770c07f106558e759b5 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Wed, 1 Oct 2025 11:11:12 -0400 Subject: [PATCH 25/77] Check if topics exist before creating --- 
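Note on this patch: listing topics first is one way to make startup idempotent; the other common pattern is to attempt the create and tolerate the "already exists" error. A rough sketch of that alternative with kafka-python (assuming, as is usual, that `create_topics` surfaces `TopicAlreadyExistsError`):

```python
from kafka.admin import KafkaAdminClient, NewTopic
from kafka.errors import TopicAlreadyExistsError

def ensure_topic(admin: KafkaAdminClient, name: str, partitions: int = 1, replication: int = 1) -> None:
    """Create a topic if needed; ignore the error if a previous boot already created it."""
    try:
        admin.create_topics([NewTopic(name, partitions, replication)])
    except TopicAlreadyExistsError:
        pass
```

The list-then-create approach used below avoids relying on exception types, at the cost of one extra round trip to the broker.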
simulation_server/server/main.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/simulation_server/server/main.py b/simulation_server/server/main.py index aea77d9..86900f9 100644 --- a/simulation_server/server/main.py +++ b/simulation_server/server/main.py @@ -50,14 +50,19 @@ async def lifespan(api: FastAPI): if settings.env == 'dev': kafka_admin = get_kafka_admin() - kafka_admin.create_topics([ - NewTopic("svc-event-exadigit-sim", 1, 1), - NewTopic("svc-ts-exadigit-schedulersimsystem", 1, 1), - NewTopic("svc-event-exadigit-schedulersimjob", 1, 1), - NewTopic("svc-ts-exadigit-coolingsimcdu", 1, 1), - NewTopic("svc-ts-exadigit-coolingsimcep", 1, 1), - NewTopic("svc-ts-exadigit-jobpowerhistory", 1, 1), - ]) + existing_topics = set(kafka_admin.list_topics()) + new_topics = [ + "svc-event-exadigit-sim", + "svc-ts-exadigit-schedulersimsystem", + "svc-event-exadigit-schedulersimjob", + "svc-ts-exadigit-coolingsimcdu", + "svc-ts-exadigit-coolingsimcep", + "svc-ts-exadigit-jobpowerhistory", + ] + for topic in new_topics: + if topic not in existing_topics: + logger.info(f"Creating kafka topic {topic}") + kafka_admin.create_topics([NewTopic(topic, 1, 1)]) druid_ingests_dir = Path(__file__).parent.parent.parent.resolve() / 'druid_ingests' ingests = [ -- GitLab From c0f11847e0642a79ddd7849f4aad4064bb6957e2 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Wed, 1 Oct 2025 11:22:57 -0400 Subject: [PATCH 26/77] Dockerfile tweaks --- Dockerfile | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index b61b15a..f17fb55 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM python:3.12 +FROM python:3.12.11 RUN apt-get update \ && apt-get install git libsnappy-dev \ @@ -27,5 +27,4 @@ RUN uv pip install --system -e . 
# Re-install RAPS as editable (TODO: RAPS currently doesn't work in non-editable mode) RUN uv pip install --system -e ./raps -# CMD ["python", "-m", "simulation_server.simulation.main"] -# CMD ["python", "-m", "simulation_server.server.main"] +CMD ["python", "-m", "simulation_server.server.main"] -- GitLab From 51a6c81f00e90b3e5dcc670067a5870a27f9a8dc Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Wed, 1 Oct 2025 11:37:30 -0400 Subject: [PATCH 27/77] Name all docker compose volumes --- docker-compose.yml | 39 ++++++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 8800939..2831864 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -2,14 +2,19 @@ # Docker UI will be available from http://localhost:8888 volumes: - metadata_data: {} - middle_var: {} - historical_var: {} - broker_var: {} - coordinator_var: {} - router_var: {} + postgres_data: {} + zookeeper_data: {} + zookeeper_datalog: {} + zookeeper_logs: {} + druid_middlemanager_var: {} + druid_historical_var: {} + druid_broker_var: {} + druid_coordinator_var: {} + druid_router_var: {} druid_shared: {} - + kafka_secrets: {} + kafka_config: {} + kafka_data: {} services: postgres: @@ -18,7 +23,7 @@ services: # ports: # - "5432:5432" volumes: - - metadata_data:/var/lib/postgresql/data + - postgres_data:/var/lib/postgresql/data environment: - POSTGRES_PASSWORD=FoolishPassword - POSTGRES_USER=druid @@ -31,13 +36,17 @@ services: # - "2181:2181" environment: - ZOO_MY_ID=1 + volumes: + - zookeeper_data:/data + - zookeeper_datalog:/datalog + - zookeeper_logs:/logs druid-coordinator: image: apache/druid:34.0.0 container_name: druid-coordinator volumes: - druid_shared:/opt/shared - - coordinator_var:/opt/druid/var + - druid_coordinator_var:/opt/druid/var - ./data:/data depends_on: - zookeeper @@ -57,7 +66,7 @@ services: image: apache/druid:34.0.0 container_name: druid-broker volumes: - - broker_var:/opt/druid/var + - druid_broker_var:/opt/druid/var - ./data:/data depends_on: - zookeeper @@ -79,7 +88,7 @@ services: container_name: druid-historical volumes: - druid_shared:/opt/shared - - historical_var:/opt/druid/var + - druid_historical_var:/opt/druid/var - ./data:/data depends_on: - zookeeper @@ -101,7 +110,7 @@ services: container_name: druid-middlemanager volumes: - druid_shared:/opt/shared - - middle_var:/opt/druid/var + - druid_middlemanager_var:/opt/druid/var - ./data:/data depends_on: - zookeeper @@ -119,7 +128,7 @@ services: image: apache/druid:34.0.0 container_name: druid-router volumes: - - router_var:/opt/druid/var + - druid_router_var:/opt/druid/var # - ./data:/data depends_on: - zookeeper @@ -146,6 +155,10 @@ services: interval: 15s retries: 20 timeout: 3s + volumes: + - kafka_secrets:/etc/kafka/secrets + - kafka_config:/mnt/shared/config + - kafka_data:/var/lib/kafka/data environment: # Overriding any configs wipes the defaults, so most of this is copied from /opt/kafka/config/server.properties - KAFKA_PROCESS_ROLES=broker,controller -- GitLab From a055391b82458facf81a3a19e81fe69803af6c45 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Wed, 1 Oct 2025 11:49:08 -0400 Subject: [PATCH 28/77] Build in docker-compose.yml --- docker-compose.yml | 2 ++ scripts/launch_local.sh | 9 ++------- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 2831864..e4d175d 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -186,7 +186,9 @@ services: - 
KAFKA_LOG_RETENTION_CHECK_INTERVAL_MS=300000 simulation-server: + pull_policy: build image: exadigit-simulation-server + build: . container_name: simulation-server command: ["python", "-m", "simulation_server.server.main"] ports: diff --git a/scripts/launch_local.sh b/scripts/launch_local.sh index bf04f96..e786a37 100755 --- a/scripts/launch_local.sh +++ b/scripts/launch_local.sh @@ -1,14 +1,9 @@ #!/bin/bash -# Launch local version set -e # Exit if any commmand fails - -BASE_DIR=$(realpath $(dirname "${BASH_SOURCE[0]}")/..) -cd "$BASE_DIR" - -docker build -t exadigit-simulation-server:latest -f Dockerfile . +cd $(realpath $(dirname "${BASH_SOURCE[0]}")/..) # trap 'docker compose down' SIGINT SIGTERM EXIT -docker stop simulation-server >/dev/null 2>&1 || true +docker compose down docker compose up -d docker compose logs -f --no-log-prefix simulation-server -- GitLab From 4043ee335620d9739d1c4c32f1787b3a2ef42890 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Wed, 1 Oct 2025 12:02:45 -0400 Subject: [PATCH 29/77] Add healthcheck to docker-compose --- docker-compose.yml | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index e4d175d..33c7e83 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -60,7 +60,9 @@ services: healthcheck: test: ["CMD-SHELL", "wget -q -O - http://localhost:8081/status/health || exit 1"] interval: 10s - retries: 10 + retries: 3 + start_interval: 1s + start_period: 5m druid-broker: image: apache/druid:34.0.0 @@ -81,7 +83,9 @@ services: healthcheck: test: ["CMD-SHELL", "wget -q -O - http://localhost:8082/druid/broker/v1/readiness || exit 1"] interval: 10s - retries: 10 + retries: 3 + start_interval: 1s + start_period: 5m druid-historical: image: apache/druid:34.0.0 @@ -103,7 +107,9 @@ services: healthcheck: test: ["CMD-SHELL", "wget -q -O - http://localhost:8083/druid/historical/v1/readiness || exit 1"] interval: 10s - retries: 10 + retries: 3 + start_interval: 1s + start_period: 5m druid-middlemanager: image: apache/druid:34.0.0 @@ -143,7 +149,9 @@ services: healthcheck: test: ["CMD-SHELL", "wget -q -O - http://localhost:8888/status/health || exit 1"] interval: 10s - retries: 10 + retries: 3 + start_interval: 1s + start_period: 5m kafka: image: apache/kafka:3.7.1 @@ -152,9 +160,10 @@ services: # - 9092:9092 healthcheck: test: ["CMD-SHELL", "/opt/kafka/bin/kafka-topics.sh --bootstrap-server kafka:9092 --list || exit 1"] - interval: 15s - retries: 20 - timeout: 3s + interval: 10s + retries: 3 + start_interval: 1s + start_period: 5m volumes: - kafka_secrets:/etc/kafka/secrets - kafka_config:/mnt/shared/config @@ -214,3 +223,9 @@ services: - EXADIGIT_ALLOW_ORIGINS=["*"] - DRUID_SERVICE_URL=http://druid-router:8888 - KAFKA_BOOTSTRAP=kafka:9092 + healthcheck: + test: ["CMD-SHELL", "wget -q -O - http://localhost:8080/openapi.json || exit 1"] + interval: 10s + retries: 3 + start_interval: 1s + start_period: 1m -- GitLab From 6bbe08a6d4e39c6e3a11b333fb60216388250f7b Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Wed, 1 Oct 2025 12:18:22 -0400 Subject: [PATCH 30/77] Avoid rebuild on README update --- Dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index f17fb55..4ab22be 100644 --- a/Dockerfile +++ b/Dockerfile @@ -17,7 +17,8 @@ RUN uv pip install --system -r /app/raps/pyproject.toml # Install server dependencies (including raps) for caching COPY raps/ /app/raps/ COPY pyproject.toml /app/ -COPY README.md /app/ +# pip install expects 
README to exist +RUN touch /app/README.md RUN uv pip install --system -r /app/pyproject.toml # Install simulation server -- GitLab From 60eceb48c7c8e675d6d5128a04126e35ba79552f Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Wed, 1 Oct 2025 12:19:35 -0400 Subject: [PATCH 31/77] Use docker compose directly to launch service --- README.md | 25 ++++++++++++++++++++----- scripts/launch_local.sh | 9 --------- 2 files changed, 20 insertions(+), 14 deletions(-) delete mode 100755 scripts/launch_local.sh diff --git a/README.md b/README.md index 18f0d36..e8a67ca 100644 --- a/README.md +++ b/README.md @@ -26,15 +26,30 @@ FMU models aren't currently publicly available.) ## Running locally To run a local version of the server run ```bash -./scripts/launch_local.sh +docker compose up --wait ``` The server will be hosted on http://localhost:8080 -You'll need at least 16 GiB of RAM, preferably 32 GiB for druid to run smoothly. +You'll need at least 16 GiB of RAM, preferably 32 GiB for druid and RAPS to run smoothly. -If you want to run replay data locally, you'll need to download the datasets (see ./scripts/fetch.sh) -and then ingest them in Druid. After launching, you can access the Druid UI at http://localhost:8888 -and submit druid ingests for the system you want. +If you want to run replay data locally, you'll need to download the datasets and then ingest them in +Druid. You can fetch the datasets with `./scripts/fetch.sh` and submit the druid ingests for them +under `./druid_ingests` using the Druid UI at http://localhost:8888. + +View the server logs with: +```bash +docker compose logs -f --no-log-prefix simulation-server +``` + +To shut down the server run: +```bash +docker compose down +``` + +Use this if you want to wipe all the database data as well: +```bash +docker compose down --volumes +``` ## Deploying To deploy the server, run diff --git a/scripts/launch_local.sh b/scripts/launch_local.sh deleted file mode 100755 index e786a37..0000000 --- a/scripts/launch_local.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash -set -e # Exit if any commmand fails -cd $(realpath $(dirname "${BASH_SOURCE[0]}")/..) 
- -# trap 'docker compose down' SIGINT SIGTERM EXIT - -docker compose down -docker compose up -d -docker compose logs -f --no-log-prefix simulation-server -- GitLab From 70448860789d90535aecc24839dd2664bb698e5a Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Wed, 1 Oct 2025 16:32:57 -0400 Subject: [PATCH 32/77] List systems --- simulation_server/models/output.py | 1 + simulation_server/server/endpoints.py | 9 +++++++-- simulation_server/server/service.py | 12 +++++++++++- 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/simulation_server/models/output.py b/simulation_server/models/output.py index fb1c619..91ed14c 100644 --- a/simulation_server/models/output.py +++ b/simulation_server/models/output.py @@ -255,6 +255,7 @@ COOLING_CEP_FIELD_SELECTORS = { class SystemInfo(BaseModel): + name: str peak_flops: float peak_power: float g_flops_w_peak: float diff --git a/simulation_server/server/endpoints.py b/simulation_server/server/endpoints.py index e00d833..a892e7f 100644 --- a/simulation_server/server/endpoints.py +++ b/simulation_server/server/endpoints.py @@ -17,7 +17,7 @@ from .config import AppDeps from .service import ( run_simulation, query_sims, query_cooling_sim_cdu, query_scheduler_sim_jobs, query_scheduler_sim_system, query_scheduler_sim_power_history, query_cooling_sim_cep, - get_system_info, + get_systems, get_system_info, ) router = APIRouter(tags=["simulation"]) @@ -212,6 +212,11 @@ def scheduler_system(*, return result -@router.get("/system-info/{system}", response_model=SystemInfo) +@router.get("/system/list", response_model=list[SystemInfo]) +def system_list(): + return get_systems() + + +@router.get("/system/{system}", response_model=SystemInfo) def system_info(system: str): return get_system_info(system = system) diff --git a/simulation_server/server/service.py b/simulation_server/server/service.py index 161ac91..df616f9 100644 --- a/simulation_server/server/service.py +++ b/simulation_server/server/service.py @@ -698,11 +698,21 @@ def build_scheduler_sim_power_history_query(*, ) +@functools.cache +def get_systems(): + from raps.system_config import list_systems + return [get_system_info(s) for s in list_systems()] + + @functools.cache def get_system_info(system: str): from raps.system_config import list_systems from raps import Engine, SingleSimConfig + from raps.stats import get_gauge_limits if system not in list_systems(): raise HTTPException(status_code=404, detail=f"System {system} not found") engine = Engine(SingleSimConfig(system = system)) - return engine.get_gauge_limits() + return { + "name": system, + **get_gauge_limits(engine), + } -- GitLab From a308734a739657463ac99cbe97144e53ff27b132 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Thu, 2 Oct 2025 11:45:11 -0400 Subject: [PATCH 33/77] Add replay flag --- simulation_server/models/sim.py | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/simulation_server/models/sim.py b/simulation_server/models/sim.py index 1ace96a..d418088 100644 --- a/simulation_server/models/sim.py +++ b/simulation_server/models/sim.py @@ -2,9 +2,9 @@ from __future__ import annotations from typing import Optional, Literal, Annotated as A import json from pathlib import Path -from pydantic import AwareDatetime, Field, model_validator +from pydantic import AwareDatetime, Field, model_validator, BeforeValidator from raps import SingleSimConfig -from raps.utils import AutoAwareDatetime +from raps.utils import AutoAwareDatetime, ResolvedPath from .base import BaseModel from ..util.misc 
import omit @@ -100,6 +100,12 @@ class ServerSimConfig(SingleSimConfig): start: AutoAwareDatetime # make start required """ Start of the simulation """ + replay: A[list[ResolvedPath] | None, + BeforeValidator(lambda r: ['database'] if r else None, bool)] = None + """ Whether to enable job replay. Pulls data from the database """ + # RAPS replay expects a list of paths, but that's not relevant when we are pulling the data from + # the database. We accept true/false as input and just put a dummy value in for the list. + def __init__(self, /, **data): # Override context to set base_path RAPS_PATH = (Path(__file__) / '../../../raps').resolve() @@ -112,9 +118,13 @@ class ServerSimConfig(SingleSimConfig): } ) - @model_validator(mode = "after") - def _validate_server_sim_config(self): + @model_validator(mode = "before") + def _validate_server_sim_config(cls, data): + data = {**data} # Force these options regardless of input - self.noui = True - self.output = "none" - return self + data['noui'] = True + data['output'] = "none" + if data.get("workload") == "replay" and 'replay' not in data: + data['replay'] = True + + return data -- GitLab From 5af437a58c1cec070147496e46eb923463fce449 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Fri, 3 Oct 2025 14:00:53 -0400 Subject: [PATCH 34/77] Update comment --- simulation_server/models/output.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/simulation_server/models/output.py b/simulation_server/models/output.py index 91ed14c..f96f3c0 100644 --- a/simulation_server/models/output.py +++ b/simulation_server/models/output.py @@ -27,7 +27,7 @@ class SchedulerSimJob(BaseModel): nodes: Optional[list[str]] = None """ - The nodes the job is running on ['x2307c3s0b1', 'x2408c5s2b1'] + The nodes the job is running on e.g. ['x2307c3s0b1', 'x2408c5s2b1'] """ # Removing these for now, they are constant and just what you set in the input. 
-- GitLab From e950d29c3c679df214bee6e1a0e476562990ac0f Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Fri, 3 Oct 2025 14:22:20 -0400 Subject: [PATCH 35/77] Some performance improvements --- pyproject.toml | 1 + simulation_server/simulation/simulation.py | 64 +++++++++++++--------- 2 files changed, 39 insertions(+), 26 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index edd5645..1868618 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,6 +29,7 @@ dependencies = [ "elasticsearch==7.13.4", "elasticsearch-dbapi==0.2.11", "requests==2.32.5", + "orjson==3.11.3", "raps@{root:uri}/raps", ] diff --git a/simulation_server/simulation/simulation.py b/simulation_server/simulation/simulation.py index cf306cf..30956e9 100644 --- a/simulation_server/simulation/simulation.py +++ b/simulation_server/simulation/simulation.py @@ -1,8 +1,10 @@ from typing import NamedTuple from datetime import datetime, timedelta -import functools +import functools, itertools +import orjson from loguru import logger from raps import Engine +from raps.job import Job as RapsJob from raps.stats import get_engine_stats, get_job_stats from ..models.sim import ServerSimConfig from ..models.output import ( @@ -23,9 +25,20 @@ class SimTickOutput(NamedTuple): power_history: list[SchedulerSimJobPowerHistory] -def get_job_state_hash(job: SchedulerSimJob): +def get_job_state_hash(job: RapsJob): """ Return string that can be used to check if any meaningful state changed """ - return job.model_dump_json(exclude={"time_snapshot"}) + return orjson.dumps([ + str(job.id), + job.name, + job.nodes_required, + job.submit_time, + job.time_limit, + job.start_time, + job.end_time, + job.current_state.name, + # Node list shouldn't change once set so just do len instead of serializing the large list + len(job.scheduled_nodes) if job.scheduled_nodes else None, + ]) def run_simulation(sim_config: ServerSimConfig): @@ -50,7 +63,7 @@ def run_simulation(sim_config: ServerSimConfig): # Keep record of how many power history steps we've emitted for each job power_history_counts: dict[int, int] = {} - prev_jobs: dict[str, str] = {} + prev_job_hashes: set[str] = set() for tick in engine.run_simulation(): timestamp: datetime = _offset_to_time(tick.current_timestep) @@ -89,36 +102,35 @@ def run_simulation(sim_config: ServerSimConfig): ))] scheduler_sim_jobs: list[SchedulerSimJob] = [] - curr_jobs = {} - tick_jobs = tick.queue + tick.running + tick.completed + tick.killed + curr_job_hashes = set() + tick_jobs = itertools.chain(tick.queue, tick.running, tick.completed, tick.killed) for job in tick_jobs: time_end = _offset_to_time(job.end_time) # end_time is set to its planned end once its scheduled. Set it to None for unfinished jobs here if time_end is not None and (job.start_time is None or time_end > timestamp): time_end = None - parsed_job = SchedulerSimJob.model_validate(dict( - job_id = str(job.id), - name = job.name, - node_count = job.nodes_required, - time_snapshot = timestamp, - time_submission = _offset_to_time(job.submit_time), - time_limit = job.time_limit, - time_start = _offset_to_time(job.start_time), - time_end = time_end, - state_current = JobStateEnum(job.current_state.name), - nodes = _parse_nodes(tuple(job.scheduled_nodes)) if job.scheduled_nodes else None, - # How does the new job.power attribute work? Is it total_energy? - # Or just the current wattage? 
- # power = job.power, - )) - job_state_hash = get_job_state_hash(parsed_job) - + job_state_hash = get_job_state_hash(job) # Output jobs if something other than time_snapshot changed - if is_last_tick or prev_jobs.get(parsed_job.job_id) != job_state_hash: + if is_last_tick or job_state_hash not in prev_job_hashes: + parsed_job = SchedulerSimJob.model_validate({ + "job_id": str(job.id), + "name": job.name, + "node_count": job.nodes_required, + "time_snapshot": timestamp, + "time_submission": _offset_to_time(job.submit_time), + "time_limit": job.time_limit, + "time_start": _offset_to_time(job.start_time), + "time_end": time_end, + "state_current": JobStateEnum(job.current_state.name), + "nodes": _parse_nodes(tuple(job.scheduled_nodes)) if job.scheduled_nodes else None, + # How does the new job.power attribute work? Is it total_energy? + # Or just the current wattage? + # power = job.power, + }) scheduler_sim_jobs.append(parsed_job) - curr_jobs[parsed_job.job_id] = job_state_hash - prev_jobs = curr_jobs + curr_job_hashes.add(job_state_hash) + prev_job_hashes = curr_job_hashes power_history: list[SchedulerSimJobPowerHistory] = [] for job in tick_jobs: -- GitLab From e0b1bb77c9251fddf54a91bbec271b2f6fcd812c Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Fri, 3 Oct 2025 15:03:28 -0400 Subject: [PATCH 36/77] Rename inner methods --- simulation_server/simulation/simulation.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/simulation_server/simulation/simulation.py b/simulation_server/simulation/simulation.py index 30956e9..5125737 100644 --- a/simulation_server/simulation/simulation.py +++ b/simulation_server/simulation/simulation.py @@ -49,7 +49,7 @@ def run_simulation(sim_config: ServerSimConfig): # Sample CDU as fast as it is available sample_cooling = timedelta(seconds = 1).total_seconds() - def _offset_to_time(offset): + def offset_to_time(offset): if offset is not None: return engine.start + timedelta(seconds=offset - engine.timestep_start) else: @@ -58,7 +58,7 @@ def run_simulation(sim_config: ServerSimConfig): # Memoized function to convert raps indexes into node names. # Memo increases performance since it gets called on snapshots of the same job multiple times. @functools.lru_cache(maxsize = 65_536) - def _parse_nodes(node_indexes: tuple[int]): + def parse_nodes(node_indexes: tuple[int]): return [engine.telemetry.node_index_to_name(i) for i in node_indexes] # Keep record of how many power history steps we've emitted for each job @@ -66,13 +66,13 @@ def run_simulation(sim_config: ServerSimConfig): prev_job_hashes: set[str] = set() for tick in engine.run_simulation(): - timestamp: datetime = _offset_to_time(tick.current_timestep) + timestamp: datetime = offset_to_time(tick.current_timestep) unix_timestamp = int(timestamp.timestamp()) is_last_tick = (timestamp + timedelta(seconds=1) >= sim_config.end) scheduler_sim_system: list[SchedulerSimSystem] = [] if unix_timestamp % sample_scheduler_sim_system == 0 or is_last_tick: - down_nodes = _parse_nodes(tuple(tick.down_nodes)) + down_nodes = parse_nodes(tuple(tick.down_nodes)) engine_stats = get_engine_stats(engine, fast = True) job_stats = get_job_stats(engine) @@ -105,7 +105,7 @@ def run_simulation(sim_config: ServerSimConfig): curr_job_hashes = set() tick_jobs = itertools.chain(tick.queue, tick.running, tick.completed, tick.killed) for job in tick_jobs: - time_end = _offset_to_time(job.end_time) + time_end = offset_to_time(job.end_time) # end_time is set to its planned end once its scheduled. 
Set it to None for unfinished jobs here if time_end is not None and (job.start_time is None or time_end > timestamp): time_end = None @@ -118,12 +118,12 @@ def run_simulation(sim_config: ServerSimConfig): "name": job.name, "node_count": job.nodes_required, "time_snapshot": timestamp, - "time_submission": _offset_to_time(job.submit_time), + "time_submission": offset_to_time(job.submit_time), "time_limit": job.time_limit, - "time_start": _offset_to_time(job.start_time), + "time_start": offset_to_time(job.start_time), "time_end": time_end, "state_current": JobStateEnum(job.current_state.name), - "nodes": _parse_nodes(tuple(job.scheduled_nodes)) if job.scheduled_nodes else None, + "nodes": parse_nodes(tuple(job.scheduled_nodes)) if job.scheduled_nodes else None, # How does the new job.power attribute work? Is it total_energy? # Or just the current wattage? # power = job.power, -- GitLab From 57f49854e89a4e57f9bb119d26c4c7952623fdd3 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Fri, 3 Oct 2025 15:08:23 -0400 Subject: [PATCH 37/77] More performance improvements --- simulation_server/simulation/simulation.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/simulation_server/simulation/simulation.py b/simulation_server/simulation/simulation.py index 5125737..caf0fd2 100644 --- a/simulation_server/simulation/simulation.py +++ b/simulation_server/simulation/simulation.py @@ -102,6 +102,8 @@ def run_simulation(sim_config: ServerSimConfig): ))] scheduler_sim_jobs: list[SchedulerSimJob] = [] + power_history: list[SchedulerSimJobPowerHistory] = [] + curr_job_hashes = set() tick_jobs = itertools.chain(tick.queue, tick.running, tick.completed, tick.killed) for job in tick_jobs: @@ -130,17 +132,15 @@ def run_simulation(sim_config: ServerSimConfig): }) scheduler_sim_jobs.append(parsed_job) curr_job_hashes.add(job_state_hash) - prev_job_hashes = curr_job_hashes - power_history: list[SchedulerSimJobPowerHistory] = [] - for job in tick_jobs: - if job.id and power_history_counts.get(job.id, 0) < len(job.power_history): + if power_history_counts.get(job.id, 0) < len(job.power_history): power_history.append(SchedulerSimJobPowerHistory( timestamp = timestamp, job_id = str(job.id), power = job.power_history[-1], )) power_history_counts[job.id] = len(job.power_history) + prev_job_hashes = curr_job_hashes cooling_sim_cdus: list[CoolingSimCDU] = [] cooling_sim_cep: list[CoolingSimCEP] = [] -- GitLab From 36f07b37946d0e92f55520ed5cf0ef4a683ee908 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Fri, 3 Oct 2025 16:36:09 -0400 Subject: [PATCH 38/77] More performance improvements --- simulation_server/simulation/simulation.py | 42 +++++++++++++--------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/simulation_server/simulation/simulation.py b/simulation_server/simulation/simulation.py index caf0fd2..2a1f769 100644 --- a/simulation_server/simulation/simulation.py +++ b/simulation_server/simulation/simulation.py @@ -41,13 +41,23 @@ def get_job_state_hash(job: RapsJob): ]) +def snap_sample_rate(desired_rate: int, actual_rate: int): + """ + Returns a sample rate close to desired_rate, but that is still divisible by actual_rate. + E.g. if power is being ticked every 3 seconds, but desired sample rate is 10, round it to 9. 
+ """ + if actual_rate >= desired_rate: + return actual_rate + else: + return int(desired_rate / actual_rate) * actual_rate + + def run_simulation(sim_config: ServerSimConfig): # TODO: replay logic engine = Engine(sim_config) - sample_scheduler_sim_system = timedelta(seconds = 1).total_seconds() - # Sample CDU as fast as it is available - sample_cooling = timedelta(seconds = 1).total_seconds() + sample_system = int(timedelta(seconds = 1).total_seconds()) + sample_power = snap_sample_rate(5, int(sim_config.time_delta.total_seconds())) def offset_to_time(offset): if offset is not None: @@ -71,7 +81,7 @@ def run_simulation(sim_config: ServerSimConfig): is_last_tick = (timestamp + timedelta(seconds=1) >= sim_config.end) scheduler_sim_system: list[SchedulerSimSystem] = [] - if unix_timestamp % sample_scheduler_sim_system == 0 or is_last_tick: + if unix_timestamp % sample_system == 0 or is_last_tick: down_nodes = parse_nodes(tuple(tick.down_nodes)) engine_stats = get_engine_stats(engine, fast = True) job_stats = get_job_stats(engine) @@ -146,19 +156,18 @@ def run_simulation(sim_config: ServerSimConfig): cooling_sim_cep: list[CoolingSimCEP] = [] cooling_sim_cdu_map: dict[int, dict] = {} - if tick.power_df is not None and (unix_timestamp % sample_cooling == 0 or is_last_tick): + if tick.power_df is not None and (is_last_tick or unix_timestamp % sample_power == 0): for i, point in tick.power_df.iterrows(): - cooling_sim_cdu_map[int(point['CDU'])] = dict( - rack_1_power = point['Rack 1'], - rack_2_power = point['Rack 2'], - rack_3_power = point['Rack 3'], - total_power = point['Sum'], - - rack_1_loss = point['Loss 1'], - rack_2_loss = point['Loss 2'], - rack_3_loss = point['Loss 3'], - total_loss = point['Loss'], - ) + cooling_sim_cdu_map[int(point['CDU'])] = { + "rack_1_power": point['Rack 1'], + "rack_2_power": point['Rack 2'], + "rack_3_power": point['Rack 3'], + "total_power": point['Sum'], + "rack_1_loss": point['Loss 1'], + "rack_2_loss": point['Loss 2'], + "rack_3_loss": point['Loss 3'], + "total_loss": point['Loss'], + } if tick.fmu_outputs: # CDU columns are output in the dict with keys like this: @@ -231,4 +240,3 @@ def run_simulation(sim_config: ServerSimConfig): cooling_sim_cep = cooling_sim_cep, power_history = power_history, ) - -- GitLab From 4f06747ae102c9a0581bca74b2ef4057b41df324 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Fri, 3 Oct 2025 17:32:38 -0400 Subject: [PATCH 39/77] Improve IO performance --- simulation_server/server/main.py | 20 +-- simulation_server/simulation/main.py | 61 +++++---- simulation_server/simulation/simulation.py | 142 +++++++++++---------- 3 files changed, 120 insertions(+), 103 deletions(-) diff --git a/simulation_server/server/main.py b/simulation_server/server/main.py index 86900f9..3dc2260 100644 --- a/simulation_server/server/main.py +++ b/simulation_server/server/main.py @@ -51,18 +51,18 @@ async def lifespan(api: FastAPI): if settings.env == 'dev': kafka_admin = get_kafka_admin() existing_topics = set(kafka_admin.list_topics()) + topic_configs = {"compression.type": "snappy"} new_topics = [ - "svc-event-exadigit-sim", - "svc-ts-exadigit-schedulersimsystem", - "svc-event-exadigit-schedulersimjob", - "svc-ts-exadigit-coolingsimcdu", - "svc-ts-exadigit-coolingsimcep", - "svc-ts-exadigit-jobpowerhistory", + NewTopic("svc-event-exadigit-sim", 1, 1, topic_configs = topic_configs), + NewTopic("svc-ts-exadigit-schedulersimsystem", 4, 1, topic_configs = topic_configs), + NewTopic("svc-event-exadigit-schedulersimjob", 2, 1, topic_configs = topic_configs), 
+ NewTopic("svc-ts-exadigit-coolingsimcdu", 4, 1, topic_configs = topic_configs), + NewTopic("svc-ts-exadigit-coolingsimcep", 2, 1, topic_configs = topic_configs), + NewTopic("svc-ts-exadigit-jobpowerhistory", 4, 1, topic_configs = topic_configs), ] - for topic in new_topics: - if topic not in existing_topics: - logger.info(f"Creating kafka topic {topic}") - kafka_admin.create_topics([NewTopic(topic, 1, 1)]) + new_topics = [t for t in new_topics if t.name not in existing_topics] + logger.info(f"Creating kafka topics {', '.join(t.name for t in new_topics)}") + kafka_admin.create_topics(new_topics) druid_ingests_dir = Path(__file__).parent.parent.parent.resolve() / 'druid_ingests' ingests = [ diff --git a/simulation_server/simulation/main.py b/simulation_server/simulation/main.py index 8ecfbbb..d4aa959 100644 --- a/simulation_server/simulation/main.py +++ b/simulation_server/simulation/main.py @@ -1,6 +1,6 @@ """ A script to run the ExaDigiT simulation """ -from typing import Callable -import argparse, os, json +from collections.abc import Iterable +import argparse, os, orjson from pathlib import Path from datetime import datetime, timezone from loguru import logger @@ -10,25 +10,28 @@ from .simulation import run_simulation from ..util.kafka import get_kafka_producer -def write_sim(sim: Sim, writer: Callable[[str, bytes], None]): +def run_simulation_serialized(sim: Sim) -> Iterable[dict[str, list[bytes]]]: sim = sim.model_copy() - def output_rows(topic, rows): - for row in rows: - value = json.dumps({"sim_id": sim.id, **row.model_dump(mode='json')}).encode() - writer(topic, value) + def serialize_rows(rows): + return [ + orjson.dumps({"sim_id": sim.id, **row.model_dump(mode='json')}) + for row in rows + ] - logger.info(f"Starting simulation {sim.model_dump_json()}") + logger.info(f"Starting simulation: {sim.model_dump_json(indent = 4)}") config = ServerSimConfig.model_validate(sim.config) progress_date = sim.start try: for data in run_simulation(config): - output_rows("svc-ts-exadigit-schedulersimsystem", data.scheduler_sim_system) - output_rows("svc-event-exadigit-schedulersimjob", data.scheduler_sim_jobs) - output_rows("svc-ts-exadigit-coolingsimcdu", data.cooling_sim_cdus) - output_rows("svc-ts-exadigit-coolingsimcep", data.cooling_sim_cep) - output_rows("svc-ts-exadigit-jobpowerhistory", data.power_history) + yield { + "svc-ts-exadigit-schedulersimsystem": serialize_rows(data.scheduler_sim_system), + "svc-event-exadigit-schedulersimjob": serialize_rows(data.scheduler_sim_jobs), + "svc-ts-exadigit-coolingsimcdu": serialize_rows(data.cooling_sim_cdus), + "svc-ts-exadigit-coolingsimcep": serialize_rows(data.cooling_sim_cep), + "svc-ts-exadigit-jobpowerhistory": serialize_rows(data.power_history), + } progress_date = data.timestamp if data.timestamp.second == 0: logger.info(f"progress: {data.timestamp.isoformat()} / {sim.end.isoformat()}") @@ -37,33 +40,45 @@ def write_sim(sim: Sim, writer: Callable[[str, bytes], None]): sim.execution_end = datetime.now(timezone.utc) sim.error_messages = str(e) sim.progress_date = progress_date - writer("svc-event-exadigit-sim", sim.serialize_for_druid()) + yield {"svc-event-exadigit-sim": [sim.serialize_for_druid()]} logger.info(f"Simulation {sim.id} failed") raise e sim.state = "success" sim.execution_end = datetime.now(timezone.utc) sim.progress_date = sim.end - writer("svc-event-exadigit-sim", sim.serialize_for_druid()) + yield {"svc-event-exadigit-sim": [sim.serialize_for_druid()]} logger.info(f"Simulation {sim.id} finished") def write_sim_to_kafka(sim: 
Sim): - kafka_producer = get_kafka_producer() - def writer(topic: str, value: bytes): - kafka_producer.send(topic=topic, value=value) + kafka_producer = get_kafka_producer( + linger_ms = 2 * 1000, + batch_size = 65536, + compression_type = "snappy", + ) try: - write_sim(sim, writer=writer) + for data in run_simulation_serialized(sim): + # kafka_producer does its own buffering of output so we don't need to worry about batching + for topic, rows in data.items(): + for row in rows: + kafka_producer.send(topic=topic, value=row) finally: kafka_producer.close() def write_sim_to_disk(sim: Sim, dest: str): Path(dest).mkdir(exist_ok=True) - def writer(topic: str, value: bytes): - with open(Path(dest) / f"{topic}.jsonl", 'ab') as f: - f.write(value + b"\n") - write_sim(sim, writer=writer) + files = {} + try: + for data in run_simulation_serialized(sim): + for topic, rows in data.items(): + if topic not in files: + files[topic] = open(Path(dest) / f"{topic}.jsonl", 'ab') + files[topic].writelines(l + b"\n" for l in rows) + finally: + for file in files.values(): + file.close() if __name__ == "__main__": diff --git a/simulation_server/simulation/simulation.py b/simulation_server/simulation/simulation.py index 2a1f769..5baf090 100644 --- a/simulation_server/simulation/simulation.py +++ b/simulation_server/simulation/simulation.py @@ -56,8 +56,9 @@ def run_simulation(sim_config: ServerSimConfig): # TODO: replay logic engine = Engine(sim_config) - sample_system = int(timedelta(seconds = 1).total_seconds()) + sample_system = 1 sample_power = snap_sample_rate(5, int(sim_config.time_delta.total_seconds())) + sample_cooling = snap_sample_rate(5, int(sim_config.time_delta.total_seconds())) def offset_to_time(offset): if offset is not None: @@ -71,6 +72,12 @@ def run_simulation(sim_config: ServerSimConfig): def parse_nodes(node_indexes: tuple[int]): return [engine.telemetry.node_index_to_name(i) for i in node_indexes] + @functools.lru_cache(maxsize = 16384) + def cdu_info(cdu_index: int): + cdu_name = engine.telemetry.cdu_index_to_name(cdu_index) + row, col = engine.telemetry.cdu_pos(cdu_index) + return cdu_name, row, col + # Keep record of how many power history steps we've emitted for each job power_history_counts: dict[int, int] = {} prev_job_hashes: set[str] = set() @@ -86,30 +93,30 @@ def run_simulation(sim_config: ServerSimConfig): engine_stats = get_engine_stats(engine, fast = True) job_stats = get_job_stats(engine) - scheduler_sim_system = [SchedulerSimSystem.model_validate(dict( - timestamp = timestamp, - down_nodes = down_nodes, + scheduler_sim_system = [SchedulerSimSystem.model_validate({ + "timestamp": timestamp, + "down_nodes": down_nodes, # TODO: Update sc.get_stats to return more easily parsable data - num_samples = engine_stats['num_samples'], - - jobs_completed = job_stats['jobs_completed'], - jobs_running = len(job_stats['jobs_still_running']), - jobs_pending = len(job_stats['jobs_still_in_queue']), - - throughput = job_stats['throughput'], - average_power = engine_stats['average_power'] * 1_000_000, - min_loss = engine_stats['min_loss'] * 1_000_000, - average_loss = engine_stats['average_loss'] * 1_000_000, - max_loss = engine_stats['max_loss'] * 1_000_000, - system_power_efficiency = engine_stats['system_power_efficiency'], - total_energy_consumed = engine_stats['total_energy_consumed'], - carbon_emissions = engine_stats['carbon_emissions'], - total_cost = engine_stats['total_cost'], - - p_flops = tick.p_flops, - g_flops_w = tick.g_flops_w, - system_util = tick.system_util, - ))] + 
"num_samples": engine_stats['num_samples'], + + "jobs_completed": job_stats['jobs_completed'], + "jobs_running": len(job_stats['jobs_still_running']), + "jobs_pending": len(job_stats['jobs_still_in_queue']), + + "throughput": job_stats['throughput'], + "average_power": engine_stats['average_power'] * 1_000_000, + "min_loss": engine_stats['min_loss'] * 1_000_000, + "average_loss": engine_stats['average_loss'] * 1_000_000, + "max_loss": engine_stats['max_loss'] * 1_000_000, + "system_power_efficiency": engine_stats['system_power_efficiency'], + "total_energy_consumed": engine_stats['total_energy_consumed'], + "carbon_emissions": engine_stats['carbon_emissions'], + "total_cost": engine_stats['total_cost'], + + "p_flops": tick.p_flops, + "g_flops_w": tick.g_flops_w, + "system_util": tick.system_util, + })] scheduler_sim_jobs: list[SchedulerSimJob] = [] power_history: list[SchedulerSimJobPowerHistory] = [] @@ -169,7 +176,7 @@ def run_simulation(sim_config: ServerSimConfig): "total_loss": point['Loss'], } - if tick.fmu_outputs: + if tick.fmu_outputs and (is_last_tick or unix_timestamp % sample_cooling == 0): # CDU columns are output in the dict with keys like this: # "simulator[1].datacenter[1].computeBlock[1].cdu[1].summary.m_flow_prim" # "simulator[1].datacenter[1].computeBlock[1].cdu[1].summary.V_flow_prim_GPM" @@ -181,55 +188,50 @@ def run_simulation(sim_config: ServerSimConfig): cdus_data = fmu_data['simulator'][1]['datacenter'][1]['computeBlock'] for cdu, cdu_data in cdus_data.items(): cdu_data = cdu_data['cdu'][1]['summary'] - cooling_sim_cdu_map[cdu].update( - work_done_by_cdup = cdu_data['W_flow_CDUP_kW'], - rack_return_temp = cdu_data['T_sec_r_C'], - rack_supply_temp = cdu_data['T_sec_s_C'], - rack_supply_pressure = cdu_data['p_sec_s_psig'], - rack_return_pressure = cdu_data['p_sec_r_psig'], - rack_flowrate = cdu_data['V_flow_sec_GPM'], - facility_return_temp = cdu_data["T_prim_r_C"], - facility_supply_temp = cdu_data['T_prim_s_C'], - facility_supply_pressure = cdu_data['p_prim_s_psig'], - facility_return_pressure = cdu_data['p_prim_r_psig'], - facility_flowrate = cdu_data['V_flow_prim_GPM'], - ) + cooling_sim_cdu_map[cdu] = { + **cooling_sim_cdu_map.get(cdu, {}), + "work_done_by_cdup": cdu_data['W_flow_CDUP_kW'], + "rack_return_temp": cdu_data['T_sec_r_C'], + "rack_supply_temp": cdu_data['T_sec_s_C'], + "rack_supply_pressure": cdu_data['p_sec_s_psig'], + "rack_return_pressure": cdu_data['p_sec_r_psig'], + "rack_flowrate": cdu_data['V_flow_sec_GPM'], + "facility_return_temp": cdu_data["T_prim_r_C"], + "facility_supply_temp": cdu_data['T_prim_s_C'], + "facility_supply_pressure": cdu_data['p_prim_s_psig'], + "facility_return_pressure": cdu_data['p_prim_r_psig'], + "facility_flowrate": cdu_data['V_flow_prim_GPM'], + } cep_data = fmu_data['simulator'][1]['centralEnergyPlant'][1] - cooling_sim_cep = [CoolingSimCEP.model_validate(dict( - timestamp = timestamp, - htw_flowrate = cep_data['hotWaterLoop'][1]['summary']['V_flow_htw_GPM'], - ctw_flowrate = cep_data['coolingTowerLoop'][1]['summary']['V_flow_ctw_GPM'], - htw_return_pressure = cep_data['hotWaterLoop'][1]['summary']['p_fac_htw_r_psig'], - htw_supply_pressure = cep_data['hotWaterLoop'][1]['summary']['p_fac_htw_s_psig'], - ctw_return_pressure = cep_data['coolingTowerLoop'][1]['summary']['p_fac_ctw_r_psig'], - ctw_supply_pressure = cep_data['coolingTowerLoop'][1]['summary']['p_fac_ctw_s_psig'], - htw_return_temp = cep_data['hotWaterLoop'][1]['summary']['T_fac_htw_r_C'], - htw_supply_temp = 
cep_data['hotWaterLoop'][1]['summary']['T_fac_htw_s_C'], - ctw_return_temp = cep_data['coolingTowerLoop'][1]['summary']['T_fac_ctw_r_C'], - ctw_supply_temp = cep_data['coolingTowerLoop'][1]['summary']['T_fac_ctw_s_C'], - power_consumption_htwps = cep_data['hotWaterLoop'][1]['summary']['W_flow_HTWP_kW'], - power_consumption_ctwps = cep_data['coolingTowerLoop'][1]['summary']['W_flow_CTWP_kW'], - power_consumption_fan = cep_data['coolingTowerLoop'][1]['summary']['W_flow_CT_kW'], - htwp_speed = cep_data['hotWaterLoop'][1]['summary']['N_HTWP'], - nctwps_staged = cep_data['coolingTowerLoop'][1]['summary']['n_CTWPs'], - nhtwps_staged = cep_data['hotWaterLoop'][1]['summary']['n_HTWPs'], - pue_output = fmu_data['pue'], - nehxs_staged = cep_data['hotWaterLoop'][1]['summary']['n_EHXs'], - ncts_staged = cep_data['coolingTowerLoop'][1]['summary']['n_CTs'], - facility_return_temp = cep_data['hotWaterLoop'][1]['summary']['T_fac_htw_r_C'], - cdu_loop_bypass_flowrate = fmu_data['simulator'][1]['datacenter'][1]['summary']['V_flow_bypass_GPM'], - ))] + cooling_sim_cep = [CoolingSimCEP.model_validate({ + "timestamp": timestamp, + "htw_flowrate": cep_data['hotWaterLoop'][1]['summary']['V_flow_htw_GPM'], + "ctw_flowrate": cep_data['coolingTowerLoop'][1]['summary']['V_flow_ctw_GPM'], + "htw_return_pressure": cep_data['hotWaterLoop'][1]['summary']['p_fac_htw_r_psig'], + "htw_supply_pressure": cep_data['hotWaterLoop'][1]['summary']['p_fac_htw_s_psig'], + "ctw_return_pressure": cep_data['coolingTowerLoop'][1]['summary']['p_fac_ctw_r_psig'], + "ctw_supply_pressure": cep_data['coolingTowerLoop'][1]['summary']['p_fac_ctw_s_psig'], + "htw_return_temp": cep_data['hotWaterLoop'][1]['summary']['T_fac_htw_r_C'], + "htw_supply_temp": cep_data['hotWaterLoop'][1]['summary']['T_fac_htw_s_C'], + "ctw_return_temp": cep_data['coolingTowerLoop'][1]['summary']['T_fac_ctw_r_C'], + "ctw_supply_temp": cep_data['coolingTowerLoop'][1]['summary']['T_fac_ctw_s_C'], + "power_consumption_htwps": cep_data['hotWaterLoop'][1]['summary']['W_flow_HTWP_kW'], + "power_consumption_ctwps": cep_data['coolingTowerLoop'][1]['summary']['W_flow_CTWP_kW'], + "power_consumption_fan": cep_data['coolingTowerLoop'][1]['summary']['W_flow_CT_kW'], + "htwp_speed": cep_data['hotWaterLoop'][1]['summary']['N_HTWP'], + "nctwps_staged": cep_data['coolingTowerLoop'][1]['summary']['n_CTWPs'], + "nhtwps_staged": cep_data['hotWaterLoop'][1]['summary']['n_HTWPs'], + "pue_output": fmu_data['pue'], + "nehxs_staged": cep_data['hotWaterLoop'][1]['summary']['n_EHXs'], + "ncts_staged": cep_data['coolingTowerLoop'][1]['summary']['n_CTs'], + "facility_return_temp": cep_data['hotWaterLoop'][1]['summary']['T_fac_htw_r_C'], + "cdu_loop_bypass_flowrate": fmu_data['simulator'][1]['datacenter'][1]['summary']['V_flow_bypass_GPM'], + })] for cdu_index, cdu_data in cooling_sim_cdu_map.items(): - cdu_name = engine.telemetry.cdu_index_to_name(cdu_index) - row, col = engine.telemetry.cdu_pos(cdu_index) - cdu_data.update( - timestamp = timestamp, - name = cdu_name, - row = row, - col = col, - ) + cdu_name, row, col = cdu_info(cdu_index) + cdu_data.update(timestamp = timestamp, name = cdu_name, row = row, col = col) cooling_sim_cdus.append(CoolingSimCDU.model_validate(cdu_data)) yield SimTickOutput( -- GitLab From cbaa998933ebfc5f357b2bc3935c77df76217676 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Mon, 6 Oct 2025 11:15:49 -0400 Subject: [PATCH 40/77] Use confluent_kafka client This is significantly faster --- pyproject.toml | 2 +- simulation_server/server/config.py | 4 +-- 
simulation_server/server/main.py | 22 ++++++++-------- simulation_server/server/service.py | 6 ++--- simulation_server/simulation/main.py | 20 ++++++++------- simulation_server/util/kafka.py | 38 ++++++++++++++-------------- 6 files changed, 46 insertions(+), 46 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 1868618..09e35f8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,7 +19,6 @@ dependencies = [ "loguru==0.7.3", "SQLAlchemy==2.0.43", "pydruid==0.6.9", - "kafka-python==2.2.15", "python-snappy==0.7.3", "jsonpath-ng==1.7.0", "fastapi==0.116.2", @@ -30,6 +29,7 @@ dependencies = [ "elasticsearch-dbapi==0.2.11", "requests==2.32.5", "orjson==3.11.3", + "confluent_kafka==2.11.1", "raps@{root:uri}/raps", ] diff --git a/simulation_server/server/config.py b/simulation_server/server/config.py index b33aacd..c25c02b 100644 --- a/simulation_server/server/config.py +++ b/simulation_server/server/config.py @@ -4,7 +4,7 @@ from pydantic import StringConstraints from pydantic_settings import BaseSettings, SettingsConfigDict from fastapi import Depends import sqlalchemy as sqla -from kafka import KafkaProducer +from confluent_kafka import Producer from ..util.kafka import get_kafka_producer as _get_kafka_producer from ..util.druid import get_druid_engine as _get_druid_engine @@ -43,7 +43,7 @@ DruidDep = A[sqla.Engine, Depends(get_druid_engine)] @functools.cache def get_kafka_producer(): return _get_kafka_producer() -KafkaProducerDep = A[KafkaProducer, Depends(get_kafka_producer)] +KafkaProducerDep = A[Producer, Depends(get_kafka_producer)] class AppDeps_(NamedTuple): diff --git a/simulation_server/server/main.py b/simulation_server/server/main.py index 3dc2260..da1fcfb 100644 --- a/simulation_server/server/main.py +++ b/simulation_server/server/main.py @@ -16,7 +16,7 @@ from ..util.druid import submit_ingest from .service import cleanup_jobs from .config import AppSettings, get_app_settings, get_druid_engine, get_kafka_producer from ..util.kafka import get_kafka_admin -from kafka.admin import NewTopic +from confluent_kafka.admin import NewTopic settings = AppSettings() @@ -50,18 +50,18 @@ async def lifespan(api: FastAPI): if settings.env == 'dev': kafka_admin = get_kafka_admin() - existing_topics = set(kafka_admin.list_topics()) - topic_configs = {"compression.type": "snappy"} + existing_topics = {t.topic for t in kafka_admin.list_topics().topics} + topic_config = {"compression.type": "snappy"} new_topics = [ - NewTopic("svc-event-exadigit-sim", 1, 1, topic_configs = topic_configs), - NewTopic("svc-ts-exadigit-schedulersimsystem", 4, 1, topic_configs = topic_configs), - NewTopic("svc-event-exadigit-schedulersimjob", 2, 1, topic_configs = topic_configs), - NewTopic("svc-ts-exadigit-coolingsimcdu", 4, 1, topic_configs = topic_configs), - NewTopic("svc-ts-exadigit-coolingsimcep", 2, 1, topic_configs = topic_configs), - NewTopic("svc-ts-exadigit-jobpowerhistory", 4, 1, topic_configs = topic_configs), + NewTopic("svc-event-exadigit-sim", 1, 1, config = topic_config), + NewTopic("svc-ts-exadigit-schedulersimsystem", 4, 1, config = topic_config), + NewTopic("svc-event-exadigit-schedulersimjob", 2, 1, config = topic_config), + NewTopic("svc-ts-exadigit-coolingsimcdu", 4, 1, config = topic_config), + NewTopic("svc-ts-exadigit-coolingsimcep", 2, 1, config = topic_config), + NewTopic("svc-ts-exadigit-jobpowerhistory", 4, 1, config = topic_config), ] - new_topics = [t for t in new_topics if t.name not in existing_topics] - logger.info(f"Creating kafka topics {', '.join(t.name for t in 
new_topics)}") + new_topics = [t for t in new_topics if t.topic not in existing_topics] + logger.info(f"Creating kafka topics {', '.join(t.topic for t in new_topics)}") kafka_admin.create_topics(new_topics) druid_ingests_dir = Path(__file__).parent.parent.parent.resolve() / 'druid_ingests' diff --git a/simulation_server/server/service.py b/simulation_server/server/service.py index df616f9..0e20615 100644 --- a/simulation_server/server/service.py +++ b/simulation_server/server/service.py @@ -69,7 +69,7 @@ def run_simulation(sim_config: ServerSimConfig, deps: AppDeps): config = sim_config.model_dump(mode = 'json'), ) logger.info(f"Launching simulation {sim.id}") - deps.kafka_producer.send("svc-event-exadigit-sim", value = sim.serialize_for_druid()) + deps.kafka_producer.produce("svc-event-exadigit-sim", sim.serialize_for_druid()) deps.kafka_producer.flush() if 'KUBERNETES_SERVICE_HOST' in os.environ: # We're running on k8s @@ -174,9 +174,7 @@ def cleanup_jobs(druid_engine, kafka_producer): sim.execution_end = now sim.error_messages = "Simulation crashed" logger.warning(f"Marking stuck sim {sim.id} as failed") - kafka_producer.send("svc-event-exadigit-sim", - value = sim.serialize_for_druid() - ) + kafka_producer.produce("svc-event-exadigit-sim", sim.serialize_for_druid()) for sim in stuck_sims: stmt = ( diff --git a/simulation_server/simulation/main.py b/simulation_server/simulation/main.py index d4aa959..967a1c4 100644 --- a/simulation_server/simulation/main.py +++ b/simulation_server/simulation/main.py @@ -52,19 +52,21 @@ def run_simulation_serialized(sim: Sim) -> Iterable[dict[str, list[bytes]]]: def write_sim_to_kafka(sim: Sim): - kafka_producer = get_kafka_producer( - linger_ms = 2 * 1000, - batch_size = 65536, - compression_type = "snappy", - ) + kafka_producer = get_kafka_producer({ + 'bootstrap.servers': os.environ['KAFKA_BOOTSTRAP'], + 'linger.ms': 2 * 1000, + 'batch.size': 65536, + "compression.type": "snappy", + }) + try: for data in run_simulation_serialized(sim): # kafka_producer does its own buffering of output so we don't need to worry about batching - for topic, rows in data.items(): - for row in rows: - kafka_producer.send(topic=topic, value=row) + for topic, messages in data.items(): + for message in messages: + kafka_producer.produce(topic, message) finally: - kafka_producer.close() + kafka_producer.flush() def write_sim_to_disk(sim: Sim, dest: str): diff --git a/simulation_server/util/kafka.py b/simulation_server/util/kafka.py index f6123c0..3afff4a 100644 --- a/simulation_server/util/kafka.py +++ b/simulation_server/util/kafka.py @@ -1,30 +1,30 @@ import os -from kafka import KafkaProducer, KafkaConsumer, KafkaAdminClient -import functools +from confluent_kafka import Producer, Consumer +from confluent_kafka.admin import AdminClient def _get_kafka_config(): - env_configs = { + env_config = { # Pick-up credentials from the context - 'bootstrap_servers': [os.environ['KAFKA_BOOTSTRAP']], - 'sasl_mechanism': os.environ.get('KAFKA_SASL_MECHANISM'), - 'sasl_plain_username': os.environ.get('KAFKA_SASL_USERNAME'), - 'sasl_plain_password': os.environ.get('KAFKA_SASL_PASSWORD'), - 'security_protocol': os.environ.get('KAFKA_SECURITY_PROTOCOL'), + 'bootstrap.servers': os.environ['KAFKA_BOOTSTRAP'], + 'sasl.mechanism': os.environ.get('KAFKA_SASL_MECHANISM'), + 'security.protocol': os.environ.get('KAFKA_SECURITY_PROTOCOL'), + 'sasl.plain.username': os.environ.get('KAFKA_SASL_USERNAME'), + 'sasl.plain.password': os.environ.get('KAFKA_SASL_PASSWORD'), } - env_configs = {k: v for k, v in 
env_configs.items() if v is not None} - return env_configs + return {k: v for k, v in env_config.items() if v is not None} -@functools.cache -def get_kafka_producer(**configs): - return KafkaProducer(**{**_get_kafka_config(), **configs}) +def get_kafka_producer(config = {}): + # Use confluent_kafka as it has significantly better producer performance + # I think that kafka.KafkaProducer sends messages in a background thread so it still blocks the + # GIL, while confluent_kafka is using some kind c bindings internally which avoid that. + return Producer({**_get_kafka_config(), **config}) -@functools.cache -def get_kafka_consumer(*topics, **configs): - return KafkaConsumer(*topics, **{**_get_kafka_config(), **configs}) +def get_kafka_consumer(*topics, config = {}): + return Consumer({**_get_kafka_config(), **config}) -@functools.cache -def get_kafka_admin(**configs): - return KafkaAdminClient(**{**_get_kafka_config(), **configs}) + +def get_kafka_admin(config = {}): + return AdminClient({**_get_kafka_config(), **config}) -- GitLab From 5d12c98cea560ad7a86d6d00f16adb84dd852b0f Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Mon, 6 Oct 2025 13:45:31 -0400 Subject: [PATCH 41/77] Add dashboard as submodule --- .gitmodules | 3 +++ simulation_dashboard | 1 + 2 files changed, 4 insertions(+) create mode 160000 simulation_dashboard diff --git a/.gitmodules b/.gitmodules index 691b42d..2fcb9b5 100644 --- a/.gitmodules +++ b/.gitmodules @@ -2,3 +2,6 @@ path = raps url = https://github.com/ExaDigiT/RAPS.git branch = main +[submodule "simulation_dashboard"] + path = simulation_dashboard + url = https://github.com/ExaDigiT/SimulationDashboard.git diff --git a/simulation_dashboard b/simulation_dashboard new file mode 160000 index 0000000..4e5decf --- /dev/null +++ b/simulation_dashboard @@ -0,0 +1 @@ +Subproject commit 4e5decf92ad60623e41a10ca0a670996e47ebe81 -- GitLab From 3877514e97b00b2d4695fc8c081de07e21ad4439 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Mon, 6 Oct 2025 14:22:50 -0400 Subject: [PATCH 42/77] Launch dashboard in docker compose --- README.md | 8 ++++---- docker-compose.yml | 28 ++++++++++++++++++++++++++-- simulation_server/server/config.py | 2 +- 3 files changed, 31 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index e8a67ca..8afe335 100644 --- a/README.md +++ b/README.md @@ -2,13 +2,13 @@ REST API that allows running and querying the results from the ExaDigit simulation and RAPS. -## Loading RAPS submodule +## Loading RAPS and Dashboard submodules This uses [RAPS](https://github.com/ExaDigiT/RAPS) to run the simulation, which is loaded as a -submodule. Make sure to run +submodule. The [Simulation Dashboard](https://github.com/ExaDigiT/SimulationDashboard) is also in a +separate repo and loaded as a submodule. Make to load the submodules by running: ``` git submodule update --init --recursive ``` -to load the submodule. ## Downloading FMU models The Frontier FMU models aren't currently publicly available. To run Frontier simulations with cooling enabled, use this @@ -28,7 +28,7 @@ To run a local version of the server run ```bash docker compose up --wait ``` -The server will be hosted on http://localhost:8080 +The API server will be hosted on http://localhost:8081. The dashboard will be hosted on http://localhost:8080. You'll need at least 16 GiB of RAM, preferably 32 GiB for druid and RAPS to run smoothly. 
diff --git a/docker-compose.yml b/docker-compose.yml index 33c7e83..d08bdad 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -201,7 +201,7 @@ services: container_name: simulation-server command: ["python", "-m", "simulation_server.server.main"] ports: - - "8080:8080" + - "8081:8081" depends_on: druid-coordinator: condition: service_healthy @@ -216,6 +216,7 @@ services: kafka: condition: service_healthy environment: + - EXADIGIT_HTTP_PORT=8081 - EXADIGIT_ENV=dev # - EXADIGIT_ROOT_PATH - EXADIGIT_DEBUG_MODE=true @@ -224,8 +225,31 @@ services: - DRUID_SERVICE_URL=http://druid-router:8888 - KAFKA_BOOTSTRAP=kafka:9092 healthcheck: - test: ["CMD-SHELL", "wget -q -O - http://localhost:8080/openapi.json || exit 1"] + test: ["CMD-SHELL", "wget -q -O - http://localhost:8081/openapi.json || exit 1"] interval: 10s retries: 3 start_interval: 1s start_period: 1m + + simulation-dashboard: + pull_policy: build + image: exadigit-simulation-dashboard + build: + context: ./simulation_dashboard + args: + VITE_PORT: "8080" + VITE_AUTH_URL: "" + VITE_BASE_PATH: "http://localhost:8080" + VITE_API_PATH: "http://localhost:8081" + container_name: simulation-dashboard + ports: + - "8080:80" + depends_on: + simulation-server: + condition: service_healthy + healthcheck: + test: ["CMD-SHELL", "curl --fail -s http://localhost:80/index.html || exit 1"] + interval: 10s + retries: 3 + start_interval: 1s + start_period: 10s diff --git a/simulation_server/server/config.py b/simulation_server/server/config.py index c25c02b..5b0d53e 100644 --- a/simulation_server/server/config.py +++ b/simulation_server/server/config.py @@ -18,7 +18,7 @@ class AppSettings(BaseSettings): root_path: str = "" """ The root path of the application if you are behind a proxy """ - http_port: int = 8080 + http_port: int = 8081 allow_origins: list[str] = [] -- GitLab From 0f6d068c2871523e3f86609f323b48f9fadc2953 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Tue, 7 Oct 2025 09:11:55 -0400 Subject: [PATCH 43/77] Fix kafka loop Needs to call .poll --- simulation_server/simulation/main.py | 1 + 1 file changed, 1 insertion(+) diff --git a/simulation_server/simulation/main.py b/simulation_server/simulation/main.py index 967a1c4..3b83d2a 100644 --- a/simulation_server/simulation/main.py +++ b/simulation_server/simulation/main.py @@ -64,6 +64,7 @@ def write_sim_to_kafka(sim: Sim): # kafka_producer does its own buffering of output so we don't need to worry about batching for topic, messages in data.items(): for message in messages: + kafka_producer.poll(0) kafka_producer.produce(topic, message) finally: kafka_producer.flush() -- GitLab From 0c577025aef8c71cb1087749df75226783893938 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Tue, 7 Oct 2025 09:37:05 -0400 Subject: [PATCH 44/77] More performance improvements to simulation loop --- simulation_server/simulation/simulation.py | 29 +++++++++++++--------- 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/simulation_server/simulation/simulation.py b/simulation_server/simulation/simulation.py index 5baf090..de1fe22 100644 --- a/simulation_server/simulation/simulation.py +++ b/simulation_server/simulation/simulation.py @@ -91,19 +91,23 @@ def run_simulation(sim_config: ServerSimConfig): if unix_timestamp % sample_system == 0 or is_last_tick: down_nodes = parse_nodes(tuple(tick.down_nodes)) engine_stats = get_engine_stats(engine, fast = True) - job_stats = get_job_stats(engine) + + # Calculate throughput manually instead of using get_job_stats to avoid the rest of the + # expensive 
calculations in get_job_stats + duration = (timestamp - engine.start).total_seconds() + throughput = (engine.jobs_completed / duration) * 3600 if duration != 0 else 0 scheduler_sim_system = [SchedulerSimSystem.model_validate({ "timestamp": timestamp, "down_nodes": down_nodes, - # TODO: Update sc.get_stats to return more easily parsable data "num_samples": engine_stats['num_samples'], - "jobs_completed": job_stats['jobs_completed'], - "jobs_running": len(job_stats['jobs_still_running']), - "jobs_pending": len(job_stats['jobs_still_in_queue']), + # Don't call get_job_stats as it is slow + "jobs_completed": engine.jobs_completed, + "jobs_running": len(tick.running), + "jobs_pending": len(tick.queue), + "throughput": throughput, - "throughput": job_stats['throughput'], "average_power": engine_stats['average_power'] * 1_000_000, "min_loss": engine_stats['min_loss'] * 1_000_000, "average_loss": engine_stats['average_loss'] * 1_000_000, @@ -124,14 +128,14 @@ def run_simulation(sim_config: ServerSimConfig): curr_job_hashes = set() tick_jobs = itertools.chain(tick.queue, tick.running, tick.completed, tick.killed) for job in tick_jobs: - time_end = offset_to_time(job.end_time) - # end_time is set to its planned end once its scheduled. Set it to None for unfinished jobs here - if time_end is not None and (job.start_time is None or time_end > timestamp): - time_end = None - job_state_hash = get_job_state_hash(job) # Output jobs if something other than time_snapshot changed if is_last_tick or job_state_hash not in prev_job_hashes: + time_end = offset_to_time(job.end_time) + # end_time is set to its planned end once its scheduled. Set it to None for unfinished jobs here + if time_end is not None and (job.start_time is None or time_end > timestamp): + time_end = None + parsed_job = SchedulerSimJob.model_validate({ "job_id": str(job.id), "name": job.name, @@ -149,7 +153,9 @@ def run_simulation(sim_config: ServerSimConfig): }) scheduler_sim_jobs.append(parsed_job) curr_job_hashes.add(job_state_hash) + prev_job_hashes = curr_job_hashes + for job in itertools.chain(tick.running, tick.completed, tick.killed): if power_history_counts.get(job.id, 0) < len(job.power_history): power_history.append(SchedulerSimJobPowerHistory( timestamp = timestamp, @@ -157,7 +163,6 @@ def run_simulation(sim_config: ServerSimConfig): power = job.power_history[-1], )) power_history_counts[job.id] = len(job.power_history) - prev_job_hashes = curr_job_hashes cooling_sim_cdus: list[CoolingSimCDU] = [] cooling_sim_cep: list[CoolingSimCEP] = [] -- GitLab From f848a543d39d3995402ad9ed902d8b0e6a2c26a0 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Tue, 7 Oct 2025 13:34:08 -0400 Subject: [PATCH 45/77] Smarter job parsing --- simulation_server/simulation/simulation.py | 81 ++++++++++++---------- 1 file changed, 45 insertions(+), 36 deletions(-) diff --git a/simulation_server/simulation/simulation.py b/simulation_server/simulation/simulation.py index de1fe22..c9079db 100644 --- a/simulation_server/simulation/simulation.py +++ b/simulation_server/simulation/simulation.py @@ -5,7 +5,7 @@ import orjson from loguru import logger from raps import Engine from raps.job import Job as RapsJob -from raps.stats import get_engine_stats, get_job_stats +from raps.stats import get_engine_stats from ..models.sim import ServerSimConfig from ..models.output import ( JobStateEnum, SchedulerSimJob, SchedulerSimJobPowerHistory, SchedulerSimSystem, CoolingSimCDU, @@ -25,7 +25,7 @@ class SimTickOutput(NamedTuple): power_history: list[SchedulerSimJobPowerHistory] 
-def get_job_state_hash(job: RapsJob): +def get_job_hash(job: RapsJob): """ Return string that can be used to check if any meaningful state changed """ return orjson.dumps([ str(job.id), @@ -77,10 +77,33 @@ def run_simulation(sim_config: ServerSimConfig): cdu_name = engine.telemetry.cdu_index_to_name(cdu_index) row, col = engine.telemetry.cdu_pos(cdu_index) return cdu_name, row, col + + def parse_job(job: RapsJob, timestamp: datetime): + # Output jobs only if something changed + time_end = offset_to_time(job.end_time) + # end_time is set to its planned end once its scheduled. Set it to None for + # unfinished jobs here + if time_end is not None and (job.start_time is None or time_end > timestamp): + time_end = None + return SchedulerSimJob.model_validate({ + "job_id": str(job.id), + "name": job.name, + "node_count": job.nodes_required, + "time_snapshot": timestamp, + "time_submission": offset_to_time(job.submit_time), + "time_limit": job.time_limit, + "time_start": offset_to_time(job.start_time), + "time_end": time_end, + "state_current": JobStateEnum(job.current_state.name), + "nodes": parse_nodes(tuple(job.scheduled_nodes)) if job.scheduled_nodes else None, + # How does the new job.power attribute work? Is it total_energy? + # Or just the current wattage? + # power = job.power, + }) + job_hashes: dict[int, bytes] = {} # Keep record of how many power history steps we've emitted for each job - power_history_counts: dict[int, int] = {} - prev_job_hashes: set[str] = set() + job_power_history_counts: dict[int, int] = {} for tick in engine.run_simulation(): timestamp: datetime = offset_to_time(tick.current_timestep) @@ -102,7 +125,6 @@ def run_simulation(sim_config: ServerSimConfig): "down_nodes": down_nodes, "num_samples": engine_stats['num_samples'], - # Don't call get_job_stats as it is slow "jobs_completed": engine.jobs_completed, "jobs_running": len(tick.running), "jobs_pending": len(tick.queue), @@ -125,44 +147,31 @@ def run_simulation(sim_config: ServerSimConfig): scheduler_sim_jobs: list[SchedulerSimJob] = [] power_history: list[SchedulerSimJobPowerHistory] = [] - curr_job_hashes = set() - tick_jobs = itertools.chain(tick.queue, tick.running, tick.completed, tick.killed) - for job in tick_jobs: - job_state_hash = get_job_state_hash(job) - # Output jobs if something other than time_snapshot changed - if is_last_tick or job_state_hash not in prev_job_hashes: - time_end = offset_to_time(job.end_time) - # end_time is set to its planned end once its scheduled. Set it to None for unfinished jobs here - if time_end is not None and (job.start_time is None or time_end > timestamp): - time_end = None - - parsed_job = SchedulerSimJob.model_validate({ - "job_id": str(job.id), - "name": job.name, - "node_count": job.nodes_required, - "time_snapshot": timestamp, - "time_submission": offset_to_time(job.submit_time), - "time_limit": job.time_limit, - "time_start": offset_to_time(job.start_time), - "time_end": time_end, - "state_current": JobStateEnum(job.current_state.name), - "nodes": parse_nodes(tuple(job.scheduled_nodes)) if job.scheduled_nodes else None, - # How does the new job.power attribute work? Is it total_energy? - # Or just the current wattage? - # power = job.power, - }) - scheduler_sim_jobs.append(parsed_job) - curr_job_hashes.add(job_state_hash) - prev_job_hashes = curr_job_hashes + # Only output running jobs when the state changes + for job in tick.queue: + # Just use a constant as hash for queued jobs to avoid computing the hash repeatedly for + # them. 
This assumes queued jobs don't change any meaningful state until they run + job_hash = b"queued" + if is_last_tick or job_hashes.get(job.id) != job_hash: + scheduler_sim_jobs.append(parse_job(job, timestamp)) + job_hashes[job.id] = job_hash + for job in tick.running: + job_hash = get_job_hash(job) + if is_last_tick or job_hashes.get(job.id) != job_hash: + scheduler_sim_jobs.append(parse_job(job, timestamp)) + job_hashes[job.id] = job_hash + for job in itertools.chain(tick.completed, tick.killed): + scheduler_sim_jobs.append(parse_job(job, timestamp)) + job_hashes.pop(job.id, None) for job in itertools.chain(tick.running, tick.completed, tick.killed): - if power_history_counts.get(job.id, 0) < len(job.power_history): + if job_power_history_counts.get(job.id, 0) < len(job.power_history): power_history.append(SchedulerSimJobPowerHistory( timestamp = timestamp, job_id = str(job.id), power = job.power_history[-1], )) - power_history_counts[job.id] = len(job.power_history) + job_power_history_counts[job.id] = len(job.power_history) cooling_sim_cdus: list[CoolingSimCDU] = [] cooling_sim_cep: list[CoolingSimCEP] = [] -- GitLab From ddbc11466b71ad3c418e417e639bcb3e9be957ce Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Wed, 8 Oct 2025 15:43:36 -0400 Subject: [PATCH 46/77] Faster stats calculations --- raps | 2 +- simulation_server/simulation/simulation.py | 32 ++++++++++------------ 2 files changed, 15 insertions(+), 19 deletions(-) diff --git a/raps b/raps index 0477cd4..0e40c7e 160000 --- a/raps +++ b/raps @@ -1 +1 @@ -Subproject commit 0477cd4deea22b1397598eb4b0677151616f5807 +Subproject commit 0e40c7ea1710de5c51a6d39977a8c32f8567c78a diff --git a/simulation_server/simulation/simulation.py b/simulation_server/simulation/simulation.py index c9079db..fab7968 100644 --- a/simulation_server/simulation/simulation.py +++ b/simulation_server/simulation/simulation.py @@ -5,7 +5,7 @@ import orjson from loguru import logger from raps import Engine from raps.job import Job as RapsJob -from raps.stats import get_engine_stats +from raps.stats import RunningStats from ..models.sim import ServerSimConfig from ..models.output import ( JobStateEnum, SchedulerSimJob, SchedulerSimJobPowerHistory, SchedulerSimSystem, CoolingSimCDU, @@ -55,6 +55,7 @@ def snap_sample_rate(desired_rate: int, actual_rate: int): def run_simulation(sim_config: ServerSimConfig): # TODO: replay logic engine = Engine(sim_config) + running_stats = RunningStats(engine) sample_system = 1 sample_power = snap_sample_rate(5, int(sim_config.time_delta.total_seconds())) @@ -113,31 +114,26 @@ def run_simulation(sim_config: ServerSimConfig): scheduler_sim_system: list[SchedulerSimSystem] = [] if unix_timestamp % sample_system == 0 or is_last_tick: down_nodes = parse_nodes(tuple(tick.down_nodes)) - engine_stats = get_engine_stats(engine, fast = True) - - # Calculate throughput manually instead of using get_job_stats to avoid the rest of the - # expensive calculations in get_job_stats - duration = (timestamp - engine.start).total_seconds() - throughput = (engine.jobs_completed / duration) * 3600 if duration != 0 else 0 + stats = running_stats.get_stats() scheduler_sim_system = [SchedulerSimSystem.model_validate({ "timestamp": timestamp, "down_nodes": down_nodes, - "num_samples": engine_stats['num_samples'], + "num_samples": stats['num_samples'], "jobs_completed": engine.jobs_completed, "jobs_running": len(tick.running), "jobs_pending": len(tick.queue), - "throughput": throughput, - - "average_power": engine_stats['average_power'] * 1_000_000, - 
"min_loss": engine_stats['min_loss'] * 1_000_000, - "average_loss": engine_stats['average_loss'] * 1_000_000, - "max_loss": engine_stats['max_loss'] * 1_000_000, - "system_power_efficiency": engine_stats['system_power_efficiency'], - "total_energy_consumed": engine_stats['total_energy_consumed'], - "carbon_emissions": engine_stats['carbon_emissions'], - "total_cost": engine_stats['total_cost'], + "throughput": stats["throughput"], + + "average_power": stats['average_power'] * 1_000_000, + "min_loss": stats['min_loss'] * 1_000_000, + "average_loss": stats['average_loss'] * 1_000_000, + "max_loss": stats['max_loss'] * 1_000_000, + "system_power_efficiency": stats['system_power_efficiency'], + "total_energy_consumed": stats['total_energy_consumed'], + "carbon_emissions": stats['carbon_emissions'], + "total_cost": stats['total_cost'], "p_flops": tick.p_flops, "g_flops_w": tick.g_flops_w, -- GitLab From 5261b45ec7ed0ecf0b26359d0661b47d940b0e7a Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Wed, 15 Oct 2025 10:21:29 -0400 Subject: [PATCH 47/77] Log frontend url --- docker-compose.yml | 2 +- simulation_server/server/main.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index d08bdad..dc09214 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -221,7 +221,7 @@ services: # - EXADIGIT_ROOT_PATH - EXADIGIT_DEBUG_MODE=true # - EXADIGIT_JOB_IMAGE - - EXADIGIT_ALLOW_ORIGINS=["*"] + - EXADIGIT_ALLOW_ORIGINS=["http://localhost:8080"] - DRUID_SERVICE_URL=http://druid-router:8888 - KAFKA_BOOTSTRAP=kafka:9092 healthcheck: diff --git a/simulation_server/server/main.py b/simulation_server/server/main.py index da1fcfb..bf5792d 100644 --- a/simulation_server/server/main.py +++ b/simulation_server/server/main.py @@ -130,7 +130,9 @@ if settings.allow_origins: allow_credentials=True, allow_methods=["*"], allow_headers=["*"], -) + ) + if "*" not in settings.allow_origins: + logger.info(f"Frontend hosted at {' '.join(settings.allow_origins)}") from .endpoints import router app.include_router(router) -- GitLab From e613e443e4fb2bc454bc32926dc73c071739d903 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Thu, 16 Oct 2025 16:37:03 -0400 Subject: [PATCH 48/77] Update raps --- raps | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/raps b/raps index 0e40c7e..82f348a 160000 --- a/raps +++ b/raps @@ -1 +1 @@ -Subproject commit 0e40c7ea1710de5c51a6d39977a8c32f8567c78a +Subproject commit 82f348a2c759261f6f0046fc63027e9c3d43e960 -- GitLab From fa237978f156f69af54283fe14ade50f02c5ef31 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Thu, 16 Oct 2025 16:37:53 -0400 Subject: [PATCH 49/77] Update simulation dashboard --- simulation_dashboard | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/simulation_dashboard b/simulation_dashboard index 4e5decf..3471587 160000 --- a/simulation_dashboard +++ b/simulation_dashboard @@ -1 +1 @@ -Subproject commit 4e5decf92ad60623e41a10ca0a670996e47ebe81 +Subproject commit 347158763cd88f02626adb616f9033514f1643f0 -- GitLab From ff466e633ad75b51660ff298186bb48f070ddfa9 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Thu, 16 Oct 2025 16:39:13 -0400 Subject: [PATCH 50/77] Add shebang to script --- scripts/fetch.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/fetch.sh b/scripts/fetch.sh index 5df0b8a..244c9da 100755 --- a/scripts/fetch.sh +++ b/scripts/fetch.sh @@ -1,3 +1,4 @@ +#!/bin/bash set -e mkdir data -- GitLab From 74963e40506c9b146491559b63c8535d2a22c6bc Mon Sep 17 00:00:00 
2001 From: Jesse Hines Date: Thu, 16 Oct 2025 17:31:25 -0400 Subject: [PATCH 51/77] Update fetch script --- scripts/fetch.sh | 26 +++++++------------------- 1 file changed, 7 insertions(+), 19 deletions(-) diff --git a/scripts/fetch.sh b/scripts/fetch.sh index 244c9da..f775905 100755 --- a/scripts/fetch.sh +++ b/scripts/fetch.sh @@ -2,24 +2,12 @@ set -e mkdir data -cd data -# lassen -git clone https://github.com/LLNL/LAST/ lassen-repo -cd lassen-repo -git lfs pull -cd .. -mkdir lassen -mv lassen-repo/Lassen-Supercomputer-Job-Dataset/*.csv lassen -rm -rf lassen-repo -python3 ../scripts/preprocess_lassen.py lassen +raps download --system lassen --dest ./data/lassen +mv ./data/lassen/Lassen-Supercomputer-Job-Dataset/* ./data/lassen +rm -rf ./data/lassen/Lassen-Supercomputer-Job-Dataset +python3 ./scripts/preprocess_lassen.py ./data/lassen -# marconi -wget https://zenodo.org/api/records/10127767/files-archive -O marconi100.zip -unzip marconi100.zip -d marconi100 -rm marconi100.zip - -# fugaku -wget https://zenodo.org/api/records/11467483/files-archive -O fugaku.zip -unzip fugaku.zip -d fugaku -rm fugaku/*.csv +raps download --system marconi100 --dest ./data/marconi100 +raps download --system fugaku --dest ./data/fugaku +raps download --system adastraMI250 --dest ./data/adastraMI250 -- GitLab From 03c0ee7ea5291e1ed6aa18497da0c71937293ef1 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Mon, 20 Oct 2025 15:24:39 -0400 Subject: [PATCH 52/77] Ad script to submit druid ingests Also fix fugaku ingest causing ram issues --- README.md | 6 +- druid_ingests/data-fugaku.json | 40 +---------- .../data-lassen-allocation-history.json | 9 +-- druid_ingests/data-lassen-node-history.json | 9 +-- druid_ingests/data-lassen-step-history.json | 11 +-- druid_ingests/data-marconi100.json | 9 +-- pyproject.toml | 1 + scripts/{fetch.sh => fetch_data.sh} | 3 + scripts/preprocess_fugaku.py | 34 +++++++++ scripts/submit_data_ingests.py | 69 +++++++++++++++++++ simulation_server/simulation/dataloaders.py | 4 +- 11 files changed, 123 insertions(+), 72 deletions(-) rename scripts/{fetch.sh => fetch_data.sh} (88%) create mode 100755 scripts/preprocess_fugaku.py create mode 100755 scripts/submit_data_ingests.py diff --git a/README.md b/README.md index 8afe335..1a8e628 100644 --- a/README.md +++ b/README.md @@ -30,11 +30,11 @@ docker compose up --wait ``` The API server will be hosted on http://localhost:8081. The dashboard will be hosted on http://localhost:8080. -You'll need at least 16 GiB of RAM, preferably 32 GiB for druid and RAPS to run smoothly. +You'll need at least 32 GiB of RAM for druid and RAPS to run smoothly. If you want to run replay data locally, you'll need to download the datasets and then ingest them in -Druid. You can fetch the datasets with `./scripts/fetch.sh` and submit the druid ingests for them -under `./druid_ingests` using the Druid UI at http://localhost:8888. +Druid. You can fetch the datasets with `./scripts/fetch_data.sh`, and use the `./scripts/submit_data_ingests.py` +script to ingest them into druid. 
View the server logs with: ```bash diff --git a/druid_ingests/data-fugaku.json b/druid_ingests/data-fugaku.json index f287e47..e14ede2 100644 --- a/druid_ingests/data-fugaku.json +++ b/druid_ingests/data-fugaku.json @@ -3,13 +3,6 @@ "spec": { "ioConfig": { "type": "index_parallel", - // "inputSource": { - // "type": "s3", - // "objectGlob": "**.parquet", - // "prefixes": [ - // "s3://scratch/raps-datasets/fugaku/" - // ] - // }, "inputSource": { "type": "local", "baseDir": "/data/fugaku/", @@ -25,7 +18,8 @@ "type": "dynamic" }, "maxNumConcurrentSubTasks": 2, - "maxRowsInMemory": 100000 + "maxRowsInMemory": 100000, + "awaitSegmentAvailabilityTimeoutMillis": 1800000 }, "dataSchema": { "dataSource": "svc-ts-exadigit-data-fugaku", @@ -39,36 +33,6 @@ "name": "__time", "type": "expression", "expression": "timestamp_parse(sdt)" - }, - { - "name": "adt", - "type": "expression", - "expression": "timestamp_format(timestamp_parse(adt))" - }, - { - "name": "qdt", - "type": "expression", - "expression": "timestamp_format(timestamp_parse(qdt))" - }, - { - "name": "schedsdt", - "type": "expression", - "expression": "timestamp_format(timestamp_parse(schedsdt))" - }, - { - "name": "deldt", - "type": "expression", - "expression": "timestamp_format(timestamp_parse(deldt))" - }, - { - "name": "sdt", - "type": "expression", - "expression": "timestamp_format(timestamp_parse(sdt))" - }, - { - "name": "edt", - "type": "expression", - "expression": "timestamp_format(timestamp_parse(edt))" } ] }, diff --git a/druid_ingests/data-lassen-allocation-history.json b/druid_ingests/data-lassen-allocation-history.json index 43086bd..04cf93e 100644 --- a/druid_ingests/data-lassen-allocation-history.json +++ b/druid_ingests/data-lassen-allocation-history.json @@ -3,12 +3,6 @@ "spec": { "ioConfig": { "type": "index_parallel", - // "inputSource": { - // "type": "s3", - // "prefixes": [ - // "s3://scratch/raps-datasets/lassen/final_csm_allocation_history_hashed.csv" - // ] - // }, "inputSource": { "type": "local", "baseDir": "/data/lassen/final_csm_allocation_history_hashed.csv", @@ -25,7 +19,8 @@ "type": "dynamic" }, "maxNumConcurrentSubTasks": 2, - "maxRowsInMemory": 100000 + "maxRowsInMemory": 100000, + "awaitSegmentAvailabilityTimeoutMillis": 1800000 }, "dataSchema": { "dataSource": "svc-ts-exadigit-data-lassen-allocation-history", diff --git a/druid_ingests/data-lassen-node-history.json b/druid_ingests/data-lassen-node-history.json index 191c529..752e25e 100644 --- a/druid_ingests/data-lassen-node-history.json +++ b/druid_ingests/data-lassen-node-history.json @@ -3,12 +3,6 @@ "spec": { "ioConfig": { "type": "index_parallel", - // "inputSource": { - // "type": "s3", - // "prefixes": [ - // "s3://scratch/raps-datasets/lassen/final_csm_allocation_node_history_with_time.csv" - // ] - // }, "inputSource": { "type": "local", "baseDir": "/data/lassen/final_csm_allocation_node_history_with_time.csv", @@ -25,7 +19,8 @@ "type": "dynamic" }, "maxNumConcurrentSubTasks": 2, - "maxRowsInMemory": 100000 + "maxRowsInMemory": 100000, + "awaitSegmentAvailabilityTimeoutMillis": 1800000 }, "dataSchema": { "dataSource": "svc-ts-exadigit-data-lassen-node-history", diff --git a/druid_ingests/data-lassen-step-history.json b/druid_ingests/data-lassen-step-history.json index b9c3dc3..5ff08dc 100644 --- a/druid_ingests/data-lassen-step-history.json +++ b/druid_ingests/data-lassen-step-history.json @@ -3,12 +3,6 @@ "spec": { "ioConfig": { "type": "index_parallel", - // "inputSource": { - // "type": "s3", - // "prefixes": [ - // 
"s3://scratch/raps-datasets/lassen/final_csm_step_history.csv" - // ] - // }, "inputSource": { "type": "local", "baseDir": "/data/lassen/final_csm_step_history.csv", @@ -25,10 +19,11 @@ "type": "dynamic" }, "maxNumConcurrentSubTasks": 2, - "maxRowsInMemory": 100000 + "maxRowsInMemory": 100000, + "awaitSegmentAvailabilityTimeoutMillis": 1800000 }, "dataSchema": { - "dataSource": "svc-ts-exadigit-data-fugaku-lassen-step-history", + "dataSource": "svc-ts-exadigit-data-lassen-step-history", "timestampSpec": { "column": "!!!_no_such_column_!!!", "missingValue": "2010-01-01T00:00:00Z" diff --git a/druid_ingests/data-marconi100.json b/druid_ingests/data-marconi100.json index fc8b5c7..16e3159 100644 --- a/druid_ingests/data-marconi100.json +++ b/druid_ingests/data-marconi100.json @@ -3,12 +3,6 @@ "spec": { "ioConfig": { "type": "index_parallel", - // "inputSource": { - // "type": "s3", - // "prefixes": [ - // "s3://scratch/raps-datasets/marconi100/" - // ] - // }, "inputSource": { "type": "local", "baseDir": "/data/marconi100/", @@ -24,7 +18,8 @@ "type": "dynamic" }, "maxNumConcurrentSubTasks": 2, - "maxRowsInMemory": 100000 + "maxRowsInMemory": 100000, + "awaitSegmentAvailabilityTimeoutMillis": 1800000 }, "dataSchema": { "dataSource": "svc-ts-exadigit-data-marconi100", diff --git a/pyproject.toml b/pyproject.toml index 09e35f8..5bd5c8d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,6 +30,7 @@ dependencies = [ "requests==2.32.5", "orjson==3.11.3", "confluent_kafka==2.11.1", + "pyjson5==2.0.0", "raps@{root:uri}/raps", ] diff --git a/scripts/fetch.sh b/scripts/fetch_data.sh similarity index 88% rename from scripts/fetch.sh rename to scripts/fetch_data.sh index f775905..7e12984 100755 --- a/scripts/fetch.sh +++ b/scripts/fetch_data.sh @@ -9,5 +9,8 @@ rm -rf ./data/lassen/Lassen-Supercomputer-Job-Dataset python3 ./scripts/preprocess_lassen.py ./data/lassen raps download --system marconi100 --dest ./data/marconi100 + raps download --system fugaku --dest ./data/fugaku +python3 ./scripts/preprocess_fugaku.py ./data/fugaku + raps download --system adastraMI250 --dest ./data/adastraMI250 diff --git a/scripts/preprocess_fugaku.py b/scripts/preprocess_fugaku.py new file mode 100755 index 0000000..739773e --- /dev/null +++ b/scripts/preprocess_fugaku.py @@ -0,0 +1,34 @@ +#!/usr/bin/env python3 +""" +Split up the large fugaku parquets so druid doesn't choke on them when ingesting. 
+""" + +from pathlib import Path +import pandas as pd +import sys +from collections.abc import Iterable +from pyarrow.parquet import ParquetFile +import pyarrow as pa + +def read_parquet_chunked(file, chunk_size) -> Iterable[pd.DataFrame]: + pf = ParquetFile(file) + for chunk in pf.iter_batches(batch_size = chunk_size): + yield chunk.to_pandas() + +if __name__ == "__main__": + data_path = Path(sys.argv[1]) + files = list(data_path.glob("*.parquet")) + + for file in files: + for chunk_df in read_parquet_chunked(file, 100_000): + chunk_df['date'] = pd.to_datetime(chunk_df['sdt']).dt.strftime("%Y-%m-%d") + # fugaku dataset is indexed by submission date + for date, date_df in chunk_df.groupby('date'): + day_dir = data_path / Path(f"date={date}") + day_dir.mkdir(exist_ok = True) + num = max([int(p.stem) for p in day_dir.glob("*.parquet")], default=-1) + 1 + date_df.to_parquet(day_dir / f"{num:03}.parquet") + + # Delete the old parquets + for file in data_path.glob("*.parquet"): + file.unlink() diff --git a/scripts/submit_data_ingests.py b/scripts/submit_data_ingests.py new file mode 100755 index 0000000..6a70426 --- /dev/null +++ b/scripts/submit_data_ingests.py @@ -0,0 +1,69 @@ +#!/usr/bin/env python3 +from pathlib import Path +import urllib.parse +from typing import Any +import time, os +import pyjson5 +import requests +from loguru import logger +import orjson + + +class DruidApi: + def __init__(self, url: str, user: str | None = None, password: str | None = None) -> None: + self.url = url.removesuffix("/") + self.user = user + self.password = password + + def request(self, method, url, **kwargs) -> Any: + url = urllib.parse.urljoin(self.url, url) + if self.user and self.password: + auth = (self.user, self.password) + else: + auth = None + + response = requests.request(method, url, timeout = 5 * 60, auth = auth, **kwargs) + if not response.ok: + raise Exception(f"Request {url} failed with {response.status_code}: {response.text}") + + if response.text.strip(): + return response.json() + else: # Some druid endpoints return empty response + return None + + + +def submit_ingest(druid: DruidApi, file): + ingest = pyjson5.loads(Path(file).read_text()) # using yaml as hack to allow comments + logger.info(f"Submitting ingest for {file}...") + response = druid.request("POST", "/druid/indexer/v1/task", json = ingest) + task_id = response['task'] + logger.info(f"See {druid.url}/unified-console.html#tasks/task_id~{task_id} to view ingest progress.") + logger.info(f"Waiting for ingest{task_id} to complete...") + + status = "RUNNING" + while status == "RUNNING": + time.sleep(5) + response = druid.request("GET", f"/druid/indexer/v1/task/{task_id}/status") + status = response['status']['statusCode'] + if status != "SUCCESS": + raise ValueError(f"Ingest for {file} failed!") + else: + logger.info(f"Ingest for {file} finished.") + + +if __name__ == "__main__": + DRUID_URL = os.environ.get("DRUID_URL", "http://localhost:8888") + DRUID_USER = os.environ.get("DRUID_USER") or None # Convert "" to None + DRUID_PASSWORD = os.environ.get("DRUID_PASSWORD") or None + + druid = DruidApi(DRUID_URL, DRUID_USER, DRUID_PASSWORD) + + submit_ingest(druid, "./druid_ingests/data-marconi100.json") + submit_ingest(druid, "./druid_ingests/data-lassen-allocation-history.json") + submit_ingest(druid, "./druid_ingests/data-lassen-node-history.json") + submit_ingest(druid, "./druid_ingests/data-lassen-step-history.json") + submit_ingest(druid, "./druid_ingests/data-fugaku.json") + + logger.info("Done!") + diff --git 
a/simulation_server/simulation/dataloaders.py b/simulation_server/simulation/dataloaders.py index ab35d72..acdc6c6 100644 --- a/simulation_server/simulation/dataloaders.py +++ b/simulation_server/simulation/dataloaders.py @@ -154,7 +154,7 @@ def fetch_lassen_data(sim_config: SimConfig, raps_config: dict): node_df = pd.read_sql(node_query, druid_engine) step_df = query_time_range( - "svc-ts-exadigit-data-fugaku-lassen-step-history", start, end, 'end_time', + "svc-ts-exadigit-data-lassen-step-history", start, end, 'end_time', druid_engine = druid_engine, parse_dates = ["begin_time", "end_time"], ) @@ -174,4 +174,4 @@ DATA_LOADERS = { "fugaku": fetch_fugaku_data, "marconi100": fetch_marconi100_data, "lassen": fetch_lassen_data, -} \ No newline at end of file +} -- GitLab From 2c2455adc8063cfe753b701e9501b980e4a9185f Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Tue, 21 Oct 2025 13:14:47 -0400 Subject: [PATCH 53/77] Fixes to topic creation --- simulation_server/server/main.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/simulation_server/server/main.py b/simulation_server/server/main.py index bf5792d..d29df1a 100644 --- a/simulation_server/server/main.py +++ b/simulation_server/server/main.py @@ -50,9 +50,8 @@ async def lifespan(api: FastAPI): if settings.env == 'dev': kafka_admin = get_kafka_admin() - existing_topics = {t.topic for t in kafka_admin.list_topics().topics} topic_config = {"compression.type": "snappy"} - new_topics = [ + topics = [ NewTopic("svc-event-exadigit-sim", 1, 1, config = topic_config), NewTopic("svc-ts-exadigit-schedulersimsystem", 4, 1, config = topic_config), NewTopic("svc-event-exadigit-schedulersimjob", 2, 1, config = topic_config), @@ -60,9 +59,11 @@ async def lifespan(api: FastAPI): NewTopic("svc-ts-exadigit-coolingsimcep", 2, 1, config = topic_config), NewTopic("svc-ts-exadigit-jobpowerhistory", 4, 1, config = topic_config), ] - new_topics = [t for t in new_topics if t.topic not in existing_topics] - logger.info(f"Creating kafka topics {', '.join(t.topic for t in new_topics)}") - kafka_admin.create_topics(new_topics) + existing_topics = set(kafka_admin.list_topics().topics.keys()) + new_topics = [t for t in topics if t.topic not in existing_topics] + if new_topics: + logger.info(f"Creating kafka topics {', '.join(t.topic for t in new_topics)}") + kafka_admin.create_topics(new_topics) druid_ingests_dir = Path(__file__).parent.parent.parent.resolve() / 'druid_ingests' ingests = [ -- GitLab From 656ead904584766b4ba0cd65bb835156dd6fb008 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Tue, 21 Oct 2025 14:50:28 -0400 Subject: [PATCH 54/77] Update submodule --- raps | 2 +- simulation_dashboard | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/raps b/raps index 82f348a..f75cb91 160000 --- a/raps +++ b/raps @@ -1 +1 @@ -Subproject commit 82f348a2c759261f6f0046fc63027e9c3d43e960 +Subproject commit f75cb91a43e4a22c7f3a0cbd20eae4f4f5745338 diff --git a/simulation_dashboard b/simulation_dashboard index 3471587..f1b0bd8 160000 --- a/simulation_dashboard +++ b/simulation_dashboard @@ -1 +1 @@ -Subproject commit 347158763cd88f02626adb616f9033514f1643f0 +Subproject commit f1b0bd8c8538c3a713cda72dcbb6264ccf05ab2e -- GitLab From cb54fb91f56b4ff0eaff575cbf331c53e0c6f0d7 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Tue, 21 Oct 2025 17:19:33 -0400 Subject: [PATCH 55/77] Fix Kafka not persisting data --- docker-compose.yml | 2 +- simulation_server/util/kafka.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff 
--git a/docker-compose.yml b/docker-compose.yml index dc09214..5830414 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -184,7 +184,7 @@ services: - KAFKA_SOCKET_SEND_BUFFER_BYTES=102400 - KAFKA_SOCKET_RECEIVE_BUFFER_BYTES=102400 - KAFKA_SOCKET_REQUEST_MAX_BYTES=104857600 - - KAFKA_LOG_DIRS=/tmp/kraft-combined-logs + - KAFKA_LOG_DIRS=/var/lib/kafka/data - KAFKA_NUM_PARTITIONS=1 - KAFKA_NUM_RECOVERY_THREADS_PER_DATA_DIR=1 - KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR=1 diff --git a/simulation_server/util/kafka.py b/simulation_server/util/kafka.py index 3afff4a..cd20b38 100644 --- a/simulation_server/util/kafka.py +++ b/simulation_server/util/kafka.py @@ -23,7 +23,9 @@ def get_kafka_producer(config = {}): def get_kafka_consumer(*topics, config = {}): - return Consumer({**_get_kafka_config(), **config}) + consumer = Consumer({**_get_kafka_config(), **config}) + consumer.subscribe(list(topics)) + return consumer def get_kafka_admin(config = {}): -- GitLab From 22b6c4ac1dd2698dd7f6ec2cea763a863d168f05 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Wed, 22 Oct 2025 10:12:14 -0400 Subject: [PATCH 56/77] Fugaku replay working --- simulation_server/simulation/dataloaders.py | 177 ------------------ .../simulation/dataloaders/fugaku.py | 18 ++ simulation_server/simulation/simulation.py | 11 +- simulation_server/util/dataloader.py | 35 ++++ 4 files changed, 62 insertions(+), 179 deletions(-) delete mode 100644 simulation_server/simulation/dataloaders.py create mode 100644 simulation_server/simulation/dataloaders/fugaku.py create mode 100644 simulation_server/util/dataloader.py diff --git a/simulation_server/simulation/dataloaders.py b/simulation_server/simulation/dataloaders.py deleted file mode 100644 index acdc6c6..0000000 --- a/simulation_server/simulation/dataloaders.py +++ /dev/null @@ -1,177 +0,0 @@ -import pandas as pd -import numpy as np -import sqlalchemy as sqla -from loguru import logger -from datetime import datetime, timedelta -from .raps.raps.telemetry import Telemetry -from ..models.sim import SimConfig -from ..util.druid import get_druid_engine, get_table, to_timestamp -from ..util.es import get_nccs_cadence_engine -from . import SimException - - -def fetch_frontier_data(sim_config: SimConfig, raps_config: dict): - """ - Fetch and parse real telemetry data - """ - # TODO: Should consider using LVA API instead of directly querying the DB for this - nccs_cadence_engine = get_nccs_cadence_engine() - druid_engine = get_druid_engine() - start, end = sim_config.start, sim_config.end - - job_query = sqla.text(""" - SELECT - "allocation_id", "job_id", "slurm_version", "account", "group", "user", "name", - "time_limit", "time_submission", "time_eligible", "time_start", "time_end", "time_elapsed", - "node_count", xnames_str AS "xnames", "state_current", "state_reason", - "time_snapshot" - FROM "stf218.frontier.job-summary" - WHERE - (time_start IS NOT NULL AND time_start <= CONVERT(:end, TIMESTAMP)) AND - (time_end IS NULL OR time_end > CONVERT(:start, TIMESTAMP)) - """).bindparams( - start = start.isoformat(), end = end.isoformat(), - ) - job_data = pd.read_sql_query(job_query, nccs_cadence_engine, parse_dates=[ - "time_snapshot", "time_submission", "time_eligible", "time_start", "time_end", - ]) - # TODO: Even with sqlStringifyArrays: false, multivalue columns are returned as json strings. - # And single rows are returned as raw strings. When we update Druid we can use ARRAYS and remove - # this. Moving the jobs table to postgres would also fix this (and other issues). 
- job_data['xnames'] = job_data['xnames'].map(lambda x: x.split(",") if x else []) - - job_profile_tbl = get_table("pub-ts-frontier-job-profile", druid_engine) - job_profile_query = ( - sqla.select( - job_profile_tbl.c['__time'].label("timestamp"), - job_profile_tbl.c.allocation_id, - job_profile_tbl.c.sum_cpu0_power, - job_profile_tbl.c.sum_gpu_power, - ) - .where( - to_timestamp(start) <= job_profile_tbl.c['__time'], - job_profile_tbl.c['__time'] < to_timestamp(end), - ) - ) - job_profile_data = pd.read_sql(job_profile_query, druid_engine, parse_dates=[ - "timestamp", - ]) - - if (job_data.empty or job_profile_data.empty): - raise SimException(f"No telemetry data for {start.isoformat()} -> {end.isoformat()}") - - telemetry = Telemetry(system = "frontier", config = raps_config) - jobs = telemetry.load_data_from_df(job_data, job_profile_data, - min_time = start, - reschedule = sim_config.scheduler.reschedule, - config = raps_config, - ) - return jobs - - -def query_time_range( - tbl_name: str, start: datetime, end: datetime, end_col: str, *, - druid_engine, parse_dates: list[str], -): - tbl = get_table(tbl_name, druid_engine) - query = ( - sqla.select(sqla.text("*")) - .where( - (tbl.c['__time'] <= to_timestamp(end)) & - (tbl.c['__time'] >= to_timestamp(start - timedelta(days=3))) & - (tbl.c[end_col] >= to_timestamp(start)) - ) - ) - data = pd.read_sql(query, druid_engine, parse_dates=parse_dates) - return data - - -def split_list(x): - x = x.split(",") if x else [] - return np.array([int(x) for x in x]) - - -def fetch_fugaku_data(sim_config: SimConfig, raps_config: dict): - druid_engine = get_druid_engine() - start, end = sim_config.start, sim_config.end - - data = query_time_range( - "svc-ts-exadigit-data-fugaku", start, end, 'edt', - druid_engine = druid_engine, - parse_dates = ["adt", "qdt", "schedsdt", "deldt", "sdt", "edt"], - ) - telemetry = Telemetry(system = "fugaku", config = raps_config) - jobs = telemetry.load_data_from_df(data, - min_time = start, - reschedule = sim_config.scheduler.reschedule, - config = raps_config, - ) - return jobs - - -def fetch_marconi100_data(sim_config: SimConfig, raps_config: dict): - druid_engine = get_druid_engine() - start, end = sim_config.start, sim_config.end - - data = query_time_range( - "svc-ts-exadigit-data-marconi100", start, end, 'end_time', - druid_engine = druid_engine, - parse_dates = ["submit_time", "start_time", "end_time", "eligible_time"], - ) - - data['nodes'] = data['nodes'].map(split_list) - data['node_power_consumption'] = data['node_power_consumption'].map(split_list) - data['mem_power_consumption'] = data['mem_power_consumption'].map(split_list) - data['cpu_power_consumption'] = data['cpu_power_consumption'].map(split_list) - - telemetry = Telemetry(system = "marconi100", config = raps_config) - jobs = telemetry.load_data_from_df(data, - min_time = start, - reschedule = sim_config.scheduler.reschedule, - config = raps_config, - ) - return jobs - - -def fetch_lassen_data(sim_config: SimConfig, raps_config: dict): - druid_engine = get_druid_engine() - start, end = sim_config.start, sim_config.end - - allocation_df = query_time_range( - "svc-ts-exadigit-data-lassen-allocation-history", start, end, 'end_time', - druid_engine = druid_engine, - parse_dates = ["begin_time", "end_time", "job_submit_time"], - ) - - tbl = get_table("svc-ts-exadigit-data-lassen-node-history", druid_engine) - node_query = ( - sqla.select(sqla.text("*")) - .where( - (tbl.c['__time'] <= to_timestamp(end)) & - (tbl.c['__time'] >= to_timestamp(start - 
timedelta(days=3))) - ) - ) - node_df = pd.read_sql(node_query, druid_engine) - - step_df = query_time_range( - "svc-ts-exadigit-data-lassen-step-history", start, end, 'end_time', - druid_engine = druid_engine, - parse_dates = ["begin_time", "end_time"], - ) - - telemetry = Telemetry(system = "lassen", config = raps_config) - jobs = telemetry.load_data_from_df( - allocation_df = allocation_df, node_df = node_df, step_df = step_df, - min_time = start, - reschedule = sim_config.scheduler.reschedule, - config = raps_config, - ) - return jobs - - -DATA_LOADERS = { - "frontier": fetch_frontier_data, - "fugaku": fetch_fugaku_data, - "marconi100": fetch_marconi100_data, - "lassen": fetch_lassen_data, -} diff --git a/simulation_server/simulation/dataloaders/fugaku.py b/simulation_server/simulation/dataloaders/fugaku.py new file mode 100644 index 0000000..8b492d6 --- /dev/null +++ b/simulation_server/simulation/dataloaders/fugaku.py @@ -0,0 +1,18 @@ +from ...util.druid import get_druid_engine +from ...util.dataloader import query_time_range +from ...models.sim import ServerSimConfig + +# Re-use these from the raps fugaku dataloader +from raps.dataloaders.fugaku import load_data_from_df, node_index_to_name, cdu_index_to_name, cdu_pos + + +def load_data(_paths, **kwargs): + druid_engine = get_druid_engine() + sim_config: ServerSimConfig = kwargs['sim_config'] + start, end = sim_config.start, sim_config.end + df = query_time_range( + "svc-ts-exadigit-data-fugaku", start, end, 'sdt', 'edt', + druid_engine = druid_engine, + parse_dates = ["adt", "qdt", "schedsdt", "deldt", "sdt", "edt"], + ) + return load_data_from_df(df, **kwargs) diff --git a/simulation_server/simulation/simulation.py b/simulation_server/simulation/simulation.py index fab7968..c6d513e 100644 --- a/simulation_server/simulation/simulation.py +++ b/simulation_server/simulation/simulation.py @@ -1,6 +1,7 @@ from typing import NamedTuple from datetime import datetime, timedelta import functools, itertools +import importlib, importlib.util import orjson from loguru import logger from raps import Engine @@ -13,7 +14,6 @@ from ..models.output import ( ) from ..util.misc import nest_dict from . import SimException -# from .dataloaders import DATA_LOADERS class SimTickOutput(NamedTuple): @@ -53,7 +53,14 @@ def snap_sample_rate(desired_rate: int, actual_rate: int): def run_simulation(sim_config: ServerSimConfig): - # TODO: replay logic + if sim_config.replay: + if not isinstance(sim_config.system, str): + raise SimException(f"replay is not supported for custom systems") + dataloader = f"simulation_server.simulation.dataloaders.{sim_config.system}" + if not importlib.util.find_spec(dataloader): + raise SimException(f"{sim_config.system} does not support replay") + sim_config = sim_config.model_copy(update = {"dataloader": dataloader}) + engine = Engine(sim_config) running_stats = RunningStats(engine) diff --git a/simulation_server/util/dataloader.py b/simulation_server/util/dataloader.py new file mode 100644 index 0000000..df91915 --- /dev/null +++ b/simulation_server/util/dataloader.py @@ -0,0 +1,35 @@ +from datetime import datetime, timedelta +import pandas as pd +import numpy as np +import sqlalchemy as sqla +from .druid import to_timestamp, get_table +from ..simulation import SimException + + +def query_time_range( + tbl_name: str, + start: datetime, end: datetime, + start_col: str, end_col: str, *, + druid_engine, parse_dates: list[str], +) -> pd.DataFrame: + """ Queries a time range in druid. Returns a dataframe, throws if empty. 
""" + tbl = get_table(tbl_name, druid_engine) + query = ( + sqla.select(sqla.text("*")) + .where( + # __time is submission time + (tbl.c['__time'] <= to_timestamp(end)) & + (tbl.c['__time'] >= to_timestamp(start - timedelta(days=7))) & + (tbl.c[start_col] <= to_timestamp(end)) & + (tbl.c[end_col] >= to_timestamp(start)) + ) + ) + df = pd.read_sql(query, druid_engine, parse_dates=parse_dates) + if len(df) == 0: + raise SimException(f"No data found for {start.isoformat()} -> {end.isoformat()}") + return df + + +def split_list(x): + x = x.split(",") if x else [] + return np.array([int(x) for x in x]) -- GitLab From 658f9a0587930d05eb1f077c14d1280256777fe6 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Wed, 22 Oct 2025 14:27:39 -0400 Subject: [PATCH 57/77] Update submodules --- raps | 2 +- simulation_dashboard | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/raps b/raps index f75cb91..ef2ce66 160000 --- a/raps +++ b/raps @@ -1 +1 @@ -Subproject commit f75cb91a43e4a22c7f3a0cbd20eae4f4f5745338 +Subproject commit ef2ce667c5b42b1897370603f531017523d3fa84 diff --git a/simulation_dashboard b/simulation_dashboard index f1b0bd8..9b53bae 160000 --- a/simulation_dashboard +++ b/simulation_dashboard @@ -1 +1 @@ -Subproject commit f1b0bd8c8538c3a713cda72dcbb6264ccf05ab2e +Subproject commit 9b53baede4fb8e08792194a3137f336fa46bc475 -- GitLab From a6f7fed5ae5aee05402de04b9db658c73839463c Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Wed, 22 Oct 2025 14:51:51 -0400 Subject: [PATCH 58/77] Remove commented code --- simulation_server/util/druid.py | 1 - 1 file changed, 1 deletion(-) diff --git a/simulation_server/util/druid.py b/simulation_server/util/druid.py index ef49a31..98f42fa 100644 --- a/simulation_server/util/druid.py +++ b/simulation_server/util/druid.py @@ -8,7 +8,6 @@ from loguru import logger import sqlalchemy as sqla from sqlalchemy.sql import ColumnElement from .misc import to_iso_duration -# from ..config import get_app_settings def get_druid_engine(**kwargs): -- GitLab From 1cb5312546d4740729ecbdd4dabdda22a09094ae Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Thu, 23 Oct 2025 15:09:02 -0400 Subject: [PATCH 59/77] Clean up stuck sims on local --- pyproject.toml | 1 + simulation_server/models/sim.py | 2 +- simulation_server/server/main.py | 23 +++++---- simulation_server/server/service.py | 76 ++++++++++++++--------------- simulation_server/util/k8s.py | 8 +-- 5 files changed, 53 insertions(+), 57 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 5bd5c8d..d6d1add 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,6 +31,7 @@ dependencies = [ "orjson==3.11.3", "confluent_kafka==2.11.1", "pyjson5==2.0.0", + "psutil==7.1.0", "raps@{root:uri}/raps", ] diff --git a/simulation_server/models/sim.py b/simulation_server/models/sim.py index d418088..766bc33 100644 --- a/simulation_server/models/sim.py +++ b/simulation_server/models/sim.py @@ -20,7 +20,7 @@ class Sim(BaseModel): user: Optional[str] = None """ User who launched the simulation """ - system: str + system: Optional[str] = None state: Optional[Literal['running', 'success', 'fail']] = None diff --git a/simulation_server/server/main.py b/simulation_server/server/main.py index d29df1a..e9d2128 100644 --- a/simulation_server/server/main.py +++ b/simulation_server/server/main.py @@ -22,12 +22,12 @@ settings = AppSettings() def repeat_task(func, seconds): - if not asyncio.iscoroutinefunction(func): - func = functools.partial(run_in_threadpool, func) - async def loop() -> None: while True: - await func() + 
try: + await func() + except Exception as e: + logger.exception(f"Background task failed: {e}") await asyncio.sleep(seconds) return asyncio.create_task(loop()) @@ -40,13 +40,13 @@ async def lifespan(api: FastAPI): for dep in deps: api.dependency_overrides.get(dep, dep)() - # TODO: Should add cleanup handler for local as well - background_task_loop = None - if settings.env == 'prod' and 'KUBERNETES_SERVICE_HOST' in os.environ: - background_task_loop = repeat_task( - lambda: cleanup_jobs(druid_engine = get_druid_engine(), kafka_producer = get_kafka_producer()), - seconds = 5 * 60, + async def background_task(): + cleanup_jobs( + druid_engine = get_druid_engine(), + kafka_producer = get_kafka_producer(), + settings = get_app_settings(), ) + background_task_loop = repeat_task(background_task, seconds = 5) if settings.env == 'dev': kafka_admin = get_kafka_admin() @@ -80,8 +80,7 @@ async def lifespan(api: FastAPI): yield - # if background_task_loop: - # background_task_loop.cancel() + background_task_loop.cancel() app = FastAPI( diff --git a/simulation_server/server/service.py b/simulation_server/server/service.py index 0e20615..cb8415d 100644 --- a/simulation_server/server/service.py +++ b/simulation_server/server/service.py @@ -1,5 +1,6 @@ from typing import Optional, Any from datetime import datetime, timedelta, timezone +import psutil import functools import uuid, time, json, base64, os, sys, subprocess import sqlalchemy as sqla @@ -15,14 +16,14 @@ from ..models.output import ( SCHEDULER_SIM_JOB_POWER_HISTORY_API_FIELDS, SCHEDULER_SIM_JOB_POWER_HISTORY_FIELD_SELECTORS, ) from ..util.misc import pick, omit -from ..util.k8s import submit_job, get_job, get_job_state, get_job_end_time +from ..util.k8s import submit_job, get_k8s_jobs, get_k8s_job_state, get_k8s_job_end_time from ..util.druid import to_timestamp, any_value, latest, execute_ignore_missing from ..util.api_queries import ( Filters, Sort, QuerySpan, Granularity, expand_field_selectors, DatetimeValidator, DEFAULT_FIELD_TYPES, ) from . import orm -from .config import AppDeps, AppSettings +from .config import AppDeps def wait_until_exists(stmt: sqla.Select, *, timeout: timedelta = timedelta(minutes=1), druid_engine: sqla.Engine): @@ -121,62 +122,57 @@ def run_simulation(sim_config: ServerSimConfig, deps: AppDeps): return sim -_sim_jobs_cache: dict[str, tuple[Any, datetime]] = {} -_sim_job_cache_expire = timedelta(minutes=5) -def get_sim_job(sim_id: str): - now = datetime.now() - # Expire old entries - for cid in list(_sim_jobs_cache.keys()): - if (now - _sim_jobs_cache[cid][1]) > _sim_job_cache_expire: - del _sim_jobs_cache[cid] - - if sim_id not in _sim_jobs_cache: - _sim_jobs_cache[sim_id] = (get_job(f"exadigit-simulation-server-{sim_id}"), now) - - return _sim_jobs_cache[sim_id][0] - - -def cleanup_jobs(druid_engine, kafka_producer): +def cleanup_jobs(druid_engine, kafka_producer, settings): """ If a simulation job dies unexpectedly (e.g. OOM error), it won't be able to send the kafka message marking the sim as complete, leaving the sim stuck as running. This task checks all running sim jobs and cleans them up if their job is dead. 
""" + if 'KUBERNETES_SERVICE_HOST' in os.environ and settings.env != 'prod': + # Skip job cleanup on stage/dev k8s deployments to avoid multiple instances of the server + # trying to cancel jobs + return logger.info(f"Checking for stuck jobs") now = datetime.now(timezone.utc) - threshold = timedelta(minutes=5) - - sims, _ = query_sims( + # threshold after job has ended before sending a cancel (incase the job did send its own + # cancel message and it just hasn't shown up in Druid yet) + threshold = timedelta(minutes=1) + + running_jobs = set() + if 'KUBERNETES_SERVICE_HOST' in os.environ: + for job in get_k8s_jobs(): + if job.metadata.name.startswith('exadigit-simulation-server-'): + sim_id = job.metadata.name.removeprefix('exadigit-simulation-server-') + # Add a little bit of threshold to avoid potentially sending duplicate fail messages + if get_k8s_job_state(job) == "running" or get_k8s_job_end_time(job) < now - threshold: + running_jobs.add(sim_id) + else: + for proc in psutil.Process().children(): + try: + if 'simulation_server.simulation.main' in proc.cmdline(): + sim_id = json.loads(proc.environ()["SIM"])['id'] + if proc.is_running(): + running_jobs.add(sim_id) + except (psutil.Error): + pass + + running_sims, _ = query_sims( filters=SIM_FILTERS(state = ["eq:running"]), - fields = ["id"], + fields = ["all"], limit = 1000, # If somehow there's more than that we'll just get them next trigger druid_engine = druid_engine, ) - - stuck_ids = [] - for sim in sims: - job = get_sim_job(sim.id) - job_state = get_job_state(job) - if job_state != 'running' and (not job or get_job_end_time(job) < now - threshold): - stuck_ids.append(sim.id) - - if stuck_ids: - stuck_sims, _ = query_sims( - filters = SIM_FILTERS(id = [f'one_of:{",".join(stuck_ids)}']), - fields = ['all'], - limit = len(stuck_ids), - druid_engine = druid_engine, - ) - - for sim in stuck_sims: + + for sim in running_sims: + if sim.id not in running_jobs and now - sim.execution_start > threshold: sim.state = 'fail' sim.execution_end = now sim.error_messages = "Simulation crashed" logger.warning(f"Marking stuck sim {sim.id} as failed") kafka_producer.produce("svc-event-exadigit-sim", sim.serialize_for_druid()) - for sim in stuck_sims: + # Block until saved to make sure we don't double-send stmt = ( sqla.select(orm.sim.c.id) .where(orm.sim.c.id == sim.id, orm.sim.c.state == 'fail') diff --git a/simulation_server/util/k8s.py b/simulation_server/util/k8s.py index 58c8fcf..cb6b1b2 100644 --- a/simulation_server/util/k8s.py +++ b/simulation_server/util/k8s.py @@ -17,9 +17,9 @@ def submit_job(job: dict): return get_batch_api().create_namespaced_job(namespace = get_namespace(), body = job) -def get_job(name: str): +def get_k8s_jobs(): try: - return get_batch_api().read_namespaced_job(namespace = get_namespace(), name = name) + return get_batch_api().list_namespaced_job(namespace = get_namespace()) except k8s.client.ApiException as e: if e.status == 404: return None @@ -27,7 +27,7 @@ def get_job(name: str): raise e -def get_job_state(job): +def get_k8s_job_state(job): if job: if job.status.succeeded: return 'success' @@ -39,6 +39,6 @@ def get_job_state(job): return 'deleted' -def get_job_end_time(job): +def get_k8s_job_end_time(job): # completion_time for failed jobs is null return job.status.completion_time or job.status.conditions[-1].last_transition_time -- GitLab From bfe86287a9ea05520053d037cfb2460676162d3c Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Thu, 23 Oct 2025 15:21:12 -0400 Subject: [PATCH 60/77] Lassen replay --- 
.../simulation/dataloaders/lassen.py | 51 +++++++++++++++++++ simulation_server/util/dataloader.py | 2 +- 2 files changed, 52 insertions(+), 1 deletion(-) create mode 100644 simulation_server/simulation/dataloaders/lassen.py diff --git a/simulation_server/simulation/dataloaders/lassen.py b/simulation_server/simulation/dataloaders/lassen.py new file mode 100644 index 0000000..a27a1ad --- /dev/null +++ b/simulation_server/simulation/dataloaders/lassen.py @@ -0,0 +1,51 @@ +import pandas as pd +import sqlalchemy as sqla +from datetime import timedelta + +from ...util.druid import get_druid_engine, get_table, to_timestamp +from ...util.dataloader import query_time_range +from ...models.sim import ServerSimConfig + +# Re-use these from the raps fugaku dataloader +from raps.dataloaders.lassen import load_data_from_df, node_index_to_name, cdu_index_to_name, cdu_pos + + +def load_data(_paths, **kwargs): + druid_engine = get_druid_engine() + sim_config: ServerSimConfig = kwargs['sim_config'] + start, end = sim_config.start, sim_config.end + + allocation_df = query_time_range( + "svc-ts-exadigit-data-lassen-allocation-history", + start, end, 'begin_time', 'end_time', + druid_engine = druid_engine, + parse_dates = ["begin_time", "end_time", "job_submit_time"], + ) + # load_data_from_df expects naive datetimes + allocation_df["begin_time"] = allocation_df["begin_time"].dt.tz_localize(None) + allocation_df["end_time"] = allocation_df["end_time"].dt.tz_localize(None) + allocation_df["job_submit_time"] = allocation_df["job_submit_time"].dt.tz_localize(None) + + tbl = get_table("svc-ts-exadigit-data-lassen-node-history", druid_engine) + node_query = ( + sqla.select(sqla.text("*")) + .where( + (tbl.c['__time'] <= to_timestamp(end)) & + (tbl.c['__time'] >= to_timestamp(start - timedelta(days=3))) + ) + ) + node_df = pd.read_sql(node_query, druid_engine) + + # step_df doesn't appear to actually be used by load_data_from_df? + step_df = query_time_range( + "svc-ts-exadigit-data-lassen-step-history", start, end, 'begin_time', 'end_time', + druid_engine = druid_engine, + parse_dates = ["begin_time", "end_time"], + ) + step_df["begin_time"] = step_df["begin_time"].dt.tz_localize(None) + step_df["end_time"] = step_df["end_time"].dt.tz_localize(None) + + return load_data_from_df( + allocation_df = allocation_df, node_df = node_df, step_df = step_df, + **kwargs, + ) diff --git a/simulation_server/util/dataloader.py b/simulation_server/util/dataloader.py index df91915..b494ca1 100644 --- a/simulation_server/util/dataloader.py +++ b/simulation_server/util/dataloader.py @@ -10,7 +10,7 @@ def query_time_range( tbl_name: str, start: datetime, end: datetime, start_col: str, end_col: str, *, - druid_engine, parse_dates: list[str], + druid_engine, parse_dates: list[str] = [], ) -> pd.DataFrame: """ Queries a time range in druid. Returns a dataframe, throws if empty. 
""" tbl = get_table(tbl_name, druid_engine) -- GitLab From 522bd5f19e6ad5e5cdc4042f8169456c10efef64 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Fri, 24 Oct 2025 13:23:07 -0400 Subject: [PATCH 61/77] Fix lassen rack error --- simulation_server/models/output.py | 4 ++++ simulation_server/simulation/simulation.py | 15 +++++++++------ 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/simulation_server/models/output.py b/simulation_server/models/output.py index f96f3c0..55187d4 100644 --- a/simulation_server/models/output.py +++ b/simulation_server/models/output.py @@ -143,6 +143,10 @@ class CoolingSimCDU(BaseModel): col: Optional[int] = None """ Col index of the cdu (Note this is the col of the neighboring cabinet.)""" + # TODO: RAPS supports any number of racks per CDU, while this is still hard-coded to the 3 in + # Frontier. This will work for any system with 3 or less. We need to rethink how the racks are + # stored in the DB, maybe a separate table. Or use an Array type for the field, but that makes + # timeseries aggregation queries harder. rack_1_power: Optional[float] = None rack_2_power: Optional[float] = None rack_3_power: Optional[float] = None diff --git a/simulation_server/simulation/simulation.py b/simulation_server/simulation/simulation.py index c6d513e..8095a6f 100644 --- a/simulation_server/simulation/simulation.py +++ b/simulation_server/simulation/simulation.py @@ -181,15 +181,18 @@ def run_simulation(sim_config: ServerSimConfig): cooling_sim_cdu_map: dict[int, dict] = {} if tick.power_df is not None and (is_last_tick or unix_timestamp % sample_power == 0): + # TODO: RAPS supports any number of racks per CDU, while this is still hard-coded to the + # 3 in Frontier. This will work for any system with 3 or less. We need to rethink how + # the racks are stored in the DB, maybe a separate table for i, point in tick.power_df.iterrows(): cooling_sim_cdu_map[int(point['CDU'])] = { - "rack_1_power": point['Rack 1'], - "rack_2_power": point['Rack 2'], - "rack_3_power": point['Rack 3'], + "rack_1_power": point.get('Rack 1'), + "rack_2_power": point.get('Rack 2'), + "rack_3_power": point.get('Rack 3'), "total_power": point['Sum'], - "rack_1_loss": point['Loss 1'], - "rack_2_loss": point['Loss 2'], - "rack_3_loss": point['Loss 3'], + "rack_1_loss": point.get('Loss 1'), + "rack_2_loss": point.get('Loss 2'), + "rack_3_loss": point.get('Loss 3'), "total_loss": point['Loss'], } -- GitLab From 6edc24d3f84855923b33f9621abe28e65726d4a8 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Fri, 24 Oct 2025 13:58:42 -0400 Subject: [PATCH 62/77] Reduce interval of background cleanup task --- simulation_server/server/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/simulation_server/server/main.py b/simulation_server/server/main.py index e9d2128..174ac44 100644 --- a/simulation_server/server/main.py +++ b/simulation_server/server/main.py @@ -46,7 +46,7 @@ async def lifespan(api: FastAPI): kafka_producer = get_kafka_producer(), settings = get_app_settings(), ) - background_task_loop = repeat_task(background_task, seconds = 5) + background_task_loop = repeat_task(background_task, seconds = 2 * 60) if settings.env == 'dev': kafka_admin = get_kafka_admin() -- GitLab From e3a326a76204ff7f2ff71559b10339d1455a3abf Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Fri, 24 Oct 2025 14:22:58 -0400 Subject: [PATCH 63/77] Marconi100 replay working --- .../simulation/dataloaders/fugaku.py | 2 +- .../simulation/dataloaders/lassen.py | 2 +- 
.../simulation/dataloaders/marconi100.py | 24 +++++++++++++++++++ 3 files changed, 26 insertions(+), 2 deletions(-) create mode 100644 simulation_server/simulation/dataloaders/marconi100.py diff --git a/simulation_server/simulation/dataloaders/fugaku.py b/simulation_server/simulation/dataloaders/fugaku.py index 8b492d6..87b103c 100644 --- a/simulation_server/simulation/dataloaders/fugaku.py +++ b/simulation_server/simulation/dataloaders/fugaku.py @@ -2,7 +2,7 @@ from ...util.druid import get_druid_engine from ...util.dataloader import query_time_range from ...models.sim import ServerSimConfig -# Re-use these from the raps fugaku dataloader +# Re-use these from the raps dataloader from raps.dataloaders.fugaku import load_data_from_df, node_index_to_name, cdu_index_to_name, cdu_pos diff --git a/simulation_server/simulation/dataloaders/lassen.py b/simulation_server/simulation/dataloaders/lassen.py index a27a1ad..be1e52d 100644 --- a/simulation_server/simulation/dataloaders/lassen.py +++ b/simulation_server/simulation/dataloaders/lassen.py @@ -6,7 +6,7 @@ from ...util.druid import get_druid_engine, get_table, to_timestamp from ...util.dataloader import query_time_range from ...models.sim import ServerSimConfig -# Re-use these from the raps fugaku dataloader +# Re-use these from the raps dataloader from raps.dataloaders.lassen import load_data_from_df, node_index_to_name, cdu_index_to_name, cdu_pos diff --git a/simulation_server/simulation/dataloaders/marconi100.py b/simulation_server/simulation/dataloaders/marconi100.py new file mode 100644 index 0000000..e4fcad0 --- /dev/null +++ b/simulation_server/simulation/dataloaders/marconi100.py @@ -0,0 +1,24 @@ +from ...util.druid import get_druid_engine +from ...util.dataloader import query_time_range, split_list +from ...models.sim import ServerSimConfig + +# Re-use these from the raps dataloader +from raps.dataloaders.marconi100 import load_data_from_df, node_index_to_name, cdu_index_to_name, cdu_pos + + +def load_data(_paths, **kwargs): + druid_engine = get_druid_engine() + sim_config: ServerSimConfig = kwargs['sim_config'] + start, end = sim_config.start, sim_config.end + df = query_time_range( + "svc-ts-exadigit-data-marconi100", + start, end, 'start_time', 'end_time', + druid_engine = druid_engine, + parse_dates = ["submit_time", "start_time", "end_time", "eligible_time"], + ) + df['nodes'] = df['nodes'].map(split_list) + df['node_power_consumption'] = df['node_power_consumption'].map(split_list) + df['mem_power_consumption'] = df['mem_power_consumption'].map(split_list) + df['cpu_power_consumption'] = df['cpu_power_consumption'].map(split_list) + + return load_data_from_df(df, **kwargs) -- GitLab From f166bb3fbbbc03d1707eebae744a0492ea2095a4 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Fri, 24 Oct 2025 14:48:37 -0400 Subject: [PATCH 64/77] Update raps submodule --- raps | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/raps b/raps index ef2ce66..b0364e5 160000 --- a/raps +++ b/raps @@ -1 +1 @@ -Subproject commit ef2ce667c5b42b1897370603f531017523d3fa84 +Subproject commit b0364e568c171faa359a4cbfb56178e76c259707 -- GitLab From ad6f3d648afbd6f08a474881a6fffc0f3a0e47be Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Fri, 24 Oct 2025 16:37:04 -0400 Subject: [PATCH 65/77] Update ingests with s3 example --- druid_ingests/data-fugaku.json | 7 +++++++ druid_ingests/data-lassen-allocation-history.json | 6 ++++++ druid_ingests/data-lassen-node-history.json | 6 ++++++ druid_ingests/data-lassen-step-history.json | 6 ++++++ 
druid_ingests/data-marconi100.json | 6 ++++++ 5 files changed, 31 insertions(+) diff --git a/druid_ingests/data-fugaku.json b/druid_ingests/data-fugaku.json index e14ede2..8c3b30d 100644 --- a/druid_ingests/data-fugaku.json +++ b/druid_ingests/data-fugaku.json @@ -3,6 +3,13 @@ "spec": { "ioConfig": { "type": "index_parallel", + // "inputSource": { + // "type": "s3", + // "objectGlob": "**.parquet", + // "prefixes": [ + // "s3://scratch/raps-datasets/fugaku/" + // ] + // }, "inputSource": { "type": "local", "baseDir": "/data/fugaku/", diff --git a/druid_ingests/data-lassen-allocation-history.json b/druid_ingests/data-lassen-allocation-history.json index 04cf93e..076b2e0 100644 --- a/druid_ingests/data-lassen-allocation-history.json +++ b/druid_ingests/data-lassen-allocation-history.json @@ -3,6 +3,12 @@ "spec": { "ioConfig": { "type": "index_parallel", + // "inputSource": { + // "type": "s3", + // "prefixes": [ + // "s3://scratch/raps-datasets/lassen/final_csm_allocation_history_hashed.csv" + // ] + // }, "inputSource": { "type": "local", "baseDir": "/data/lassen/final_csm_allocation_history_hashed.csv", diff --git a/druid_ingests/data-lassen-node-history.json b/druid_ingests/data-lassen-node-history.json index 752e25e..a9a0e24 100644 --- a/druid_ingests/data-lassen-node-history.json +++ b/druid_ingests/data-lassen-node-history.json @@ -3,6 +3,12 @@ "spec": { "ioConfig": { "type": "index_parallel", + // "inputSource": { + // "type": "s3", + // "prefixes": [ + // "s3://scratch/raps-datasets/lassen/final_csm_allocation_node_history_with_time.csv" + // ] + // }, "inputSource": { "type": "local", "baseDir": "/data/lassen/final_csm_allocation_node_history_with_time.csv", diff --git a/druid_ingests/data-lassen-step-history.json b/druid_ingests/data-lassen-step-history.json index 5ff08dc..b3c445e 100644 --- a/druid_ingests/data-lassen-step-history.json +++ b/druid_ingests/data-lassen-step-history.json @@ -3,6 +3,12 @@ "spec": { "ioConfig": { "type": "index_parallel", + // "inputSource": { + // "type": "s3", + // "prefixes": [ + // "s3://scratch/raps-datasets/lassen/final_csm_step_history.csv" + // ] + // }, "inputSource": { "type": "local", "baseDir": "/data/lassen/final_csm_step_history.csv", diff --git a/druid_ingests/data-marconi100.json b/druid_ingests/data-marconi100.json index 16e3159..1e89d82 100644 --- a/druid_ingests/data-marconi100.json +++ b/druid_ingests/data-marconi100.json @@ -3,6 +3,12 @@ "spec": { "ioConfig": { "type": "index_parallel", + // "inputSource": { + // "type": "s3", + // "prefixes": [ + // "s3://scratch/raps-datasets/marconi100/" + // ] + // }, "inputSource": { "type": "local", "baseDir": "/data/marconi100/", -- GitLab From 18c6bdcde8ab470cc8a9bbe602a2bce241c0d558 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Fri, 24 Oct 2025 16:49:30 -0400 Subject: [PATCH 66/77] Update druid ingest script --- scripts/submit_data_ingests.py | 78 +++++++++++++++++++++++----------- 1 file changed, 53 insertions(+), 25 deletions(-) diff --git a/scripts/submit_data_ingests.py b/scripts/submit_data_ingests.py index 6a70426..962500c 100755 --- a/scripts/submit_data_ingests.py +++ b/scripts/submit_data_ingests.py @@ -1,12 +1,15 @@ #!/usr/bin/env python3 +""" +Submits the replay data ingests to druid. 
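+
+Example usage (ingest files are optional and default to every data-*.json ingest
+under ./druid_ingests; Druid connection info is read from the DRUID_URL,
+DRUID_USERNAME, and DRUID_PASSWORD environment variables, and prompted for
+otherwise):
+
+    ./scripts/submit_data_ingests.py druid_ingests/data-fugaku.json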
+""" from pathlib import Path import urllib.parse from typing import Any import time, os import pyjson5 import requests +import getpass, argparse from loguru import logger -import orjson class DruidApi: @@ -34,36 +37,61 @@ class DruidApi: def submit_ingest(druid: DruidApi, file): - ingest = pyjson5.loads(Path(file).read_text()) # using yaml as hack to allow comments logger.info(f"Submitting ingest for {file}...") - response = druid.request("POST", "/druid/indexer/v1/task", json = ingest) - task_id = response['task'] - logger.info(f"See {druid.url}/unified-console.html#tasks/task_id~{task_id} to view ingest progress.") - logger.info(f"Waiting for ingest{task_id} to complete...") - - status = "RUNNING" - while status == "RUNNING": - time.sleep(5) - response = druid.request("GET", f"/druid/indexer/v1/task/{task_id}/status") - status = response['status']['statusCode'] - if status != "SUCCESS": - raise ValueError(f"Ingest for {file} failed!") + ingest = pyjson5.loads(Path(file).read_text()) # using yaml as hack to allow comments + ingest_type = ingest['type'] + + if ingest_type == "kafka": + response = druid.request("POST", "/druid/indexer/v1/supervisor", json = ingest) + logger.info(f"Supervisor for {file} submitted") + logger.info(f"See {druid.url}/unified-console.html to view the streaming ingest.") else: - logger.info(f"Ingest for {file} finished.") + response = druid.request("POST", "/druid/indexer/v1/task", json = ingest) + task_id = response['task'] + + logger.info(f"See {druid.url}/unified-console.html#tasks/task_id~{task_id} to view ingest progress.") + logger.info(f"Waiting for ingest{task_id} to complete...") + + status = "RUNNING" + while status == "RUNNING": + time.sleep(5) + response = druid.request("GET", f"/druid/indexer/v1/task/{task_id}/status") + status = response['status']['statusCode'] + if status != "SUCCESS": + raise ValueError(f"Ingest for {file} failed!") + else: + logger.info(f"Ingest for {file} finished.") if __name__ == "__main__": - DRUID_URL = os.environ.get("DRUID_URL", "http://localhost:8888") - DRUID_USER = os.environ.get("DRUID_USER") or None # Convert "" to None - DRUID_PASSWORD = os.environ.get("DRUID_PASSWORD") or None + parser = argparse.ArgumentParser( + description = __doc__.strip(), + formatter_class = argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument("ingests", type = Path, nargs = "*", help = "List of druid ingests") + args = parser.parse_args() - druid = DruidApi(DRUID_URL, DRUID_USER, DRUID_PASSWORD) + if not args.ingests: + ingests = sorted(Path("./druid_ingests").resolve().glob("data-*.json")) + else: + ingests = [Path(p).resolve() for p in args.ingests] - submit_ingest(druid, "./druid_ingests/data-marconi100.json") - submit_ingest(druid, "./druid_ingests/data-lassen-allocation-history.json") - submit_ingest(druid, "./druid_ingests/data-lassen-node-history.json") - submit_ingest(druid, "./druid_ingests/data-lassen-step-history.json") - submit_ingest(druid, "./druid_ingests/data-fugaku.json") + druid_url = os.environ.get("DRUID_URL") + if not druid_url: + druid_url = input("Druid URL (http://localhost:8888): ") + druid_url = druid_url.strip() or "http://localhost:8888" - logger.info("Done!") + druid_username = os.environ.get("DRUID_USERNAME") + if not druid_username: + druid_username = input("Druid Username: ").strip() or None + + druid_password = os.environ.get("DRUID_PASSWORD") + if not druid_password: + druid_password = getpass.getpass("Druid Password: ").strip() or None + + druid = DruidApi(druid_url, druid_username, 
druid_password) + for ingest in ingests: + submit_ingest(druid, ingest) + + logger.info("Done!") -- GitLab From 35e75ba66ef0b4074644274af44a44e805cd99db Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Fri, 24 Oct 2025 17:02:06 -0400 Subject: [PATCH 67/77] Fix kafka cred configs --- simulation_server/util/kafka.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/simulation_server/util/kafka.py b/simulation_server/util/kafka.py index cd20b38..b366195 100644 --- a/simulation_server/util/kafka.py +++ b/simulation_server/util/kafka.py @@ -9,8 +9,8 @@ def _get_kafka_config(): 'bootstrap.servers': os.environ['KAFKA_BOOTSTRAP'], 'sasl.mechanism': os.environ.get('KAFKA_SASL_MECHANISM'), 'security.protocol': os.environ.get('KAFKA_SECURITY_PROTOCOL'), - 'sasl.plain.username': os.environ.get('KAFKA_SASL_USERNAME'), - 'sasl.plain.password': os.environ.get('KAFKA_SASL_PASSWORD'), + 'sasl.username': os.environ.get('KAFKA_SASL_USERNAME'), + 'sasl.password': os.environ.get('KAFKA_SASL_PASSWORD'), } return {k: v for k, v in env_config.items() if v is not None} -- GitLab From 46484149667a77f1e46a05c7a83922b8ff0d2b8c Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Tue, 28 Oct 2025 11:20:43 -0400 Subject: [PATCH 68/77] Improve execute_ignore_missing --- simulation_server/util/druid.py | 34 +++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/simulation_server/util/druid.py b/simulation_server/util/druid.py index 98f42fa..52377d7 100644 --- a/simulation_server/util/druid.py +++ b/simulation_server/util/druid.py @@ -122,6 +122,23 @@ earliest = _size_func(sqla.func.earliest) earliest_py = _size_func(sqla.func.earliest_py) +def table_is_ready(engine, tbl: str): + """ + sqla.inspect(conn.engine) returns tables that have streaming ingestion but no data yet and so + still cause errors. Only workaround to check for this I've found is to just check for the error. + + This only seems to be an issue on older versions of Druid + """ + try: + return sqla.inspect(engine).has_table(tbl) + except Exception as e: + # druid throws errors like "has_table() got an unexpected keyword argument 'info_cache'" + if "has_table" in str(e): + return False + else: + raise e + + def execute_ignore_missing(conn, stmt) -> sqla.CursorResult: """ Wrapper conn.execute that handles missing tables. 
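For context, a typical call site just wraps an ordinary SQLAlchemy select and gets back an empty result, rather than an error, while the datasource hasn't been created yet. An illustrative sketch, modeled on the sim queries in `server/service.py`:

```python
import sqlalchemy as sqla
from simulation_server.server import orm
from simulation_server.util.druid import get_druid_engine, execute_ignore_missing

engine = get_druid_engine()
with engine.connect() as conn:
    stmt = sqla.select(orm.sim.c.id).where(orm.sim.c.state == 'running')
    # Returns [] instead of raising if the sim datasource doesn't exist yet
    rows = execute_ignore_missing(conn, stmt).all()
```
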
@@ -132,14 +149,15 @@ def execute_ignore_missing(conn, stmt) -> sqla.CursorResult: try: return conn.execute(stmt) except Exception as e: - existing_tables = set(sqla.inspect(conn.engine).get_table_names()) - stmt_tables = set([t.name for t in stmt.get_final_froms()]) - missing_tables = stmt_tables - existing_tables - if missing_tables: - logger.info(f"table(s) {', '.join(stmt_tables)} missing, returning empty result") - return conn.execute(sqla.text("SELECT 1 FROM (VALUES (1)) AS tbl(a) WHERE 1 != 1")) - else: - raise e + try: + stmt_tables = set([t.name for t in stmt.get_final_froms()]) + missing_tables = [tbl for tbl in stmt_tables if not table_is_ready(conn.engine, tbl)] + if missing_tables: + logger.info(f"table(s) {', '.join(missing_tables)} missing, returning empty result") + return conn.execute(sqla.text("SELECT 1 FROM (VALUES (1)) AS tbl(a) WHERE 1 != 1")) + except: + pass # Just raise the original error + raise e def submit_ingest(ingest: dict): -- GitLab From 6eb3c67a888a0bb6854080a9f93e4d6f44cc7744 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Tue, 28 Oct 2025 11:27:55 -0400 Subject: [PATCH 69/77] Fix deployment ports --- deployment.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/deployment.yaml b/deployment.yaml index 2c3e37e..1d9cf47 100644 --- a/deployment.yaml +++ b/deployment.yaml @@ -41,6 +41,8 @@ objects: env: - name: EXADIGIT_ENV value: ${ENV} + - name: EXADIGIT_HTTP_PORT + value: "8080" - name: EXADIGIT_ROOT_PATH value: "/exadigit/api" - name: EXADIGIT_DEBUG_MODE -- GitLab From 981108072eae19be4a30d296e1f0fddddf023701 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Tue, 28 Oct 2025 11:28:07 -0400 Subject: [PATCH 70/77] Update deployment script --- scripts/deploy.sh | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/scripts/deploy.sh b/scripts/deploy.sh index 02aaea0..4f9db50 100755 --- a/scripts/deploy.sh +++ b/scripts/deploy.sh @@ -1,31 +1,28 @@ #!/bin/bash # Deploy the pod. Pass the environment (prod or stage) you want to deploy to set -e # Exit if any commmand fails +BASE_DIR=$(realpath $(dirname "${BASH_SOURCE[0]}")/..) +cd "$BASE_DIR" REGISTRY="registry.apps.marble.ccs.ornl.gov/stf218-app" -BASE_DIR=$(realpath $(dirname "${BASH_SOURCE[0]}")/..) -cd "$BASE_DIR" ENV=$1 if [ "$ENV" != "prod" ] && [ "$ENV" != "stage" ]; then echo 'You need to pass either "prod" or "stage"' exit fi -SERVER_IMAGE_STREAM="$REGISTRY/exadigit-simulation-server" -JOB_IMAGE_STREAM="$REGISTRY/exadigit-simulation-server-simulation-job" -docker build -t $SERVER_IMAGE_STREAM:latest -f Dockerfile.server . -docker build -t $JOB_IMAGE_STREAM:latest -f Dockerfile.simulation . +SERVER_IMAGE_STREAM="$REGISTRY/exadigit-simulation-server" +docker build -t $SERVER_IMAGE_STREAM:latest -f Dockerfile . docker push $SERVER_IMAGE_STREAM:latest -docker push $JOB_IMAGE_STREAM:latest SERVER_IMAGE=$(docker inspect --format='{{index .RepoDigests 0}}' $SERVER_IMAGE_STREAM:latest) -JOB_IMAGE=$(docker inspect --format='{{index .RepoDigests 0}}' $JOB_IMAGE_STREAM:latest) +echo "$SERVER_IMAGE" # Scale down so pod gets recreated and uses new image. 
Allow error if pod doesn't exist oc --namespace stf218-app scale deploy -l env=$ENV,app=exadigit-simulation-server --replicas=0 || true # Process template and apply oc process -f ./deployment.yaml -o yaml \ - --param=ENV=$ENV --param=SERVER_IMAGE="$SERVER_IMAGE" --param=JOB_IMAGE="$JOB_IMAGE" \ + --param=ENV=$ENV --param=SERVER_IMAGE="$SERVER_IMAGE" --param=JOB_IMAGE="$SERVER_IMAGE" \ | oc apply -f - -- GitLab From a2c2148681e335188db8e813dd73029401c6b874 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Tue, 28 Oct 2025 11:38:31 -0400 Subject: [PATCH 71/77] Fixes to cleanup task --- simulation_server/server/service.py | 29 ++++++++++++++--------------- simulation_server/util/k8s.py | 4 ++-- 2 files changed, 16 insertions(+), 17 deletions(-) diff --git a/simulation_server/server/service.py b/simulation_server/server/service.py index cb8415d..7be5aae 100644 --- a/simulation_server/server/service.py +++ b/simulation_server/server/service.py @@ -16,7 +16,7 @@ from ..models.output import ( SCHEDULER_SIM_JOB_POWER_HISTORY_API_FIELDS, SCHEDULER_SIM_JOB_POWER_HISTORY_FIELD_SELECTORS, ) from ..util.misc import pick, omit -from ..util.k8s import submit_job, get_k8s_jobs, get_k8s_job_state, get_k8s_job_end_time +from ..util.k8s import submit_job, get_k8s_job, get_k8s_job_state, get_k8s_job_end_time from ..util.druid import to_timestamp, any_value, latest, execute_ignore_missing from ..util.api_queries import ( Filters, Sort, QuerySpan, Granularity, expand_field_selectors, DatetimeValidator, @@ -137,16 +137,22 @@ def cleanup_jobs(druid_engine, kafka_producer, settings): now = datetime.now(timezone.utc) # threshold after job has ended before sending a cancel (incase the job did send its own # cancel message and it just hasn't shown up in Druid yet) - threshold = timedelta(minutes=1) + threshold = timedelta(minutes=2) + + running_sims, _ = query_sims( + filters=SIM_FILTERS(state = ["eq:running"]), + fields = ["all"], + limit = 1000, # If somehow there's more than that we'll just get them next trigger + druid_engine = druid_engine, + ) running_jobs = set() if 'KUBERNETES_SERVICE_HOST' in os.environ: - for job in get_k8s_jobs(): - if job.metadata.name.startswith('exadigit-simulation-server-'): - sim_id = job.metadata.name.removeprefix('exadigit-simulation-server-') - # Add a little bit of threshold to avoid potentially sending duplicate fail messages - if get_k8s_job_state(job) == "running" or get_k8s_job_end_time(job) < now - threshold: - running_jobs.add(sim_id) + for sim in running_sims: + job = get_k8s_job(f"exadigit-simulation-server-{sim.id}") + # Add a little bit of threshold to avoid potentially sending duplicate fail messages + if get_k8s_job_state(job) == "running" or (job and now - get_k8s_job_end_time(job) < threshold): + running_jobs.add(sim.id) else: for proc in psutil.Process().children(): try: @@ -157,13 +163,6 @@ def cleanup_jobs(druid_engine, kafka_producer, settings): except (psutil.Error): pass - running_sims, _ = query_sims( - filters=SIM_FILTERS(state = ["eq:running"]), - fields = ["all"], - limit = 1000, # If somehow there's more than that we'll just get them next trigger - druid_engine = druid_engine, - ) - for sim in running_sims: if sim.id not in running_jobs and now - sim.execution_start > threshold: sim.state = 'fail' diff --git a/simulation_server/util/k8s.py b/simulation_server/util/k8s.py index cb6b1b2..bf67a38 100644 --- a/simulation_server/util/k8s.py +++ b/simulation_server/util/k8s.py @@ -17,9 +17,9 @@ def submit_job(job: dict): return 
get_batch_api().create_namespaced_job(namespace = get_namespace(), body = job) -def get_k8s_jobs(): +def get_k8s_job(name): try: - return get_batch_api().list_namespaced_job(namespace = get_namespace()) + return get_batch_api().read_namespaced_job(namespace = get_namespace(), name = name) except k8s.client.ApiException as e: if e.status == 404: return None -- GitLab From 7982423d3725a571aa907bf361063a5a0dc54fbf Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Tue, 28 Oct 2025 13:53:41 -0400 Subject: [PATCH 72/77] Update submodule --- simulation_dashboard | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/simulation_dashboard b/simulation_dashboard index 9b53bae..7a51dc8 160000 --- a/simulation_dashboard +++ b/simulation_dashboard @@ -1 +1 @@ -Subproject commit 9b53baede4fb8e08792194a3137f336fa46bc475 +Subproject commit 7a51dc80955f871ede392b6a446cbee9fe30faba -- GitLab From f0220d85d25133c7eb71896837cd42498e50e1e9 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Tue, 28 Oct 2025 13:54:18 -0400 Subject: [PATCH 73/77] Update port in dashboard --- docker-compose.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 5830414..3a9dcd4 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -243,12 +243,12 @@ services: VITE_API_PATH: "http://localhost:8081" container_name: simulation-dashboard ports: - - "8080:80" + - "8080:8080" depends_on: simulation-server: condition: service_healthy healthcheck: - test: ["CMD-SHELL", "curl --fail -s http://localhost:80/index.html || exit 1"] + test: ["CMD-SHELL", "curl --fail -s http://localhost:8080/index.html || exit 1"] interval: 10s retries: 3 start_interval: 1s -- GitLab From ce70e75b49a5ace11fef44e43146d36a457a81bb Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Tue, 28 Oct 2025 14:30:06 -0400 Subject: [PATCH 74/77] Update submodule --- raps | 2 +- simulation_dashboard | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/raps b/raps index b0364e5..a028911 160000 --- a/raps +++ b/raps @@ -1 +1 @@ -Subproject commit b0364e568c171faa359a4cbfb56178e76c259707 +Subproject commit a0289115c913d517abb1b13d23d1e104cd2cbadf diff --git a/simulation_dashboard b/simulation_dashboard index 7a51dc8..cda7a32 160000 --- a/simulation_dashboard +++ b/simulation_dashboard @@ -1 +1 @@ -Subproject commit 7a51dc80955f871ede392b6a446cbee9fe30faba +Subproject commit cda7a32569089236cf923ad83733ec9c777b1605 -- GitLab From 180a07943d3fb8c79b5dfa4f5cb5f183d11e8dfe Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Wed, 29 Oct 2025 15:18:15 -0400 Subject: [PATCH 75/77] Fix get_job_hash Some fields in raps have changed to be numpy --- simulation_server/simulation/simulation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/simulation_server/simulation/simulation.py b/simulation_server/simulation/simulation.py index 8095a6f..bb68f4b 100644 --- a/simulation_server/simulation/simulation.py +++ b/simulation_server/simulation/simulation.py @@ -38,7 +38,7 @@ def get_job_hash(job: RapsJob): job.current_state.name, # Node list shouldn't change once set so just do len instead of serializing the large list len(job.scheduled_nodes) if job.scheduled_nodes else None, - ]) + ], option=orjson.OPT_SERIALIZE_NUMPY) def snap_sample_rate(desired_rate: int, actual_rate: int): -- GitLab From 4028c2612b4d5702191aba2881a8de5d856d4432 Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Thu, 30 Oct 2025 14:01:00 -0400 Subject: [PATCH 76/77] Add Frontier dataloader Had to remove 
elasticsearch-dbapi to resolve a dependency conflict. elasticsearch package needed to be updated to support numpy 2.0, but elasticsearch-dbapi only supported elasticsearch<7.14 --- pyproject.toml | 4 +- .../simulation/dataloaders/frontier.py | 67 +++++++++++++++ simulation_server/util/es.py | 82 ++++++++++--------- 3 files changed, 114 insertions(+), 39 deletions(-) create mode 100644 simulation_server/simulation/dataloaders/frontier.py diff --git a/pyproject.toml b/pyproject.toml index d6d1add..460ff26 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,8 +25,8 @@ dependencies = [ "uvicorn==0.35.0", "sqlparse==0.5.3", "kubernetes==33.1.0", - "elasticsearch==7.13.4", - "elasticsearch-dbapi==0.2.11", + "tenacity==9.1.2", + "elasticsearch==7.17.12", "requests==2.32.5", "orjson==3.11.3", "confluent_kafka==2.11.1", diff --git a/simulation_server/simulation/dataloaders/frontier.py b/simulation_server/simulation/dataloaders/frontier.py new file mode 100644 index 0000000..3b006cd --- /dev/null +++ b/simulation_server/simulation/dataloaders/frontier.py @@ -0,0 +1,67 @@ +from ...util.druid import get_druid_engine, get_table, to_timestamp +from ...util.dataloader import query_time_range +from ...util.es import get_nccs_cadence_es, es_sql_query +from ...models.sim import ServerSimConfig +from .. import SimException +import sqlalchemy as sqla +import pandas as pd + +# Re-use these from the raps dataloader +from raps.dataloaders.frontier import load_data_from_df, node_index_to_name, cdu_index_to_name, cdu_pos + + +def load_data(_paths, **kwargs): + # TODO: Should consider using LVA API instead of directly querying the DB for this + druid_engine = get_druid_engine() + es = get_nccs_cadence_es() + + sim_config: ServerSimConfig = kwargs['sim_config'] + start, end = sim_config.start, sim_config.end + + job_query = """ + SELECT + "allocation_id", "job_id", "slurm_version", "account", "group", "user", "name", + "time_limit", "time_submission", "time_eligible", "time_start", "time_end", "time_elapsed", + "node_count", xnames_str AS "xnames", "state_current", "state_reason", + "time_snapshot" + FROM "stf218.frontier.job-summary" + WHERE + (time_end IS NULL OR time_end > CONVERT(?, TIMESTAMP)) AND + (time_start IS NOT NULL AND time_start <= CONVERT(?, TIMESTAMP)) + """ + job_query_params = [start.isoformat(), end.isoformat()] + job_data = es_sql_query(es, job_query, job_query_params, fetch_size=500) + + job_df = pd.DataFrame(job_data) + job_df['time_snapshot'] = pd.to_datetime(job_df['time_snapshot']) + job_df["time_submission"] = pd.to_datetime(job_df["time_submission"]) + job_df["time_eligible"] = pd.to_datetime(job_df["time_eligible"]) + job_df["time_start"] = pd.to_datetime(job_df["time_start"]) + job_df["time_end"] = pd.to_datetime(job_df["time_end"]) + job_df['xnames'] = job_df['xnames'].map(lambda x: x.split(",") if x else []) + + job_profile_tbl = get_table("pub-ts-frontier-job-profile", druid_engine) + job_profile_query = ( + sqla.select( + job_profile_tbl.c['__time'].label("timestamp"), + job_profile_tbl.c.allocation_id, + job_profile_tbl.c.sum_cpu0_power, + job_profile_tbl.c.sum_gpu_power, + ) + .where( + to_timestamp(start) <= job_profile_tbl.c['__time'], + job_profile_tbl.c['__time'] < to_timestamp(end), + ) + ) + job_profile_df = pd.read_sql(job_profile_query, druid_engine, parse_dates=[ + "timestamp", + ]) + + from loguru import logger + logger.info(f"job_df {job_df}") + logger.info(f"job_profile_df {job_profile_df}") + + if (job_df.empty or job_profile_df.empty): + raise SimException(f"No 
telemetry data for {start.isoformat()} -> {end.isoformat()}") + + return load_data_from_df(job_df, job_profile_df, **kwargs) diff --git a/simulation_server/util/es.py b/simulation_server/util/es.py index 32caa98..979441a 100644 --- a/simulation_server/util/es.py +++ b/simulation_server/util/es.py @@ -1,45 +1,53 @@ """ Connection to Cadence ES """ -import os, json -import urllib.parse -from datetime import datetime -import sqlalchemy as sqla -from sqlalchemy.engine import Engine, create_engine +import os from elasticsearch import Elasticsearch -from es.elastic.sqlalchemy import ESDialect +import tenacity -def get_nccs_cadence_engine(**kwargs) -> Engine: - import sqlalchemy.types as types - from sqlalchemy.ext.compiler import compiles - - # For some reason sqla/pydruid renders `cast(col, sqla.TIMESTAMP)` to `CAST(col AS LONG)`. This - # is a manual override to make sqla render them properly. - cast_fixes = { - types.TIMESTAMP: "TIMESTAMP", - } - - for (sqla_type, override) in cast_fixes.items(): - compiles(sqla_type, "elasticsearch")(lambda type_, compiler, override=override, **kw: override) - - # We need to set retry_on_status to work around intermittent 401 errors from Cadence ES. - # The query params will get passed to the Elasticsearch client, but only some specific - # ones get parsed and the rest are left as strings. This monkey patch hacks elasticsearch-dbapi - # to parse retry_on_status. We can remove this if the AM team fixes the auth errors - import es.basesqlalchemy - es.basesqlalchemy.BaseESDialect._map_parse_connection_parameters['retry_on_status'] = json.loads - - URL = urllib.parse.urlparse(os.environ["NCCS_CADENCE_URL"]) - HOST, PORT = URL.netloc.split(":") +def get_nccs_cadence_es(): + URL = os.environ["NCCS_CADENCE_URL"] USER = os.environ["NCCS_CADENCE_USER"] PASSWORD = os.environ["NCCS_CADENCE_PASSWORD"] - # These get passed through to the internal Elasticsearch instance - QUERY_PARAMS = 'use_ssl=false&ssl_show_warn=false&verify_certs=false&retry_on_status=[502,503,504,401]' - - engine = create_engine(f'elasticsearch+{URL.scheme}://{USER}:{PASSWORD}@{HOST}:{PORT}{URL.path}?{QUERY_PARAMS}', **kwargs) - return engine - - -def to_timestamp(val: datetime): - return sqla.func.convert(val.isoformat(), sqla.literal_column('TIMESTAMP')) + return Elasticsearch( + URL, + http_auth=(USER, PASSWORD), + # TODO: we need to fix the self-signed certs on ES + use_ssl=False, + ssl_show_warn=False, + verify_certs=False, + ) + + +def es_sql_query(client: Elasticsearch, query: str, params: list = [], fetch_size = 100): + """ + Runs an SQL query against ES. Use `?` format for SQL params. 
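+
+    Returns the rows as a list of dicts keyed by column name, following the
+    cursor until all pages have been fetched. Example (illustrative, mirroring
+    the Frontier dataloader's call; the date is a placeholder):
+
+        rows = es_sql_query(
+            get_nccs_cadence_es(),
+            'SELECT "job_id" FROM "stf218.frontier.job-summary" WHERE time_start <= CONVERT(?, TIMESTAMP)',
+            ["2024-04-01T00:00:00"],
+            fetch_size = 500,
+        )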
+ """ + # Cadence ES is a bit flaky with intermittent 401 errors + @tenacity.retry( + stop = tenacity.stop_after_attempt(5), + wait = tenacity.wait_exponential(multiplier=0.5, min=1, max=30), + reraise = True, + ) + def _retry_query(query, params, cursor = None): + body = { + "query": query, + "params": params, + "fetch_size": fetch_size, + } + if cursor: + body["cursor"] = cursor + return client.sql.query(format = 'json', body = body) + + response = _retry_query(query, params) + rows = response['rows'] + cursor = response.get("cursor") + columns = [c['name'] for c in response['columns']] + while cursor: + response = _retry_query(query, params, cursor) + rows.extend(response['rows']) + cursor = response.get("cursor") + + rows = [dict(zip(columns, row)) for row in rows] + return rows -- GitLab From 44a8352021c67ff0872bb28c0075a0064c74f1bb Mon Sep 17 00:00:00 2001 From: Jesse Hines Date: Wed, 5 Nov 2025 15:42:23 -0500 Subject: [PATCH 77/77] Update submodules --- raps | 2 +- simulation_dashboard | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/raps b/raps index a028911..7c9840f 160000 --- a/raps +++ b/raps @@ -1 +1 @@ -Subproject commit a0289115c913d517abb1b13d23d1e104cd2cbadf +Subproject commit 7c9840f8c4272fdae1e926978eb1ae64d67ab350 diff --git a/simulation_dashboard b/simulation_dashboard index cda7a32..4de3411 160000 --- a/simulation_dashboard +++ b/simulation_dashboard @@ -1 +1 @@ -Subproject commit cda7a32569089236cf923ad83733ec9c777b1605 +Subproject commit 4de3411f04d978f0fc24ba79d7352bff5633df26 -- GitLab