diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000000000000000000000000000000000000..ffffb5c4142d1ed46288d07daad6c917399287e8 --- /dev/null +++ b/.flake8 @@ -0,0 +1,3 @@ +[flake8] +exclude = .git, __pycache__, venv*, simulation_results, third_party, models, .venv +max-line-length = 120 diff --git a/.gitignore b/.gitignore index fc862bb8aee7d54b18cca1dfa659581e9e9da4ce..3e87161fa5112e2302c1454258a841ba872cc772 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,11 @@ __pycache__ .venv venv +*.npz +*.prof +simulation_results/ +models/fmu-models +.shell-completion-cache +raps-output-* +ppo_raps_logs +/data diff --git a/.gitmodules b/.gitmodules index 6b553fa09dd6b6c99c0d379456b8196a7e7509f3..f18b68f51b837f27d7b9ec64a78b2ce3f337bab8 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,6 @@ +[submodule "third_party/ScheduleFlow"] + path = third_party/ScheduleFlow + url = https://github.com/whbrewer/ScheduleFlow [submodule "models/POWER9CSM"] path = models/POWER9CSM url = https://code.ornl.gov/exadigit/POWER9CSM.git diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8ea69f7ea2fa637850d6cba943a5ed293160e68c --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,11 @@ +repos: +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v6.0.0 + hooks: + - id: check-yaml + - id: end-of-file-fixer + - id: trailing-whitespace +- repo: https://github.com/pycqa/flake8 + rev: '7.3.0' # pick a git hash / tag to point to + hooks: + - id: flake8 diff --git a/CONTRIBUTORS.txt b/CONTRIBUTORS.txt index 77b218c3c745a1ed0ef1aea67a50b5728c690cdd..fd95582010e45ddba4325be4d9b3bc0b3b5fadf6 100644 --- a/CONTRIBUTORS.txt +++ b/CONTRIBUTORS.txt @@ -7,3 +7,9 @@ Matthias Maiterth (maiterthm@ornl.gov), Oak Ridge National Laboratory Sedrick Bouknight (bouknightsl@ornl.gov), Oak Ridge National Laboratory Jesse Hines (hinesjr@ornl.gov), Oak Ridge National Laboratory Jake Webb (webbtj@ornl.gov), Oak Ridge National Laboratory +Rashadul Kabir (rashadul.kabir@colostate.edu), Colorado State University +Bertrand Cirou (cirou@cines.fr), Centre Informatique National de l’Enseignement Supérieur +Kevin Menear (kmenear@nrel.gov), National Renewable Energy Laboratory +Tim Dykes (tim.dykes@hpe.com), Hewlett Packard Enterprise +Srishti Kalepu (skalepu3@gatech.edu), Georgia Institute of Technology +Damien Fay (damien.fay@hpe.com), Hewlett Packard Enterprise diff --git a/LICENSE-MIT b/LICENSE-MIT index b053dabcaecb0be9ad4018ea28f8dff935fa806e..9ca4d0ec8b618aea0d5130fa7bdb63d7e85f90bc 100644 --- a/LICENSE-MIT +++ b/LICENSE-MIT @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2023-2024 UT-Battelle, LLC and other exadigitUE5 Project Developers. +Copyright (c) 2023-2024 UT-Battelle, LLC and other exadigit/raps Project Developers. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/Makefile b/Makefile index 99a1adb34b1aa646076567bc5bad52a1b7592034..d66f02c7000b5a3640a60bb4491df6d84600727b 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,6 @@ #Makefile +SHELL := /bin/bash + .PHONY: pip run docker_build docker_run all: pip @@ -8,6 +10,8 @@ pip: run: python3 ./main.py +test: + pytest -n 8 IMAGE_NAME = raps @@ -17,3 +21,9 @@ docker_build: docker_run: docker run --platform linux/amd64 -it $(IMAGE_NAME) +fetch-fmu-models: + if [ ! 
-d ./models/fmu-models ]; then \ + git clone git@code.ornl.gov:exadigit/fmu-models.git ./models/fmu-models; \ + else \ + git -C ./models/fmu-models pull; \ + fi diff --git a/README.md b/README.md index a0a93f5ba081b2eb2b4ad66ce95caeeb6d257e0f..c87ac5ed3e180c116220d068ad2f7bb1c095362d 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,10 @@ # ExaDigiT/RAPS -ExaDigiT's Resource Allocator and Power Simulator (RAPS) schedules workloads and -estimates dynamic system power at specified time intervals. RAPS either schedules +ExaDigiT's Resource Allocator and Power Simulator (RAPS) schedules workloads and +estimates dynamic system power at specified time intervals. RAPS either schedules synthetic workloads or replays system telemetry workloads, provides system monitoring during simulation, and an outputs a report of scheduling -and power statistics at the end of the simulation. RAPS also can interface with +and power statistics at the end of the simulation. RAPS also can interface with the FMU cooling model by providing CDU-level power inputs to the cooling model, and reporting the statistics back to the user. RAPS also has built-in plotting capabilities to generate plots of power and cooling at the end of simulation runs. @@ -13,36 +13,99 @@ Instructions for setup and usage are given below. An online documentation of Exa ## Setup environment -Note: Requires python3.9 or greater. +Note: Requires python3.12 or greater. pip install -e . ## Usage and help menu - python main.py -h + raps run -h ## Run simulator with default synthetic workload - python main.py + raps run ## Run simulator with telemetry replay - # Frontier + # Frontier DATEDIR="date=2024-01-18" - DPATH=~/data/frontier-sample-2024-01-18 - python main.py -f $DPATH/slurm/joblive/$DATEDIR $DPATH/jobprofile/$DATEDIR + DPATH=/opt/data/frontier + raps run -f $DPATH/slurm/joblive/$DATEDIR,$DPATH/jobprofile/$DATEDIR ## Open Telemetry dataset For Marconi supercomputer, download `job_table.parquet` from https://zenodo.org/records/10127767 # Marconi100 - python main.py --system marconi100 -f ~/data/marconi100/job_table.parquet + raps run --system marconi100 -f /opt/data/marconi100/job_table.parquet For Adastra MI250 supercomputer, download 'AdastaJobsMI250_15days.parquet' from https://zenodo.org/records/14007065 # Adastra MI250 - python main.py --system adastraMI250 -f AdastaJobsMI250_15days.parquet + raps run --system adastraMI250 -f AdastaJobsMI250_15days.parquet + +For Google cluster trace v2 + + raps run --system gcloudv2 -f /opt/data/gcloud/v2/google_cluster_data_2011_sample --start '2011-05-02T00:10:00Z' + + # analyze dataset + raps telemetry --system gcloudv2 -f /opt/data/gcloud/v2/google_cluster_data_2011_sample -v + +For MIT Supercloud + + # Following is the directory that contains slurm-log.csv and cpu and gpu directories + DPATH=/path/to/mit/data + + # Download the dataset - note the first time will build a file-manifest.txt file with all the files on S3 + # this will take some time, but subsequent calls should be much faster. 
+ # Also, this command will dump output to `source_data` directory, or can specify directory using `--outdir` + python -m raps.dataloaders.mit_supercloud.cli download --start 2021-05-21T13:00 --end 2021-05-21T14:00 + + # Load data and run simulation - will save data as part-cpu.npz and part-gpu.npz files + raps run-parts -x mit_supercloud -f $DPATH --start 2021-05-21T13:00 --end 2021-05-21T14:00 + # or simply + raps run-parts experiments/mit-replay-25hrs.yaml + # Note: if no start, end dates provided will default to run 24 hours between + # 2021-05-21T00:00 to 2021-05-22T00:00 set by defaults in raps/dataloaders/mit_supercloud/utils.py + + # Re-run simulation using npz files (much faster load) + raps run-parts -x mit_supercloud -f part-*.npz + + # Synthetic tests for verification studies: + raps run-parts -x mit_supercloud -w multitenant + + # Reinforcement learning test case + raps train-rl --system mit_supercloud/part-cpu -f /opt/data/mit_supercloud/202201 + +Microsoft Azure - 2017 Philly Traces + + # Synthetic + python main.py run-parts -x philly -w multitenant + + # Telemetry replay + python main.py run-parts -x philly -f /opt/data/philly/trace-data --start 2017-10-03T00:14:56Z --end 2017-10-04T00:00 + +For Lumi + + # Synthetic test for Lumi: + raps run-parts -x lumi + +## Perform Network Simulation + +Lassen is one of the few datasets that has networking data. See `raps/dataloaders/lassen.py` for how to +get the datasets. To run a network simulation, use the following command: + + raps run -f /opt/data/lassen/Lassen-Supercomputer-Job-Dataset --system lassen --policy fcfs --backfill firstfit --start '2019-08-22T00:00:00+00:00' -t 12h --arrival poisson --net + +To simulate synthetic network tests: + + raps run --system lassen -w network_test --net -t 15m + + raps run --system lassen -w inter_job_congestion --net -t 15m + +Run network congestion tests outside of RAPS: + + python scripts/run_inter_job_congestion.py --config config/lassen.yaml -v ## Snapshot of extracted workload data @@ -50,8 +113,7 @@ To reduce the expense of extracting the needed data from the telemetry parquet f RAPS saves a snapshot of the extracted data in NPZ format. The NPZ file can be given instead of the parquet files for more quickly running subsequent simulations, e.g.: - python main.py -f jobs_2024-02-20_12-20-39.npz - + raps run -f jobs_2024-02-20_12-20-39.npz ## Cooling models @@ -62,46 +124,64 @@ We provide several cooling models in the repo https://code.ornl.gov/exadigit/POW Will install the POWER9CSM in the models folder. To activate cooling when running RAPS, use `--cooling` or `-c` argument. 
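The FMU files referenced by the cooling configs (e.g., `fmu_path: "../models/fmu-models/Frontier/Simulator_olcf5_base.fmu"` in `config/frontier.yaml`) can be fetched with the `fetch-fmu-models` Makefile target added in this change; this is a convenience note and assumes you have SSH access to code.ornl.gov:

    make fetch-fmu-models
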
e.g., - python main.py --system marconi100 -c + raps run --system marconi100 -c - python main.py --system lassen -c + raps run --system lassen -c - python main.py --system summit -c + raps run --system summit -c ## Support for multiple system partitions -Multi-partition systems are supported by running the `multi-part-sim.py` script, where a list of configurations can be specified using the `-x` flag as follows: +Multi-partition systems are supported by running the `raps run-parts` command, where a list of partitions can be specified using the `-x` flag as follows: - python multi-part-sim.py -x setonix/part-cpu setonix/part-gpu + raps run-parts -x setonix/part-cpu setonix/part-gpu or simply: - python multi-part-sim.py -x setonix/* # bash + raps run-parts -x setonix - python multi-part-sim.py -x 'setonix/*' # zsh +This will simulate synthetic workloads on two partitions as defined in `config/setonix-cpu` and `config/setonix-gpu`. To replay telemetry workloads from another system, e.g., Marconi100's PM100 dataset, first create a .npz snapshot of the telemetry data, e.g., -This will simulate synthetic workloads on two partitions as defined in `config/setonix-cpu` and `config/setonix-gpu`. To replay telemetry workloads from another system, e.g., Marconi100's PM100 dataset, first create a .npz snapshot of the telemetry data, e.g., + raps run-parts --system marconi100 -f /path/to/marconi100/job_table.parquet - python main.py --system marconi100 -f /path/to/marconi100/job_table.parquet +This will dump a .npz file with a randomized name, e.g. ac23db.npz. Let's rename this file to pm100.npz for clarity. Note: you can press Ctrl-C once the simulation starts. Now, this pm100.npz file can be used as follows: -This will dump a .npz file with a randomized name, e.g. ac23db.npz. Let's rename this file to pm100.npz for clarity. Note: can control-C when the simulation starts. Now, this pm100.npz file can be used with `multi-part-sim.py` as follows: + raps run-parts -x setonix -f pm100.npz --arrival poisson --scale 192 - python multi-part-sim.py -x setonix/* -f pm100.npz --reschedule poisson --scale 192 +## Modifications to telemetry replay -The `--reschedule` flag will use the internal scheduler to determine what nodes to schedule for each job, and the `--scale` flag will specify the maximum number of nodes for each job (generally set this to the max number of nodes of the smallest partition). +There are four ways to modify the replay of telemetry data: + +1. `--arrival`. Changing the arrival time distribution - replay cases will default to `--arrival prescribed`, where the jobs will be submitted exactly as they were submitted on the physical machine. This can be changed to `--arrival poisson` to change when the jobs arrive, which is especially useful in cases where there may be gaps in time, e.g., when the system goes down for several days, or the system is underutilized. + + raps run -f $DPATH/slurm/joblive/$DATEDIR,$DPATH/jobprofile/$DATEDIR --arrival poisson + +2. `--policy`. Changing the way the jobs are scheduled. The `--policy` flag will be set by default to `replay` in cases where a telemetry file is provided, in which case the jobs will be scheduled according to the start times provided. Changing the `--policy` to `fcfs` or `backfill` will use the internal scheduler, e.g.: + + raps run -f $DPATH/slurm/joblive/$DATEDIR,$DPATH/jobprofile/$DATEDIR --policy fcfs --backfill firstfit -t 12h + +3. `--scale`. Changing the scale of each job in the telemetry data. 
The `--scale` flag will specify the maximum number of nodes for each job (generally set this to the max number of nodes of the smallest partition), and randomly select the number of nodes for each job from one to max nodes. This flag is useful when replaying telemetry from a larger system onto a smaller system. + +4. `--shuffle`. Shuffle the jobs before playing. ## Job-level power output example for replay of single job - python main.py -f $DPATH/slurm/joblive/$DATEDIR $DPATH/jobprofile/$DATEDIR --jid 1234567 -o + raps run -f $DPATH/slurm/joblive/$DATEDIR,$DPATH/jobprofile/$DATEDIR --jid 1234567 -o ## Compute stats on telemetry data, e.g., average job arrival time - python -m raps.telemetry -f $DPATH/slurm/joblive/$DATEDIR $DPATH/jobprofile/$DATEDIR + raps telemetry -f $DPATH/slurm/joblive/$DATEDIR,$DPATH/jobprofile/$DATEDIR ## Build and run Docker container make docker_build && make docker_run +## Third party schedulers + +To install third-party schedulers, such as ScheduleFlow, run: + + git submodule update --init --recursive + ### Setup Simulation Server See instructions in [server/README.md](https://code.ornl.gov/exadigit/simulationserver) @@ -110,17 +190,58 @@ See instructions in [server/README.md](https://code.ornl.gov/exadigit/simulation See instructions in [dashboard/README.md](https://code.ornl.gov/exadigit/simulation-dashboard) +## Running Tests + +RAPS uses [pytest](https://docs.pytest.org/) for its test suite. +Before running tests, ensure that you have a valid data directory available (e.g., `/opt/data`) and set the environment variable `RAPS_DATA_DIR` to point to it. + +### Run all tests +```bash +RAPS_DATA_DIR=/opt/data pytest -n auto -x +``` + +By default, tests are parallelized with `pytest-xdist` (`-n auto`) to speed up execution. +The `-x` flag stops execution after the first failure. Add `-v` to run in verbose mode. + +### Run tests on multi-partition systems + +```bash +pytest -v -k "multi_part_sim" +``` + +### Run only network-related tests + +```bash +RAPS_DATA_DIR=/opt/data pytest -n auto -x -m network +``` + +See `pytest.ini` for the different options for `-m`. + +### Run a specific test file + +```bash +RAPS_DATA_DIR=/opt/data pytest tests/systems/test_engine.py +``` + +### Contributing Code + +Install pre-commit hooks as set by the project: +``` +pip install pre-commit +pre-commit install +``` + ## Authors -Many thanks to the contributors of ExaDigiT/RAPS. -The full list of contributors and organizations involved are found in CONTRIBUTORS.txt. +Many thanks to the contributors of ExaDigiT/RAPS. +The full list of contributors and organizations involved are found in CONTRIBUTORS.txt. ## Citation If you use ExaDigiT or RAPS in your research, please cite our work: @inproceedings{inproceedings, - title={A Digital Twin Framework for Liquid-cooled Supercomputers as Demonstrated at Exascale}, + title={A Digital Twin Framework for Liquid-cooled Supercomputers as Demonstrated at Exascale}, author={Brewer, Wesley and Maiterth, Matthias and Kumar, Vineet and Wojda, Rafal and Bouknight, Sedrick and Hines, Jesse and Shin, Woong and Greenwood, Scott and Grant, David and Williams, Wesley and Wang, Feiyi}, booktitle={SC24: International Conference for High Performance Computing, Networking, Storage and Analysis}, pages={1--18}, @@ -142,17 +263,16 @@ Thank you for your support! ## License -ExaDigiT/RAPS is distributed under the terms of both the MIT license and the Apache License (Version 2.0). -Users may choose either license, at their option. 
+ExaDigiT/RAPS is distributed under the terms of both the MIT license and the Apache License (Version 2.0). +Users may choose either license, at their option. -All new contributions must be made under both the MIT and Apache-2.0 licenses. -See LICENSE-MIT, LICENSE-APACHE, COPYRIGHT, NOTICE, and CONTRIBUTORS.txt for details. +All new contributions must be made under both the MIT and Apache-2.0 licenses. +See LICENSE-MIT, LICENSE-APACHE, COPYRIGHT, NOTICE, and CONTRIBUTORS.txt for details. -SPDX-License-Identifier: (Apache-2.0 OR MIT) +SPDX-License-Identifier: (Apache-2.0 OR MIT) ## Attributions Map data used in this project is provided by [OpenStreetMap](https://www.openstreetmap.org/copyright) and is available under the Open Database License (ODbL). © OpenStreetMap contributors. Weather data used in this project is provided by the [Open-Meteo API](https://open-meteo.com/en/docs). Open-Meteo offers free weather forecast data for various applications, and their API provides easy access to weather information without requiring user authentication. - diff --git a/api_client/README.md b/api_client/README.md index 87f7a79a12895a62efda2d08d5189f34390bc985..ac0f0288f9e005faeb700e4ddfdf489ae386941b 100644 --- a/api_client/README.md +++ b/api_client/README.md @@ -1,6 +1,20 @@ -API documentation availalbe at: https://exadigit.github.io/SimulationServer/ +API documentation available at: https://exadigit.github.io/SimulationServer/ +# Launch the simulation server + +sudo ./scripts/launch_local.sh + +# Population some initial simulations, e.g., + +python api_client.py run --system frontier --policy default --start 2024-01-01T00:00:00Z --end 2024-01-01T05:00:00Z --scheduler --scheduler-num-jobs 1000 --scheduler-seed 100 --scheduler-jobs-mode random +{'id': '2gr3nqgbmfapvlwhwzizgzbxr4', 'user': 'unknown', 'system': 'frontier', 'state': 'running', 'error_messages': None, 'start': '2024-01-01T00:00:00Z', 'end': '2024-01-01T00:10:00Z', 'execution_start': '2025-08-15T20:22:13.778590Z', 'execution_end': None, 'progress_date': '2024-01-01T00:00:00Z', 'progress': 0.0, 'config': {'start': '2024-01-01T00:00:00Z', 'end': '2024-01-01T00:10:00Z', 'system': 'frontier', 'scheduler': {'enabled': True, 'down_nodes': [], 'jobs_mode': 'random', 'schedule_policy': 'fcfs', 'reschedule': False, 'jobs': None, 'seed': 100, 'num_jobs': 1000}, 'cooling': {'enabled': False}}} + +# List the simulations + +python api_client.py list + +# export BASE_URL="https://myurl.com" -python get_api_token.py +#python get_api_token.py python api_client.py list python api_client.py details --id 5rkkb222xnge7c4ba4oxshqeha diff --git a/api_client/api_client.py b/api_client/api_client.py index c4f513ae12afe0f62acb40e66631190a807b20c0..f5af1b6ccd33d7602e306784e30e48c5b3cdb627 100644 --- a/api_client/api_client.py +++ b/api_client/api_client.py @@ -1,126 +1,217 @@ +#!/usr/bin/env python3 import os import argparse +import json import requests import pandas as pd from dotenv import load_dotenv -# Load environment variables +# ---------------------------------- +# Environment / configuration +# ---------------------------------- load_dotenv() -URL = os.getenv("BASE_URL") -RAPS_URL = os.path.join(URL, "exadigit/api") +# BASE_URL from env if you want, else localhost default +URL = os.getenv("BASE_URL", "http://localhost:8080") +# ---------------------------------- +# Auth / HTTP helpers +# ---------------------------------- def read_token(): - with open('.api-token', 'r') as token_file: - return token_file.read().strip() + token_path = ".api-token" + if 
os.path.exists(token_path): + try: + with open(token_path, "r") as token_file: + token = token_file.read().strip() + if token: + return token + except OSError as e: + print(f"Warning: Could not read token file: {e}") + # Fallback for localhost or dev use + return "xyz123" + def call_api(endpoint, method="GET", params=None, data=None): - TOKEN = read_token() - url = f"{RAPS_URL}{endpoint}" - headers = {"Authorization": f"Bearer {TOKEN}"} - - response = requests.request(method, url, headers=headers, params=params, json=data) - - if response.status_code == 200: - return response.json() + token = read_token() + url = f"{URL}{endpoint}" + headers = {"Authorization": f"Bearer {token}"} + + try: + resp = requests.request(method, url, headers=headers, params=params, json=data) + except requests.RequestException as e: + print(f"Request error: {e}") + return None + + if resp.status_code == 200: + # handle empty 200 + if not resp.content: + return None + try: + return resp.json() + except ValueError: + print("Error: Response was 200 but not JSON") + return None else: - print(f"Error: {response.status_code} - {response.text}") + print(f"Error: {resp.status_code} - {resp.text}") return None +# ---------------------------------- +# Command handlers +# ---------------------------------- def handle_run(args): - data = {"system": args.system, "policy": args.policy, "parameters": args.parameters} - response = call_api('/simulation/run', method="POST", data=data) + # Build nested payload while omitting keys the user didn’t set + data = { + "start": args.start, + "end": args.end, + "system": args.system, + "policy": args.policy, + "parameters": args.parameters or {}, + } + + scheduler = { + "enabled": args.scheduler_enabled, + "num_jobs": args.scheduler_num_jobs, + "seed": args.scheduler_seed, + "jobs_mode": args.scheduler_jobs_mode, + } + scheduler = {k: v for k, v in scheduler.items() if v is not None} + if scheduler: + data["scheduler"] = scheduler + + cooling = { + "enabled": args.cooling_enabled, + } + cooling = {k: v for k, v in cooling.items() if v is not None} + if cooling: + data["cooling"] = cooling + + response = call_api("/simulation/run", method="POST", data=data) print(response) def handle_list(args): - response = call_api('/simulation/list') + response = call_api("/simulation/list") if response: - results = response.get('results', []) + results = response.get("results", []) + if not results: + print("No simulations found.") + return df = pd.DataFrame(results) - #pd.set_option('display.max_columns', None) - #pd.set_option('display.max_colwidth', None) - #pd.set_option('display.width', None) + # Feel free to uncomment for wider console displays: + # pd.set_option('display.max_columns', None) + # pd.set_option('display.max_colwidth', None) + # pd.set_option('display.width', None) print(df) def handle_simulation_details(args): - response = call_api(f'/simulation/{args.id}') + response = call_api(f"/simulation/{args.id}") print(response) def handle_cooling_cdu(args): - response = call_api(f'/simulation/{args.id}/cooling/cdu') + response = call_api(f"/simulation/{args.id}/cooling/cdu") print(response) def handle_cooling_cep(args): - response = call_api(f'/simulation/{args.id}/cooling/cep') + response = call_api(f"/simulation/{args.id}/cooling/cep") print(response) def handle_scheduler_jobs(args): - response = call_api(f'/simulation/{args.id}/scheduler/jobs') + response = call_api(f"/simulation/{args.id}/scheduler/jobs") print(response) def handle_power_history(args): - response = 
call_api(f'/simulation/{args.id}/scheduler/jobs/{args.job_id}/power-history') + response = call_api(f"/simulation/{args.id}/scheduler/jobs/{args.job_id}/power-history") print(response) def handle_scheduler_system(args): - response = call_api(f'/simulation/{args.id}/scheduler/system') + response = call_api(f"/simulation/{args.id}/scheduler/system") print(response) def handle_system_info(args): - response = call_api(f'/system-info/{args.system}') + response = call_api(f"/system-info/{args.system}") print(response) -def main(): +# ---------------------------------- +# CLI +# ---------------------------------- +def build_parser(): parser = argparse.ArgumentParser(description="Interact with the SimulationServer REST API.") subparsers = parser.add_subparsers(title="commands", dest="command") - + # Run simulation run_parser = subparsers.add_parser("run", help="Run a simulation.") + + # Top-level options run_parser.add_argument("--system", required=True, help="System to run the simulation on.") run_parser.add_argument("--policy", required=True, help="Policy to use.") - run_parser.add_argument("--parameters", type=dict, default={}, help="Simulation parameters.") + run_parser.add_argument( + "--parameters", + type=json.loads, + default={}, + help='Simulation parameters as JSON, e.g. \'{"alpha":0.1,"beta":"x"}\'', + ) + run_parser.add_argument("--start", required=True, help="ISO time, e.g. 2024-01-01T00:00:00Z") + run_parser.add_argument("--end", required=True, help="ISO time, e.g. 2024-01-01T00:10:00Z") + + # Scheduler group + sched_grp = run_parser.add_argument_group("scheduler options") + sched_grp.add_argument("--scheduler", dest="scheduler_enabled", action="store_true", help="Enable scheduler.") + sched_grp.add_argument("--no-scheduler", dest="scheduler_enabled", action="store_false", help="Disable scheduler.") + sched_grp.set_defaults(scheduler_enabled=None) # omit if unspecified + sched_grp.add_argument("--scheduler-num-jobs", type=int, help="Number of jobs.") + sched_grp.add_argument("--scheduler-seed", type=int, help="Random seed.") + sched_grp.add_argument("--scheduler-jobs-mode", choices=["random", "sequential"], help="Jobs mode.") + + # Cooling group + cool_grp = run_parser.add_argument_group("cooling options") + cool_grp.add_argument("--cooling", dest="cooling_enabled", action="store_true", help="Enable cooling.") + cool_grp.add_argument("--no-cooling", dest="cooling_enabled", action="store_false", help="Disable cooling.") + cool_grp.set_defaults(cooling_enabled=None) # omit if unspecified + run_parser.set_defaults(func=handle_run) - + # List simulations list_parser = subparsers.add_parser("list", help="List all simulations.") list_parser.set_defaults(func=handle_list) - + # Get simulation details details_parser = subparsers.add_parser("details", help="Get details of a simulation.") details_parser.add_argument("--id", required=True, help="Simulation ID.") details_parser.set_defaults(func=handle_simulation_details) - + # Cooling CDU cdu_parser = subparsers.add_parser("cooling-cdu", help="Get cooling CDU data for a simulation.") cdu_parser.add_argument("--id", required=True, help="Simulation ID.") cdu_parser.set_defaults(func=handle_cooling_cdu) - + # Cooling CEP cep_parser = subparsers.add_parser("cooling-cep", help="Get cooling CEP data for a simulation.") cep_parser.add_argument("--id", required=True, help="Simulation ID.") cep_parser.set_defaults(func=handle_cooling_cep) - + # Scheduler jobs jobs_parser = subparsers.add_parser("scheduler-jobs", help="Get scheduler jobs for a 
simulation.") jobs_parser.add_argument("--id", required=True, help="Simulation ID.") jobs_parser.set_defaults(func=handle_scheduler_jobs) - + # Power history power_parser = subparsers.add_parser("power-history", help="Get power history for a specific job in a simulation.") power_parser.add_argument("--id", required=True, help="Simulation ID.") power_parser.add_argument("--job-id", required=True, help="Job ID.") power_parser.set_defaults(func=handle_power_history) - + # Scheduler system scheduler_parser = subparsers.add_parser("scheduler-system", help="Get scheduler system data for a simulation.") scheduler_parser.add_argument("--id", required=True, help="Simulation ID.") scheduler_parser.set_defaults(func=handle_scheduler_system) - + # System info system_info_parser = subparsers.add_parser("system-info", help="Get system information.") system_info_parser.add_argument("--system", required=True, help="System name.") system_info_parser.set_defaults(func=handle_system_info) - - # Parse and execute + + return parser + +def main(): + parser = build_parser() args = parser.parse_args() if args.command: args.func(args) diff --git a/args.py b/args.py deleted file mode 100644 index 8cfcd77f4d218958651bc76c52af5195fd99bc01..0000000000000000000000000000000000000000 --- a/args.py +++ /dev/null @@ -1,44 +0,0 @@ -import argparse -from raps.schedulers.default import PolicyType - -parser = argparse.ArgumentParser(description='Resource Allocator & Power Simulator (RAPS)') -parser.add_argument('-c', '--cooling', action='store_true', help='Include FMU cooling model') -parser.add_argument('--start', type=str, help='ISO8061 string for start of simulation') -parser.add_argument('--end', type=str, help='ISO8061 string for end of simulation') -parser.add_argument('-d', '--debug', action='store_true', help='Enable debug mode and disable rich layout') -parser.add_argument('-e', '--encrypt', action='store_true', help='Encrypt any sensitive data in telemetry') -parser.add_argument('-n', '--numjobs', type=int, default=1000, help='Number of jobs to schedule') -parser.add_argument('-t', '--time', type=str, default=None, help='Length of time to simulate, e.g., 123, 123s, 27m, 3h, 7d') -parser.add_argument('-ff', '--fastforward', type=str, default=None, help='Fast-forward by time amount (uses same units as -t)') -parser.add_argument('-v', '--verbose', action='store_true', help='Enable verbose output') -parser.add_argument('--seed', action='store_true', help='Set random number seed for deterministic simulation') -parser.add_argument('-f', '--replay', nargs='+', type=str, help='Either: path/to/joblive path/to/jobprofile' + \ - ' -or- filename.npz (overrides --workload option)') -choices = ['poisson', 'submit-time'] -parser.add_argument('--reschedule', type=str, choices=choices, help='Reschedule the telemetry workload') -parser.add_argument('-u', '--uncertainties', action='store_true', - help='Change from floating point units to floating point units with uncertainties.' 
+ \ - ' Very expensive w.r.t simulation time!') -parser.add_argument('--jid', type=str, default='*', help='Replay job id') -parser.add_argument('--validate', action='store_true', help='Use node power instead of CPU/GPU utilizations') -parser.add_argument('-o', '--output', action='store_true', help='Output power, cooling, and loss models for later analysis') -parser.add_argument('-p', '--plot', nargs='+', choices=['power', 'loss', 'pue', 'temp', 'util'], - help='Specify one or more types of plots to generate: power, loss, pue, util, temp') -choices = ['png', 'svg', 'jpg', 'pdf', 'eps'] -parser.add_argument('--imtype', type=str, choices=choices, default=choices[0], help='Plot image type') -parser.add_argument('--scale', type=int, default=0, help='Scale telemetry to max nodes specified in order to run telemetry on a smaller smaller target system/partition, e.g., --scale 192') -parser.add_argument('--system', type=str, default='frontier', help='System config to use') -choices = ['default', 'nrel', 'anl', 'flux'] -parser.add_argument('--scheduler', type=str, choices=choices, default=choices[0], help='Name of scheduler') -choices = [policy.value for policy in PolicyType] -parser.add_argument('--policy', type=str, choices=choices, default=choices[0], help='Schedule policy to use') -choices = ['random', 'benchmark', 'peak', 'idle'] -parser.add_argument('-w', '--workload', type=str, choices=choices, default=choices[0], help='Type of synthetic workload') -choices = ['layout1', 'layout2'] -parser.add_argument('-x', '--partitions', nargs='+', default=None, help='List of machine configurations to use, e.g., -x setonix-cpu setonix-gpu') -parser.add_argument('--layout', type=str, choices=choices, default=choices[0], help='Layout of UI') -parser.add_argument('--accounts-json', type=str, help='Json of account stats generated in previous run. 
see raps/accounts.py') - -args = parser.parse_args() -args_dict = vars(args) -print(args_dict) diff --git a/config/40frontiers.yaml b/config/40frontiers.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cc2783e81357ff894fdb20f828c905fdf32e1475 --- /dev/null +++ b/config/40frontiers.yaml @@ -0,0 +1,60 @@ +system: + num_cdus: 1000 + racks_per_cdu: 3 + nodes_per_rack: 128 + chassis_per_rack: 8 + nodes_per_blade: 2 + switches_per_chassis: 4 + nics_per_node: 4 + rectifiers_per_chassis: 4 + nodes_per_rectifier: 4 + missing_racks: + - 41 + down_nodes: [] + cpus_per_node: 1 + gpus_per_node: 4 + cpu_peak_flops: 2048000000000.0 + gpu_peak_flops: 52000000000000.0 + cpu_fp_ratio: 0.667 + gpu_fp_ratio: 0.667 +power: + power_gpu_idle: 88 + power_gpu_max: 560 + power_cpu_idle: 90 + power_cpu_max: 280 + power_mem: 74.26 + power_nic: 20 + power_nvme: 30 + power_switch: 250 + power_cdu: 8473.47 + power_update_freq: 15 + rectifier_peak_threshold: 13670 + sivoc_loss_constant: 13 + sivoc_efficiency: 0.98 + rectifier_loss_constant: 17 + rectifier_efficiency: 0.96 + power_cost: 0.094 +scheduler: + job_arrival_time: 1 + mtbf: 11 + trace_quanta: 15 + min_wall_time: 3600 + max_wall_time: 43200 + ui_update_freq: 900 + max_nodes_per_job: 9000 + job_end_probs: + COMPLETED: 0.63 + FAILED: 0.13 + CANCELLED: 0.12 + TIMEOUT: 0.11 + NODE_FAIL: 0.01 +uq: + power_gpu_uncertainty: 0.05 + power_cpu_uncertainty: 0.05 + power_mem_uncertainty: 0.05 + power_nic_uncertainty: 0.05 + power_nvme_uncertainty: 0.05 + power_cdus_uncertainty: 0.05 + power_node_uncertainty: 0.002 + power_switch_uncertainty: 0.05 + rectifier_power_uncertainty: 0.05 diff --git a/config/OCIZettascale10.yaml b/config/OCIZettascale10.yaml new file mode 100644 index 0000000000000000000000000000000000000000..74971642fcef8d868dcbfc8a66914ad47a71f43d --- /dev/null +++ b/config/OCIZettascale10.yaml @@ -0,0 +1,59 @@ +system: + num_cdus: 2778 # 800,000 Vera Rubin total + racks_per_cdu: 3 + nodes_per_rack: 72 # 600kW # like NV72 + chassis_per_rack: 1 + nodes_per_blade: 1 + switches_per_chassis: 72 # Chassis concept is Cray => NV72? + nics_per_node: 1 # Most likely 4 + rectifiers_per_chassis: 1 # power / losses will be set to zero as this is unknown + nodes_per_rectifier: 1 # power / losses will be set to zero as this is unknown + #missing_racks: + down_nodes: [] + cpus_per_node: 1 + gpus_per_node: 4 # Chiplets 4? 
+ cpu_peak_flops: 2048000000000.0 # Insignificant + gpu_peak_flops: 15000000000000000000.0 # 15EFlops/s FP4 + cpu_fp_ratio: 0.667 + gpu_fp_ratio: 0.667 +power: + power_gpu_idle: 200 # 200 == 4* 50 + power_gpu_max: 2200 # 2kW per node = 4*525 + power_cpu_idle: 90 + power_cpu_max: 280 + power_mem: 74.26 + power_nic: 20 + power_nvme: 30 + power_switch: 250 + power_cdu: 8473.47 + power_update_freq: 15 + rectifier_peak_threshold: 13670 + sivoc_loss_constant: 0 + sivoc_efficiency: 1.00 + rectifier_loss_constant: 0 + rectifier_efficiency: 1.00 + power_cost: 0.094 +scheduler: + job_arrival_time: 1 + mtbf: 11 + trace_quanta: 15 + min_wall_time: 3600 + max_wall_time: 43200 + ui_update_freq: 900 + max_nodes_per_job: 9000 + job_end_probs: + COMPLETED: 0.63 + FAILED: 0.13 + CANCELLED: 0.12 + TIMEOUT: 0.11 + NODE_FAIL: 0.01 +uq: + power_gpu_uncertainty: 0.05 + power_cpu_uncertainty: 0.05 + power_mem_uncertainty: 0.05 + power_nic_uncertainty: 0.05 + power_nvme_uncertainty: 0.05 + power_cdus_uncertainty: 0.05 + power_node_uncertainty: 0.002 + power_switch_uncertainty: 0.05 + rectifier_power_uncertainty: 0.05 diff --git a/config/adastraMI250.yaml b/config/adastraMI250.yaml new file mode 100644 index 0000000000000000000000000000000000000000..88f68e9bca9b30ed2311321fcaf9cc2009589d00 --- /dev/null +++ b/config/adastraMI250.yaml @@ -0,0 +1,110 @@ +system: + num_cdus: 1 + racks_per_cdu: 3 + nodes_per_rack: 128 + chassis_per_rack: 8 + nodes_per_blade: 2 + switches_per_chassis: 4 + nics_per_node: 4 + rectifiers_per_chassis: 4 + nodes_per_rectifier: 4 + missing_racks: [] + down_nodes: + - 356 + - 357 + - 358 + - 359 + - 360 + - 361 + - 362 + - 363 + - 364 + - 365 + - 366 + - 367 + - 368 + - 369 + - 370 + - 371 + - 372 + - 373 + - 374 + - 375 + - 376 + - 377 + - 378 + - 379 + - 380 + - 381 + - 382 + - 383 + cpus_per_node: 1 + gpus_per_node: 8 + cpu_peak_flops: 2048000000000.0 + gpu_peak_flops: 21120000000000.0 + cpu_fp_ratio: 0.667 + gpu_fp_ratio: 0.667 +power: + power_gpu_idle: 44 + power_gpu_max: 238 + power_cpu_idle: 90 + power_cpu_max: 280 + power_mem: 37.13 + power_nic: 20 + power_nvme: 0 + power_switch: 250 + power_cdu: 8473.47 + power_update_freq: 15 + rectifier_peak_threshold: 13670 + sivoc_loss_constant: 13 + sivoc_efficiency: 0.98 + rectifier_loss_constant: 17 + rectifier_efficiency: 0.96 + power_cost: 0.094 +scheduler: + job_arrival_time: 900 + mtbf: 11 + trace_quanta: 20 + min_wall_time: 60 + max_wall_time: 43200 + ui_update_freq: 900 + max_nodes_per_job: 324 + job_end_probs: + COMPLETED: 0.63 + FAILED: 0.13 + CANCELLED: 0.12 + TIMEOUT: 0.11 + NODE_FAIL: 0.01 +uq: + power_gpu_uncertainty: 0.05 + power_cpu_uncertainty: 0.05 + power_mem_uncertainty: 0.05 + power_nic_uncertainty: 0.05 + power_nvme_uncertainty: 0.05 + power_cdus_uncertainty: 0.05 + power_node_uncertainty: 0.002 + power_switch_uncertainty: 0.05 + rectifier_power_uncertainty: 0.05 +cooling: + cooling_efficiency: 0.945 + wet_bulb_temp: 290.0 + zip_code: '37831' + country_code: US + fmu_path: "../models/fmu-models/Frontier/Simulator_olcf5_base.fmu" + fmu_column_mapping: + T_sec_r_C: "Rack Return Temperature (°C)" + T_sec_s_C: "Rack Supply Temperature (°C)" + p_sec_r_psig: "Rack Supply Pressure (psig)" + p_sec_s_psig: "Rack Return Pressure (psig)" + V_flow_sec_GPM: "Rack Flowrate (gpm)" + T_prim_r_C: "Facility Return Temperature (°C)" + T_prim_s_C: "Facility Supply Temperature (°C)" + p_prim_s_psig: "Facility Supply Pressure (psig)" + p_prim_r_psig: "Facility Return Pressure (psig)" + V_flow_prim_GPM: "Facility Flowrate (gpm)" + W_flow_CDUP_kW: 
"Work Done By CDUP (kW)" + temperature_keys: + - simulator_1_centralEnergyPlant_1_coolingTowerLoop_1_sources_Towb + w_htwps_key: "simulator[1].centralEnergyPlant[1].hotWaterLoop[1].summary.W_flow_HTWP_kW" + w_ctwps_key: "simulator[1].centralEnergyPlant[1].coolingTowerLoop[1].summary.W_flow_CTWP_kW" + w_cts_key: "simulator[1].centralEnergyPlant[1].coolingTowerLoop[1].summary.W_flow_CT_kW" diff --git a/config/adastraMI250/cooling.json b/config/adastraMI250/cooling.json deleted file mode 100644 index 778a56d1dbe2edeb446f95631d6a52d8279a8370..0000000000000000000000000000000000000000 --- a/config/adastraMI250/cooling.json +++ /dev/null @@ -1,25 +0,0 @@ -{ - "COOLING_EFFICIENCY": 0.945, - "WET_BULB_TEMP": 290.0, - "ZIP_CODE": 37831, - "COUNTRY_CODE": "US", - "FMU_PATH": "models/Simulator_olcf5_base.fmu", - "FMU_COLUMN_MAPPING": { - "T_sec_r_C": "Rack Return Temperature (\u00b0C)", - "T_sec_s_C": "Rack Supply Temperature (\u00b0C)", - "p_sec_r_psig": "Rack Supply Pressure (psig)", - "p_sec_s_psig": "Rack Return Pressure (psig)", - "V_flow_sec_GPM": "Rack Flowrate (gpm)", - "T_prim_r_C": "Facility Return Temperature (\u00b0C)", - "T_prim_s_C": "Facility Supply Temperature (\u00b0C)", - "p_prim_s_psig": "Facility Supply Pressure (psig)", - "p_prim_r_psig": "Facility Return Pressure (psig)", - "V_flow_prim_GPM": "Facility Flowrate (gpm)", - "W_flow_CDUP_kW": "Work Done By CDUP (kW)" - }, - "TEMPERATURE_KEY": "simulator_1_centralEnergyPlant_1_coolingTowerLoop_1_sources_Towb", - "W_HTWPs_KEY": "simulator[1].centralEnergyPlant[1].hotWaterLoop[1].summary.W_flow_HTWP_kW", - "W_CTWPs_KEY": "simulator[1].centralEnergyPlant[1].coolingTowerLoop[1].summary.W_flow_CTWP_kW", - "W_CTs_KEY": "simulator[1].centralEnergyPlant[1].coolingTowerLoop[1].summary.W_flow_CT_kW" - -} diff --git a/config/adastraMI250/power.json b/config/adastraMI250/power.json deleted file mode 100644 index 77d8c692833809e6918d11227f6a0503d60d2fe3..0000000000000000000000000000000000000000 --- a/config/adastraMI250/power.json +++ /dev/null @@ -1,18 +0,0 @@ -{ - "POWER_GPU_IDLE": 44, - "POWER_GPU_MAX": 238, - "POWER_CPU_IDLE": 90, - "POWER_CPU_MAX": 280, - "POWER_MEM": 37.13, - "POWER_NIC": 20, - "POWER_NVME": 0, - "POWER_SWITCH": 250, - "POWER_CDU": 8473.47, - "POWER_UPDATE_FREQ": 15, - "RECTIFIER_PEAK_THRESHOLD": 13670, - "SIVOC_LOSS_CONSTANT": 13, - "SIVOC_EFFICIENCY": 0.98, - "RECTIFIER_LOSS_CONSTANT": 17, - "RECTIFIER_EFFICIENCY": 0.96, - "POWER_COST": 0.094 -} diff --git a/config/adastraMI250/scheduler.json b/config/adastraMI250/scheduler.json deleted file mode 100644 index 3ae6644bccbf58be5c5d24a221dad466fadb5eb8..0000000000000000000000000000000000000000 --- a/config/adastraMI250/scheduler.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "SEED": 42, - "JOB_ARRIVAL_TIME": 900, - "MTBF": 11, - "TRACE_QUANTA": 20, - "MIN_WALL_TIME": 60, - "MAX_WALL_TIME": 43200, - "UI_UPDATE_FREQ": 900, - "MAX_NODES_PER_JOB": 324, - "JOB_END_PROBS": { - "COMPLETED": 0.63, - "FAILED": 0.13, - "CANCELLED": 0.12, - "TIMEOUT": 0.11, - "NODE_FAIL": 0.01 - } -} diff --git a/config/adastraMI250/system.json b/config/adastraMI250/system.json deleted file mode 100644 index 36a689e0d8da3b019652882002c432051c9272f6..0000000000000000000000000000000000000000 --- a/config/adastraMI250/system.json +++ /dev/null @@ -1,20 +0,0 @@ -{ - "NUM_CDUS": 1, - "RACKS_PER_CDU": 3, - "NODES_PER_RACK": 128, - "RECTIFIERS_PER_RACK": 32, - "CHASSIS_PER_RACK": 8, - "NODES_PER_BLADE": 2, - "SWITCHES_PER_CHASSIS": 4, - "NICS_PER_NODE": 4, - "RECTIFIERS_PER_CHASSIS": 4, - "NODES_PER_RECTIFIER": 4, - 
"MISSING_RACKS": [], - "DOWN_NODES": [356,357,358,359,360,361,362,363,364,365,366,367,368,369,370,371,372,373,374,375,376,377,378,379,380,381,382,383], - "CPUS_PER_NODE": 1, - "GPUS_PER_NODE": 8, - "CPU_PEAK_FLOPS": 2048E9, - "GPU_PEAK_FLOPS": 21.120000E12, - "CPU_FP_RATIO": 0.667, - "GPU_FP_RATIO": 0.667 -} diff --git a/config/adastraMI250/uq.json b/config/adastraMI250/uq.json deleted file mode 100644 index 7359bc2f7370232665e391a184f096108691288c..0000000000000000000000000000000000000000 --- a/config/adastraMI250/uq.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "POWER_GPU_UNCERTAINTY": 0.05 , - "POWER_CPU_UNCERTAINTY": 0.05 , - "POWER_MEM_UNCERTAINTY": 0.05 , - "POWER_NIC_UNCERTAINTY": 0.05 , - "POWER_NVME_UNCERTAINTY": 0.05 , - "POWER_CDUS_UNCERTAINTY": 0.05 , - "POWER_NODE_UNCERTAINTY": 0.002, - "POWER_SWITCH_UNCERTAINTY": 0.05 , - "RECTIFIER_POWER_UNCERTAINTY": 0.05 -} diff --git a/config/bluewaters.yaml b/config/bluewaters.yaml new file mode 100644 index 0000000000000000000000000000000000000000..989ca78377a58a39c10e1074d0621471f40b50c5 --- /dev/null +++ b/config/bluewaters.yaml @@ -0,0 +1,63 @@ +system: + num_cdus: 36 + racks_per_cdu: 6 + nodes_per_rack: 128 + chassis_per_rack: 8 + nodes_per_blade: 2 + switches_per_chassis: 4 + nics_per_node: 4 + rectifiers_per_chassis: 4 + nodes_per_rectifier: 4 + missing_racks: [] + down_nodes: [] + cpus_per_node: 2 + gpus_per_node: 0 + cpu_peak_flops: 264960000000.0 + gpu_peak_flops: 0 + cpu_fp_ratio: 0.667 + gpu_fp_ratio: 0 +power: + power_gpu_idle: 0 + power_gpu_max: 0 + power_cpu_idle: 38 + power_cpu_max: 95 + power_mem: 74.26 + power_nic: 20 + power_nvme: 30 + power_switch: 250 + power_cdu: 8473.47 + power_update_freq: 15 + rectifier_peak_threshold: 13670 + sivoc_loss_constant: 13 + sivoc_efficiency: 0.98 + rectifier_loss_constant: 17 + rectifier_efficiency: 0.96 + power_cost: 0.094 +scheduler: + job_arrival_time: 100 + mtbf: 11 + trace_quanta: 60 + min_wall_time: 60 + max_wall_time: 43200 + ui_update_freq: 900 + max_nodes_per_job: 26884 + job_end_probs: + COMPLETED: 0.63 + FAILED: 0.13 + CANCELLED: 0.12 + TIMEOUT: 0.11 + NODE_FAIL: 0.01 +network: + topology: torus3d + #topology: capacity + #network_max_bw: 9.6E9 + network_max_bw: 1E7 + torus_x: 24 + torus_y: 24 + torus_z: 24 + torus_wrap: true + hosts_per_router: 2 + torus_link_bw: 9600000000.0 + latency_per_hop: 1.0e-06 + torus_routing: DOR_XYZ + node_coords_csv: null diff --git a/config/frontier.yaml b/config/frontier.yaml new file mode 100644 index 0000000000000000000000000000000000000000..84891c1179ccb90e8032bf749717eefe1f4c63ef --- /dev/null +++ b/config/frontier.yaml @@ -0,0 +1,83 @@ +system: + num_cdus: 25 + racks_per_cdu: 3 + nodes_per_rack: 128 + chassis_per_rack: 8 + nodes_per_blade: 2 + switches_per_chassis: 4 + nics_per_node: 4 + rectifiers_per_chassis: 4 + nodes_per_rectifier: 4 + missing_racks: + - 41 + down_nodes: [] + cpus_per_node: 1 + gpus_per_node: 4 + cpu_peak_flops: 2048000000000.0 + gpu_peak_flops: 52000000000000.0 + cpu_fp_ratio: 0.667 + gpu_fp_ratio: 0.667 +power: + power_gpu_idle: 88 + power_gpu_max: 560 + power_cpu_idle: 90 + power_cpu_max: 280 + power_mem: 74.26 + power_nic: 20 + power_nvme: 30 + power_switch: 250 + power_cdu: 8473.47 + power_update_freq: 15 + rectifier_peak_threshold: 13670 + sivoc_loss_constant: 13 + sivoc_efficiency: 0.98 + rectifier_loss_constant: 17 + rectifier_efficiency: 0.96 + power_cost: 0.094 +scheduler: + job_arrival_time: 100 + mtbf: 11 + trace_quanta: 15 + min_wall_time: 60 + max_wall_time: 43200 + ui_update_freq: 900 + max_nodes_per_job: 
3000 + job_end_probs: + COMPLETED: 0.63 + FAILED: 0.13 + CANCELLED: 0.12 + TIMEOUT: 0.11 + NODE_FAIL: 0.01 +uq: + power_gpu_uncertainty: 0.05 + power_cpu_uncertainty: 0.05 + power_mem_uncertainty: 0.05 + power_nic_uncertainty: 0.05 + power_nvme_uncertainty: 0.05 + power_cdus_uncertainty: 0.05 + power_node_uncertainty: 0.002 + power_switch_uncertainty: 0.05 + rectifier_power_uncertainty: 0.05 +cooling: + cooling_efficiency: 0.945 + wet_bulb_temp: 290.0 + zip_code: '37831' + country_code: US + fmu_path: "../models/fmu-models/Frontier/Simulator_olcf5_base.fmu" + fmu_column_mapping: + T_sec_r_C: "Rack Return Temperature (°C)" + T_sec_s_C: "Rack Supply Temperature (°C)" + p_sec_r_psig: "Rack Supply Pressure (psig)" + p_sec_s_psig: "Rack Return Pressure (psig)" + V_flow_sec_GPM: "Rack Flowrate (gpm)" + T_prim_r_C: "Facility Return Temperature (°C)" + T_prim_s_C: "Facility Supply Temperature (°C)" + p_prim_s_psig: "Facility Supply Pressure (psig)" + p_prim_r_psig: "Facility Return Pressure (psig)" + V_flow_prim_GPM: "Facility Flowrate (gpm)" + W_flow_CDUP_kW: "Work Done By CDUP (kW)" + temperature_keys: + - simulator_1_centralEnergyPlant_1_coolingTowerLoop_1_sources_Towb + w_htwps_key: "simulator[1].centralEnergyPlant[1].hotWaterLoop[1].summary.W_flow_HTWP_kW" + w_ctwps_key: "simulator[1].centralEnergyPlant[1].coolingTowerLoop[1].summary.W_flow_CTWP_kW" + w_cts_key: "simulator[1].centralEnergyPlant[1].coolingTowerLoop[1].summary.W_flow_CT_kW" diff --git a/config/frontier/cooling.json b/config/frontier/cooling.json deleted file mode 100644 index fd734a579981e4a184de2c229e4d5edb76fc4d8e..0000000000000000000000000000000000000000 --- a/config/frontier/cooling.json +++ /dev/null @@ -1,25 +0,0 @@ -{ - "COOLING_EFFICIENCY": 0.945, - "WET_BULB_TEMP": 290.0, - "ZIP_CODE": 37831, - "COUNTRY_CODE": "US", - "FMU_PATH": "models/Simulator_olcf5_base.fmu", - "FMU_COLUMN_MAPPING": { - "T_sec_r_C": "Rack Return Temperature (\u00b0C)", - "T_sec_s_C": "Rack Supply Temperature (\u00b0C)", - "p_sec_r_psig": "Rack Supply Pressure (psig)", - "p_sec_s_psig": "Rack Return Pressure (psig)", - "V_flow_sec_GPM": "Rack Flowrate (gpm)", - "T_prim_r_C": "Facility Return Temperature (\u00b0C)", - "T_prim_s_C": "Facility Supply Temperature (\u00b0C)", - "p_prim_s_psig": "Facility Supply Pressure (psig)", - "p_prim_r_psig": "Facility Return Pressure (psig)", - "V_flow_prim_GPM": "Facility Flowrate (gpm)", - "W_flow_CDUP_kW": "Work Done By CDUP (kW)" - }, - "TEMPERATURE_KEYS": ["simulator_1_centralEnergyPlant_1_coolingTowerLoop_1_sources_Towb"], - "W_HTWPs_KEY": "simulator[1].centralEnergyPlant[1].hotWaterLoop[1].summary.W_flow_HTWP_kW", - "W_CTWPs_KEY": "simulator[1].centralEnergyPlant[1].coolingTowerLoop[1].summary.W_flow_CTWP_kW", - "W_CTs_KEY": "simulator[1].centralEnergyPlant[1].coolingTowerLoop[1].summary.W_flow_CT_kW" - -} diff --git a/config/frontier/power.json b/config/frontier/power.json deleted file mode 100644 index d6ec29e8f2d5e259706643017e61e6a36551bf78..0000000000000000000000000000000000000000 --- a/config/frontier/power.json +++ /dev/null @@ -1,18 +0,0 @@ -{ - "POWER_GPU_IDLE": 88, - "POWER_GPU_MAX": 560, - "POWER_CPU_IDLE": 90, - "POWER_CPU_MAX": 280, - "POWER_MEM": 74.26, - "POWER_NIC": 20, - "POWER_NVME": 30, - "POWER_SWITCH": 250, - "POWER_CDU": 8473.47, - "POWER_UPDATE_FREQ": 15, - "RECTIFIER_PEAK_THRESHOLD": 13670, - "SIVOC_LOSS_CONSTANT": 13, - "SIVOC_EFFICIENCY": 0.98, - "RECTIFIER_LOSS_CONSTANT": 17, - "RECTIFIER_EFFICIENCY": 0.96, - "POWER_COST": 0.094 -} diff --git a/config/frontier/scheduler.json 
b/config/frontier/scheduler.json deleted file mode 100644 index 5caf890b5448efe15057bdd96ecc1420957ac07d..0000000000000000000000000000000000000000 --- a/config/frontier/scheduler.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "SEED": 42, - "JOB_ARRIVAL_TIME": 900, - "MTBF": 11, - "TRACE_QUANTA": 15, - "MIN_WALL_TIME": 3600, - "MAX_WALL_TIME": 43200, - "UI_UPDATE_FREQ": 900, - "MAX_NODES_PER_JOB": 3000, - "JOB_END_PROBS": { - "COMPLETED": 0.63, - "FAILED": 0.13, - "CANCELLED": 0.12, - "TIMEOUT": 0.11, - "NODE_FAIL": 0.01 - } -} diff --git a/config/frontier/system.json b/config/frontier/system.json deleted file mode 100644 index b1b9d7686aca7197ce1f5840d61c6b392ee201fc..0000000000000000000000000000000000000000 --- a/config/frontier/system.json +++ /dev/null @@ -1,20 +0,0 @@ -{ - "NUM_CDUS": 25, - "RACKS_PER_CDU": 3, - "NODES_PER_RACK": 128, - "RECTIFIERS_PER_RACK": 32, - "CHASSIS_PER_RACK": 8, - "NODES_PER_BLADE": 2, - "SWITCHES_PER_CHASSIS": 4, - "NICS_PER_NODE": 4, - "RECTIFIERS_PER_CHASSIS": 4, - "NODES_PER_RECTIFIER": 4, - "MISSING_RACKS": [41], - "DOWN_NODES": [], - "CPUS_PER_NODE": 1, - "GPUS_PER_NODE": 4, - "CPU_PEAK_FLOPS": 2048E9, - "GPU_PEAK_FLOPS": 52E12, - "CPU_FP_RATIO": 0.667, - "GPU_FP_RATIO": 0.667 -} diff --git a/config/frontier/uq.json b/config/frontier/uq.json deleted file mode 100644 index 7359bc2f7370232665e391a184f096108691288c..0000000000000000000000000000000000000000 --- a/config/frontier/uq.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "POWER_GPU_UNCERTAINTY": 0.05 , - "POWER_CPU_UNCERTAINTY": 0.05 , - "POWER_MEM_UNCERTAINTY": 0.05 , - "POWER_NIC_UNCERTAINTY": 0.05 , - "POWER_NVME_UNCERTAINTY": 0.05 , - "POWER_CDUS_UNCERTAINTY": 0.05 , - "POWER_NODE_UNCERTAINTY": 0.002, - "POWER_SWITCH_UNCERTAINTY": 0.05 , - "RECTIFIER_POWER_UNCERTAINTY": 0.05 -} diff --git a/config/fugaku.yaml b/config/fugaku.yaml new file mode 100644 index 0000000000000000000000000000000000000000..afc82bfa613997c9b3334b9b16f1d2f8cea533bb --- /dev/null +++ b/config/fugaku.yaml @@ -0,0 +1,49 @@ +system: + num_cdus: 24 + racks_per_cdu: 18 + nodes_per_rack: 368 + chassis_per_rack: 8 + nodes_per_blade: 1 + switches_per_chassis: 2 + nics_per_node: 1 + rectifiers_per_chassis: 1 + nodes_per_rectifier: 48 + missing_racks: [] + down_nodes: [] + cpus_per_node: 1 + gpus_per_node: 0 + cpu_peak_flops: 3379000000000.0 + gpu_peak_flops: 0 + cpu_fp_ratio: 0.82 + gpu_fp_ratio: 0.0 +power: + power_gpu_idle: 0 + power_gpu_max: 0 + power_cpu_idle: 30 + power_cpu_max: 150 + power_mem: 10 + power_nic: 0 + power_nvme: 0 + power_switch: 0 + power_cdu: 0 + power_update_freq: 10 + rectifier_peak_threshold: 0 + sivoc_loss_constant: 13 + sivoc_efficiency: 0.98 + rectifier_loss_constant: 17 + rectifier_efficiency: 0.96 + power_cost: 0.094 +scheduler: + job_arrival_time: 10 + mtbf: 11 + trace_quanta: 10 + min_wall_time: 3600 + max_wall_time: 43200 + ui_update_freq: 3600 + max_nodes_per_job: 3000 + job_end_probs: + COMPLETED: 0.63 + FAILED: 0.13 + CANCELLED: 0.12 + TIMEOUT: 0.11 + NODE_FAIL: 0.01 diff --git a/config/fugaku/power.json b/config/fugaku/power.json deleted file mode 100644 index 759a0c4270a7283133e9eaaaf5275af7bc57b1cc..0000000000000000000000000000000000000000 --- a/config/fugaku/power.json +++ /dev/null @@ -1,18 +0,0 @@ -{ - "POWER_GPU_IDLE": 0, - "POWER_GPU_MAX": 0, - "POWER_CPU_IDLE": 30, - "POWER_CPU_MAX": 150, - "POWER_MEM": 10, - "POWER_NIC": 0, - "POWER_NVME": 0, - "POWER_SWITCH": 0, - "POWER_CDU": 0, - "POWER_UPDATE_FREQ": 10, - "RECTIFIER_PEAK_THRESHOLD": 0, - "SIVOC_LOSS_CONSTANT": 13, - "SIVOC_EFFICIENCY": 0.98, - 
"RECTIFIER_LOSS_CONSTANT": 17, - "RECTIFIER_EFFICIENCY": 0.96, - "POWER_COST": 0.094 -} diff --git a/config/fugaku/scheduler.json b/config/fugaku/scheduler.json deleted file mode 100644 index 94cde88071c2f4884281348812b2986e74e023f0..0000000000000000000000000000000000000000 --- a/config/fugaku/scheduler.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "SEED": 42, - "JOB_ARRIVAL_TIME": 10, - "MTBF": 11, - "TRACE_QUANTA": 10, - "MIN_WALL_TIME": 3600, - "MAX_WALL_TIME": 43200, - "UI_UPDATE_FREQ": 3600, - "MAX_NODES_PER_JOB": 3000, - "JOB_END_PROBS": { - "COMPLETED": 0.63, - "FAILED": 0.13, - "CANCELLED": 0.12, - "TIMEOUT": 0.11, - "NODE_FAIL": 0.01 - } -} diff --git a/config/fugaku/system.json b/config/fugaku/system.json deleted file mode 100644 index 6a0e63aceb82bb728536ba8dd7c1f4d378c33de3..0000000000000000000000000000000000000000 --- a/config/fugaku/system.json +++ /dev/null @@ -1,20 +0,0 @@ -{ - "NUM_CDUS": 24, - "RACKS_PER_CDU": 18, - "NODES_PER_RACK": 368, - "RECTIFIERS_PER_RACK": 8, - "CHASSIS_PER_RACK": 8, - "NODES_PER_BLADE": 1, - "SWITCHES_PER_CHASSIS": 2, - "NICS_PER_NODE": 1, - "RECTIFIERS_PER_CHASSIS": 1, - "NODES_PER_RECTIFIER": 48, - "MISSING_RACKS": [], - "DOWN_NODES": [], - "CPUS_PER_NODE": 1, - "GPUS_PER_NODE": 0, - "CPU_PEAK_FLOPS": 3.379E12, - "GPU_PEAK_FLOPS": 0, - "CPU_FP_RATIO": 0.82, - "GPU_FP_RATIO": 0.0 -} diff --git a/config/gcloudv2.yaml b/config/gcloudv2.yaml new file mode 100644 index 0000000000000000000000000000000000000000..53cd99930b53720a1f29e2fda0c2ad5cea3aa27a --- /dev/null +++ b/config/gcloudv2.yaml @@ -0,0 +1,49 @@ +system: + num_cdus: 125 + racks_per_cdu: 1 + nodes_per_rack: 100 + chassis_per_rack: 1 + nodes_per_blade: 2 + switches_per_chassis: 4 + nics_per_node: 4 + rectifiers_per_chassis: 4 + nodes_per_rectifier: 4 + missing_racks: [] + down_nodes: [] + cpus_per_node: 1 + gpus_per_node: 0 + cpu_peak_flops: 2048000000000.0 + gpu_peak_flops: 0 + cpu_fp_ratio: 0.667 + gpu_fp_ratio: 0.667 +power: + power_gpu_idle: 88 + power_gpu_max: 560 + power_cpu_idle: 90 + power_cpu_max: 280 + power_mem: 74.26 + power_nic: 20 + power_nvme: 30 + power_switch: 250 + power_cdu: 8473.47 + power_update_freq: 15 + rectifier_peak_threshold: 13670 + sivoc_loss_constant: 13 + sivoc_efficiency: 0.98 + rectifier_loss_constant: 17 + rectifier_efficiency: 0.96 + power_cost: 0.094 +scheduler: + job_arrival_time: 100 + mtbf: 11 + trace_quanta: 15 + min_wall_time: 3600 + max_wall_time: 43200 + ui_update_freq: 900 + max_nodes_per_job: 3000 + job_end_probs: + COMPLETED: 0.63 + FAILED: 0.13 + CANCELLED: 0.12 + TIMEOUT: 0.11 + NODE_FAIL: 0.01 diff --git a/config/kestrel.yaml b/config/kestrel.yaml new file mode 100644 index 0000000000000000000000000000000000000000..15e3ece77df39b2df3c4b379d4e4ec7d1d4527aa --- /dev/null +++ b/config/kestrel.yaml @@ -0,0 +1,53 @@ +system: + num_cdus: 6 + racks_per_cdu: 6 + nodes_per_rack: 80 + rectifiers_per_rack: 6 + chassis_per_rack: 1 + nodes_per_blade: 1 + switches_per_chassis: 5 + nics_per_node: 2 + rectifiers_per_chassis: 5 + nodes_per_rectifier: 4 + missing_racks: [] + down_nodes: [] + cpus_per_node: 1 + gpus_per_node: 4 + cpu_peak_flops: 396800000000.0 + gpu_peak_flops: 7800000000000.0 + cpu_fp_ratio: 0.69 + gpu_fp_ratio: 0.69 + +power: + power_gpu_idle: 75 + power_gpu_max: 300 + power_cpu_idle: 100 + power_cpu_max: 800 + power_mem: 74.26 + power_nic: 21 + power_nvme: 45 + power_switch: 250 + power_cdu: 0 + power_update_freq: 20 + rectifier_peak_threshold: 13670 + sivoc_loss_constant: 0 + sivoc_efficiency: 1 + rectifier_loss_constant: 0 + 
rectifier_efficiency: 1 + power_cost: 0.094 + +scheduler: + seed: 42 + job_arrival_time: 20 + mtbf: 11 + trace_quanta: 20 + min_wall_time: 3600 + max_wall_time: 43200 + ui_update_freq: 3600 + max_nodes_per_job: 3000 + job_end_probs: + COMPLETED: 0.63 + FAILED: 0.13 + CANCELLED: 0.12 + TIMEOUT: 0.11 + NODE_FAIL: 0.01 \ No newline at end of file diff --git a/config/lassen.yaml b/config/lassen.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ad636f0d7c89b9309e287f0ea22170cff71a2659 --- /dev/null +++ b/config/lassen.yaml @@ -0,0 +1,134 @@ +system: + num_cdus: 257 + racks_per_cdu: 1 + nodes_per_rack: 18 + chassis_per_rack: 1 + nodes_per_blade: 1 + switches_per_chassis: 5 + nics_per_node: 2 + rectifiers_per_chassis: 5 + nodes_per_rectifier: 4 + missing_racks: + - 44 + down_nodes: [] + cpus_per_node: 2 + threads_per_core: 4 + cpu_frequency: 2400000000 + gpus_per_node: 4 + cpu_peak_flops: 396800000000.0 + gpu_peak_flops: 7800000000000.0 + cpu_fp_ratio: 0.72 + gpu_fp_ratio: 0.72 +power: + power_gpu_idle: 75 + power_gpu_max: 300 + power_cpu_idle: 47.25 + power_cpu_max: 252 + power_mem: 74.26 + power_nic_idle: 10 + power_nic_max: 50 + power_nvme: 45 + power_switch: 250 + power_cdu: 0 + power_update_freq: 20 + rectifier_peak_threshold: 13670 + sivoc_loss_constant: 0 + sivoc_efficiency: 1 + rectifier_loss_constant: 0 + rectifier_efficiency: 1 + power_cost: 0.094 +scheduler: + job_arrival_time: 20 + mtbf: 11 + trace_quanta: 20 + min_wall_time: 3600 + max_wall_time: 43200 + ui_update_freq: 3600 + max_nodes_per_job: 3000 + job_end_probs: + COMPLETED: 0.63 + FAILED: 0.13 + CANCELLED: 0.12 + TIMEOUT: 0.11 + NODE_FAIL: 0.01 +cooling: + cooling_efficiency: 0.945 + wet_bulb_temp: 290.0 + zip_code: '94550' + country_code: US + fmu_path: "../models/POWER9CSM/fmus/lassen.fmu" + fmu_column_mapping: + T_sec_r_C: "Rack Return Temperature (°C)" + T_sec_s_C: "Rack Supply Temperature (°C)" + p_sec_r_psig: "Rack Supply Pressure (psig)" + p_sec_s_psig: "Rack Return Pressure (psig)" + V_flow_sec_GPM: "Rack Flowrate (gpm)" + T_prim_r_C: "Facility Return Temperature (°C)" + T_prim_s_C: "Facility Supply Temperature (°C)" + p_prim_s_psig: "Facility Supply Pressure (psig)" + p_prim_r_psig: "Facility Return Pressure (psig)" + V_flow_prim_GPM: "Facility Flowrate (gpm)" + W_flow_CDUP_kW: "Work Done By CDUP (kW)" + temperature_keys: + - simulator_1_datacenter_1_computeBlock_1_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_2_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_3_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_4_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_5_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_6_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_7_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_8_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_9_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_10_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_11_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_12_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_13_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_14_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_15_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_16_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_17_cabinet_1_sources_T_Air + - 
simulator_1_datacenter_1_computeBlock_18_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_19_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_20_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_21_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_22_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_23_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_24_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_25_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_26_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_27_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_28_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_29_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_30_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_31_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_32_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_33_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_34_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_35_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_36_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_37_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_38_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_39_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_40_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_41_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_42_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_43_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_44_cabinet_1_sources_T_Air + - simulator_1_centralEnergyPlant_1_coolingTowerLoop_1_sources_T_ext + w_htwps_key: "simulator[1].centralEnergyPlant[1].hotWaterLoop[1].summary.W_flow_HTWP_kW" + w_ctwps_key: "simulator[1].centralEnergyPlant[1].coolingTowerLoop[1].summary.W_flow_CTWP_kW" + w_cts_key: "simulator[1].centralEnergyPlant[1].coolingTowerLoop[1].summary.W_flow_CT_kW" +network: + topology: fat-tree + network_max_bw: 12.5e9 + fattree_k: 32 + dragonfly_d: 11 + dragonfly_a: 9 + dragonfly_p: 8 + latency: 1 + torus_x: 17 + torus_y: 17 + torus_z: 8 + torus_wrap: true + hosts_per_router: 2 + torus_routing: DOR_XYZ diff --git a/config/lassen/cooling.json b/config/lassen/cooling.json deleted file mode 100644 index 871ceecc8173417215e87c33b742aaeb0a89242c..0000000000000000000000000000000000000000 --- a/config/lassen/cooling.json +++ /dev/null @@ -1,71 +0,0 @@ -{ - "COOLING_EFFICIENCY": 0.945, - "WET_BULB_TEMP": 290.0, - "ZIP_CODE": 94550, - "COUNTRY_CODE": "US", - "FMU_PATH": "models/POWER9CSM/fmus/lassen.fmu", - "FMU_COLUMN_MAPPING": { - "T_sec_r_C": "Rack Return Temperature (\u00b0C)", - "T_sec_s_C": "Rack Supply Temperature (\u00b0C)", - "p_sec_r_psig": "Rack Supply Pressure (psig)", - "p_sec_s_psig": "Rack Return Pressure (psig)", - "V_flow_sec_GPM": "Rack Flowrate (gpm)", - "T_prim_r_C": "Facility Return Temperature (\u00b0C)", - "T_prim_s_C": "Facility Supply Temperature (\u00b0C)", - "p_prim_s_psig": "Facility Supply Pressure (psig)", - "p_prim_r_psig": "Facility Return Pressure (psig)", - "V_flow_prim_GPM": "Facility Flowrate (gpm)", - "W_flow_CDUP_kW": "Work Done By CDUP (kW)" - }, - "TEMPERATURE_KEYS": [ - "simulator_1_datacenter_1_computeBlock_1_cabinet_1_sources_T_Air", - 
"simulator_1_datacenter_1_computeBlock_2_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_3_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_4_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_5_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_6_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_7_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_8_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_9_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_10_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_11_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_12_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_13_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_14_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_15_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_16_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_17_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_18_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_19_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_20_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_21_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_22_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_23_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_24_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_25_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_26_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_27_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_28_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_29_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_30_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_31_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_32_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_33_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_34_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_35_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_36_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_37_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_38_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_39_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_40_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_41_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_42_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_43_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_44_cabinet_1_sources_T_Air", - "simulator_1_centralEnergyPlant_1_coolingTowerLoop_1_sources_T_ext" - ], - "W_HTWPs_KEY": "simulator[1].centralEnergyPlant[1].hotWaterLoop[1].summary.W_flow_HTWP_kW", - "W_CTWPs_KEY": "simulator[1].centralEnergyPlant[1].coolingTowerLoop[1].summary.W_flow_CTWP_kW", - "W_CTs_KEY": "simulator[1].centralEnergyPlant[1].coolingTowerLoop[1].summary.W_flow_CT_kW" - -} diff --git a/config/lassen/power.json b/config/lassen/power.json deleted file mode 100644 index 5b314b62f7a47b29c4cb6f747bbf05686e09a309..0000000000000000000000000000000000000000 --- 
a/config/lassen/power.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "POWER_GPU_IDLE": 75, - "POWER_GPU_MAX": 300, - "POWER_CPU_IDLE": 47.25, - "POWER_CPU_MAX": 252, - "POWER_MEM": 74.26, - "POWER_NIC_IDLE": 10, - "POWER_NIC_MAX": 50, - "POWER_NVME": 45, - "POWER_SWITCH": 250, - "POWER_CDU": 0, - "POWER_UPDATE_FREQ": 20, - "RECTIFIER_PEAK_THRESHOLD": 13670, - "SIVOC_LOSS_CONSTANT": 0, - "SIVOC_EFFICIENCY": 1, - "RECTIFIER_LOSS_CONSTANT": 0, - "RECTIFIER_EFFICIENCY": 1, - "POWER_COST": 0.094 -} diff --git a/config/lassen/scheduler.json b/config/lassen/scheduler.json deleted file mode 100644 index 709f08014fbaf6136448c85f3e9e6e31f4b83839..0000000000000000000000000000000000000000 --- a/config/lassen/scheduler.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "SEED": 42, - "JOB_ARRIVAL_TIME": 20, - "MTBF": 11, - "TRACE_QUANTA": 20, - "MIN_WALL_TIME": 3600, - "MAX_WALL_TIME": 43200, - "UI_UPDATE_FREQ": 3600, - "MAX_NODES_PER_JOB": 3000, - "JOB_END_PROBS": { - "COMPLETED": 0.63, - "FAILED": 0.13, - "CANCELLED": 0.12, - "TIMEOUT": 0.11, - "NODE_FAIL": 0.01 - } -} diff --git a/config/lassen/system.json b/config/lassen/system.json deleted file mode 100644 index b7434848ae230e5356d6fd50d4b15cb80b9f553c..0000000000000000000000000000000000000000 --- a/config/lassen/system.json +++ /dev/null @@ -1,21 +0,0 @@ -{ - "NUM_CDUS": 257, - "RACKS_PER_CDU": 1, - "NODES_PER_RACK": 18, - "RECTIFIERS_PER_RACK": 5, - "CHASSIS_PER_RACK": 1, - "NODES_PER_BLADE": 1, - "SWITCHES_PER_CHASSIS": 5, - "NICS_PER_NODE": 2, - "RECTIFIERS_PER_CHASSIS": 5, - "NODES_PER_RECTIFIER": 4, - "MISSING_RACKS": [44], - "DOWN_NODES": [], - "CPUS_PER_NODE": 2, - "CORES_PER_CPU": 22, - "GPUS_PER_NODE": 4, - "CPU_PEAK_FLOPS": 396.8E9, - "GPU_PEAK_FLOPS": 7.8E12, - "CPU_FP_RATIO": 0.72, - "GPU_FP_RATIO": 0.72 -} diff --git a/config/lumi/lumi-c.yaml b/config/lumi/lumi-c.yaml new file mode 100644 index 0000000000000000000000000000000000000000..091060baf6bbfb96a25b800d93e891c7c2238078 --- /dev/null +++ b/config/lumi/lumi-c.yaml @@ -0,0 +1,49 @@ +system: + num_cdus: 1 + racks_per_cdu: 2 + nodes_per_rack: 256 + chassis_per_rack: 8 + nodes_per_blade: 4 + switches_per_chassis: 4 + nics_per_node: 4 + rectifiers_per_chassis: 4 + nodes_per_rectifier: 4 + missing_racks: [] + down_nodes: [] + cpus_per_node: 2 + gpus_per_node: 0 + cpu_peak_flops: 2509440000000.0 + gpu_peak_flops: 0 + cpu_fp_ratio: 0.667 + gpu_fp_ratio: 0.667 +power: + power_gpu_idle: 88 + power_gpu_max: 560 + power_cpu_idle: 90 + power_cpu_max: 280 + power_mem: 74.26 + power_nic: 20 + power_nvme: 30 + power_switch: 250 + power_cdu: 8473.47 + power_update_freq: 15 + rectifier_peak_threshold: 13670 + sivoc_loss_constant: 13 + sivoc_efficiency: 0.98 + rectifier_loss_constant: 17 + rectifier_efficiency: 0.96 + power_cost: 0.094 +scheduler: + job_arrival_time: 900 + mtbf: 11 + trace_quanta: 15 + min_wall_time: 60 + max_wall_time: 172800 + ui_update_freq: 900 + max_nodes_per_job: 512 + job_end_probs: + COMPLETED: 0.63 + FAILED: 0.13 + CANCELLED: 0.12 + TIMEOUT: 0.11 + NODE_FAIL: 0.01 diff --git a/config/lumi/lumi-g.yaml b/config/lumi/lumi-g.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7c0f77cd851f76739c30926771db6c368e4eb8cd --- /dev/null +++ b/config/lumi/lumi-g.yaml @@ -0,0 +1,49 @@ +system: + num_cdus: 10 + racks_per_cdu: 3 + nodes_per_rack: 128 + chassis_per_rack: 8 + nodes_per_blade: 2 + switches_per_chassis: 4 + nics_per_node: 4 + rectifiers_per_chassis: 4 + nodes_per_rectifier: 4 + missing_racks: [] + down_nodes: [] + cpus_per_node: 1 + gpus_per_node: 4 + cpu_peak_flops: 
2048000000000.0 + gpu_peak_flops: 52000000000000.0 + cpu_fp_ratio: 0.667 + gpu_fp_ratio: 0.667 +power: + power_gpu_idle: 88 + power_gpu_max: 560 + power_cpu_idle: 90 + power_cpu_max: 280 + power_mem: 74.26 + power_nic: 20 + power_nvme: 30 + power_switch: 250 + power_cdu: 8473.47 + power_update_freq: 15 + rectifier_peak_threshold: 13670 + sivoc_loss_constant: 13 + sivoc_efficiency: 0.98 + rectifier_loss_constant: 17 + rectifier_efficiency: 0.96 + power_cost: 0.094 +scheduler: + job_arrival_time: 900 + mtbf: 11 + trace_quanta: 15 + min_wall_time: 60 + max_wall_time: 172800 + ui_update_freq: 900 + max_nodes_per_job: 1024 + job_end_probs: + COMPLETED: 0.63 + FAILED: 0.13 + CANCELLED: 0.12 + TIMEOUT: 0.11 + NODE_FAIL: 0.01 diff --git a/config/marconi100.yaml b/config/marconi100.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0e66a7e8c20ba117ea4f566aed60a172c43a4310 --- /dev/null +++ b/config/marconi100.yaml @@ -0,0 +1,121 @@ +system: + num_cdus: 49 + racks_per_cdu: 1 + nodes_per_rack: 20 + chassis_per_rack: 1 + nodes_per_blade: 1 + switches_per_chassis: 5 + nics_per_node: 2 + rectifiers_per_chassis: 5 + nodes_per_rectifier: 4 + missing_racks: [] + down_nodes: [] + cpus_per_node: 2 + gpus_per_node: 4 + cpu_peak_flops: 396800000000.0 + gpu_peak_flops: 7800000000000.0 + cpu_fp_ratio: 0.69 + gpu_fp_ratio: 0.69 +power: + power_gpu_idle: 75 + power_gpu_max: 300 + power_cpu_idle: 47.25 + power_cpu_max: 252 + power_mem: 74.26 + power_nic: 21 + power_nvme: 45 + power_switch: 250 + power_cdu: 0 + power_update_freq: 20 + rectifier_peak_threshold: 13670 + sivoc_loss_constant: 0 + sivoc_efficiency: 1 + rectifier_loss_constant: 0 + rectifier_efficiency: 1 + power_cost: 0.094 +scheduler: + job_arrival_time: 20 + mtbf: 11 + trace_quanta: 20 + min_wall_time: 3600 + max_wall_time: 43200 + ui_update_freq: 3600 + max_nodes_per_job: 3000 + job_end_probs: + COMPLETED: 0.63 + FAILED: 0.13 + CANCELLED: 0.12 + TIMEOUT: 0.11 + NODE_FAIL: 0.01 +cooling: + cooling_efficiency: 0.945 + wet_bulb_temp: 290.0 + zip_code: '30170' + country_code: IT + fmu_path: "../models/POWER9CSM/fmus/marconi100.fmu" + fmu_column_mapping: + T_sec_r_C: "Rack Return Temperature (°C)" + T_sec_s_C: "Rack Supply Temperature (°C)" + p_sec_r_psig: "Rack Supply Pressure (psig)" + p_sec_s_psig: "Rack Return Pressure (psig)" + V_flow_sec_GPM: "Rack Flowrate (gpm)" + T_prim_r_C: "Facility Return Temperature (°C)" + T_prim_s_C: "Facility Supply Temperature (°C)" + p_prim_s_psig: "Facility Supply Pressure (psig)" + p_prim_r_psig: "Facility Return Pressure (psig)" + V_flow_prim_GPM: "Facility Flowrate (gpm)" + W_flow_CDUP_kW: "Work Done By CDUP (kW)" + temperature_keys: + - simulator_1_datacenter_1_computeBlock_1_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_2_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_3_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_4_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_5_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_6_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_7_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_8_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_9_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_10_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_11_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_12_cabinet_1_sources_T_Air + - 
simulator_1_datacenter_1_computeBlock_13_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_14_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_15_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_16_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_17_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_18_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_19_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_20_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_21_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_22_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_23_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_24_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_25_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_26_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_27_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_28_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_29_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_30_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_31_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_32_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_33_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_34_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_35_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_36_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_37_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_38_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_39_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_40_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_41_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_42_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_43_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_44_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_45_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_46_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_47_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_48_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_49_cabinet_1_sources_T_Air + - simulator_1_centralEnergyPlant_1_coolingTowerLoop_1_sources_T_ext + w_htwps_key: "simulator[1].centralEnergyPlant[1].hotWaterLoop[1].summary.W_flow_HTWP_kW" + w_ctwps_key: "simulator[1].centralEnergyPlant[1].coolingTowerLoop[1].summary.W_flow_CTWP_kW" + w_cts_key: "simulator[1].centralEnergyPlant[1].coolingTowerLoop[1].summary.W_flow_CT_kW" diff --git a/config/marconi100/cooling.json b/config/marconi100/cooling.json deleted file mode 100644 index cda9d1673868bdbb6a9f9113ae8d9f85bebac47d..0000000000000000000000000000000000000000 --- a/config/marconi100/cooling.json +++ /dev/null @@ -1,76 +0,0 @@ -{ - "COOLING_EFFICIENCY": 0.945, - "WET_BULB_TEMP": 290.0, - "ZIP_CODE": 30170, - "COUNTRY_CODE": "IT", - "FMU_PATH": "models/POWER9CSM/fmus/marconi100.fmu", - "FMU_COLUMN_MAPPING": { - "T_sec_r_C": "Rack Return Temperature (\u00b0C)", - "T_sec_s_C": "Rack Supply Temperature (\u00b0C)", - "p_sec_r_psig": "Rack Supply Pressure (psig)", - "p_sec_s_psig": "Rack Return Pressure (psig)", - 
"V_flow_sec_GPM": "Rack Flowrate (gpm)", - "T_prim_r_C": "Facility Return Temperature (\u00b0C)", - "T_prim_s_C": "Facility Supply Temperature (\u00b0C)", - "p_prim_s_psig": "Facility Supply Pressure (psig)", - "p_prim_r_psig": "Facility Return Pressure (psig)", - "V_flow_prim_GPM": "Facility Flowrate (gpm)", - "W_flow_CDUP_kW": "Work Done By CDUP (kW)" - }, - "TEMPERATURE_KEYS": [ - "simulator_1_datacenter_1_computeBlock_1_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_2_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_3_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_4_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_5_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_6_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_7_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_8_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_9_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_10_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_11_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_12_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_13_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_14_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_15_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_16_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_17_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_18_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_19_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_20_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_21_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_22_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_23_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_24_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_25_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_26_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_27_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_28_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_29_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_30_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_31_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_32_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_33_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_34_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_35_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_36_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_37_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_38_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_39_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_40_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_41_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_42_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_43_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_44_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_45_cabinet_1_sources_T_Air", - 
"simulator_1_datacenter_1_computeBlock_46_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_47_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_48_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_49_cabinet_1_sources_T_Air", - "simulator_1_centralEnergyPlant_1_coolingTowerLoop_1_sources_T_ext" - ], - "W_HTWPs_KEY": "simulator[1].centralEnergyPlant[1].hotWaterLoop[1].summary.W_flow_HTWP_kW", - "W_CTWPs_KEY": "simulator[1].centralEnergyPlant[1].coolingTowerLoop[1].summary.W_flow_CTWP_kW", - "W_CTs_KEY": "simulator[1].centralEnergyPlant[1].coolingTowerLoop[1].summary.W_flow_CT_kW" - -} diff --git a/config/marconi100/power.json b/config/marconi100/power.json deleted file mode 100644 index 9e2302237db8457d85aaf2091953e89f8b0e056a..0000000000000000000000000000000000000000 --- a/config/marconi100/power.json +++ /dev/null @@ -1,18 +0,0 @@ -{ - "POWER_GPU_IDLE": 75, - "POWER_GPU_MAX": 300, - "POWER_CPU_IDLE": 47.25, - "POWER_CPU_MAX": 252, - "POWER_MEM": 74.26, - "POWER_NIC": 21, - "POWER_NVME": 45, - "POWER_SWITCH": 250, - "POWER_CDU": 0, - "POWER_UPDATE_FREQ": 20, - "RECTIFIER_PEAK_THRESHOLD": 13670, - "SIVOC_LOSS_CONSTANT": 0, - "SIVOC_EFFICIENCY": 1, - "RECTIFIER_LOSS_CONSTANT": 0, - "RECTIFIER_EFFICIENCY": 1, - "POWER_COST": 0.094 -} diff --git a/config/marconi100/scheduler.json b/config/marconi100/scheduler.json deleted file mode 100644 index 709f08014fbaf6136448c85f3e9e6e31f4b83839..0000000000000000000000000000000000000000 --- a/config/marconi100/scheduler.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "SEED": 42, - "JOB_ARRIVAL_TIME": 20, - "MTBF": 11, - "TRACE_QUANTA": 20, - "MIN_WALL_TIME": 3600, - "MAX_WALL_TIME": 43200, - "UI_UPDATE_FREQ": 3600, - "MAX_NODES_PER_JOB": 3000, - "JOB_END_PROBS": { - "COMPLETED": 0.63, - "FAILED": 0.13, - "CANCELLED": 0.12, - "TIMEOUT": 0.11, - "NODE_FAIL": 0.01 - } -} diff --git a/config/marconi100/system.json b/config/marconi100/system.json deleted file mode 100644 index 816e802eb06786a7d2fc34c14ecb50dd21e4c7d1..0000000000000000000000000000000000000000 --- a/config/marconi100/system.json +++ /dev/null @@ -1,20 +0,0 @@ -{ - "NUM_CDUS": 49, - "RACKS_PER_CDU": 1, - "NODES_PER_RACK": 20, - "RECTIFIERS_PER_RACK": 5, - "CHASSIS_PER_RACK": 1, - "NODES_PER_BLADE": 1, - "SWITCHES_PER_CHASSIS": 5, - "NICS_PER_NODE": 2, - "RECTIFIERS_PER_CHASSIS": 5, - "NODES_PER_RECTIFIER": 4, - "MISSING_RACKS": [], - "DOWN_NODES": [], - "CPUS_PER_NODE": 2, - "GPUS_PER_NODE": 4, - "CPU_PEAK_FLOPS": 396.8E9, - "GPU_PEAK_FLOPS": 7.8E12, - "CPU_FP_RATIO": 0.69, - "GPU_FP_RATIO": 0.69 -} diff --git a/config/mit_supercloud/part-cpu.yaml b/config/mit_supercloud/part-cpu.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b780b10f6ea51d395189e8c3fdf44118a77e1d38 --- /dev/null +++ b/config/mit_supercloud/part-cpu.yaml @@ -0,0 +1,51 @@ +system: + num_cdus: 12 + racks_per_cdu: 1 + nodes_per_rack: 40 + chassis_per_rack: 8 + nodes_per_blade: 1 + switches_per_chassis: 4 + nics_per_node: 4 + rectifiers_per_chassis: 4 + nodes_per_rectifier: 4 + missing_racks: [] + down_nodes: [] + cpus_per_node: 2 + cores_per_cpu: 24 + gpus_per_node: 0 + cpu_peak_flops: 2995200000000.0 + gpu_peak_flops: 0 + cpu_fp_ratio: 0.667 + gpu_fp_ratio: 0.667 +power: + power_gpu_idle: 88 + power_gpu_max: 560 + power_cpu_idle: 1 + power_cpu_max: 6 + power_mem: 74.26 + power_nvme: 30 + power_nic: 20 + power_cdu: 8473.47 + power_switch: 250 + power_update_freq: 15 + rectifier_peak_threshold: 13670 + sivoc_loss_constant: 13 + sivoc_efficiency: 0.98 + 
rectifier_loss_constant: 17 + rectifier_efficiency: 0.96 + power_cost: 0.094 +scheduler: + multitenant: true + job_arrival_time: 1 + mtbf: 11 + trace_quanta: 10 + min_wall_time: 3600 + max_wall_time: 43200 + ui_update_freq: 900 + max_nodes_per_job: 3000 + job_end_probs: + COMPLETED: 0.63 + FAILED: 0.13 + CANCELLED: 0.12 + TIMEOUT: 0.11 + NODE_FAIL: 0.01 diff --git a/config/mit_supercloud/part-gpu.yaml b/config/mit_supercloud/part-gpu.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b8428318e5a5961aaa6353455ecc90c9b9000eb2 --- /dev/null +++ b/config/mit_supercloud/part-gpu.yaml @@ -0,0 +1,51 @@ +system: + num_cdus: 1 + racks_per_cdu: 1 + nodes_per_rack: 224 + chassis_per_rack: 8 + nodes_per_blade: 2 + switches_per_chassis: 4 + nics_per_node: 4 + rectifiers_per_chassis: 4 + nodes_per_rectifier: 4 + missing_racks: [] + down_nodes: [] + cpus_per_node: 2 + cores_per_cpu: 20 + gpus_per_node: 2 + cpu_peak_flops: 1248000000000.0 + gpu_peak_flops: 7800000000000.0 + cpu_fp_ratio: 0.667 + gpu_fp_ratio: 0.667 +power: + power_gpu_idle: 75 + power_gpu_max: 300 + power_cpu_idle: 90 + power_cpu_max: 280 + power_mem: 74.26 + power_nvme: 30 + power_nic: 20 + power_cdu: 8473.47 + power_switch: 250 + power_update_freq: 15 + rectifier_peak_threshold: 13670 + sivoc_loss_constant: 13 + sivoc_efficiency: 0.98 + rectifier_loss_constant: 17 + rectifier_efficiency: 0.96 + power_cost: 0.094 +scheduler: + multitenant: true + job_arrival_time: 900 + mtbf: 11 + trace_quanta: 20 + min_wall_time: 3600 + max_wall_time: 43200 + ui_update_freq: 900 + max_nodes_per_job: 192 + job_end_probs: + COMPLETED: 0.63 + FAILED: 0.13 + CANCELLED: 0.12 + TIMEOUT: 0.11 + NODE_FAIL: 0.01 diff --git a/config/perlmutter.yaml b/config/perlmutter.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e8de04c5720f506413142152ab89b8948c5c82c2 --- /dev/null +++ b/config/perlmutter.yaml @@ -0,0 +1,51 @@ +system: + num_cdus: 36 + racks_per_cdu: 3 + nodes_per_rack: 128 + rectifiers_per_rack: 32 + chassis_per_rack: 8 + nodes_per_blade: 2 + switches_per_chassis: 4 + nics_per_node: 4 + rectifiers_per_chassis: 4 + nodes_per_rectifier: 4 + missing_racks: [] + down_nodes: [] + cpus_per_node: 1 + gpus_per_node: 4 + cpu_peak_flops: 3580000000000.0 + gpu_peak_flops: 9700000000000.0 + cpu_fp_ratio: 0.667 + gpu_fp_ratio: 0.667 +power: + power_gpu_idle: 88 + power_gpu_max: 300 + power_cpu_idle: 90 + power_cpu_max: 280 + power_mem: 74.26 + power_nic: 20 + power_nvme: 30 + power_switch: 250 + power_cdu: 8473.47 + power_update_freq: 15 + rectifier_peak_threshold: 13670 + sivoc_loss_constant: 13 + sivoc_efficiency: 0.98 + rectifier_loss_constant: 17 + rectifier_efficiency: 0.96 + power_cost: 0.094 +scheduler: + seed: 42 + job_arrival_time: 900 + mtbf: 11 + trace_quanta: 10 + min_wall_time: 3600 + max_wall_time: 43200 + ui_update_freq: 900 + max_nodes_per_job: 3000 + job_end_probs: + COMPLETED: 0.63 + FAILED: 0.13 + CANCELLED: 0.12 + TIMEOUT: 0.11 + NODE_FAIL: 0.01 diff --git a/config/philly/2-gpu.yaml b/config/philly/2-gpu.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d7201b2e1b86e4aa38948b11c679d96d89817095 --- /dev/null +++ b/config/philly/2-gpu.yaml @@ -0,0 +1,51 @@ +system: + num_cdus: 1 + racks_per_cdu: 1 + nodes_per_rack: 321 + chassis_per_rack: 3 + nodes_per_blade: 2 + switches_per_chassis: 4 + nics_per_node: 4 + rectifiers_per_chassis: 4 + nodes_per_rectifier: 4 + missing_racks: [] + down_nodes: [] + cpus_per_node: 2 + cores_per_cpu: 20 + gpus_per_node: 2 + cpu_peak_flops: 
1248000000000.0 # assume Xeon E5-2690v4 CPU 64-bit + gpu_peak_flops: 9300000000000.0 # assume 12G P100 32-bit + cpu_fp_ratio: 0.667 + gpu_fp_ratio: 0.667 +power: + power_gpu_idle: 30 + power_gpu_max: 250 + power_cpu_idle: 90 + power_cpu_max: 270 + power_mem: 74.26 + power_nvme: 30 + power_nic: 20 + power_cdu: 8473.47 + power_switch: 250 + power_update_freq: 15 + rectifier_peak_threshold: 13670 + sivoc_loss_constant: 13 + sivoc_efficiency: 0.98 + rectifier_loss_constant: 17 + rectifier_efficiency: 0.96 + power_cost: 0.094 +scheduler: + multitenant: true + job_arrival_time: 900 + mtbf: 11 + trace_quanta: 20 + min_wall_time: 3600 + max_wall_time: 43200 + ui_update_freq: 900 + max_nodes_per_job: 192 + job_end_probs: + COMPLETED: 0.63 + FAILED: 0.13 + CANCELLED: 0.12 + TIMEOUT: 0.11 + NODE_FAIL: 0.01 diff --git a/config/philly/8-gpu.yaml b/config/philly/8-gpu.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1e922826ca1ea3cbef6b5562f3ed72d601dfd842 --- /dev/null +++ b/config/philly/8-gpu.yaml @@ -0,0 +1,51 @@ +system: + num_cdus: 1 + racks_per_cdu: 1 + nodes_per_rack: 231 + chassis_per_rack: 3 + nodes_per_blade: 2 + switches_per_chassis: 4 + nics_per_node: 4 + rectifiers_per_chassis: 4 + nodes_per_rectifier: 4 + missing_racks: [] + down_nodes: [] + cpus_per_node: 2 + cores_per_cpu: 20 + gpus_per_node: 8 + cpu_peak_flops: 1248000000000.0 # assume Xeon E5-2690v4 CPU 64-bit + gpu_peak_flops: 12000000000000.0 # assume 24G P40 32-bit + cpu_fp_ratio: 0.667 + gpu_fp_ratio: 0.667 +power: + power_gpu_idle: 50 + power_gpu_max: 250 + power_cpu_idle: 90 + power_cpu_max: 270 + power_mem: 74.26 + power_nvme: 30 + power_nic: 20 + power_cdu: 8473.47 + power_switch: 250 + power_update_freq: 15 + rectifier_peak_threshold: 13670 + sivoc_loss_constant: 13 + sivoc_efficiency: 0.98 + rectifier_loss_constant: 17 + rectifier_efficiency: 0.96 + power_cost: 0.094 +scheduler: + multitenant: true + job_arrival_time: 900 + mtbf: 11 + trace_quanta: 20 + min_wall_time: 3600 + max_wall_time: 43200 + ui_update_freq: 900 + max_nodes_per_job: 192 + job_end_probs: + COMPLETED: 0.63 + FAILED: 0.13 + CANCELLED: 0.12 + TIMEOUT: 0.11 + NODE_FAIL: 0.01 diff --git a/config/selene.yaml b/config/selene.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fcf8c1a34b967dd95c9d58073bf3a188624f1e25 --- /dev/null +++ b/config/selene.yaml @@ -0,0 +1,51 @@ +system: + num_cdus: 20 + racks_per_cdu: 7 + nodes_per_rack: 4 + rectifiers_per_rack: 32 + chassis_per_rack: 4 + nodes_per_blade: 1 + switches_per_chassis: 4 + nics_per_node: 4 + rectifiers_per_chassis: 4 + nodes_per_rectifier: 4 + missing_racks: [] + down_nodes: [] + cpus_per_node: 2 + gpus_per_node: 8 + cpu_peak_flops: 3481000000000.0 + gpu_peak_flops: 624000000000000.0 # BF8 performance + cpu_fp_ratio: 0.667 + gpu_fp_ratio: 0.667 +power: + power_gpu_idle: 88 + power_gpu_max: 400 + power_cpu_idle: 90 + power_cpu_max: 280 + power_mem: 74.26 + power_nic: 20 + power_nvme: 30 + power_switch: 250 + power_cdu: 8473.47 + power_update_freq: 15 + rectifier_peak_threshold: 13670 + sivoc_loss_constant: 13 + sivoc_efficiency: 0.98 + rectifier_loss_constant: 17 + rectifier_efficiency: 0.96 + power_cost: 0.094 +scheduler: + seed: 42 + job_arrival_time: 900 + mtbf: 11 + trace_quanta: 10 + min_wall_time: 3600 + max_wall_time: 43200 + ui_update_freq: 900 + max_nodes_per_job: 3000 + job_end_probs: + COMPLETED: 0.63 + FAILED: 0.13 + CANCELLED: 0.12 + TIMEOUT: 0.11 + NODE_FAIL: 0.01 diff --git a/config/setonix/part-cpu.yaml b/config/setonix/part-cpu.yaml new file 
mode 100644 index 0000000000000000000000000000000000000000..3d7ce90f7c70305a2862ad57b971ae16c92a7ec3 --- /dev/null +++ b/config/setonix/part-cpu.yaml @@ -0,0 +1,242 @@ +system: + num_cdus: 1 + racks_per_cdu: 7 + nodes_per_rack: 256 + chassis_per_rack: 8 + nodes_per_blade: 4 + switches_per_chassis: 4 + nics_per_node: 4 + rectifiers_per_chassis: 4 + nodes_per_rectifier: 4 + missing_racks: [] + down_nodes: + - 1600 + - 1601 + - 1602 + - 1603 + - 1604 + - 1605 + - 1606 + - 1607 + - 1608 + - 1609 + - 1610 + - 1611 + - 1612 + - 1613 + - 1614 + - 1615 + - 1616 + - 1617 + - 1618 + - 1619 + - 1620 + - 1621 + - 1622 + - 1623 + - 1624 + - 1625 + - 1626 + - 1627 + - 1628 + - 1629 + - 1630 + - 1631 + - 1632 + - 1633 + - 1634 + - 1635 + - 1636 + - 1637 + - 1638 + - 1639 + - 1640 + - 1641 + - 1642 + - 1643 + - 1644 + - 1645 + - 1646 + - 1647 + - 1648 + - 1649 + - 1650 + - 1651 + - 1652 + - 1653 + - 1654 + - 1655 + - 1656 + - 1657 + - 1658 + - 1659 + - 1660 + - 1661 + - 1662 + - 1663 + - 1664 + - 1665 + - 1666 + - 1667 + - 1668 + - 1669 + - 1670 + - 1671 + - 1672 + - 1673 + - 1674 + - 1675 + - 1676 + - 1677 + - 1678 + - 1679 + - 1680 + - 1681 + - 1682 + - 1683 + - 1684 + - 1685 + - 1686 + - 1687 + - 1688 + - 1689 + - 1690 + - 1691 + - 1692 + - 1693 + - 1694 + - 1695 + - 1696 + - 1697 + - 1698 + - 1699 + - 1700 + - 1701 + - 1702 + - 1703 + - 1704 + - 1705 + - 1706 + - 1707 + - 1708 + - 1709 + - 1710 + - 1711 + - 1712 + - 1713 + - 1714 + - 1715 + - 1716 + - 1717 + - 1718 + - 1719 + - 1720 + - 1721 + - 1722 + - 1723 + - 1724 + - 1725 + - 1726 + - 1727 + - 1728 + - 1729 + - 1730 + - 1731 + - 1732 + - 1733 + - 1734 + - 1735 + - 1736 + - 1737 + - 1738 + - 1739 + - 1740 + - 1741 + - 1742 + - 1743 + - 1744 + - 1745 + - 1746 + - 1747 + - 1748 + - 1749 + - 1750 + - 1751 + - 1752 + - 1753 + - 1754 + - 1755 + - 1756 + - 1757 + - 1758 + - 1759 + - 1760 + - 1761 + - 1762 + - 1763 + - 1764 + - 1765 + - 1766 + - 1767 + - 1768 + - 1769 + - 1770 + - 1771 + - 1772 + - 1773 + - 1774 + - 1775 + - 1776 + - 1777 + - 1778 + - 1779 + - 1780 + - 1781 + - 1782 + - 1783 + - 1784 + - 1785 + - 1786 + - 1787 + - 1788 + - 1789 + - 1790 + - 1791 + cores_per_cpu: 64 + cpus_per_node: 2 + gpus_per_node: 0 + cpu_peak_flops: 2509440000000.0 + gpu_peak_flops: 0 + cpu_fp_ratio: 0.667 + gpu_fp_ratio: 0.667 +power: + power_gpu_idle: 88 + power_gpu_max: 560 + power_cpu_idle: 90 + power_cpu_max: 280 + power_mem: 74.26 + power_nvme: 30 + power_nic: 20 + power_cdu: 8473.47 + power_switch: 250 + power_update_freq: 15 + rectifier_peak_threshold: 13670 + sivoc_loss_constant: 13 + sivoc_efficiency: 0.98 + rectifier_loss_constant: 17 + rectifier_efficiency: 0.96 + power_cost: 0.094 +scheduler: + job_arrival_time: 900 + mtbf: 11 + trace_quanta: 20 + min_wall_time: 3600 + max_wall_time: 43200 + ui_update_freq: 900 + max_nodes_per_job: 3000 + job_end_probs: + COMPLETED: 0.63 + FAILED: 0.13 + CANCELLED: 0.12 + TIMEOUT: 0.11 + NODE_FAIL: 0.01 diff --git a/config/setonix/part-cpu/power.json b/config/setonix/part-cpu/power.json deleted file mode 100644 index 5128c4c23cf302769724b9c20fdc5b3d8313b482..0000000000000000000000000000000000000000 --- a/config/setonix/part-cpu/power.json +++ /dev/null @@ -1,18 +0,0 @@ -{ - "POWER_GPU_IDLE": 88, - "POWER_GPU_MAX": 560, - "POWER_CPU_IDLE": 90, - "POWER_CPU_MAX": 280, - "POWER_MEM": 74.26, - "POWER_NVME": 30, - "POWER_NIC": 20, - "POWER_CDU": 8473.47, - "POWER_SWITCH": 250, - "POWER_UPDATE_FREQ": 15, - "RECTIFIER_PEAK_THRESHOLD": 13670, - "SIVOC_LOSS_CONSTANT": 13, - "SIVOC_EFFICIENCY": 0.98, - "RECTIFIER_LOSS_CONSTANT": 
17, - "RECTIFIER_EFFICIENCY": 0.96, - "POWER_COST": 0.094 -} diff --git a/config/setonix/part-cpu/scheduler.json b/config/setonix/part-cpu/scheduler.json deleted file mode 100644 index 0ea905d46db0dc3442594d0c90688eb281448eee..0000000000000000000000000000000000000000 --- a/config/setonix/part-cpu/scheduler.json +++ /dev/null @@ -1,18 +0,0 @@ -{ - "SEED": 42, - "JOB_ARRIVAL_TIME": 900, - "MTBF": 11, - "MAX_TIME": 88200, - "TRACE_QUANTA": 20, - "MIN_WALL_TIME": 3600, - "MAX_WALL_TIME": 43200, - "UI_UPDATE_FREQ": 900, - "MAX_NODES_PER_JOB": 3000, - "JOB_END_PROBS": { - "COMPLETED": 0.63, - "FAILED": 0.13, - "CANCELLED": 0.12, - "TIMEOUT": 0.11, - "NODE_FAIL": 0.01 - } -} diff --git a/config/setonix/part-cpu/system.json b/config/setonix/part-cpu/system.json deleted file mode 100644 index 79da14f1933925d2e3258e9bb4fa6c2d1e3e52ce..0000000000000000000000000000000000000000 --- a/config/setonix/part-cpu/system.json +++ /dev/null @@ -1,20 +0,0 @@ -{ - "NUM_CDUS": 1, - "RACKS_PER_CDU": 7, - "NODES_PER_RACK": 256, - "RECTIFIERS_PER_RACK": 32, - "CHASSIS_PER_RACK": 8, - "NODES_PER_BLADE": 4, - "SWITCHES_PER_CHASSIS": 4, - "NICS_PER_NODE": 4, - "RECTIFIERS_PER_CHASSIS": 4, - "NODES_PER_RECTIFIER": 4, - "MISSING_RACKS": [], - "DOWN_NODES": [1600, 1601, 1602, 1603, 1604, 1605, 1606, 1607, 1608, 1609, 1610, 1611, 1612, 1613, 1614, 1615, 1616, 1617, 1618, 1619, 1620, 1621, 1622, 1623, 1624, 1625, 1626, 1627, 1628, 1629, 1630, 1631, 1632, 1633, 1634, 1635, 1636, 1637, 1638, 1639, 1640, 1641, 1642, 1643, 1644, 1645, 1646, 1647, 1648, 1649, 1650, 1651, 1652, 1653, 1654, 1655, 1656, 1657, 1658, 1659, 1660, 1661, 1662, 1663, 1664, 1665, 1666, 1667, 1668, 1669, 1670, 1671, 1672, 1673, 1674, 1675, 1676, 1677, 1678, 1679, 1680, 1681, 1682, 1683, 1684, 1685, 1686, 1687, 1688, 1689, 1690, 1691, 1692, 1693, 1694, 1695, 1696, 1697, 1698, 1699, 1700, 1701, 1702, 1703, 1704, 1705, 1706, 1707, 1708, 1709, 1710, 1711, 1712, 1713, 1714, 1715, 1716, 1717, 1718, 1719, 1720, 1721, 1722, 1723, 1724, 1725, 1726, 1727, 1728, 1729, 1730, 1731, 1732, 1733, 1734, 1735, 1736, 1737, 1738, 1739, 1740, 1741, 1742, 1743, 1744, 1745, 1746, 1747, 1748, 1749, 1750, 1751, 1752, 1753, 1754, 1755, 1756, 1757, 1758, 1759, 1760, 1761, 1762, 1763, 1764, 1765, 1766, 1767, 1768, 1769, 1770, 1771, 1772, 1773, 1774, 1775, 1776, 1777, 1778, 1779, 1780, 1781, 1782, 1783, 1784, 1785, 1786, 1787, 1788, 1789, 1790, 1791], - "CPUS_PER_NODE": 2, - "GPUS_PER_NODE": 0, - "CPU_PEAK_FLOPS": 2.50944E12, - "GPU_PEAK_FLOPS": 0, - "CPU_FP_RATIO": 0.667, - "GPU_FP_RATIO": 0.667 -} diff --git a/config/setonix/part-gpu.yaml b/config/setonix/part-gpu.yaml new file mode 100644 index 0000000000000000000000000000000000000000..807a1cefbc2b0f27a633fa64045f719f5ead0dea --- /dev/null +++ b/config/setonix/part-gpu.yaml @@ -0,0 +1,114 @@ +system: + num_cdus: 1 + racks_per_cdu: 2 + nodes_per_rack: 128 + chassis_per_rack: 8 + nodes_per_blade: 2 + switches_per_chassis: 4 + nics_per_node: 4 + rectifiers_per_chassis: 4 + nodes_per_rectifier: 4 + missing_racks: [] + down_nodes: + - 192 + - 193 + - 194 + - 195 + - 196 + - 197 + - 198 + - 199 + - 200 + - 201 + - 202 + - 203 + - 204 + - 205 + - 206 + - 207 + - 208 + - 209 + - 210 + - 211 + - 212 + - 213 + - 214 + - 215 + - 216 + - 217 + - 218 + - 219 + - 220 + - 221 + - 222 + - 223 + - 224 + - 225 + - 226 + - 227 + - 228 + - 229 + - 230 + - 231 + - 232 + - 233 + - 234 + - 235 + - 236 + - 237 + - 238 + - 239 + - 240 + - 241 + - 242 + - 243 + - 244 + - 245 + - 246 + - 247 + - 248 + - 249 + - 250 + - 251 + - 252 + - 253 + - 254 + - 255 + 
cores_per_cpu: 64 + cpus_per_node: 1 + gpus_per_node: 4 + cpu_peak_flops: 2048000000000.0 + gpu_peak_flops: 52000000000000.0 + cpu_fp_ratio: 0.667 + gpu_fp_ratio: 0.667 +power: + power_gpu_idle: 88 + power_gpu_max: 560 + power_cpu_idle: 90 + power_cpu_max: 280 + power_mem: 74.26 + power_nvme: 30 + power_nic: 20 + power_cdu: 8473.47 + power_switch: 250 + power_update_freq: 15 + rectifier_peak_threshold: 13670 + sivoc_loss_constant: 13 + sivoc_efficiency: 0.98 + rectifier_loss_constant: 17 + rectifier_efficiency: 0.96 + power_cost: 0.094 +scheduler: + job_arrival_time: 900 + mtbf: 11 + trace_quanta: 20 + min_wall_time: 3600 + max_wall_time: 43200 + ui_update_freq: 900 + max_nodes_per_job: 192 + job_end_probs: + COMPLETED: 0.63 + FAILED: 0.13 + CANCELLED: 0.12 + TIMEOUT: 0.11 + NODE_FAIL: 0.01 diff --git a/config/setonix/part-gpu/power.json b/config/setonix/part-gpu/power.json deleted file mode 100644 index 5128c4c23cf302769724b9c20fdc5b3d8313b482..0000000000000000000000000000000000000000 --- a/config/setonix/part-gpu/power.json +++ /dev/null @@ -1,18 +0,0 @@ -{ - "POWER_GPU_IDLE": 88, - "POWER_GPU_MAX": 560, - "POWER_CPU_IDLE": 90, - "POWER_CPU_MAX": 280, - "POWER_MEM": 74.26, - "POWER_NVME": 30, - "POWER_NIC": 20, - "POWER_CDU": 8473.47, - "POWER_SWITCH": 250, - "POWER_UPDATE_FREQ": 15, - "RECTIFIER_PEAK_THRESHOLD": 13670, - "SIVOC_LOSS_CONSTANT": 13, - "SIVOC_EFFICIENCY": 0.98, - "RECTIFIER_LOSS_CONSTANT": 17, - "RECTIFIER_EFFICIENCY": 0.96, - "POWER_COST": 0.094 -} diff --git a/config/setonix/part-gpu/scheduler.json b/config/setonix/part-gpu/scheduler.json deleted file mode 100644 index 937b71da447f1422fcc5b329bc4eff75d78dfae9..0000000000000000000000000000000000000000 --- a/config/setonix/part-gpu/scheduler.json +++ /dev/null @@ -1,18 +0,0 @@ -{ - "SEED": 42, - "JOB_ARRIVAL_TIME": 900, - "MTBF": 11, - "MAX_TIME": 88200, - "TRACE_QUANTA": 20, - "MIN_WALL_TIME": 3600, - "MAX_WALL_TIME": 43200, - "UI_UPDATE_FREQ": 900, - "MAX_NODES_PER_JOB": 192, - "JOB_END_PROBS": { - "COMPLETED": 0.63, - "FAILED": 0.13, - "CANCELLED": 0.12, - "TIMEOUT": 0.11, - "NODE_FAIL": 0.01 - } -} diff --git a/config/setonix/part-gpu/system.json b/config/setonix/part-gpu/system.json deleted file mode 100644 index 6ffa5a42dd0bbc863d705e9fa9fd98266e7dba63..0000000000000000000000000000000000000000 --- a/config/setonix/part-gpu/system.json +++ /dev/null @@ -1,20 +0,0 @@ -{ - "NUM_CDUS": 1, - "RACKS_PER_CDU": 2, - "NODES_PER_RACK": 128, - "RECTIFIERS_PER_RACK": 32, - "CHASSIS_PER_RACK": 8, - "NODES_PER_BLADE": 2, - "SWITCHES_PER_CHASSIS": 4, - "NICS_PER_NODE": 4, - "RECTIFIERS_PER_CHASSIS": 4, - "NODES_PER_RECTIFIER": 4, - "MISSING_RACKS": [], - "DOWN_NODES": [192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255], - "CPUS_PER_NODE": 1, - "GPUS_PER_NODE": 4, - "CPU_PEAK_FLOPS": 2048E9, - "GPU_PEAK_FLOPS": 52E12, - "CPU_FP_RATIO": 0.667, - "GPU_FP_RATIO": 0.667 -} diff --git a/config/summit.yaml b/config/summit.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7b2b5fe301692ff48082c82dbe92de9b1ab0d444 --- /dev/null +++ b/config/summit.yaml @@ -0,0 +1,329 @@ +system: + num_cdus: 257 + racks_per_cdu: 1 + nodes_per_rack: 18 + chassis_per_rack: 1 + nodes_per_blade: 1 + switches_per_chassis: 5 + nics_per_node: 2 + rectifiers_per_chassis: 5 + 
nodes_per_rectifier: 4 + missing_racks: [] + down_nodes: [] + cpus_per_node: 2 + gpus_per_node: 6 + cpu_peak_flops: 436200000000.0 + gpu_peak_flops: 7800000000000.0 + cpu_fp_ratio: 0.674 + gpu_fp_ratio: 0.674 +power: + power_gpu_idle: 75 + power_gpu_max: 300 + power_cpu_idle: 47.25 + power_cpu_max: 300 + power_mem: 74.26 + power_nic: 21 + power_nvme: 45 + power_switch: 0 + power_cdu: 0 + power_update_freq: 10 + rectifier_peak_threshold: 0 + sivoc_loss_constant: 13 + sivoc_efficiency: 0.98 + rectifier_loss_constant: 17 + rectifier_efficiency: 0.96 + power_cost: 0.094 +scheduler: + job_arrival_time: 60 + mtbf: 11 + trace_quanta: 10 + min_wall_time: 3600 + max_wall_time: 43200 + ui_update_freq: 3600 + max_nodes_per_job: 3000 + job_end_probs: + COMPLETED: 0.63 + FAILED: 0.13 + CANCELLED: 0.12 + TIMEOUT: 0.11 + NODE_FAIL: 0.01 +cooling: + cooling_efficiency: 0.945 + wet_bulb_temp: 290.0 + zip_code: '37831' + country_code: US + fmu_path: "../models/POWER9CSM/fmus/summit.fmu" + fmu_column_mapping: + T_sec_r_C: "Rack Return Temperature (°C)" + T_sec_s_C: "Rack Supply Temperature (°C)" + p_sec_r_psig: "Rack Supply Pressure (psig)" + p_sec_s_psig: "Rack Return Pressure (psig)" + V_flow_sec_GPM: "Rack Flowrate (gpm)" + T_prim_r_C: "Facility Return Temperature (°C)" + T_prim_s_C: "Facility Supply Temperature (°C)" + p_prim_s_psig: "Facility Supply Pressure (psig)" + p_prim_r_psig: "Facility Return Pressure (psig)" + V_flow_prim_GPM: "Facility Flowrate (gpm)" + W_flow_CDUP_kW: "Work Done By CDUP (kW)" + temperature_keys: + - simulator_1_datacenter_1_computeBlock_1_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_2_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_3_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_4_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_5_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_6_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_7_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_8_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_9_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_10_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_11_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_12_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_13_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_14_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_15_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_16_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_17_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_18_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_19_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_20_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_21_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_22_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_23_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_24_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_25_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_26_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_27_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_28_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_29_cabinet_1_sources_T_Air + - 
simulator_1_datacenter_1_computeBlock_30_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_31_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_32_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_33_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_34_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_35_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_36_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_37_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_38_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_39_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_40_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_41_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_42_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_43_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_44_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_45_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_46_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_47_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_48_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_49_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_50_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_51_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_52_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_53_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_54_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_55_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_56_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_57_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_58_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_59_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_60_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_61_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_62_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_63_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_64_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_65_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_66_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_67_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_68_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_69_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_70_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_71_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_72_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_73_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_74_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_75_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_76_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_77_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_78_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_79_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_80_cabinet_1_sources_T_Air + - 
simulator_1_datacenter_1_computeBlock_81_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_82_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_83_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_84_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_85_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_86_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_87_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_88_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_89_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_90_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_91_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_92_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_93_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_94_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_95_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_96_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_97_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_98_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_99_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_100_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_101_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_102_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_103_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_104_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_105_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_106_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_107_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_108_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_109_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_110_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_111_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_112_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_113_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_114_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_115_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_116_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_117_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_118_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_119_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_120_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_121_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_122_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_123_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_124_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_125_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_126_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_127_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_128_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_129_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_130_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_131_cabinet_1_sources_T_Air + - 
simulator_1_datacenter_1_computeBlock_132_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_133_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_134_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_135_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_136_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_137_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_138_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_139_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_140_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_141_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_142_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_143_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_144_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_145_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_146_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_147_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_148_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_149_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_150_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_151_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_152_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_153_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_154_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_155_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_156_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_157_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_158_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_159_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_160_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_161_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_162_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_163_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_164_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_165_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_166_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_167_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_168_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_169_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_170_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_171_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_172_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_173_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_174_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_175_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_176_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_177_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_178_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_179_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_180_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_181_cabinet_1_sources_T_Air + - 
simulator_1_datacenter_1_computeBlock_182_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_183_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_184_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_185_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_186_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_187_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_188_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_189_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_190_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_191_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_192_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_193_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_194_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_195_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_196_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_197_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_198_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_199_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_200_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_201_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_202_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_203_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_204_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_205_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_206_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_207_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_208_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_209_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_210_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_211_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_212_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_213_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_214_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_215_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_216_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_217_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_218_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_219_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_220_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_221_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_222_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_223_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_224_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_225_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_226_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_227_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_228_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_229_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_230_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_231_cabinet_1_sources_T_Air + - 
simulator_1_datacenter_1_computeBlock_232_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_233_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_234_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_235_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_236_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_237_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_238_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_239_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_240_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_241_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_242_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_243_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_244_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_245_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_246_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_247_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_248_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_249_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_250_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_251_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_252_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_253_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_254_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_255_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_256_cabinet_1_sources_T_Air + - simulator_1_datacenter_1_computeBlock_257_cabinet_1_sources_T_Air + - simulator_1_centralEnergyPlant_1_coolingTowerLoop_1_sources_T_ext + w_htwps_key: "simulator[1].centralEnergyPlant[1].hotWaterLoop[1].summary.W_flow_HTWP_kW" + w_ctwps_key: "simulator[1].centralEnergyPlant[1].coolingTowerLoop[1].summary.W_flow_CTWP_kW" + w_cts_key: "simulator[1].centralEnergyPlant[1].coolingTowerLoop[1].summary.W_flow_CT_kW" diff --git a/config/summit/cooling.json b/config/summit/cooling.json deleted file mode 100644 index 3d1d2e4cc33ea0716e0078a6e9d9839126be70d8..0000000000000000000000000000000000000000 --- a/config/summit/cooling.json +++ /dev/null @@ -1,284 +0,0 @@ -{ - "COOLING_EFFICIENCY": 0.945, - "WET_BULB_TEMP": 290.0, - "ZIP_CODE": 37831, - "COUNTRY_CODE": "US", - "FMU_PATH": "models/POWER9CSM/fmus/summit.fmu", - "FMU_COLUMN_MAPPING": { - "T_sec_r_C": "Rack Return Temperature (\u00b0C)", - "T_sec_s_C": "Rack Supply Temperature (\u00b0C)", - "p_sec_r_psig": "Rack Supply Pressure (psig)", - "p_sec_s_psig": "Rack Return Pressure (psig)", - "V_flow_sec_GPM": "Rack Flowrate (gpm)", - "T_prim_r_C": "Facility Return Temperature (\u00b0C)", - "T_prim_s_C": "Facility Supply Temperature (\u00b0C)", - "p_prim_s_psig": "Facility Supply Pressure (psig)", - "p_prim_r_psig": "Facility Return Pressure (psig)", - "V_flow_prim_GPM": "Facility Flowrate (gpm)", - "W_flow_CDUP_kW": "Work Done By CDUP (kW)" - }, - "TEMPERATURE_KEYS": [ - "simulator_1_datacenter_1_computeBlock_1_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_2_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_3_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_4_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_5_cabinet_1_sources_T_Air", - 
"simulator_1_datacenter_1_computeBlock_6_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_7_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_8_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_9_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_10_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_11_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_12_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_13_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_14_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_15_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_16_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_17_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_18_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_19_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_20_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_21_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_22_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_23_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_24_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_25_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_26_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_27_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_28_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_29_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_30_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_31_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_32_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_33_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_34_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_35_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_36_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_37_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_38_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_39_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_40_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_41_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_42_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_43_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_44_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_45_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_46_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_47_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_48_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_49_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_50_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_51_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_52_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_53_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_54_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_55_cabinet_1_sources_T_Air", - 
"simulator_1_datacenter_1_computeBlock_56_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_57_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_58_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_59_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_60_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_61_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_62_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_63_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_64_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_65_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_66_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_67_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_68_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_69_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_70_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_71_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_72_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_73_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_74_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_75_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_76_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_77_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_78_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_79_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_80_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_81_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_82_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_83_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_84_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_85_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_86_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_87_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_88_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_89_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_90_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_91_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_92_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_93_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_94_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_95_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_96_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_97_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_98_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_99_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_100_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_101_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_102_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_103_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_104_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_105_cabinet_1_sources_T_Air", - 
"simulator_1_datacenter_1_computeBlock_106_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_107_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_108_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_109_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_110_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_111_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_112_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_113_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_114_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_115_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_116_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_117_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_118_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_119_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_120_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_121_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_122_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_123_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_124_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_125_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_126_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_127_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_128_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_129_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_130_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_131_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_132_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_133_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_134_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_135_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_136_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_137_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_138_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_139_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_140_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_141_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_142_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_143_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_144_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_145_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_146_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_147_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_148_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_149_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_150_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_151_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_152_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_153_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_154_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_155_cabinet_1_sources_T_Air", - 
"simulator_1_datacenter_1_computeBlock_156_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_157_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_158_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_159_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_160_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_161_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_162_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_163_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_164_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_165_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_166_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_167_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_168_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_169_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_170_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_171_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_172_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_173_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_174_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_175_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_176_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_177_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_178_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_179_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_180_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_181_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_182_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_183_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_184_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_185_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_186_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_187_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_188_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_189_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_190_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_191_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_192_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_193_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_194_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_195_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_196_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_197_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_198_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_199_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_200_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_201_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_202_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_203_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_204_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_205_cabinet_1_sources_T_Air", - 
"simulator_1_datacenter_1_computeBlock_206_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_207_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_208_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_209_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_210_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_211_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_212_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_213_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_214_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_215_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_216_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_217_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_218_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_219_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_220_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_221_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_222_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_223_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_224_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_225_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_226_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_227_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_228_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_229_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_230_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_231_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_232_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_233_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_234_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_235_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_236_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_237_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_238_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_239_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_240_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_241_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_242_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_243_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_244_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_245_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_246_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_247_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_248_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_249_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_250_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_251_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_252_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_253_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_254_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_255_cabinet_1_sources_T_Air", - 
"simulator_1_datacenter_1_computeBlock_256_cabinet_1_sources_T_Air", - "simulator_1_datacenter_1_computeBlock_257_cabinet_1_sources_T_Air", - "simulator_1_centralEnergyPlant_1_coolingTowerLoop_1_sources_T_ext" - ], - "W_HTWPs_KEY": "simulator[1].centralEnergyPlant[1].hotWaterLoop[1].summary.W_flow_HTWP_kW", - "W_CTWPs_KEY": "simulator[1].centralEnergyPlant[1].coolingTowerLoop[1].summary.W_flow_CTWP_kW", - "W_CTs_KEY": "simulator[1].centralEnergyPlant[1].coolingTowerLoop[1].summary.W_flow_CT_kW" - -} diff --git a/config/summit/power.json b/config/summit/power.json deleted file mode 100644 index af6fdaa009ef669533ad2196966b5583433bbd1f..0000000000000000000000000000000000000000 --- a/config/summit/power.json +++ /dev/null @@ -1,18 +0,0 @@ -{ - "POWER_GPU_IDLE": 75, - "POWER_GPU_MAX": 300, - "POWER_CPU_IDLE": 47.25, - "POWER_CPU_MAX": 300, - "POWER_MEM": 74.26, - "POWER_NIC": 21, - "POWER_NVME": 45, - "POWER_SWITCH": 0, - "POWER_CDU": 0, - "POWER_UPDATE_FREQ": 10, - "RECTIFIER_PEAK_THRESHOLD": 0, - "SIVOC_LOSS_CONSTANT": 13, - "SIVOC_EFFICIENCY": 0.98, - "RECTIFIER_LOSS_CONSTANT": 17, - "RECTIFIER_EFFICIENCY": 0.96, - "POWER_COST": 0.094 -} diff --git a/config/summit/scheduler.json b/config/summit/scheduler.json deleted file mode 100644 index 263f3ec2b8c5699b44496d4d1488ba884157c4db..0000000000000000000000000000000000000000 --- a/config/summit/scheduler.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "SEED": 42, - "JOB_ARRIVAL_TIME": 60, - "MTBF": 11, - "TRACE_QUANTA": 10, - "MIN_WALL_TIME": 3600, - "MAX_WALL_TIME": 43200, - "UI_UPDATE_FREQ": 3600, - "MAX_NODES_PER_JOB": 3000, - "JOB_END_PROBS": { - "COMPLETED": 0.63, - "FAILED": 0.13, - "CANCELLED": 0.12, - "TIMEOUT": 0.11, - "NODE_FAIL": 0.01 - } -} diff --git a/config/summit/system.json b/config/summit/system.json deleted file mode 100644 index 6fcd420cdcd51ae811fac37c6403fa9e3f67a0b5..0000000000000000000000000000000000000000 --- a/config/summit/system.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "NUM_CDUS": 257, - "RACKS_PER_CDU": 1, - "NODES_PER_RACK": 18, - "CHASSIS_PER_RACK": 1, - "NODES_PER_BLADE": 1, - "SWITCHES_PER_CHASSIS": 5, - "NICS_PER_NODE": 2, - "RECTIFIERS_PER_CHASSIS": 5, - "NODES_PER_RECTIFIER": 4, - "MISSING_RACKS": [], - "DOWN_NODES": [], - "CPUS_PER_NODE": 2, - "GPUS_PER_NODE": 6, - "CPU_PEAK_FLOPS": 436.2E9, - "GPU_PEAK_FLOPS": 7.8E12, - "CPU_FP_RATIO": 0.674, - "GPU_FP_RATIO": 0.674 -} diff --git a/experiments/bluewaters.yaml b/experiments/bluewaters.yaml new file mode 100644 index 0000000000000000000000000000000000000000..80ab1296ae7a902a650fc7aeedd9697608338cf8 --- /dev/null +++ b/experiments/bluewaters.yaml @@ -0,0 +1,6 @@ +system: bluewaters +replay: + - /opt/data/bluewaters +start: "20170328" +simulate_network: True +filter: "traffic > 1e8" diff --git a/experiments/frontier-hourly-1year-AI-proxy.yaml b/experiments/frontier-hourly-1year-AI-proxy.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c401c953c86d19b56891e501f27fc4146714d4b9 --- /dev/null +++ b/experiments/frontier-hourly-1year-AI-proxy.yaml @@ -0,0 +1,12 @@ +system: frontier +continuous-job-generation: True +downtime-first: 8h +downtime-interval: 7d +downtime-length: 4h +time: 356d +time-delta: 1h +numjobs: 1 +maxqueue: 30 +workload: randomAI +policy: fcfs +backfill: firstfit diff --git a/experiments/frontier.yaml b/experiments/frontier.yaml new file mode 100644 index 0000000000000000000000000000000000000000..280d95f8ae8e68329814e2f0d043cc2c3f20db84 --- /dev/null +++ b/experiments/frontier.yaml @@ -0,0 +1,4 @@ +system: frontier +replay: + - 
/opt/data/frontier/slurm/joblive/date=2024-01-18 + - /opt/data/frontier/jobprofile/date=2024-01-18 diff --git a/experiments/gcloudv2.yaml b/experiments/gcloudv2.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9c7a7005ca8a9f7d09da0b000227917ac488b917 --- /dev/null +++ b/experiments/gcloudv2.yaml @@ -0,0 +1,4 @@ +system: gcloudv2 +replay: + - /opt/data/gcloud/v2/google_cluster_data_2011_sample +start: 2011-05-02T00:10:00Z diff --git a/experiments/lassen.yaml b/experiments/lassen.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7ee04beeeae219c9cdbe6845f7042eff54839e1e --- /dev/null +++ b/experiments/lassen.yaml @@ -0,0 +1,9 @@ +system: lassen +replay: + - /opt/data/lassen/Lassen-Supercomputer-Job-Dataset +policy: fcfs +backfill: firstfit +fastforward: 365d +time: 12h +arrival: poisson +simulate_network: true diff --git a/experiments/marconi100.yaml b/experiments/marconi100.yaml new file mode 100644 index 0000000000000000000000000000000000000000..859222975067c11a03bf83395101572d87500463 --- /dev/null +++ b/experiments/marconi100.yaml @@ -0,0 +1,3 @@ +system: marconi100 +replay: + - /opt/data/marconi100/job_table.parquet diff --git a/experiments/mit-replay-24hrs.yaml b/experiments/mit-replay-24hrs.yaml new file mode 100644 index 0000000000000000000000000000000000000000..69900699af0d6b63d68720a1e4ad59ad9dcf4579 --- /dev/null +++ b/experiments/mit-replay-24hrs.yaml @@ -0,0 +1,6 @@ +# raps run-multi-part experiments/mit-replay-24hrs.yaml +partitions: ["mit_supercloud/part-cpu", "mit_supercloud/part-gpu"] +replay: + - /opt/data/mit_supercloud/202201 +start: 2021-05-21T00:00 +end: 2021-05-22T00:00 diff --git a/experiments/mit-synthetic.yaml b/experiments/mit-synthetic.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6a24946f41f3b21ada3546a074c46fabaf10932c --- /dev/null +++ b/experiments/mit-synthetic.yaml @@ -0,0 +1,3 @@ +# raps run-multi-part experiments/mit-synthetic.yaml +partitions: ["mit_supercloud/part-cpu", "mit_supercloud/part-gpu"] +workload: multitenant diff --git a/experiments/mitrl.yaml b/experiments/mitrl.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3bbd988d0d4005821e2b588304eb2bedf251e6fb --- /dev/null +++ b/experiments/mitrl.yaml @@ -0,0 +1,7 @@ +system: "mit_supercloud" +replay: + - /opt/data/mit_supercloud +start: 2021-05-21T00:00 +end: 2021-05-22T00:00 +episode_length: 500 +arrival: poisson diff --git a/hetero-setonix.py b/hetero-setonix.py deleted file mode 100644 index b42dcdb456b7b4e9a74c995a56c24966ac968676..0000000000000000000000000000000000000000 --- a/hetero-setonix.py +++ /dev/null @@ -1,85 +0,0 @@ -from raps.helpers import check_python_version -check_python_version() - -from args import args -import copy -args_dict1 = copy.deepcopy(vars(args)) -args_dict2 = copy.deepcopy(vars(args)) -print(args_dict1) -print(args_dict2) - -from raps.config import ConfigManager -from raps.ui import LayoutManager -from raps.flops import FLOPSManager -from raps.power import PowerManager, compute_node_power -from raps.scheduler import Scheduler -from raps.workload import Workload -from raps.utils import convert_to_seconds - -config1 = ConfigManager(system_name='setonix-cpu').get_config() -config2 = ConfigManager(system_name='setonix-gpu').get_config() - -args_dict1['config'] = config1 -args_dict2['config'] = config2 - -pm1 = PowerManager(compute_node_power, **config1) -pm2 = PowerManager(compute_node_power, **config2) - -fm1 = FLOPSManager(**args_dict1) -fm2 = FLOPSManager(**args_dict2) - 
-sc1 = Scheduler(power_manager=pm1, flops_manager=fm1, cooling_model=None, **args_dict1) -sc2 = Scheduler(power_manager=pm2, flops_manager=fm2, cooling_model=None, **args_dict2) - -layout_manager1 = LayoutManager(args.layout, scheduler=sc1, debug=args.debug, **config1) -layout_manager2 = LayoutManager(args.layout, scheduler=sc2, debug=args.debug, **config2) - -print(config1) -print(config2) -configs = [config1, config2] -wl = Workload(*configs) - -jobs = getattr(wl, args.workload)(num_jobs=args.numjobs) -print(jobs) - -# Separate jobs based on partition -jobs1 = [job for job in jobs if job['partition'] == 'setonix-cpu'] -jobs2 = [job for job in jobs if job['partition'] == 'setonix-gpu'] - -# Print counts for verification -print(f"Jobs for setonix-cpu: {len(jobs1)}") -print(f"Jobs for setonix-gpu: {len(jobs2)}") - -if args.time: - timesteps = convert_to_seconds(args.time) -else: - timesteps = 88200 # 24 hours - -if args.verbose: print(jobs) - -# Create generator objects for both partitions -gen1 = layout_manager1.run_stepwise(jobs1, timesteps=timesteps) -gen2 = layout_manager2.run_stepwise(jobs2, timesteps=timesteps) - -# Step through both generators in lockstep -#for _ in range(timesteps): -# next(gen1) # Advance first scheduler -# next(gen2) # Advance second scheduler - -for timestep in range(timesteps): - # Advance generators - next(gen1) - next(gen2) - - # Timestep - print(f"[DEBUG] Timestep: {timestep}") - - # Queue lengths - print(f"[DEBUG] setonix-cpu Queue: {len(layout_manager1.scheduler.queue)}") - print(f"[DEBUG] setonix-gpu Queue: {len(layout_manager2.scheduler.queue)}") - - # System utilization - sys_util1 = layout_manager1.scheduler.sys_util_history[-1][1] if layout_manager1.scheduler.sys_util_history else 0.0 - sys_util2 = layout_manager2.scheduler.sys_util_history[-1][1] if layout_manager2.scheduler.sys_util_history else 0.0 - print(f"[DEBUG] setonix-cpu Util: {sys_util1:.2f}%") - print(f"[DEBUG] setonix-gpu Util: {sys_util2:.2f}%") diff --git a/main.py b/main.py old mode 100644 new mode 100755 index 7e8919d3fa498fd2e6e17777e6743b08bccb2a9b..8ae0c064a4706645f13feba4243d2bb910572077 --- a/main.py +++ b/main.py @@ -1,258 +1,100 @@ -""" Shortest-job first (SJF) job schedule simulator """ - -import json -import numpy as np -import random -import pandas as pd +#!/usr/bin/env python3 +# PYTHON_ARGCOMPLETE_OK +""" +ExaDigiT Resource Allocator & Power Simulator (RAPS) +""" +import argparse +from pathlib import Path import os -import re -import time - -from tqdm import tqdm - -from raps.helpers import check_python_version -check_python_version() - -from args import args -args_dict = vars(args) -print(args_dict) - -from raps.config import ConfigManager -from raps.constants import OUTPUT_PATH, SEED -from raps.cooling import ThermoFluidsModel -from raps.ui import LayoutManager -from raps.flops import FLOPSManager -from raps.plotting import Plotter -from raps.power import PowerManager, compute_node_power, compute_node_power_validate -from raps.power import compute_node_power_uncertainties, compute_node_power_validate_uncertainties -from raps.engine import Engine -from raps.job import Job -from raps.telemetry import Telemetry -from raps.workload import Workload -from raps.account import Accounts -from raps.weather import Weather -from raps.utils import create_casename, convert_to_seconds, write_dict_to_file, next_arrival - -config = ConfigManager(system_name=args.system).get_config() - -if args.seed: - random.seed(SEED) - np.random.seed(SEED) - -if args.cooling: - cooling_model = 
ThermoFluidsModel(**config) - cooling_model.initialize() - args.layout = "layout2" - - if args_dict['start']: - cooling_model.weather = Weather(args_dict['start'], config=config) -else: - cooling_model = None - -if args.validate: - if args.uncertainties: - power_manager = PowerManager(compute_node_power_validate_uncertainties, **config) - else: - power_manager = PowerManager(compute_node_power_validate, **config) -else: - if args.uncertainties: - power_manager = PowerManager(compute_node_power_uncertainties, **config) - else: - power_manager = PowerManager(compute_node_power, **config) -args_dict['config'] = config -flops_manager = FLOPSManager(**args_dict) - -sc = Engine( - power_manager=power_manager, - flops_manager=flops_manager, - cooling_model=cooling_model, - **args_dict, -) -layout_manager = LayoutManager(args.layout, engine=sc, debug=args.debug, **config) - -if args.replay: - - if args.fastforward: - args.fastforward = convert_to_seconds(args.fastforward) - - td = Telemetry(**args_dict) - - # Try to extract date from given name to use as case directory - matched_date = re.search(r"\d{4}-\d{2}-\d{2}", args.replay[0]) - if matched_date: - extracted_date = matched_date.group(0) - DIR_NAME = "sim=" + extracted_date - else: - extracted_date = "Date not found" - DIR_NAME = create_casename() - - # Read telemetry data (either npz file or via custom data loader) - if args.replay[0].endswith(".npz"): # Replay .npz file - print(f"Loading {args.replay[0]}...") - jobs, accounts = td.load_snapshot(args.replay[0]) - - if args.scale: - for job in tqdm(jobs, desc=f"Scaling jobs to {args.scale} nodes"): - job['nodes_required'] = random.randint(1, args.scale) - job['requested_nodes'] = None # Setting to None triggers scheduler to assign nodes - - if args.reschedule == 'poisson': - print("available nodes:", config['AVAILABLE_NODES']) - for job in tqdm(jobs, desc="Rescheduling jobs"): - job['requested_nodes'] = None - job['submit_time'] = next_arrival(1 / config['JOB_ARRIVAL_TIME']) - elif args.reschedule == 'submit-time': - raise NotImplementedError - - else: # custom data loader - print(*args.replay) - jobs = td.load_data(args.replay) - accounts = Accounts(jobs) - sc.accounts = accounts - accounts_dict = accounts.to_dict() - td.save_snapshot(jobs, accounts, filename=DIR_NAME) +import textwrap +import copy +import gzip +import dill +import argcomplete - # Set number of timesteps based on the last job running which we assume - # is the maximum value of submit_time + wall_time of all the jobs - if args.time: - timesteps = convert_to_seconds(args.time) - else: - timesteps = int(max(job['wall_time'] + job['submit_time'] for job in jobs)) + 1 +# Implement shell completion using argcomplete +# Importing all of raps' dependencies like pandas etc can be rather slow, often taking 1-2 seconds. So for snappy shell +# completion we need avoid imports on the shell completion path. We could do this by shuffling the code around to +# create the parser without importing any heavy-weight libraries. But that would be a pain to maintain and track that +# pandas or scipy aren't accidentally imported transitively. Pandas can also be convenient to use in validating +# SimConfig etc, which is needed to build the argparser. So instead, we cache the generated argparser object so that +# shell completion can run without importing the rest of raps. 
+PARSER_CACHE = Path(__file__).parent / '.shell-completion-cache' - print(f'Simulating {len(jobs)} jobs for {timesteps} seconds') - time.sleep(1) -else: # Synthetic jobs - wl = Workload(config) - jobs = getattr(wl, args.workload)(num_jobs=args.numjobs) - job_accounts = Accounts(jobs) - if args.accounts_json: - loaded_accounts = Accounts.from_json_filename(args.accounts_json) - accounts = Accounts.merge(loaded_accounts,job_accounts) - else: - accounts = job_accounts +def shell_completion_add_parser(subparsers): + parser = subparsers.add_parser("shell-completion", description=textwrap.dedent(""" + Register shell completion for RAPS. + """).strip(), formatter_class=argparse.RawDescriptionHelpFormatter) - if args.verbose: - for job_vector in jobs: - job = Job(job_vector, 0) - print('jobid:', job.id, '\tlen(gpu_trace):', len(job.gpu_trace), '\twall_time(s):', job.wall_time) - time.sleep(2) + # Run the command from argcomplete, this edits ~/.bash_completion to register argcomplete + def impl(args): + os.system("activate-global-python-argcomplete") - if args.time: - timesteps = convert_to_seconds(args.time) - else: - timesteps = 88200 # 24 hours + parser.set_defaults(impl=impl) - DIR_NAME = create_casename() -OPATH = OUTPUT_PATH / DIR_NAME -print("Output directory is: ", OPATH) -sc.opath = OPATH -sc.accounts = accounts - -if args.plot or args.output: +def shell_complete(): try: - os.makedirs(OPATH) - except OSError as error: - print(f"Error creating directory: {error}") - -if args.verbose: - print(jobs) - -layout_manager.run(jobs, timesteps=timesteps) - -output_stats = sc.get_stats() -# Following b/c we get the following error when we use PM100 telemetry dataset -# TypeError: Object of type int64 is not JSON serializable -try: - print(json.dumps(output_stats, indent=4)) -except: - print(output_stats) - - -if args.plot: - if 'power' in args.plot: - pl = Plotter('Time (s)', 'Power (kW)', 'Power History', \ - OPATH / f'power.{args.imtype}', \ - uncertainties=args.uncertainties) - x, y = zip(*power_manager.history) - pl.plot_history(x, y) - - if 'util' in args.plot: - pl = Plotter('Time (s)', 'System Utilization (%)', \ - 'System Utilization History', OPATH / f'util.{args.imtype}') - x, y = zip(*sc.sys_util_history) - pl.plot_history(x, y) - - if 'loss' in args.plot: - pl = Plotter('Time (s)', 'Power Losses (kW)', 'Power Loss History', \ - OPATH / f'loss.{args.imtype}', \ - uncertainties=args.uncertainties) - x, y = zip(*power_manager.loss_history) - pl.plot_history(x, y) - - pl = Plotter('Time (s)', 'Power Losses (%)', 'Power Loss History', \ - OPATH / f'loss_pct.{args.imtype}', \ - uncertainties=args.uncertainties) - x, y = zip(*power_manager.loss_history_percentage) - pl.plot_history(x, y) - - if 'pue' in args.plot: - if cooling_model: - ylabel = 'PUE_Out[1]' - title = 'FMU ' + ylabel + 'History' - pl = Plotter('Time (s)', ylabel, title, OPATH / f'pue.{args.imtype}', \ - uncertainties=args.uncertainties) - df = pd.DataFrame(cooling_model.fmu_history) - df.to_parquet('cooling_model.parquet', engine='pyarrow') - pl.plot_history(df['time'], df[ylabel]) - else: - print('Cooling model not enabled... skipping output of plot') - - if 'temp' in args.plot: - if cooling_model: - ylabel = 'Tr_pri_Out[1]' - title = 'FMU ' + ylabel + 'History' - pl = Plotter('Time (s)', ylabel, title, OPATH / 'temp.svg') - df = pd.DataFrame(cooling_model.fmu_history) - df.to_parquet('cooling_model.parquet', engine='pyarrow') - pl.plot_compare(df['time'], df[ylabel]) - else: - print('Cooling model not enabled... 
skipping output of plot') - -if args.output: - - if args.uncertainties: - # Parquet cannot handle annotated ufloat format AFAIK - print('Data dump not implemented using uncertainties!') - else: - if cooling_model: - df = pd.DataFrame(cooling_model.fmu_history) - df.to_parquet(OPATH / 'cooling_model.parquet', engine='pyarrow') - - df = pd.DataFrame(power_manager.history) - df.to_parquet(OPATH / 'power_history.parquet', engine='pyarrow') - - df = pd.DataFrame(power_manager.loss_history) - df.to_parquet(OPATH / 'loss_history.parquet', engine='pyarrow') - - df = pd.DataFrame(sc.sys_util_history) - df.to_parquet(OPATH / 'util.parquet', engine='pyarrow') - - # Schedule history - job_history = pd.DataFrame(sc.get_job_history_dict()) - job_history.to_csv(OPATH / "job_history.csv", index=False) - - try: - with open(OPATH / 'stats.out', 'w') as f: - json.dump(output_stats, f, indent=4) - except: - write_dict_to_file(output_stats, OPATH / 'stats.out') - - try: - with open(OPATH / 'accounts.json', 'w') as f: - json_string = json.dumps(sc.accounts.to_dict()) - f.write(json_string) - except TypeError: - raise TypeError(f"{sc.accounts} could not be parsed by json.dump") - print("Output directory is: ", OPATH) # If output is enabled, the user wants this information as last output + parser = dill.loads(gzip.decompress(PARSER_CACHE.read_bytes())) + except Exception: + PARSER_CACHE.unlink(missing_ok=True) # delete cache if corrupted somehow + parser = argparse.ArgumentParser() + # Use a dummy parser so that autocomplete still handles sys.exit tab complete if there's no + # cache. Cache will be created on first run of `main.py` + + argcomplete.autocomplete(parser, always_complete_options=False) + + +def cache_parser(parser: argparse.ArgumentParser): + parser = copy.deepcopy(parser) + subparsers = next(a for a in parser._actions if isinstance(a, argparse._SubParsersAction)) + # Don't need to pickle the impl functions + for subparser in subparsers.choices.values(): + subparser.set_defaults(impl=lambda args: None) + + pickled = gzip.compress(dill.dumps(parser), compresslevel=4, mtime=0) + if not PARSER_CACHE.exists() or PARSER_CACHE.read_bytes() != pickled: + try: # Ignore if there's some kind of write or permission error + PARSER_CACHE.write_bytes(pickled) + except Exception: + pass + + +def main(cli_args: list[str] | None = None): + shell_complete() # will output shell completion and sys.exit during tab complete + + from raps.helpers import check_python_version + check_python_version() + + from raps.run_sim import run_sim_add_parser, run_parts_sim_add_parser, show_add_parser + from raps.workloads import run_workload_add_parser + from raps.telemetry import run_telemetry_add_parser, run_download_add_parser + from raps.train_rl import train_rl_add_parser + + parser = argparse.ArgumentParser( + description=""" + ExaDigiT Resource Allocator & Power Simulator (RAPS) + """, + allow_abbrev=False, + ) + subparsers = parser.add_subparsers(required=True) + + run_sim_add_parser(subparsers) + run_parts_sim_add_parser(subparsers) + show_add_parser(subparsers) + run_workload_add_parser(subparsers) + run_telemetry_add_parser(subparsers) + run_download_add_parser(subparsers) + train_rl_add_parser(subparsers) + shell_completion_add_parser(subparsers) + + cache_parser(parser) + + args = parser.parse_args(cli_args) + assert args.impl, "subparsers should add an impl function to args" + args.impl(args) + + +if __name__ == "__main__": + main() diff --git a/multi-part-sim.py b/multi-part-sim.py deleted file mode 100644 index 
342d463012a742a8744686a4f14a9412007aee6d..0000000000000000000000000000000000000000 --- a/multi-part-sim.py +++ /dev/null @@ -1,105 +0,0 @@ -from raps.helpers import check_python_version -check_python_version() - -import glob -import os -import random -import sys - -from args import args -from raps.config import ConfigManager, CONFIG_PATH -from raps.schedulers.default import PolicyType -from raps.ui import LayoutManager -from raps.engine import Engine -from raps.flops import FLOPSManager -from raps.power import PowerManager, compute_node_power -from raps.telemetry import Telemetry -from raps.workload import Workload -from raps.utils import convert_to_seconds, next_arrival -from tqdm import tqdm - -# Load configurations for each partition -partition_names = args.partitions - -print(args.partitions) -if '*' in args.partitions[0]: - paths = glob.glob(os.path.join(CONFIG_PATH, args.partitions[0])) - partition_names = [os.path.join(*p.split(os.sep)[-2:]) for p in paths] - -configs = [ConfigManager(system_name=partition).get_config() for partition in partition_names] -args_dicts = [{**vars(args), 'config': config} for config in configs] - -# Initialize Workload -if args.replay: - - # Currently this assumes that an .npz file has already been created - # e.g., python main.py --system marconi100 -f ~/data/marconi100/job_table.parquet - td = Telemetry(**args_dicts[0]) - print(f"Loading {args.replay[0]}...") - jobs = td.load_snapshot(args.replay[0]) - available_nodes = [config['AVAILABLE_NODES'] for config in configs] - print("available nodes:", available_nodes) - - # Randomly assign partition - for job in jobs: - job['partition'] = random.choices(partition_names, weights=available_nodes, k=1)[0] - - if args.scale: - for job in tqdm(jobs, desc=f"Scaling jobs to {args.scale} nodes"): - job['nodes_required'] = random.randint(1, args.scale) - job['requested_nodes'] = None # Setting to None triggers scheduler to assign nodes - - if args.reschedule == 'poisson': - for job in tqdm(jobs, desc="Rescheduling jobs"): - partition = job['partition'] - partition_config = configs[partition_names.index(partition)] - job['requested_nodes'] = None - job['submit_time'] = next_arrival(1 / partition_config['JOB_ARRIVAL_TIME']) - elif args.reschedule == 'submit-time': - raise NotImplementedError - -else: # Synthetic workload - wl = Workload(*configs) - - # Generate jobs based on workload type - jobs = getattr(wl, args.workload)(num_jobs=args.numjobs) - -# Group jobs by partition -jobs_by_partition = {partition: [] for partition in partition_names} -for job in jobs: - jobs_by_partition[job['partition']].append(job) - -# Initialize layout managers for each partition -layout_managers = {} -for i, config in enumerate(configs): - pm = PowerManager(compute_node_power, **configs[i]) - fm = FLOPSManager(**args_dicts[i]) - sc = Engine(power_manager=pm, flops_manager=fm, cooling_model=None, **args_dicts[i]) - layout_managers[config['system_name']] = LayoutManager(args.layout, engine=sc, debug=args.debug, **config) - -# Set simulation timesteps -if args.time: - timesteps = convert_to_seconds(args.time) -else: - timesteps = 88200 # Default to 24 hours - -# Create generators for each layout manager -generators = {name: lm.run_stepwise(jobs_by_partition[name], timesteps=timesteps) - for name, lm in layout_managers.items()} - -# Step through all generators in lockstep -for timestep in range(timesteps): - for name, gen in generators.items(): - next(gen) # Advance each generator - - # Print debug info every UI_UPDATE_FREQ - if timestep % 
configs[0]['UI_UPDATE_FREQ'] == 0: # Assuming same frequency for all partitions - sys_power = 0 - for name, lm in layout_managers.items(): - sys_util = lm.engine.sys_util_history[-1] if lm.engine.sys_util_history else 0.0 - print(f"[DEBUG] {name} - Timestep {timestep} - Jobs running: {len(lm.engine.running)} - Utilization: {sys_util[1]:.2f}% - Power: {lm.engine.sys_power:.1f}kW") - sys_power += lm.engine.sys_power - print(f"system power: {sys_power:.1f}kW") - -print("Simulation complete.") - diff --git a/pyproject.toml b/pyproject.toml index 1b3f2e0106bd54dc808aad0db5fe8b815173cf5b..8cc586dcd1d8b9aad8ef868b1cd08f6cf255cccc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,21 +4,39 @@ build-backend = "hatchling.build" [project] name = "raps" -version = "0.0.1" -requires-python = ">=3.9" +version = "2.0.0.dev0" +requires-python = ">=3.12" description = "RAPS" readme = "README.md" # license = {file = "LICENSE.txt"} dependencies = [ - "matplotlib==3.7.2", - "numpy==1.23.5", - "rich==13.6.0", - "fmpy==0.3.19", - "pandas==2.0.3", - "scipy==1.10.1", - "pyarrow==15.0.1", - "tqdm==4.66.5", - "uncertainties==3.2.1", - "requests==2.32.3" + "matplotlib>=3.7.2", + "numpy>=1.23.5", + "rich>=13.6.0", + "fmpy>=0.3.19", + "pandas>=2.0.3", + "scipy>=1.10.1", + "pyarrow>=15.0.1", + "tqdm>=4.66.5", + "uncertainties>=3.2.1", + "requests>=2.32.3", + "fsspec>=2025.5.1", + "gcsfs>=2025.5.1", + "networkx>=3.5", + "pytest", + "pytest-order", + "pytest-xdist", + "pyyaml>=6.0.2", + "pydantic>=2.11.7", + "pydantic-settings>=2.10.1", + "stable-baselines3==2.7.0", + "gym==0.26.2", + "dill==0.4.0", + "argcomplete==3.6.2", + "pyzmq==27.1.0", + "pre-commit" ] + +[project.scripts] +raps = "main:main" diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000000000000000000000000000000000000..78b07aa905b9a827b3fcbf5ff202c540b5299fe7 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,37 @@ +[pytest] +testpaths = tests +#python_paths = . +markers = + long: mark a test as long (skipped if not run with --runlong) + + main: raps basic main. 
+ telemetry: raps telemetry analysis + workload: raps workload generation + + system: mark a test as system (integration) test + unit: mark a test as a unit test + withdata: marks tests that require external data + nodata: marks tests that can run without external data + cooling: cooling argument test + uncertainty: uncertainty argment test + time: time argument test + fastforward: fastforward argument test + time_delta: time delta argument test + time_delta_sub_second: sub second time delta argument test + network: network model test + + 40frontiers: System test + adastraMI250: System test + bluewaters: System test + frontier: System test + fugaku: System test + gcloudv2: System test + lassen: System test + marconi100: System test + mit_supercloud: System test + setonix: System test + summit: System test + lumi: System test + + +addopts = -ra diff --git a/raps/__init__.py b/raps/__init__.py index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..a7f352320df24d3559a8d5556877faa40e132c5d 100644 --- a/raps/__init__.py +++ b/raps/__init__.py @@ -0,0 +1,16 @@ +from .sim_config import SimConfig, SingleSimConfig, MultiPartSimConfig +from .system_config import ( + SystemConfig, SystemCoolingConfig, SystemNetworkConfig, SystemPowerConfig, SystemSchedulerConfig, + SystemSystemConfig, SystemUqConfig, +) +from raps.schedulers.default import PolicyType, BackfillType +from .engine import Engine +from .multi_part_engine import MultiPartEngine + +__all__ = [ + "SimConfig", "SingleSimConfig", "MultiPartSimConfig", + "SystemConfig", "SystemCoolingConfig", "SystemNetworkConfig", "SystemPowerConfig", "SystemSchedulerConfig", + "SystemSystemConfig", "SystemUqConfig", + "PolicyType", "BackfillType", + "Engine", "MultiPartEngine", +] diff --git a/raps/account.py b/raps/account.py index 5ff80b992cfd71e96428a9baaf0833b4e314e31e..1bdf10f561255817cb83da8804a04406cf29c845 100644 --- a/raps/account.py +++ b/raps/account.py @@ -40,12 +40,13 @@ class Account: def update_fugaku_points(self, average_energy, average_power): if average_power == 0: - raise ValueError(f"{average_power} is zero") - self.fugaku_points = (average_energy - self.energy_allocated) / average_power + self.fugaku_points = 0 + else: + self.fugaku_points = (average_energy - self.energy_allocated) / average_power def update_statistics(self, jobstats, average_user): self.jobs_completed += 1 - self.time_allocated += jobstats.run_time + self.time_allocated += jobstats.run_time * jobstats.num_nodes self.energy_allocated += jobstats.energy if self.time_allocated == 0: self.avg_power = 0 @@ -93,7 +94,7 @@ class Account: return acct @classmethod - def merge(cls,account1:'Account', account2:'Account') -> 'Account': + def merge(cls, account1: 'Account', account2: 'Account') -> 'Account': """ Destructive merge @@ -139,7 +140,7 @@ class Accounts: self.average_user.energy_allocated = self.all_users.energy_allocated / total_accounts self.average_user.avg_power = self.all_users.avg_power / total_accounts if self.average_user.jobs_completed != 0.0: - self.average_user.update_fugaku_points(self.average_user.energy_allocated,self.average_user.avg_power) + self.average_user.update_fugaku_points(self.average_user.energy_allocated, self.average_user.avg_power) return self def __init__(self, jobs=None): @@ -147,10 +148,10 @@ class Accounts: self.all_users = Account(-2, "All_Users") self.average_user = Account(-1, "Avg_User") if jobs: - if not isinstance(jobs,list): + if not isinstance(jobs, list): raise TypeError for job_dict in jobs: - if not isinstance(job_dict,dict): + if not 
isinstance(job_dict, dict): raise TypeError if job_dict["account"] not in self.account_dict: self.account_dict[job_dict["account"]] = Account(job_dict["account"], jobs_enqueued=0) @@ -159,7 +160,7 @@ class Accounts: self.update_average_user() pass - def updates_all_users_by_account(self,account:Account): + def updates_all_users_by_account(self, account: Account): self.all_users.jobs_enqueued += account.jobs_enqueued self.all_users.jobs_completed += account.jobs_completed self.all_users.time_allocated += account.time_allocated @@ -167,16 +168,13 @@ class Accounts: self.all_users.avg_power = self.energy_allocated / self.time_allocated self.update_average_user() # Only necessary if averag_user was not updated before calling update all users. # Therefore As this is needed for fugaku points this should always be called. - self.all_users.update_fugaku_points(self.average_user.energy_allocated,self.average_user.avg_power) - + self.all_users.update_fugaku_points(self.average_user.energy_allocated, self.average_user.avg_power) - - def add_account(self, account:Account): + def add_account(self, account: Account): self.account_dict[account.name] = account self.add_user_stats_to_all_users(account) # update_average_user() is already called - @classmethod def from_dict(cls, dictionary): accounts = cls() @@ -218,12 +216,12 @@ class Accounts: account.update_statistics(jobstats, self.average_user) self.account_dict[jobstats.account] = account # Update the summary account (all_users) and the average_user account - self.all_users.update_statistics(jobstats,self.average_user) + self.all_users.update_statistics(jobstats, self.average_user) self.update_average_user() def to_dict(self): acct_dict = {} - for account_name,account in self.account_dict.items(): + for account_name, account in self.account_dict.items(): acct_dict[account_name] = account.to_dict() ret_dict = {} ret_dict['account_dict'] = acct_dict @@ -232,7 +230,7 @@ class Accounts: return ret_dict @classmethod - def merge(cls, accounts1:'Accounts', accounts2:'Accounts') -> 'Accounts': + def merge(cls, accounts1: 'Accounts', accounts2: 'Accounts') -> 'Accounts': """ Destructive merge of accounts """ @@ -241,7 +239,8 @@ class Accounts: for ac2_k, ac2_v in accounts2.account_dict.items(): if ac2_k in accounts1.account_dict: - merged_accounts.account_dict[ac2_k] = Account.merge(accounts1.account_dict[ac2_k], accounts2.account_dict[ac2_k]) + merged_accounts.account_dict[ac2_k] = Account.merge( + accounts1.account_dict[ac2_k], accounts2.account_dict[ac2_k]) else: merged_accounts.account_dict[ac2_k] = ac2_v for ac1_k, ac1_v in accounts1.account_dict.items(): @@ -252,15 +251,17 @@ class Accounts: pass # Update all users -> then update average user -> then fugagku points for all users (order is important!) - merged_accounts.all_users = Account.merge(accounts1.all_users,accounts2.all_users) + merged_accounts.all_users = Account.merge(accounts1.all_users, accounts2.all_users) merged_accounts.update_average_user() # Update to average user is needed before fugaku points can be caluculated. 
if merged_accounts.all_users.jobs_completed != 0: - merged_accounts.all_users.update_fugaku_points(merged_accounts.average_user.energy_allocated, merged_accounts.average_user.avg_power) + merged_accounts.all_users.update_fugaku_points( + merged_accounts.average_user.energy_allocated, merged_accounts.average_user.avg_power) for ac_k, ac_v in merged_accounts.account_dict.items(): if merged_accounts.account_dict[ac_k].jobs_completed != 0: - merged_accounts.account_dict[ac_k].update_fugaku_points(merged_accounts.average_user.energy_allocated, merged_accounts.average_user.avg_power) + merged_accounts.account_dict[ac_k].update_fugaku_points( + merged_accounts.average_user.energy_allocated, merged_accounts.average_user.avg_power) accounts1 = None accounts2 = None diff --git a/raps/config.py b/raps/config.py deleted file mode 100644 index 51e7d860a037e0f83591dbebcbca88c87f8ae857..0000000000000000000000000000000000000000 --- a/raps/config.py +++ /dev/null @@ -1,75 +0,0 @@ -import json -import os -from typing import Dict, Any -from pathlib import Path - -CONFIG_PATH = Path(os.environ.get("RAPS_CONFIG", 'config')).resolve() - - -class ConfigManager: - def __init__(self, system_name: str): - self.config: Dict[str, Any] = {} - self.load_system_config(system_name) - self.system_name = system_name - self.derive_values() - - def load_system_config(self, system_name: str) -> None: - base_path = CONFIG_PATH / system_name - config_files = ['system.json', 'power.json', 'scheduler.json'] - optional_files = ['cooling.json', 'uq.json'] - - for config_file in config_files + optional_files: - file_path = base_path / config_file - if config_file in optional_files and not file_path.exists(): - continue # Skip loading if the file is optional and doesn't exist - if not file_path.exists(): - raise FileNotFoundError(f"Mandatory configuration file {config_file} not found.") - config_data = self.load_config_file(file_path) - self.config.update(config_data) - - @staticmethod - def load_config_file(file_path: Path) -> dict[str, Any]: - with open(file_path, 'r') as file: - return json.load(file) - - def derive_values(self) -> None: - # Derive SC_SHAPE and TOTAL_NODES - num_cdus = self.config.get('NUM_CDUS', 0) - racks_per_cdu = self.config.get('RACKS_PER_CDU', 0) - nodes_per_rack = self.config.get('NODES_PER_RACK', 0) - chassis_per_rack = self.config.get('CHASSIS_PER_RACK', 0) - nodes_per_blade = self.config.get('NODES_PER_BLADE', 0) - down_nodes = self.config.get('DOWN_NODES', 0) - missing_racks = self.config.get('MISSING_RACKS', 0) - - self.config['NUM_RACKS'] = num_cdus * racks_per_cdu - len(missing_racks) - self.config['SC_SHAPE'] = [num_cdus, racks_per_cdu, nodes_per_rack] - self.config['TOTAL_NODES'] = num_cdus * racks_per_cdu * nodes_per_rack - self.config['BLADES_PER_CHASSIS'] = int(nodes_per_rack / chassis_per_rack / nodes_per_blade) - self.config['system_name'] = self.system_name - - # Generate POWER_DF_HEADER - power_df_header = ["CDU"] - for i in range(1, racks_per_cdu + 1): - power_df_header.append(f"Rack {i}") - power_df_header.append("Sum") - for i in range(1, racks_per_cdu + 1): - power_df_header.append(f"Loss {i}") - power_df_header.append("Loss") - self.config['POWER_DF_HEADER'] = power_df_header - - # Convert MISSING_RACKS into list of DOWN_NODES - for rack in missing_racks: - start_node_id = rack * nodes_per_rack - end_node_id = start_node_id + nodes_per_rack - down_nodes.extend(range(start_node_id, end_node_id)) - self.config['DOWN_NODES'] = down_nodes - - self.config['AVAILABLE_NODES'] = 
self.config['TOTAL_NODES'] - len(down_nodes) - - def get(self, key: str) -> Any: - return self.config.get(key) - - def get_config(self) -> Dict[str, Any]: - # Return the complete config dictionary - return self.config diff --git a/raps/constants.py b/raps/constants.py index 0cdd2fd5f90296df9d4388a252573a32505712b0..53711e1c0bfc38df15662219864c81d7974b9fef 100644 --- a/raps/constants.py +++ b/raps/constants.py @@ -5,4 +5,3 @@ from pathlib import Path ELLIPSES = '\u2026' OUTPUT_PATH = Path('simulation_results') -SEED = 42 diff --git a/raps/cooling.py b/raps/cooling.py index d45ab127097d1eafb8fd0dea0b64879d9dbfa24c..68a8a30a86a72a30c639661c16620f2c20fa32b1 100644 --- a/raps/cooling.py +++ b/raps/cooling.py @@ -1,10 +1,10 @@ """ -This module provides functionality for simulating a thermo-fluids model using +This module provides functionality for simulating a thermo-fluids model using an FMU (Functional Mock-up Unit). -The module defines a `ThermoFluidsModel` class that encapsulates the +The module defines a `ThermoFluidsModel` class that encapsulates the initialization, simulation step execution, -data conversion, and cleanup processes for the FMU-based model. +data conversion, and cleanup processes for the FMU-based model. """ import shutil import re @@ -16,13 +16,16 @@ from fmpy import read_model_description, extract from fmpy.fmi2 import FMU2Slave from datetime import timedelta +from raps.weather import Weather + + def get_matching_variables(variables, pattern): # Regex pattern to match strings containing .summary pattern = re.compile(pattern) # Filtering the list using the regex pattern filtered_vars = [var for var in variables if pattern.match(var)] - + return filtered_vars @@ -30,9 +33,9 @@ class ThermoFluidsModel: """ A class to represent a thermo-fluids model using an FMU (Functional Mock-up Unit). - This class encapsulates the initialization, simulation step execution, data conversion, - and cleanup processes for the FMU-based thermo-fluids model. It provides methods to - initialize the model, execute simulation steps, generate runtime values, calculate Power + This class encapsulates the initialization, simulation step execution, data conversion, + and cleanup processes for the FMU-based thermo-fluids model. It provides methods to + initialize the model, execute simulation steps, generate runtime values, calculate Power Usage Effectiveness (PUE), and properly manage the FMU resources. Attributes @@ -40,7 +43,7 @@ class ThermoFluidsModel: FMU_PATH : str The file path to the FMU file. fmu_history : list - A list to store the history of FMU states, combining cooling input, datacenter output, + A list to store the history of FMU states, combining cooling input, datacenter output, and central energy plant (CEP) output for each simulation step. inputs : list A list of input variables for the FMU. @@ -56,23 +59,24 @@ class ThermoFluidsModel: Methods ------- initialize(): - Initializes the FMU by extracting the file, reading the model description, setting up input and output variables, - and preparing the model for simulation. + Initializes the FMU by extracting the file, reading the model description, + setting up input and output variables, and preparing the model for simulation. generate_runtime_values(cdu_power, sc) -> dict: Generates runtime values dynamically for the FMU inputs based on CDU power and other configuration parameters. 
generate_fmu_inputs(runtime_values: dict, uncertainties: bool = False) -> list: Converts runtime values to a list suitable for FMU inputs, handling uncertainties if specified. calculate_pue(cooling_input: dict, datacenter_output: dict, cep_output: dict) -> float: - Calculates the Power Usage Effectiveness (PUE) of the data center based on the cooling, datacenter, + Calculates the Power Usage Effectiveness (PUE) of the data center based on the cooling, datacenter, and CEP output power values. step(current_time: float, fmu_inputs: list, step_size: float) -> Tuple[dict, dict, dict, float]: - Executes a simulation step with the given inputs and step size. Returns the cooling input, datacenter output, + Executes a simulation step with the given inputs and step size. Returns the cooling input, datacenter output, CEP output, and PUE for the current step. terminate(): Terminates the FMU instance, ensuring that all resources are properly released. cleanup(): Cleans up the extracted FMU directory, ensuring no temporary files are left behind. """ + def __init__(self, **config): """ Constructs all the necessary attributes for the ThermoFluidsModel object. @@ -88,8 +92,8 @@ class ThermoFluidsModel: self.outputs = None self.unzipdir = None self.fmu = None - self.weather = None - + self.weather: Weather | None = None + def initialize(self): """ Initializes the FMU by extracting the file and setting up the model. @@ -115,7 +119,7 @@ class ThermoFluidsModel: # Get the value references for the variables we want to get/set self.inputs = [v for v in model_description.modelVariables if v.causality == 'input'] self.outputs = [v for v in model_description.modelVariables if v.name in outputs] - + # Instantiate and initialize the FMU self.fmu = FMU2Slave(guid=model_description.guid, unzipDirectory=self.unzipdir, @@ -126,7 +130,7 @@ class ThermoFluidsModel: self.fmu.enterInitializationMode() self.fmu.exitInitializationMode() - def generate_runtime_values(self, cdu_power, sc) -> dict: + def generate_runtime_values(self, cdu_power, engine) -> dict: """ Generate the runtime values for the FMU inputs dynamically. 
@@ -139,18 +143,20 @@ class ThermoFluidsModel: """ # Dynamically generate the power inputs runtime_values = { - f"simulator_1_datacenter_1_computeBlock_{i+1}_cabinet_1_sources_Q_flow_total": cdu_power[i] * self.config['COOLING_EFFICIENCY'] / self.config['RACKS_PER_CDU'] - for i in range(self.config['NUM_CDUS']) + f"simulator_1_datacenter_1_computeBlock_{i + 1}" + f"_cabinet_1_sources_Q_flow_total": cdu_power[i] * + self.config['COOLING_EFFICIENCY'] / self.config['RACKS_PER_CDU'] + for i in range(self.config['NUM_CDUS']) } # Default temperature is from the config temperature = self.config['WET_BULB_TEMP'] # If replay mode is on and weather data is available - if sc.replay and self.weather and self.weather.start is not None and self.weather.has_coords: + if self.weather and self.weather.has_coords: # Convert total seconds to timedelta object - delta = timedelta(seconds=sc.current_time) - target_datetime = self.weather.start + delta + delta = timedelta(seconds=engine.current_timestep - engine.timestep_start) + target_datetime = engine.start + delta # Get temperature from weather data temperature = self.weather.get_temperature(target_datetime) or self.config['WET_BULB_TEMP'] @@ -160,7 +166,7 @@ class ThermoFluidsModel: runtime_values[temperature_key] = temperature return runtime_values - + def generate_fmu_inputs(self, runtime_values, uncertainties=False): """ Convert the runtime values based on the cooling model's inputs to a list suitable for FMU inputs. @@ -183,7 +189,7 @@ class ThermoFluidsModel: # Helper function to process uncertainty def process_uncertainty(value): - """Strip uncertainty if present, otherwise return the value as-is.""" + """Strip uncertainty if present, otherwise return the value as-is.""" # Convert to nominal value if it's an AffineScalarFunc and uncertainties flag is set return unumpy.nominal_values(value) if uncertainties and isinstance(value, AffineScalarFunc) else value @@ -202,7 +208,6 @@ class ThermoFluidsModel: return fmu_inputs - def calculate_pue(self, cooling_input, cooling_output): """ Calculate the Power Usage Effectiveness (PUE) of the data center. @@ -233,7 +238,8 @@ class ThermoFluidsModel: # Get the sum of the work done by all CDU pumps W_CDUPs = sum( - convert_to_watts(cooling_output.get(f'simulator[1].datacenter[1].computeBlock[{idx+1}].cdu[1].summary.W_flow_CDUP_kW')) + convert_to_watts(cooling_output.get( + f'simulator[1].datacenter[1].computeBlock[{idx + 1}].cdu[1].summary.W_flow_CDUP_kW')) for idx in range(self.config['NUM_CDUS']) ) @@ -244,10 +250,11 @@ class ThermoFluidsModel: total_input_power = np.maximum(total_cooling_input_power, 1e-3) # Calculate PUE - pue = (total_input_power + np.sum(W_CDUPs) + np.sum(W_HTWPs) + np.sum(W_CTWPs) + np.sum(W_CTs)) / total_input_power + pue = (total_input_power + np.sum(W_CDUPs) + np.sum(W_HTWPs) + + np.sum(W_CTWPs) + np.sum(W_CTs)) / total_input_power return pue - + def step(self, current_time, fmu_inputs, step_size): """ Executes a simulation step with the given inputs and step size. 
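Quick sanity check of the PUE formula in `calculate_pue` above, as a minimal sketch with made-up wattages (none of these numbers come from a real run); the four overhead terms stand in for the code's W_CDUPs, W_HTWPs, W_CTWPs and W_CTs sums:

    # Illustrative only: hypothetical facility powers in watts
    it_power = 20_000_000.0   # total_input_power (cooling input power from the racks)
    w_cdups  = 150_000.0      # stands in for the summed W_CDUPs term
    w_htwps  = 250_000.0      # W_HTWPs term
    w_ctwps  = 200_000.0      # W_CTWPs term
    w_cts    = 400_000.0      # W_CTs term
    pue = (it_power + w_cdups + w_htwps + w_ctwps + w_cts) / it_power
    print(round(pue, 3))      # 1.05 for these numbers
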
@@ -315,3 +322,15 @@ class ThermoFluidsModel: """ # Cleanup - at the end of the simulation shutil.rmtree(self.unzipdir, ignore_errors=True) + + def simulate_cooling(self, *, rack_power, engine): + cdu_power = rack_power.T[-1] * 1000 + runtime_values = self.generate_runtime_values(cdu_power, engine) + + # FMU inputs are N powers and the wetbulb temp + fmu_inputs = self.generate_fmu_inputs(runtime_values, + uncertainties=engine.power_manager.uncertainties) + cooling_inputs, cooling_outputs = self.step(engine.current_timestep, + fmu_inputs, + engine.config['POWER_UPDATE_FREQ']) + return cooling_inputs, cooling_outputs diff --git a/raps/dataloaders/adastraMI250.py b/raps/dataloaders/adastraMI250.py index 58eaec700adf2463c4c841ced1ef5df7f15c6f12..0f7f3663f32b06c787e1b6187d2ca6ff84b1ab4f 100644 --- a/raps/dataloaders/adastraMI250.py +++ b/raps/dataloaders/adastraMI250.py @@ -1,29 +1,33 @@ """ +# get the data +``` +raps download --system adastraMI250 +``` +This will download the dataset from https://zenodo.org/records/14007065/files/AdastaJobsMI250_15days.parquet - # get the data - Download `AdastaJobsMI250_15days.parquet` from https://zenodo.org/records/14007065/files/AdastaJobsMI250_15days.parquet +# to simulate the dataset +raps run -f /path/to/AdastaJobsMI250_15days.parquet --system adastraMI250 +# to replay with different scheduling policy +raps run -f /path/to/AdastaJobsMI250_15days.parquet --system adastraMI250 --policy priority --backfill easy - # to simulate the dataset - python main.py -f /path/to/AdastaJobsMI250_15days.parquet --system adastra - - # to reschedule - python main.py -f /path/to/AdastaJobsMI250_15days.parquet --system adastra --reschedule poisson - - # to fast-forward 60 days and replay for 1 day - python main.py -f /path/to/AdastaJobsMI250_15days.parquet --system adastra -ff 60d -t 1d - - # to analyze dataset - python -m raps.telemetry -f /path/to/AdastaJobsMI250_15days.parquet --system adastra -v +# to run a specific time range +raps run -f /path/to/AdastaJobsMI250_15days.parquet --system adastraMI250 \ + --start 2024-11-01T00:00:00Z --end 2024-11-02T00:00:00Z +# to analyze dataset +python -m raps.telemetry -f /path/to/AdastaJobsMI250_15days.parquet --system adastraMI250 -v """ import uuid import numpy as np import pandas as pd +from pathlib import Path +from datetime import datetime from tqdm import tqdm +import urllib.request -from ..job import job_dict -from ..utils import power_to_utilization, next_arrival +from ..job import job_dict, Job +from ..utils import WorkloadData def load_data(jobs_path, **kwargs): @@ -52,74 +56,71 @@ def load_data_from_df(jobs_df: pd.DataFrame, **kwargs): ------- list The list of parsed jobs. + telemetry_start + telemetry_end """ count_jobs_notOK = 0 config = kwargs.get('config') - min_time = kwargs.get('min_time', None) - reschedule = kwargs.get('reschedule') - fastforward = kwargs.get('fastforward') validate = kwargs.get('validate') jid = kwargs.get('jid', '*') - if fastforward: print(f"fast-forwarding {fastforward} seconds") - # Sort jobs dataframe based on values in time_start column, adjust indices after sorting jobs_df = jobs_df.sort_values(by='start_time') jobs_df = jobs_df.reset_index(drop=True) - # Take earliest time as baseline reference - # We can use the start time of the first job. 
- if min_time: - time_zero = min_time - else: - time_zero = jobs_df['start_time'].min() + # We only have average power, therefore use the first start time as the start time for the telemetry + telemetry_start_timestamp = jobs_df['start_time'].min() + telemetry_end_timestamp = jobs_df['end_time'].max() + + telemetry_start_time = 0 + diff = telemetry_end_timestamp - telemetry_start_timestamp + telemetry_end_time = int(diff.total_seconds()) num_jobs = len(jobs_df) - print("time_zero:", time_zero, "num_jobs", num_jobs) + print("First start time:", telemetry_start_timestamp, "num_jobs", num_jobs) jobs = [] # Map dataframe to job state. Add results to jobs list for jidx in tqdm(range(num_jobs - 1), total=num_jobs, desc="Processing Jobs"): - - account = jobs_df.loc[jidx, 'user_id'] # or 'group_id' job_id = jobs_df.loc[jidx, 'job_id'] - if not jid == '*': if int(jid) == int(job_id): print(f'Extracting {job_id} profile') else: continue nodes_required = jobs_df.loc[jidx, 'num_nodes_alloc'] - name = str(uuid.uuid4())[:6] - wall_time = jobs_df.loc[jidx, 'run_time'] + account = jobs_df.loc[jidx, 'user_id'] + + wall_time = int(jobs_df.loc[jidx, 'run_time']) if wall_time <= 0: - print("error wall_time",wall_time) + print("error wall_time", wall_time) continue if nodes_required <= 0: - print("error nodes_required",nodes_required) + print("error nodes_required", nodes_required) continue - #wall_time = gpu_trace.size * TRACE_QUANTA # seconds if validate: node_power = jobs_df.loc[jidx, 'node_power_consumption'] node_power_array = node_power.tolist() - node_watts = sum(node_power_array) / (wall_time*nodes_required) + node_watts = sum(node_power_array) / (wall_time * nodes_required) cpu_trace = node_watts gpu_trace = 0.0 # should contain stddev_node_power when --validate flag is used else: cpu_power = jobs_df.loc[jidx, 'cpu_power_consumption'] cpu_power_array = cpu_power.tolist() - cpu_watts = sum(cpu_power_array) / (wall_time*nodes_required) - cpu_min_power = config['POWER_CPU_IDLE'] * config['CPUS_PER_NODE'] - cpu_max_power = config['POWER_CPU_MAX'] * config['CPUS_PER_NODE'] + cpu_watts = sum(cpu_power_array) / (wall_time * nodes_required) + # cpu_min_power = config['POWER_CPU_IDLE'] * config['CPUS_PER_NODE'] # Unused + # cpu_max_power = config['POWER_CPU_MAX'] * config['CPUS_PER_NODE'] # Unused + cpu_util = (cpu_watts / float(config['POWER_CPU_IDLE']) - config['CPUS_PER_NODE']) \ + / ((float(config['POWER_CPU_MAX']) / float(config['POWER_CPU_IDLE'])) - 1.0) + # power_to_utilization(cpu_power_array, cpu_min_power, cpu_max_power) + # print("cpu_watts",cpu_watts,"cpu_util",cpu_util) - cpu_util = (cpu_watts/float(config['POWER_CPU_IDLE']) - config['CPUS_PER_NODE']) / ((float(config['POWER_CPU_MAX']) / float(config['POWER_CPU_IDLE'])) -1.0) #power_to_utilization(cpu_power_array, cpu_min_power, cpu_max_power) - # print("cpu_watts",cpu_watts,"cpu_util",cpu_util) cpu_trace = np.maximum(0, cpu_util) node_power = (jobs_df.loc[jidx, 'node_power_consumption']).tolist() @@ -132,49 +133,82 @@ def load_data_from_df(jobs_df: pd.DataFrame, **kwargs): mem_power = mem_power[:min_length] gpu_power = (node_power - cpu_power - mem_power - - ([config['NICS_PER_NODE'] * config['POWER_NIC']])) + - ([config['NICS_PER_NODE'] * config['POWER_NIC']])) gpu_power_array = gpu_power.tolist() - gpu_watts = sum(gpu_power_array) / (wall_time*nodes_required) - gpu_min_power = config['POWER_GPU_IDLE'] * config['GPUS_PER_NODE'] - gpu_max_power = config['POWER_GPU_MAX'] * config['GPUS_PER_NODE'] - gpu_util = 
(gpu_watts/float(config['POWER_GPU_IDLE']) - config['GPUS_PER_NODE']) / ((float(config['POWER_GPU_MAX']) / float(config['POWER_GPU_IDLE'])) -1.0) #power_to_utilization(cpu_power_array, cpu_min_power, cpu_max_power) - # print("gpu_watts",gpu_watts,"gpu_util",gpu_util) - gpu_trace = np.maximum(0, gpu_util) #gpu_util * GPUS_PER_NODE - - priority = int(jobs_df.loc[jidx, 'priority']) + gpu_watts = sum(gpu_power_array) / (wall_time * nodes_required) + # gpu_min_power = config['POWER_GPU_IDLE'] * config['GPUS_PER_NODE'] # Unused + # gpu_max_power = config['POWER_GPU_MAX'] * config['GPUS_PER_NODE'] # Unused + gpu_util = (gpu_watts / float(config['POWER_GPU_IDLE']) - config['GPUS_PER_NODE']) \ + / ((float(config['POWER_GPU_MAX']) / float(config['POWER_GPU_IDLE'])) - 1.0) + # power_to_utilization(gpu_power_array, gpu_min_power, gpu_max_power) + # print("gpu_watts",gpu_watts,"gpu_util",gpu_util) + gpu_trace = np.maximum(0, gpu_util) end_state = jobs_df.loc[jidx, 'job_state'] - time_start = jobs_df.loc[jidx, 'start_time'] - time_end = jobs_df.loc[jidx, 'end_time'] - diff = time_start - time_zero - if jid == '*': - time_offset = max(diff.total_seconds(), 0) - else: - # When extracting out a single job, run one iteration past the end of the job - time_offset = config['UI_UPDATE_FREQ'] - - if fastforward: - time_offset -= fastforward - - if reschedule == 'poisson': # Let the scheduler reschedule the jobs - scheduled_nodes = None - time_offset = next_arrival(1/config['JOB_ARRIVAL_TIME']) - elif reschedule == 'submit-time': - raise NotImplementedError - else: # Prescribed replay - scheduled_nodes = (jobs_df.loc[jidx, 'nodes']).tolist() - - if time_offset >= 0 and wall_time > 0: - #print("start_time",time_start,"\tend_time",time_end,"\twall_time",wall_time,"\tquanta wall time",gpu_trace.size * TRACE_QUANTA ) - job_info = job_dict(nodes_required, name, account, cpu_trace, gpu_trace, [],[],wall_time, - end_state, scheduled_nodes, time_offset, job_id, priority) - jobs.append(job_info) + priority = int(jobs_df.loc[jidx, 'priority']) + + scheduled_nodes = (jobs_df.loc[jidx, 'nodes']).tolist() + + submit_timestamp = jobs_df.loc[jidx, 'submit_time'] + diff = submit_timestamp - telemetry_start_timestamp + submit_time = int(diff.total_seconds()) + + time_limit = jobs_df.loc[jidx, 'time_limit'] # in seconds + + start_timestamp = jobs_df.loc[jidx, 'start_time'] + diff = start_timestamp - telemetry_start_timestamp + start_time = int(diff.total_seconds()) + + end_timestamp = jobs_df.loc[jidx, 'end_time'] + diff = end_timestamp - telemetry_start_timestamp + end_time = int(diff.total_seconds()) + + if wall_time != end_time - start_time: + print("wall_time != end_time - start_time") + print(f"{wall_time} != {end_time - start_time}") + print(jobs_df[jidx]) + + trace_time = wall_time + trace_start_time = end_time + trace_end_time = start_time + + if wall_time > 0: + job_info = job_dict(nodes_required=nodes_required, + name=name, + account=account, + cpu_trace=cpu_trace, + gpu_trace=gpu_trace, + ntx_trace=[], + nrx_trace=[], + end_state=end_state, + scheduled_nodes=scheduled_nodes, + id=job_id, + priority=priority, + submit_time=submit_time, + time_limit=time_limit, + start_time=start_time, + end_time=end_time, + expected_run_time=wall_time, + current_run_time=0, + trace_time=trace_time, + trace_start_time=trace_start_time, + trace_end_time=trace_end_time, + trace_quanta=None, + trace_missing_values=True + ) + job = Job(job_info) + jobs.append(job) else: - count_jobs_notOK = count_jobs_notOK + 1 + count_jobs_notOK += 1 + + 
print("jobs not added: ", count_jobs_notOK) + return WorkloadData( + jobs=jobs, + telemetry_start=telemetry_start_time, telemetry_end=telemetry_end_time, + start_date=telemetry_start_timestamp.tz_localize("UTC"), + ) - print("many jobs not OK !!!!!!!!!!!!!!! : ",count_jobs_notOK) - return jobs def xname_to_index(xname: str, config: dict): """ @@ -195,7 +229,8 @@ def xname_to_index(xname: str, config: dict): if row == 6: col -= 9 rack_index = row * 12 + col - node_index = chassis * config['BLADES_PER_CHASSIS'] * config['NODES_PER_BLADE'] + slot * config['NODES_PER_BLADE'] + node + node_index = chassis * config['BLADES_PER_CHASSIS'] * \ + config['NODES_PER_BLADE'] + slot * config['NODES_PER_BLADE'] + node return rack_index * config['SC_SHAPE'][2] + node_index @@ -236,6 +271,7 @@ CDU_NAMES = [ 'x2609c1', ] + def cdu_index_to_name(index: int, config: dict): return CDU_NAMES[index - 1] @@ -245,3 +281,11 @@ def cdu_pos(index: int, config: dict) -> tuple[int, int]: name = CDU_NAMES[index - 1] row, col = int(name[2]), int(name[3:5]) return (row, col) + + +def download(dest: Path, start: datetime | None, end: datetime | None): + dest.mkdir(parents=True) + filename = "AdastaJobsMI250_15days.parquet" + print(f"Downloading {filename}") + urllib.request.urlretrieve(f"https://zenodo.org/records/14007065/files/{filename}", dest / filename) + print("Done!") diff --git a/raps/dataloaders/bluewaters.py b/raps/dataloaders/bluewaters.py new file mode 100644 index 0000000000000000000000000000000000000000..7b1ee1f943611674a1e67b1791c3c0d3bc2f40aa --- /dev/null +++ b/raps/dataloaders/bluewaters.py @@ -0,0 +1,366 @@ +""" +Blue Waters dataloader + +Example test case: + + raps run -f /opt/data/bluewaters --start 20170328 --system bluewaters -net + +To download the necessary datasets: + + https://bluewaters.ncsa.illinois.edu/data-sets.html - this explains each of the datasets in detail + + There are two datasets available from: + + https://app.globus.org/file-manager?origin_id=854c1a5c-fa9f-4df4-a71c-407a33e44da0 + + 1. /torque_logs_anonimized (sic) - we are using the file 2017.tar.gz (377MB) + + 2. /node_metrics/cray_system_sampler - we are using the file 20170328.tgz (485MB) + + In order to speed up data loading, we have downsized these files to just + four columns using the following code: + + import csv + with open("20170328", "r") as infile, open("output.csv", "w", newline="") as outfile: + reader = csv.reader(infile, skipinitialspace=True) + writer = csv.writer(outfile) + for row in reader: + writer.writerow([row[0], row[1], row[15], row[16]]) + + Another dataset we plan to use (but not currently using) is: + + 3. 
Monet - Blue Waters Network Dataset (140GB) - https://databank.illinois.edu/datasets/IDB-2921318 + + We assume these datasets are setup as follows (assuming -f /opt/data/bluewaters): + + /opt/data/bluewaters/cray_system_sampler/20170328 + /opt/data/bluewaters/torque_logs/20170328 + /opt/data/bluewaters/monet/20170328 +""" + +import math +import re +import pandas as pd +from pathlib import Path +from datetime import datetime, timezone +from pprint import pprint +from raps.telemetry import Job, job_dict +from raps.utils import WorkloadData + + +def throughput_traces(total_tx, total_rx, intervals): + intervals = max(1, int(intervals or 1)) + tx = [(total_tx or 0) // intervals] * intervals + rx = [(total_rx or 0) // intervals] * intervals + # print(total_tx, total_rx, intervals, tx[:5], rx[:5]) + return tx, rx + + +def build_sampler_df(root, day, nodes, tmin, tmax, tx_idx, rx_idx, chunksize=None): + """One-time loader: returns a DataFrame of per-node positive deltas with mid-interval timestamps. + Columns: nid, mid_ts, dtx, drx (all numeric).""" + sdir = Path(root) / "cray_system_sampler" / day + files = [sdir] if sdir.is_file() else (sorted(f for f in sdir.iterdir() if f.is_file()) if sdir.exists() else []) + if not files: + raise FileNotFoundError(f"No Cray sampler files for day {day} under {sdir.parent}") + + cols = [0, 1, tx_idx, rx_idx] # ts, nid, tx, rx + out = [] + + def _process(df): + if df.empty: + return None + df = df[cols] + df.columns = ["ts", "nid", "tx", "rx"] + df = df[df["nid"].isin(nodes)] + if df.empty: + return None + # sort values (optional, for consistency) + df = df.sort_values(["nid", "ts"]) + # keep raw values + df = df[["nid", "ts", "tx", "rx"]].dropna() + return df + + for fp in files: + print(f"reading {fp}... this may take a while") + if chunksize: + for chunk in pd.read_csv(fp, header=None, skipinitialspace=True, chunksize=chunksize): + dfp = _process(chunk) + if dfp is not None: + out.append(dfp) + else: + df = pd.read_csv(fp, header=None, skipinitialspace=True) + dfp = _process(df) + if dfp is not None: + out.append(dfp) + + if not out: + # nothing matched nodes/time; return empty frame with expected columns + return pd.DataFrame(columns=["nid", "mid_ts", "dtx", "drx"]) + + return pd.concat(out, ignore_index=True) + + +def hms_to_seconds(wt: str) -> int: + try: + h, m, s = map(int, wt.split(":")) + return h * 3600 + m * 60 + s + except Exception: + return 0 + + +def extract_nodes_from_line(hosts_field: str): + """Extract node IDs from an exec_host field in one line.""" + nodes = [] + for token in hosts_field.split("+"): + if "/" in token: + node = token.split("/")[0] + try: + nodes.append(int(node)) + except ValueError: + pass + return nodes + + +# example line: +# 03/18/2017 00:01:15;E;6335144.bw;user=USER260243U group=GRP113775G +# account=A116610A jobname=dm5-8506-M9 queue=normal ctime=1489668573 +# qtime=1489668573 etime=1489798453 start=1489799118 +# owner=USER260243U@h2ologin2 +# exec_host=26742/0-31+26743/0-31+26728/0-31+26729/0-31 +# login_node=nid27563 Resource_List.flags=commtolerant:commlocal +# Resource_List.neednodes=4:ppn=32 Resource_List.nodect=4 +# Resource_List.nodes=4:ppn=32 Resource_List.partition=bwsched +# Resource_List.walltime=04:00:00 session=16472 total_execution_slots=128 +# unique_node_count=4 end=1489813275 Exit_status=2 resources_used.cput=28 +# resources_used.energy_used=0 resources_used.mem=18996kb +# resources_used.vmem=130088kb resources_used.walltime=03:55:49 + + +PATS = { + "id": 
re.compile(r"\b(jobid|job_id|Job_Id)[:=]\s*([^\s,]+)", re.I), + "name": re.compile(r"\b(jobname)[:=]\s*([^\s,]+)", re.I), + "account": re.compile(r"\b(account)[:=]\s*([^\s,]+)", re.I), + # Nodes: use Resource_List.nodect or unique_node_count + "nodes_required": re.compile(r"\b(?:Resource_List\.nodect|unique_node_count)[:=]\s*(\d+)", re.I), + # CPU cores per node: from ppn in Resource_List.nodes + "cpu_cores_required": re.compile(r"\bppn=(\d+)", re.I), + # GPUs per node + "gpu_units_required": re.compile(r"\bgpus?=(\d+)", re.I), + # Scheduled nodes list (exec_host=...) + "scheduled_nodes": re.compile(r"\bexec_host=([^\s,]+)", re.I), + # Times + "submit_time": re.compile(r"\bqtime=([0-9]+)", re.I), + "start_time": re.compile(r"\bstart=([0-9]+)", re.I), + "end_time": re.compile(r"\bend=([0-9]+)", re.I), + # Walltime used + "wall_time": re.compile(r"resources_used\.walltime=(\d{2}:\d{2}:\d{2})", re.I), +} + + +def _parse_line(line: str, debug=False): + rec = {} + for key, pat in PATS.items(): + m = pat.search(line) + if m: + if debug: + print(f"\n[{key}] matched pattern {pat.pattern}") + for i in range(0, (m.lastindex or 0) + 1): + print(f" group({i}): {m.group(i)}") + rec[key] = m.group(m.lastindex or 0) # take last group + # normalize scheduled_nodes into list + if "scheduled_nodes" in rec: + rec["scheduled_nodes"] = extract_nodes_from_line(rec["scheduled_nodes"]) + # wall_time + if rec.get("wall_time"): + rec["wall_time"] = hms_to_seconds(rec["wall_time"]) + + return rec + + +def load_data(local_dataset_path, **kwargs): + config = kwargs.get("config") + root = Path(local_dataset_path[0]) + # TODO: confirm bluewater dates are in UTC + start = datetime.fromisoformat(kwargs.get('start') or "2017-03-28T00:00:00Z") + start = start.astimezone(timezone.utc) + # TODO: support multiple day replay + day = start.strftime("%Y%m%d") + fp = root / "torque_logs" / day + filter_str = kwargs.get("filter") + debug = kwargs.get("debug") + + jobs_raw = [] + + # parse file + for line in fp.open("rt", errors="ignore"): + if "jobname" not in line.lower(): + continue + rec = _parse_line(line) + + # skip if missing times + if not (rec.get("start_time") and rec.get("end_time")): + continue + + # ints + st = int(rec["start_time"]) + et = int(rec["end_time"]) + sub = int(rec.get("submit_time", st)) + + duration = et - st if et >= st else 0 + nr = int(rec.get("nodes_required")) + int(rec.get("cpu_cores_required")) + + jid = rec.get("id") + trace_quanta = config.get("TRACE_QUANTA") + + job_d = job_dict( + nodes_required=nr, + name=rec.get("name"), + account=rec.get("account"), + # cpu_trace=[0]*nr*nc, # placeholder trace + # gpu_trace=[0]*nr*0, # Blue Waters has no GPUs + cpu_trace=0, + gpu_trace=0, + nrx_trace=[], + ntx_trace=[], + end_state="UNKNOWN", + scheduled_nodes=rec.get("scheduled_nodes"), + id=jid, + priority=0, + submit_time=sub, + time_limit=int(rec.get("wall_time")), + start_time=st, + end_time=et, + expected_run_time=duration, + current_run_time=0, + trace_time=sub, + trace_start_time=st, + trace_end_time=et, + trace_quanta=trace_quanta, + ) + jobs_raw.append(job_d) + + # jobs_raw = list of dicts with absolute epoch times (as ints), e.g.: + # {'id': '6335144.bw', 'name': '...', 'account': '...', 'scheduled_nodes': [26742, ...], + # 'nodes_required': 4, 'cpu_cores_required': 32, 'submit_time': 1489798453, + # 'start_time': 1489799118, 'end_time': 1489813275} + + # Gather global filters once + all_nodes = set() + abs_starts = [] + abs_ends = [] + + for r in jobs_raw: + if r.get("scheduled_nodes"): + 
all_nodes.update(r["scheduled_nodes"]) + abs_starts.append(int(r["start_time"])) + abs_ends.append(int(r["end_time"])) + if not all_nodes or not abs_starts: + return [], 0, 0 + + global_tmin = min(abs_starts) + global_tmax = max(abs_ends) + + # Confirm the correct 0-based indices for ipogif0_* from the HEADER + # tx_idx = 15 # for the original file + # rx_idx = 16 + tx_idx = 2 # for a downselected file with just four columns: [timestamp, node, tx, rx] - for faster loading + rx_idx = 3 + + # Build once (chunk if files are huge) + sampler_df = build_sampler_df(root, day, all_nodes, global_tmin, global_tmax, tx_idx, rx_idx, chunksize=None) + # Optional speed-ups: + # sampler_df.set_index(["nid"], inplace=True) # if you want .loc fast path per node + + # Option 1: take indices from kwargs (0-based). Option 2: keep your quick defaults. + + Path(local_dataset_path[0] if isinstance(local_dataset_path, (list, tuple)) else local_dataset_path) + + bin_s = config.get("TRACE_QUANTA") + jobs = [] + + for r in jobs_raw: # Is this intended? We go throught the 'raw' jobs_dicts that were creeated above? + st_abs = int(r["start_time"]) + et_abs = int(r["end_time"]) + nodes = r.get("scheduled_nodes") or [] + jid = r["id"] + + # Filter by nodes, sum positive deltas + dfj = sampler_df[sampler_df["nid"].isin(nodes)] + + # Print first 10 rows (node, tx, rx) + if debug: + print(dfj[["nid", "tx", "rx"]].head(10)) + + total_tx = int(dfj["tx"].sum()) if not dfj.empty else 0 + total_rx = int(dfj["rx"].sum()) if not dfj.empty else 0 + + nodes_required = r.get("nodes_required") + + avg_tx_per_node = total_tx / nodes_required if nodes_required > 0 else 0 + avg_rx_per_node = total_rx / nodes_required if nodes_required > 0 else 0 + + # Smear totals evenly across bins (simple first pass) + duration = max(1, et_abs - st_abs) + samples = max(1, math.ceil(duration / bin_s)) + ntx, nrx = throughput_traces(avg_tx_per_node, avg_rx_per_node, samples) + + job_d = job_dict( + nodes_required=nodes_required, + name=r.get("name"), + account=r.get("account", "unknown"), + cpu_trace=0, + gpu_trace=0, + nrx_trace=nrx, + ntx_trace=ntx, + end_state="UNKNOWN", + scheduled_nodes=nodes, + id=jid, + priority=0, + submit_time=int(r["submit_time"]), + time_limit=int(r["time_limit"]), + start_time=st_abs, + end_time=et_abs, + expected_run_time=et_abs - st_abs, + current_run_time=0, + trace_time=st_abs, + trace_start_time=st_abs, + trace_end_time=st_abs + samples * bin_s, + trace_quanta=bin_s, + trace_missing_values=False, + ) + + if filter_str: + traffic = (avg_tx_per_node + avg_rx_per_node) / 2. 
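+            # Hypothetical usage note: the --filter expression is eval()'d below with the
+            # loop's locals in scope, so e.g. --filter "traffic > 1e6" would keep only jobs
+            # whose per-node traffic average exceeds 1e6, and
+            # --filter 'job_d["nodes_required"] >= 64' would keep only large jobs.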
+ keep_jobs = eval(filter_str) + print(job_d["id"], filter_str, traffic, keep_jobs) + else: + keep_jobs = True + + if keep_jobs: + jobs.append(Job(job_d)) + + # Normalize times so first start = 0 + t0 = min((j.start_time for j in jobs), default=0) + for j in jobs: + j.submit_time -= t0 + j.start_time -= t0 + j.end_time -= t0 + j.trace_time -= t0 + j.trace_start_time -= t0 + j.trace_end_time -= t0 + + # pprint(jobs) + + if debug: + pprint(jobs) + + telemetry_start = 0 + telemetry_end = max((j.end_time for j in jobs), default=0) + + return WorkloadData( + jobs=jobs, + telemetry_start=telemetry_start, telemetry_end=telemetry_end, + start_date=datetime.fromtimestamp(t0, timezone.utc), + ) diff --git a/raps/dataloaders/frontier.py b/raps/dataloaders/frontier.py index 6edd05209f3d888f33eb4e2b19fd8c0342287f5b..391a84ed99848cd1921553c40eb12b103f484556 100644 --- a/raps/dataloaders/frontier.py +++ b/raps/dataloaders/frontier.py @@ -1,20 +1,25 @@ """ - Note: Frontier telemetry data is not publicly available. +Note: Frontier telemetry data is not publicly available. - # To simulate - DATEDIR="date=2024-01-18" - DPATH=/path/to/data - python main.py -f $DPATH/slurm/joblive/$DATEDIR $DPATH/jobprofile/$DATEDIR +# To simulate +DATEDIR="date=2024-01-18" +DPATH=/path/to/data +raps run -f $DPATH/slurm/joblive/$DATEDIR,$DPATH/jobprofile/$DATEDIR - # To analyze the data - python -m raps.telemetry -f $DPATH/slurm/joblive/$DATEDIR $DPATH/jobprofile/$DATEDIR +# To analyze the data +python -m raps.telemetry -f $DPATH/slurm/joblive/$DATEDIR,$DPATH/jobprofile/$DATEDIR """ +import time +from datetime import datetime, timezone, timedelta +from zoneinfo import ZoneInfo import numpy as np import pandas as pd +import subprocess from tqdm import tqdm +from pathlib import Path -from ..job import job_dict -from ..utils import power_to_utilization, next_arrival, encrypt +from ..job import job_dict, Job +from ..utils import power_to_utilization, encrypt, WorkloadData, date_range def aging_boost(nnodes): @@ -38,6 +43,8 @@ def load_data(files, **kwargs): list The list of parsed jobs. """ + if kwargs.get("live") is True: + return load_live_data() assert (len(files) == 2), "Frontier dataloader requires two files: joblive and jobprofile" jobs_path = files[0] @@ -57,18 +64,85 @@ def load_data_from_df(jobs_df: pd.DataFrame, jobprofile_df: pd.DataFrame, **kwar ------- list The list of parsed jobs. + + telemetry_start + the first timestep in which the simulation be executed. + + telemetry_end + the last timestep in which the simulation can be executed. + ---- + Explanation regarding times: + + The loaded dataframe contains + a first timestamp with associated data + and a last timestamp with associated data + + These form the maximum extent of the simuluation time. + telemetry_start and telemetry_end. + + [ ] + ^ ^ + telemetry_start telemetry_end + + These values form the maximum extent of the simulation. + Telemetry start == 0! This means that any time before that is negative, + while anything after this is positive. + Next is the actual extent of the simulation: + + [ ] + ^ ^ + simulation_start simulation_end + + The start of the simulation simulation_start and telemetry_start are only + the same when fastfoward is 0. + In general simulation_end and telemetry_end are the same, as this is the + last time step we can simulate. 
+ Both simulation_start and _end are set in engine.py + + Additionally, jobs can have started before telemetry_start, + And can have a recorded ending after simulation_end, + [ ] + ^ ^ + first_start_timestamp last_end_timestamp + + This means that the time between first_start_timestamp and telemetry_start + has no associated values in the traces! + The missing values after simulation_end can be ignored, as the simulatuion + will have stoped before. + + However, the times before telemetry_start have to be padded to generate + correct offsets within their data! + Within the simulation a job's current time is specified as the difference + between its start_time and the current timestep of the simulation. + + With this each job's + - submit_time + - time_limit + - start_time # Maybe Null + - end_time # Maybe Null + - expected_run_time (end_time - start_time) # Maybe Null + - current_run_time (How long did the job run already, when loading) # Maybe zero + - trace_time (length of each trace in seconds) # Maybe Null + - trace_start_time (time offset in seconds after which the trace starts) # Maybe Null + - trace_end_time (time offset in seconds after which the trace ends) # Maybe Null + - trace_quanta (job's associated trace quanta, to correctly replay with different trace quanta) # Maybe Null + has to be set for use within the simulation + + The values trace_start_time are similar to the telemetry_start and + telemetry_stop but may different due to missing data, for each job. + + The returned values are these three: + - The list of parsed jobs. (as a Job object) + - telemetry_start: int (in seconds) + - telemetry_end: int (in seconds) + + The implementation follows: """ config = kwargs.get('config') encrypt_bool = kwargs.get('encrypt') - fastforward = kwargs.get('fastforward') - reschedule = kwargs.get('reschedule') validate = kwargs.get('validate') jid = kwargs.get('jid', '*') - - if fastforward: - print(f"fast-forwarding {fastforward} seconds") - - min_time = kwargs.get('min_time', None) + debug = kwargs.get('debug') # Sort jobs dataframe based on values in time_start column, adjust indices after sorting jobs_df = jobs_df[jobs_df['time_start'].notna()] @@ -83,20 +157,33 @@ def load_data_from_df(jobs_df: pd.DataFrame, jobprofile_df: pd.DataFrame, **kwar jobprofile_df = jobprofile_df.sort_values(by='timestamp') jobprofile_df = jobprofile_df.reset_index(drop=True) - # Take earliest time as baseline reference - if min_time: - time_zero = min_time - else: - time_zero = jobs_df['time_snapshot'].min() + # telemetry_start_timestamp = jobs_df['time_snapshot'].min() # Earliets time snapshot within the day! + telemetry_start_timestamp = jobprofile_df['timestamp'].min() # Earliets time snapshot within the day! + # telemetry_end_timestamp = jobs_df['time_snapshot'].max() # This time has nothing to do with the jobs! + telemetry_end_timestamp = jobprofile_df['timestamp'].max() # Earliets time snapshot within the day! 
+ + # Time that can be simulated # Take earliest time as baseline reference + telemetry_start = 0 # second 0 of the simulation + diff = telemetry_end_timestamp - telemetry_start_timestamp + telemetry_end = int(diff.total_seconds()) + + first_start_timestamp = jobs_df['time_start'].min() + diff = first_start_timestamp - telemetry_start_timestamp + # first_start = int(diff.total_seconds()) # negative seconds or 0 # Unused num_jobs = len(jobs_df) - print("time_zero:", time_zero, "num_jobs", num_jobs) + if debug: + print("num_jobs:", num_jobs) + print("telemetry_start:", telemetry_start, "simulation_fin", telemetry_end) + print("telemetry_start_timestamp:", telemetry_start_timestamp, + "telemetry_end_timestamp", telemetry_end_timestamp) + print("first_start_timestamp:", first_start_timestamp, "last start timestamp:", jobs_df['time_start'].max()) jobs = [] # Map dataframe to job state. Add results to jobs list for jidx in tqdm(range(num_jobs - 1), total=num_jobs, desc="Processing Jobs"): - user = jobs_df.loc[jidx, 'user'] + # user = jobs_df.loc[jidx, 'user'] account = jobs_df.loc[jidx, 'account'] job_id = jobs_df.loc[jidx, 'job_id'] allocation_id = jobs_df.loc[jidx, 'allocation_id'] @@ -118,6 +205,7 @@ def load_data_from_df(jobs_df: pd.DataFrame, jobprofile_df: pd.DataFrame, **kwar cpu_power_array = cpu_power.values cpu_min_power = nodes_required * config['POWER_CPU_IDLE'] * config['CPUS_PER_NODE'] cpu_max_power = nodes_required * config['POWER_CPU_MAX'] * config['CPUS_PER_NODE'] + # Will be negative! as cpu_power_array[i] can be smaller than cpu_min_power cpu_util = power_to_utilization(cpu_power_array, cpu_min_power, cpu_max_power) cpu_trace = cpu_util * config['CPUS_PER_NODE'] @@ -134,46 +222,323 @@ def load_data_from_df(jobs_df: pd.DataFrame, jobprofile_df: pd.DataFrame, **kwar cpu_trace[np.isnan(cpu_trace)] = 0 gpu_trace[np.isnan(gpu_trace)] = 0 - wall_time = gpu_trace.size * config['TRACE_QUANTA'] # seconds - - time_start = jobs_df.loc[jidx+1, 'time_start'] - diff = time_start - time_zero - time_offset = max(diff.total_seconds(), 0) - - if fastforward: - time_offset -= fastforward + # Times: + submit_timestamp = jobs_df.loc[jidx, 'time_submission'] + diff = submit_timestamp - telemetry_start_timestamp + submit_time = diff.total_seconds() + + time_limit = jobs_df.loc[jidx, 'time_limit'] # timelimit in seconds + + start_timestamp = jobs_df.loc[jidx, 'time_start'] + diff = start_timestamp - telemetry_start_timestamp + start_time = diff.total_seconds() + + end_time_timestamp = jobs_df.loc[jidx, 'time_end'] + diff = end_time_timestamp - telemetry_start_timestamp + end_time = diff.total_seconds() + if not start_time <= end_time or np.isnan(end_time): + continue # Start_time is not smaller than end_time or is not valid + # Skip entry. 
+ + expected_run_time = end_time - start_time + current_run_time = 0 # Check if we the job may may be runninghave wall time of the jobs + + trace_quanta = config['TRACE_QUANTA'] + trace_time = gpu_trace.size * trace_quanta # seconds + + trace_start_time = 0 + trace_end_time = trace_time + if expected_run_time > trace_time: + missing_trace_time = int(expected_run_time - trace_time) + trace_missing_values = True + if start_time < 0: + trace_start_time = missing_trace_time + trace_end_time = expected_run_time + elif end_time > telemetry_end: + trace_start_time = 0 + trace_end_time = trace_time + else: + print(f"Job: {job_id} {end_state} {start_time} - {end_time}, " + f"Trace: {trace_start_time} - {trace_end_time}, " + f"Missing: {missing_trace_time}!") + else: + trace_missing_values = False xnames = jobs_df.loc[jidx, 'xnames'] # Don't replay any job with an empty set of xnames if '' in xnames: continue - if reschedule == 'poisson': # Let the scheduler reschedule the jobs - scheduled_nodes = None - time_offset = next_arrival(1/config['JOB_ARRIVAL_TIME']) - priority = aging_boost(nodes_required) - - elif reschedule == 'submit-time': - scheduled_nodes = None - time_submit = jobs_df.loc[jidx, 'time_submission'] - diff = time_submit - time_zero - time_offset = max(diff.total_seconds(), 0) - priority = aging_boost(nodes_required) - #raise NotImplementedError - - else: # Prescribed replay - scheduled_nodes = [] - priority = 0 # not used for replay - for xname in xnames: - indices = xname_to_index(xname, config) - scheduled_nodes.append(indices) - - if gpu_trace.size > 0 and (jid == job_id or jid == '*') and time_offset > 0: - job_info = job_dict(nodes_required, name, account, cpu_trace, gpu_trace, [], [], wall_time, - end_state, scheduled_nodes, time_offset, job_id, priority) - jobs.append(job_info) - - return jobs + scheduled_nodes = [] + # priority = 0 # not used for replay + priority = aging_boost(nodes_required) + for xname in xnames: + indices = xname_to_index(xname, config) + scheduled_nodes.append(indices) + + if end_time < telemetry_start: + print("Job ends before first recorded telemetry entry:", job_id, "start:", + start_time, "end:", end_time, " Telemetry: ", len(gpu_trace), "entries.") + continue # skip + + if start_time > telemetry_end: + print("Job starts after last recorded telemetry entry:", job_id, "start:", + start_time, "end:", end_time, " Telemetry: ", len(gpu_trace), "entries.") + continue # skip + + # Throw out jobs that are not valid! + if gpu_trace.size == 0: + print("ignoring job b/c zero trace:", jidx, submit_time, start_time, nodes_required) + continue # SKIP! 
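+        # Illustrative example of the trace-offset bookkeeping above (hypothetical numbers):
+        # a job with start_time = -1800 s (it began 30 min before telemetry_start),
+        # expected_run_time = 7200 s and only 5400 s of recorded trace gets
+        # missing_trace_time = 1800, so trace_start_time = 1800 and trace_end_time = 7200;
+        # the untraced first 30 min are padded so the trace lines up with the telemetry-covered tail.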
+ + if gpu_trace.size > 0 and (jid == job_id or jid == '*'): # and time_submit >= 0: + job_info = job_dict( + nodes_required=nodes_required, + name=name, + account=account, + cpu_trace=cpu_trace, + gpu_trace=gpu_trace, + nrx_trace=None, + ntx_trace=None, + end_state=end_state, + scheduled_nodes=scheduled_nodes, + id=job_id, + priority=priority, # partition missing + submit_time=submit_time, + time_limit=time_limit, + start_time=start_time, + end_time=end_time, + expected_run_time=expected_run_time, + current_run_time=current_run_time, + trace_time=trace_time, + trace_start_time=trace_start_time, trace_end_time=trace_end_time, + trace_quanta=trace_quanta, trace_missing_values=trace_missing_values) + + job = Job(job_info) + jobs.append(job) + return WorkloadData( + jobs=jobs, + telemetry_start=telemetry_start, + telemetry_end=telemetry_end, + start_date=telemetry_start_timestamp, + ) + + +def load_live_data(**kwargs): + """ Load Slurm Live data using pyslurm """ + jobs = list() + telemetry_start = int(time.time()) # This is now! get unix time + telemetry_start = 1755721300 + if hasattr(kwargs, 'time'): + time_to_sim = kwargs.get('time') # Should be specified . + assert isinstance(time_to_sim, int) + else: + time_to_sim = 14 * 24 * 60 * 60 # or we simulate 2 weeks. + telemetry_end = telemetry_start + time_to_sim + + total_partitions = 0 + partition_dict = dict() + + import pyslurm # noqa + # Local Tests + # filename = "something/something/pyslurm.dump" + # with open(filename, 'r') as f: + # s = f.read() + # data = ast.literal_eval(s) + # + data = pyslurm.job().get() + + for jidx, jdata in data.items(): + if jdata['job_state'] == "COMPLETED" \ + or jdata['job_state'] == "CANCELLED": + continue + if jdata['job_state'] == "TIMEOUT" \ + or jdata['job_state'] == "FAILED": + if jdata['requeue'] is False: + continue + + # if jidx == XXX: + # print(jdata) + # exit() + # Picking the useful ones from the 110 features: Leaving the rest for potential changes + account = jdata['account'] + # 'accrue_time': String = 'Unknown', + # 'admin_comment': String, + # 'alloc_node': String = 'login08', + # 'alloc_sid': int + # 'array_job_id': None, + # 'array_task_id': None, + # 'array_task_str': None, + # 'het_job_id': None, + # 'het_job_id_set': None, + # 'het_job_offset': None, + # 'array_max_tasks': None, + # 'assoc_id': int, + # 'batch_flag': int, + # 'batch_features': None, + # 'batch_host': None, + # 'billable_tres': float, + # 'bitflags': int, + # 'boards_per_node': int, + # 'burst_buffer': None, + # 'burst_buffer_state': None, + # 'command': String, + # 'comment': None, + # 'contiguous': bool, + # 'core_spec': int, + # 'cores_per_socket': int, + # 'cpus_per_task': int, + # 'cpus_per_tres': None, + # 'cpu_freq_gov': int, + # 'cpu_freq_max': int, + # 'cpu_freq_min': int, + # 'dependency': None, + # 'derived_ec': String, + # 'eligible_time': int, + # 'end_time': int, + # 'exc_nodes': [], + # 'exit_code': String, + # 'features': [], + # 'group_id': int, + job_id = jdata['job_id'] + current_state = jdata['job_state'] + end_state = None + # 'last_sched_eval': String # e.g. 
'2013-02-31T14:29:09', + # 'licenses': {}, + # 'max_cpus': int, + # 'max_nodes': int, + # 'mem_per_tres': None, + name = jdata['name'] + # 'network': None, + # 'nodes': None, + # 'nice': 0, + # 'ntasks_per_core': int, + # 'ntasks_per_core_str': String + # 'ntasks_per_node': int, + # 'ntasks_per_socket': int, + # 'ntasks_per_socket_str': String, + # 'ntasks_per_board': 0, + # 'num_cpus': int, + nodes_required: int = jdata['num_nodes'] + # 'num_tasks': 49152, + # 'partition': String, # e.g.'batch', + if jdata['partition'] in partition_dict: + pass + else: + partition_dict[jdata['partition']] = total_partitions + total_partitions += 1 + partition = partition_dict[jdata['partition']] + # 'mem_per_cpu': bool, + # 'min_memory_cpu': None, + # 'mem_per_node': bool, + # 'min_memory_node': int, + # 'pn_min_memory': int, + # 'pn_min_cpus': int, + # 'pn_min_tmp_disk': int, + priority = jdata['priority'] + # 'profile': int, + # 'qos': String # e.g. 'normal', + # 'reboot': int, + scheduled_nodes_str_list = jdata['req_nodes'] # Explicitly requested nodes # Missmatch between slurm and raps + scheduled_nodes = [] + for n in scheduled_nodes_str_list: + scheduled_nodes = int(n[8:]) + # Do we need to reintroduce a list of explicitly required nodes? This is currently handled by setting the + # scheduled_nodes before the scheduler modifies this list + # 'req_switch': int, + # 'requeue': bool, + # 'resize_time': int, + # 'restart_cnt': int, + # 'resv_name': None, + # 'run_time': int, # ?? + # 'run_time_str': String, + # 'sched_nodes': None, + # 'selinux_context': None, + # 'shared': String, + # 'sockets_per_board': int, + # 'sockets_per_node': int, + if current_state == "RUNNING": + start_time = jdata['start_time'] + end_time = None + current_run_time = jdata['run_time'] + else: + start_time = None + end_time = None + current_run_time = jdata['run_time'] # ?? + if jdata['job_state'] == "TIMEOUT": + if jdata['requeue'] is False: + current_run_time = 0 # ?? + elif jdata['job_state'] == "COMPLETING": + if jdata['requeue'] is False: + current_run_time = 0 # ?? + else: + assert current_run_time == 0, "Check if any other value occurs and should be handled! " \ + f"current_run_time:{current_run_time}" \ + f"\njdata:\n{jdata}" + expected_run_time = None + # 'state_reason': String # e.g. 'JobHeldUser', + # 'std_err': String, + # 'std_in': String, + # 'std_out': String, + submit_time = jdata['submit_time'] # int, Unix Time! + # 'suspend_time': int, + # 'system_comment': None, + # 'time_limit': e.g. 570, # in minutes! + time_limit = jdata['time_limit'] * 60 # needed in seconds + # 'time_limit_str': '0-09:30:00', + # 'time_min': int, + # 'threads_per_core': int, + # 'tres_alloc_str': None, + # 'tres_bind': None, + # 'tres_freq': None, + # 'tres_per_job': None, + # 'tres_per_node': None, + # 'tres_per_socket': None, + # 'tres_per_task': None, + # 'tres_req_str': String, + account = jdata['user_id'] # int for slurm, may be String in raps and conversion works. ... + # 'wait4switch': int, + # 'wckey': None, + # 'work_dir': String + # 'cpus_allocated': dict, + # 'cpus_alloc_layout': dict + cpu_trace = None # To be determined by a model! 
+ gpu_trace = None + trace_time = None + trace_start_time = None + trace_end_time = None + trace_quanta = None + trace_missing_values = None + job_info = job_dict( + nodes_required=nodes_required, + name=name, + account=account, + cpu_trace=cpu_trace, + gpu_trace=gpu_trace, + nrx_trace=None, + ntx_trace=None, + current_state=current_state, + end_state=end_state, + scheduled_nodes=scheduled_nodes, + id=job_id, + priority=priority, # partition missing + partition=partition, + submit_time=submit_time, time_limit=time_limit, + start_time=start_time, end_time=end_time, + expected_run_time=expected_run_time, + current_run_time=current_run_time, + trace_time=trace_time, + trace_start_time=trace_start_time, trace_end_time=trace_end_time, + trace_quanta=trace_quanta, trace_missing_values=trace_missing_values) + job = Job(job_info) + jobs.append(job) + + return WorkloadData( + jobs=jobs, + telemetry_start=telemetry_start, + telemetry_end=telemetry_end, + start_date=datetime.fromtimestamp(telemetry_start, timezone.utc), + ) def xname_to_index(xname: str, config: dict): @@ -195,7 +560,8 @@ def xname_to_index(xname: str, config: dict): if row == 6: col -= 9 rack_index = row * 12 + col - node_index = chassis * config['BLADES_PER_CHASSIS'] * config['NODES_PER_BLADE'] + slot * config['NODES_PER_BLADE'] + node + node_index = chassis * config['BLADES_PER_CHASSIS'] * \ + config['NODES_PER_BLADE'] + slot * config['NODES_PER_BLADE'] + node return rack_index * config['SC_SHAPE'][2] + node_index @@ -246,3 +612,42 @@ def cdu_pos(index: int, config: dict) -> tuple[int, int]: name = CDU_NAMES[index - 1] row, col = int(name[2]), int(name[3:5]) return (row, col) + + +def download(dest: Path, start: datetime | None, end: datetime | None): + HOST = "dtn.ccs.ornl.gov" + DATA_LAKE = "/lustre/orion/stf218/proj-shared/data/lake/frontier" + + print("Downloading the Frontier dataset requires access permissions.") + print("If you have access you can download via SSH.") + USERNAME = input("NCCS Username: ") + # jobs are indexed by submission time so download a few extra days to make sure we get all that + # ran over start -> end + if start: + start = (start - timedelta(days=2)).astimezone(ZoneInfo("UTC")) + else: + start = datetime.fromisoformat("2023-09-01T00:00:00Z") + if end: + end = (end + timedelta(days=2)).astimezone(ZoneInfo("UTC")) + else: + end = datetime.now(ZoneInfo("UTC")) + + days = list(date_range(start, end)) + + dest.mkdir(parents=True) + subprocess.run(["rsync", "-rvm", + *[f"--include=date={d.date().isoformat()}/***" for d in days], + "--exclude", '*', + f"{USERNAME}@{HOST}:{DATA_LAKE}/jobprofile/jobprofile/", + str(dest / "jobprofile") + ], check=True, text=True) + + (dest / 'slurm').mkdir(parents=True) + subprocess.run(["rsync", "-rvm", + *[f"--include=date={d.date().isoformat()}/***" for d in days], + "--exclude", '*', + f"{USERNAME}@{HOST}:{DATA_LAKE}/slurm/joblive/", + str(dest / "slurm/joblive") + ], check=True, text=True) + + print("Done!") diff --git a/raps/dataloaders/fugaku.py b/raps/dataloaders/fugaku.py index bc28ec325d6941f3751574536d101acd02da911a..1442ad21df4d5d95904e50e438c10d87be70b4c2 100644 --- a/raps/dataloaders/fugaku.py +++ b/raps/dataloaders/fugaku.py @@ -1,22 +1,28 @@ """ - Download parquet files from https://zenodo.org/records/11467483 +Uses the fugaku dataset published at https://zenodo.org/records/11467483 - Note that F-Data doesn't give a list of nodes used, so we set 'scheduled_nodes' to None - which triggers the scheduler to schedule the nodes itself. 
+Note that F-Data doesn't give a list of nodes used, so we set 'scheduled_nodes' to None +which triggers the scheduler to schedule the nodes itself. - Also, power in F-Data is only given at node-level. We can use node-level power by - adding the --validate option. +Also, power in F-Data is only given at node-level. We can use node-level power by +adding the --validate option. - The '--reschedule poisson' will compute submit times from Poisson distribution, instead of using - the submit times given in F-Data. - - python main.py --system fugaku -f /path/to/21_04.parquet --reschedule poisson --validate +The '--arrival poisson' will compute submit times from Poisson distribution, instead of using +the submit times given in F-Data. +raps run --system fugaku -f /path/to/21_04.parquet +raps run --system fugaku -f /path/to/21_04.parquet --validate +raps run --system fugaku -f /path/to/21_04.parquet --policy priority --backfill easy """ import pandas as pd from tqdm import tqdm -from ..job import job_dict -from ..utils import next_arrival +from datetime import datetime +from pathlib import Path +from zoneinfo import ZoneInfo +import urllib.request +import requests +from ..job import job_dict, Job +from ..utils import WorkloadData def load_data(path, **kwargs): @@ -47,53 +53,93 @@ def load_data_from_df(df, **kwargs): Returns: list: List of job dictionaries. + int: Telemetry Start (in seconds 0) + int: Telemetry End (in seconds) """ - encrypt_bool = kwargs.get('encrypt') - fastforward = kwargs.get('fastforward') - reschedule = kwargs.get('reschedule') + # encrypt_bool = kwargs.get('encrypt') # Unused + # arrival = kwargs.get('arrival') # Unused validate = kwargs.get('validate') - jid = kwargs.get('jid', '*') + # jid = kwargs.get('jid', '*') # Unused config = kwargs.get('config') - min_time = kwargs.get('min_time', None) - - if fastforward: print(f"fast-forwarding {fastforward} seconds") job_list = [] + # Convert all times to datetime and find the min and max thereof for reference use. 
# Convert 'adt' (submit time) to datetime and find the earliest submission time df['adt'] = pd.to_datetime(df['adt'], errors='coerce') - if not min_time: - min_time = df['adt'].min() + df['sdt'] = pd.to_datetime(df['sdt'], errors='coerce') + df['edt'] = pd.to_datetime(df['edt'], errors='coerce') + + # We only have average power therefore we set the earliest telemetry to the earliest start time + first_start_timestamp = df['sdt'].min() + last_end_timestamp = df['edt'].max() + telemetry_start_timestamp = first_start_timestamp + telemetry_start = 0 + telemetry_end_timestamp = last_end_timestamp + diff = telemetry_end_timestamp - telemetry_start_timestamp + telemetry_end = int(diff.total_seconds()) # Loop through the DataFrame rows to extract job information - for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing Jobs"): + for i, row in tqdm(df.iterrows(), total=len(df), desc="Processing Jobs"): nodes_required = row['nnumr'] if 'nnumr' in df.columns else 0 - account = row['usr'] name = row['jnam'] if 'jnam' in df.columns else 'unknown' + account = row['usr'] if validate: cpu_trace = row['avgpcon'] gpu_trace = cpu_trace else: - cpu_trace = row['perf1'] if 'perf1' in df.columns else 0 # Assuming some performance metric as cpu_trace + # cpu_trace = row['perf1'] if 'perf1' in df.columns else 0 # Assuming some performance metric as cpu_trace + # Total Opts / Total Ops + Idle Ops + cpu_trace = row['perf1'] / (row['perf1'] + row['perf6']) if 'perf1' in df.columns else 0 gpu_trace = 0 # Set to 0 as GPU trace is not explicitly provided - wall_time = row['duration'] if 'duration' in df.columns else 0 + # No network trace + end_state = row['exit state'] if 'exit state' in df.columns else 'unknown' - #scheduled_nodes = row['nnuma'] if 'nnuma' in df.columns else 0 - scheduled_nodes = None - submit_time = row['adt'] if 'adt' in df.columns else min_time - if reschedule == 'poisson': # Let the scheduler reschedule the jobs - time_offset = next_arrival(1/config['JOB_ARRIVAL_TIME']) - elif reschedule == 'submit-time': - raise NotImplementedError - else: - time_offset = (submit_time - min_time).total_seconds() # Compute time offset in seconds + scheduled_nodes = None # Only nodes_required is in the trace job_id = row['jid'] if 'jid' in df.columns else 'unknown' + priority = row['pri'] if 'pri' in df.columns else 0 + submit_timestamp = pd.to_datetime(row['adt']) if 'adt' in df.columns else - \ + 1 # Else job was submitted in the past + diff = submit_timestamp - telemetry_start_timestamp + submit_time = int(diff.total_seconds()) + + time_limit = int(row['elpl']) if 'elpl' in df.columns else 24 * 60 * 60 # in seconds + + start_timestamp = pd.to_datetime(row['sdt']) if 'sdt' in df.columns else 0 + diff = start_timestamp - telemetry_start_timestamp + start_time = int(diff.total_seconds()) + + end_timestamp = pd.to_datetime(row['edt']) if 'edt' in df.columns else 0 + diff = end_timestamp - telemetry_start_timestamp + end_time = int(diff.total_seconds()) + + wall_time = end_time - start_time + if end_time < start_time: + print(f"Job: {i}, skiped end_time < start_time ({end_time} < {start_time})") + if kwargs.get('debug'): + print(row) + continue + + # duration = int(row['duration']) if 'duration' in df.columns else 0 + # if (wall_time != duration): + # if abs(wall_time - duration) <= 1: # offset is often 1 + # wall_time = min(wall_time,duration) + # else: + # raise ValueError(f"Duration: {row}") # Offset can be as large as 15 minutes! Removed. 
+ + # We only have a single average value, set trace times as if we had all. + trace_time = wall_time + trace_start_time = start_time + trace_end_time = end_time + trace_missing_values = False # Sane Choice? + trace_quanta = config['TRACE_QUANTA'] + # Create job dictionary job_info = job_dict( nodes_required=nodes_required, @@ -103,17 +149,28 @@ def load_data_from_df(df, **kwargs): gpu_trace=gpu_trace, ntx_trace=[], nrx_trace=[], - wall_time=wall_time, + trace_quanta=trace_quanta, end_state=end_state, scheduled_nodes=scheduled_nodes, - time_offset=time_offset, - job_id=job_id, - priority=priority - ) - - job_list.append(job_info) - - return job_list + id=job_id, + priority=priority, + submit_time=submit_time, + time_limit=time_limit, + start_time=start_time, + end_time=end_time, + expected_run_time=wall_time, + trace_time=trace_time, + trace_start_time=trace_start_time, + trace_end_time=trace_end_time, + trace_missing_values=trace_missing_values) + job = Job(job_info) + job_list.append(job) + + return WorkloadData( + jobs=job_list, + telemetry_start=telemetry_start, telemetry_end=telemetry_end, + start_date=telemetry_start_timestamp, + ) def node_index_to_name(index: int, config: dict): @@ -127,4 +184,28 @@ def cdu_index_to_name(index: int, config: dict): def cdu_pos(index: int, config: dict) -> tuple[int, int]: """ Return (row, col) tuple for a cdu index """ - return (0, index) # TODO + return (0, index) # TODO + + +def download(dest: Path, start: datetime | None, end: datetime | None): + tz = ZoneInfo("Asia/Tokyo") + + files = requests.get("https://zenodo.org/api/records/11467483").json()["files"] + files = [f for f in files if f['key'].endswith(".parquet")] + files = sorted(files, key=lambda f: f['key']) + + # TODO: I think fugaku data is indexed by submission time not start time, so filtering by + # filename will probably miss some jobs that ran over start -> end + if start: + start_file = start.astimezone(tz).strftime("%y_%m.parquet") + files = [f for f in files if f['key'] >= start_file] + if end: + end_file = end.astimezone(tz).strftime("%y_%m.parquet") + files = [f for f in files if f['key'] <= end_file] + + dest.mkdir(parents=True) + for file in files: + print(f"Downloading {file['key']}") + urllib.request.urlretrieve(file['links']['self'], dest / file['key']) + + print("Done!") diff --git a/raps/dataloaders/gcloudv2.md b/raps/dataloaders/gcloudv2.md new file mode 100644 index 0000000000000000000000000000000000000000..dccbbc1b24500dc4e852cde77eb800d456715847 --- /dev/null +++ b/raps/dataloaders/gcloudv2.md @@ -0,0 +1,126 @@ +# **Google Cluster Trace V2 (2011) Dataset Overview** + +Some of this info may be incorrect. Look here for the official documentation: + +https://drive.google.com/file/d/0B5g07T_gRDg9Z0lsSTEtTWtpOW8/view?resourcekey=0-cozD56gA4fUDdrkHnLJSrQ + +This document provides a summary of the Google Cluster Trace V2 dataset, released in 2011\. This dataset offers insights into the operation of a large-scale production data center and its workload. It's crucial for research in areas like cluster scheduling, resource management, and workload characterization. + +## **1\. Dataset Overview** + +* **Scale:** The 2011 traces cover a **single production Borg cell (cluster)**. +* **Machines:** This cluster consisted of approximately **12,500 machines**. +* **Time Period:** The dataset spans **29 days** of workload data, collected during **May 2011**. +* **Total Size:** The total compressed size of the full dataset is around **41 GB**. 
+* **Format:** All files are provided in **gzipped CSV (.csv.gz)** format. +* **Anonymization:** The data is heavily anonymized to protect proprietary information. This means specific hardware details (like CPU models or exact core counts) are not provided, and resource values are normalized. User and job identifiers are opaque hashes. + +## **2\. Data Sources and File Contents** + +The V2 dataset is organized into subdirectories based on event types. Each subdirectory contains multiple gzipped CSV files (part-NNNNN-of-MMMMM.csv.gz). Each of these CSV files **does NOT have a header row**; the first row contains data. + +Here's a detailed look at the contents of the core files you've sampled: + +### **2.1. job\_events/part-NNNNN-of-MMMMM.csv.gz** + +This file contains records for job events. Each row represents an event in the lifecycle of a job. + +**Sample head output:** + +0,,3418309,0,70s3v5qRyCO/1PCdI6fVXnrW8FU/w+5CKRSa72xgcIo=,3,IHgtoxEBuUTHNbUeVs4hzptMY4n8rZKLbZg+Jh5fNG4=,wAmgn2H74cdoMuSFwJF3NaUEaudVBTZ0/HaNZBwIpEQ= +0,,3418314,0,70s3v5qRyCO/1PCdI6fVXnrW8FU/w+5CKRSa72xgcIo=,3,L52XDyhi9x9ChmVBZ1qavOFmnzPeVsvQ2QyGmBZcV4s=,ShNjeaoUeqGV2i9WMKEX9HTeuc9K2Fdfovibt7Mp6qI= +0,,3418319,0,70s3v5qRyCO/1PCdI6fVXnrW8FU/w+5CKRSa72xgcIo=,3,vq0IN3BWEbkDjYgYvkrVyH6OWoUoDwFFf3j/syEZzLA=,1A2GM17AzHRcKJcJet/oIF7FOORyFcAOcUSpR9Fqou8= + +**Schema Description:** + +| Column Index | Field Name | Description | Data Type (in CSV) | Notes | +| :---- | :---- | :---- | :---- | :---- | +| **0** | time | Time of event (microseconds) | Integer | | +| **1** | *(missing value)* | Often an empty string. | String | | +| **2** | job\_ID | Unique ID of the job | Integer | | +| **3** | event\_type | Type of event: 0=submit, 1=schedule, 2=evict, 3=fail, 4=finish, 5=kill, 6=lost, 7=update, 8=noop. | Integer | Sample shows 0 (submit). | +| **4** | user\_ID | Opaque ID of the user submitting the job | String | Hashed value. | +| **5** | scheduling\_class | 0=non-production, 1=production, 2=free. Values outside this range (like 3 in sample) might indicate an unlisted class or a specific trace artifact. | Integer | | +| **6** | job\_name | Opaque ID of the job's name | String | Hashed value. | +| **7** | logical\_job\_name | Opaque ID of the logical job name (for grouping related jobs) | String | Hashed value. | +| **8** | number\_of\_tasks | Number of tasks in the job (typically present only on submit events). | Integer | Can be empty if not applicable or derived for specific event types. | +| **9** | CPU\_request | (Normalized) CPU cores requested per task. | Float | | +| **10** | memory\_request | (Normalized) memory (RAM) requested per task. | Float | | + +### **2.2. machine\_events/part-NNNNN-of-MMMMM.csv.gz** + +This file describes events related to machines in the cluster. + +**Sample head output:** + +0,5,0,HofLGzk1Or/8Ildj2+Lqv0UGGvY82NLoni8+J/Yy0RU=,0.5,0.2493 +0,6,0,HofLGzk1Or/8Ildj2+Lqv0UGGvY82NLoni8+J/Yy0RU=,0.5,0.2493 +0,7,0,HofLGzk1Or/8Ildj2+Lqv0UGGvY82NLoni8+J/Yy0RU=,0.5,0.2493 + +**Schema Description:** + +| Column Index | Field Name | Description | Data Type (in CSV) | Notes | +| :---- | :---- | :---- | :---- | :---- | +| **0** | time | Time of event (microseconds) | Integer | | +| **1** | machine\_ID | Unique ID of the machine | Integer | IDs are simple integers, but map to opaque IDs in task\_events / task\_usage. | +| **2** | event\_type | Type of event: 0=add, 1=remove, 2=update | Integer | Sample shows 0 (add). 
| +| **3** | platform\_ID | Opaque string representing the machine's microarchitecture and chipset version | String | Hashed value. Provides insight into hardware heterogeneity without specifics. | +| **4** | CPU\_capacity | (Normalized) Total CPU cores on the machine (e.g., 0.5, 1.0). | Float | Normalized value relative to the largest CPU capacity in the trace (1.0). | +| **5** | memory\_capacity | (Normalized) Total memory (RAM) on the machine. | Float | Normalized value. | + +### **2.3. task\_events/part-NNNNN-of-MMMMM.csv.gz** + +This file details events related to individual tasks, which are components of jobs. + +**Sample head output:** + +0,2,3418309,0,4155527081,0,70s3v5qRyCO/1PCdI6fVXnrW8FU/w+5CKRSa72xgcIo=,3,9,,,, +0,2,3418309,1,329150663,0,70s3v5qRyCO/1PCdI6fVXnrW8FU/w+5CKRSa72xgcIo=,3,9,,,, +0,,3418314,0,3938719206,0,70s3v5qRyCO/1PCdI6fVXnrW8FU/w+5CKRSa72xgcIo=,3,9,0.125,0.07446,0.0004244,0 + +**Schema Description:** + +| Column Index | Field Name | Description | Data Type (in CSV) | Notes | +| :---- | :---- | :---- | :---- | :---- | +| **0** | time | Time of event (microseconds) | Integer | | +| **1** | *(missing value)* | Often an empty string. | String | | +| **2** | job\_ID | Unique ID of the job this task belongs to | Integer | | +| **3** | task\_index | The index of the task within the job (0-indexed). Uniquely identifies a task when combined with job\_ID. | Integer | | +| **4** | machine\_ID | ID of the machine where the event occurred (if applicable). This is typically populated when tasks are scheduled or run. Missing implies task not yet assigned to machine. | Integer | This is the opaque machine ID (hashed), distinct from the simple integer machine\_ID in machine\_events but maps to them. | +| **5** | event\_type | Type of task event: 0=submit, 1=schedule, 2=evict, 3=fail, 4=finish, 5=kill, 6=lost, 7=update, 8=noop, 9=assign. | Integer | Sample shows 0 (submit), 2 (evict). | +| **6** | user\_ID | Opaque ID of the user submitting the job. | String | Hashed value. | +| **7** | scheduling\_class | 0=non-production, 1=production, 2=free. Values like 3, 9 might be other classes. | Integer | | +| **8** | priority | Integer priority from 0 (lowest) to 11 (highest). | Integer | | +| **9** | CPU\_request | (Normalized) CPU cores requested by this task. | Float | Empty if not applicable for event type. | +| **10** | memory\_request | (Normalized) memory requested by this task. | Float | Empty if not applicable for event type. | +| **11** | disk\_space\_request | (Normalized) disk space requested by this task. | Float | Empty if not applicable for event type. | +| **12** | constraints | (Binary) 0=no constraints, 1=has constraints. | Integer | Empty if not applicable. | + +### **2.4. task\_usage/part-NNNNN-of-MMMMM.csv.gz** + +This is typically the largest file, containing periodic snapshots of resource usage for running tasks. 
+ +**Sample head output:** + +600000000,900000000,3418309,0,4155527081,0.001562,0.06787,0.07568,0.001156,0.001503,0.06787,2.861e-06,0.0001869,0.03967,0.0003567,2.445,0.007243,0,1,0 +600000000,900000000,3418309,1,329150663,0.001568,0.06787,0.07556,0.0003195,0.0007,0.06787,5.722e-06,0.0001879,0.03302,0.0009289,2.1,0.005791,0,1,0 + +**Schema Description:** + +| Column Index | Field Name | Description | Data Type (in CSV) | Notes | +| :---- | :---- | :---- | :---- | :---- | +| **0** | start\_time | Start time of the data sample (microseconds) | Integer | | +| **1** | end\_time | End time of the data sample (microseconds) | Integer | Typically start\_time \+ 300,000,000 (300 seconds or 5 minutes). | +| **2** | job\_ID | Unique ID of the job | Integer | | +| **3** | task\_index | Index of the task within the job | Integer | | +| **4** | machine\_ID | ID of the machine where this task ran during the sample period | Integer | Opaque machine ID (hashed). | +| **5** | CPU\_usage\_rate | Normalized average CPU usage rate (cores per second) during the sample. | Float | | +| **6** | memory\_usage\_avg | Normalized average memory usage. | Float | | +| **7** | memory\_usage\_max | Normalized maximum memory usage. | Float | | +| **8** | disk\_I/O\_time\_avg | Normalized average disk I/O time. | Float | | +| **9** | disk\_I/O\_time\_max | Normalized maximum disk I/O time. | Float | | +| **10** | CPUs\_allocated | Normalized CPU cores allocated to the task during this sample. | Float | | +| **11** | memory\_allocated | Normalized amount of memory allocated. | Float | | +| **12** | sample\_duration | Duration of the sample period (microseconds). | Float | Usually around 300,000,000 (300 seconds). | +| **13-19** | *(unnamed/unknown)* | Additional columns not explicitly documented. | Mixed | These are usually other system metrics or internal flags. You can name them generically if needed. 
| + diff --git a/raps/dataloaders/gcloudv2.py b/raps/dataloaders/gcloudv2.py new file mode 100644 index 0000000000000000000000000000000000000000..6f05a878b74df09d9cdd0d63b1b93494c6a4f067 --- /dev/null +++ b/raps/dataloaders/gcloudv2.py @@ -0,0 +1,342 @@ +import os +import re +from datetime import datetime +from tqdm import tqdm +from typing import List, Optional, Generator, Any, Union + +import numpy as np +import pandas as pd + +from raps.job import job_dict, Job +from raps.utils import WorkloadData + +""" +Official instructions are here: + +https://drive.google.com/file/d/0B5g07T_gRDg9Z0lsSTEtTWtpOW8/view?resourcekey=0-cozD56gA4fUDdrkHnLJSrQ + + +--- +Downloading Google Cluster Traces v2: + + curl -O https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-cli-471.0.0-darwin-arm.tar.gz + tar -xf google-cloud-cli-471.0.0-darwin-arm.tar.gz + gcloud components update + + gcloud auth login + + gsutil ls gs://clusterdata_2019_a/ + + * collection_events + * instance_events + * instance_usage + * machine_attributes + * machine_events + + gsutil -m cp -r gs://clusterdata_2019_a/instance_usage-*.parquet.gz ./google_cluster_data/cell_a/instance_usage + + # Create a directory to store your sample data + mkdir -p ./google_cluster_data_sample + + # Download the first JSON and Parquet file for collection_events + gsutil cp gs://clusterdata_2019_a/collection_events-000000000000.json.gz ./google_cluster_data_sample/ + gsutil cp gs://clusterdata_2019_a/collection_events-000000000000.parquet.gz ./google_cluster_data_sample/ + + # Download the first JSON and Parquet file for instance_events + gsutil cp gs://clusterdata_2019_a/instance_events-000000000000.json.gz ./google_cluster_data_sample/ + gsutil cp gs://clusterdata_2019_a/instance_events-000000000000.parquet.gz ./google_cluster_data_sample/ + + # Download the first JSON and Parquet file for instance_usage + gsutil cp gs://clusterdata_2019_a/instance_usage-000000000000.json.gz ./google_cluster_data_sample/ + gsutil cp gs://clusterdata_2019_a/instance_usage-000000000000.parquet.gz ./google_cluster_data_sample/ + + # ... and so on for other event types (machine_attributes, machine_events) + gsutil cp gs://clusterdata_2019_a/machine_attributes-000000000000.json.gz ./google_cluster_data_sample/ + gsutil cp gs://clusterdata_2019_a/machine_attributes-000000000000.parquet.gz ./google_cluster_data_sample/ + + gsutil cp gs://clusterdata_2019_a/machine_events-000000000000.json.gz ./google_cluster_data_sample/ + gsutil cp gs://clusterdata_2019_a/machine_events-000000000000.parquet.gz ./google_cluster_data_sample/ + +--- +Following explanation from Gemini-CLI on how the job nodes required is being computed. Method must be verified + + 1. Machine Capacity Determination: + * The machine_events data is loaded to get information about the cluster's machines. + * The CPU_capacity and memory_capacity of a typical machine are determined by taking + the mode() (most frequent value) of these columns from the machine_df. This gives + us the standard CPU and memory capacity of a single node in the cluster. + + 2. Task Resource Request Aggregation: + * The task_events data is loaded, which contains CPU_request and memory_request for + individual tasks. + * These task requests are then grouped by job_ID, and the CPU_request and memory_request + are summed up for all tasks belonging to the same job. This gives us the total CPU and + memory requested by each job. + + 3. 
Nodes Required Calculation (CPU and Memory): + * For each job, the total CPU_request is divided by the cpu_capacity of a single machine. + The np.ceil() function is used to round up to the nearest whole number, ensuring that + enough nodes are allocated to satisfy the CPU demand. This result is stored as + nodes_required_cpu. + * Similarly, the total memory_request is divided by the mem_capacity of a single machine, + and np.ceil() is applied. This result is stored as nodes_required_mem. + + 4. Final `nodes_required`: + * The final nodes_required for a job is determined by taking the np.maximum() of nodes_required_cpu + and nodes_required_mem. This ensures that the job is allocated enough nodes to satisfy both its CPU + and memory requirements. The result is then cast to an integer (.astype(int)). + + 5. Filtering: + * Finally, any jobs for which the calculated nodes_required is 0 (meaning they requested no CPU or memory) + are filtered out, as these jobs would not require any nodes in the simulation. +""" + +# Define expected column names for each supported event type +V2_COLUMN_NAMES = { + "job_events": [ + "timestamp", # ↔ time + "missing_info", # ↔ missing_col_1 + "job_ID", + "event_type", + "user_name", + "scheduling_class", + "job_name", + "logical_job_name" + ], + "machine_events": [ + "timestamp", + "machine_ID", + "event_type", + "platform_ID", + "CPU_capacity", + "memory_capacity" + ], + "task_events": [ + "timestamp", + "missing_info", + "job_ID", + "task_index", + "machine_ID", + "event_type", + "user_name", + "scheduling_class", + "priority", + "CPU_request", + "memory_request", + "disk_space_request", + "different_machine_constraint" + ], + "task_usage": [ + "start_time", # file-col 0 + "end_time", # file-col 1 + "job_ID", # file-col 2 + "task_index", # file-col 3 + "machine_ID", # file-col 4 + "CPU_usage_rate", # file-col 5 + "memory_usage_avg", # file-col 6 + "memory_usage_max", # file-col 7 + "assigned_memory", # file-col 8 + "unmapped_page_cache_memory", # file-col 9 + "page_cache_memory", # file-col 10 + "maximum_memory_usage", # file-col 11 + "disk_IO_time_avg", # file-col 12 + "disk_IO_time_max", # file-col 13 + "local_disk_space_used", # file-col 14 + "cycles_per_instruction", # file-col 15 + "memory_accesses_per_instruction", # file-col 16 + "sampling_rate", # file-col 17 + "aggregation_type", # file-col 18 + "missing_col_19" # file-col 19 + ] +} +SUPPORTED_EVENT_TYPES = list(V2_COLUMN_NAMES.keys()) + + +class GoogleClusterV2DataLoader: + """ + Loader for Google Cluster V2 CSV.GZ files. 
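+
+    A minimal usage sketch (the base path is illustrative; it must contain
+    <event_type>/part-NNNNN-of-MMMMM.csv.gz files as described in gcloudv2.md):
+
+        loader = GoogleClusterV2DataLoader("/path/to/google_cluster_data_2011_sample",
+                                           event_type="machine_events", concatenate=True)
+        machine_df = next(iter(loader))  # single concatenated DataFrame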
+ """ + + def __init__(self, base_path: str, event_type: str = "job_events", + file_indices: Optional[List[int]] = None, concatenate: bool = True): + self.base_path = os.path.expanduser(base_path) + if event_type not in SUPPORTED_EVENT_TYPES: + raise ValueError(f"Unsupported event type: '{event_type}'") + self.event_type = event_type + self.file_indices = file_indices + self.concatenate = concatenate + self.file_paths = self._find_files() + + def _find_files(self) -> List[str]: + dir_path = os.path.join(self.base_path, self.event_type) + if not os.path.isdir(dir_path): + raise FileNotFoundError(f"Directory not found: {dir_path}") + files = os.listdir(dir_path) + matches = [] + if self.file_indices: + for idx in self.file_indices: + pattern = re.compile(rf"part-{idx:05d}-of-\d{{5}}\.csv\.gz$") + found = [f for f in files if pattern.match(f)] + if not found: + raise FileNotFoundError(f"File index {idx} missing in {dir_path}") + matches.extend(found) + else: + matches = [f for f in files if f.startswith("part-") and f.endswith(".csv.gz")] + if not matches: + raise FileNotFoundError(f"No files in {dir_path}") + return [os.path.join(dir_path, f) for f in sorted(matches)] + + def __iter__(self) -> Generator[pd.DataFrame, None, None]: + dfs = [] + names = V2_COLUMN_NAMES[self.event_type] + ts_col = names[0] + for path in self.file_paths: + df = pd.read_csv(path, compression='gzip', header=None, + names=names, dtype={ts_col: int}) + if not self.concatenate: + yield df + else: + dfs.append(df) + if self.concatenate and dfs: + yield pd.concat(dfs, ignore_index=True) + + +def load_data(data_path: Union[str, List[str]], **kwargs: Any): + config = kwargs.get('config') + # Unpack list + if isinstance(data_path, list): + if len(data_path) == 1: + data_path = data_path[0] + else: + raise ValueError(f"Expected single path, got {data_path}") + base_path = os.path.expanduser(data_path) + + # Load machine events to determine typical machine capacities + machine_loader = GoogleClusterV2DataLoader(base_path, event_type="machine_events", concatenate=True) + machine_df = next(iter(machine_loader)) + # Get machine capacity (using the mode for robustness) + # This represents the normalized CPU and memory capacity of a single node. 
+ cpu_capacity = machine_df['CPU_capacity'].mode()[0] + mem_capacity = machine_df['memory_capacity'].mode()[0] + + # Load task events to get individual task resource requests + task_loader = GoogleClusterV2DataLoader(base_path, event_type="task_events", concatenate=True) + task_df = next(iter(task_loader)) + # Filter to only submitted tasks (event_type=0) + task_df = task_df[task_df['event_type'] == 0] + + # Calculate total resource requests per job by summing up all task requests for each job + job_resources = task_df.groupby('job_ID').agg({ + 'CPU_request': 'sum', + 'memory_request': 'sum' + }).reset_index() + + # Calculate nodes required for each job based on CPU and memory requests + # Using ceiling division to ensure enough nodes are allocated to meet the demand + job_resources['nodes_required_cpu'] = np.ceil(job_resources['CPU_request'] / cpu_capacity) + job_resources['nodes_required_mem'] = np.ceil(job_resources['memory_request'] / mem_capacity) + # The final nodes_required is the maximum of CPU-driven and memory-driven node requirements + job_resources['nodes_required'] = np.maximum( + job_resources['nodes_required_cpu'], job_resources['nodes_required_mem']).astype(int) + + # Create a dictionary for quick lookup of nodes_required by job_ID + nodes_required_map = job_resources.set_index('job_ID')['nodes_required'].to_dict() + + # Filter out jobs with 0 nodes required (i.e., no resource requests) + num_jobs_before_filter = len(job_resources) + job_resources = job_resources[job_resources['nodes_required'] > 0] + num_jobs_after_filter = len(job_resources) + print(f"Filtered out {num_jobs_before_filter - num_jobs_after_filter} jobs with 0 resource requests.") + + print("Job resource requirements (after filtering):") + print(job_resources.head()) + + # Load submit events + loader = GoogleClusterV2DataLoader(base_path, event_type="job_events", concatenate=True) + df = next(iter(loader)) + for col in ("timestamp", "job_ID", "event_type"): + if col not in df.columns: + raise ValueError(f"Missing column {col}") + df = df[df["event_type"] == 0] + df["timestamp"] = df["timestamp"].astype(float) / 1e6 # convert from microseconds → seconds + t0 = df["timestamp"].min() + # t1 = df["timestamp"] - t0 # Unused + + # Get trace quanta + trace_quanta = config['TRACE_QUANTA'] + + # Load task usage + usage_loader = GoogleClusterV2DataLoader(base_path, event_type="task_usage", concatenate=True) + usage_df = next(iter(usage_loader)) + + # Convert microseconds → seconds for task usage + usage_df["start_time"] = usage_df["start_time"].astype(float) / 1e6 + usage_df["end_time"] = usage_df["end_time"].astype(float) / 1e6 + + # Build per-job start and end times (seconds since trace-start) + usage_map_start = usage_df.groupby("job_ID")["start_time"].min().to_dict() + usage_map_end = usage_df.groupby("job_ID")["end_time"].max().to_dict() + + # rename to avg + if "CPU_usage_rate" in usage_df.columns: + usage_df.rename(columns={"CPU_usage_rate": "CPU_usage_avg"}, inplace=True) + usage_df["job_ID"] = usage_df["job_ID"].astype(int) + usage_df["CPU_usage_avg"] = usage_df["CPU_usage_avg"].astype(float) + usage_map = usage_df.groupby("job_ID")["CPU_usage_avg"].apply(lambda s: s.to_numpy()).to_dict() + + # print(usage_map) + + # Filter to jobs with usage data AND valid resource requests + df = df[df["job_ID"].isin(usage_map) & df["job_ID"].isin(job_resources['job_ID'])] + + jobs: List[Any] = [] + jid_f = kwargs.get('jid', '*') + for _, row in tqdm(df.iterrows(), total=df.shape[0], desc="Loading jobs"): + + jid = 
int(row["job_ID"]) + start = usage_map_start[jid] - t0 + end = usage_map_end[jid] - t0 + wall = end - start + + # nodes_required = int(nodes_required_map.get(jid, 1)) # Default to 1 if not found + # nodes_required = int(nodes_required_map.get(jid)) # Unused + + if jid_f != '*' and str(jid) != str(jid_f): + continue + trace = usage_map[jid] + # ensure gpu_trace is same length as cpu_trace + gpu_trace = np.zeros_like(trace, dtype=float) + + # nodes_required should be a positive int + nr = int(nodes_required_map.get(jid, 1)) + if nr < 1: + nr = 1 + + job_d = job_dict( + nodes_required=nr, + name=f"job_{jid}", + account=f"user_{row.get('user_name', 'unknown')}", + cpu_trace=trace, + gpu_trace=gpu_trace, + nrx_trace=[], ntx_trace=[], + end_state="UNKNOWN", scheduled_nodes=[], + id=jid, priority=int(row.get('scheduling_class', 0)), + # submit_time=row["timestamp"], time_limit=0, + submit_time=start, time_limit=0, + start_time=start, end_time=end, + expected_run_time=wall, trace_time=row["timestamp"], + trace_start_time=start, trace_end_time=end, trace_quanta=trace_quanta + ) + # Wrap dict in a real Job so telemetry.save_snapshot() can use __dict__ + # if nodes_required > 0: + jobs.append(Job(job_d)) + + # Compute simulation span: start at t=0, end at the latest job finish + telemetry_start = 0 + telemetry_end = int(max(usage_map_end.values()) - t0) + return WorkloadData( + jobs=jobs, + telemetry_start=telemetry_start, telemetry_end=telemetry_end, + # gcloud dataset timestamps are already relative, and it doesn't list a start exact date. + start_date=datetime.fromisoformat("2011-05-02T00:00:00Z"), + ) diff --git a/raps/dataloaders/kestrel.py b/raps/dataloaders/kestrel.py new file mode 100644 index 0000000000000000000000000000000000000000..f15c80bc6ab5a168e7538554d3ed59e01880fe9e --- /dev/null +++ b/raps/dataloaders/kestrel.py @@ -0,0 +1,165 @@ +""" + Load data for NREL's Kestrel cluster. +""" +import uuid +import pandas as pd +from tqdm import tqdm + +from ..job import job_dict, Job +from ..utils import power_to_utilization, next_arrival, WorkloadData + + +def load_data(jobs_path, **kwargs): + """ + Reads job and job profile data from parquet files and parses them. + + Parameters + ---------- + jobs_path : str + The path to the jobs parquet file. + + Returns + ------- + list + The list of parsed jobs. + """ + jobs_df = pd.read_parquet(jobs_path, engine='pyarrow') + return load_data_from_df(jobs_df, **kwargs) + + +def load_data_from_df(jobs_df: pd.DataFrame, **kwargs): + """ + Reads job and job profile data from parquet files and parses them. + + Requires the following fields in the DataFrame: + - start_time (timestamp): Time execution begins (actual or expected) + - job_id (int): Job ID + - node_power_consumption (List[int]): Power consumption of the job, recorded at Node level + - nodes_required (int): Number of nodes allocated to the job + - cpu_power_consumption (List[int]): Power consumption of the job, recorded at CPU level (don't have this) + - mem_power_consumption (List[int]): Power consumption of the job, recorded at Memory level (don't have this) + - priority (int): Relative priority of the job, 0=held, 1=required nodes DOWN/DRAINED + - job_state (string): State of the job, see enum job_states for possible values + - wall_time (int): Actual runtime of job, in seconds + - nodes (string): List of nodes allocated to job + + Returns + ------- + list + The list of parsed jobs. 
+ """ + config = kwargs.get('config') + reschedule = kwargs.get('reschedule') + validate = kwargs.get('validate') + jid = kwargs.get('jid', '*') + + # Sort jobs dataframe based on values in time_start column, adjust indices after sorting + jobs_df = jobs_df.sort_values(by='submit_time') + jobs_df = jobs_df[(jobs_df.start_time.between(pd.to_datetime('2024-09-01T00:00:00'), + pd.to_datetime('2024-09-16T00:00:00'), inclusive='right') | + jobs_df.end_time.between(pd.to_datetime('2024-09-01T00:00:00'), + pd.to_datetime('2024-09-16T00:00:00'), inclusive='right') + )].copy() + jobs_df = jobs_df.reset_index(drop=True) + + telemetry_start_timestamp = jobs_df['start_time'].min() + telemetry_end_timestamp = jobs_df['end_time'].max() + telemetry_start = 0 + telemetry_end = int((telemetry_end_timestamp - telemetry_start_timestamp).total_seconds()) + + # Take earliest time as baseline reference + # We can use the start time of the first job. + time_zero = jobs_df['submit_time'].min() + + num_jobs = len(jobs_df) + print("time_zero:", time_zero, "num_jobs", num_jobs) + + jobs = [] + + # Map dataframe to job state. Add results to jobs list + for jidx in tqdm(range(num_jobs - 1), total=num_jobs, desc="Processing Kestrel Jobs"): + + job_id = jobs_df.loc[jidx, 'job_id'] + account = jobs_df.loc[jidx, 'account'] + + if not jid == '*': + if int(jid) == int(job_id): + print(f'Extracting {job_id} profile') + else: + continue + nodes_required = jobs_df.loc[jidx, 'nodes_required'] + + name = str(uuid.uuid4())[:6] + + if validate: + cpu_power = jobs_df.loc[jidx, 'power_per_node'] + cpu_trace = cpu_power + + else: + cpu_power = jobs_df.loc[jidx, 'power_per_node'] + cpu_power_array = [600] if (pd.isna(cpu_power) or cpu_power == 0) else cpu_power.tolist() + cpu_min_power = nodes_required * config['POWER_CPU_IDLE'] * config['CPUS_PER_NODE'] + cpu_max_power = nodes_required * config['POWER_CPU_MAX'] * config['CPUS_PER_NODE'] + cpu_util = power_to_utilization(cpu_power_array, cpu_min_power, cpu_max_power) + cpu_trace = cpu_util * config['CPUS_PER_NODE'] + gpu_trace = 0 + + # Priority sorting doesn't seem to be implemented at the moment + priority = 0 + + wall_time = jobs_df.loc[jidx, 'wall_time'] + end_state = jobs_df.loc[jidx, 'job_state'] + time_submit = jobs_df.loc[jidx+1, 'submit_time'] + diff = time_submit - time_zero + + if jid == '*': + time_offset = max(diff.total_seconds(), 0) + else: + # When extracting out a single job, run one iteration past the end of the job + time_offset = config['UI_UPDATE_FREQ'] + + if reschedule: # Let the scheduler reschedule the jobs + scheduled_nodes = None + time_offset = next_arrival(1/config['JOB_ARRIVAL_TIME']) + else: # Prescribed replay + scheduled_nodes = None + time_offset = next_arrival(1/config['JOB_ARRIVAL_TIME']) + + trace_quanta = config['TRACE_QUANTA'] + + if cpu_trace.size > 0 and time_offset >= 0: + job_info = job_dict(nodes_required = nodes_required, + name = name, + account = account, + cpu_trace = cpu_trace, + gpu_trace = gpu_trace, + ntx_trace = [], + nrx_trace = [], + end_state = end_state, + scheduled_nodes = scheduled_nodes, + id = job_id, + priority = priority, + submit_time = time_offset, + time_limit = wall_time, + trace_quanta=trace_quanta) + jobs.append(Job(job_info)) + + return WorkloadData( + jobs=jobs, + telemetry_start=telemetry_start, telemetry_end=telemetry_end, + start_date=telemetry_start_timestamp, + ) + + +def node_index_to_name(index: int, config: dict): + """ Converts an index value back to an name string based on system configuration. 
""" + return f"node{index:04d}" + + +def cdu_index_to_name(index: int, config: dict): + return f"cdu{index:02d}" + + +def cdu_pos(index: int, config: dict) -> tuple[int, int]: + """ Return (row, col) tuple for a cdu index """ + return (0, index) # TODO \ No newline at end of file diff --git a/raps/dataloaders/lassen.py b/raps/dataloaders/lassen.py index b14cabe16110d409d829a5c8c169f54f0854faf9..06d9a98dd66ef227f6e6dfc7ca474eefe0273578 100644 --- a/raps/dataloaders/lassen.py +++ b/raps/dataloaders/lassen.py @@ -10,43 +10,48 @@ Reference: Usage Instructions: - git clone https://github.com/LLNL/LAST/ && cd LAST - git lfs pull + raps download --system lassen - # to analyze dataset - python -m raps.telemetry -f /path/to/LAST/Lassen-Supercomputer-Job-Dataset --system lassen -v + # to analyze dataset and plot histograms + raps telemetry -f ./data/lassen/Lassen-Supercomputer-Job-Dataset --system lassen --plot # to simulate the dataset as submitted - python main.py -f /path/to/LAST/Lassen-Supercomputer-Job-Dataset --system lassen + raps run -f ./data/lassen/Lassen-Supercomputer-Job-Dataset --system lassen - # to reschedule - python main.py -f /path/to/LAST/Lassen-Supercomputer-Job-Dataset --system lassen --reschedule poisson + # to modify the submit times of the telemetry according to Poisson distribution + raps run -f ./data/lassen/Lassen-Supercomputer-Job-Dataset --system lassen --arrival poisson + + # to fast-forward 365 days and replay for 1 day. This region day has 2250 jobs with 1650 jobs executed. + raps run -f ./data/lassen/Lassen-Supercomputer-Job-Dataset --system lassen --start '2019-08-22T00:00:00+00:00' -t 1d + + # For the network replay this command gives suiteable snapshots: + raps run -f ./data/lassen/Lassen-Supercomputer-Job-Dataset --system lassen --policy fcfs --backfill firstfit -t 12h --arrival poisson # noqa - # to fast-forward 37 days and replay for 1 day - python main.py -f /path/to/LAST/Lassen-Supercomputer-Job-Dataset --system lassen -ff 37d -t 1d """ import math -import numpy as np import os +import uuid +import numpy as np import pandas as pd from tqdm import tqdm +from pathlib import Path +import subprocess +import shutil +from datetime import datetime, timedelta -try: - from ..job import job_dict - from ..utils import power_to_utilization, next_arrival - -except: - pass +from ..job import job_dict, Job +from ..utils import power_to_utilization, WorkloadData def load_data(path, **kwargs): """ Loads data from the given file paths and returns job info. 
""" - nrows = 1E4 - alloc_df = pd.read_csv(os.path.join(path[0], 'final_csm_allocation_history_hashed.csv'), nrows=nrows) - node_df = pd.read_csv(os.path.join(path[0], 'final_csm_allocation_node_history.csv'), nrows=nrows) - step_df = pd.read_csv(os.path.join(path[0], 'final_csm_step_history.csv'), nrows=nrows) + nrows = None + alloc_df = pd.read_csv(os.path.join( + path[0], 'final_csm_allocation_history_hashed.csv'), nrows=nrows, low_memory=False) + node_df = pd.read_csv(os.path.join(path[0], 'final_csm_allocation_node_history.csv'), nrows=nrows, low_memory=False) + step_df = pd.read_csv(os.path.join(path[0], 'final_csm_step_history.csv'), nrows=nrows, low_memory=False) return load_data_from_df(alloc_df, node_df, step_df, **kwargs) @@ -56,24 +61,47 @@ def load_data_from_df(allocation_df, node_df, step_df, **kwargs): """ config = kwargs.get('config') jid = kwargs.get('jid', '*') - reschedule = kwargs.get('reschedule') - fastforward = kwargs.get('fastforward') + validate = kwargs.get('validate') verbose = kwargs.get('verbose') - min_time = kwargs.get('min_time', None) - - if fastforward: - print(f"fast-forwarding {fastforward} seconds") - - allocation_df['begin_time'] = pd.to_datetime(allocation_df['begin_time'], format='mixed', errors='coerce') - allocation_df['end_time'] = pd.to_datetime(allocation_df['end_time'], format='mixed', errors='coerce') - - if not min_time: - min_time = pd.to_datetime(allocation_df['begin_time']).min() + start = datetime.fromisoformat(kwargs['start']) if kwargs.get('start') else None + + allocation_df['job_submit_timestamp'] = pd.to_datetime( + allocation_df['job_submit_time'], format='mixed', errors='coerce') + allocation_df['begin_timestamp'] = pd.to_datetime(allocation_df['begin_time'], format='mixed', errors='coerce') + allocation_df['end_timestamp'] = pd.to_datetime(allocation_df['end_time'], format='mixed', errors='coerce') + + telemetry_start_timestamp = allocation_df['begin_timestamp'].min() + telemetry_start_time = 0 + telemetry_end_timestamp = allocation_df['end_timestamp'].max() + diff = telemetry_end_timestamp - telemetry_start_timestamp + telemetry_end_time = int(math.ceil(diff.total_seconds())) + + # Too large dataset! Cut by fastforward and time to simulate! + if start is None: + fastforward_timedelta = timedelta(seconds=0) + else: + fastforward_timedelta = start - telemetry_start_timestamp.tz_localize("UTC") + time_to_simulate_timedelta = timedelta(seconds=kwargs['time']) + + simulation_start_timestamp = telemetry_start_timestamp + fastforward_timedelta + simulation_end_timestamp = simulation_start_timestamp + time_to_simulate_timedelta + + # As these are >1.4M jobs, filtered to the simulated timestamps before creating the job structs. + # Job should not have ended before the simulation time + allocation_df = allocation_df[allocation_df['end_timestamp'] >= simulation_start_timestamp] + # Job has to have been submited before or during the simulaion time + allocation_df = allocation_df[allocation_df['job_submit_timestamp'] < simulation_end_timestamp] job_list = [] for _, row in tqdm(allocation_df.iterrows(), total=len(allocation_df), desc="Processing Jobs"): - job_id = row['primary_job_id'] + + account = row['hashed_user_id'] + job_id = int(row['primary_job_id']) + # allocation_id = row['allocation_id'] # Unused + nodes_required = row['num_nodes'] + end_state = row['exit_status'] + name = str(uuid.uuid4())[:6] # This generates a random 6 char identifier.... 
if not jid == '*': if int(jid) == int(job_id): @@ -83,76 +111,139 @@ def load_data_from_df(allocation_df, node_df, step_df, **kwargs): node_data = node_df[node_df['allocation_id'] == row['allocation_id']] - nodes_required = row['num_nodes'] + wall_time = compute_wall_time(row['begin_timestamp'], row['end_timestamp']) - wall_time = compute_wall_time(row['begin_time'], row['end_time']) samples = math.ceil(wall_time / config['TRACE_QUANTA']) - # Compute GPU power - gpu_energy = node_data['gpu_energy'].sum() # Joules - # divide by nodes_required to get average gpu_usage per node - gpu_usage = node_data['gpu_usage'].sum() / 1E6 / nodes_required # seconds - gpu_power = gpu_energy / gpu_usage if gpu_usage > 0 else 0 - #gpu_power = gpu_energy / wall_time - gpu_power_array = np.array([gpu_power] * samples) - - gpu_min_power = nodes_required * config['POWER_GPU_IDLE'] - gpu_max_power = nodes_required * config['POWER_GPU_MAX'] - gpu_util = power_to_utilization(gpu_power_array, gpu_min_power, gpu_max_power) - # GPU power can be 0: - # Utilization is defined in the range of [0 to GPUS_PER_NODE]. - # gpu_util will be negative if power reports 0, which is smaller than POWER_GPU_IDLE - # Therefore: gpu_util should be set to zero if it is smaller than 0. - gpu_trace = np.maximum(0, gpu_util) - - # Compute CPU power from CPU usage time - # CPU usage is reported per core, while we need it in the range [0 to CPUS_PER_NODE] - cpu_usage = node_data['cpu_usage'].sum() / 1E9 / nodes_required / config['CORES_PER_CPU'] # seconds - cpu_usage_array = np.array([cpu_usage] * samples) - cpu_util = cpu_usage_array / wall_time - cpu_trace = cpu_util # * CPUS_PER_NODE - # TODO use total energy for validation - # Only Node Energy and GPU Energy is reported! - # total_energy = node_data['energy'].sum() # Joules + if validate: + # Validate should represent the node power and not split it according to cpu and gpu. + # Not sure if this is correct. + cpu_power = (node_data['energy'].sum() / nodes_required) / wall_time + cpu_trace = cpu_power + gpu_trace = 0 # = cpu_trace # Is this correct? + else: + # Compute GPU power + gpu_node_idle_power = config['POWER_GPU_IDLE'] * config['GPUS_PER_NODE'] + # Note: GPU_Power is on a per node basis. + # The current simulator uses the same time series for every node of the job + # Therefore we sum over all nodes and form the average node power. + # TODO: Jobs could have a time-series per node! + gpu_node_energy = node_data['gpu_energy'].copy() + gpu_node_energy[gpu_node_energy < 0] = 0.0 + gpu_node_energy[gpu_node_energy == np.nan] = 0.0 + if len(gpu_node_energy) < 1: + gpu_power = gpu_node_idle_power # Setting to idle as other parts of the sim make this assumption + else: + if wall_time > 0: + gpu_power = (gpu_node_energy.sum() / nodes_required) / wall_time # This is a single value + else: + gpu_power = gpu_node_idle_power + if gpu_power < gpu_node_idle_power: + # print(gpu_power, gpu_node_idle_power) + # Issue: RAPS assumes power is between idle and max, but C-states are not considered! + gpu_power = gpu_node_idle_power # Setting to idle as other parts of the sim make this assumption + assert gpu_power >= gpu_node_idle_power, f"{gpu_power} >= {gpu_node_idle_power}" + \ + f" gpu_power = ({gpu_node_energy.sum()} / {nodes_required}) / {wall_time}" + gpu_min_power = gpu_node_idle_power + gpu_max_power = config['POWER_GPU_MAX'] * config['GPUS_PER_NODE'] + # power_to_utilization has issues! As it is unclear if gpu_power is for a single gpu or all gpus of a node. 
+            # The multiplication by GPUS_PER_NODE fixes this but is patchwork! TODO Refactor and fix
+            gpu_util = power_to_utilization(gpu_power, gpu_min_power, gpu_max_power)
+            # gpu_util should be between 0 and 4 (4 GPUs), where 4 means all GPUs at full utilization.
+            gpu_util_scalar = gpu_util * config['GPUS_PER_NODE']
+
+            # Compute CPU power from CPU usage time
+            # CPU usage is reported per core, while we need it in the range [0 to CPUS_PER_NODE]
+            # Same approach as for the GPU above
+            cpu_node_usage = node_data['cpu_usage'].copy()
+            cpu_node_usage[cpu_node_usage < 0] = 0.0
+            cpu_node_usage[cpu_node_usage == np.nan] = 0.0
+            if wall_time > 0:
+                threads_per_core = config['THREADS_PER_CORE']
+                cpu_util = cpu_node_usage.sum() / 10e9 / nodes_required / wall_time / threads_per_core
+            else:
+                cpu_util = 0.0
+            assert cpu_util >= 0, f"{cpu_util} = {cpu_node_usage.sum()} / 10e9 " \
+                f"/ {nodes_required} / {wall_time} / {threads_per_core}"
+
+            # cpu_util should be between 0 and 2 (2 CPUs)
+
+            cpu_util_scalar = cpu_util
+            # TODO use total energy for validation
+            # Only Node Energy and GPU Energy is reported!
+            # total_energy = node_data['energy'].sum()  # Joules
+
+            # Expand into lists of length=samples
+            cpu_trace = [cpu_util_scalar] * samples
+            gpu_trace = [gpu_util_scalar] * samples
 
         # Network utilization - since values are given in octets / quarter of a byte, multiply by 4 to get bytes
-        ib_tx = 4 * node_data['ib_tx'].values[0] if node_data['ib_tx'].values.size > 0 else []
-        ib_rx = 4 * node_data['ib_rx'].values[0] if node_data['ib_rx'].values.size > 0 else []
+        total_ib_tx = 4 * node_data['ib_tx'].sum() if node_data['ib_tx'].values.size > 0 else 0
+        total_ib_rx = 4 * node_data['ib_rx'].sum() if node_data['ib_rx'].values.size > 0 else 0
 
-        net_tx, net_rx = generate_network_sequences(ib_tx, ib_rx, samples, lambda_poisson=0.3)
+        n = nodes_required
+        ib_tx_per_node = total_ib_tx / n  # average bytes per node
+        ib_rx_per_node = total_ib_rx / n  # average bytes per node
 
-        if reschedule == 'poisson':  # Let the scheduler reschedule the jobs
-            scheduled_nodes = None
-            time_offset = next_arrival(1/config['JOB_ARRIVAL_TIME'])
-        elif reschedule == 'submit-time':
-            raise NotImplementedError
-        else:
-            scheduled_nodes = get_scheduled_nodes(row['allocation_id'], node_df)
-            time_offset = compute_time_offset(row['begin_time'], min_time)
-            if fastforward:
-                time_offset -= fastforward
+        # net_tx, net_rx = [],[]  # generate_network_sequences generates errors (e.g. --ff 800d -t 1d )
+        # net_tx, net_rx = generate_network_sequences(ib_tx, ib_rx, samples, lambda_poisson=0.3)
+        net_tx, net_rx = throughput_traces(ib_tx_per_node, ib_rx_per_node, samples)
+
+        # no priorities defined!
+ priority = row.get('priority', 0) + partition = row.get('partition', "0") + + scheduled_nodes = get_scheduled_nodes(row['allocation_id'], node_df) + submit_time = compute_time_offset(row['job_submit_timestamp'], telemetry_start_timestamp) + start_time = compute_time_offset(row['begin_timestamp'], telemetry_start_timestamp) + end_time = compute_time_offset(row['end_timestamp'], telemetry_start_timestamp) + + time_limit = row['time_limit'] + + trace_quanta = config['TRACE_QUANTA'] + trace_time = wall_time + trace_start_time = start_time + trace_end_time = end_time + trace_missing_values = False if verbose: - print('ib_tx, ib_rx, samples:', ib_tx, ib_rx, samples) + print('ib_tx, ib_rx, samples:', net_tx, net_rx, samples) print('tx:', net_tx) print('rx:', net_rx) print('scheduled_nodes:', nodes_required, scheduled_nodes) - if time_offset >= 0: - - job_info = job_dict(nodes_required, - row['hashed_user_id'], - row['hashed_user_group_id'], - cpu_trace, gpu_trace, net_tx, net_rx, wall_time, - row['exit_status'], - scheduled_nodes, - time_offset, - job_id, - row.get('priority', 0)) - - job_list.append(job_info) - - return job_list + if wall_time >= 0: + job_info = job_dict(nodes_required=nodes_required, + name=name, + account=account, + cpu_trace=cpu_trace, + gpu_trace=gpu_trace, + ntx_trace=net_tx, + nrx_trace=net_rx, + end_state=end_state, + scheduled_nodes=scheduled_nodes, + id=job_id, + priority=priority, + partition=partition, + submit_time=submit_time, + time_limit=time_limit, + start_time=start_time, + end_time=end_time, + expected_run_time=wall_time, + trace_time=trace_time, + trace_start_time=trace_start_time, + trace_end_time=trace_end_time, + trace_quanta=trace_quanta, + trace_missing_values=trace_missing_values) + job = Job(job_info) + job_list.append(job) + + return WorkloadData( + jobs=job_list, + telemetry_start=telemetry_start_time, telemetry_end=telemetry_end_time, + # TODO: Confirm whether lassen timestamps are UTC or PDT + start_date=telemetry_start_timestamp.tz_localize("UTC"), + ) def get_scheduled_nodes(allocation_id, node_df): @@ -185,14 +276,14 @@ def compute_time_offset(begin_time, reference_time): def adjust_bursts(burst_intervals, total, intervals): bursts = burst_intervals / np.sum(burst_intervals) * total bursts = np.round(bursts).astype(int) - adjustment = total - np.sum(bursts) + # adjustment = total - np.sum(bursts) # Unused # Distribute adjustment across non-zero elements to avoid negative values - if adjustment != 0: - for i in range(len(bursts)): - if bursts[i] > 0: - bursts[i] += adjustment - break # Apply adjustment only once where it won't cause a negative + # if adjustment != 0: + # for i in range(len(bursts)): + # if bursts[i] > 0: + # bursts[i] += adjustment % (2^64-1) # This can overflow! + # break # Apply adjustment only once where it won't cause a negative return bursts @@ -215,6 +306,17 @@ def generate_network_sequences(total_tx, total_rx, intervals, lambda_poisson): return tx_bursts, rx_bursts +def throughput_traces(total_tx, total_rx, intervals): + + if not total_tx or not total_rx: + return None, None + + tx_bursts = [total_tx // intervals] * intervals + rx_bursts = [total_rx // intervals] * intervals + + return tx_bursts, rx_bursts + + def node_index_to_name(index: int, config: dict): """ Converts an index value back to an name string based on system configuration. 
""" return f"node{index:04d}" @@ -226,7 +328,7 @@ def cdu_index_to_name(index: int, config: dict): def cdu_pos(index: int, config: dict) -> tuple[int, int]: """ Return (row, col) tuple for a cdu index """ - return (0, index) # TODO + return (0, index) # TODO if __name__ == "__main__": @@ -237,5 +339,14 @@ if __name__ == "__main__": intervals = 20 # number of 20-second intervals lambda_poisson = 0.3 # control sporadicity - tx_sequence, rx_sequence = generate_ib_tx_rx_sequences(total_ib_tx, total_ib_rx, intervals, lambda_poisson) + tx_sequence, rx_sequence = generate_network_sequences(total_ib_tx, total_ib_rx, intervals, lambda_poisson) print(tx_sequence, rx_sequence) + + +def download(dest: Path, start: datetime | None, end: datetime | None): + dest.mkdir(parents=True) + subprocess.run(["git", "clone", "https://github.com/LLNL/LAST/", str(dest / 'repo')], check=True, text=True) + subprocess.run(["git", "lfs", "pull"], check=True, text=True, cwd=dest / "repo") + (dest / "repo" / "Lassen-Supercomputer-Job-Dataset").rename(dest / "Lassen-Supercomputer-Job-Dataset") + shutil.rmtree(dest / 'repo') + print("Done!") diff --git a/raps/dataloaders/marconi100.py b/raps/dataloaders/marconi100.py index c6a97f80982a19d92559eb74e464c971a26cb955..3ee7570ae94b9aa269e70846f254549b20f36832 100644 --- a/raps/dataloaders/marconi100.py +++ b/raps/dataloaders/marconi100.py @@ -1,33 +1,41 @@ """ - # Reference - Antici, Francesco, et al. "PM100: A Job Power Consumption Dataset of a - Large-scale Production HPC System." Proceedings of the SC'23 Workshops - of The International Conference on High Performance Computing, - Network, Storage, and Analysis. 2023. - - # get the data - Download `job_table.parquet` from https://zenodo.org/records/10127767 - - # to simulate the dataset - python main.py -f /path/to/job_table.parquet --system marconi100 - - # to reschedule - python main.py -f /path/to/job_table.parquet --system marconi100 --reschedule poisson - - # to fast-forward 60 days and replay for 1 day - python main.py -f /path/to/job_table.parquet --system marconi100 -ff 60d -t 1d - - # to analyze dataset - python -m raps.telemetry -f /path/to/job_table.parquet --system marconi100 -v - +# Reference +Antici, Francesco, et al. "PM100: A Job Power Consumption Dataset of a +Large-scale Production HPC System." Proceedings of the SC'23 Workshops +of The International Conference on High Performance Computing, +Network, Storage, and Analysis. 2023. 
+
+# get the data
+Download the dataset with
+```
+raps download --system marconi100
+```
+This will download the dataset from https://zenodo.org/records/10127767
+
+# to simulate the dataset
+raps run -f /path/to/job_table.parquet --system marconi100
+
+# to replay using different schedulers
+raps run -f /path/to/job_table.parquet --system marconi100 --policy fcfs --backfill easy
+raps run -f /path/to/job_table.parquet --system marconi100 --policy priority --backfill firstfit
+
+# to fast-forward 60 days and replay for 1 day
+raps run -f /path/to/job_table.parquet --system marconi100 --start 2020-07-05T00:00:00+00:00 -t 1d
+
+# to analyze dataset
+python -m raps.telemetry -f /path/to/job_table.parquet --system marconi100 -v
 """
 import uuid
-import random
+import numpy as np
 import pandas as pd
 from tqdm import tqdm
+from pathlib import Path
+from datetime import datetime
+import requests
+import urllib.request
 
-from ..job import job_dict
-from ..utils import power_to_utilization, next_arrival
+from ..job import job_dict, Job
+from ..utils import power_to_utilization, WorkloadData
 
 
 def load_data(jobs_path, **kwargs):
@@ -58,35 +66,46 @@ def load_data_from_df(jobs_df: pd.DataFrame, **kwargs):
         The list of parsed jobs.
     """
     config = kwargs.get('config')
-    min_time = kwargs.get('min_time', None)
-    reschedule = kwargs.get('reschedule')
-    fastforward = kwargs.get('fastforward')
+    # min_time = kwargs.get('min_time', None)  # Unused
     validate = kwargs.get('validate')
     jid = kwargs.get('jid', '*')
-
-    if fastforward: print(f"fast-forwarding {fastforward} seconds")
+    debug = kwargs.get('debug')
 
     # Sort jobs dataframe based on values in time_start column, adjust indices after sorting
     jobs_df = jobs_df.sort_values(by='start_time')
     jobs_df = jobs_df.reset_index(drop=True)
 
-    # Take earliest time as baseline reference
-    # We can use the start time of the first job.
-    if min_time:
-        time_zero = min_time
-    else:
-        time_zero = jobs_df['start_time'].min()
+    # Dataset has one value from start to finish.
+    # Therefore we set telemetry start and end equal to job start and end.
+    first_start_timestamp = jobs_df['start_time'].min()
+    telemetry_start_timestamp = first_start_timestamp
+
+    last_end_timestamp = jobs_df['end_time'].max()
+    telemetry_end_timestamp = last_end_timestamp
+
+    telemetry_start = 0
+    diff = telemetry_end_timestamp - telemetry_start_timestamp
+    telemetry_end = int(diff.total_seconds())
 
     num_jobs = len(jobs_df)
-    print("time_zero:", time_zero, "num_jobs", num_jobs)
+
+    if debug:
+        print("num_jobs:", num_jobs)
+        print("telemetry_start:", telemetry_start, "telemetry_end:", telemetry_end)
+        print("telemetry_start_timestamp:", telemetry_start_timestamp,
+              "telemetry_end_timestamp", telemetry_end_timestamp)
+        print("first_start_timestamp:", first_start_timestamp, "last start timestamp:", jobs_df['start_time'].max())
 
     jobs = []
 
     # Map dataframe to job state. Add results to jobs list
     for jidx in tqdm(range(num_jobs - 1), total=num_jobs, desc="Processing Jobs"):
 
-        account = jobs_df.loc[jidx, 'user_id'] # or 'group_id'
+        account = jobs_df.loc[jidx, 'user_id']  # or 'group_id'?
job_id = jobs_df.loc[jidx, 'job_id'] + # allocation_id = + nodes_required = jobs_df.loc[jidx, 'num_nodes_alloc'] + end_state = jobs_df.loc[jidx, 'job_state'] if not jid == '*': if int(jid) == int(job_id): @@ -95,10 +114,10 @@ def load_data_from_df(jobs_df: pd.DataFrame, **kwargs): continue nodes_required = jobs_df.loc[jidx, 'num_nodes_alloc'] - name = str(uuid.uuid4())[:6] + name = str(uuid.uuid4())[:6] # This generates a random 6 char identifier.... if validate: - cpu_power = jobs_df.loc[jidx, 'node_power_consumption']/jobs_df.loc[jidx, 'num_nodes_alloc'] + cpu_power = jobs_df.loc[jidx, 'node_power_consumption'] / jobs_df.loc[jidx, 'num_nodes_alloc'] cpu_trace = cpu_power gpu_trace = cpu_trace @@ -120,8 +139,8 @@ def load_data_from_df(jobs_df: pd.DataFrame, **kwargs): mem_power = mem_power[:min_length] gpu_power = (node_power - cpu_power - mem_power - - ([nodes_required * config['NICS_PER_NODE'] * config['POWER_NIC']] * len(node_power)) - - ([nodes_required * config['POWER_NVME']] * len(node_power))) + - ([nodes_required * config['NICS_PER_NODE'] * config['POWER_NIC']] * len(node_power)) + - ([nodes_required * config['POWER_NVME']] * len(node_power))) gpu_power_array = gpu_power.tolist() gpu_min_power = nodes_required * config['POWER_GPU_IDLE'] * config['GPUS_PER_NODE'] gpu_max_power = nodes_required * config['POWER_GPU_MAX'] * config['GPUS_PER_NODE'] @@ -129,35 +148,92 @@ def load_data_from_df(jobs_df: pd.DataFrame, **kwargs): gpu_trace = gpu_util * config['GPUS_PER_NODE'] priority = int(jobs_df.loc[jidx, 'priority']) - - # wall_time = jobs_df.loc[i, 'run_time'] - wall_time = gpu_trace.size * config['TRACE_QUANTA'] # seconds - end_state = jobs_df.loc[jidx, 'job_state'] - time_start = jobs_df.loc[jidx+1, 'start_time'] - diff = time_start - time_zero - - if jid == '*': - time_offset = max(diff.total_seconds(), 0) - else: - # When extracting out a single job, run one iteration past the end of the job - time_offset = config['UI_UPDATE_FREQ'] - - if fastforward: time_offset -= fastforward - - if reschedule == 'poisson': # Let the scheduler reschedule the jobs - scheduled_nodes = None - time_offset = next_arrival(1/config['JOB_ARRIVAL_TIME']) - elif reschedule == 'submit-time': - raise NotImplementedError - else: # Prescribed replay - scheduled_nodes = (jobs_df.loc[jidx, 'nodes']).tolist() - - if gpu_trace.size > 0 and time_offset >= 0: - job_info = job_dict(nodes_required, name, account, cpu_trace, gpu_trace, [], [], wall_time, - end_state, scheduled_nodes, time_offset, job_id, priority) - jobs.append(job_info) - - return jobs + partition = int(jobs_df.loc[jidx, 'partition']) + + time_limit = jobs_df.loc[jidx, 'time_limit'] + + start_timestamp = jobs_df.loc[jidx, 'start_time'] + diff = start_timestamp - telemetry_start_timestamp + start_time = int(diff.total_seconds()) + + end_timestamp = jobs_df.loc[jidx, 'end_time'] + diff = end_timestamp - telemetry_start_timestamp + end_time = int(diff.total_seconds()) + + wall_time = int(jobs_df.loc[jidx, 'run_time']) + if np.isnan(wall_time): + wall_time = 0 + if wall_time != (end_time - start_time): + print("wall_time != (end_time - start_time)") + print(f"{wall_time} != {(end_time - start_time)}") + + scheduled_nodes = (jobs_df.loc[jidx, 'nodes']).tolist() + + submit_timestamp = jobs_df.loc[jidx, 'submit_time'] + diff = submit_timestamp - telemetry_start_timestamp + submit_time = int(diff.total_seconds()) + + trace_time = gpu_trace.size * config['TRACE_QUANTA'] # seconds + trace_start_time = 0 + trace_end_time = trace_time + if wall_time > trace_time: 
+ missing_trace_time = wall_time - trace_time + if start_time < 0: + trace_start_time = missing_trace_time + trace_end_time = wall_time + elif end_time > telemetry_end: + trace_start_time = 0 + trace_end_time = trace_time + else: + # Telemetry mission at the end + trace_start_time = 0 + trace_end_time = trace_time + trace_missing_values = True + + # What does this do? + # if jid == '*': + # # submit_time = max(submit_time.total_seconds(), 0) + # submit_timestamp = jobs_df.loc[jidx, 'submit_time'] + # diff = submit_timestamp - telemetry_start_timestamp + # submit_time = diff.total_seconds() + + # else: + # # When extracting out a single job, run one iteration past the end of the job + # submit_time = config['UI_UPDATE_FREQ'] + + if gpu_trace.size > 0 and (jid == job_id or jid == '*'): # and time_submit >= 0: + + job_info = job_dict(nodes_required=nodes_required, + name=name, + account=account, + cpu_trace=cpu_trace, + gpu_trace=gpu_trace, + nrx_trace=[], ntx_trace=[], + end_state=end_state, + # current_state=current_state, # PENDING? + scheduled_nodes=scheduled_nodes, + id=job_id, + priority=priority, + partition=partition, + submit_time=submit_time, + time_limit=time_limit, + start_time=start_time, + end_time=end_time, + expected_run_time=wall_time, + current_run_time=0, + trace_time=trace_time, + trace_start_time=trace_start_time, + trace_end_time=trace_end_time, + trace_quanta=config["TRACE_QUANTA"], + trace_missing_values=trace_missing_values) + job = Job(job_info) + jobs.append(job) + + return WorkloadData( + jobs=jobs, + telemetry_start=telemetry_start, telemetry_end=telemetry_end, + start_date=telemetry_start_timestamp, + ) def node_index_to_name(index: int, config: dict): @@ -171,4 +247,16 @@ def cdu_index_to_name(index: int, config: dict): def cdu_pos(index: int, config: dict) -> tuple[int, int]: """ Return (row, col) tuple for a cdu index """ - return (0, index) # TODO + return (0, index) # TODO + + +def download(dest: Path, start: datetime | None, end: datetime | None): + files = requests.get("https://zenodo.org/api/records/10127767").json()["files"] + + # marconi100 is just one big parquet, nothing to pre-filter + dest.mkdir(parents=True) + for file in files: + print(f"Downloading {file['key']}") + urllib.request.urlretrieve(file['links']['self'], dest / file['key']) + + print("Done!") diff --git a/raps/dataloaders/mit_supercloud/__init__.py b/raps/dataloaders/mit_supercloud/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..eff43faac002079bbb9ab6cd8395915627e6aab4 --- /dev/null +++ b/raps/dataloaders/mit_supercloud/__init__.py @@ -0,0 +1,3 @@ +from .loader import load_data + +__all__ = ["load_data"] diff --git a/raps/dataloaders/mit_supercloud/cli.py b/raps/dataloaders/mit_supercloud/cli.py new file mode 100644 index 0000000000000000000000000000000000000000..e459209d73edd60025e3b53273edcd369bf3e342 --- /dev/null +++ b/raps/dataloaders/mit_supercloud/cli.py @@ -0,0 +1,44 @@ +import argparse +from .download import download +from .loader import load_data +from .utils import DEFAULT_START, DEFAULT_END + + +def main(): + p = argparse.ArgumentParser(prog="mit_supercloud") + subs = p.add_subparsers(dest="cmd", required=True) + + common = argparse.ArgumentParser(add_help=False) + common.add_argument( + '--start', '-s', + default=DEFAULT_START, + help="Start datetime, in ISO format (e.g. '2021-05-21T13:30'), default midnight." + ) + common.add_argument( + '--end', '-e', + default=DEFAULT_END, + help="End datetime, in ISO format (e.g. '2021-05-21T16:45')." 
+ ) + common.add_argument("--partition", choices=["all", "part-cpu", "part-gpu"], default="all") + common.add_argument("--outdir", default="source_data") + common.add_argument("--bucket", default="mit-supercloud-dataset") + common.add_argument("--prefix", default="datacenter-challenge/202201/") + common.add_argument("--max-jobs", type=int) + common.add_argument("--dry-run", action="store_true") + + pd = subs.add_parser("download", parents=[common], help="Fetch data from S3") + pd.set_defaults(func=download) + + pl = subs.add_parser("load", parents=[common], help="Load local data into RAPS") + pl.add_argument("path", help="Local data root") + pl.set_defaults(func=lambda args: load_data(args.path, + start=args.start, + end=args.end, + partition=args.partition)) + + args = p.parse_args() + return args.func(args) + + +if __name__ == "__main__": + main() diff --git a/raps/dataloaders/mit_supercloud/cpu_nodes.txt b/raps/dataloaders/mit_supercloud/cpu_nodes.txt new file mode 100644 index 0000000000000000000000000000000000000000..ff1c4b74e1715bed317d4a670d34b8a50c9d73fc --- /dev/null +++ b/raps/dataloaders/mit_supercloud/cpu_nodes.txt @@ -0,0 +1,480 @@ +r1018283-n146651 +r1018283-n181711 +r1018283-n325382 +r1018283-n392209 +r1018283-n468303 +r1018283-n598065 +r1018283-n680758 +r1018283-n920226 +r1081020-n146651 +r1081020-n181711 +r1081020-n325382 +r1081020-n392209 +r1081020-n468303 +r1081020-n598065 +r1081020-n680758 +r1081020-n920226 +r1416152-n134943 +r1416152-n440434 +r1416152-n442913 +r1416152-n691735 +r1457839-n181711 +r1457839-n325382 +r1457839-n680758 +r1457839-n920226 +r1485405-n146651 +r1485405-n181711 +r1485405-n325382 +r1485405-n392209 +r1485405-n468303 +r1485405-n598065 +r1485405-n680758 +r1485405-n920226 +r1642813-n134943 +r1642813-n440434 +r1642813-n442913 +r1642813-n691735 +r1682297-n146651 +r1682297-n181711 +r1682297-n325382 +r1682297-n392209 +r1682297-n468303 +r1682297-n598065 +r1682297-n680758 +r1682297-n920226 +r189256-n146651 +r189256-n181711 +r189256-n325382 +r189256-n392209 +r189256-n468303 +r189256-n598065 +r189256-n680758 +r189256-n920226 +r2008197-n181711 +r2008197-n325382 +r2008197-n680758 +r2008197-n920226 +r2086368-n146651 +r2086368-n181711 +r2086368-n325382 +r2086368-n392209 +r2086368-n468303 +r2086368-n598065 +r2086368-n680758 +r2086368-n920226 +r2100214-n181711 +r2100214-n325382 +r2100214-n680758 +r2100214-n920226 +r2159346-n134943 +r2159346-n440434 +r2159346-n442913 +r2159346-n691735 +r2501111-n134943 +r2501111-n440434 +r2501111-n442913 +r2501111-n691735 +r2582019-n181711 +r2582019-n325382 +r2582019-n680758 +r2582019-n920226 +r2652301-n146651 +r2652301-n181711 +r2652301-n325382 +r2652301-n392209 +r2652301-n468303 +r2652301-n598065 +r2652301-n680758 +r2652301-n920226 +r2825489-n134943 +r2825489-n440434 +r2825489-n442913 +r2825489-n691735 +r29114-n146651 +r29114-n181711 +r29114-n325382 +r29114-n392209 +r29114-n468303 +r29114-n598065 +r29114-n680758 +r29114-n920226 +r2998125-n134943 +r2998125-n440434 +r2998125-n442913 +r2998125-n691735 +r3039576-n181711 +r3039576-n325382 +r3039576-n680758 +r3039576-n920226 +r3041626-n146651 +r3041626-n181711 +r3041626-n325382 +r3041626-n392209 +r3041626-n468303 +r3041626-n598065 +r3041626-n680758 +r3041626-n920226 +r3117156-n134943 +r3117156-n440434 +r3117156-n442913 +r3117156-n691735 +r322031-n134943 +r322031-n440434 +r322031-n442913 +r322031-n691735 +r3226521-n146651 +r3226521-n181711 +r3226521-n325382 +r3226521-n392209 +r3226521-n468303 +r3226521-n598065 +r3226521-n680758 +r3226521-n920226 +r3254677-n181711 +r3254677-n325382 
+r3254677-n680758 +r3254677-n920226 +r3475376-n134943 +r3475376-n440434 +r3475376-n442913 +r3475376-n691735 +r3581284-n146651 +r3581284-n181711 +r3581284-n325382 +r3581284-n392209 +r3581284-n468303 +r3581284-n598065 +r3581284-n680758 +r3581284-n920226 +r3685766-n134943 +r3685766-n440434 +r3685766-n442913 +r3685766-n691735 +r3741709-n146651 +r3741709-n181711 +r3741709-n325382 +r3741709-n392209 +r3741709-n468303 +r3741709-n598065 +r3741709-n680758 +r3741709-n920226 +r3824475-n146651 +r3824475-n181711 +r3824475-n325382 +r3824475-n392209 +r3824475-n468303 +r3824475-n598065 +r3824475-n680758 +r3824475-n920226 +r3879907-n134943 +r3879907-n440434 +r3879907-n442913 +r3879907-n691735 +r406820-n181711 +r406820-n325382 +r406820-n680758 +r406820-n920226 +r4153679-n134943 +r4153679-n440434 +r4153679-n442913 +r4153679-n691735 +r4179716-n181711 +r4179716-n325382 +r4179716-n680758 +r4179716-n920226 +r4229531-n181711 +r4229531-n325382 +r4229531-n680758 +r4229531-n920226 +r4247208-n146651 +r4247208-n181711 +r4247208-n325382 +r4247208-n392209 +r4247208-n468303 +r4247208-n598065 +r4247208-n680758 +r4247208-n920226 +r4327055-n134943 +r4327055-n440434 +r4327055-n442913 +r4327055-n691735 +r4357125-n134943 +r4357125-n440434 +r4357125-n442913 +r4357125-n691735 +r4822976-n134943 +r4822976-n440434 +r4822976-n442913 +r4822976-n691735 +r4858666-n146651 +r4858666-n181711 +r4858666-n325382 +r4858666-n392209 +r4858666-n468303 +r4858666-n598065 +r4858666-n680758 +r4858666-n920226 +r4874959-n181711 +r4874959-n325382 +r4874959-n680758 +r4874959-n920226 +r4990664-n134943 +r4990664-n440434 +r4990664-n442913 +r4990664-n691735 +r5130449-n134943 +r5130449-n440434 +r5130449-n442913 +r5189505-n146651 +r5189505-n181711 +r5189505-n325382 +r5189505-n392209 +r5189505-n468303 +r5189505-n598065 +r5189505-n680758 +r5189505-n920226 +r5261712-n134943 +r5261712-n440434 +r5261712-n442913 +r5261712-n691735 +r5573787-n181711 +r5573787-n325382 +r5573787-n680758 +r5573787-n920226 +r5715171-n134943 +r5715171-n440434 +r5715171-n442913 +r5715171-n691735 +r6102167-n181711 +r6102167-n325382 +r6102167-n680758 +r6102167-n920226 +r6272977-n181711 +r6272977-n325382 +r6272977-n680758 +r6272977-n920226 +r629115-n146651 +r629115-n181711 +r629115-n325382 +r629115-n392209 +r629115-n468303 +r629115-n598065 +r629115-n680758 +r629115-n920226 +r6341586-n146651 +r6341586-n181711 +r6341586-n325382 +r6341586-n392209 +r6341586-n468303 +r6341586-n598065 +r6341586-n680758 +r6341586-n920226 +r6531478-n181711 +r6531478-n325382 +r6531478-n680758 +r6531478-n920226 +r6631426-n181711 +r6631426-n325382 +r6631426-n680758 +r6631426-n920226 +r6682735-n146651 +r6682735-n181711 +r6682735-n325382 +r6682735-n392209 +r6682735-n468303 +r6682735-n598065 +r6682735-n680758 +r6682735-n920226 +r6760045-n146651 +r6760045-n181711 +r6760045-n325382 +r6760045-n392209 +r6760045-n468303 +r6760045-n598065 +r6760045-n680758 +r6760045-n920226 +r697496-n146651 +r697496-n181711 +r697496-n325382 +r697496-n392209 +r697496-n468303 +r697496-n598065 +r697496-n680758 +r697496-n920226 +r7217787-n146651 +r7217787-n181711 +r7217787-n244243 +r7217787-n325382 +r7217787-n392209 +r7217787-n468303 +r7217787-n598065 +r7217787-n680758 +r7217787-n920226 +r7343737-n146651 +r7343737-n181711 +r7343737-n325382 +r7343737-n392209 +r7343737-n468303 +r7343737-n598065 +r7343737-n680758 +r7343737-n920226 +r7831860-n181711 +r7831860-n325382 +r7831860-n680758 +r7831860-n920226 +r7839831-n146651 +r7839831-n181711 +r7839831-n325382 +r7839831-n392209 +r7839831-n468303 +r7839831-n598065 +r7839831-n680758 +r7839831-n920226 
+r7952476-n146651 +r7952476-n181711 +r7952476-n325382 +r7952476-n392209 +r7952476-n468303 +r7952476-n598065 +r7952476-n680758 +r7952476-n920226 +r8015356-n181711 +r8015356-n325382 +r8015356-n680758 +r8015356-n920226 +r8062914-n134943 +r8062914-n442913 +r8062914-n691735 +r8212643-n146651 +r8212643-n181711 +r8212643-n244243 +r8212643-n325382 +r8212643-n392209 +r8212643-n468303 +r8212643-n598065 +r8212643-n680758 +r8212643-n920226 +r8333645-n146651 +r8333645-n181711 +r8333645-n325382 +r8333645-n392209 +r8333645-n468303 +r8333645-n598065 +r8333645-n680758 +r8333645-n920226 +r8595196-n134943 +r8595196-n440434 +r8595196-n442913 +r8595196-n691735 +r8607415-n181711 +r8607415-n325382 +r8607415-n680758 +r8607415-n920226 +r8642123-n181711 +r8642123-n325382 +r8642123-n680758 +r8642123-n920226 +r8792496-n181711 +r8792496-n325382 +r8792496-n680758 +r8792496-n920226 +r8918301-n181711 +r8918301-n325382 +r8918301-n680758 +r8918301-n920226 +r8937440-n146651 +r8937440-n181711 +r8937440-n325382 +r8937440-n392209 +r8937440-n468303 +r8937440-n598065 +r8937440-n680758 +r8937440-n920226 +r8939293-n134943 +r8939293-n440434 +r8939293-n442913 +r8939293-n691735 +r9021574-n146651 +r9021574-n181711 +r9021574-n325382 +r9021574-n392209 +r9021574-n468303 +r9021574-n598065 +r9021574-n680758 +r9021574-n920226 +r9026042-n146651 +r9026042-n181711 +r9026042-n325382 +r9026042-n392209 +r9026042-n468303 +r9026042-n598065 +r9026042-n680758 +r9026042-n920226 +r9040233-n181711 +r9040233-n325382 +r9040233-n680758 +r9040233-n920226 +r9102715-n146651 +r9102715-n181711 +r9102715-n325382 +r9102715-n392209 +r9102715-n468303 +r9102715-n598065 +r9102715-n680758 +r9102715-n920226 +r9113711-n181711 +r9113711-n325382 +r9113711-n680758 +r9113711-n920226 +r9189566-n146651 +r9189566-n181711 +r9189566-n325382 +r9189566-n392209 +r9189566-n468303 +r9189566-n598065 +r9189566-n680758 +r9189566-n920226 +r9192091-n146651 +r9192091-n181711 +r9192091-n325382 +r9192091-n392209 +r9192091-n468303 +r9192091-n598065 +r9192091-n680758 +r9192091-n920226 +r9273661-n146651 +r9273661-n181711 +r9273661-n325382 +r9273661-n392209 +r9273661-n468303 +r9273661-n598065 +r9273661-n680758 +r9273661-n920226 +r9352821-n146651 +r9352821-n181711 +r9352821-n244243 +r9352821-n325382 +r9352821-n392209 +r9352821-n598065 +r9352821-n680758 +r9352821-n920226 +r9366523-n134943 +r9366523-n440434 +r9366523-n442913 +r9366523-n691735 +r9555635-n134943 +r9555635-n440434 +r9555635-n442913 +r9555635-n691735 +r9720335-n181711 +r9720335-n325382 +r9720335-n680758 +r9720335-n920226 +r9757054-n146651 +r9757054-n181711 +r9757054-n325382 +r9757054-n392209 +r9757054-n468303 +r9757054-n598065 +r9757054-n680758 +r9757054-n920226 diff --git a/raps/dataloaders/mit_supercloud/download.py b/raps/dataloaders/mit_supercloud/download.py new file mode 100644 index 0000000000000000000000000000000000000000..f46c5733f71d7167c554f8e95909035f16fe57ce --- /dev/null +++ b/raps/dataloaders/mit_supercloud/download.py @@ -0,0 +1,208 @@ +#!/usr/bin/env python3 +""" +Single‑script tool to: + 1) Ensure slurm-log.csv is present locally (download if missing) + 2) Filter jobs by submit date and classify CPU vs GPU by gres/tres + 3) Build or load a one‑time S3 manifest of trace keys (CPU & GPU) + 4) Filter that manifest by job IDs and download matching files + +Usage: + python download_data.py [--start DDMMYYYY] [--end DDMMYYYY] \ + [--partition all|part-cpu|part-gpu] \ + [--outdir PATH] [--max-jobs N] [--dry-run] + +Defaults: + --start 21052021 # 21 May 2021 (inclusive) + --end 22052021 # 22 May 2021 (exclusive) + +Flags: + 
--max-jobs N # Only process first N jobs (for testing) + --dry-run # List a sample of files without downloading +""" +# Suppress urllib3 InsecureRequestWarning +from .utils import ( + load_slurm_log, + build_or_load_manifest, + # filter_keys_by_jobs # Defined below! not in utils... +) +from tqdm import tqdm +from botocore.client import Config +from botocore import UNSIGNED +import boto3 +import pandas as pd +from datetime import datetime +import re +import os +import urllib3 +urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + + +# Default date window +DEFAULT_START = "21052021" +DEFAULT_END = "22052021" + + +def ensure_slurm_log(s3, bucket, key, dest): + if os.path.exists(dest): + print(f"Found existing slurm-log.csv at {dest}, skipping download.") + return + os.makedirs(os.path.dirname(dest), exist_ok=True) + print(f"Downloading slurm-log.csv → {dest}") + s3.download_file(bucket, key, dest) + print("Downloaded slurm-log.csv.") + + +def list_and_filter_jobs(csv_path, start, end, partition): + df = pd.read_csv(csv_path) + df['time_submit'] = pd.to_datetime(df['time_submit'], unit='s') + + dt0 = datetime.strptime(start, "%d%m%Y") + dt1 = datetime.strptime(end, "%d%m%Y") + window = df[(df['time_submit'] >= dt0) & (df['time_submit'] < dt1)] + + # Identify GPU-using jobs via gres_used or tres_alloc + gres = window['gres_used'].fillna("").astype(str) + tres = window['tres_alloc'].fillna("").astype(str) + gpu_jobs = set(window.loc[ + gres.str.contains("gpu", case=False) | + tres.str.contains(r"(?:1001|1002)=", regex=True), + 'id_job' + ]) + + if partition == 'part-gpu': + job_ids = sorted(gpu_jobs) + elif partition == 'part-cpu': + job_ids = sorted(set(window['id_job']) - gpu_jobs) + else: + job_ids = sorted(window['id_job'].unique()) + + print(f"Selected {len(job_ids)} jobs from {dt0.date()} to {dt1.date()} on {partition}.") + return job_ids + + +def build_manifest(s3, bucket, prefix, manifest_path): + """ + One-time listing of all CSV keys under cpu/ and gpu/ prefixes. + Writes each key to manifest_path. + """ + cpu_pref = prefix + 'cpu/' + gpu_pref = prefix + 'gpu/' + paginator = s3.get_paginator('list_objects_v2') + os.makedirs(os.path.dirname(manifest_path), exist_ok=True) + with open(manifest_path, 'w') as mf: + # CPU + print("Building manifest: listing CPU keys...") + for page in tqdm(paginator.paginate(Bucket=bucket, Prefix=cpu_pref), desc="CPU pages", unit="page"): + for obj in page.get('Contents', []): + key = obj['Key'] + if key.lower().endswith('.csv'): + mf.write(key + '\n') + # GPU + print("Building manifest: listing GPU keys...") + for page in tqdm(paginator.paginate(Bucket=bucket, Prefix=gpu_pref), desc="GPU pages", unit="page"): + for obj in page.get('Contents', []): + key = obj['Key'] + if key.lower().endswith('.csv'): + mf.write(key + '\n') + print(f"Manifest written to {manifest_path}.") + + +# def load_manifest(manifest_path): +# with open(manifest_path) as f: +# return [line.strip() for line in f] + + +def filter_keys_by_jobs(keys, job_ids): + """ + Parse job ID from start of filename (-...) or via -r- in GPU names, + keep only keys matching job_ids. 
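+
+    For example (hypothetical keys, shown only to illustrate the matching rules):
+        '.../cpu/1234567-timeseries.csv' -> job 1234567 (ID at the start of the name)
+        '.../gpu/node-r1234567-0.csv'    -> job 1234567 (via the '-r<jobid>-' pattern)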
+ """ + sel = [] + for key in keys: + fname = os.path.basename(key) + # Try CPU style: '-' + parts = fname.split('-', 1) + jid = None + try: + jid = int(parts[0]) + except ValueError: + # Try GPU style: '-r-' + m = re.search(r'-r(\d+)-', fname) + if m: + jid = int(m.group(1)) + if jid and jid in job_ids: + sel.append(key) + return sel + + +def download_traces(s3, bucket, prefix, outdir, keys, dry_run): + if dry_run: + print("Dry-run: sample of matching keys:") + for key in keys[:10]: + print(" ", key) + return + for key in tqdm(keys, desc="Downloading traces"): + rel = key[len(prefix):] + dest = os.path.join(outdir, rel) + if os.path.exists(dest): + tqdm.write(f"Warning: {dest} exists, skipping.") + continue + os.makedirs(os.path.dirname(dest), exist_ok=True) + s3.download_file(bucket, key, dest) + print("All requested traces downloaded.") + + +def download(args): + """ + Subcommand entrypoint for 'mit_supercloud download'. + Downloads slurm-log.csv and all matching CPU/GPU trace files from S3. + """ + # 1) Initialize anonymous S3 client with SSL verification disabled + s3 = boto3.client( + 's3', + config=Config(signature_version=UNSIGNED), + verify=False + ) + + # 2) Ensure local copy of slurm-log.csv + slurm_key = f"{args.prefix}slurm-log.csv" + slurm_path = os.path.join(args.outdir, 'slurm-log.csv') + ensure_slurm_log(s3, args.bucket, slurm_key, slurm_path) + + # 3) Load and filter SLURM log to determine CPU/GPU job sets + _, cpu_jobs, gpu_jobs = load_slurm_log( + slurm_path, + args.start, + args.end + ) + if args.partition == 'part-cpu': + job_ids = cpu_jobs + elif args.partition == 'part-gpu': + job_ids = gpu_jobs + else: + job_ids = cpu_jobs | gpu_jobs + if args.max_jobs: + job_ids = set(list(job_ids)[:args.max_jobs]) + print(f"Processing {len(job_ids)} jobs (partition={args.partition})") + + # 4) Build or load the one-time manifest of all trace keys + manifest_path = os.path.join(args.outdir, 'file_manifest.txt') + all_keys = build_or_load_manifest( + s3, args.bucket, args.prefix, manifest_path + ) + + # 5) Filter manifest to only the trace keys for our job IDs + trace_keys = filter_keys_by_jobs(all_keys, job_ids) + cpu_count = sum(1 for k in trace_keys if k.startswith(f"{args.prefix}cpu/")) + gpu_count = len(trace_keys) - cpu_count + print(f"Total matching trace files: {len(trace_keys)} (CPU: {cpu_count}, GPU: {gpu_count})") + + # 6) Download or dry-run + download_traces( + s3, + args.bucket, + args.prefix, + args.outdir, + trace_keys, + args.dry_run + ) diff --git a/raps/dataloaders/mit_supercloud/gpu_nodes.txt b/raps/dataloaders/mit_supercloud/gpu_nodes.txt new file mode 100644 index 0000000000000000000000000000000000000000..d4c14a4b3c1965f417418c3cd8541d845e4f7431 --- /dev/null +++ b/raps/dataloaders/mit_supercloud/gpu_nodes.txt @@ -0,0 +1,228 @@ +r1457839-n386398 +r1457839-n851693 +r1457839-n911952 +r1457839-n976057 +r1485405-n386398 +r1485405-n43543 +r1485405-n685852 +r1485405-n830961 +r1485405-n851693 +r1485405-n911952 +r1485405-n976057 +r1682297-n386398 +r1682297-n43543 +r1682297-n685852 +r1682297-n830961 +r1682297-n851693 +r1682297-n911952 +r1682297-n976057 +r2100214-n386398 +r2100214-n851693 +r2100214-n911952 +r2100214-n976057 +r2582019-n386398 +r2582019-n851693 +r2582019-n911952 +r2582019-n976057 +r2652301-n386398 +r2652301-n43543 +r2652301-n685852 +r2652301-n830961 +r2652301-n851693 +r2652301-n911952 +r2652301-n976057 +r2825489-n136082 +r2825489-n139058 +r2825489-n208530 +r2998125-n136082 +r2998125-n139058 +r2998125-n208530 +r3041626-n386398 +r3041626-n43543 
+r3041626-n685852 +r3041626-n830961 +r3041626-n851693 +r3041626-n911952 +r3041626-n976057 +r3117156-n136082 +r3117156-n139058 +r3117156-n208530 +r3210026-n172998 +r3226521-n386398 +r3226521-n43543 +r3226521-n685852 +r3226521-n830961 +r3226521-n851693 +r3226521-n911952 +r3226521-n976057 +r3386633-n172998 +r3405251-n136082 +r3405251-n139058 +r3405251-n208530 +r3475376-n136082 +r3475376-n139058 +r3475376-n208530 +r3741709-n386398 +r3741709-n43543 +r3741709-n685852 +r3741709-n830961 +r3741709-n851693 +r3741709-n911952 +r3741709-n976057 +r3879907-n136082 +r3879907-n139058 +r3879907-n208530 +r4179716-n386398 +r4179716-n851693 +r4179716-n911952 +r4179716-n976057 +r4229531-n386398 +r4229531-n851693 +r4229531-n911952 +r4229531-n976057 +r4327055-n136082 +r4327055-n139058 +r4327055-n208530 +r4774426-n172998 +r4822976-n136082 +r4822976-n139058 +r4822976-n208530 +r4858666-n386398 +r4858666-n43543 +r4858666-n685852 +r4858666-n830961 +r4858666-n851693 +r4858666-n911952 +r4858666-n976057 +r5130449-n136082 +r5130449-n139058 +r5130449-n208530 +r5189505-n386398 +r5189505-n43543 +r5189505-n685852 +r5189505-n830961 +r5189505-n851693 +r5189505-n911952 +r5189505-n976057 +r5573787-n386398 +r5573787-n851693 +r5573787-n911952 +r5573787-n976057 +r5715171-n136082 +r5715171-n139058 +r5715171-n208530 +r6272977-n386398 +r6272977-n851693 +r6272977-n911952 +r6272977-n976057 +r629115-n386398 +r629115-n43543 +r629115-n685852 +r629115-n830961 +r629115-n851693 +r629115-n911952 +r629115-n976057 +r6760045-n386398 +r6760045-n43543 +r6760045-n685852 +r6760045-n830961 +r6760045-n851693 +r6760045-n911952 +r6760045-n976057 +r7217787-n386398 +r7217787-n43543 +r7217787-n685852 +r7217787-n830961 +r7217787-n851693 +r7217787-n911952 +r7217787-n976057 +r7343737-n386398 +r7343737-n43543 +r7343737-n685852 +r7343737-n830961 +r7343737-n851693 +r7343737-n911952 +r7343737-n976057 +r8015356-n386398 +r8015356-n851693 +r8015356-n911952 +r8015356-n976057 +r8062914-n136082 +r8062914-n139058 +r8062914-n208530 +r8333645-n386398 +r8333645-n43543 +r8333645-n685852 +r8333645-n830961 +r8333645-n851693 +r8333645-n911952 +r8333645-n976057 +r8579942-n136082 +r8579942-n139058 +r8579942-n208530 +r8586363-n172998 +r8607415-n386398 +r8607415-n851693 +r8607415-n911952 +r8607415-n976057 +r8642123-n386398 +r8642123-n851693 +r8642123-n911952 +r8642123-n976057 +r8937440-n386398 +r8937440-n43543 +r8937440-n685852 +r8937440-n830961 +r8937440-n851693 +r8937440-n911952 +r8937440-n976057 +r8939293-n136082 +r8939293-n139058 +r8939293-n208530 +r9040233-n386398 +r9040233-n851693 +r9040233-n911952 +r9040233-n976057 +r9102715-n386398 +r9102715-n43543 +r9102715-n685852 +r9102715-n830961 +r9102715-n851693 +r9102715-n911952 +r9102715-n976057 +r9175025-n386398 +r9175025-n851693 +r9175025-n911952 +r9175025-n976057 +r9189566-n386398 +r9189566-n43543 +r9189566-n685852 +r9189566-n830961 +r9189566-n851693 +r9189566-n911952 +r9189566-n976057 +r9192091-n386398 +r9192091-n43543 +r9192091-n685852 +r9192091-n830961 +r9192091-n851693 +r9192091-n911952 +r9192091-n976057 +r9352821-n386398 +r9352821-n43543 +r9352821-n685852 +r9352821-n830961 +r9352821-n851693 +r9352821-n911952 +r9352821-n976057 +r9535192-n386398 +r9535192-n851693 +r9535192-n911952 +r9535192-n976057 +r9555635-n136082 +r9555635-n139058 +r9555635-n208530 +r9720335-n386398 +r9720335-n851693 +r9720335-n911952 +r9720335-n976057 diff --git a/raps/dataloaders/mit_supercloud/loader.py b/raps/dataloaders/mit_supercloud/loader.py new file mode 100644 index 0000000000000000000000000000000000000000..fc16ade2e3c29a1898a55ad138ec34e750168197 
--- /dev/null +++ b/raps/dataloaders/mit_supercloud/loader.py @@ -0,0 +1,638 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +MIT Supercloud data loader + +This module extracts and processes job traces from the MIT SuperCloud dataset, +starting with slurm-log.csv file, and then searching for the files in the cpu +and gpu directories. The main paper associated with the MIT Supercloud Dataset +is available here: https://arxiv.org/abs/2108.02037. +There is more information available here: https://dcc.mit.edu/ + +Note, that quite a bit of filtering is done with sanity checks to make sure +the the CPU traces match the GPU traces, etc. At this point it's not uncommon +if there may be 1569 total jobs in the time range, only 834 cpu jobs and 128 +gpu jobs (962 total) are able to be replayed. This is an issue which will likely +have to be improved in the future. + +--------------------------------------------------------------------------- +Understanding some of the errors. We track the different reasons that +less than the total number of jobs in the slurm log actually run in the +simulator. This is not so much an issue for the CPU partition, but for +the GPU partition, where we have to combine traces extracted from both +CPU trace files and GPU trace files. + +At the beginning of the GPU partition analysis, we give an analysis such as: + + --- Detailed Job Accounting for Partition 'part-gpu' --- + Initial jobs considered: 519 + * Jobs with NO trace file found: 69 (519 - 450) + + Of the 450 jobs with traces: + * 289 jobs have only CPU traces (417 - 128) + * 33 jobs have only GPU traces (161 - 128) + * 128 jobs have BOTH CPU and GPU traces. + ---------------------------------------------------- + +We give a summary report at the end of the data loading process. An +example report is shown for the range `--start 2021-05-21T00:00 --end 2021-05-22T00:00` + + Skipped jobs summary: + - nodes_alloc > 480: 0 + - pruned_nodes: 1 + - no_trace_file: 69 + - no_cpu_trace_for_gpu_job: 41 + - final_gpu_none_mixed: 289 + - final_cpu_none_mixed: 33 + + [INFO] Partition 'mit_supercloud/part-cpu': 834 jobs loaded + [INFO] Partition 'mit_supercloud/part-gpu': 128 jobs loaded + +We explain each of these stats here. + + - `nodes_alloc > 480`: the number of jobs that are thrown out because + they request more than 480 nodes. + + - `pruned_nodes`: the number of jobs thrown out because the node was + listed in `prune_list.txt`. + + - `no_trace_file`: the number of jobs that were found in the Slurm log + for the correct time window and partition, but for which not a single + corresponding trace file (neither CPU nor GPU) could be found on the filesystem. + + - `no_cpu_trace_for_gpu_job`: The number of jobs that had a GPU trace file + but were discarded because they were missing their required corresponding + CPU trace file. + + - `final_gpu_none_mixed`: The number of jobs in a GPU partition run that had + a CPU trace but were missing the final, processed GPU trace data. + + - `final_cpu_none_mixed`: The number of jobs in a GPU partition run that were + missing the essential CPU trace data during the final job construction phase. + +Now, we work on debugging some of these. For example, for `no_cpu_trace_for_gpu_job`, +we can take the jid from the warning message: + + [WARNING] → no cpu trace for gpu! 
(jid=4074251073298) SKIPPING + +And then check the data directory to see if it can find trace files for both the cpu +and gpu: + + > find ~/data/mit/202201 -name '4074251073298*' + +--------------------------------------------------------------------------- +How we curated and generated the node ids: cpu_nodes.txt and gpu_nodes.txt + +Node filtering based on observed resource allocation history. + +Summary of node filtering: + +- A total of 1135 unique node IDs were extracted from `slurm-log.csv`. +- Of these, 228 were identified as GPU-capable nodes (recorded in `gpu_nodes.txt`). +- The remaining 907 nodes were treated as CPU-only candidates. + +Filtering steps: + +1. Jobs with `nodes_alloc > 480` were excluded, based on the assumption that + such large allocations span across GPU nodes. This removed 413 nodes, + leaving 494 candidate CPU-only nodes. + +2. To reach the target of 480 CPU nodes, we analyzed job frequency per node + and pruned the 14 least-used nodes (those with only 1–26 jobs). + These pruned nodes are listed in `prune_list.txt`. + +The final list of CPU-only nodes is stored in `cpu_nodes.txt`, and the list +of GPU nodes are stored in `gpu_nodes.txt`. + +Note: To locate the pruning logic, search for the keyword "prune" in the code. +""" + +import ast +import os +import math +import pandas as pd +import re + +from tqdm import tqdm +from typing import Dict, Union, Optional +from collections import Counter +from datetime import datetime, timezone + +from raps.job import job_dict, Job +from raps.utils import summarize_ranges, WorkloadData +from .utils import proc_cpu_series, proc_gpu_series, to_epoch +from .utils import DEFAULT_START, DEFAULT_END + +TRES_ID_MAP = { + 1: "cpu", + 2: "mem", # in MB + 3: "energy", + 4: "gres/gpu", + 5: "billing", +} +GREEN = "\033[32m" +YELLOW = "\033[33m" +RED = "\033[31m" +RESET = "\033[0m" + + +def parse_tres_alloc(tres_str: Union[str, None], + id_map: Optional[Dict[int, str]] = None, + return_ids: bool = False, + stats: Counter = None) -> Dict[Union[int, str], int]: + """ + Parse a Slurm tres_alloc/tres_req field like: '1=20,2=170000,4=1,5=20' + + Parameters + ---------- + tres_str : str | None + The raw TRES string from Slurm (quotes OK). If None/empty returns {}. + id_map : dict[int,str] | None + Optional mapping from TRES numeric IDs to friendly names. + Falls back to TRES_ID_MAP if not provided. + return_ids : bool + If True, keys are the numeric IDs. If False, keys use id_map names + (falls back to the numeric ID as a string if unknown). + stats : Counter + Optional counter to track parsing errors. + + Returns + ------- + dict + Parsed key/value pairs. Example: + {'cpu': 20, 'mem': 170000, 'gres/gpu': 1, 'billing': 20} + """ + if pd.isna(tres_str): + return {} + tres_str = str(tres_str) + + id_map = id_map or TRES_ID_MAP + + # strip quotes or whitespace + tres_str = tres_str.strip().strip('"').strip("'") + + if not tres_str: + return {} + + # Split on commas, but be tolerant of spaces + parts = [p for p in tres_str.split(",") if p] + + out: Dict[Union[int, str], int] = {} + + for p in parts: + m = re.match(r"\s*(\d+)\s*=\s*([0-9]+)\s*$", p) + if not m: + if stats is not None: + stats["malformed_tres"] += 1 + # skip or raise; here we skip silently + continue + tid = int(m.group(1)) + val = int(m.group(2)) + if return_ids: + out[tid] = val + else: + key = id_map.get(tid, str(tid)) + out[key] = val + + return out + + +def load_data(local_dataset_path, **kwargs): + """ + Load MIT Supercloud job traces **without** any metadata files. 
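+
+    A minimal illustrative call (the path, dates and config values are assumptions
+    made for this example, not values shipped with the dataset):
+
+        workload = load_data("~/data/mit/202201",
+                             start="2021-05-21T00:00",
+                             end="2021-05-22T00:00",
+                             partition="part-gpu",
+                             config=config)  # config: per-system dict with GPUS_PER_NODE, TRACE_QUANTA, ...
+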
+ Expects under: + local_dataset_path/ + [.../] + slurm-log.csv + cpu/...-timeseries.csv + gpu/...-timeseries.csv + Returns: + jobs_list, sim_start_time, sim_end_time + """ + debug = kwargs.get("debug") + config = kwargs.get("config") + NL_PATH = os.path.dirname(__file__) + + skip_counts = Counter() + + # unpack + if isinstance(local_dataset_path, list): + if len(local_dataset_path) != 1: + raise ValueError("Expect exactly one path") + local_dataset_path = local_dataset_path[0] + + # slurm log -> DataFrame + slurm_path = None + for root, _, files in os.walk(os.path.expanduser(local_dataset_path)): + if "slurm-log.csv" in files: + slurm_path = os.path.join(root, "slurm-log.csv") + break + + if not slurm_path: + raise FileNotFoundError(f"Could not find slurm-log.csv under {local_dataset_path}") + + data_root = os.path.dirname(slurm_path) + sl = pd.read_csv(slurm_path) + sl["__line__"] = sl.index + 2 + + # date window + start_ts = to_epoch(kwargs.get("start") or DEFAULT_START) + end_ts = to_epoch(kwargs.get("end") or DEFAULT_END) + + mask = (sl.time_submit >= start_ts) & (sl.time_submit < end_ts) + sl = sl[mask] + + if debug: + print(f"[DEBUG] After time filtering: {len(sl)} jobs") + hits = sl.loc[mask] + lines = hits["__line__"].tolist() + print(f"data sourced from {len(lines)} records in slurm-log.csv. Line number ranges:", + summarize_ranges(lines)) + + # --- prune out oversized jobs and known under‑used hosts --- + # load list of underutilized nodes to ignore + pruned = set() + with open(os.path.join(NL_PATH, "prune_list.txt")) as pf: + pruned = {l_.strip() for l_ in pf if l_.strip()} + + before_prune = len(sl) + # only keep jobs requesting <= 480 nodes + sl = sl[sl.nodes_alloc <= 480] + after_alloc_filter = len(sl) + skip_counts['nodes_alloc > 480'] += (before_prune - after_alloc_filter) + + # drop any job whose nodelist includes a pruned node + sl["nodes_list"] = sl["nodelist"].apply(ast.literal_eval) + + def is_pruned(lst): + matches = [n for n in lst if n in pruned] + if matches: + if debug: + print(f"[DEBUG] Skipping job due to pruned nodes: {matches}") + return True + return False + + before_prune_filter = len(sl) + sl = sl[~sl["nodes_list"].apply(is_pruned)] + after_prune_filter = len(sl) + skip_counts['pruned_nodes'] += (before_prune_filter - after_prune_filter) + + if debug: + print(f"[DEBUG] After pruning: {len(sl)} jobs") + + # —— ERROR CATCH: no jobs in this window? —— + if sl.empty: + raise ValueError( + f"No SLURM jobs found between {kwargs.get('start')} and " + f"{kwargs.get('end')}. Please pick a range covered by the dataset." 
+ ) + + # detect GPU‐using jobs + gres = sl.gres_used.fillna("").astype(str) + tres = sl.tres_alloc.fillna("").astype(str) + + gpu_jobs = set(sl.loc[ + gres.str.contains("gpu", case=False) | + tres.str.contains(r"(?:1001|1002)=", regex=True), + "id_job" + ]) + + # partition mode + part = (kwargs.get("partition") or "").split("/")[-1].lower() + cpu_only = (part == "part-cpu") + mixed = (part == "part-gpu") + + # handle single-partition configs (e.g., mit_supercloud.yaml) + if not cpu_only and not mixed: + gpus_per_node = config.get("GPUS_PER_NODE") + + if gpus_per_node == 0: + cpu_only = True + part = "part-cpu" + else: + mixed = True + part = "part-gpu" + + # create nodelist mapping + if cpu_only: + with open(os.path.join(NL_PATH, "cpu_nodes.txt")) as f: + cpu_nodes = [l_.strip() for l_ in f if l_.strip()] + cpu_node_to_idx = {h: i for i, h in enumerate(cpu_nodes)} + else: # cpu + gpu + with open(os.path.join(NL_PATH, "gpu_nodes.txt")) as f: + gpu_nodes = [l_.strip() for l_ in f if l_.strip()] + gpu_node_to_idx = {h: i for i, h in enumerate(gpu_nodes)} + + if cpu_only: + job_ids = set(sl.id_job) - gpu_jobs + # skip_counts['gpu_job_in_cpu_mode'] += len(set(sl.id_job) & gpu_jobs) + elif mixed: + job_ids = gpu_jobs & set(sl.id_job) + # skip_counts['cpu_job_in_gpu_mode'] += len(set(sl.id_job) - gpu_jobs) + else: + job_ids = set(sl.id_job) + + print(f"{GREEN}→ mode={part}, jobs: {len(job_ids)}{RESET}") + + # find trace files by walking directories + cpu_files = [] + cpu_root = os.path.join(data_root, "cpu") + if os.path.exists(cpu_root): + for R, _, fs in os.walk(cpu_root): + for f in fs: + if not f.endswith("-timeseries.csv"): + continue + try: + jid = int(f.split("-", 1)[0]) + if jid in job_ids: + cpu_files.append(os.path.join(R, f)) + except (ValueError, IndexError): + continue + + gpu_files = [] + gpu_root = os.path.join(data_root, "gpu") + if os.path.exists(gpu_root): + for R, _, fs in os.walk(gpu_root): + for f in fs: + if not f.endswith(".csv"): + continue + try: + jid = int(f.split("-", 1)[0]) + if jid in job_ids: + gpu_files.append(os.path.join(R, f)) + except (ValueError, IndexError): + continue + + cpu_ids = {int(os.path.basename(p).split('-', 1)[0]) for p in cpu_files} + gpu_ids = {int(os.path.basename(p).split('-', 1)[0]) for p in gpu_files} + all_trace_ids = cpu_ids | gpu_ids + + print(f"→ {len(cpu_files)} CPU files, {len(gpu_files)} GPU files → {len(all_trace_ids)} jobs with traces") + + if mixed: + # Perform a full accounting of all jobs considered for the partition. 
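+        # cpu_ids and gpu_ids are the job IDs recovered from the trace filenames;
+        # job_ids - all_trace_ids is therefore the set of jobs with no trace file
+        # of either kind, while cpu_ids & gpu_ids are the jobs with both, the only
+        # candidates that can survive GPU-partition replay.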
+ jobs_with_no_traces = len(job_ids - all_trace_ids) + jobs_with_traces = len(all_trace_ids) + + print(f"\n--- Detailed Job Accounting for Partition '{part}' ---") + print(f"Initial jobs considered: {len(job_ids)}") + print(f" * Jobs with NO trace file found: {jobs_with_no_traces} ({len(job_ids)} - {jobs_with_traces})\n") + + if jobs_with_traces > 0: + overlap_count = len(cpu_ids & gpu_ids) + cpu_only_count = len(cpu_ids) - overlap_count + gpu_only_count = len(gpu_ids) - overlap_count + print(f"Of the {jobs_with_traces} jobs with traces:") + print(f" * {cpu_only_count} jobs have only CPU traces ({len(cpu_ids)} - {overlap_count})") + print(f" * {gpu_only_count} jobs have only GPU traces ({len(gpu_ids)} - {overlap_count})") + print(f" * {overlap_count} jobs have BOTH CPU and GPU traces.") + print("----------------------------------------------------\n") + + data = {} + + traced_jobs = all_trace_ids + untraced_jobs = job_ids - traced_jobs + skip_counts['no_trace_file'] += len(untraced_jobs) + + # CPU first + for fp in tqdm(cpu_files, desc="Loading CPU traces"): + df = pd.read_csv(fp, dtype={0: str}) + jid = int(os.path.basename(fp).split("-", 1)[0]) + rec = data.setdefault(jid, {}) + + # Find job info in slurm log and print details + job_info = sl[sl.id_job == jid] + if job_info.empty: + skip_counts['job_not_in_slurm_log'] += 1 + if debug: + tqdm.write(f"Reading CPU {os.path.basename(fp)} for Job ID: {jid} (No slurm info found)") + continue + + job_row = job_info.iloc[0] + if debug: + start_time = job_row.get('time_start', 'N/A') + wall_time = job_row.get('time_limit', 'N/A') + tres_alloc = job_row.get('tres_alloc', 'N/A') + tres_alloc_dict = parse_tres_alloc(tres_alloc) + rec["tres_alloc_dict"] = tres_alloc_dict + # gres_used = job_row.get('gres_used', 'N/A') # Unused + + tqdm.write(f"Reading CPU {os.path.basename(fp)} for Job ID: {jid}") + tqdm.write(f" Start Time: {start_time}, Wall Time: {wall_time}s") + tqdm.write(f" TRES Alloc: {tres_alloc_dict}") + + tres_alloc = job_row.get('tres_alloc', 'N/A') + tres_alloc_dict = parse_tres_alloc(tres_alloc, stats=skip_counts) + rec["tres_alloc_dict"] = tres_alloc_dict + + raw = job_row.get("nodelist", "") + hosts = ast.literal_eval(raw) + # Get allocated nodes "['r9189566-n911952','r9189567-n...']" + try: + if cpu_only: + rec["scheduled_nodes"] = [cpu_node_to_idx[h] for h in hosts] + else: + rec["scheduled_nodes"] = [gpu_node_to_idx[h] for h in hosts] + except KeyError as e: + skip_counts['unrecognized_node_name'] += 1 + if debug: + print(f"Skipping job {jid} due to unrecognized node name: {e}") + continue + + rec["nodes_alloc"] = int(job_row["nodes_alloc"]) + rec["cpu"] = proc_cpu_series(df) + # print(f'{RED}{rec["cpu"]}{RESET}') + + if debug: + print(f"GPU candidate files ({len(gpu_files)}):") + for p in gpu_files[:10]: + print(" ", p) + + # data from the cpu processes are all stored under the `data` dictionary + # according to their respective jid key + # print("******", data.keys()) + + for fp in tqdm(gpu_files, desc="Loading GPU traces"): + + if not os.path.exists(fp): + if debug: + print(f"{YELLOW}[WARNING] gpu path {fp!r} doesn't exist skipping{RESET}") + skip_counts['gpu_path_does_not_exist'] += 1 + continue + + if debug: + tqdm.write(f"Reading GPU {os.path.basename(fp)}") + dfi = pd.read_csv(fp, dtype={0: str}) + if "gpu_index" not in dfi.columns: + if debug: + tqdm.write("[WARNING] → no gpu_index column! 
SKIPPING") + skip_counts['no_gpu_index_column'] += 1 + continue + + jid = int(os.path.basename(fp).split("-", 1)[0]) + rec = data.setdefault(jid, {}) + cpu_df = rec.get("cpu") + # print(f"{YELLOW}jid={jid} {cpu_df}{RESET}") + if cpu_df is None: + if debug: + tqdm.write(f"{YELLOW}[WARNING] → no cpu trace for gpu! (jid={jid}) SKIPPING{RESET}") + skip_counts['no_cpu_trace_for_gpu_job'] += 1 + continue + + gpu_cnt = rec.get("gpu_cnt", 0) + gpu_ser, gpu_cnt = proc_gpu_series(cpu_df, dfi, gpu_cnt) + + gpu_cnt = data[jid].get("gpu_cnt", 0) + prev_gpu = data[jid].get("gpu") + gpu_ser, gpu_cnt = proc_gpu_series(cpu_df, dfi, gpu_cnt) + if prev_gpu is None: + data[jid]["gpu"] = gpu_ser + else: + data[jid]["gpu"] = pd.merge(prev_gpu, gpu_ser, on="utime") + data[jid]["gpu_cnt"] = gpu_cnt + + if debug: + print(f"[DEBUG] proc_gpu_series returned {len(gpu_ser)} rows (gpu_cnt={gpu_cnt})") + + if "gpu" in rec: + rec["gpu"] = pd.merge(rec["gpu"], gpu_ser, on="utime", how="outer") + else: + rec["gpu"] = gpu_ser + rec["gpu_cnt"] = gpu_cnt + + gpu_df = rec["gpu"] + + # grab all the gpu-util columns + util_cols = [c for c in gpu_df.columns if c.startswith("gpu_util_")] + + if not util_cols: + # no gpu utilization columns? zero out + rec["gpu_trace"] = [] + else: + # as floats in [0,1] + raw = gpu_df[util_cols].astype(float).div(100) + + # average across devices + avg_util = raw.mean(axis=1) + + # scale by number of nodes requested + nodes = rec.get("nodes_alloc") + rec["gpu_trace"] = (avg_util * nodes).tolist() + + # merge slurm metadata + for _, row in sl.iterrows(): + jid = row.id_job + if jid in data and 'id_job' not in data[jid]: + data[jid].update(row.to_dict()) + + # build final job_dicts + jobs_list = [] + + # Get CPUS_PER_NODE and GPUS_PER_NODE from config + cpus_per_node = config.get('CPUS_PER_NODE') + cores_per_cpu = config.get('CORES_PER_CPU') + # gpus_per_node = config.get('GPUS_PER_NODE') # Unused + + quanta = config.get('TRACE_QUANTA') + + for jid, rec in data.items(): + nr = rec.get("nodes_alloc") + if nr is None: + skip_counts['final_missing_nodes_alloc'] += 1 + continue + + cpu = rec.get("cpu") + gpu = rec.get("gpu_trace") + + cpu_tr = [] + gpu_tr = [] + t0, t1 = 0, 0 + + if cpu_only: + if cpu is None: + skip_counts['final_cpu_none_cpu_only'] += 1 + continue + cpu_tr = cpu.cpu_utilisation.tolist() + gpu_tr = [0] # Ensure gpu_tr is a list for max() operation + t0, t1 = cpu.utime.min(), cpu.utime.max() + elif mixed: + if cpu is None: + skip_counts['final_cpu_none_mixed'] += 1 + continue + if gpu is None: + skip_counts['final_gpu_none_mixed'] += 1 + continue + cpu_tr = cpu.cpu_utilisation.tolist() + gpu_tr = gpu + t0, t1 = cpu.utime.min(), cpu.utime.max() + else: # not cpu_only or mixed + skip_counts['final_unhandled_partition'] += 1 + continue + + # Calculate cpu_cores_required and gpu_units_required from tres_alloc + if "tres_alloc_dict" not in rec: + skip_counts['final_missing_tres_alloc'] += 1 + continue + + total_cpu = rec["tres_alloc_dict"].get('cpu', 0) + # Can either allocate gpu:volta (1002) or gpu:tesla (1001) but not both + total_gpu = rec["tres_alloc_dict"].get('1002') or rec["tres_alloc_dict"].get(1001, 0) + + cpu_cores_req = math.ceil(total_cpu / nr) + gpu_units_req = math.ceil(total_gpu / nr) + + # sometimes there are spurious large values for cpu util - set max limit based on peak + cpu_peak = cpu_cores_req / cores_per_cpu / cpus_per_node # Is this per CPU? 
+ cpu_tr = [min(x/cores_per_cpu/cpus_per_node, cpu_peak) for x in cpu_tr] + + start_time = t0 - start_ts + end_time = t1 - start_ts + submit_time = rec.get("time_submit") - start_ts + scheduled_nodes = rec.get("scheduled_nodes") + + current_job_dict = job_dict( + nodes_required=nr, + cpu_cores_required=cpu_cores_req, + gpu_units_required=gpu_units_req, + name=rec.get("name_job", "unknown"), + account=rec.get("id_user", "unknown"), + cpu_trace=cpu_tr, + gpu_trace=gpu_tr, + ntx_trace=[], + nrx_trace=[], + end_state=rec.get("state_end", "unknown"), + id=jid, + scheduled_nodes=scheduled_nodes, + priority=rec.get("priority", 0), + submit_time=submit_time, + time_limit=rec.get("timelimit") * 60, + start_time=start_time, + end_time=end_time, + expected_run_time=max(0, t1-t0), + trace_time=len(cpu_tr)*quanta, + trace_start_time=0, + trace_end_time=len(cpu_tr)*quanta, + trace_quanta=quanta + ) + job = Job(current_job_dict) + jobs_list.append(job) + + # Calculate min_overall_utime and max_overall_utime + # min_overall_utime = int(sl.time_submit.min()) + # max_overall_utime = int(sl.time_submit.max()) + + # args_namespace = SimpleNamespace( + # fastforward=min_overall_utime, + # system='mit_supercloud', + # time=max_overall_utime + # ) + + print("\nSkipped jobs summary:") + for reason, count in skip_counts.items(): + print(f"- {reason}: {count}") + + return WorkloadData( + jobs=jobs_list, + telemetry_start=0, telemetry_end=int(end_ts - start_ts), + start_date=datetime.fromtimestamp(start_ts, timezone.utc), + ) diff --git a/raps/dataloaders/mit_supercloud/prune_list.txt b/raps/dataloaders/mit_supercloud/prune_list.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e455245d46f9b22fb2bce36750ecd33e3e5b53f --- /dev/null +++ b/raps/dataloaders/mit_supercloud/prune_list.txt @@ -0,0 +1,14 @@ +r1356503-n172998 +r2627558-n172998 +r3045754-n48252 +r3236768-n172998 +r9836048-n172998 +r9115114-n172998 +r5130449-n691735 +r6491112-n172998 +r9541411-n172998 +r6682735-n244243 +r629115-n244243 +r7839831-n244243 +r189256-n244243 +r4858666-n244243 diff --git a/raps/dataloaders/mit_supercloud/utils.py b/raps/dataloaders/mit_supercloud/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..8d51c482e2c7aa7017d7e33e01a81e6634dae875 --- /dev/null +++ b/raps/dataloaders/mit_supercloud/utils.py @@ -0,0 +1,252 @@ +import numpy as np +import os +import re +import pandas as pd + +from datetime import datetime +from scipy.sparse import csr_matrix as csr +from tqdm import tqdm + +DEFAULT_START = "2021-05-21T00:00" +DEFAULT_END = "2021-05-22T00:00" + + +def to_epoch(s: str) -> int: + try: + return int(datetime.fromisoformat(s).timestamp()) + except ValueError: + return int(datetime.strptime(s, "%d%m%Y").timestamp()) + + +def parse_dt(s: str) -> datetime: + try: + # handles 'YYYY-MM-DD' or 'YYYY-MM-DDTHH:MM[:SS]' + return datetime.fromisoformat(s) + except ValueError: + # legacy support for DDMMYYYY → midnight + return datetime.strptime(s, "%d%m%Y") + + +def load_slurm_log(slurm_path: str, start_date: str, end_date: str): + """ + Load Slurm log and filter jobs by submission window. 
+ + Args: + slurm_path: Path to local slurm-log.csv + start_date: "DDMMYYYY" inclusive start + end_date: "DDMMYYYY" exclusive end + + Returns: + tuple( + pandas.DataFrame filtered on date window, + set of CPU-only job IDs, + set of GPU-using job IDs + ) + """ + df = pd.read_csv(slurm_path) + # Convert submit times + df['time_submit'] = pd.to_datetime(df['time_submit'], unit='s') + dt0 = parse_dt(start_date) + dt1 = parse_dt(end_date) + window = df[(df['time_submit'] >= dt0) & (df['time_submit'] < dt1)] + + # Detect GPU jobs via gres_used or tres_alloc + gres = window['gres_used'].fillna("").astype(str) + tres = window['tres_alloc'].fillna("").astype(str) + gpu_jobs = set( + window.loc[ + gres.str.contains("gpu", case=False) | + tres.str.contains(r"(?:1001|1002)=", regex=True), + 'id_job' + ] + ) + cpu_jobs = set(window['id_job']) - gpu_jobs + return window, cpu_jobs, gpu_jobs + + +def build_or_load_manifest(s3, bucket: str, prefix: str, manifest_path: str): + """ + Build a one-time manifest of all .csv keys under cpu/ and gpu/ in S3, + or load an existing manifest from disk. + + Args: + s3: boto3 S3 client + bucket: S3 bucket name + prefix: S3 dataset root prefix (e.g. "datacenter-challenge/202201/") + manifest_path: local path to cache the manifest + + Returns: + List[str]: all S3 keys ending in .csv under cpu/ and gpu/ + """ + if os.path.exists(manifest_path): + with open(manifest_path, 'r') as f: + return [line.strip() for line in f] + + # Otherwise build manifest + keys = [] + paginator = s3.get_paginator('list_objects_v2') + total_pages = {'cpu': 791, 'gpu': 110} + progress = tqdm(total=sum(total_pages.values()), desc="Building file-manifest.txt", unit="page") + + for kind in ('cpu', 'gpu'): + pfx = prefix + f"{kind}/" + for page in paginator.paginate(Bucket=bucket, Prefix=pfx): + for obj in page.get('Contents', []): + key = obj['Key'] + if key.lower().endswith('.csv'): + keys.append(key) + progress.update(1) + + progress.close() + + # Cache on disk + os.makedirs(os.path.dirname(manifest_path), exist_ok=True) + with open(manifest_path, 'w') as f: + for key in keys: + f.write(key + '\n') + return keys + + +def filter_keys_by_jobs(all_keys: list, job_ids: set): + """ + Filter a list of S3 keys to those belonging to specified job IDs. 
+ + Args: + all_keys: list of S3 keys from manifest + job_ids: set of job IDs (int) + + Returns: + List[str] of keys matching CPU or GPU jobs + """ + selected = [] + gpu_pattern = re.compile(r'-r(\d+)-') + for key in all_keys: + # CPU keys: prefix/jobid-...-timeseries.csv or -summary.csv + if '/cpu/' in key: + fname = os.path.basename(key) + parts = fname.split('-', 1) + try: + jid = int(parts[0]) + except ValueError: + continue + if jid in job_ids: + selected.append(key) + # GPU keys: detect -r- in filename + elif '/gpu/' in key: + fname = os.path.basename(key) + m = gpu_pattern.search(fname) + if m and int(m.group(1)) in job_ids: + selected.append(key) + return selected + + +def proc_cpu_series(dfi): + dfi = dfi[~dfi.Step.isin([-1, -4, '-1', '-4'])].copy() + dfi['CPUUtilization'] = dfi['CPUUtilization'].fillna(0) / 100.0 + + t = pd.to_datetime(dfi.EpochTime, unit='s') + start_time = t.min() + dfi['t'] = ((t - start_time).dt.total_seconds() // 10).astype(int) + dfi['sid'] = pd.factorize(dfi.Step)[0] + + useries = dfi.Series.unique() + inds = np.arange(dfi.t.max() + 1) + df = pd.DataFrame({'t': inds}) + Xm, Xrss, Xvm, Xreadmb, Xwritemb = (np.zeros((len(useries), len(inds))) for _ in range(5)) + + for cnt, i in enumerate(useries): + sift = dfi.Series == i + M, N = len(inds), dfi.sid[sift].max() + 1 + + for metric, arr, name in zip( + ['CPUUtilization', 'RSS', 'VMSize', 'ReadMB', 'WriteMB'], + [Xm, Xrss, Xvm, Xreadmb, Xwritemb], + ['cpu', 'rss', 'vm', 'readmb', 'writemb'] + ): + X = csr((dfi.loc[sift, metric], (dfi.loc[sift, 't'], dfi.loc[sift, 'sid'])), shape=(M, N)) + mm = np.array(X.max(axis=1).todense()).reshape(-1,) + df[f'{name}_{i}'] = mm + arr[cnt, :] = mm + + df['cpu_utilisation'] = Xm.mean(axis=0) + df['rss'] = Xrss.sum(axis=0) + df['vm'] = Xvm.sum(axis=0) + df['readmb'] = Xreadmb.sum(axis=0) + df['writemb'] = Xwritemb.sum(axis=0) + df['timestamp'] = start_time + pd.to_timedelta(df.t * 10, unit='s') + df['utime'] = df['timestamp'].astype('int64') // 10**9 + + return df + + +def proc_gpu_series(cpu_df, dfi, gpu_cnt): + # 1) Build CPU time range + t_cpu_start = int(cpu_df.utime.min()) + t_cpu_end = int(cpu_df.utime.max()) + t_cpu = np.array([t_cpu_start, t_cpu_end, t_cpu_end - t_cpu_start]) + + # 2) Safely convert the GPU timestamps to integer seconds + # (this handles strings like "1621607266.426") + ts = pd.to_numeric(dfi["timestamp"], errors="coerce") # float64 or NaN + ts_int = ts.ffill().astype(float).astype(int) + t0, t1 = ts_int.min(), ts_int.max() + t_gpu = np.array([t0, t1, t1 - t0]) + + # 3) Sanity‐check the durations match within 10% + per_diff = ((t_cpu[1] - t_cpu[0]) - (t_gpu[1] - t_gpu[0])) / (t_gpu[1] - t_gpu[0]) * 100 + if abs(per_diff) > 10: + # warn and proceed — GPU trace may be trimmed or misaligned + tqdm.write(f"Warning: GPU‐CPU time mismatch {per_diff:.1f}% exceeds 10%; continuing anyway") + + # 4) Align GPU times onto CPU utime grid + # Use our integer‐second Series rather than the raw column + dfi["t_fixed"] = ts_int - ts_int.min() + t_cpu_start + + # 5) Prepare output DataFrame with a utime column + # ugpus = dfi.gpu_index.unique() + gpu_df = pd.DataFrame({"utime": cpu_df["utime"].values}) + + # 6) Interpolate each GPU field onto the CPU utime grid + fields = [ + "utilization_gpu_pct", + "utilization_memory_pct", + "memory_free_MiB", + "memory_used_MiB", + "temperature_gpu", + "temperature_memory", + "power_draw_W", + ] + for field in fields: + # grab the float‐converted timestamp and the metric + x1 = ts_int.values + y1 = dfi[field].astype(float).values + 
xv = cpu_df["utime"].values + # numpy interpolation + gpu_df[field] = np.interp(xv, x1, y1) + + # 7) Rename the GPU pct, memory pct, and power columns with the device index + ren = { + "gpu_index": f"gpu_index_{gpu_cnt}", + "utilization_gpu_pct": f"gpu_util_{gpu_cnt}", + "utilization_memory_pct": f"gpu_mempct_{gpu_cnt}", + "memory_free_MiB": f"gpu_memfree_{gpu_cnt}", + "memory_used_MiB": f"gpu_memused_{gpu_cnt}", + "temperature_gpu": f"gpu_temp_{gpu_cnt}", + "temperature_memory": f"gpu_memtemp_{gpu_cnt}", + "power_draw_W": f"gpu_power_{gpu_cnt}", + } + gpu_df.rename(columns=ren, inplace=True) + + return gpu_df, gpu_cnt + 1 + + +def validate_job_traces(job, granularity=1): + print(job) + assert job.cpu_trace is not None, f"job {job.id} missing cpu_trace" + assert job.gpu_trace is not None, f"job {job.id} missing gpu_trace" + assert all(p >= 0 for p in job.cpu_trace), f"neg cpu power in job {job.id}" + assert all(p >= 0 for p in job.gpu_trace), f"neg gpu power in job {job.id}" + # Length sanity: at least wall_time/granularity samples + needed = max(1, int(job.wall_time / granularity)) + assert len(job.cpu_trace) >= needed, f"cpu_trace too short for job {job.id}" + assert len(job.gpu_trace) >= needed, f"gpu_trace too short for job {job.id}" diff --git a/raps/dataloaders/philly.py b/raps/dataloaders/philly.py new file mode 100644 index 0000000000000000000000000000000000000000..ca8121bbedb7e701b2f360d1dec05ffe3b5209fd --- /dev/null +++ b/raps/dataloaders/philly.py @@ -0,0 +1,447 @@ +""" +This is the dataloader for the Philly traces which is documented in this paper: + + Jeon, Myeongjae, et al. "Analysis of Large-Scale Multi-Tenant GPU clusters + for DNN training workloads." 2019 USENIX Annual Technical Conference + (USENIX ATC 19). 2019. https://www.usenix.org/system/files/atc19-jeon.pdf + +Note on hardware specs: + + Philly only provides GPU memory sizes (12G & 24G) without clarifying GPU models. + Hu et al. (2024) https://arxiv.org/html/2403.07648v1 + + For estimating system power and FLOPS performance, we assume that the 2-GPU + nodes used Tesla P100 (12 GB) GPUs and the 8-GPU nodes used Tesla P40 (24 GB) + GPUs, consistent with hardware Microsoft deployed around 2017. Training is + assumed to have been performed in 32-bit (FP32), and the CPUs are assumed + to be 64-bit Intel Xeon E5-2690 v4. + +The repository is available here: + + https://github.com/msr-fiddle/philly-traces + +The data portion of the repo can be downloaded using one of the following methods: + + git clone https://github.com/msr-fiddle/philly-traces.git + cd philly-traces + git lfs pull + + wget https://github.com/msr-fiddle/philly-traces/raw/master/trace-data.tar.gz + + curl -L -o trace-data.tar.gz \ + https://github.com/msr-fiddle/philly-traces/raw/master/trace-data.tar.gz + +After the file is downloaded, assuming its in /opt/data/philly/trace-data directory: + + /opt/data/philly/trace-data/trace-data.tar.gz + + cd /opt/data/philly/trace-data + + run `tar xvfz trace-data.tar.gz` which will unpack the following files: + + cluster_cpu_util 1.5G + cluster_gpu_util 2.8G + cluster_mem_util 2.2G + cluster_job_log 37M + cluster_machine_list 8K + + then run the following: + + python /path/to/raps/scripts/parse_philly_traces.py cluster_cpu_util + python /path/to/raps/scripts/parse_philly_traces.py cluster_gpu_util + + this will parse these two files into two directories, cpu_by_day and gpu_by_day, + creating one file for each day and adding the lines for that day into the files. 
+ + sanity checks: + + wc -l cluster_cpu_util + 45028261 cluster_cpu_util + wc -l cpu_by_day/*.csv + 45350898 total + + wc -l cluster_gpu_util + 44750641 cluster_gpu_util + wc -l gpu_by_day/*.csv + 44750640 total + +Running a replay simulation: + + python main.py run-parts -x philly -f /opt/data/philly/trace-data \ + --start 2017-10-03T00:00 --end 2017-10-04T00:00 + +Once the dataloader has been run at least once, it will dump npz files into a directory, +so they can be replayed again without having to go through the expensive extractoin process, +using e.g.: + + python main.py run-parts -x philly -f raps-output-5efefa3 + +Note: it is possible to run simulations for an user-defined length of time between +10/3/2017 to 12/15/2017. + +""" + +import csv +import json +import os +from datetime import datetime, timedelta, timezone + +import pandas as pd +from tqdm import tqdm + +from raps.job import Job, job_dict +from raps.utils import WorkloadData + +DATE_FORMAT_STR = "%Y-%m-%d %H:%M:%S" +DEFAULT_START = "2017-10-03T00:00" +DEFAULT_END = "2017-10-04T00:00" + + +def to_epoch(ts_str): + """Convert a timestamp string or int/float into epoch seconds.""" + if ts_str is None: + return None + if isinstance(ts_str, (int, float)): + return int(ts_str) + if "T" in ts_str: + dt = datetime.fromisoformat(ts_str) + else: + dt = datetime.strptime(ts_str, DATE_FORMAT_STR) + return int(dt.timestamp()) + + +def parse_timestamp(val): + """ + Convert Philly job log timestamps to datetime. + Handles integers (epoch) and strings with PST/PDT. + Returns datetime or None. + """ + if val is None or val == "None": + return None + if isinstance(val, (int, float)): + return datetime.fromtimestamp(int(val), tz=timezone.utc).replace(tzinfo=None) + if isinstance(val, str): + val = val.replace(" PST", "").replace(" PDT", "") + try: + return datetime.strptime(val, DATE_FORMAT_STR).replace(tzinfo=None) + except ValueError: + return None + return None + + +def load_traces_by_day(trace_dir, start_dt, end_dt, colname): + """Load CPU or GPU traces between start_dt and end_dt.""" + traces = {} + current = start_dt.date() + + while current <= end_dt.date(): + daily_file = os.path.join(trace_dir, f"{current}.csv") + if os.path.exists(daily_file): + df = pd.read_csv( + daily_file, + names=["time", "machineId", colname], # no header in daily CSVs + dtype={"machineId": str, colname: str}, # avoid DtypeWarning + ) + + # Normalize time column (strip PST/PDT, parse datetime) + df["time"] = df["time"].str.replace(" PST", "").str.replace(" PDT", "") + df["time"] = pd.to_datetime( + df["time"], errors="coerce", format=DATE_FORMAT_STR + ) + + # Convert util column to numeric (NA/invalid → NaN) + df[colname] = pd.to_numeric(df[colname], errors="coerce") + + traces[current] = df + else: + print(f"⚠ No trace file for {current}") + current += timedelta(days=1) + + if not traces: + return {} + + return traces + + +def parse_date(s): + """Parse a Philly trace date string into a datetime object.""" + if not s or s == "None": + return None + # strip possible timezone labels like "PST"/"PDT" + s = s.replace(" PST", "").replace(" PDT", "") + return datetime.strptime(s, DATE_FORMAT_STR) + + +def load_data(files, **kwargs): + """ + Load Philly trace into ExaDigiT Job objects. + + Args: + files (list[str]): A list with one directory path (e.g., ['/opt/data/philly/trace-data']). + + Returns: + list[Job] + """ + debug = kwargs.get("debug") + print("started reading of philly traces... 
please be patient...", flush=True) + + # extract --start from kwargs + start_ts = to_epoch(kwargs.get("start", DEFAULT_START)) + end_ts = to_epoch(kwargs.get("end", DEFAULT_END)) + + assert len(files) == 1, "Expecting a single directory path" + trace_dir = files[0] + gpu_trace_dir = os.path.join(files[0], "gpu_by_day") + config = kwargs.get("config") + gpus_per_node = config.get("GPUS_PER_NODE") + if gpus_per_node is None: + raise ValueError("Must pass gpus_per_node (2 or 8)") + + # --- 1. Machine list --- + machine_file = os.path.join(trace_dir, "cluster_machine_list") + machines = {} + with open(machine_file, encoding="utf-8") as f: + reader = csv.DictReader(f) + for row in reader: + mid = row["machineId"] + machines[mid] = { + "num_gpus": int(row[" number of GPUs"]), + "gpu_mem": row[" single GPU mem"].strip(), + } + + partition_machines = { + mid: info for mid, info in machines.items() if info["num_gpus"] == gpus_per_node + } + + # Build node → index mapping for this partition + node_mapping = { + mid: idx for idx, mid in enumerate(sorted(partition_machines.keys())) + } + + # Assign partition ID (e.g. 0 for 2-GPU, 1 for 8-GPU) + partition_id = 0 if gpus_per_node == 2 else 1 + + # --- 3. GPU util --- + start_dt = datetime.fromtimestamp(start_ts) + end_dt = datetime.fromtimestamp(end_ts) + + # --- 4. Job log --- + job_file = os.path.join(trace_dir, "cluster_job_log") + with open(job_file, encoding="utf-8") as f: + job_log = json.load(f) + + # --- First pass: filter jobs by date range --- + filtered_log = [] + for raw in job_log: + submitted = raw.get("submitted_time") + if submitted is None or submitted == "None": + continue + if isinstance(submitted, (int, float)): + submitted_dt = datetime.fromtimestamp(int(submitted)) + else: + submitted_dt = parse_date(submitted) + if submitted_dt and start_dt <= submitted_dt <= end_dt: + filtered_log.append(raw) + job_log = filtered_log + + # Filter job_log to only jobs matching the partition's gpus_per_node + if gpus_per_node is not None: + filtered_log = [] + for raw in job_log: + attempts = raw.get("attempts", []) + if attempts and "detail" in attempts[0]: + # Count GPUs from the first detail + gpus = sum( + len(detail.get("gpus", [])) for detail in attempts[0]["detail"] + ) + if gpus > 0 and (gpus % gpus_per_node == 0): + filtered_log.append(raw) + job_log = filtered_log + + # --- First pass: find earliest submit time --- + start_ts = None + + for raw in job_log: + submitted = raw.get("submitted_time") + if submitted is None or submitted == "None": + continue + + # Philly uses either string dates or epoch ints + if isinstance(submitted, (int, float)): + t = int(submitted) + else: + t = parse_date(submitted).timestamp() + + if start_ts is None or t < start_ts: + start_ts = t + + if start_ts is None: + raise ValueError("No valid submitted_time found in Philly traces") + + # --- Pre-load all traces for the given date range --- + cpu_trace_dir = os.path.join(trace_dir, "cpu_by_day") + gpu_trace_dir = os.path.join(trace_dir, "gpu_by_day") + all_cpu_traces = load_traces_by_day(cpu_trace_dir, start_dt, end_dt, "cpu_util") + all_gpu_traces = load_traces_by_day(gpu_trace_dir, start_dt, end_dt, "gpu_util") + + # --- Second pass: build jobs --- + jobs_list = [] + for raw in tqdm(job_log, desc="Building Job objects"): + jobid = raw.get("jobid") + user = raw.get("user") + status = raw.get("status") + + # Submitted time + submitted = raw.get("submitted_time") + if isinstance(submitted, (int, float)): + submitted = datetime.fromtimestamp(int(submitted)) + 
else: + submitted = parse_date(submitted) + + attempts = raw.get("attempts", []) + start, end = None, None + if attempts: + st = attempts[0].get("start_time") + et = attempts[-1].get("end_time") + + if isinstance(st, (int, float)): + start = datetime.fromtimestamp(int(st)) + elif st: + start = parse_date(st) + + if isinstance(et, (int, float)): + end = datetime.fromtimestamp(int(et)) + elif et: + end = parse_date(et) + + wall_time = None + if start and end: + wall_time = (end - start).total_seconds() + + # Which machines did this job run on? + machine_ids, gpus = [], 0 + if attempts and "detail" in attempts[0]: + for detail in attempts[0]["detail"]: + mid = detail["ip"] + machine_ids.append(mid) + gpus += len(detail.get("gpus", [])) + + num_nodes = len(machine_ids) + if num_nodes == 0: + continue + gpus_per_node = gpus // num_nodes + + # --- absolute datetimes (used for filtering traces) --- + submitted_dt = parse_timestamp(raw.get("submitted_time")) + + job_start = start + job_end = end + + if not job_start or not job_end: + continue + + # --- CPU utilization traces --- + cpu_dfs = [] + current_date = job_start.date() + while current_date <= job_end.date(): + if current_date in all_cpu_traces: + cpu_dfs.append(all_cpu_traces[current_date]) + current_date += timedelta(days=1) + + if not cpu_dfs: + job_cpu_trace = [] + else: + job_cpu_df = pd.concat(cpu_dfs, ignore_index=True) + mask = ( + (job_cpu_df["machineId"].isin(machine_ids)) + & (job_cpu_df["time"] >= start) + & (job_cpu_df["time"] <= end) + ) + job_cpu = job_cpu_df.loc[mask].copy() + + if len(machine_ids) > 1: + job_cpu = job_cpu.groupby("time")["cpu_util"].mean().reset_index() + + job_cpu_trace = (job_cpu["cpu_util"].to_numpy() * 0.01).tolist() + + # --- GPU utilization traces --- + gpu_dfs = [] + current_date = job_start.date() + while current_date <= job_end.date(): + if current_date in all_gpu_traces: + gpu_dfs.append(all_gpu_traces[current_date]) + current_date += timedelta(days=1) + + if not gpu_dfs: + job_gpu_trace = [] + else: + job_gpu_df = pd.concat(gpu_dfs, ignore_index=True) + mask = ( + (job_gpu_df["machineId"].isin(machine_ids)) + & (job_gpu_df["time"] >= start) + & (job_gpu_df["time"] <= end) + ) + job_gpu = job_gpu_df.loc[mask].copy() + + if len(machine_ids) > 1: + job_gpu = job_gpu.groupby("time")["gpu_util"].mean().reset_index() + + job_gpu_trace = ( + job_gpu["gpu_util"].to_numpy() * 0.01 * gpus_per_node + ).tolist() + + if machine_ids: + submit_time = submitted.timestamp() + start_time = start.timestamp() + end_time = end.timestamp() + + if not submit_time or not start_time or not end_time: + tqdm.write( + f"skipped {jobid} b/c missing submit_time, start_time, or end_time" + ) + continue + + scheduled_nodes = [ + node_mapping[mid] for mid in machine_ids if mid in node_mapping + ] + + if submit_time and start_time and end_time: + + job = job_dict( + id=jobid, + name=f"philly-{jobid}", + account=user if user else "unknown", + nodes_required=len(machine_ids), + partition=partition_id, + cpu_cores_required=1, + gpu_units_required=gpus_per_node, + end_state=status, + scheduled_nodes=scheduled_nodes, + cpu_trace=job_cpu_trace, + gpu_trace=job_gpu_trace, + ntx_trace=[], + nrx_trace=[], + submit_time=submit_time, + start_time=start_time, + end_time=end_time, + time_limit=end_time, + expected_run_time=wall_time if wall_time else 0, + trace_start_time=start_time, # None, + trace_end_time=end_time, # None, + trace_quanta=60, + trace_missing_values=False + ) + if job_cpu_trace and job_gpu_trace: + jobs_list.append(Job(job)) 
+                else:
+                    tqdm.write(f"skipping {job['id']} because it has no CPU or GPU trace")
+
+            if debug:
+                tqdm.write(f"{job['id']} start: {job['start_time']} end: {job['end_time']}")
+
+    return WorkloadData(
+        jobs=jobs_list,
+        telemetry_start=start_ts,
+        telemetry_end=end_ts,
+        start_date=datetime.fromtimestamp(start_ts, timezone.utc),
+    )
diff --git a/raps/downtime.py b/raps/downtime.py
new file mode 100644
index 0000000000000000000000000000000000000000..ae8b82dd7b54a9fe4f3b8bb380ace74fe2e69b64
--- /dev/null
+++ b/raps/downtime.py
@@ -0,0 +1,88 @@
+from __future__ import annotations
+from typing import TYPE_CHECKING
+from raps.job import JobState
+import numpy as np
+
+
+if TYPE_CHECKING:
+    from raps.engine import Engine
+
+
+class Downtime:
+    """Periodically takes the simulated system down and brings it back up again."""
+
+    def __init__(self, *,
+                 first_downtime,
+                 downtime_interval,
+                 downtime_length,
+                 debug=False
+                 ):
+        self.skip = False
+        if downtime_length == 0 or downtime_interval == 0 or \
+                downtime_length is None or downtime_interval is None:
+            self.skip = True
+        self.interval: int = downtime_interval
+        self.length: int = downtime_length
+        self.start: int = first_downtime
+        self.end: int = 0
+        self.down: bool = False
+        self.debug = debug
+
+    def check_and_trigger(self, *,
+                          timestep: int,
+                          engine: Engine
+                          ):
+        if self.skip:
+            return False  # Don't simulate downtime
+        if timestep > self.start and not self.down:
+            self.simulate_down(engine=engine)
+            # 30 minute standard deviation around the nominal downtime length
+            this_downtime_length = np.random.normal(self.length, 30 * 60)
+            self.end = timestep + this_downtime_length
+            self.start = self.start + self.interval  # Next start
+            return True  # System went down
+        if timestep > self.end and self.down:
+            self.simulate_up(engine=engine)
+            return True  # System came back up
+        return False  # No change
+
+    def simulate_down(self, *,
+                      engine: Engine
+                      ):
+        if self.debug:
+            print("Simulated downtime: before downtime start")
+            print(f"Running: {len(engine.running)}, queued: {len(engine.queue)}")
+
+        # engine.resource_manager.down_nodes.update(engine.resource_manager.nodes)  # down_nodes are a set
+        # engine.resource_manager.available_nodes[:] = []
+
+        # Cancel all running jobs and release their nodes.
+        for job in engine.running:
+            job.current_state = JobState.CANCELLED
+            engine.power_manager.set_idle(job.scheduled_nodes)
+            engine.resource_manager.free_nodes_from_job(job)
+
+        # Add all available nodes to the down set ...
+        engine.resource_manager.down_nodes.update(
+            engine.resource_manager.available_nodes)
+        # ... and clear the available-node list.
+        engine.resource_manager.available_nodes[:] = []
+
+        # Requeue the cancelled jobs.
+        engine.queue += engine.running
+        engine.running = []
+        if self.debug:
+            print("Simulated downtime: after downtime start")
+            print(f"Running: {len(engine.running)}, queued: {len(engine.queue)}")
+        self.down = True
+
+    def simulate_up(self, *,
+                    engine: Engine
+                    ):
+        self.down = False
+        engine.resource_manager.available_nodes[:] = [n['id']
+                                                      for n in engine.resource_manager.nodes if not n['is_down']]
+        # Careful: engine.down_nodes holds the down nodes that are not managed by the
+        # resource manager but are handed to the engine directly.
+        engine.resource_manager.down_nodes.clear()
+        engine.resource_manager.down_nodes.update(engine.config["DOWN_NODES"])  # Restore original down nodes.
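
A minimal usage sketch of the new `Downtime` helper. In RAPS itself the `Engine` constructs one and calls `check_and_trigger()` once per timestep from `prepare_timestep()`; the `engine`, `sim_start`, and `sim_end` names below are stand-ins for any object and bounds that expose the `running`, `queue`, `power_manager`, and `resource_manager` attributes used above:

    from raps.downtime import Downtime

    downtime = Downtime(
        first_downtime=6 * 3600,      # first outage six hours into the run
        downtime_interval=24 * 3600,  # then roughly once per simulated day
        downtime_length=2 * 3600,     # ~2 h outage, +/- 30 min of noise
        debug=True,
    )

    for timestep in range(sim_start, sim_end):
        changed = downtime.check_and_trigger(timestep=timestep, engine=engine)
        if changed:
            state = "down" if downtime.down else "back up"
            print(f"t={timestep}: system went {state}, queue length: {len(engine.queue)}")

Passing `downtime_length=0` (or `None`) disables the feature entirely, matching the skip check in `__init__`.
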
diff --git a/raps/engine.py b/raps/engine.py index 0886b3863bd3157bbe001ceeeb1dcfb76b7bb662..a9ea4a07f448a72ac9459e90d125ece904a0b4b9 100644 --- a/raps/engine.py +++ b/raps/engine.py @@ -1,268 +1,878 @@ -from typing import Optional +from typing import Optional, List import dataclasses import pandas as pd +import numpy as np +import threading +import sys +import tty +import termios +import os +import select +import time +import random +from raps.job import Job, JobState +from raps.policy import PolicyType +from raps.utils import ( + summarize_ranges, + get_current_utilization, + WorkloadData, +) +from raps.resmgr import ResourceManager +from raps.schedulers import load_scheduler +from raps.power import ( + PowerManager, + compute_node_power, + compute_node_power_validate, + record_power_stats_foreach_job, + compute_node_power_uncertainties, + compute_node_power_validate_uncertainties, +) +from raps.network import ( + NetworkModel, + apply_job_slowdown, + compute_system_network_stats, + simulate_inter_job_congestion +) +from raps.telemetry import Telemetry +from raps.cooling import ThermoFluidsModel +from raps.flops import FLOPSManager +from raps.workloads import Workload, continuous_job_generation +from raps.account import Accounts +from raps.downtime import Downtime +from raps.weather import Weather +from raps.sim_config import SimConfig +from bisect import bisect_right -from .job import Job, JobState -from .network import network_utilization -from .utils import summarize_ranges, expand_ranges, get_utilization -from .resmgr import ResourceManager -from .schedulers import load_scheduler + +@dataclasses.dataclass +class TickReturn: + """ Represents the state output from the simulation each tick """ + power_df: Optional[pd.DataFrame] + p_flops: Optional[float] + g_flops_w: Optional[float] + system_util: float + fmu_inputs: Optional[dict] + fmu_outputs: Optional[dict] + avg_net_tx: Optional[float] + avg_net_rx: Optional[float] + avg_net_util: Optional[float] + slowdown_per_job: float + node_occupancy: dict[int, int] @dataclasses.dataclass class TickData: """ Represents the state output from the simulation each tick """ - current_time: int + current_timestep: int completed: list[Job] + killed: list[Job] running: list[Job] queue: list[Job] down_nodes: list[int] power_df: Optional[pd.DataFrame] p_flops: Optional[float] g_flops_w: Optional[float] - system_util: float + system_util: Optional[float] fmu_inputs: Optional[dict] fmu_outputs: Optional[dict] num_active_nodes: int num_free_nodes: int + avg_net_tx: Optional[float] + avg_net_rx: Optional[float] + avg_net_util: Optional[float] + slowdown_per_job: Optional[float] + node_occupancy: Optional[dict[int, int]] + time_delta: int + + +class SimulationState: + def __init__(self, time_delta): + self.paused = False + self.time_delta = time_delta + self.lock = threading.Lock() + + def toggle_pause(self): + with self.lock: + self.paused = not self.paused + + def is_paused(self): + with self.lock: + return self.paused + + def speed_up(self): + with self.lock: + self.time_delta *= 2 + print(f"\n[INFO] time_delta increased to {self.time_delta}", file=sys.stderr) + + def slow_down(self): + with self.lock: + if self.time_delta > 1: + self.time_delta //= 2 + print(f"\n[INFO] time_delta decreased to {self.time_delta}", file=sys.stderr) + + def get_time_delta(self): + with self.lock: + return self.time_delta + + +def keyboard_listener(state): + fd = sys.stdin.fileno() + old_settings = termios.tcgetattr(fd) + try: + tty.setcbreak(fd) # or tty.setraw(fd) + while 
True: + # Wait up to 0.1s for input + rlist, _, _ = select.select([sys.stdin], [], [], 0.1) + if rlist: + char = os.read(fd, 1).decode() + if char == 'k' or char == ' ': + state.toggle_pause() + if state.is_paused(): + print("\n[PAUSED] Press space or k to resume.", file=sys.stderr) + else: + print("\n[RESUMED]", file=sys.stderr) + elif char == 'l' or char == '+': + state.speed_up() + elif char == 'j' or char == '_': + state.slow_down() + finally: + termios.tcsetattr(fd, termios.TCSADRAIN, old_settings) class Engine: """Job scheduling simulation engine.""" - def __init__(self, *, power_manager, flops_manager, cooling_model=None, config, **kwargs): - self.config = config + def __init__(self, sim_config: SimConfig, partition: str | None = None): + if partition: + system_config = sim_config.get_system_config_by_name(partition) + elif len(sim_config.system_configs) > 1: + raise ValueError( + "Engine can only run single-partition simulations. Use MultiPartEngine for " + + "multi-partition simulations, or pass partition to select the partition to run." + ) + else: + system_config = sim_config.system_configs[0] + + # Some temporary backwards/compatibility wrappers + system_config_dict = system_config.get_legacy() + sim_config_args = sim_config.get_legacy_args() + sim_config_dict = sim_config.get_legacy_args_dict() + sim_config_dict['config'] = system_config_dict + + if sim_config.seed: + random.seed(sim_config.seed) + np.random.seed(sim_config.seed + 1) + + if sim_config.live and not sim_config.replay: + telemetry = Telemetry(**sim_config_dict) + wd = telemetry.load_from_live_system() + elif sim_config.replay: + # TODO: this will have issues if running separate systems or custom systems + partition_short = partition.split("/")[-1] if partition else None + telemetry = Telemetry( + **sim_config_dict, + partition=partition, + ) + if partition: + snap_map = {p.stem: p for p in sim_config.replay[0].glob("*.npz")} + if len(snap_map) > 0: + if partition_short not in snap_map: + raise RuntimeError(f"Snapshot '{partition_short}.npz' not in {sim_config.replay[0]}") + replay_files = [snap_map[partition_short]] + else: + replay_files = sim_config.replay + else: + replay_files = sim_config.replay + + wd = telemetry.load_from_files(replay_files) + else: # Synthetic jobs + wl = Workload(sim_config_args, system_config_dict) + wd = wl.generate_jobs() + telemetry = Telemetry(**sim_config_dict) + + jobs = wd.jobs + if len(jobs) == 0: + print(f"Warning no jobs found for {partition or 'system'}") + if partition and len(sim_config.system_configs) > 1: + for job in jobs: + job.partition = partition + + if sim_config.start: + start = sim_config.start + diff = start - wd.start_date + # diff may be negative if start is before the first job in the workload. We'll still + # shift telemetry_start to match with sim_config.start, even if that leaves a blank + # spot at the beginning. 
+ wd.telemetry_start += int(diff.total_seconds()) + wd.start_date = start + else: + start = wd.start_date + start = start + sim_config.fastforward + wd.telemetry_end = wd.telemetry_start + sim_config.time_int + + time_delta = sim_config.time_delta_int + + if sim_config.continuous_job_generation: + continuous_workload = wl + else: + continuous_workload = None + + if sim_config.cooling: + cooling_model = ThermoFluidsModel(**system_config_dict) + cooling_model.initialize() + if sim_config.weather: + cooling_model.weather = Weather(start, config=system_config_dict) + else: + cooling_model = None + + if sim_config.power_scope == 'node': + if sim_config.uncertainties: + power_manager = PowerManager(compute_node_power_validate_uncertainties, **system_config_dict) + else: + power_manager = PowerManager(compute_node_power_validate, **system_config_dict) + else: + if sim_config.uncertainties: + power_manager = PowerManager(compute_node_power_uncertainties, **system_config_dict) + else: + power_manager = PowerManager(compute_node_power, **system_config_dict) + + flops_manager = FLOPSManager( + config=system_config_dict, + validate=(sim_config.power_scope == "node"), + ) + + accounts = None + if sim_config.accounts: + job_accounts = Accounts(jobs) + if sim_config.accounts_json: + loaded_accounts = Accounts.from_json_filename(sim_config.accounts_json) + accounts = Accounts.merge(loaded_accounts, job_accounts) + else: + accounts = job_accounts + + self.sim_config = sim_config + self.system_config = system_config + self.config = system_config.get_legacy() + + self.start = start + self.timestep_start = wd.telemetry_start + self.timestep_end = wd.telemetry_end + self.time_delta = time_delta + self.down_nodes = summarize_ranges(self.config['DOWN_NODES']) self.resource_manager = ResourceManager( total_nodes=self.config['TOTAL_NODES'], - down_nodes=self.config['DOWN_NODES'] + down_nodes=self.config['DOWN_NODES'], + config=self.config ) - # Initialize running and queue, etc. 
self.running = [] self.queue = [] - self.accounts = None + self.accounts = accounts + self.telemetry = telemetry self.job_history_dict = [] self.jobs_completed = 0 - self.current_time = 0 + self.jobs_killed = 0 + self.jobs = jobs + self.total_initial_jobs = len(jobs) + self.current_timestep = 0 self.cooling_model = cooling_model self.sys_power = 0 self.power_manager = power_manager self.flops_manager = flops_manager - self.debug = kwargs.get('debug') - self.output = kwargs.get('output') - self.replay = kwargs.get('replay') + self.debug = sim_config.debug + self.continuous_workload = continuous_workload + self.replay = sim_config.replay + self.downscale = sim_config.downscale # Factor to downscale the 1s timesteps (power of 10) + self.simulate_network = sim_config.simulate_network self.sys_util_history = [] + self.scheduler_queue_history = [] + self.scheduler_running_history = [] + self.avg_net_tx = [] + self.avg_net_rx = [] + self.net_util_history = [] + self.net_congestion_history = [] + self.avg_slowdown_history = [] + self.max_slowdown_history = [] + self.node_occupancy_history = [] + self.downtime = Downtime(first_downtime=sim_config.downtime_first_int, + downtime_interval=sim_config.downtime_interval_int, + downtime_length=sim_config.downtime_length_int, + debug=sim_config.debug, + ) + + # Set scheduler type - either based on config or command-line args - defaults to 'default' + if self.config['multitenant']: + scheduler_type = 'multitenant' + else: + scheduler_type = sim_config.scheduler + + policy_type = sim_config.policy + backfill_type = sim_config.backfill - # Get scheduler type from command-line args or default - scheduler_type = kwargs.get('scheduler', 'default') self.scheduler = load_scheduler(scheduler_type)( config=self.config, - policy=kwargs.get('policy'), - resource_manager=self.resource_manager + policy=policy_type, + bfpolicy=backfill_type, + resource_manager=self.resource_manager, + jobs=jobs ) - print(f"Using scheduler: {scheduler_type}") - - def eligible_jobs(self,jobs_to_submit): - eligible_jobs_list = [] - while jobs_to_submit and jobs_to_submit[0]['submit_time'] <= self.current_time: - job_info = jobs_to_submit.pop(0) - job = Job(job_info, self.current_time) - eligible_jobs_list.append(job) - return eligible_jobs_list - - def tick(self): - """Simulate a timestep.""" - completed_jobs = [job for job in self.running if job.end_time is not None and job.end_time <= self.current_time] - - # Simulate node failure - newly_downed_nodes = self.resource_manager.node_failure(self.config['MTBF']) - for node in newly_downed_nodes: - self.power_manager.set_idle(node) - - # Update active/free nodes - self.num_free_nodes = len(self.resource_manager.available_nodes) - self.num_active_nodes = self.config['TOTAL_NODES'] \ - - len(self.resource_manager.available_nodes) \ - - len(self.resource_manager.down_nodes) - - # Update running time for all running jobs - scheduled_nodes = [] - cpu_utils = [] - gpu_utils = [] - net_utils = [] - for job in self.running: - if job.end_time == self.current_time: - job.state = JobState.COMPLETED - - if job.state == JobState.RUNNING: - job.running_time = self.current_time - job.start_time - time_quanta_index = (self.current_time - job.start_time) // self.config['TRACE_QUANTA'] - cpu_util = get_utilization(job.cpu_trace, time_quanta_index) - gpu_util = get_utilization(job.gpu_trace, time_quanta_index) - net_util = 0 - - if len(job.ntx_trace) and len(job.nrx_trace): - net_tx = get_utilization(job.ntx_trace, time_quanta_index) - net_rx = 
get_utilization(job.nrx_trace, time_quanta_index) - net_util = network_utilization(net_tx, net_rx) - net_utils.append(net_util) - else: - net_utils.append(0) + if sim_config.live: + assert self.scheduler.policy != PolicyType.REPLAY, \ + "Cannot replay from a live system. Choose a scheduling policy!" + print(f"Using scheduler: {str(self.scheduler.__class__).split('.')[2]}" + f", with policy {self.scheduler.policy} " + f"and backfill {self.scheduler.bfpolicy}") + + if self.simulate_network: + available_nodes = self.resource_manager.available_nodes + self.network_model = NetworkModel( + available_nodes=available_nodes, + config=self.config + ) + else: + self.network_model = None - scheduled_nodes.append(job.scheduled_nodes) - cpu_utils.append(cpu_util) - gpu_utils.append(gpu_util) + def get_workload_data(self) -> WorkloadData: + return WorkloadData( + jobs=self.jobs[:], + telemetry_start=self.timestep_start, telemetry_end=self.timestep_end, + start_date=self.start, + ) + + def add_running_jobs_to_queue(self, jobs_to_submit: List): + """ + Modifies jobs_to_submit and self.queue + + This is a preparatory step and should only be called before the main + loop of run_simulation. + Adds running jobs to the queue, and removes them from the jobs_to_submit + jobs_to_submit still holds the jobs that need be submitted in the future. + """ + # Build a list of jobs whose start_time is <= current_time. + eligible_jobs = [job for job in jobs_to_submit if + job.start_time is not None + and job.start_time < self.current_timestep] + # Remove those jobs from jobs_to_submit: + jobs_to_submit[:] = [job for job in jobs_to_submit if + job.start_time is None + or job.start_time >= self.current_timestep] + # Convert them to Job instances and build list of eligible jobs. + self.queue += eligible_jobs + + def add_eligible_jobs_to_queue(self, jobs_to_submit: List): + """ + Modifies jobs_to_submit and self.queue + + Adds eligible jobs to the queue, and removes them from the jobs_to_submit + jobs_to_submit still holds the jobs that need be submitted in the future. + returns + - true if new jobs are present + - false if no new jobs are present + """ + # Build a list of jobs whose submit_time is <= current_time. + eligible_jobs = [job for job in jobs_to_submit if job.submit_time <= self.current_timestep] + # Remove those jobs from jobs_to_submit: + jobs_to_submit[:] = [job for job in jobs_to_submit if job.submit_time > self.current_timestep] + # Convert them to Job instances and build list of eligible jobs. + self.queue += eligible_jobs + if eligible_jobs != []: + return True + else: + return False - if len(scheduled_nodes) > 0: - self.flops_manager.update_flop_state(scheduled_nodes, cpu_utils, gpu_utils) - jobs_power = self.power_manager.update_power_state(scheduled_nodes, cpu_utils, gpu_utils, net_utils) + def prepare_timestep(self, *, replay: bool = True, jobs): + # 0 track need to reschedule + # 1 identify completed jobs + # 2 Check continuous job generation + # 3 Simulate node failure # Defunct feature! 
+ # 4 Simulate downtime + # 5 Update active and free nodes - _running_jobs = [job for job in self.running if job.state == JobState.RUNNING] - if len(jobs_power) != len(_running_jobs): - raise ValueError(f"Jobs power list of length ({len(jobs_power)}) should have ({len(_running_jobs)}) items.") - for i, job in enumerate(_running_jobs): - if job.running_time % self.config['TRACE_QUANTA'] == 0: - job.power_history.append(jobs_power[i] * len(job.scheduled_nodes)) - del _running_jobs + need_reschedule = False + # 1 Identify Completed Jobs + completed_jobs = [job for job in self.running if + job.end_time is not None and job.end_time <= self.current_timestep] + + need_reschedule = need_reschedule or (completed_jobs != []) + + # Update Completed Jobs, their account and and Free resources. for job in completed_jobs: + self.power_manager.set_idle(job.scheduled_nodes) + job.current_state = JobState.COMPLETED + job.end_time = self.current_timestep self.running.remove(job) self.jobs_completed += 1 job_stats = job.statistics() - self.accounts.update_account_statistics(job_stats) + if self.accounts: + self.accounts.update_account_statistics(job_stats) self.job_history_dict.append(job_stats.__dict__) # Free the nodes via the resource manager. self.resource_manager.free_nodes_from_job(job) - # Ask scheduler to schedule any jobs waiting in queue - self.scheduler.schedule(self.queue, self.running, self.current_time, self.accounts) + if not replay: + killed_jobs = [job for job in self.running if + job.end_time is not None and + job.start_time + job.time_limit <= self.current_timestep] + else: + killed_jobs = [] + + need_reschedule = need_reschedule or (killed_jobs != []) - # Update the power array UI component - rack_power, rect_losses = self.power_manager.compute_rack_power() - sivoc_losses = self.power_manager.compute_sivoc_losses() - rack_loss = rect_losses + sivoc_losses + for job in killed_jobs: + self.power_manager.set_idle(job.scheduled_nodes) + job.current_state = JobState.TIMEOUT + job.end_time = self.current_timestep - # Update system utilization - system_util = self.num_active_nodes / self.config['AVAILABLE_NODES'] * 100 - self.sys_util_history.append((self.current_time, system_util)) - - # Render the updated layout - power_df = None - cooling_inputs, cooling_outputs = None, None - - # Update power history every 15s - if self.current_time % self.config['POWER_UPDATE_FREQ'] == 0: - total_power_kw = sum(row[-1] for row in rack_power) + self.config['NUM_CDUS'] * self.config['POWER_CDU'] / 1000.0 - total_loss_kw = sum(row[-1] for row in rack_loss) - self.power_manager.history.append((self.current_time, total_power_kw)) - self.sys_power = total_power_kw - self.power_manager.loss_history.append((self.current_time, total_loss_kw)) - pflops = self.flops_manager.get_system_performance() / 1E15 - gflop_per_watt = pflops * 1E6 / (total_power_kw * 1000) + self.running.remove(job) + self.jobs_killed += 1 + job_stats = job.statistics() + if self.accounts: + self.accounts.update_account_statistics(job_stats) + self.job_history_dict.append(job_stats.__dict__) + # Free the nodes via the resource manager. 
+ self.resource_manager.free_nodes_from_job(job) + + # 2 Check continuous job generation + if self.continuous_workload is not None: # Experimental + continuous_job_generation(engine=self, timestep=self.current_timestep, jobs=jobs) + + # 3 Simulate node failure + if not replay: + newly_downed_nodes = self.resource_manager.node_failure(self.config['MTBF']) + for node in newly_downed_nodes: + self.power_manager.set_idle(node) else: - pflops, gflop_per_watt = None, None - - if self.current_time % self.config['POWER_UPDATE_FREQ'] == 0: - if self.cooling_model: - # Power for NUM_CDUS (25 for Frontier) - cdu_power = rack_power.T[-1] * 1000 - runtime_values = self.cooling_model.generate_runtime_values(cdu_power, self) - - # FMU inputs are N powers and the wetbulb temp - fmu_inputs = self.cooling_model.generate_fmu_inputs(runtime_values, - uncertainties=self.power_manager.uncertainties) - cooling_inputs, cooling_outputs = ( - self.cooling_model.step(self.current_time, fmu_inputs, self.config['POWER_UPDATE_FREQ']) + newly_downed_nodes = [] + + need_reschedule = need_reschedule or (newly_downed_nodes != []) + + # 4 Simulate downtime + downtime = self.downtime.check_and_trigger(timestep=self.current_timestep, engine=self) + + need_reschedule = need_reschedule or downtime + + # 5 Update active/free nodes based on core/GPU utilization + if self.config['multitenant']: + # #total_cpu_cores = sum(node['total_cpu_cores'] for node in self.resource_manager.nodes) + # #total_gpu_units = sum(node['total_gpu_units'] for node in self.resource_manager.nodes) + # #available_cpu_cores = sum(node['available_cpu_cores'] for node in self.resource_manager.nodes) + # #available_gpu_units = sum(node['available_gpu_units'] for node in self.resource_manager.nodes) + + self.num_free_nodes = len([node for node in self.resource_manager.nodes if + not node['is_down'] + and node['available_cpu_cores'] == node['total_cpu_cores'] + and node['available_gpu_units'] == node['total_gpu_units']]) + self.num_active_nodes = len([node for node in self.resource_manager.nodes if + not node['is_down'] + and (node['available_cpu_cores'] < node['total_cpu_cores'] + or node['available_gpu_units'] < node['total_gpu_units'])]) + + # Update system utilization history + self.resource_manager.update_system_utilization(self.current_timestep, self.running) + else: + # Whole-node allocator + self.num_free_nodes = len(self.resource_manager.available_nodes) + self.num_active_nodes = self.config['TOTAL_NODES'] \ + - len(self.resource_manager.available_nodes) \ + - len(self.resource_manager.down_nodes) + if self.down_nodes != self.resource_manager.down_nodes: + need_reschedule = need_reschedule or True + self.down_nodes = self.resource_manager.down_nodes + # TODO This should only be managed in the resource manager! + + return completed_jobs, killed_jobs, need_reschedule + + def complete_timestep(self, *, + actively_considered_jobs: List, + all_jobs: List, + replay: bool, + autoshutdown: bool, + cursor: int): + # 1 update running time of all running jobs + # 2 update the current_timestep of the engine (this serves as reference for most computations) + # 3 Check if simulation should shutdown + + # update Running time + for job in self.running: + if job.current_state == JobState.RUNNING: + job.current_run_time = self.current_timestep - job.start_time + + # Stop the simulation if no more jobs are running or in the queue or in the job list. 
+ if autoshutdown and \ + len(self.queue) == 0 and \ + len(self.running) == 0 and \ + not replay and \ + len(all_jobs) == cursor and \ + len(actively_considered_jobs) == 0: + if self.debug: + print(f"Simulaiton completed early: {self.config['system_name']} - " + f"Stopping simulation at time {self.current_timestep}. " + f"Simulation ran for {self.current_timestep - self.timestep_start}") + simulation_complete = True + else: + simulation_complete = False + self.current_timestep += 1 # Update the current time every timestep + + return simulation_complete + + def tick(self, *, time_delta=1, replay=False): + """ + Tick runs all simulations of interest at the given time delta interval. + + The simulations which are needed for simulations consistency at each time step + (inside: the main simulation loop of run_simulation) are not part of tick. + + Tick contains: + For each running job: + - CPU utilization + - GPU utilization + - Network utilization + + From these the systems (across all nodes) + - System Utilization + - Power + - Cooling + - System Performance + is simulated. + """ + + scheduled_nodes = [] + cpu_utils = [] + gpu_utils = [] + net_congs = [] + net_utils = [] + net_tx_list = [] + net_rx_list = [] + + slowdown_factors = [] + + for job in self.running: + + job.current_run_time = self.current_timestep - job.start_time + + if job.current_state != JobState.RUNNING: + raise ValueError( + f"Job {job.id} is in running list, " + + f"but state is not RUNNING: job.state == {job.current_state}" ) + else: # if job.state == JobState.RUNNING: + # Error checks + if not replay and job.current_run_time > job.time_limit and job.end_time is not None: + raise Exception(f"Job exceded time limit! " + f"{job.current_run_time} > {job.time_limit}" + f"\n{job}" + f"\nCurrent timestep:{self.current_timestep - self.timestep_start} (rel)" + ) + if replay and job.current_run_time > job.expected_run_time: + raise Exception(f"Job should have ended in replay! " + f" {job.current_run_time} > {job.expected_run_time}" + f"\n{job}" + f"\nCurrent timestep:{self.current_timestep - self.timestep_start} (rel)" + ) + + # Aggregate scheduled nodes + scheduled_nodes.append(job.scheduled_nodes) + + # Get CPU utilization + cpu_util = get_current_utilization(job.cpu_trace, job) + cpu_utils.append(cpu_util) + # Percentage Utilization! - # Get a dataframe of the power data - power_df = self.power_manager.get_power_df(rack_power, rack_loss) + # Get GPU utilization + gpu_util = get_current_utilization(job.gpu_trace, job) + gpu_utils.append(gpu_util) + # Percentage Utilization! + + # Simulate network utilization + if self.simulate_network: + + net_util, net_cong, net_tx, net_rx, max_throughput = \ + self.network_model.simulate_network_utilization(job=job, debug=self.debug) + + net_utils.append(net_util) + net_congs.append(net_cong) + net_tx_list.append(net_tx) + net_rx_list.append(net_rx) + + else: + net_util, net_cong, net_tx, net_rx = 0.0, 0.0, 0.0, 0.0 + max_throughput = 0 + net_utils.append(net_util) + net_congs.append(net_cong) + net_tx_list.append(net_tx) + net_rx_list.append(net_rx) + + # Apply slowdowns + slowdown_factor = apply_job_slowdown(job=job, + max_throughput=max_throughput, + net_util=net_util, + net_cong=net_cong, + net_tx=net_tx, + net_rx=net_rx, + debug=self.debug) + slowdown_factors.append(slowdown_factor) + + # All required values for each jobs have been an collected. 
+ # Continue with calculations for the whole system: + + # System Utilization Statistics + system_util = self.num_active_nodes / self.config['AVAILABLE_NODES'] * 100 + self.record_util_stats(system_util=system_util) + + # --- Inter-Job Network Congestion --- + if self.simulate_network and self.network_model and self.running: + congestion_stats = simulate_inter_job_congestion( + self.network_model, self.running, self.config, self.debug + ) + if isinstance(congestion_stats, dict): + total_congestion = congestion_stats['mean'] else: - # Get a dataframe of the power data - power_df = self.power_manager.get_power_df(rack_power, rack_loss) - - tick_data = TickData( - current_time=self.current_time, - completed=completed_jobs, - running=self.running, - queue=self.queue, - down_nodes=expand_ranges(self.down_nodes[1:]), + total_congestion = congestion_stats + self.net_congestion_history.append((self.current_timestep, total_congestion)) + # --- + + # System Power + if self.power_manager: # Power is always simulated + power_df, rack_power, total_power_kw, total_loss_kw, jobs_power = \ + self.power_manager.simulate_power(running_jobs=self.running, + scheduled_nodes=scheduled_nodes, + cpu_utils=cpu_utils, + gpu_utils=gpu_utils, + net_utils=net_utils) + + # Unclear what jobs_power is! + self.record_power_stats(time_delta=time_delta, + total_power_kw=total_power_kw, + total_loss_kw=total_loss_kw, + jobs_power=jobs_power) + else: + power_df = None + + # System Cooling + if self.cooling_model: + cooling_inputs, cooling_outputs = self.cooling_model.simulate_cooling(rack_power=rack_power, + engine=self) + else: + cooling_inputs, cooling_outputs = None, None + + # System total Flops + if self.flops_manager: + pflops, gflops_per_watt = self.flops_manager.simulate_flops(scheduled_nodes=scheduled_nodes, + cpu_util=cpu_utils, + gpu_util=gpu_utils, + total_power_kw=total_power_kw) + + # System Network + if self.network_model: + avg_tx, avg_rx, avg_net = compute_system_network_stats(net_utils=net_utils, + net_tx_list=net_tx_list, + net_rx_list=net_rx_list, + slowdown_factors=slowdown_factors + ) + slowdown_per_job = sum(slowdown_factors) / len(slowdown_factors) if len(slowdown_factors) != 0 else 0 + self.record_network_stats(avg_tx=avg_tx, + avg_rx=avg_rx, + avg_net=avg_net) + else: + avg_tx, avg_rx, avg_net = None, None, None + slowdown_per_job = 0 + + # Continue with System Simulation + + # Calculate node occupancy + node_occupancy = {node['id']: 0 for node in self.resource_manager.nodes} # Initialize even if no running jobs + for job in self.running: + if job.scheduled_nodes: + node_id = job.scheduled_nodes[0] # Assuming one node per job for multitenancy + node_occupancy[node_id] += 1 + + self.node_occupancy_history.append(node_occupancy) + + return TickReturn( power_df=power_df, p_flops=pflops, - g_flops_w=gflop_per_watt, - system_util=self.num_active_nodes / self.config['AVAILABLE_NODES'] * 100, + g_flops_w=gflops_per_watt, + system_util=system_util, fmu_inputs=cooling_inputs, fmu_outputs=cooling_outputs, - num_active_nodes=self.num_active_nodes, - num_free_nodes=self.num_free_nodes, + avg_net_tx=avg_tx, + avg_net_rx=avg_rx, + avg_net_util=avg_net, + slowdown_per_job=slowdown_per_job, + node_occupancy=node_occupancy, ) - self.current_time += 1 - return tick_data - - def run_simulation(self, jobs, timesteps, autoshutdown=False): + def prepare_system_state(self, *, all_jobs: List, timestep_start, timestep_end): + # Set engine timesteps + self.timestep_start = timestep_start + self.current_timestep = 
timestep_start + self.timestep_end = timestep_end + + # Modifies Jobs object + # Keep only jobs that have not yet ended and that have a chance to start + all_jobs[:] = [job for job in all_jobs if + job.submit_time < timestep_end + and ((job.end_time is not None + and job.end_time >= timestep_start) + or job.end_time is None) + ] + all_jobs.sort(key=lambda j: j.submit_time) + + self.add_running_jobs_to_queue(all_jobs) + # Set policy to replay and no backfill to get the original prefilled placement. + target_policy = self.scheduler.policy + self.scheduler.policy = PolicyType.REPLAY + target_bfpolicy = self.scheduler.bfpolicy + self.scheduler.bfpolicy = None + + # Now process job queue one by one (needed to get the start_time right!) + for job in self.queue[:]: # operate over a slice copy to be able to remove jobs from queue if placed. + self.scheduler.schedule([job], self.running, job.start_time, accounts=self.accounts, sorted=True) + self.queue.remove(job) + if self.replay and len(self.queue) != 0: + raise ValueError( + f"Something went wrong! Not all jobs could be placed!\nPotential confligt in queue:\n{self.queue}") + # Restore the target policy and backfill for the remainder of the simulation. + self.scheduler.policy = target_policy + self.scheduler.bfpolicy = target_bfpolicy + + def run_simulation(self, autoshutdown=False): """Generator that yields after each simulation tick.""" - self.timesteps = timesteps - - # Sort pending jobs by submit_time. - jobs_to_submit = sorted(jobs, key=lambda j: j['submit_time']) - - for timestep in range(timesteps): - # Identify eligible jobs and add them to the queue. - self.queue += self.eligible_jobs(jobs_to_submit) - # Sort the queue according to the policy - self.queue = self.scheduler.sort_jobs(self.queue, self.accounts) - # Schedule jobs that are now in the queue. - self.scheduler.schedule(self.queue, self.running, self.current_time, sorted=True) - - # Stop the simulation if no more jobs are running or in the queue. - if autoshutdown and not self.queue and not self.running and not self.replay: - print(f"[DEBUG] {self.config['system_name']} - Stopping simulation at time {self.current_time}") - break - - if self.debug and timestep % self.config['UI_UPDATE_FREQ'] == 0: + if self.scheduler.policy == PolicyType.REPLAY: + replay = True + else: + replay = False + + if self.debug: + print(f"[DEBUG] run_simulation: Initial jobs count: {len(self.jobs)}") + if self.jobs: + print("[DEBUG] run_simulation: First job submit_time: " + f"{self.jobs[0].submit_time}, start_time: {self.jobs[0].start_time}") + + # Set times and place jobs that are currently running, onto the system. 
+ self.prepare_system_state(all_jobs=self.jobs, + timestep_start=self.timestep_start, timestep_end=self.timestep_end, + ) + + # Process jobs in batches for better performance of timestep loop + all_jobs = self.jobs.copy() + submit_times = [j.submit_time for j in all_jobs] + cursor = 0 + + jobs = [] + # Batch Jobs into 6h windows based on submit_time or twice the time_delta if larger + batch_window = max(60 * 60 * 6, 2 * self.time_delta) # at least 6h + + sim_state = SimulationState(self.time_delta) + # listener_thread = threading.Thread(target=keyboard_listener, args=(sim_state,), daemon=True) + # listener_thread.start() + + while self.current_timestep < self.timestep_end: # Runs every second + + if sim_state.is_paused(): + time.sleep(0.1) + continue + + current_time_delta = sim_state.get_time_delta() + + if (self.current_timestep % batch_window == 0) or (self.current_timestep == self.timestep_start): + # Add jobs that are within the batching window and remove them from all jobs + # jobs += [job for job in all_jobs if job.submit_time <= self.current_timestep + batch_window] + # all_jobs[:] = [job for job in all_jobs if job.submit_time > self.current_timestep + batch_window] + cutoff = self.current_timestep + batch_window + r = bisect_right(submit_times, cutoff, lo=cursor) + if r > cursor: + jobs.extend(all_jobs[cursor:r]) + cursor = r + + # 1. Prepare Timestep: + completed_jobs, killed_jobs, need_reschedule = self.prepare_timestep(jobs=jobs) + + # 2. Identify eligible jobs and add them to the queue. + has_new_additions = self.add_eligible_jobs_to_queue(jobs) + need_reschedule = need_reschedule or has_new_additions + + # 3. Schedule jobs that are now in the queue. + if need_reschedule: + self.scheduler.schedule(self.queue, self.running, + self.current_timestep, + accounts=self.accounts, + sorted=(not has_new_additions)) + + if self.debug and self.current_timestep % self.config['UI_UPDATE_FREQ'] == 0: print(".", end="", flush=True) - yield self.tick() - - def get_stats(self): - """ Return output statistics """ - sum_values = lambda values: sum(x[1] for x in values) if values else 0 - min_value = lambda values: min(x[1] for x in values) if values else 0 - max_value = lambda values: max(x[1] for x in values) if values else 0 - num_samples = len(self.power_manager.history) if self.power_manager else 0 - - throughput = self.jobs_completed / self.timesteps * 3600 if self.timesteps else 0 # Jobs per hour - average_power_mw = sum_values(self.power_manager.history) / num_samples / 1000 if num_samples else 0 - average_loss_mw = sum_values(self.power_manager.loss_history) / num_samples / 1000 if num_samples else 0 - min_loss_mw = min_value(self.power_manager.loss_history) / 1000 if num_samples else 0 - max_loss_mw = max_value(self.power_manager.loss_history) / 1000 if num_samples else 0 - - loss_fraction = average_loss_mw / average_power_mw if average_power_mw else 0 - efficiency = 1 - loss_fraction if loss_fraction else 0 - total_energy_consumed = average_power_mw * self.timesteps / 3600 if self.timesteps else 0 # MW-hr - emissions = total_energy_consumed * 852.3 / 2204.6 / efficiency if efficiency else 0 - total_cost = total_energy_consumed * 1000 * self.config.get('POWER_COST', 0) # Total cost in dollars - - stats = { - 'num_samples': num_samples, - 'jobs completed': self.jobs_completed, - 'throughput': f'{throughput:.2f} jobs/hour', - 'jobs still running': [job.id for job in self.running], - 'jobs still in queue': [job.id for job in self.queue], - 'average power': f'{average_power_mw:.2f} MW', - 
'min loss': f'{min_loss_mw:.2f} MW', - 'average loss': f'{average_loss_mw:.2f} MW', - 'max loss': f'{max_loss_mw:.2f} MW', - 'system power efficiency': f'{efficiency * 100:.2f}%', - 'total energy consumed': f'{total_energy_consumed:.2f} MW-hr', - 'carbon emissions': f'{emissions:.2f} metric tons CO2', - 'total cost': f'${total_cost:.2f}' - } - - return stats + # 4. Run tick only at specified time_delta + if 0 == (self.current_timestep % current_time_delta): + tick_return = self.tick(time_delta=current_time_delta, replay=replay) + else: + pass + + # Yield TickData here! + yield TickData( + current_timestep=self.current_timestep, + completed=completed_jobs, + killed=killed_jobs, + running=self.running, + queue=self.queue, + down_nodes=self.down_nodes, + power_df=tick_return.power_df, + p_flops=tick_return.p_flops, + g_flops_w=tick_return.g_flops_w, + system_util=tick_return.system_util, + fmu_inputs=tick_return.fmu_inputs, + fmu_outputs=tick_return.fmu_outputs, + num_active_nodes=self.num_active_nodes, + num_free_nodes=self.num_free_nodes, + avg_net_rx=tick_return.avg_net_rx, + avg_net_tx=tick_return.avg_net_tx, + avg_net_util=tick_return.avg_net_util, + slowdown_per_job=tick_return.slowdown_per_job, + node_occupancy=tick_return.node_occupancy, + time_delta=self.time_delta + ) + + # 5. Complete the timestep + simulation_done = self.complete_timestep(actively_considered_jobs=jobs, + all_jobs=all_jobs, + replay=replay, + autoshutdown=autoshutdown, + cursor=cursor) + if simulation_done: + break def get_job_history_dict(self): return self.job_history_dict + + def get_scheduler_queue_history(self): + return self.scheduler_queue_history + + def get_scheduler_running_history(self): + return self.scheduler_running_history + + def record_util_stats(self, *, system_util): + self.sys_util_history.append((self.current_timestep, system_util)) + self.scheduler_queue_history.append(len(self.running)) + self.scheduler_running_history.append(len(self.queue)) + + def record_network_stats(self, *, + avg_tx, + avg_rx, + avg_net + ): + self.avg_net_tx.append(avg_tx) + self.avg_net_rx.append(avg_rx) + self.net_util_history.append(avg_net) + + def record_power_stats(self, *, time_delta, total_power_kw, total_loss_kw, jobs_power): + if (time_delta == 1 and self.current_timestep % self.config['POWER_UPDATE_FREQ'] == 0) or time_delta != 1: + # First job specific + record_power_stats_foreach_job(running_jobs=self.running, jobs_power=jobs_power) + # power manager + self.power_manager.history.append((self.current_timestep, total_power_kw)) + self.power_manager.loss_history.append((self.current_timestep, total_loss_kw)) + # engine + self.sys_power = total_power_kw diff --git a/raps/envs/raps_env.py b/raps/envs/raps_env.py new file mode 100644 index 0000000000000000000000000000000000000000..e4ca8eb9a6e6178865ba79cfed6285af45c22f84 --- /dev/null +++ b/raps/envs/raps_env.py @@ -0,0 +1,183 @@ +import gym +from gym import spaces +import numpy as np + +from raps.engine import Engine +from raps.stats import get_engine_stats, get_job_stats, get_scheduler_stats, get_network_stats + +from stable_baselines3.common.logger import Logger, HumanOutputFormat +import sys + +logger = Logger(folder=None, output_formats=[HumanOutputFormat(sys.stdout)]) + + +def print_stats(stats, step=0): + """prints SB3-style stats output""" + + wanted_keys = { + "time_simulated": "engine/Time Simulated", + "average_power": "engine/Average Power", + "system_power_efficiency": "engine/System Power Efficiency", + "total_energy_consumed": "engine/Total Energy 
Consumed", + "carbon_emissions": "engine/Carbon Footprint", + "jobs_completed": "jobs/Jobs Completed", + "throughput": "jobs/Throughput", + "jobs_still_running": "jobs/Jobs Still Running", + } + + for section in ["engine_stats", "job_stats"]: + if section in stats: + for k, v in stats[section].items(): + if k in wanted_keys: + if k == "jobs_still_running" and isinstance(v, list): + v = len(v) + logger.record(wanted_keys[k], v) + + logger.dump(step=step) + + +class RAPSEnv(gym.Env): + """ + Minimal Gym-compatible wrapper around RAPS Engine + for RL job scheduling experiments. + """ + + metadata = {"render.modes": ["human"]} + + def __init__(self, sim_config): + super().__init__() + # Store everything in self.args + self.sim_config = sim_config + self.engine = self._create_engine() + + # --- RL spaces --- + max_jobs = 100 + job_features = 4 # [nodes, runtime, priority, wait_time] + self.observation_space = spaces.Box( + low=0, high=1, shape=(max_jobs, job_features), dtype=np.float32 + ) + self.action_space = spaces.Discrete(max_jobs) + + def _create_engine(self): + engine = Engine(self.sim_config) + engine.scheduler.env = self + self.jobs = engine.jobs + self.generator = engine.run_simulation() + return engine + + def reset(self, **kwargs): + self.engine = self._create_engine() + obs = self._get_state() + return obs + + def _compute_reward(self, tick_data): + """ + Reward function for RL scheduling on Frontier-like systems. + Balances throughput and carbon footprint, using incremental values. + """ + + # How many jobs completed *this tick* + jobs_done = len(getattr(tick_data, "completed", [])) + + # Incremental carbon emitted this tick + carbon_step = getattr(self.engine, "carbon emissions", 0.0) + + # Tradeoff weights (tunable hyperparameters) + alpha = 10.0 # reward for finishing a job + beta = 0.1 # penalty per metric ton CO2 + + # Reward = (jobs * alpha) - (carbon * beta) + reward = (alpha * jobs_done) - (beta * carbon_step) + + # Small penalty if idle and no jobs complete + if jobs_done == 0 and carbon_step == 0: + reward -= 0.01 + + return reward + + def step(self, action): + if self.engine is None: + raise RuntimeError("Engine not initialized. 
Did you forget to call reset()?") + + queue = self.engine.queue + invalid_action = False + + # If queue empty or index out of range → invalid + if len(queue) == 0 or action >= len(queue): + invalid_action = True + else: + job = queue[int(action)] + available_nodes = self.engine.scheduler.resource_manager.available_nodes + + if job.nodes_required <= len(available_nodes): + # Just pick the first available node (simplest placement policy) + node_id = available_nodes[0] + self.engine.scheduler.place_job_and_manage_queues( + job, + queue, + self.engine.running, + self.engine.current_timestep, + node_id, + ) + else: + invalid_action = True + + # advance simulation by one tick + tick_data = next(self.generator) + + # compute reward + if invalid_action: + reward = -1.0 + else: + reward = self._compute_reward(tick_data) + + # clip reward + reward = np.clip(reward, -10.0, 10.0) + + # Print stats + stats = self.get_stats() + print_stats(stats) + + obs = self._get_state() + done = self.engine.current_timestep >= self.engine.timestep_end + info = {} + + print(f"t={self.engine.current_timestep}, " + f"queue={len(self.engine.queue)}, " + f"running={len(self.engine.running)}, " + f"completed={self.engine.jobs_completed}", + f"action={action}") + + return obs, reward, done, info + + def _get_state(self): + """Construct simple state representation from engine's job queue.""" + # Example: take waiting jobs (haven’t started yet) + job_queue = [j for j in self.jobs if getattr(j, "start_time", None) is None] + + max_jobs, job_features = self.observation_space.shape + state = np.zeros((max_jobs, job_features), dtype=np.float32) + + for i, job in enumerate(job_queue[:max_jobs]): + features = [ + getattr(job, "nodes_required", 0), + getattr(job, "wall_time", 0), + getattr(job, "priority", 0), + getattr(job, "wait_time", 0), # may need to compute from current_timestep - qdt + ] + state[i, : len(features)] = features + + return state + + def render(self, mode="human"): + print("Timestep:", self.engine.current_timestep, + "Utilization:", self.telemetry.utilization(), + "Power:", self.telemetry.power()) + + def get_stats(self): + return { + "engine_stats": get_engine_stats(self.engine), + "job_stats": get_job_stats(self.engine), + "scheduler_stats": get_scheduler_stats(self.engine), + "network_stats": get_network_stats(self.engine) + } diff --git a/raps/flops.py b/raps/flops.py index eebd0fa117c0243f509d70e7a87f940af17ad7f5..1546c5246f063411c8a113508336e4d2640c9ba9 100644 --- a/raps/flops.py +++ b/raps/flops.py @@ -1,6 +1,7 @@ import numpy as np from .utils import linear_to_3d_index + class FLOPSManager(): def __init__(self, **kwargs): @@ -9,40 +10,54 @@ class FLOPSManager(): self.flop_state = np.zeros(self.config['SC_SHAPE']) def update_flop_state(self, scheduled_nodes, cpu_util, gpu_util): + if len(scheduled_nodes) == 0: + return cpu_util = np.asarray(cpu_util) gpu_util = np.asarray(gpu_util) job_lengths = np.array([len(job) for job in scheduled_nodes]) - flattened_nodes = np.concatenate(scheduled_nodes, axis=0) + flattened_nodes = np.concatenate(scheduled_nodes, axis=0).astype(np.int64) cpu_util_flat = np.repeat(cpu_util, job_lengths) gpu_util_flat = np.repeat(gpu_util, job_lengths) node_indices = linear_to_3d_index(flattened_nodes, self.config['SC_SHAPE']) - if self.validate: # cpu_util is in fact node_Watts in this case total_peak = ( - self.config['CPU_FP_RATIO'] * self.config['CPU_PEAK_FLOPS'] + + self.config['CPU_FP_RATIO'] * self.config['CPU_PEAK_FLOPS'] + self.config['GPU_FP_RATIO'] * 
self.config['GPU_PEAK_FLOPS'] - ) + ) denominator = ( - self.config['POWER_CPU_MAX'] * self.config['CPUS_PER_NODE'] + - self.config['POWER_GPU_MAX'] * self.config['GPUS_PER_NODE'] + + self.config['POWER_CPU_MAX'] * self.config['CPUS_PER_NODE'] + + self.config['POWER_GPU_MAX'] * self.config['GPUS_PER_NODE'] + self.config['POWER_NIC'] * self.config['NICS_PER_NODE'] + self.config['POWER_NVME'] - ) + ) self.flop_state[node_indices] = total_peak * (cpu_util_flat / denominator) - else: + else: self.flop_state[node_indices] = ( self.config['CPU_FP_RATIO'] * cpu_util_flat * self.config['CPU_PEAK_FLOPS'] + self.config['GPU_FP_RATIO'] * gpu_util_flat * self.config['GPU_PEAK_FLOPS'] ) def get_rpeak(self): - node_peak_flops = self.config['CPUS_PER_NODE'] * self.config['CPU_PEAK_FLOPS'] \ - + self.config['GPUS_PER_NODE'] * self.config['GPU_PEAK_FLOPS'] + node_peak_flops = ( + self.config['CPUS_PER_NODE'] * self.config['CPU_PEAK_FLOPS'] + + self.config['GPUS_PER_NODE'] * self.config['GPU_PEAK_FLOPS'] + ) system_peak_flops = self.config['AVAILABLE_NODES'] * node_peak_flops return system_peak_flops def get_system_performance(self): return np.sum(self.flop_state) + + def simulate_flops(self, *, scheduled_nodes, cpu_util, gpu_util, total_power_kw): + self.update_flop_state(scheduled_nodes=scheduled_nodes, + cpu_util=cpu_util, + gpu_util=gpu_util) + pflops = self.get_system_performance() / 1E15 + if total_power_kw != 0: + gflops_per_watt = pflops * 1E6 / (total_power_kw * 1000) + else: + gflops_per_watt = 0 + return pflops, gflops_per_watt diff --git a/raps/helpers.py b/raps/helpers.py index bff066a9390be143be6d311751d81d72a46bc703..0e2e6540475c69fabdd4641688190689c0859bbd 100644 --- a/raps/helpers.py +++ b/raps/helpers.py @@ -1,9 +1,25 @@ import sys +import tomllib +from pathlib import Path + def check_python_version(): - # Check for the required Python version - required_major, required_minor = 3, 9 + # Load pyproject.toml + pyproject_path = Path(__file__).parent.parent / "pyproject.toml" + with open(pyproject_path, "rb") as f: + pyproject_data = tomllib.load(f) + + # Extract required python version (e.g., ">=3.11") + requires_python = pyproject_data["project"]["requires-python"] + + # Get the minimum major/minor from the string + # This assumes format like ">=3.11" + version_str = requires_python.lstrip(">=").strip() + required_major, required_minor = map(int, version_str.split(".")[:2]) + # Compare if sys.version_info < (required_major, required_minor): - sys.stderr.write(f"Error: RAPS requires Python {required_major}.{required_minor} or greater\n") + sys.stderr.write( + f"Error: RAPS requires Python {required_major}.{required_minor} or greater\n" + ) sys.exit(1) diff --git a/raps/job.py b/raps/job.py index c0b0e9be6dc69bbdce4ee1e3c0c9b324132f8de7..4d1dda01233023803938cc8d24e0091072842bfa 100644 --- a/raps/job.py +++ b/raps/job.py @@ -1,86 +1,253 @@ from enum import Enum +import numpy as np +from types import NoneType -def job_dict(nodes_required, name, account, cpu_trace, gpu_trace, ntx_trace, nrx_trace, \ - wall_time, end_state, scheduled_nodes, time_offset, job_id, priority=0, partition=0): +""" +Note: want to simplify this in the future to use a minimal required set of job attributes, +the standard workload format (swf) https://www.cs.huji.ac.il/labs/parallel/workload/swf.html + +Implementing such using something like: + + from types import SimpleNamespace + job = SimpleNamespace(**job_dict(...)) +""" + + +class JobState(Enum): + """Enumeration for job states.""" + RUNNING = 'R' + PENDING = 'PD' + 
COMPLETED = 'C' + COMPLETING = 'Cing' + CANCELLED = 'CA' + FAILED = 'F' + TIMEOUT = 'TO' + + +def job_dict(*, + nodes_required, + name, + account, + # Allocation + current_state=JobState.PENDING, + end_state: JobState | None = None, + scheduled_nodes=None, + id, + priority: int | None = 0, + partition: int | None = 0, + # Resource Requests and allocations + cpu_cores_required=0, + gpu_units_required=0, + allocated_cpu_cores=0, + allocated_gpu_units=0, + # Traces + cpu_trace, + gpu_trace, + ntx_trace, + nrx_trace, + # Times + submit_time=0, + time_limit: int = 0, + start_time: int | None = 0, + end_time: int | None = 0, + expected_run_time: int | None = 0, + current_run_time: int = 0, + trace_time: int | None = 0, + trace_start_time: int | None = 0, + trace_end_time: int | None = 0, + trace_quanta: int | None = None, + trace_missing_values: bool | None = False, + downscale: int = 1 + ): """ Return job info dictionary """ return { 'nodes_required': nodes_required, 'name': name, 'account': account, + # Allocation: + 'current_state': current_state, + 'end_state': end_state, + 'scheduled_nodes': scheduled_nodes, + 'id': id, + 'priority': priority, + 'partition': partition, + # Resource Requests and allocations: + 'cpu_cores_required': cpu_cores_required, + 'gpu_units_required': gpu_units_required, + 'allocated_cpu_cores': allocated_cpu_cores, + 'allocated_gpu_units': allocated_gpu_units, + # Traces: 'cpu_trace': cpu_trace, 'gpu_trace': gpu_trace, 'ntx_trace': ntx_trace, 'nrx_trace': nrx_trace, - 'wall_time': wall_time, - 'end_state': end_state, - 'requested_nodes': scheduled_nodes, - 'submit_time': time_offset, - 'id': job_id, - 'priority': priority, - 'partition': partition + # Times: + 'submit_time': submit_time, + 'time_limit': time_limit, + 'start_time': start_time, + 'end_time': end_time, + 'expected_run_time': expected_run_time, + 'current_run_time': current_run_time, + 'trace_time': trace_time, + 'trace_start_time': trace_start_time, + 'trace_end_time': trace_end_time, + 'trace_quanta': trace_quanta, + 'trace_missing_values': trace_missing_values, + 'dilated': False, + 'downscale': downscale } -class JobState(Enum): - """Enumeration for job states.""" - RUNNING = 'R' - PENDING = 'PD' - COMPLETED = 'C' - CANCELLED = 'CA' - FAILED = 'F' - TIMEOUT = 'TO' +def dilate_trace(trace, factor): + """ + Scale a trace in the time dimension by the given factor. + + Parameters: + - trace: list/tuple/np.ndarray of floats OR a single numeric scalar. + - factor (float): >1 to slow down (stretch in time), <1 to speed up. + + Returns: + - list of float for sequence inputs, or numeric for scalar inputs. 
+ """ + if trace is None: + return trace + + if factor is None: + raise ValueError("factor must be provided") + if factor == 0: + raise ValueError("factor must be non-zero") + + # Treat any numeric scalar (int/float/np.number) as a scalar trace + if isinstance(trace, (int, float, np.integer, np.floating, np.number)): + # Keep total "area" the same when stretching/compressing in time: + return trace / factor + + # Handle common sequence types directly + if isinstance(trace, (list, tuple, np.ndarray)): + arr = np.asarray(trace, dtype=float) + else: + # Last-resort: try coercion (e.g., pandas Series) + arr = np.asarray(trace, dtype=float) + + if arr.size == 0: + # empty sequence: nothing to do + return [] if not isinstance(trace, np.ndarray) else arr + + original_length = arr.size + # at least 1 sample after dilation + new_length = max(1, int(np.round(original_length * float(factor)))) + + # If original_length == 1, interpolation just repeats the value + old_indices = np.linspace(0, original_length - 1, num=original_length) + new_indices = np.linspace(0, original_length - 1, num=new_length) + + new_trace = np.interp(new_indices, old_indices, arr).tolist() + return new_trace class Job: """Represents a job to be scheduled and executed in the distributed computing system. Each job consists of various attributes such as the number of nodes required for execution, - CPU and GPU utilization, wall time, and other relevant parameters (see utils.job_dict). + CPU and GPU utilization, trace time, and other relevant parameters (see utils.job_dict). The job can transition through different states during its lifecycle, including PENDING, RUNNING, COMPLETED, CANCELLED, FAILED, or TIMEOUT. """ _id_counter = 0 - def __init__(self, job_dict, current_time, state=JobState.PENDING, account=None): + def __init__(self, job_dict, current_state=JobState.PENDING, end_state=None, account=None): + # # current_time unused! # Initializations: - self.start_time = None - self.end_time = None - self.running_time = 0 self.power = 0 - self.scheduled_nodes = [] + self.scheduled_nodes = [] # Explicit list of requested nodes + self.nodes_required = 0 # If scheduled_nodes is set this can be derived. + self.cpu_cores_required = 0 + self.gpu_units_required = 0 + self.allocated_cpu_cores = 0 + self.allocated_gpu_units = 0 self.power_history = [] - self._state = state + self._current_state = current_state + self.end_state = end_state # default None! self.account = account + # Times: + self.submit_time = None # Actual submit time + self.time_limit = None # Time limit set at submission + self.start_time = None # Actual start time when executing or from telemetry + self.end_time = None # Actual end time, either None if or from telemetry + self.expected_run_time = None + self.current_run_time = 0 + self.trace_time = None # Time period for which traces are available + self.trace_start_time = None # Relative start time of the trace (to running time) + self.trace_end_time = None # Relative end time of the trace + self.trace_quanta = None # Trace quanta associated with the job # None means single value! + self.current_run_time = 0 # Current running time updated when simulating + # If a job dict was given, override the values from the job_dict: for key, value in job_dict.items(): setattr(self, key, value) # In any case: provide a job_id! 
- if not self.id: + if self.id is None: # This is wrong self.id = Job._get_next_id() + if self.nodes_required == 0 and self.scheduled_nodes != []: + self.nodes_required = len(self.scheduled_nodes) + elif self.nodes_required != 0: + pass + else: + raise ValueError(f"{self.nodes_required} {self.scheduled_nodes}") + if self.scheduled_nodes == [] or self.scheduled_nodes is None or \ + (isinstance(self.scheduled_nodes, list) and isinstance(self.scheduled_nodes[0], int)) or \ + (isinstance(self.scheduled_nodes, np.ndarray) and isinstance(self.scheduled_nodes[0], int)): + pass # Type is ok + else: + raise ValueError( + f"type: self.scheduled_nodes:{type(self.scheduled_nodes)}, " + f"with {type(self.scheduled_nodes[0])}") + assert isinstance(self.submit_time, (int, float)) + assert isinstance(self.expected_run_time, (int, float, np.int64, np.double, NoneType)) + assert isinstance(self.current_run_time, (int, float, np.int64, np.double)) + assert isinstance(self.start_time, (int, float, np.int64, np.double, NoneType)) + assert isinstance(self.end_time, (int, float, np.int64, np.double, NoneType)) + if self.start_time is not None and self.end_time is not None: + assert self.start_time <= self.end_time, f"{self.start_time} <= {self.end_time}" + def __repr__(self): """Return a string representation of the job.""" return (f"Job(id={self.id}, name={self.name}, account={self.account}, " f"nodes_required={self.nodes_required}, " - f"cpu_trace={self.cpu_trace}, gpu_trace={self.gpu_trace}, wall_time={self.wall_time}, " - f"end_state={self.end_state}, requested_nodes={self.requested_nodes}, " - f"submit_time={self.submit_time}, start_time={self.start_time}, " - f"end_time={self.end_time}, running_time={self.running_time}, state={self._state}, " - f"scheduled_nodes={self.scheduled_nodes}, power={self.power}, " + f"scheduled_nodes={self.scheduled_nodes}, " + f"cpu_cores_required={self.cpu_cores_required}, " + f"gpu_units_required={self.gpu_units_required}, " + f"allocated_cpu_cores={self.allocated_cpu_cores}, " + f"allocated_gpu_units={self.allocated_gpu_units}, " + f"cpu_trace={self.cpu_trace}, gpu_trace={self.gpu_trace}, " + f"ntx_trace={self.ntx_trace}, nrx_trace={self.nrx_trace}, " + f"end_state={self.end_state}, " + f"current_state={self.current_state}, " + f"submit_time={self.submit_time}, time_limit={self.time_limit}, " + f"start_time={self.start_time}, end_time={self.end_time}, " + f"expected_run_time={self.expected_run_time}, " + f"current_run_time={self.current_run_time}, " + f"trace_time={self.trace_time}, " + f"trace_start_time={self.trace_start_time}, " + f"trace_end_time={self.trace_end_time}, " + f"trace_quanta={self.trace_quanta}, " + f"current_run_time={self.current_run_time}, " + f"power={self.power}, " f"power_history={self.power_history})") @property - def state(self): + def current_state(self): """Get the current state of the job.""" - return self._state + return self._current_state - @state.setter - def state(self, value): - """Set the state of the job.""" + @current_state.setter + def current_state(self, value): + """Set the current_state of the job.""" if isinstance(value, JobState): - self._state = value + self._current_state = value elif isinstance(value, str) and value in JobState.__members__: - self._state = JobState[value] + self._current_state = JobState[value] else: raise ValueError(f"Invalid state: {value}") @@ -99,25 +266,89 @@ class Job: return cls._id_counter def statistics(self): - """ Derive job statistics from the Job Class and return - """ + """ Derive job statistics from 
the Job Class and return """ return JobStatistics(self) + def apply_dilation(self, factor): + """ + Apply a dilation factor to the job’s execution traces and run time. + + Parameters: + - factor (float): the dilation factor; >1 to slow down (lengthen the traces) and <1 to speed up. + """ + self.cpu_trace = dilate_trace(self.cpu_trace, factor) + self.gpu_trace = dilate_trace(self.gpu_trace, factor) + self.ntx_trace = dilate_trace(self.ntx_trace, factor) + self.nrx_trace = dilate_trace(self.nrx_trace, factor) + if self.end_time is not None: + expected_run_time = self.end_time - self.start_time + expected_run_time = int(np.round(expected_run_time * factor)) + assert self.start_time is not None + self.end_time = self.start_time + expected_run_time + class JobStatistics: - """ - Reduced class for handling statistics after the job has finished. - """ + """ Reduced class for handling statistics after the job has finished. """ - def __init__(self,job): + def __init__(self, job): self.id = job.id self.name = job.name self.account = job.account self.num_nodes = len(job.scheduled_nodes) - self.run_time = job.running_time + self.scheduled_nodes = job.scheduled_nodes + self.run_time = job.current_run_time + self.submit_time = job.submit_time self.start_time = job.start_time self.end_time = job.end_time - self.state = job._state + self.current_state = job.current_state + if isinstance(job.cpu_trace, list) or isinstance(job.cpu_trace, np.ndarray): + if len(job.cpu_trace) == 0: + self.avg_cpu_usage = 0 + else: + self.avg_cpu_usage = sum(job.cpu_trace) / len(job.cpu_trace) + elif isinstance(job.cpu_trace, int) or isinstance(job.cpu_trace, float): + self.avg_cpu_usage = job.cpu_trace + elif job.cpu_trace is None: + self.avg_cpu_usage = None + else: + raise NotImplementedError() + + if isinstance(job.gpu_trace, list) or isinstance(job.gpu_trace, np.ndarray): + if len(job.gpu_trace) == 0: + self.avg_gpu_usage = 0 + else: + self.avg_gpu_usage = sum(job.gpu_trace) / len(job.gpu_trace) + elif isinstance(job.gpu_trace, int) or isinstance(job.gpu_trace, float): + self.avg_gpu_usage = job.gpu_trace + elif job.gpu_trace is None: + self.avg_gpu_usage = None + else: + raise NotImplementedError() + + if isinstance(job.ntx_trace, list) or isinstance(job.ntx_trace, np.ndarray): + if len(job.ntx_trace) == 0: + self.avg_ntx_usage = 0 + else: + self.avg_ntx_usage = sum(job.ntx_trace) / len(job.ntx_trace) + elif isinstance(job.ntx_trace, int) or isinstance(job.ntx_trace, float): + self.avg_ntx_usage = job.ntx_trace + elif job.ntx_trace is None: + self.avg_ntx_usage = None + else: + raise NotImplementedError() + + if isinstance(job.nrx_trace, list) or isinstance(job.nrx_trace, np.ndarray): + if len(job.nrx_trace) == 0: + self.avg_nrx_usage = 0 + else: + self.avg_nrx_usage = sum(job.nrx_trace) / len(job.nrx_trace) + elif isinstance(job.nrx_trace, int) or isinstance(job.nrx_trace, float): + self.avg_nrx_usage = job.nrx_trace + elif job.nrx_trace is None: + self.avg_nrx_usage = None + else: + raise NotImplementedError() + if len(job.power_history) == 0: self.avg_node_power = 0 self.max_node_power = 0 @@ -125,3 +356,62 @@ class JobStatistics: self.avg_node_power = sum(job.power_history) / len(job.power_history) / self.num_nodes self.max_node_power = max(job.power_history) / self.num_nodes self.energy = self.run_time * self.avg_node_power * self.num_nodes + + +if __name__ == "__main__": + import random + + # Each sample in the trace represents 15 seconds. 
+ trace_quanta = 15 # seconds per sample + expected_run_time = 600 # total job run time in seconds (600s = 10 minutes) + num_samples = expected_run_time // trace_quanta # should be 40 samples + + # Generate a random GPU trace (values between 0 and 4 for 4 GPUs total) + gpu_trace = [random.uniform(0, 4) for _ in range(num_samples)] + # Generate a random CPU trace (values between 0 and 1) + cpu_trace = [random.uniform(0, 1) for _ in range(num_samples)] + # Dummy network traces + ntx_trace = [random.uniform(0, 10) for _ in range(num_samples)] + nrx_trace = [random.uniform(0, 10) for _ in range(num_samples)] + + # Create a job dictionary using the existing job_dict helper. + jdict = job_dict( + nodes_required=1, + name="test_job", + account="test_account", + cpu_trace=cpu_trace, + gpu_trace=gpu_trace, + ntx_trace=ntx_trace, + nrx_trace=nrx_trace, + expected_run_time=expected_run_time, + end_state="", + scheduled_nodes=[], + time_offset=0, + job_id=0 + ) + + # Instantiate the Job. + job_instance = Job(jdict, current_time=0) + + # Print original job properties. + print("Original expected_run_time:", job_instance.expected_run_time) + print("Original cpu_trace length:", len(job_instance.cpu_trace)) + print("Original gpu_trace length:", len(job_instance.gpu_trace)) + + # Apply a dilation factor, e.g., 1.5 for a 50% slowdown (traces become 50% longer) + dilation_factor = 1.5 + job_instance.apply_dilation(dilation_factor) + + # Calculate the expected new lengths. + expected_samples = int(np.round(num_samples * dilation_factor)) + expected_run_time = int(np.round(expected_run_time * dilation_factor)) + + # Print the dilated job properties. + print("\nAfter applying a dilation factor of", dilation_factor) + print("New expected_run_time:", job_instance.expected_run_time, "(expected:", expected_run_time, ")") + print("New cpu_trace length:", len(job_instance.cpu_trace), "(expected:", expected_samples, ")") + print("New gpu_trace length:", len(job_instance.gpu_trace), "(expected:", expected_samples, ")") + + # Optionally, print a few sample values from the new traces. 
+ print("\nSample cpu_trace values:", job_instance.cpu_trace[:5]) + print("Sample gpu_trace values:", job_instance.gpu_trace[:5]) diff --git a/raps/multi_part_engine.py b/raps/multi_part_engine.py new file mode 100644 index 0000000000000000000000000000000000000000..6332aa5ac17afd5bed918d99ddc6a67734e8f6f9 --- /dev/null +++ b/raps/multi_part_engine.py @@ -0,0 +1,38 @@ +from collections.abc import Iterable +from raps.engine import Engine, TickData +from raps.sim_config import MultiPartSimConfig + + +class MultiPartEngine: + def __init__(self, sim_config: MultiPartSimConfig): + if sim_config.replay: + root_systems = set(s.system_name.split("/")[0] for s in sim_config.system_configs) + # TODO should consider how to pass separate replay values for separate systems + if len(root_systems) > 1: + raise ValueError("Replay for multi-system runs is not supported") + + engines: dict[str, Engine] = {} + + for partition in sim_config.system_configs: + engine = Engine(sim_config, partition=partition.system_name) + engines[partition.system_name] = engine + + total_initial_jobs = sum(len(e.jobs) for e in engines.values()) + for engine in engines.values(): + engine.total_initial_jobs = total_initial_jobs + + self.partition_names = sorted(engines.keys()) + self.engines = engines + first_engine = list(engines.values())[0] + self.start = first_engine.start + self.timestep_start = first_engine.timestep_start + self.timestep_end = first_engine.timestep_end + + def run_simulation(self) -> Iterable[dict[str, TickData | None]]: + generators = [] + for part in self.partition_names: + generators.append(self.engines[part].run_simulation()) + for tick_datas in zip(*generators, strict=True): + yield dict(zip(self.partition_names, tick_datas)) + + # TODO need to add a mode to run the partitions in parallel diff --git a/raps/network.py b/raps/network.py deleted file mode 100644 index ddcfbc110c353c6d89c2c9e10abf6a43f91c0840..0000000000000000000000000000000000000000 --- a/raps/network.py +++ /dev/null @@ -1,8 +0,0 @@ -TX_MAX = 10000 -RX_MAX = 20000 - -def network_utilization(tx, rx): - """Compute average network utilization""" - tx_util = min(tx / TX_MAX, 1.0) # Clamp to 1.0 - rx_util = min(rx / RX_MAX, 1.0) - return (tx_util + rx_util) / 2.0 diff --git a/raps/network/__init__.py b/raps/network/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3522b39fa38e0cb94f7e5e77780cdca6a4ac0e7d --- /dev/null +++ b/raps/network/__init__.py @@ -0,0 +1,181 @@ +import os +import warnings + +from .base import ( + all_to_all_paths, + apply_job_slowdown, + compute_system_network_stats, + link_loads_for_job, + network_congestion, + network_slowdown, + network_utilization, + worst_link_util, + get_link_util_stats, + simulate_inter_job_congestion, + max_throughput_per_tick, +) + +from .fat_tree import build_fattree, node_id_to_host_name, subsample_hosts +from .torus3d import build_torus3d, link_loads_for_job_torus, torus_host_from_real_index +from .dragonfly import build_dragonfly, dragonfly_node_id_to_host_name, build_dragonfly_idx_map +from raps.plotting import plot_fattree_hierarchy, plot_dragonfly, plot_torus2d, plot_torus3d + +from raps.utils import get_current_utilization + +__all__ = [ + "NetworkModel", + "apply_job_slowdown", + "compute_system_network_stats", + "network_congestion", + "network_utilization", + "network_slowdown", + "all_to_all_paths", + "link_loads_for_job", + "worst_link_util", + "build_fattree", + "build_torus3d", + "build_dragonfly", + "dragonfly_node_id_to_host_name", + 
"simulate_inter_job_congestion", + "max_throughput_per_tick", + "get_link_util_stats", +] + + +class NetworkModel: + def __init__(self, *, available_nodes, config, **kwargs): + self.config = config + self.topology = config.get("TOPOLOGY") + self.max_link_bw = config.get("NETWORK_MAX_BW", 1e9) # default safeguard + self.real_to_fat_idx = kwargs.get("real_to_fat_idx", {}) + + if self.topology == "fat-tree": + total_nodes = config['TOTAL_NODES'] - len(config['DOWN_NODES']) + self.fattree_k = config.get("FATTREE_K") + self.net_graph = build_fattree(self.fattree_k, total_nodes) + # TODO: future testing of subsampling feature + #self.net_graph = subsample_hosts(self.net_graph, num_hosts=4626) + + elif self.topology == "torus3d": + dims = ( + int(config["TORUS_X"]), + int(config["TORUS_Y"]), + int(config["TORUS_Z"]) + ) + wrap = bool(config.get("TORUS_WRAP", True)) + hosts_per_router = int(config.get("HOSTS_PER_ROUTER", config.get("hosts_per_router", 1))) + + # Build the graph and metadata + self.net_graph, self.meta = build_torus3d(dims, wrap, hosts_per_router=hosts_per_router) + + # Deterministic numeric → host mapping + X, Y, Z = self.meta["dims"] + self.id_to_host = {} + nid = 0 + for x in range(X): + for y in range(Y): + for z in range(Z): + for i in range(hosts_per_router): + h = f"h_{x}_{y}_{z}_{i}" + self.id_to_host[nid] = h + nid += 1 + + elif self.topology == "dragonfly": + D = self.config["DRAGONFLY_D"] + A = self.config["DRAGONFLY_A"] + P = self.config["DRAGONFLY_P"] + self.net_graph = build_dragonfly(D, A, P) + + # total nodes seen by scheduler or job trace + total_real_nodes = getattr(self, "available_nodes", None) + if total_real_nodes is None: + total_real_nodes = 4626 # fallback for Lassen + + # if available_nodes is a list, take its length + if not isinstance(total_real_nodes, int): + total_real_nodes = len(total_real_nodes) + + self.real_to_fat_idx = build_dragonfly_idx_map(D, A, P, total_real_nodes) + print(f"[DEBUG] Dragonfly mapping: {len(self.real_to_fat_idx)} entries") + + elif self.topology == "capacity": + # Capacity-only model: no explicit graph + self.net_graph = None + + else: + raise ValueError(f"Unsupported topology: {self.topology}") + + def simulate_network_utilization(self, *, job, debug=False): + net_util = net_cong = net_tx = net_rx = 0 + max_throughput = self.max_link_bw * job.trace_quanta + + if job.nodes_required <= 1: + # Single node job, skip network impact + return net_util, net_cong, net_tx, net_rx, max_throughput + + net_tx = get_current_utilization(job.ntx_trace, job) + net_rx = get_current_utilization(job.nrx_trace, job) + net_util = network_utilization(net_tx, net_rx, max_throughput) + + if self.topology == "fat-tree": + host_list = [node_id_to_host_name(n, self.fattree_k) for n in job.scheduled_nodes] + loads = link_loads_for_job(self.net_graph, host_list, net_tx) + net_cong = worst_link_util(loads, max_throughput) + if debug: + print(" fat-tree hosts:", host_list) + + elif self.topology == "dragonfly": + D = self.config["DRAGONFLY_D"] + A = self.config["DRAGONFLY_A"] + P = self.config["DRAGONFLY_P"] + # Directly use mapped host names + host_list = [self.real_to_fat_idx[real_n] for real_n in job.scheduled_nodes] + if debug: + print(" dragonfly hosts:", host_list) + print("Example nodes in graph:", list(self.net_graph.nodes)[:10]) + print("Contains h_0_9_0?", "h_0_9_0" in self.net_graph) + loads = link_loads_for_job(self.net_graph, host_list, net_tx) + net_cong = worst_link_util(loads, max_throughput) + + elif self.topology == "torus3d": + X = 
self.config["TORUS_X"] + Y = self.config["TORUS_Y"] + Z = self.config["TORUS_Z"] + hosts_per_router = self.config["HOSTS_PER_ROUTER"] + #host_list = [self.id_to_host[n] for n in job.scheduled_nodes] + host_list = [ + torus_host_from_real_index(n, X, Y, Z, hosts_per_router) + for n in job.scheduled_nodes + ] + loads = link_loads_for_job_torus(self.net_graph, self.meta, host_list, net_tx) + net_cong = worst_link_util(loads, max_throughput) + if debug: + print(" torus3d hosts:", host_list) + + elif self.topology == "capacity": + net_cong = network_congestion(net_tx, net_rx, max_throughput) + + else: + raise ValueError(f"Unsupported topology: {self.topology}") + + return net_util, net_cong, net_tx, net_rx, max_throughput + + def plot_topology(self, output_dir): + """Plot network topology - save as png file in output_dir.""" + if output_dir: + if self.topology == "fat-tree": + save_path = output_dir / "net-fat-tree.png" + plot_fattree_hierarchy(self.net_graph, k=self.fattree_k, save_path=save_path) + elif self.topology == "dragonfly": + save_path = output_dir / "net-dragonfly.png" + plot_dragonfly(self.net_graph, save_path=save_path) + elif self.topology == "torus3d": + save_path = output_dir / "net-torus2d.png" + plot_torus2d(self.net_graph, save_path=save_path) + save_path = output_dir / "net-torus3d.png" + plot_torus3d(self.net_graph, save_path=save_path) + else: + warnings.warn( + f"plotting not supported for {self.topology} topology", + UserWarning + ) diff --git a/raps/network/base.py b/raps/network/base.py new file mode 100644 index 0000000000000000000000000000000000000000..3f3daeb953a182c0e0574c0da0f44aa50785abe7 --- /dev/null +++ b/raps/network/base.py @@ -0,0 +1,222 @@ +import networkx as nx +import numpy as np +from raps.utils import get_current_utilization +from raps.network.fat_tree import node_id_to_host_name +from raps.network.torus3d import link_loads_for_job_torus, torus_host_from_real_index + + +def debug_print_trace(job, label: str = ""): + """Print either the length (if iterable) or the value of job.gpu_trace.""" + if hasattr(job.gpu_trace, "__len__"): + print(f"length of {len(job.gpu_trace)} {label}") + else: + print(f"gpu_trace value {job.gpu_trace} {label}") + + +def apply_job_slowdown(*, job, max_throughput, net_util, net_cong, net_tx, net_rx, debug: bool = False): + # Get the maximum allowed bandwidth from the configuration. + if net_cong > 1: + if debug: + print(f"congested net_cong: {net_cong}, max_throughput: {max_throughput}") + debug_print_trace(job, "before dilation") + + throughput = net_tx + net_rx + slowdown_factor = network_slowdown(throughput, max_throughput) + + if debug: + print("***", hasattr(job, "dilated"), throughput, max_throughput, slowdown_factor) + + # Only apply slowdown once per job to avoid compounding the effect. 
+ if not job.dilated: + if debug: + print(f"Applying slowdown factor {slowdown_factor:.2f} to job {job.id} due to network congestion") + job.apply_dilation(slowdown_factor) + job.dilated = True + if debug: + debug_print_trace(job, "after dilation") + else: + slowdown_factor = 1 + job.slowdown_factor = slowdown_factor + + return slowdown_factor + + +def compute_system_network_stats(net_utils, net_tx_list, net_rx_list, slowdown_factors): + + # Compute network averages + n = len(net_utils) or 1 + avg_tx = sum(net_tx_list) / n + avg_rx = sum(net_rx_list) / n + avg_net = sum(net_utils) / n + # avg_slowdown_per_job = sum(slowdown_factors) / n + # self.avg_slowdown_history.append(avg_slowdown_per_job) + # max_slowdown_per_job = max(slowdown_factors) + # self.max_slowdown_history.append(max_slowdown_per_job) + + return avg_tx, avg_rx, avg_net + + +def network_congestion(tx, rx, max_throughput): + """ + Overload factor ≥0: average of send/recv NOT clamped. + >1.0 means you’re pushing above capacity. + """ + tx_util = float(tx) / max_throughput + rx_util = float(rx) / max_throughput + return (tx_util + rx_util) / 2.0 + + +def network_utilization(tx, rx, max_throughput): + """ + True utilization in [0,1]: average of send/recv clamped to 100%. + """ + tx_u = min(float(tx) / max_throughput, 1.0) + rx_u = min(float(rx) / max_throughput, 1.0) + return (tx_u + rx_u) / 2.0 + + +def network_slowdown(current_throughput, max_throughput): + """ + Calculate a slowdown factor based on current network bandwidth usage. + + If current_bw is within limits, the factor is 1.0 (no slowdown). + If current_bw exceeds max_bw, the factor is current_bw/max_bw. + """ + if current_throughput <= max_throughput: + return 1.0 + else: + return current_throughput / max_throughput + + +def all_to_all_paths(G, hosts): + """ + Given a list of host names, return shortest‐paths for every unordered pair. + """ + paths = [] + for i in range(len(hosts)): + for j in range(i + 1, len(hosts)): + src, dst = hosts[i], hosts[j] + p = nx.shortest_path(G, src, dst) + paths.append((src, dst, p)) + return paths + + +def link_loads_for_job(G, job_hosts, tx_volume_bytes): + """ + Distribute tx_volume_bytes from each host equally to all its peers; + accumulate per-link loads and return a dict {(u,v):bytes, …}. + """ + paths = all_to_all_paths(G, job_hosts) + loads = {edge: 0.0 for edge in G.edges()} + # each host sends tx_volume_bytes to each of the (N-1) peers + for src in job_hosts: + if len(job_hosts) >= 2: + per_peer = tx_volume_bytes / (len(job_hosts) - 1) + else: + per_peer = 0 + # find paths where src is the sender + for s, d, p in paths: + if s != src: + continue + # add per_peer to every link on p + for u, v in zip(p, p[1:]): + # ensure ordering matches loads keys + edge = (u, v) if (u, v) in loads else (v, u) + loads[edge] += per_peer + return loads + + +def worst_link_util(loads, throughput): + """ + Given loads in **bytes** and capacity in **bits/sec**, convert: + util = (bytes * 8) / throughput + Return the maximum util over all links. + """ + max_util = 0.0 + for edge, byte_load in loads.items(): + util = (byte_load * 8) / throughput + if util > max_util: + max_util = util + return max_util + + +def get_link_util_stats(loads, throughput, top_n=10): + """ + Calculates a distribution of link utilization stats. + Returns a dictionary with min, mean, max, std_dev, and top N congested links. 
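+    'top_links' is a list of ((u, v), utilization) pairs sorted from most to
+    least congested; utilization is computed as (byte_load * 8) / throughput.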
+ """ + if not loads: + return {'max': 0, 'mean': 0, 'min': 0, 'std_dev': 0, 'top_links': []} + + # Calculate utilization for every link + utilizations = {(edge): (byte_load * 8) / throughput for edge, byte_load in loads.items()} + + util_values = list(utilizations.values()) + + stats = { + 'max': np.max(util_values), + 'mean': np.mean(util_values), + 'min': np.min(util_values), + 'std_dev': np.std(util_values) + } + + # Get top N congested links + sorted_links = sorted(utilizations.items(), key=lambda item: item[1], reverse=True) + stats['top_links'] = sorted_links[:top_n] + + return stats + + +def max_throughput_per_tick(legacy_cfg: dict, trace_quanta: int) -> float: + """Return bytes-per-tick throughput of a single link.""" + bw = legacy_cfg.get("NETWORK_MAX_BW") or 12.5e9 + return float(bw) * trace_quanta + + +def simulate_inter_job_congestion(network_model, jobs, legacy_cfg, debug=False): + """ + Simulates network congestion from a list of concurrently running jobs. + """ + if not network_model.net_graph: + print("[WARN] Network graph is not defined. Skipping congestion simulation.") + return 0.0 + + total_loads = {tuple(sorted(edge)): 0.0 for edge in network_model.net_graph.edges()} + trace_quanta = jobs[0].trace_quanta if jobs else 0 + + for job in jobs: + # Assuming job.current_run_time is 0 for this static simulation + job.current_run_time = 0 + job.trace_start_time = 0 + net_tx = get_current_utilization(job.ntx_trace, job) + + job_loads = {} + if network_model.topology in ("fat-tree", "dragonfly"): + if network_model.topology == "fat-tree": + k = int(legacy_cfg.get("FATTREE_K", 32)) + host_list = [node_id_to_host_name(n, k) for n in job.scheduled_nodes] + else: # dragonfly + host_list = [network_model.real_to_fat_idx[real_n] for real_n in job.scheduled_nodes] + + job_loads = link_loads_for_job(network_model.net_graph, host_list, net_tx) + + elif network_model.topology == "torus3d": + X = int(legacy_cfg.get("TORUS_X", 12)) + Y = int(legacy_cfg.get("TORUS_Y", 12)) + Z = int(legacy_cfg.get("TORUS_Z", 12)) + hosts_per_router = int(legacy_cfg.get("HOSTS_PER_ROUTER", 1)) + host_list = [ + torus_host_from_real_index(n, X, Y, Z, hosts_per_router) + for n in job.scheduled_nodes + ] + job_loads = link_loads_for_job_torus(network_model.net_graph, network_model.meta, host_list, net_tx) + + for edge, load in job_loads.items(): + edge_key = tuple(sorted(edge)) + if edge_key in total_loads: + total_loads[edge_key] += load + + max_throughput = max_throughput_per_tick(legacy_cfg, trace_quanta) + net_stats = get_link_util_stats(total_loads, max_throughput) + + return net_stats diff --git a/raps/network/dragonfly.py b/raps/network/dragonfly.py new file mode 100644 index 0000000000000000000000000000000000000000..0f29b68c556571a0193241c938d198f20b167152 --- /dev/null +++ b/raps/network/dragonfly.py @@ -0,0 +1,144 @@ +import networkx as nx +from itertools import combinations + + +import networkx as nx + +def build_dragonfly(d, a, p): + """ + Build a Dragonfly network graph. 
+ d = routers per group + a = global connections per router + p = compute nodes per router + """ + G = nx.Graph() + num_groups = a + 1 # standard Dragonfly rule + + # --- Routers and hosts --- + for g in range(num_groups): + for r in range(d): + router = f"r_{g}_{r}" + G.add_node(router, layer="router", group=g) + + # attach p hosts to each router + for h in range(p): + host = f"h_{g}_{r}_{h}" + G.add_node(host, layer="host", group=g) + G.add_edge(router, host) + + # --- Intra-group full mesh --- + for g in range(num_groups): + routers = [f"r_{g}_{r}" for r in range(d)] + for i in range(d): + for j in range(i + 1, d): + G.add_edge(routers[i], routers[j]) + + # --- Inter-group (global) links --- + for g in range(num_groups): + for r in range(d): + src = f"r_{g}_{r}" + for offset in range(1, a + 1): + dst_group = (g + offset) % num_groups + dst = f"r_{dst_group}_{r % d}" + G.add_edge(src, dst) + + return G + + +def build_dragonfly2(D: int, A: int, P: int) -> nx.Graph: + """ + Build a “simple” k-ary Dragonfly with: + D = # of groups + A = # of routers per group + P = # of hosts (endpoints) per router + + Naming convention: + - Router nodes: "r_{g}_{r}" with g ∈ [0..D−1], r ∈ [0..A−1] + - Host nodes: "h_{g}_{r}_{p}" with p ∈ [0..P−1] + + Topology: + 1. All routers within a group form a full clique. + 2. Each router r in group g has exactly one “global link” to router r in each other group. + 3. Each router r in group g attaches to P hosts ("h_{g}_{r}_{0..P−1}"). + + Examples + -------- + >>> from raps.plotting import plot_network_graph + >>> G = build_dragonfly(D=2, A=2, P=2) + >>> plot_network_graph(G, 'dragonfly.png') + """ + G = nx.Graph() + + # 1) Create all router nodes + for g in range(D): + for r in range(A): + router = f"r_{g}_{r}" + G.add_node(router, type="router", group=g, index=r) + + # 2) Intra‐group full mesh of routers + for g in range(D): + routers_in_group = [f"r_{g}_{r}" for r in range(A)] + for u, v in combinations(routers_in_group, 2): + G.add_edge(u, v) + + # 3) Inter‐group “one‐to‐one” global links + # (router index r in group g → router index r in group g2) + for g1 in range(D): + for g2 in range(g1 + 1, D): + for r in range(A): + u = f"r_{g1}_{r}" + v = f"r_{g2}_{r}" + G.add_edge(u, v) + + # 4) Attach hosts to each router + for g in range(D): + for r in range(A): + router = f"r_{g}_{r}" + for p in range(P): + host = f"h_{g}_{r}_{p}" + G.add_node(host, type="host", group=g, router=r, index=p) + G.add_edge(router, host) + + return G + + +def dragonfly_node_id_to_host_name(fat_idx: int, D: int, A: int, P: int) -> str: + """ + Convert a contiguous Dragonfly host index to its hierarchical name. + + For a Dragonfly with: + D routers per group, + A global links per router ⇒ num_groups = A + 1, + P compute nodes per router. + + Hosts are laid out in contiguous order: + group g = floor(fat_idx / (D * P)) + router r = (fat_idx // P) % D + host h = fat_idx % P + """ + num_groups = A + 1 + total_hosts = num_groups * D * P + assert 0 <= fat_idx < total_hosts, f"fat_idx {fat_idx} out of range (max {total_hosts-1})" + + group = fat_idx // (D * P) + router = (fat_idx // P) % D + host = fat_idx % P + return f"h_{group}_{router}_{host}" + + +def build_dragonfly_idx_map(d: int, a: int, p: int, total_real_nodes: int) -> dict[int, str]: + """ + Build a mapping {real_node_index: host_name} for Dragonfly. + Wrap around if total_real_nodes > total_hosts. 
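+
+    Examples
+    --------
+    >>> m = build_dragonfly_idx_map(d=4, a=3, p=2, total_real_nodes=40)
+    >>> m[0], m[10], m[33]   # 4 groups * 4 routers * 2 hosts = 32 hosts, so 33 wraps to 1
+    ('h_0_0_0', 'h_1_1_0', 'h_0_0_1')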
+ """ + num_groups = a + 1 + total_hosts = num_groups * d * p + + mapping = {} + for i in range(total_real_nodes): + fat_idx = i % total_hosts # <- wrap safely + group = fat_idx // (d * p) + router = (fat_idx // p) % d + host = fat_idx % p + mapping[i] = f"h_{group}_{router}_{host}" + return mapping diff --git a/raps/network/fat_tree.py b/raps/network/fat_tree.py new file mode 100644 index 0000000000000000000000000000000000000000..c514b836b7a632d7f3f43e010ca2c6e29b50d907 --- /dev/null +++ b/raps/network/fat_tree.py @@ -0,0 +1,82 @@ +import random +import networkx as nx + + +def node_id_to_host_name(node_id: int, k: int) -> str: + """ + Convert an integer node id to the host name string in the fat-tree. + Node IDs are assumed to be contiguous, mapping to h_{pod}_{edge}_{i}. + """ + # need to match the scheme from build_fattree + pod = node_id // (k * k // 4) + edge = (node_id % (k * k // 4)) // (k // 2) + host = node_id % (k // 2) + return f"h_{pod}_{edge}_{host}" + + +def build_fattree(k, total_nodes): + """ + Build a k-ary fat-tree: + - k pods + - each pod has k/2 edge switches, k/2 agg switches + - core layer has (k/2)^2 core switches + - each edge switch connects to k/2 hosts + Returns a NetworkX Graph where: + - hosts are named "h_{pod}_{edge}_{i}" + - edge switches "e_{pod}_{edge}" + - agg switches "a_{pod}_{agg}" + - core switches "c_{i}_{j}" + + Examples + -------- + >>> from raps.plotting import plot_network_graph + >>> G = build_fattree(k=4, total_nodes=16) + >>> plot_network_graph(G, 'fat_tree.png') + """ + num_hosts = (k**3) // 4 + if num_hosts < total_nodes: + raise ValueError( + f"Fat-tree network with k={k} has {num_hosts} hosts, but the system has {total_nodes} nodes. " + f"Please increase the value of 'fattree_k' in the system configuration file." + ) + G = nx.Graph() + # core + # num_core = (k//2)**2 # Unused! + for i in range(k // 2): + for j in range(k // 2): + core = f"c_{i}_{j}" + G.add_node(core, type="core") + # pods + for pod in range(k): + # agg switches + for agg in range(k // 2): + a = f"a_{pod}_{agg}" + G.add_node(a, type="agg") + # connect to all core switches in column agg + for i in range(k // 2): + core = f"c_{agg}_{i}" + G.add_edge(a, core) + # edge switches + hosts + for edge in range(k // 2): + e = f"e_{pod}_{edge}" + G.add_node(e, type="edge") + # connect edge→each agg in this pod + for agg in range(k // 2): + a = f"a_{pod}_{agg}" + G.add_edge(e, a) + # connect hosts + for h in range(k // 2): + host = f"h_{pod}_{edge}_{h}" + G.add_node(host, type="host") + G.add_edge(e, host) + return G + + +def subsample_hosts(G, num_hosts): + """Reduce the number of host nodes in the FatTree graph to match system size.""" + hosts = [n for n in G if n.startswith("h")] + if num_hosts < len(hosts): + keep = set(random.sample(hosts, num_hosts)) + remove = [n for n in hosts if n not in keep] + G.remove_nodes_from(remove) + return G diff --git a/raps/network/torus3d.py b/raps/network/torus3d.py new file mode 100644 index 0000000000000000000000000000000000000000..b88e1d2aba4f9b13ed7612107ebb42c6b52ac478 --- /dev/null +++ b/raps/network/torus3d.py @@ -0,0 +1,194 @@ +import csv +import networkx as nx +from pathlib import Path + + +def build_torus3d( + dims, + wrap=True, + hosts_per_router: int = 1, + torus_link_bw: float = None, + latency_per_hop: float = None, + network_max_bw: float = None, +): + """ + Build a 3D torus network (routers + hosts). + Each router r_x_y_z connects to 6 neighbors (±X, ±Y, ±Z) + and attaches hosts h_x_y_z_p for p ∈ [0..hosts_per_router-1]. 
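+    For example, dims=(2, 2, 2) with hosts_per_router=1 yields 8 routers
+    (r_0_0_0 ... r_1_1_1) and 8 hosts (h_0_0_0_0 ... h_1_1_1_0).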
+ + Returns: + (G, meta) where: + - G: networkx.Graph + - meta: dict with topology info for plotting/simulation + """ + X, Y, Z = dims + G = nx.Graph() + + # --- Add routers with normalized coordinates --- + for x in range(X): + for y in range(Y): + for z in range(Z): + name = f"r_{x}_{y}_{z}" + G.add_node( + name, + type="router", + x=x / (X - 1 if X > 1 else 1), + y=y / (Y - 1 if Y > 1 else 1), + z=z / (Z - 1 if Z > 1 else 1), + ) + + # --- Add wrap-around router-to-router edges --- + for x in range(X): + for y in range(Y): + for z in range(Z): + src = f"r_{x}_{y}_{z}" + + nx_ = (x + 1) % X if wrap else x + 1 + if nx_ < X: + G.add_edge( + src, f"r_{nx_}_{y}_{z}", + bandwidth=torus_link_bw, + latency=latency_per_hop, + type="router_link" + ) + + ny_ = (y + 1) % Y if wrap else y + 1 + if ny_ < Y: + G.add_edge( + src, f"r_{x}_{ny_}_{z}", + bandwidth=torus_link_bw, + latency=latency_per_hop, + type="router_link" + ) + + nz_ = (z + 1) % Z if wrap else z + 1 + if nz_ < Z: + G.add_edge( + src, f"r_{x}_{y}_{nz_}", + bandwidth=torus_link_bw, + latency=latency_per_hop, + type="router_link" + ) + + # --- Add hosts and host-router edges --- + for x in range(X): + for y in range(Y): + for z in range(Z): + router = f"r_{x}_{y}_{z}" + for p in range(hosts_per_router): + host = f"h_{x}_{y}_{z}_{p}" + G.add_node( + host, + type="host", + x=(x + 0.1) / (X - 1 if X > 1 else 1), + y=(y + 0.1) / (Y - 1 if Y > 1 else 1), + z=(z + 0.1 * (p + 1)) / (Z - 1 if Z > 1 else 1), + ) + G.add_edge( + host, router, + bandwidth=network_max_bw, + latency=latency_per_hop, + type="host_link" + ) + + # --- Build host <-> router mappings for simulator use --- + host_to_router = {} + router_to_hosts = {} + + for x in range(X): + for y in range(Y): + for z in range(Z): + router = f"r_{x}_{y}_{z}" + router_to_hosts[router] = [] + for p in range(hosts_per_router): + host = f"h_{x}_{y}_{z}_{p}" + host_to_router[host] = router + router_to_hosts[router].append(host) + + meta = { + "topology": "torus3d", + "dims": (X, Y, Z), + "hosts_per_router": hosts_per_router, + "wrap": wrap, + "num_routers": X * Y * Z, + "num_hosts": X * Y * Z * hosts_per_router, + "host_to_router": host_to_router, + "router_to_hosts": router_to_hosts, + } + + print(f"Built 3D torus with {meta['num_routers']} routers and {meta['num_hosts']} hosts.") + return G, meta + + +def _axis_steps(a, b, n, wrap=True): + """Return minimal step sequence along one axis from a to b with wrap-around.""" + if a == b: + return [] + fwd = (b - a) % n + back = (a - b) % n + if not wrap: + step = 1 if b > a else -1 + return [step] * abs(b - a) + if fwd <= back: + return [1] * fwd + else: + return [-1] * back + + +def torus_route_xyz(src_r, dst_r, dims, wrap=True): + """Router-level path (list of router names) using XYZ dimension-order routing.""" + X, Y, Z = dims + + def parse(r): + _, x, y, z = r.split("_") + return int(x), int(y), int(z) + + x1, y1, z1 = parse(src_r) + x2, y2, z2 = parse(dst_r) + + path = [src_r] + x, y, z = x1, y1, z1 + for step in _axis_steps(x, x2, X, wrap): + x = (x + step) % X + path.append(f"r_{x}_{y}_{z}") + for step in _axis_steps(y, y2, Y, wrap): + y = (y + step) % Y + path.append(f"r_{x}_{y}_{z}") + for step in _axis_steps(z, z2, Z, wrap): + z = (z + step) % Z + path.append(f"r_{x}_{y}_{z}") + return path + + +def torus_host_path(G, meta, h_src, h_dst): + r_src = meta["host_to_router"][h_src] + r_dst = meta["host_to_router"][h_dst] + routers = torus_route_xyz(r_src, r_dst, meta["dims"], meta["wrap"]) + # host->src_router + (router path) + 
dst_router->host + path = [h_src, r_src] + routers[1:] + [h_dst] + return path + + +def link_loads_for_job_torus(G, meta, host_list, traffic_bytes): + # all-to-all between hosts in host_list, route via torus_host_path, add traffic_bytes per pair + loads = {} + n = len(host_list) + for i in range(n): + for j in range(i + 1, n): + p = torus_host_path(G, meta, host_list[i], host_list[j]) + for u, v in zip(p, p[1:]): + e = tuple(sorted((u, v))) + loads[e] = loads.get(e, 0) + traffic_bytes + return loads + + +def torus_host_from_real_index(real_n, X, Y, Z, hosts_per_router): + total_hosts = X * Y * Z * hosts_per_router + idx = real_n % total_hosts + r = idx // hosts_per_router + h = idx % hosts_per_router + z = r % Z + y = (r // Z) % Y + x = (r // (Y * Z)) % X + return f"h_{x}_{y}_{z}_{h}" + diff --git a/raps/plotting.py b/raps/plotting.py index 1c3f550382b01234ca7ebdc6dce196b0f2e13e78..44a66af608cf00611782ac9844e1e8f0495c3d9d 100644 --- a/raps/plotting.py +++ b/raps/plotting.py @@ -13,9 +13,20 @@ Plotter histograms, and comparisons. """ +import itertools +from pathlib import Path import matplotlib.pyplot as plt +import matplotlib.ticker as ticker +from mpl_toolkits.mplot3d import Axes3D +from matplotlib.ticker import MaxNLocator + +import time import numpy as np +import networkx as nx +import random from uncertainties import unumpy +from rich.progress import track + class BasePlotter: """ @@ -30,6 +41,7 @@ class BasePlotter: title : str The title of the plot. """ + def __init__(self, xlabel, ylabel, title, uncertainties=False): """ Constructs all the necessary attributes for the BasePlotter object. @@ -75,6 +87,7 @@ class BasePlotter: plt.savefig(save_path) plt.close() + class Plotter(BasePlotter): """ A class for creating and saving specific types of plots, such as histories, @@ -85,7 +98,8 @@ class Plotter(BasePlotter): save_path : str The path to save the plot. """ - def __init__(self, xlabel='', ylabel='', title='', save_path='out.svg', uncertainties=False): + + def __init__(self, xlabel='', ylabel='', title='', save_path: Path | str = 'out.svg', uncertainties=False): """ Constructs all the necessary attributes for the Plotter object. @@ -231,6 +245,386 @@ def plot_submit_times(submit_times, nr_list): plt.savefig('submit_times.png', dpi=300, bbox_inches='tight') +def convert_time_scale(times): + max_time = max(times) + if max_time >= 3600 * 24 * 7: # more than a week + return [t / (3600 * 24) for t in times], 'days' + elif max_time >= 3600 * 24: # more than a day + return [t / 3600 for t in times], 'hours' + else: + return times, 'seconds' + + +def plot_job_gantt(start_times, end_times, node_counts): + # Convert times + start_times, time_label = convert_time_scale(start_times) + end_times, _ = convert_time_scale(end_times) + + plt.figure(figsize=(10, 4)) + + # We'll plot each job in a different row on the Y-axis + y_positions = range(len(start_times)) # 0, 1, 2, ... 
+ + for s, e, n in zip(start_times, end_times, node_counts): + # Bar placed at y = n + plt.barh( + y=n, # node count is the vertical coordinate + width=e - s, # job duration on the x-axis + left=s, # start time + height=0.8, # thickness of the bar + color='yellow', + edgecolor='black', + alpha=0.8 + ) + + # for y, (s, e, n) in enumerate(zip(start_times, end_times, node_counts)): + # plt.barh(y, width=e - s, left=s, height=0.8, + # color='yellow', edgecolor='black', alpha=0.8) + # # Optionally place the node count label in the middle of the bar + # plt.text((s + e)/2, y, str(n), + # ha='center', va='center', color='black') + + plt.xlabel(f'Time ({time_label})') + plt.ylabel('Job Index') + plt.title('Job Timeline (Gantt Style)') + plt.yticks(y_positions) # label each job if desired + + # Time axis from earliest start to latest end + plt.xlim(min(start_times), max(end_times)) + + plt.tight_layout() + plt.savefig('job_gantt.png', dpi=300) + + +def plot_network_histogram(*, ax, data, bins=50, save_path='network_histogram.png'): + """ + Plot a histogram of network traffic per job, with scientific notation on the x-axis. + """ + if ax is None: + ax = plt.figure(figsize=(10, 3)) + + ax.hist(data, bins=bins, edgecolor='black', alpha=0.7) + + # log-scale the y-axis + ax.yscale('log') + + # force scientific notation on x-axis + ax.ticklabel_format(style='scientific', axis='x', scilimits=(0, 0)) + + ax.xlabel('Network Traffic per Job (bytes)') + ax.ylabel('Frequency') + ax.title('Histogram of Network Traffic per Job') + ax.grid(True, which='both', ls='--', lw=0.5) + + return ax + + +def spaced_colors(n, cmap_name='nipy_spectral'): + cmap = plt.get_cmap(cmap_name) + # Get n points spaced in [0,1] + base = np.linspace(0, 1, n, endpoint=False) + # Shuffle them to maximize distance between consecutive colors + # e.g. 
take every k-th, wrap around + step = int(np.ceil(np.sqrt(n))) + indices = (step * np.arange(n)) % n + values = base[indices] + return [cmap(v) for v in values] + + +def plot_jobs_gantt(*, ax=None, jobs, bars_are_node_sized): + jobs.sort(key=lambda x: x.submit_time) + if ax is None: + ax = plt.figure(figsize=(10, 4)) + # Submit_time and Wall_time + submit_t = [x.submit_time for x in jobs] + duration = [x.current_run_time if x.end_time else x.time_limit for x in jobs] + nodes_required = [x.nodes_required for x in jobs] + + colors = spaced_colors(len(jobs)) + offset = 0 + for i in track(range(len(jobs)), description="Collecting information to plot"): + if bars_are_node_sized: + ax.barh(offset + nodes_required[i] / 2, duration[i], height=nodes_required[i], left=submit_t[i]) + offset += nodes_required[i] + else: + ax.barh(i, duration[i], height=1.0, left=submit_t[i], color=colors[i]) + print("Plotting") + + ax.set_ylabel("Job ID") + # ax_b labels: + ax.set_xlabel("time [hh:mm]") + minx_s = min([x.submit_time for x in jobs]) + maxx_s = np.ceil(max([x.current_run_tim if x.end_time else x.time_limit for + x in jobs]) + max([x.submit_time for x in jobs])) + x_label_mins = [int(n) for n in np.arange(minx_s // 60, maxx_s // 60)] + x_label_ticks = [n * 60 for n in x_label_mins[0::60]] + x_label_str = [str(x1).zfill(2) + ":" + str(x2).zfill(2) for + (x1, x2) in [(n // 60, n % 60) for + n in x_label_mins[0::60]]] + + ax.set_xticks(x_label_ticks, x_label_str) + # ax.yaxis.set_inverted(True) + return ax + + +def plot_nodes_gantt(*, ax=None, jobs): + if ax is None: + ax = plt.figure(figsize=(10, 4)) + # Submit_time and Wall_time + duration = [x.current_run_time if x.end_time else x.time_limit for x in jobs] + # nodes_required = [x['nodes_required'] for x in jobs] + start_t = [x.start_time for x in jobs] + nodeIDs = [x.scheduled_nodes for x in jobs] + print(nodeIDs) + if not any(nodeIDs): + raise IndexError(f"No nodeIDs: {nodeIDs}, jobs have no scheduled_nodes.") + + colors = spaced_colors(len(jobs)) + for i in track(range(len(jobs)), description="Collecting information to plot"): + for nodeID in nodeIDs[i]: + ax.barh(nodeID, duration[i], height=1.0, left=start_t[i], color=colors[i]) + print("Plotting") + + ax.set_ylabel("Node ID") + # ax_b labels: + ax.set_xlabel("time [hh:mm]") + # minx_s = min([x.submit_time for x in jobs]) # Unused + # maxx_s = np.ceil(max([x.wall_time for x in jobs]) + max([x.submit_time for x in jobs])) # Unused + # ax.xaxis.set_major_formatter(md.DateFormatter('%H:%M:%S')) + + formatter = ticker.FuncFormatter(lambda s, x: time.strftime('%m-%d %H:%M:%S', time.gmtime(s))) + ax.xaxis.set_major_formatter(formatter) + ax.yaxis.set_major_locator(MaxNLocator(integer=True)) + + # x_label_mins = [int(n) for n in np.arange(minx_s // 60, maxx_s // 60)] + # x_label_ticks = [n * 60 for n in x_label_mins[0::60]] + # x_label_str = [str(x1).zfill(2) + ":" + str(x2).zfill(2) for + # (x1,x2) in [(n // 60,n % 60) for + # n in x_label_mins[0::60]]] + + # ax.set_xticks(x_label_ticks,x_label_str) + ax.set_ylim(1, max(list(itertools.chain.from_iterable(nodeIDs)))) + # ax.yaxis.set_inverted(True) + return ax + + +def plot_fattree_hierarchy(G, k=32, save_path='net_fattree.png'): + """Draw a hierarchical Fat-Tree layout with automatic scaling.""" + pos = {} + + # --- Layer order and matching prefixes --- + layers = ["core", "agg", "edge", "h"] + layer_prefixes = { + "core": ["core", "c_"], + "agg": ["agg", "a_"], + "edge": ["edge", "e_"], + "h": ["h", "host"] + } + + # --- Compute how many nodes per layer 
--- + layer_counts = {} + for layer in layers: + prefixes = layer_prefixes[layer] + layer_nodes = [n for n in G.nodes if any(n.startswith(p) for p in prefixes)] + layer_counts[layer] = len(layer_nodes) + + max_nodes = max(layer_counts.values()) or 1 + y_gap = 1.0 / (len(layers) - 1) + + # --- Assign positions, normalized to [0,1] range --- + for j, layer in enumerate(layers): + prefixes = layer_prefixes[layer] + layer_nodes = [n for n in G.nodes if any(n.startswith(p) for p in prefixes)] + n_layer = len(layer_nodes) + if n_layer == 0: + continue + x_spacing = 1.0 / n_layer + y = 1.0 - j * y_gap + for i, node in enumerate(layer_nodes): + x = (i + 0.5) * x_spacing # center each node + pos[node] = (x, y) + + # --- Draw figure --- + plt.figure(figsize=(10, 8)) + color_map = {"core": "red", "agg": "orange", "edge": "green", "h": "blue"} + size_map = {"core": 30, "agg": 20, "edge": 10, "h": 5} + + for layer in layers: + nodes = [n for n in G.nodes if any(n.startswith(p) for p in layer_prefixes[layer])] + if nodes: + nx.draw_networkx_nodes( + G, pos, nodelist=nodes, node_color=color_map[layer], + node_size=size_map[layer], label=layer.capitalize(), alpha=0.7 + ) + + # --- Only draw inter-layer edges for clarity --- + edgelist = [ + (u, v) for (u, v) in G.edges + if not any(u.startswith(p) and v.startswith(p) + for p in ["c_", "a_", "e_", "h", "core", "agg", "edge", "host"]) + ] + nx.draw_networkx_edges(G, pos, edgelist=edgelist, alpha=0.05, width=0.4) + + plt.legend() + plt.axis("off") + plt.tight_layout() + if save_path: + plt.savefig(save_path, dpi=300) + + +def plot_dragonfly(G, save_path='net_dragonfly.png'): + """ + Draw a circular Dragonfly layout: groups in a large ring, + routers in small inner rings, hosts hanging around each router. + """ + import math + import matplotlib.pyplot as plt + import networkx as nx + + # Identify groups + groups = sorted({G.nodes[n]["group"] for n in G if "group" in G.nodes[n]}) + num_groups = len(groups) + + pos = {} + R_outer = 1.0 # radius of the outer ring (groups) + R_inner = 0.15 # radius of each group's internal ring + + # --- compute positions --- + for i, g in enumerate(groups): + # center of this group + theta_g = 2 * math.pi * i / num_groups + cx = R_outer * math.cos(theta_g) + cy = R_outer * math.sin(theta_g) + + routers = [n for n in G if n.startswith("r_") and G.nodes[n]["group"] == g] + hosts = [n for n in G if n.startswith("h_") and G.nodes[n]["group"] == g] + + # routers in small ring + for j, r in enumerate(routers): + theta_r = 2 * math.pi * j / len(routers) + x = cx + R_inner * math.cos(theta_r) + y = cy + R_inner * math.sin(theta_r) + pos[r] = (x, y) + + # hosts slightly further out around each router + for j, h in enumerate(hosts): + router = f"r_{g}_{j // 8}" if len(routers) > 0 else None + # angle toward router’s position if available + angle = 2 * math.pi * (j / len(hosts)) + r_off = R_inner + 0.05 + x = cx + r_off * math.cos(angle) + y = cy + r_off * math.sin(angle) + pos[h] = (x, y) + + # --- Draw figure --- + plt.figure(figsize=(10, 10)) + nx.draw_networkx_nodes(G, pos, + nodelist=[n for n in G if n.startswith("r_")], + node_color="orange", node_size=20, label="Routers", alpha=0.9) + nx.draw_networkx_nodes(G, pos, + nodelist=[n for n in G if n.startswith("h_")], + node_color="blue", node_size=8, label="Hosts", alpha=0.7) + + # intra-group edges light gray, inter-group black + intra = [(u, v) for (u, v) in G.edges if G.nodes[u]["group"] == G.nodes[v]["group"]] + inter = [(u, v) for (u, v) in G.edges if G.nodes[u]["group"] != 
G.nodes[v]["group"]] + nx.draw_networkx_edges(G, pos, edgelist=intra, alpha=0.1, width=0.3, edge_color="gray") + nx.draw_networkx_edges(G, pos, edgelist=inter, alpha=0.4, width=0.4, edge_color="black") + + plt.axis("off") + plt.legend() + plt.tight_layout() + if save_path: + plt.savefig(save_path, dpi=300) + + +def plot_torus2d(G, save_path="net_torus2d.png"): + import matplotlib.pyplot as plt + + routers = [n for n, d in G.nodes(data=True) if d["type"] == "router"] + hosts = [n for n, d in G.nodes(data=True) if d["type"] == "host"] + + fig, ax = plt.subplots(figsize=(8,8)) + + for u, v, d in G.edges(data=True): + if d.get("type") == "router_link": + x1, y1 = G.nodes[u]["x"], G.nodes[u]["y"] + x2, y2 = G.nodes[v]["x"], G.nodes[v]["y"] + ax.plot([x1, x2], [y1, y2], color="gray", alpha=0.1, linewidth=0.5) + + # flatten z by adding it to y or x offset + xs = [G.nodes[n]["x"] for n in routers] + ys = [G.nodes[n]["y"] + 0.05*G.nodes[n]["z"] for n in routers] + ax.scatter(xs, ys, c="orange", s=10, label="Routers", alpha=0.8) + + hx = [G.nodes[n]["x"] for n in hosts] + hy = [G.nodes[n]["y"] + 0.05*G.nodes[n]["z"] for n in hosts] + ax.scatter(hx, hy, c="blue", s=4, label="Hosts", alpha=0.5) + + ax.set_xlabel("X") + ax.set_ylabel("Y + (scaled Z)") + ax.legend() + if save_path: + plt.savefig(save_path, dpi=300) + + +def plot_torus3d(G, active_edges=None, max_edges=4000, save_path="net_torus3d.png"): + """ + Plot a 3D torus with routers, hosts, and optional job link highlights. + Args: + G : networkx.Graph + active_edges : list of (u,v) tuples for job links to highlight + max_edges : subsample edges to avoid clutter + """ + fig = plt.figure(figsize=(8, 8)) + ax = fig.add_subplot(111, projection="3d") + + # --- Separate routers and hosts --- + routers = [n for n, d in G.nodes(data=True) if d["type"] == "router"] + hosts = [n for n, d in G.nodes(data=True) if d["type"] == "host"] + + # --- Plot routers --- + xs, ys, zs = [G.nodes[n]["x"] for n in routers], [G.nodes[n]["y"] for n in routers], [G.nodes[n]["z"] for n in routers] + ax.scatter(xs, ys, zs, c="orange", s=6, label="Routers", alpha=0.8) + + # --- Plot hosts --- + hx, hy, hz = [G.nodes[n]["x"] for n in hosts], [G.nodes[n]["y"] for n in hosts], [G.nodes[n]["z"] for n in hosts] + ax.scatter(hx, hy, hz, c="dodgerblue", s=3, label="Hosts", alpha=0.6) + + # --- Draw router-to-router edges (subsampled) --- + all_router_edges = [(u, v) for u, v, d in G.edges(data=True) if d.get("type") == "router_link"] + if len(all_router_edges) > max_edges: + all_router_edges = random.sample(all_router_edges, max_edges) + for u, v in all_router_edges: + x1, y1, z1 = G.nodes[u]["x"], G.nodes[u]["y"], G.nodes[u]["z"] + x2, y2, z2 = G.nodes[v]["x"], G.nodes[v]["y"], G.nodes[v]["z"] + ax.plot([x1, x2], [y1, y2], [z1, z2], color="gray", alpha=0.05, linewidth=0.5) + + # --- Draw host links lightly --- + for u, v, d in G.edges(data=True): + if d.get("type") == "host_link": + x1, y1, z1 = G.nodes[u]["x"], G.nodes[u]["y"], G.nodes[u]["z"] + x2, y2, z2 = G.nodes[v]["x"], G.nodes[v]["y"], G.nodes[v]["z"] + ax.plot([x1, x2], [y1, y2], [z1, z2], color="lightblue", alpha=0.05, linewidth=0.3) + + # --- Overlay active job edges --- + if active_edges: + for u, v in active_edges: + if u in G.nodes and v in G.nodes: + x1, y1, z1 = G.nodes[u]["x"], G.nodes[u]["y"], G.nodes[u]["z"] + x2, y2, z2 = G.nodes[v]["x"], G.nodes[v]["y"], G.nodes[v]["z"] + ax.plot([x1, x2], [y1, y2], [z1, z2], color="red", linewidth=1.8, alpha=0.8) + + ax.set_xlabel("X") + ax.set_ylabel("Y") + ax.set_zlabel("Z") 
+ ax.legend() + plt.tight_layout() + if save_path: + plt.savefig(save_path, dpi=300) + + + if __name__ == "__main__": plotter = Plotter() - #plotter.plot_history([1, 2, 3, 4]) + # plotter.plot_history([1, 2, 3, 4]) diff --git a/raps/policy.py b/raps/policy.py new file mode 100644 index 0000000000000000000000000000000000000000..10a9a3103a62d7f1c0a8a5bc4735f3ca3c9664d3 --- /dev/null +++ b/raps/policy.py @@ -0,0 +1,20 @@ +from .utils import ValueComparableEnum + + +class PolicyType(ValueComparableEnum): + """Supported scheduling policies.""" + REPLAY = 'replay' # Default is specified in each scheduler! + FCFS = 'fcfs' + PRIORITY = 'priority' + SJF = 'sjf' + LJF = 'ljf' + + +class BackfillType(ValueComparableEnum): + """Supported backfilling policies.""" + NONE = None + FIRSTFIT = 'firstfit' + BESTFIT = 'bestfit' + GREEDY = 'greedy' + EASY = 'easy' # Earliest Available Start Time Yielding + CONSERVATIVE = 'conservative' diff --git a/raps/power.py b/raps/power.py index e61010f86d1a82fc5f39436c27384e02f1d2452f..b1e6c9d70994d6aa76d17075d9bd3499c6af22f1 100644 --- a/raps/power.py +++ b/raps/power.py @@ -8,7 +8,8 @@ Classes: Functions: - compute_loss: Linear loss model - compute_node_power: Calculate the total power consumption for given CPU and GPU utilization. -- compute_node_power_validate: Calculate the total power consumption for a given mean and standard deviation of node power. +- compute_node_power_validate: Calculate the total power consumption for + a given mean and standard deviation of node power. """ import numpy as np @@ -29,7 +30,7 @@ def custom_format_uncertainties(self, fmt_spec): return f"{self.nominal_value:{fmt_spec}} ±{self.std_dev:{fmt_spec}}" -#In stats unicode is printed as unocde abbreviation! To be fixed! +# In stats unicode is printed as unocde abbreviation! To be fixed! uf.Variable.__str__ = custom_str_uncertainties uf.Variable.__repr__ = custom_repr_uncertainties uf.Variable.__format__ = custom_format_uncertainties @@ -49,26 +50,26 @@ def compute_node_power(cpu_util, gpu_util, net_util, config): :return: Total power consumption after accounting for power loss. 
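+
+    Example (illustrative values only, not a real system configuration): with
+    POWER_CPU_MAX=200, POWER_CPU_IDLE=50, CPUS_PER_NODE=1 and cpu_util=0.5,
+    power_cpu = 0.5 * 200 + (1 - 0.5) * 50 = 125 W; the GPU term is computed
+    analogously, and memory, NIC and NVMe power are added before the SIVOC
+    loss is applied.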
""" power_cpu = cpu_util * config['POWER_CPU_MAX'] + \ - (config['CPUS_PER_NODE'] - cpu_util) * config['POWER_CPU_IDLE'] + (config['CPUS_PER_NODE'] - cpu_util) * config['POWER_CPU_IDLE'] power_gpu = gpu_util * config['POWER_GPU_MAX'] + \ - (config['GPUS_PER_NODE'] - gpu_util) * config['POWER_GPU_IDLE'] + (config['GPUS_PER_NODE'] - gpu_util) * config['POWER_GPU_IDLE'] - try: + if config.get("POWER_NIC_IDLE") is not None and config.get("POWER_NIC_MAX") is not None: power_nic = config['POWER_NIC_IDLE'] + \ - (config['POWER_NIC_MAX'] - config['POWER_NIC_IDLE']) * net_util - except: + (config['POWER_NIC_MAX'] - config['POWER_NIC_IDLE']) * net_util + else: if isinstance(net_util, np.ndarray): power_nic = config['POWER_NIC'] * np.ones(net_util.shape) else: power_nic = config['POWER_NIC'] power_total = power_cpu + power_gpu + config['POWER_MEM'] + \ - config['NICS_PER_NODE'] * power_nic + config['POWER_NVME'] + config['NICS_PER_NODE'] * power_nic + config['POWER_NVME'] # Apply power loss due to Sivoc and Rectifier - power_with_sivoc_loss = compute_loss(power_total, config['SIVOC_LOSS_CONSTANT'], \ - config['SIVOC_EFFICIENCY']) + power_with_sivoc_loss = compute_loss(power_total, config['SIVOC_LOSS_CONSTANT'], + config['SIVOC_EFFICIENCY']) power_sivoc_loss_only = power_with_sivoc_loss - power_total return power_with_sivoc_loss, power_sivoc_loss_only @@ -84,18 +85,19 @@ def compute_node_power_uncertainties(cpu_util, gpu_util, net_util, config): :return: Total power consumption after accounting for power loss. """ power_cpu = cpu_util \ - * uf.ufloat(config['POWER_CPU_MAX'], config['POWER_CPU_MAX'] * config['POWER_CPU_UNCERTAINTY']) \ - + (config['CPUS_PER_NODE'] - cpu_util) \ - * uf.ufloat(config['POWER_CPU_IDLE'], config['POWER_CPU_IDLE'] * config['POWER_CPU_UNCERTAINTY']) + * uf.ufloat(config['POWER_CPU_MAX'], config['POWER_CPU_MAX'] * config['POWER_CPU_UNCERTAINTY']) \ + + (config['CPUS_PER_NODE'] - cpu_util) \ + * uf.ufloat(config['POWER_CPU_IDLE'], config['POWER_CPU_IDLE'] * config['POWER_CPU_UNCERTAINTY']) power_gpu = gpu_util \ - * uf.ufloat(config['POWER_GPU_MAX'], config['POWER_GPU_MAX'] * config['POWER_GPU_UNCERTAINTY']) \ - + (config['GPUS_PER_NODE'] - gpu_util) \ - * uf.ufloat(config['POWER_GPU_IDLE'], config['POWER_GPU_IDLE'] * config['POWER_GPU_UNCERTAINTY']) + * uf.ufloat(config['POWER_GPU_MAX'], config['POWER_GPU_MAX'] * config['POWER_GPU_UNCERTAINTY']) \ + + (config['GPUS_PER_NODE'] - gpu_util) \ + * uf.ufloat(config['POWER_GPU_IDLE'], config['POWER_GPU_IDLE'] * config['POWER_GPU_UNCERTAINTY']) power_total = power_cpu + power_gpu \ - + uf.ufloat(config['POWER_MEM'], config['POWER_MEM'] * config['POWER_MEM_UNCERTAINTY']) \ - + config['NICS_PER_NODE'] * uf.ufloat(config['POWER_NIC'], config['POWER_NIC'] * config['POWER_NIC_UNCERTAINTY']) \ - + uf.ufloat(config['POWER_NVME'], config['POWER_NVME'] * config['POWER_NVME_UNCERTAINTY']) + + uf.ufloat(config['POWER_MEM'], config['POWER_MEM'] * config['POWER_MEM_UNCERTAINTY']) \ + + config['NICS_PER_NODE'] \ + * uf.ufloat(config['POWER_NIC'], config['POWER_NIC'] * config['POWER_NIC_UNCERTAINTY']) \ + + uf.ufloat(config['POWER_NVME'], config['POWER_NVME'] * config['POWER_NVME_UNCERTAINTY']) # Apply power loss due to Sivoc and Rectifier power_with_sivoc_loss = compute_loss(power_total, config['SIVOC_LOSS_CONSTANT'], config['SIVOC_EFFICIENCY']) @@ -164,6 +166,7 @@ class PowerManager: - down_nodes: Nodes that are currently down. - down_rack: Rack number of down nodes. 
""" + def __init__(self, power_func=compute_node_power, **config): """ Initialize the PowerManager object. @@ -186,20 +189,22 @@ class PowerManager: self.history = [] self.loss_history = [] self.uncertainties = False - if power_func in [compute_node_power_uncertainties, \ + if power_func in [compute_node_power_uncertainties, compute_node_power_validate_uncertainties]: self.uncertainties = True - if self.down_nodes: self.apply_down_nodes() + if self.down_nodes: + self.apply_down_nodes() def get_peak_power(self): """Estimate peak power of system for setting max value of gauges in dashboard""" - node_power = compute_node_power(self.config['CPUS_PER_NODE'], self.config['GPUS_PER_NODE'], net_util=0, config=self.config)[0] + node_power = compute_node_power(self.config['CPUS_PER_NODE'], + self.config['GPUS_PER_NODE'], net_util=0, config=self.config)[0] blades_per_rectifier = self.config['BLADES_PER_CHASSIS'] / self.config['RECTIFIERS_PER_CHASSIS'] rectifier_load = blades_per_rectifier * self.config['NODES_PER_BLADE'] * node_power - rectifier_power = compute_loss(rectifier_load, self.config['RECTIFIER_LOSS_CONSTANT'], \ - self.config['RECTIFIER_EFFICIENCY']) # with AC-DC conversion losses + rectifier_power = compute_loss(rectifier_load, self.config['RECTIFIER_LOSS_CONSTANT'], + self.config['RECTIFIER_EFFICIENCY']) # with AC-DC conversion losses chassis_power = self.config['BLADES_PER_CHASSIS'] * rectifier_power / blades_per_rectifier \ - + self.config['SWITCHES_PER_CHASSIS'] * self.config['POWER_SWITCH'] + + self.config['SWITCHES_PER_CHASSIS'] * self.config['POWER_SWITCH'] rack_power = chassis_power * self.config['CHASSIS_PER_RACK'] total_power = rack_power * self.config['NUM_RACKS'] + self.config['POWER_CDU'] * self.config['NUM_CDUS'] return total_power @@ -221,10 +226,10 @@ class PowerManager: # approximate by scaling up to number of rectifiers, applying loss # and then dividing by number of rectifiers. # For Frontier there are four nodes per rectifier. - power_with_loss = compute_loss(initial_power * self.config['NODES_PER_RECTIFIER'], \ - self.config['RECTIFIER_LOSS_CONSTANT'], \ + power_with_loss = compute_loss(initial_power * self.config['NODES_PER_RECTIFIER'], + self.config['RECTIFIER_LOSS_CONSTANT'], self.config['RECTIFIER_EFFICIENCY']) \ - / self.config['NODES_PER_RECTIFIER'] + / self.config['NODES_PER_RECTIFIER'] return np.full(self.sc_shape, power_with_loss) def apply_down_nodes(self): @@ -263,23 +268,23 @@ class PowerManager: float Total power consumption of the scheduled nodes. 
""" + if len(scheduled_nodes) == 0: + return [] cpu_util = np.asarray(cpu_util) gpu_util = np.asarray(gpu_util) net_util = np.asarray(net_util) job_lengths = np.array([len(job) for job in scheduled_nodes]) - flattened_nodes = np.concatenate(scheduled_nodes, axis=0) + flattened_nodes = np.concatenate(scheduled_nodes, axis=0).astype(np.int64) cpu_util_flat = np.repeat(cpu_util, job_lengths) gpu_util_flat = np.repeat(gpu_util, job_lengths) net_util_flat = np.repeat(net_util, job_lengths) node_indices = linear_to_3d_index(flattened_nodes, self.config['SC_SHAPE']) - power_value, sivoc_loss = self.power_func(cpu_util_flat, gpu_util_flat, net_util_flat, self.config) self.power_state[node_indices] = power_value self.sivoc_loss[node_indices] = sivoc_loss return power_value[np.cumsum(job_lengths) - 1] - def calculate_rectifiers_needed(self, power_state_summed): """ @@ -334,9 +339,10 @@ class PowerManager: num_rectifiers = num_rectifiers_array[i, j, k] power_per_rectifier = chassis_power[i, j, k] / num_rectifiers rectifier_power[i, j, k, :num_rectifiers] = power_per_rectifier - power_with_losses[i, j, k, :num_rectifiers] = compute_loss(power_per_rectifier, \ - self.config['RECTIFIER_LOSS_CONSTANT'], \ - self.config['RECTIFIER_EFFICIENCY']) + power_with_losses[i, j, k, :num_rectifiers] = \ + compute_loss(power_per_rectifier, + self.config['RECTIFIER_LOSS_CONSTANT'], + self.config['RECTIFIER_EFFICIENCY']) rectifier_power = np.nan_to_num(rectifier_power) power_with_losses = np.nan_to_num(power_with_losses) @@ -344,8 +350,8 @@ class PowerManager: else: divisor = np.array([4, 4, 4, 4]).reshape(1, 1, 1, 4) rectifier_power = chassis_power[:, :, :, np.newaxis] / divisor - power_with_losses = compute_loss(rectifier_power, \ - self.config['RECTIFIER_LOSS_CONSTANT'], \ + power_with_losses = compute_loss(rectifier_power, + self.config['RECTIFIER_LOSS_CONSTANT'], self.config['RECTIFIER_EFFICIENCY']) # Compute just the losses @@ -376,7 +382,6 @@ class PowerManager: # Return rectifier losses summed at CDU level return power_with_rows, rect_loss_with_rows - def compute_sivoc_losses(self): """ Compute SIVOC losses for each CDU in the system. @@ -386,7 +391,7 @@ class PowerManager: Array containing SIVOC losses for each CDU. 
""" # Aggregate SIVOC losses - summed_sivoc_losses = np.sum(self.sivoc_loss/1000, axis=2) # kW + summed_sivoc_losses = np.sum(self.sivoc_loss / 1000, axis=2) # kW rows = self.sc_shape[0] # Add CDU numbers to table @@ -398,7 +403,7 @@ class PowerManager: sivoc_loss_with_rows = np.hstack((sivoc_loss_with_rows, rack_sivoc_loss_sum)) return sivoc_loss_with_rows - + def get_power_df(self, rack_power, rack_loss): # Initialize the columns for power_df power_columns = self.config['POWER_DF_HEADER'] @@ -416,3 +421,44 @@ class PowerManager: power_df = pd.DataFrame(power_data, columns=power_columns) return power_df + + def simulate_power(self, *, + running_jobs, + scheduled_nodes, + cpu_utils, + gpu_utils, + net_utils + ): + jobs_power = self.update_power_state(scheduled_nodes, cpu_utils, gpu_utils, net_utils) + + for i, job in enumerate(running_jobs): + job.power_history.append(jobs_power[i] * len(job.scheduled_nodes)) + + # Update the power array UI component + rack_power, rect_losses = self.compute_rack_power() + sivoc_losses = self.compute_sivoc_losses() + rack_loss = rect_losses + sivoc_losses + power_df = self.get_power_df(rack_power, rack_loss) + + total_power_kw = sum(row[-1] for row in rack_power) + \ + self.config['NUM_CDUS'] * self.config['POWER_CDU'] / 1000.0 + total_loss_kw = sum(row[-1] for row in rack_loss) + + # Primary return value: + # power_df + # Other returns needed for further processing: + # rack_power, # For cooling + # total_power_kw, # For statistics + # total_loss_kw, # For statistics + # jobs_power # For statistics + # === + return power_df, \ + rack_power, \ + total_power_kw, \ + total_loss_kw, \ + jobs_power + + +def record_power_stats_foreach_job(*, running_jobs, jobs_power): + for i, job in enumerate(running_jobs): + job.power_history.append(jobs_power[i] * len(job.scheduled_nodes)) diff --git a/raps/raps_config.py b/raps/raps_config.py new file mode 100644 index 0000000000000000000000000000000000000000..d1e1385cf72bc99e5badda31353fc35b9e45d678 --- /dev/null +++ b/raps/raps_config.py @@ -0,0 +1,35 @@ +from pathlib import Path +from raps.utils import ResolvedPath +from pydantic_settings import BaseSettings, SettingsConfigDict, YamlConfigSettingsSource +ROOT_DIR = Path(__file__).parent.parent + + +class RapsConfig(BaseSettings): + """ + General settings for raps. Pydantic will automatically populate this model from env vars or a + .env file. + """ + # TODO I think we should move more of general/ui related settings from SimConfig into here. + # We'll be using SimConfig in the simulation server and those settings aren't applicable there, + # so it makes sense to keep SimConfig scoped to the logical operation of the sim. 
+ + system_config_dir: ResolvedPath = ROOT_DIR / 'config' + """ Directory containing system configuration files """ + + model_config = SettingsConfigDict( + yaml_file="raps_config.yaml", + env_prefix='raps_', + env_nested_delimiter='__', + nested_model_default_partial_update=True, + ) + + # Customize setting sources, we'll use yaml config file instead of the default .env + @classmethod + def settings_customise_sources( + cls, settings_cls, + init_settings, env_settings, dotenv_settings, file_secret_settings, + ): + return (init_settings, env_settings, YamlConfigSettingsSource(settings_cls),) + + +raps_config = RapsConfig() diff --git a/raps/resmgr.py b/raps/resmgr.py deleted file mode 100644 index 8abce81730f4d7b287277940169871e7db3b1e5a..0000000000000000000000000000000000000000 --- a/raps/resmgr.py +++ /dev/null @@ -1,77 +0,0 @@ -import numpy as np -from .job import JobState -from scipy.stats import weibull_min - - -class ResourceManager: - def __init__(self, total_nodes, down_nodes): - self.total_nodes = total_nodes - # Maintain a set for down nodes (e.g., nodes that are offline) - self.down_nodes = set(down_nodes) - # Available nodes are those that are not down - self.available_nodes = sorted(set(range(total_nodes)) - self.down_nodes) - # You can track system utilization history here - self.sys_util_history = [] # list of (time, utilization) tuples - - def assign_nodes_to_job(self, job, current_time): - """Assigns nodes to a job and updates the available nodes.""" - if len(self.available_nodes) < job.nodes_required: - raise ValueError(f"Not enough available nodes to schedule job {job.id}") - - if job.requested_nodes: # Telemetry replay case - job.scheduled_nodes = job.requested_nodes - self.available_nodes = [n for n in self.available_nodes if n not in job.scheduled_nodes] - else: # Synthetic or reschedule case - job.scheduled_nodes = self.available_nodes[:job.nodes_required] - self.available_nodes = self.available_nodes[job.nodes_required:] - - # Set job start and end times - job.start_time = current_time - job.end_time = current_time + job.wall_time - job.state = JobState.RUNNING # Mark job as running - - def free_nodes_from_job(self, job): - """Frees the nodes that were allocated to a completed job.""" - if hasattr(job, "scheduled_nodes"): - self.available_nodes.extend(job.scheduled_nodes) - # Remove duplicates and sort the list for consistency - self.available_nodes = sorted(set(self.available_nodes)) - else: - # If job has no scheduled nodes, there is nothing to free. - pass - - def update_system_utilization(self, current_time, num_active_nodes): - """ - Computes and records the system utilization. - For example, utilization could be defined as the ratio of active nodes to the total non-down nodes. 
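The RapsConfig settings model above builds on pydantic-settings, so any field can be overridden by a `raps_`-prefixed environment variable or by the YAML file named in model_config. A minimal stand-alone sketch of that mechanism with a hypothetical model (not part of RAPS):

    import os
    from pydantic_settings import BaseSettings, SettingsConfigDict

    class DemoConfig(BaseSettings):
        """Hypothetical settings model, not part of RAPS."""
        system_config_dir: str = "config"
        model_config = SettingsConfigDict(env_prefix="raps_")

    os.environ["RAPS_SYSTEM_CONFIG_DIR"] = "/tmp/alt-config"
    print(DemoConfig().system_config_dir)   # -> /tmp/alt-config; the env var overrides the default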
- """ - # Number of nodes that are not down: - total_operational = self.total_nodes - len(self.down_nodes) - # Compute utilization as a percentage: - utilization = (num_active_nodes / total_operational) * 100 if total_operational else 0 - self.sys_util_history.append((current_time, utilization)) - return utilization - - def node_failure(self, mtbf): - """Simulate node failure using Weibull distribution.""" - shape_parameter = 1.5 - scale_parameter = mtbf * 3600 # Convert to seconds - - # Create a NumPy array of node indices, excluding down nodes - all_nodes = np.array(sorted(set(range(self.total_nodes)) - set(self.down_nodes))) - - # Sample the Weibull distribution for all nodes at once - random_values = weibull_min.rvs(shape_parameter, scale=scale_parameter, size=all_nodes.size) - - # Identify nodes that have failed - failure_threshold = 0.1 - failed_nodes_mask = random_values < failure_threshold - newly_downed_nodes = all_nodes[failed_nodes_mask] - - # Update available and down nodes - for node_index in newly_downed_nodes: - if node_index in self.available_nodes: - self.available_nodes.remove(node_index) - self.down_nodes.add(str(node_index)) - - return newly_downed_nodes.tolist() diff --git a/raps/resmgr/__init__.py b/raps/resmgr/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..70814a965776f5f1cb0cb6a355d7ef38b113cee4 --- /dev/null +++ b/raps/resmgr/__init__.py @@ -0,0 +1,26 @@ +""" +ResourceManager package initializer. +Exports a factory that returns the appropriate manager based on config. +""" +from .default import ExclusiveNodeResourceManager +from .multitenant import MultiTenantResourceManager + + +def make_resource_manager(total_nodes, down_nodes, config): + """ + Factory to choose between exclusive-node and multitenant managers. + """ + if config.get("multitenant", False): + return MultiTenantResourceManager(total_nodes, down_nodes, config) + return ExclusiveNodeResourceManager(total_nodes, down_nodes, config) + + +# Alias for backward compatibility +ResourceManager = make_resource_manager + +__all__ = [ + "make_resource_manager", + "ResourceManager", + "ExclusiveNodeResourceManager", + "MultiTenantResourceManager" +] diff --git a/raps/resmgr/default.py b/raps/resmgr/default.py new file mode 100644 index 0000000000000000000000000000000000000000..198fce774042b7dcd84bd6b20d3f43f0fd2d9796 --- /dev/null +++ b/raps/resmgr/default.py @@ -0,0 +1,121 @@ +from raps.job import JobState +from raps.policy import PolicyType + + +class ExclusiveNodeResourceManager: + """ + Legacy exclusive-node resource manager: allocates and frees full nodes. 
+ """ + + def __init__(self, total_nodes, down_nodes, config=None): + self.total_nodes = total_nodes + self.down_nodes = set(down_nodes) + self.config = config or {} + + # Determine per-node capacities + cfg = self.config + if 'CPUS_PER_NODE' in cfg and 'CORES_PER_CPU' in cfg: + total_cpu = cfg['CPUS_PER_NODE'] * cfg['CORES_PER_CPU'] + else: + total_cpu = cfg.get('CORES_PER_NODE', cfg.get('CPUS_PER_NODE', 1)) + total_gpu = cfg.get('GPUS_PER_NODE', 0) + + # Build unified node list so engine can inspect resource_manager.nodes + self.nodes = [] + for i in range(self.total_nodes): + is_down = i in self.down_nodes + self.nodes.append({ + 'id': i, + 'total_cpu_cores': total_cpu, + 'available_cpu_cores': 0 if is_down else total_cpu, + 'total_gpu_units': total_gpu, + 'available_gpu_units': 0 if is_down else total_gpu, + 'is_down': is_down + }) + + # Available nodes list for allocation/frees + self.available_nodes = [n['id'] for n in self.nodes if not n['is_down']] + # System utilization history (time, util%) + self.sys_util_history = [] + + def assign_nodes_to_job(self, job, current_time, policy, node_id=None): + """Assigns full nodes to a job (replay or count-based).""" + # Ensure enough free nodes + if len(self.available_nodes) < job.nodes_required: + raise ValueError(f"Not enough available nodes to schedule job {job.id}", + f"{len(self.available_nodes)} < {job.nodes_required}") + + if policy == PolicyType.REPLAY and job.scheduled_nodes: + # Telemetry replay: use the exact nodes + self.available_nodes = [n for n in self.available_nodes if n not in job.scheduled_nodes] + else: + # Count-based allocation: take the first N free nodes + job.scheduled_nodes = self.available_nodes[:job.nodes_required] + self.available_nodes = self.available_nodes[job.nodes_required:] + + # Mark job running + job.start_time = current_time + if job.expected_run_time: + job.end_time = current_time + job.expected_run_time # This may be an assumption! + job.current_state = JobState.RUNNING + + def free_nodes_from_job(self, job): + """Frees the full nodes previously allocated to a job.""" + if getattr(job, 'scheduled_nodes', None): + for n in job.scheduled_nodes: + if n not in self.available_nodes: + self.available_nodes.append(n) + else: + # Already free — log instead of raising + print(f"[WARN] Tried to free node {n}, but it was already available") + print(f"Atempting to free node {n} after completion of job {job.id}. " + + "Node is already free (in available nodes)!") + self.available_nodes = sorted(self.available_nodes) + + def update_system_utilization(self, current_time, running_jobs): + """ + Computes system utilization as percentage of non-down nodes that are active. + + Parameters: + - current_time: simulation time + - running_jobs: list of currently running Job objects + """ + # Number of active nodes is length of running_jobs + num_active = len(running_jobs) + total_operational = self.total_nodes - len(self.down_nodes) + util = (num_active / total_operational) * 100 if total_operational else 0 + self.sys_util_history.append((current_time, util)) + return util + # """ + # Computes system utilization as percentage of non-down nodes that are active. + # """ + # total_operational = self.total_nodes - len(self.down_nodes) + # util = (num_active_nodes / total_operational) * 100 if total_operational else 0 + # self.sys_util_history.append((current_time, util)) + # return util + + def node_failure(self, mtbf): + return [] + # Node failure not working! 
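A short usage sketch of the exclusive-node manager via the make_resource_manager factory above. The job here is a SimpleNamespace stand-in carrying only the attributes these methods touch (real jobs come from raps.job), and the empty config dict is purely for illustration:

    from types import SimpleNamespace
    from raps.policy import PolicyType
    from raps.resmgr import make_resource_manager

    rm = make_resource_manager(total_nodes=4, down_nodes=[], config={})

    # Stand-in for a raps.job Job object
    job = SimpleNamespace(id=1, nodes_required=2, scheduled_nodes=[],
                          expected_run_time=600, start_time=None, end_time=None,
                          current_state=None)

    rm.assign_nodes_to_job(job, current_time=0, policy=PolicyType.FCFS)
    print(job.scheduled_nodes, rm.available_nodes)   # [0, 1] and [2, 3]

    rm.free_nodes_from_job(job)
    print(rm.available_nodes)                        # back to [0, 1, 2, 3]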
+ # """Simulate node failure using Weibull distribution.""" + # shape_parameter = 1.5 + # scale_parameter = mtbf * 3600 # Convert to seconds + + # # Create a NumPy array of node indices, excluding down nodes + # all_nodes = np.array(sorted(set(range(self.total_nodes)) - set(self.down_nodes))) + + # # Sample the Weibull distribution for all nodes at once + # random_values = weibull_min.rvs(shape_parameter, scale=scale_parameter, size=all_nodes.size) + + # # Identify nodes that have failed + # failure_threshold = 0.1 + # failed_nodes_mask = random_values < failure_threshold + # newly_downed_nodes = all_nodes[failed_nodes_mask] + + # # Update available and down nodes + # for node_index in newly_downed_nodes: + # if node_index in self.available_nodes: + # self.available_nodes.remove(node_index) + # self.down_nodes.add(str(node_index)) + + # return newly_downed_nodes.tolist() diff --git a/raps/resmgr/multitenant.py b/raps/resmgr/multitenant.py new file mode 100644 index 0000000000000000000000000000000000000000..bb24e4e3de3a4c3520993fe73be8f4dfab9939e2 --- /dev/null +++ b/raps/resmgr/multitenant.py @@ -0,0 +1,138 @@ +import numpy as np +from ..job import JobState +from scipy.stats import weibull_min + + +def assert_node_accounting_ok(node): + assert node['available_cpu_cores'] >= 0, "available_cpu_cores went negative" + assert node['available_gpu_units'] >= 0, "available_gpu_units went negative" + + +class MultiTenantResourceManager: + """ + Resource manager for per-node CPU/GPU multitenancy. + """ + + def __init__(self, total_nodes, down_nodes, config): + self.total_nodes = total_nodes + self.config = config + self.down_nodes = set(down_nodes) + self.nodes = [] + # Track total allocations for reporting + self.allocated_cpu_cores = 0 + self.allocated_gpu_units = 0 + self.sys_util_history = [] + + # Determine per-node capacities + total_cpu = self.config['CPUS_PER_NODE'] * self.config['CORES_PER_CPU'] + total_gpu = self.config.get('GPUS_PER_NODE', 0) + + # Initialize node state + for i in range(self.total_nodes): + is_down = i in self.down_nodes + self.nodes.append({ + 'id': i, + 'total_cpu_cores': total_cpu, + 'available_cpu_cores': 0 if is_down else total_cpu, + 'total_gpu_units': total_gpu, + 'available_gpu_units': 0 if is_down else total_gpu, + 'is_down': is_down + }) + + # List of up nodes for quick enumeration + self.available_nodes = [n['id'] for n in self.nodes if not n['is_down']] + + def assign_nodes_to_job(self, job, current_time, node_id=None): + """Assigns cores/GPUs to a job on one eligible node.""" + # Try preferred node + found = None + if node_id is not None and 0 <= node_id < len(self.nodes): + candidate = self.nodes[node_id] + if (not candidate['is_down'] and + candidate['available_cpu_cores'] >= job.cpu_cores_required and + candidate['available_gpu_units'] >= job.gpu_units_required): + found = candidate + + # Fallback: first-fit + if found is None: + for candidate in self.nodes: + if (not candidate['is_down'] and + candidate['available_cpu_cores'] >= job.cpu_cores_required and + candidate['available_gpu_units'] >= job.gpu_units_required): + found = candidate + break + + if found is None: + raise ValueError(f"Not enough available resources to schedule job {job.id}.") + + # Allocate resources + found['available_cpu_cores'] -= job.cpu_cores_required + found['available_gpu_units'] -= job.gpu_units_required + self.allocated_cpu_cores += job.cpu_cores_required + self.allocated_gpu_units += job.gpu_units_required + + # ---- Invariant checks (after mutating node/RM state) ---- + 
assert_node_accounting_ok(found) # no negatives left + assert self.allocated_cpu_cores >= 0 and self.allocated_gpu_units >= 0 + # Optional: global sanity vs. totals + assert self.allocated_cpu_cores <= sum(n['total_cpu_cores'] for n in self.nodes) + assert self.allocated_gpu_units <= sum(n['total_gpu_units'] for n in self.nodes) + + # Record on job + job.scheduled_nodes = [found['id']] + job.allocated_cpu_cores = job.cpu_cores_required + job.allocated_gpu_units = job.gpu_units_required + job.start_time = current_time + if job.expected_run_time: + job.end_time = current_time + job.expected_run_time # this may be an assumption (See default.py) + job.current_state = JobState.RUNNING + + def free_nodes_from_job(self, job): + """Releases cores/GPUs from a completed job.""" + if getattr(job, 'scheduled_nodes', None): + nid = job.scheduled_nodes[0] + if 0 <= nid < len(self.nodes): + node = self.nodes[nid] + node['available_cpu_cores'] += getattr(job, 'allocated_cpu_cores', 0) + node['available_gpu_units'] += getattr(job, 'allocated_gpu_units', 0) + self.allocated_cpu_cores -= getattr(job, 'allocated_cpu_cores', 0) + self.allocated_gpu_units -= getattr(job, 'allocated_gpu_units', 0) + else: + print(f"Warning: Job {job.id} had invalid node {nid} during free.") + + def update_system_utilization(self, current_time, running_jobs): + """ + Computes and records utilization based on allocated CPU/GPU across all nodes. + """ + total_cpu = sum(n['total_cpu_cores'] for n in self.nodes) + total_gpu = sum(n['total_gpu_units'] for n in self.nodes) + used_cpu = self.allocated_cpu_cores + used_gpu = self.allocated_gpu_units + + cpu_util = (used_cpu / total_cpu) * 100 if total_cpu else 0 + gpu_util = (used_gpu / total_gpu) * 100 if total_gpu else 0 + + # Choose GPU util if GPUs exist, else CPU + util = gpu_util if self.config.get('GPUS_PER_NODE', 0) > 0 else cpu_util + self.sys_util_history.append((current_time, util)) + return util + + def node_failure(self, mtbf): + """ + Simulate random node failures via a Weibull distribution. + """ + shape = 1.5 + scale = mtbf * 3600 + ops = np.array([n['id'] for n in self.nodes if not n['is_down']]) + if ops.size == 0: + return [] + + vals = weibull_min.rvs(shape, scale=scale, size=ops.size) + failed = ops[vals < 0.001] + for nid in failed: + node = self.nodes[nid] + node['is_down'] = True + node['available_cpu_cores'] = 0 + node['available_gpu_units'] = 0 + self.down_nodes.add(nid) + return failed.tolist() diff --git a/raps/run_sim.py b/raps/run_sim.py new file mode 100644 index 0000000000000000000000000000000000000000..db50465bfb738033fada924fcd0d55d09fe9b32c --- /dev/null +++ b/raps/run_sim.py @@ -0,0 +1,306 @@ +""" +Module containing the primary commands for use in the CLI. The simulation logic itself is kept in +Engine and MultiPartEngine so that it can be used programmatically such as in the simulation server. +These functions just handle rendering the terminal UI and outputting results to files etc. 
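The node_failure implementation above draws one Weibull sample per up node and downs the nodes whose sample falls below a threshold. A stand-alone sketch of just that sampling step; shape, MTBF, and threshold are illustrative, and the threshold is the knob that controls how often failures actually fire:

    import numpy as np
    from scipy.stats import weibull_min

    shape_param = 1.5
    mtbf_hours = 1000                       # illustrative mean time between failures
    scale = mtbf_hours * 3600               # seconds, as in the manager above

    node_ids = np.arange(16)
    samples = weibull_min.rvs(shape_param, scale=scale, size=node_ids.size)

    # Against a seconds-scale distribution, a threshold like 0.001 makes failures
    # extremely rare; a calibrated model would choose this differently.
    failed = node_ids[samples < 0.001]
    print(failed.tolist())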
+""" +import json +import pandas as pd +import sys +import warnings +from raps.ui import LayoutManager +from raps.plotting import Plotter +from raps.engine import Engine +from raps.multi_part_engine import MultiPartEngine +from raps.utils import write_dict_to_file, pydantic_add_args, SubParsers, read_yaml_parsed +from raps.stats import ( + get_engine_stats, + get_job_stats, + get_scheduler_stats, + get_network_stats, + print_formatted_report +) + +from raps.sim_config import SingleSimConfig, MultiPartSimConfig, SIM_SHORTCUTS + + +def run_sim_add_parser(subparsers: SubParsers): + parser = subparsers.add_parser("run", description=""" + Run single-partition (homogeneous) systems. Supports synthetic workload generation or + telemetry replay, dynamic power modeling (including conversion losses), and optional + coupling to a thermo-fluids cooling model. Produces performance, utilization, and + energy metrics, with optional plots and output files for analysis and validation. + """) + parser.add_argument("config_file", nargs="?", default=None, help=""" + YAML sim config file, can be used to configure an experiment instead of using CLI + flags. Pass "-" to read from stdin. + """) + model_validate = pydantic_add_args(parser, SingleSimConfig, model_config={ + "cli_shortcuts": SIM_SHORTCUTS, + }) + parser.set_defaults( + impl=lambda args: run_sim(model_validate(args, read_yaml_parsed(SingleSimConfig, args.config_file))) + ) + + +def run_sim(sim_config: SingleSimConfig): + if sim_config.verbose or sim_config.debug: + print(f"SingleSimConfig: {sim_config.model_dump_json(indent=4)}") + if len(sim_config.system_configs) > 1: + print("Use run-parts to run multi-partition simulations") + sys.exit(1) + + engine = Engine(sim_config) + + out = sim_config.get_output() + if out: + out.mkdir(parents=True, exist_ok=True) + engine.telemetry.save_snapshot( + dest=str(out / 'snapshot.npz'), + result=engine.get_workload_data(), + args=sim_config, + ) + (out / 'sim_config.yaml').write_text(sim_config.dump_yaml()) + + jobs = engine.jobs + timestep_start, timestep_end = engine.timestep_start, engine.timestep_end + total_timesteps = timestep_end - timestep_start + + downscale = sim_config.downscale + downscale_str = ""if downscale == 1 else f"/{downscale}" + print(f"Simulating {len(jobs)} jobs for {total_timesteps}{downscale_str}" + f" seconds from {timestep_start} to {timestep_end}.") + print(f"Simulation time delta: {engine.time_delta}{downscale_str} s," + f"Telemetry trace quanta: {jobs[0].trace_quanta}{downscale_str} s.") + layout_manager = LayoutManager( + sim_config.layout, engine=engine, + debug=sim_config.debug, total_timesteps=total_timesteps, + args_dict=sim_config.get_legacy_args_dict(), **sim_config.system_configs[0].get_legacy(), + ) + layout_manager.run() + + engine_stats = get_engine_stats(engine) + job_stats = get_job_stats(engine) + scheduler_stats = get_scheduler_stats(engine) + if engine.simulate_network: + network_stats = get_network_stats(engine) + else: + network_stats = None + + print_formatted_report( + engine_stats=engine_stats, + job_stats=job_stats, + scheduler_stats=scheduler_stats, + network_stats=network_stats, + ) + + if downscale_str: + downscale_str = "1" + downscale_str + + if sim_config.plot: + assert out # SimConfig validation should check this + if 'power' in sim_config.plot: + pl = Plotter(f"Time ({downscale_str}s)", 'Power (kW)', 'Power History', + out / f'power.{sim_config.imtype}', + uncertainties=sim_config.uncertainties) + x, y = zip(*engine.power_manager.history) + 
pl.plot_history(x, y) + + if 'util' in sim_config.plot: + pl = Plotter(f"Time ({downscale_str}s)", 'System Utilization (%)', + 'System Utilization History', out / f'util.{sim_config.imtype}') + x, y = zip(*engine.sys_util_history) + pl.plot_history(x, y) + + if 'loss' in sim_config.plot: + pl = Plotter(f"Time ({downscale_str}s)", 'Power Losses (kW)', 'Power Loss History', + out / f'loss.{sim_config.imtype}', + uncertainties=sim_config.uncertainties) + x, y = zip(*engine.power_manager.loss_history) + pl.plot_history(x, y) + + pl = Plotter(f"Time ({downscale_str}s)", 'Power Losses (%)', 'Power Loss History', + out / f'loss_pct.{sim_config.imtype}', + uncertainties=sim_config.uncertainties) + x, y = zip(*engine.power_manager.loss_history_percentage) + pl.plot_history(x, y) + + if 'pue' in sim_config.plot: + if engine.cooling_model: + ylabel = 'pue' + title = 'FMU ' + ylabel + 'History' + pl = Plotter(f"Time ({downscale_str}s)", ylabel, title, + out / f'pue.{sim_config.imtype}', + uncertainties=sim_config.uncertainties) + df = pd.DataFrame(engine.cooling_model.fmu_history) + df.to_parquet('cooling_model.parquet', engine='pyarrow') + pl.plot_history(df['time'], df[ylabel]) + else: + print('Cooling model not enabled... skipping output of plot') + + if 'net' in sim_config.plot: + engine.network_model.plot_topology(out) + + if 'temp' in sim_config.plot: + if engine.cooling_model: + ylabel = 'Tr_pri_Out[1]' + title = 'FMU ' + ylabel + 'History' + pl = Plotter(f"Time ({downscale_str}s)", ylabel, title, out / 'temp.svg') + df = pd.DataFrame(engine.cooling_model.fmu_history) + df.to_parquet('cooling_model.parquet', engine='pyarrow') + pl.plot_compare(df['time'], df[ylabel]) + else: + print('Cooling model not enabled... skipping output of plot') + + if out: + if sim_config.uncertainties: + # Parquet cannot handle annotated ufloat format AFAIK + print('Data dump not implemented using uncertainties!') + else: + if engine.cooling_model: + df = pd.DataFrame(engine.cooling_model.fmu_history) + df.to_parquet(out / 'cooling_model.parquet', engine='pyarrow') + + df = pd.DataFrame(engine.power_manager.history) + df.to_parquet(out / 'power_history.parquet', engine='pyarrow') + + df = pd.DataFrame(engine.power_manager.loss_history) + df.to_parquet(out / 'loss_history.parquet', engine='pyarrow') + + df = pd.DataFrame(engine.sys_util_history) + df.to_parquet(out / 'util.parquet', engine='pyarrow') + + # Schedule history + job_history = pd.DataFrame(engine.get_job_history_dict()) + job_history.to_csv(out / "job_history.csv", index=False) + + scheduler_running_history = pd.DataFrame(engine.get_scheduler_running_history()) + scheduler_running_history.to_csv(out / "running_history.csv", index=False) + scheduler_queue_history = pd.DataFrame(engine.get_scheduler_running_history()) + scheduler_queue_history.to_csv(out / "queue_history.csv", index=False) + + try: + with open(out / 'stats.out', 'w') as f: + json.dump(engine_stats, f, indent=4) + json.dump(job_stats, f, indent=4) + except TypeError: # Is this the correct error code? 
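On the inline question just above: yes, json.dump raises TypeError when it meets an object it cannot serialize (NumPy scalars are a common trigger in stats dictionaries), so catching TypeError is the right choice. An alternative sketch that avoids the second write path by coercing unknown objects on the fly:

    import json
    import numpy as np

    stats = {"total_power_kw": np.int64(1234), "policy": "fcfs"}

    try:
        json.dumps(stats)
    except TypeError as err:
        print("plain dump fails:", err)      # np.int64 is not JSON serializable

    print(json.dumps(stats, default=str))    # default= converts anything unknown to str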
+ write_dict_to_file(engine_stats, out / 'stats.out') + write_dict_to_file(job_stats, out / 'stats.out') + + if sim_config.accounts: + try: + with open(out / 'accounts.json', 'w') as f: + json_string = json.dumps(engine.accounts.to_dict()) + f.write(json_string) + except TypeError: + write_dict_to_file(engine.accounts.to_dict(), out / 'accounts.json') + print("Output directory is: ", out) # If output is enabled, the user wants this information as last output + + +def run_parts_sim_add_parser(subparsers: SubParsers): + parser = subparsers.add_parser("run-parts", description=""" + Simulates multi-partition (heterogeneous) systems. Supports replaying telemetry or + generating synthetic workloads across CPU-only, GPU, and mixed partitions. Initializes + per-partition power, FLOPS, and scheduling models, then advances simulations in lockstep. + Outputs per-partition performance, utilization, and energy statistics for systems such as + MIT Supercloud, Setonix, Adastra, and LUMI. + """) + parser.add_argument("config_file", nargs="?", default=None, help=""" + YAML sim config file, can be used to configure an experiment instead of using CLI + flags. Pass "-" to read from stdin. + """) + model_validate = pydantic_add_args(parser, MultiPartSimConfig, model_config={ + "cli_shortcuts": SIM_SHORTCUTS, + }) + parser.set_defaults( + impl=lambda args: run_parts_sim(model_validate(args, read_yaml_parsed(MultiPartSimConfig, args.config_file))) + ) + + +def run_parts_sim(sim_config: MultiPartSimConfig): + if len(sim_config.system_configs) == 1: + warnings.warn( + "run_parts_sim is usually for multiple partitions. Did you mean to run with one?", + UserWarning + ) + + multi_engine = MultiPartEngine(sim_config) + + out = sim_config.get_output() + if out: + out.mkdir(parents=True) + for part, engine in multi_engine.engines.items(): + engine.telemetry.save_snapshot( + dest=str(out / part.split('/')[-1]), + result=engine.get_workload_data(), + args=sim_config, + ) + (out / 'sim_config.yaml').write_text(sim_config.dump_yaml()) + + ui_update_freq = sim_config.system_configs[0].scheduler.ui_update_freq + gen = multi_engine.run_simulation() + + for tick_datas in gen: + sys_power = 0 + tick_datas = {k: v for k, v in tick_datas.items() if v} # Filter nones + timestep = list(tick_datas.values())[0].current_timestep if tick_datas else None + + if timestep and timestep % ui_update_freq == 0: + for part, tick_data in tick_datas.items(): + engine = multi_engine.engines[part] + + sys_util = engine.sys_util_history[-1] if engine.sys_util_history else (0, 0.0) + if hasattr(engine.resource_manager, 'allocated_cpu_cores'): + allocated_cores = engine.resource_manager.allocated_cpu_cores + print( + f"[DEBUG] {part} - Timestep {timestep} - Jobs running: {len(engine.running)} -", + f"Utilization: {sys_util[1]:.2f}% - Allocated Cores: {allocated_cores} - ", + f"Power: {engine.sys_power:.1f}kW", + flush=True, + ) + sys_power += engine.sys_power + print(f"system power: {sys_power:.1f}kW", flush=True) + + print("Simulation complete.", flush=True) + + # Print statistics for each partition + for part, engine in multi_engine.engines.items(): + print(f"\n=== Partition: {part} ===") + + engine_stats = get_engine_stats(engine) + job_stats = get_job_stats(engine) + scheduler_stats = get_scheduler_stats(engine) + network_stats = get_network_stats(engine) if sim_config.simulate_network else None + + # Print a formatted report + print_formatted_report( + engine_stats=engine_stats, + job_stats=job_stats, + scheduler_stats=scheduler_stats, + 
network_stats=network_stats, + ) + + +def show_add_parser(subparsers: SubParsers): + parser = subparsers.add_parser("show", description=""" + Outputs the given CLI args as a YAML config file that can be used to re-run the same + simulation. + """) + parser.add_argument("config_file", nargs="?", default=None, help=""" + Input YAML sim config file. Can be used to slightly modify an existing sim config. + """) + parser.add_argument("--show-defaults", default=False, help=""" + If true, include defaults in the output YAML + """) + model_validate = pydantic_add_args(parser, SingleSimConfig, model_config={ + "cli_shortcuts": SIM_SHORTCUTS, + }) + + def impl(args): + sim_config = model_validate(args, read_yaml_parsed(SingleSimConfig, args.config_file)) + show(sim_config, show_defaults=args.show_defaults) + + parser.set_defaults(impl=impl) + + +def show(sim_config: SingleSimConfig, show_defaults=False): + print(sim_config.dump_yaml(exclude_unset=not show_defaults), end='') diff --git a/raps/schedulers/__init__.py b/raps/schedulers/__init__.py index ca3431e041c0b866f5df9b541122228c1134a4ad..201963559679ada5c902bbca171cfc2abe696202 100644 --- a/raps/schedulers/__init__.py +++ b/raps/schedulers/__init__.py @@ -1,6 +1,7 @@ from importlib import import_module + def load_scheduler(scheduler_type="default"): """Dynamically loads a scheduler by type.""" module = import_module(f".{scheduler_type}", package="raps.schedulers") - return getattr(module, f"Scheduler") + return getattr(module, "Scheduler") diff --git a/raps/schedulers/default.py b/raps/schedulers/default.py index c3291ac0440c54e65cc628151e803527d1540bcb..2a1fd21c71ca5e1b4cb89b0f7a0321c9671ca9c5 100644 --- a/raps/schedulers/default.py +++ b/raps/schedulers/default.py @@ -1,24 +1,17 @@ -from enum import Enum +from typing import List from ..utils import summarize_ranges - -from ..workload import MAX_PRIORITY - - -class PolicyType(Enum): - """Supported scheduling policies.""" - FCFS = 'fcfs' - BACKFILL = 'backfill' - PRIORITY = 'priority' - FUGAKU_PTS = 'fugaku_pts' - SJF = 'sjf' +from ..policy import PolicyType, BackfillType class Scheduler: """ Default job scheduler with various scheduling policies. 
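A short usage sketch of the dynamic loader above, paired with the resource-manager factory; the empty config dict is only for illustration, a real run passes the full system config:

    from raps.schedulers import load_scheduler
    from raps.resmgr import make_resource_manager

    rm = make_resource_manager(total_nodes=4, down_nodes=[], config={})

    Scheduler = load_scheduler("default")      # imports raps.schedulers.default
    sched = Scheduler(config={}, policy="fcfs", bfpolicy="firstfit", resource_manager=rm)
    print(sched.policy, sched.bfpolicy)        # e.g. PolicyType.FCFS BackfillType.FIRSTFIT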
""" - def __init__(self, config, policy, resource_manager=None): + def __init__(self, config, policy, bfpolicy=None, jobs=None, resource_manager=None): self.config = config + if policy is None: # policy is passed as policy=None, therefore default is not choosen + policy = "replay" self.policy = PolicyType(policy) + self.bfpolicy = BackfillType(bfpolicy) if resource_manager is None: raise ValueError("Scheduler requires a ResourceManager instance") self.resource_manager = resource_manager @@ -26,123 +19,176 @@ class Scheduler: def sort_jobs(self, queue, accounts=None): """Sort jobs based on the selected scheduling policy.""" - if self.policy == PolicyType.FCFS or self.policy == PolicyType.BACKFILL: + if self.policy == PolicyType.FCFS: return sorted(queue, key=lambda job: job.submit_time) - elif self.policy == PolicyType.SJF: - return sorted(queue, key=lambda job: job.wall_time) elif self.policy == PolicyType.PRIORITY: return sorted(queue, key=lambda job: job.priority, reverse=True) - elif self.policy == PolicyType.FUGAKU_PTS: - return self.sort_fugaku_redeeming(queue, accounts) + elif self.policy == PolicyType.SJF: + return sorted(queue, key=lambda job: job.time_limit) + elif self.policy == PolicyType.LJF: + return sorted(queue, key=lambda job: job.nodes_required, reverse=True) + elif self.policy == PolicyType.REPLAY: + return sorted(queue, key=lambda job: job.start_time) else: - raise ValueError(f"Unknown policy type: {self.policy}") + raise ValueError(f"Policy not implemented: {self.policy}") - def schedule(self, queue, running, current_time, accounts=None, sorted=False, debug=False): + def schedule(self, queue, running, current_time, accounts=None, sorted=False): # Sort the queue in place. if not sorted: queue[:] = self.sort_jobs(queue, accounts) # Iterate over a copy of the queue since we might remove items for job in queue[:]: + if self.policy == PolicyType.REPLAY: + if job.start_time > current_time: + continue # Replay: Job didn't start yet. Next! + else: + # assert job.start_time == current_time, f"{job.start_time} == {current_time}" + pass + else: + pass + + nodes_available = self.check_available_nodes(job) + + if nodes_available: + self.place_job_and_manage_queues(job, queue, running, current_time) + else: # In case the job was not placed, see how we should continue: + if self.bfpolicy is not None or self.bfpolicy is not BackfillType.NONE: + self.backfill(queue, running, current_time) + + # After backfill dedice continue processing the queue or wait, continuing may result in fairness issues. + if self.policy in [PolicyType.REPLAY]: + continue # Regardless if the job at the front of the queue doenst fit, try placing all of them. + elif self.policy in [PolicyType.FCFS, PolicyType.PRIORITY, + PolicyType.LJF, PolicyType.SJF]: + break # The job at the front of the queue doesnt fit stop processing the queue. + else: + raise NotImplementedError( + "Depending on the Policy this choice should be explicit. Add the implementation above!") + + def prepare_system_state(self, jobs_to_submit: List, running, timestep_start): + # def schedule(self, queue, running, current_time, accounts=None, sorted=False, debug=False): + """ + In the case of replay and fast forward, previously placed jobs should be present. - # For synthetic jobs the number of requested nodes is given. - # Make sure the available nodes count meets job.nodes_required. - synthetic_bool = len(self.resource_manager.available_nodes) >= job.nodes_required - - # For telemetry replay jobs a list of requested nodes is provided. 
- # Make sure the requested nodes are available. - telemetry_bool = False - if job.requested_nodes: - telemetry_bool = set(job.requested_nodes).issubset(set(self.resource_manager.available_nodes)) - - if synthetic_bool or telemetry_bool: - self.resource_manager.assign_nodes_to_job(job, current_time) - running.append(job) - queue.remove(job) - if debug: - scheduled_nodes = summarize_ranges(job.scheduled_nodes) - print(f"t={current_time}: Scheduled job {job.id} with wall time {job.wall_time} on nodes {scheduled_nodes}") + """ + if self.policy == PolicyType.REPLAY: + total_jobs = len(jobs_to_submit) + print(f"All jobs: {total_jobs}") + + # Keep only jobs have an end time in the future future. + jobs_to_submit[:] = [job for job in jobs_to_submit if job['end_time'] >= timestep_start] + print(f"Num jobs in the past: {total_jobs - len(jobs_to_submit)}") + + # Identify jobs that started in the past and Split them from the jobs that will start in the future: + jobs_to_start_now = [job for job in jobs_to_submit if job['start_time'] < timestep_start] + print(f"Num jobs that started in the past: {len(jobs_to_start_now)}") + + jobs_to_submit[:] = [job for job in jobs_to_submit if job['start_time'] >= timestep_start] + print(f"Num jobs to be schedule in the simulation: {len(jobs_to_submit)}") + + # Now schedule them with their orignal start time. + # This has to be done one by one! + for job in jobs_to_start_now: + self.schedule([job], running, job['start_time'], sorted=True) + # self.schedule(jobs_to_start_now, running, 0, False) + return jobs_to_submit + else: + return jobs_to_submit + + def place_job_and_manage_queues(self, job, queue, running, current_time): + self.resource_manager.assign_nodes_to_job(job, current_time, self.policy) + running.append(job) + queue.remove(job) + if self.debug: + scheduled_nodes = summarize_ranges(job.scheduled_nodes) + print(f"t={current_time}: Scheduled job {job.id} with time limit " + f"{job.time_limit} on nodes {scheduled_nodes}") + + def check_available_nodes(self, job): + nodes_available = False + if job.nodes_required <= len(self.resource_manager.available_nodes): + if self.policy == PolicyType.REPLAY and job.scheduled_nodes: # Check if we need exact set + # is exact set available: + nodes_available = set(job.scheduled_nodes).issubset(set(self.resource_manager.available_nodes)) + else: + # we dont need the exact set: + nodes_available = True # Checked above + if job.nodes_required == 0: + raise ValueError(f"Job Requested zero nodes: {job}") + # clear scheduled nodes + job.scheduled_nodes = [] + else: + pass # not enough nodes available + return nodes_available + + def backfill(self, queue: List, running: List, current_time): + # Try to find a backfill candidate from the entire queue. + while queue: + backfill_job = self.find_backfill_job(queue, running, current_time) + if backfill_job: + self.place_job_and_manage_queues(backfill_job, queue, running, current_time) else: - if self.policy == PolicyType.BACKFILL: - # Try to find a backfill candidate from the entire queue. 
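prepare_system_state above fast-forwards a replay by partitioning the raw job records into already-finished (dropped), already-running (scheduled immediately at their original start time), and still-to-come. A tiny sketch of that partitioning with made-up job dicts:

    timestep_start = 100

    jobs = [
        {"id": 1, "start_time": 10, "end_time": 50},    # ended before the window: dropped
        {"id": 2, "start_time": 60, "end_time": 400},   # started in the past, still running
        {"id": 3, "start_time": 150, "end_time": 300},  # starts inside the window
    ]

    jobs = [j for j in jobs if j["end_time"] >= timestep_start]
    start_now = [j for j in jobs if j["start_time"] < timestep_start]
    to_submit = [j for j in jobs if j["start_time"] >= timestep_start]

    print([j["id"] for j in start_now], [j["id"] for j in to_submit])   # [2] [3]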
- backfill_job = self.find_backfill_job(queue, len(self.resource_manager.available_nodes), current_time) - if backfill_job: - self.assign_nodes_to_job(backfill_job, self.resource_manager.available_nodes, current_time) - running.append(backfill_job) - queue.remove(backfill_job) - if debug: - scheduled_nodes = summarize_ranges(backfill_job.scheduled_nodes) - print(f"t={current_time}: Backfilling job {backfill_job.id} with wall time {backfill_job.wall_time} on nodes {scheduled_nodes}") - - - def find_backfill_job(self, queue, num_free_nodes, current_time): + break + + def find_backfill_job(self, queue, running, current_time): """Finds a backfill job based on available nodes and estimated completion times. - Based on pseudocode from Leonenkov and Zhumatiy, 'Introducing new backfill-based + Loosely based on pseudocode from Leonenkov and Zhumatiy, 'Introducing new backfill-based scheduler for slurm resource manager.' Procedia computer science 66 (2015): 661-669. """ - if not queue: return None + # Identify when the nex job in the queue could run as a time limit: first_job = queue[0] + nodes_required = 0 + if self.policy == PolicyType.REPLAY and first_job.scheduled_nodes: # This needs to be done propper! + nodes_required = len(first_job.scheduled_nodes) + else: + nodes_required = first_job.nodes_required - for job in queue: - job.end_time = current_time + job.wall_time # Estimate end time - - # Sort jobs according to their termination time (end_time) - sorted_queue = sorted(queue, key=lambda job: job.end_time) - - # Compute shadow time by accumulating nodes - sum_nodes = 0 - shadow_time = None - num_extra_nodes = 0 + sorted_running = sorted(running, key=lambda job: job.time_limit) - for job in sorted_queue: - sum_nodes += job.nodes_required - if sum_nodes >= first_job.nodes_required: - shadow_time = current_time + job.wall_time - num_extra_nodes = sum_nodes - job.nodes_required + # Identify when we have enough nodes therefore the start time of the first_job in line + shadow_time_end = 0 + shadow_nodes_avail = len(self.resource_manager.available_nodes) + for job in sorted_running: + if shadow_nodes_avail >= nodes_required: break + else: + shadow_nodes_avail += job.nodes_required + shadow_time_end = job.start_time + job.time_limit + + time_limit = shadow_time_end - current_time + # We now have the time_limit after which no backfilled job should end + # as the next job in line has the necessary resrouces after this time limit. + + # Find and return the first job that fits + if self.bfpolicy == BackfillType.NONE: + pass + elif self.bfpolicy == BackfillType.EASY: + queue[:] = sorted(queue, key=lambda job: job.submit_time) + return self.return_first_fit(queue, time_limit) + elif self.bfpolicy == BackfillType.FIRSTFIT: + pass # Stay with the prioritization! + return self.return_first_fit(queue, time_limit) + elif self.bfpolicy in [BackfillType.BESTFIT, + BackfillType.GREEDY, + BackfillType.CONSERVATIVE, + ]: + raise NotImplementedError(f"{self.bfpolicy} not implemented! 
Please implement!") + else: + raise NotImplementedError(f"{self.bfpolicy} not implemented.") - # Find backfill job + def return_first_fit(self, queue, time_limit): for job in queue: - condition1 = job.nodes_required <= num_free_nodes and current_time + job.wall_time < shadow_time - condition2 = job.nodes_required <= min(num_free_nodes, num_extra_nodes) - - if condition1 or condition2: - return job - + if job.time_limit <= time_limit: + nodes_available = self.check_available_nodes(job) + if nodes_available: + return job + else: + continue + else: + continue return None - - def sort_fugaku_redeeming(self, queue, accounts=None): - if queue == []: - return queue - # Priority queues not yet implemented: - # Strategy: Sort by Fugaku Points Representing the Priority Queue - # Everything with negative Fugaku Points get sorted according to normal priority - priority_triple_list = [] - for job in queue: - fugaku_priority = accounts.account_dict[job.account].fugaku_points - # Create a tuple of the job and the priority - priority = job.priority - priority_triple_list.append((fugaku_priority,priority,job)) - # Sort everythin according to fugaku_points - priority_triple_list = sorted(priority_triple_list, key=lambda x:x[0], reverse=True) - # Find the first element with negative fugaku_points - for cutoff, triple in enumerate(priority_triple_list): - fugaku_priority, _, _ = triple - if fugaku_priority < 0: - break - first_part = priority_triple_list[:cutoff] - # Sort everything afterwards according to job priority - second_part = sorted(priority_triple_list[cutoff:], key=lambda x:x[1], reverse=True) - queue_a = [] - queue_b = [] - if first_part != []: - _, _, queue_a = zip(*first_part) - queue_a = list(queue_a) - if second_part != []: - _, _, queue_b = zip(*second_part) - queue_b = list(queue_b) - return queue_a + queue_b diff --git a/raps/schedulers/experimental.py b/raps/schedulers/experimental.py new file mode 100644 index 0000000000000000000000000000000000000000..b77b0ceb7e219639fe1196f0218623467fce2240 --- /dev/null +++ b/raps/schedulers/experimental.py @@ -0,0 +1,367 @@ +from typing import List +from enum import Enum +from ..utils import summarize_ranges + +from ..policy import BackfillType + +# Extending PolicyType: +from ..policy import PolicyType as BasePolicyType +from ..utils import ValueComparableEnum + + +class ExtendedPolicyType(ValueComparableEnum): + ACCT_FUGAKU_PTS = 'acct_fugaku_pts' + ACCT_AVG_P = 'acct_avg_power' + ACCT_LOW_AVG_P = 'acct_low_avg_power' + ACCT_AVG_PW4LJ = 'acct_avg_power_w4lj' + ACCT_EDP = 'acct_edp' + ACCT_ED2P = 'acct_ed2p' + ACCT_PDP = 'acct_pdp' + + +# Boilerplate to combine the enums +combined_members = { + **{name: member.value for name, member in BasePolicyType.__members__.items()}, + **{name: member.value for name, member in ExtendedPolicyType.__members__.items()} +} +PolicyType = Enum('PolicyType', combined_members, type=ValueComparableEnum) +# The scheduler can now use both the BasePolicies and the Extended Policies + + +class Scheduler: + """ Default job scheduler with various scheduling policies. 
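A worked numeric sketch of the shadow-time computation in find_backfill_job above (all values invented): running jobs are walked in release order, freed nodes accumulate until the head-of-queue job fits, and the last release time consulted becomes the end of the backfill window:

    current_time = 200
    available = 2                  # free nodes right now
    head_of_queue_needs = 6

    # (release_time, nodes_freed) per running job, sorted by release time
    running = [(500, 3), (800, 4)]

    shadow_time_end = 0
    for release_time, nodes_freed in running:
        if available >= head_of_queue_needs:
            break
        available += nodes_freed
        shadow_time_end = release_time

    time_limit = shadow_time_end - current_time
    print(time_limit)   # 600: a backfill candidate must finish within 600 s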
""" + + def __init__(self, config, policy, bfpolicy=None, jobs=None, resource_manager=None): + self.config = config + if policy is None: # policy is passed as policy=None, therefore default is not choosen + policy = "replay" + self.policy = PolicyType(policy) + self.bfpolicy = BackfillType(bfpolicy) + if resource_manager is None: + raise ValueError("Scheduler requires a ResourceManager instance") + self.resource_manager = resource_manager + self.debug = False + + def sort_jobs(self, queue, accounts=None): + """Sort jobs based on the selected scheduling policy.""" + if self.policy == PolicyType.REPLAY: # REPLAY NEEDS TO BE THERE + return sorted(queue, key=lambda job: job.start_time) + elif self.policy == PolicyType.ACCT_FUGAKU_PTS: + return self.sort_fugaku_redeeming(queue, accounts) + elif self.policy == PolicyType.ACCT_AVG_PW4LJ: + return self.sort_avg_Pw4LJ(queue, accounts) + elif self.policy == PolicyType.ACCT_AVG_P: + return self.sort_avg_P(queue, accounts) + elif self.policy == PolicyType.ACCT_LOW_AVG_P: + return self.sort_low_avg_P(queue, accounts) + elif self.policy == PolicyType.ACCT_EDP: + return self.sort_AEDP(queue, accounts) + elif self.policy == PolicyType.ACCT_ED2P: + return self.sort_AED2P(queue, accounts) + elif self.policy == PolicyType.ACCT_PDP: + return self.sort_APDP(queue, accounts) + else: + raise ValueError(f"Policy not implemented: {self.policy}") + + def schedule(self, queue, running, current_time, accounts=None, sorted=False): + # Sort the queue in place. + if not sorted: + queue[:] = self.sort_jobs(queue, accounts) + + # Iterate over a copy of the queue since we might remove items + for job in queue[:]: + if self.policy == PolicyType.REPLAY: + if job.start_time > current_time: + continue # Replay: Job didn't start yet. Next! + else: + pass + else: + pass + + nodes_available = self.check_available_nodes(job) + + if nodes_available: + self.place_job_and_manage_queues(job, queue, running, current_time) + else: # In case the job was not placed, see how we should continue: + if self.bfpolicy is not None: + self.backfill(queue, running, current_time) + + # After backfill dedice continue processing the queue or wait, continuing may result in fairness issues. + if self.policy in [PolicyType.REPLAY]: # REPLAY NEEDS TO BE THERE + continue # Regardless if the job at the front of the queue doenst fit, try placing all of them. + elif self.policy in [PolicyType.ACCT_FUGAKU_PTS, + PolicyType.ACCT_AVG_PW4LJ, PolicyType.ACCT_LOW_AVG_P, PolicyType.ACCT_AVG_P, + PolicyType.ACCT_EDP, PolicyType.ACCT_ED2P, PolicyType.ACCT_PDP + ]: + break # The job at the front of the queue doesnt fit stop processing the queue. + else: + raise NotImplementedError( + "Depending on the Policy this choice should be explicit. 
Add the implementation above!") + + def place_job_and_manage_queues(self, job, queue, running, current_time): + self.resource_manager.assign_nodes_to_job(job, current_time) + running.append(job) + queue.remove(job) + if self.debug: + scheduled_nodes = summarize_ranges(job.scheduled_nodes) + print(f"t={current_time}: Scheduled job {job.id} with time limit " + f"{job.time_limit} on nodes {scheduled_nodes}") + + def check_available_nodes(self, job): + nodes_available = False + if job.nodes_required <= len(self.resource_manager.available_nodes): + if self.policy == PolicyType.REPLAY and job.scheduled_nodes: # Check if we need exact set + # is exact set available: + nodes_available = set(job.scheduled_nodes).issubset(set(self.resource_manager.available_nodes)) + else: + # we dont need the exact set: + nodes_available = True # Checked above + if job.nodes_required == 0: + raise ValueError(f"Job Requested zero nodes: {job}") + # clear scheduled nodes + job.scheduled_nodes = [] + else: + pass # not enough nodes available + return nodes_available + + def backfill(self, queue: List, running: List, current_time): + # Try to find a backfill candidate from the entire queue. + while queue: + backfill_job = self.find_backfill_job(queue, running, current_time) + if backfill_job: + self.place_job_and_manage_queues(backfill_job, queue, running, current_time) + else: + break + + def find_backfill_job(self, queue, running, current_time): + """Finds a backfill job based on available nodes and estimated completion times. + + Loosely based on pseudocode from Leonenkov and Zhumatiy, 'Introducing new backfill-based + scheduler for slurm resource manager.' Procedia computer science 66 (2015): 661-669. + """ + if not queue: + return None + + # Identify when the nex job in the queue could run as a time limit: + first_job = queue[0] + nodes_required = 0 + if self.policy == PolicyType.REPLAY and first_job.scheduled_nodes: + nodes_required = len(first_job.scheduled_nodes) + else: + nodes_required = first_job.nodes_required + + sorted_running = sorted(running, key=lambda job: job.end_time) + + # Identify when we have enough nodes therefore the start time of the first_job in line + shadow_time_end = 0 + shadow_nodes_avail = len(self.resource_manager.available_nodes) + for job in sorted_running: + if shadow_nodes_avail >= nodes_required: + break + else: + shadow_nodes_avail += job.nodes_required + shadow_time_end = job.time_limit + + time_limit = shadow_time_end - current_time + # We now have the time_limit after which no backfilled job should end + # as the next job in line has the necessary resrouces after this time limit. + + # Find and return the first job that fits + if self.bfpolicy == BackfillType.NONE: + pass + elif self.bfpolicy == BackfillType.EASY: + queue[:] = sorted(queue, key=lambda job: job.submit_time) + return self.return_first_fit(queue, time_limit) + elif self.bfpolicy == BackfillType.FIRSTFIT: + pass # Stay with the prioritization! + return self.return_first_fit(queue, time_limit) + elif self.bfpolicy in [BackfillType.BESTFIT, + BackfillType.GREEDY, + BackfillType.CONSERVATIVE, + ]: + raise NotImplementedError(f"{self.bfpolicy} not implemented! 
Please implement!") + else: + raise NotImplementedError(f"{self.bfpolicy} not implemented.") + + def return_first_fit(self, queue, time_limit): + for job in queue: + if job.time_limit <= time_limit: + nodes_available = self.check_available_nodes(job) + if nodes_available: + return job + else: + continue + else: + continue + return None + + def sort_fugaku_redeeming(self, queue, accounts=None): + if queue == []: + return queue + # Priority queues not yet implemented: + # Strategy: Sort by Fugaku Points Representing the Priority Queue + # Everything with negative Fugaku Points get sorted according to normal priority + priority_triple_list = [] + for job in queue: + assert accounts and accounts.account_dict + fugaku_priority = accounts.account_dict[job.account].fugaku_points + if fugaku_priority is None: + fugaku_priority = 0 + # Create a tuple of the job and the priority + priority = job.priority + priority_triple_list.append((fugaku_priority, priority, job)) + # Sort everythin according to fugaku_points + priority_triple_list = sorted(priority_triple_list, key=lambda x: x[0], reverse=True) + # Find the first element with negative fugaku_points + for cutoff, triple in enumerate(priority_triple_list): + fugaku_priority, _, _ = triple + if fugaku_priority < 0: + break + first_part = priority_triple_list[:cutoff] + # Sort everything afterwards according to job priority + second_part = sorted(priority_triple_list[cutoff:], key=lambda x: x[1], reverse=True) + queue_a = [] + queue_b = [] + if first_part != []: + _, _, queue_a = zip(*first_part) + queue_a = list(queue_a) + if second_part != []: + _, _, queue_b = zip(*second_part) + queue_b = list(queue_b) + return queue_a + queue_b + + def sort_avg_Pw4LJ(self, queue, accounts=None): + if queue == []: + return queue + priority_tuple_list = [] + for job in queue: + assert accounts and accounts.account_dict + power = accounts.account_dict[job.account].avg_power + if power is None: + power = 0 + # Create a tuple of the job and the priority + if job.nodes_required: + nnodes = job.nodes_required + elif job.scheduled_nodes: + nnodes = len(job.scheduled_nodes) + else: + raise KeyError("No nodes indicated") + + priority = 100 * nnodes * power + priority_tuple_list.append((priority, job)) + # Sort everythin according to new priority + priority_tuple_list = sorted(priority_tuple_list, key=lambda x: x[0], reverse=True) + queue = [] + if priority_tuple_list != []: + _, queue = zip(*priority_tuple_list) + queue = list(queue) + return queue + + def sort_avg_P(self, queue, accounts=None): + if queue == []: + return queue + priority_tuple_list = [] + for job in queue: + assert accounts and accounts.accounts_dict + power = accounts.account_dict[job.account].avg_power + if power is None: + power = 0 + + priority = power + priority_tuple_list.append((priority, job)) + # Sort everythin according to power_acct_priority Disregarding size + priority_tuple_list = sorted(priority_tuple_list, key=lambda x: x[0], reverse=True) + queue = [] + if priority_tuple_list != []: + _, queue = zip(*priority_tuple_list) + queue = list(queue) + return queue + + def sort_low_avg_P(self, queue, accounts=None): + if queue == []: + return queue + priority_tuple_list = [] + for job in queue: + assert accounts and accounts.accounts_dict + power = accounts.account_dict[job.account].avg_power + if power is None: + power = 0 + + priority = power + priority_tuple_list.append((priority, job)) + # Sort everythin according to power_acct_priority Disregarding size + priority_tuple_list = 
sorted(priority_tuple_list, key=lambda x: x[0], reverse=False) + queue = [] + if priority_tuple_list != []: + _, queue = zip(*priority_tuple_list) + queue = list(queue) + return queue + + def sort_AEDP(self, queue, accounts=None): + if queue == []: + return queue + priority_tuple_list = [] + for job in queue: + assert accounts and accounts.accounts_dict + energy = accounts.account_dict[job.account].energy_allocated + time = accounts.account_dict[job.account].time_allocated + if energy is None: + energy = 0 + if time is None: + time = 0 + + priority = energy * time + priority_tuple_list.append((priority, job)) + # Sort everythin according to power_acct_priority Disregarding size + priority_tuple_list = sorted(priority_tuple_list, key=lambda x: x[0], reverse=False) + queue = [] + if priority_tuple_list != []: + _, queue = zip(*priority_tuple_list) + queue = list(queue) + return queue + + def sort_AED2P(self, queue, accounts=None): + if queue == []: + return queue + priority_tuple_list = [] + for job in queue: + assert accounts and accounts.accounts_dict + energy = accounts.account_dict[job.account].energy_allocated + time = accounts.account_dict[job.account].time_allocated + if energy is None: + energy = 0 + if time is None: + time = 0 + + priority = energy * time * time + priority_tuple_list.append((priority, job)) + # Sort everythin according to power_acct_priority Disregarding size + priority_tuple_list = sorted(priority_tuple_list, key=lambda x: x[0], reverse=False) + queue = [] + if priority_tuple_list != []: + _, queue = zip(*priority_tuple_list) + queue = list(queue) + return queue + + def sort_APDP(self, queue, accounts=None): + if queue == []: + return queue + priority_tuple_list = [] + for job in queue: + assert accounts and accounts.accounts_dict + power = accounts.account_dict[job.account].avg_power + time = accounts.account_dict[job.account].time_allocated + if power is None: + power = 0 + if time is None: + time = 0 + + priority = power * time + priority_tuple_list.append((priority, job)) + # Sort everythin according to power_acct_priority Disregarding size + priority_tuple_list = sorted(priority_tuple_list, key=lambda x: x[0], reverse=False) + queue = [] + if priority_tuple_list != []: + _, queue = zip(*priority_tuple_list) + queue = list(queue) + return queue diff --git a/raps/schedulers/fastsim.py b/raps/schedulers/fastsim.py new file mode 100644 index 0000000000000000000000000000000000000000..e930a1c5d552bd8acb580939aaa9ed8633cb7a23 --- /dev/null +++ b/raps/schedulers/fastsim.py @@ -0,0 +1,163 @@ +import pandas as pd +import sys +import os +import zmq + +from ..policy import PolicyType, BackfillType +from raps.telemetry import Telemetry +from ..job import JobState +from raps.sim_config import args +from raps.system_config import get_system_config + +# Run with this command: +# raps run --system kestrel -f ../data/fastsim_jobs_output.parquet --scheduler fastsim --policy priority --start 2024-09-01T00:00 --end 2024-09-15T00:00 + +class Scheduler(): + """ + FastSim-backed scheduler (strict lockstep via ZeroMQ). 
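+
+    Example GET exchange over the REQ socket (see the Protocol section below;
+    payloads and IDs are illustrative, following _rpc() and _fastsim_running_ids()):
+        -> {"op": "GET", "t": 42}
+        <- {"t": 42, "running_ids": [1001, 1002]}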
+ + Protocol (server side is FastSim --serve): + - INIT -> { init_time } + - GET { t } -> { t, running_ids } (server acks t after reply) + - END (on shutdown) -> { ok: true } + + Semantics at engine second t: + - R_t := authoritative running IDs from FastSim for t + - started = R_t - prev_R + -> stamp start_time=t (once), assign nodes once, mark RUNNING + - finished = prev_R - R_t + -> stamp end_time=t (engine will finalize next tick in prepare_timestep) + + running list for this tick = R_t & finished (so those finishing at t remain + visible for one more scheduler call; engine completes them on next second). + """ + + def __init__(self, config, resource_manager, **kwargs): + self.config = config + self.policy = PolicyType(kwargs.get('policy')) + self.bfpolicy = BackfillType(kwargs.get('backfill')) + self.debug = bool(kwargs.get('debug', False)) + + # ---- ZeroMQ client ---- + self.endpoint = kwargs.get('plugin_endpoint', 'ipc:///tmp/fastsim.sock') + self._ctx = zmq.Context.instance() + self._sock = self._ctx.socket(zmq.REQ) + self._sock.setsockopt(zmq.LINGER, 0) + self._sock.connect(self.endpoint) + + # INIT handshake: fetch FastSim's init_time (ISO string). + self.init_time_iso = self._rpc('INIT').get('init_time') + + self.resource_manager = resource_manager + + # Job metadata: id -> Job + self.jobids_to_jobs = {} + self.allocated_jobs = set() # job_ids we have assigned nodes for + self.prev_running_ids = set() # R_{t-1} + + # Build the Job objects from RAPS Telemetry (needed so ExaDigiT subsystems have objects) + args_dict = vars(args) + config = get_system_config(args.system).get_legacy() + args_dict['config'] = config + td = Telemetry(**args_dict) + + print("...Now loading jobs to FastSim scheduler.") + jobs, _, _ = td.load_data(args.replay) + for job in jobs: + self.jobids_to_jobs[job.id] = job + + if self.debug: + print(f"[RAPS-FastSim] Connected to {self.endpoint}; init_time={self.init_time_iso}", file=sys.stderr) + + def _rpc(self, op, **payload): + """Send a JSON request and return the JSON reply (dict).""" + try: + msg = {'op': op} + msg.update(payload) + self._sock.send_json(msg) + rep = self._sock.recv_json() + except Exception as e: + raise RuntimeError(f"[RAPS-FastSim] RPC {op} failed: {e}") from e + if isinstance(rep, dict) and 'error' in rep: + raise RuntimeError(f"[RAPS-FastSim] RPC {op} error: {rep['error']}") + return rep + + def _fastsim_running_ids(self, t: int): + """Blocking call: get authoritative running job IDs for second t.""" + rep = self._rpc('GET', t=int(t)) + rids = rep.get('running_ids', []) + return set(rids) + + def schedule(self, queue=None, running=None, current_time=None, accounts=None, sorted=False): + """ + Called by Engine when RAPS detects an event. 
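+
+        Illustrative example of the diff logic below (job IDs are made up):
+        if prev_running_ids == {"A", "B"} and FastSim reports R_t == {"B", "C"}
+        for second t, then started == {"C"} (stamp start_time=t, assign nodes
+        once, mark RUNNING) and finished == {"A"} (stamp end_time=t; the engine
+        finalizes "A" on its next tick).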
+ """ + running = running if running is not None else [] + + t = int(current_time) + + # Get authoritative running set for second t (blocks until available) + R_t = self._fastsim_running_ids(t) + + # Diff vs previous second + started_ids = R_t - self.prev_running_ids + finished_ids = self.prev_running_ids - R_t # these end at t; engine finalizes next tick + + # Handle starts: stamp start_time, assign nodes, mark RUNNING + for jid in started_ids: + job = self.jobids_to_jobs.get(jid) + if job is None: + if self.debug: + print(f"[RAPS-FastSim][WARN] Unknown job id from FastSim: {jid}", file=sys.stderr) + continue + + # Assign nodes exactly once + if jid not in self.allocated_jobs: + self.resource_manager.assign_nodes_to_job(job, t, self.policy) + self.allocated_jobs.add(jid) + + # FastSim is authoritative + job.start_time = t + # IMPORTANT: prevent premature completion by RM’s default behavior + job.end_time = None # Prevents RAPS from removing job + job.state = JobState.RUNNING + + # Handle finishes: stamp end_time=t (engine.prepare_timestep next tick completes) + running.clear() + for jid in finished_ids: + job = self.jobids_to_jobs.get(jid) + if job is not None: + # overwrite any prior value; FastSim is the source of truth + # job.end_time = t + if job.start_time is not None: + observed = t - job.start_time + if (job.time_limit is None) or (job.time_limit < observed): + # This is necessary since RAPS is handling finishing jobs, but schedule is not always + # called at every tick, even though the job may have finished in FastSim during that tick. + # TODO: Deal with this, because it messes up the end time of some jobs. + # print(f"Extending {job.id} runtime {job.time_limit} to match observed {observed} at finish.") + job.time_limit = observed + # print((f"Job {job.id} is finished, start time: {job.start_time}, wall time: {job.time_limit}," + # f"end time: {job.end_time}, at time {t}. With nodes {job.scheduled_nodes}.")) + job.end_time = t + job.time_limit = t - job.start_time + running.append(job) + + # Running list reflects exactly FastSim’s R_t + for jid in R_t: + job = self.jobids_to_jobs.get(jid) + if job is not None: + # defensively ensure state isn’t stuck at COMPLETED + if job.state != JobState.RUNNING: + job.state = JobState.RUNNING + running.append(job) + + # Update prev + self.prev_running_ids = R_t + + def end_sim(self): + # Ask server to stop + try: + self._rpc('END') + except Exception: + pass \ No newline at end of file diff --git a/raps/schedulers/multitenant.py b/raps/schedulers/multitenant.py new file mode 100644 index 0000000000000000000000000000000000000000..85c302407178dbc16017a69f135d65fd9a9ff5b7 --- /dev/null +++ b/raps/schedulers/multitenant.py @@ -0,0 +1,207 @@ +from typing import List +from ..utils import summarize_ranges +from ..policy import PolicyType, BackfillType + + +class Scheduler: + """ Default job scheduler with various scheduling policies. 
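+
+    This multi-tenant variant places jobs by per-node CPU core and GPU unit
+    availability (see check_available_nodes below) rather than by whole-node
+    counts, so several jobs may share a node.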
""" + + def __init__(self, config, policy, bfpolicy=None, jobs=None, resource_manager=None): + self.config = config + if policy is None: # policy is passed as policy=None, therefore default is not choosen + policy = "replay" + self.policy = PolicyType(policy) + self.bfpolicy = BackfillType(bfpolicy) + if resource_manager is None: + raise ValueError("Scheduler requires a ResourceManager instance") + self.resource_manager = resource_manager + self.debug = False + + def sort_jobs(self, queue, accounts=None): + """Sort jobs based on the selected scheduling policy.""" + if self.policy == PolicyType.FCFS: + return sorted(queue, key=lambda job: job.submit_time) + elif self.policy == PolicyType.PRIORITY: + return sorted(queue, key=lambda job: job.priority, reverse=True) + elif self.policy == PolicyType.SJF: + return sorted(queue, key=lambda job: job.time_limit) + elif self.policy == PolicyType.LJF: + return sorted(queue, key=lambda job: job.nodes_required, reverse=True) + elif self.policy == PolicyType.REPLAY: + return sorted(queue, key=lambda job: job.start_time) + else: + raise ValueError(f"Policy not implemented: {self.policy}") + + def schedule(self, queue, running, current_time, accounts=None, sorted=False): + # Sort the queue in place. + if not sorted: + queue[:] = self.sort_jobs(queue, accounts) + + # Iterate over a copy of the queue since we might remove items + for job in queue[:]: + if self.debug: + print( + f"[DEBUG] Scheduler: Considering job {job.id} " + f"(CPU: {job.cpu_cores_required}, GPU: {job.gpu_units_required})") + if self.policy == PolicyType.REPLAY: + if job.start_time > current_time: + continue # Replay: Job didn't start yet. Next! + else: + pass + else: + pass + + nodes_available = self.check_available_nodes(job) + + if nodes_available is not None: + self.place_job_and_manage_queues(job, queue, running, current_time, nodes_available) + else: # In case the job was not placed, see how we should continue: + if self.bfpolicy is not None: + backfill_job, node_id = self.backfill(queue, running, current_time) + if backfill_job and node_id is not None: + self.place_job_and_manage_queues(backfill_job, queue, running, current_time, node_id) + + # After backfill dedice continue processing the queue or wait, continuing may result in fairness issues. + if self.policy in [PolicyType.REPLAY]: + # print(f"Nodes available {nodes_available} - " + # f"Req:{len(job.requested_nodes)} N-avail:{len(self.resource_manager.available_nodes)}") + continue # Regardless if the job at the front of the queue doenst fit, try placing all of them. + elif self.policy in [PolicyType.FCFS, PolicyType.PRIORITY, + PolicyType.LJF, PolicyType.SJF]: + break # The job at the front of the queue doesnt fit stop processing the queue. + else: + raise NotImplementedError( + "Depending on the Policy this choice should be explicit. Add the implementation above!") + + def prepare_system_state(self, jobs_to_submit: List, running, timestep_start): + # def schedule(self, queue, running, current_time, accounts=None, sorted=False, debug=False): + """ + In the case of replay and fast forward, previously placed jobs should be present. + + """ + if self.policy == PolicyType.REPLAY: + total_jobs = len(jobs_to_submit) + print(f"All jobs: {total_jobs}") + + # Keep only jobs have an end time in the future future. 
+ jobs_to_submit[:] = [job for job in jobs_to_submit if job['end_time'] >= timestep_start] + print(f"Num jobs in the past: {total_jobs - len(jobs_to_submit)}") + + # Identify jobs that started in the past and Split them from the jobs that will start in the future: + jobs_to_start_now = [job for job in jobs_to_submit if job['start_time'] < timestep_start] + print(f"Num jobs that started in the past: {len(jobs_to_start_now)}") + + jobs_to_submit[:] = [job for job in jobs_to_submit if job['start_time'] >= timestep_start] + print(f"Num jobs to be schedule in the simulation: {len(jobs_to_submit)}") + + # Now schedule them with their orignal start time. + # This has to be done one by one! + for job in jobs_to_start_now: + self.schedule([job], running, job['start_time'], sorted=True) + # self.schedule(jobs_to_start_now, running, 0, False) + return jobs_to_submit + else: + return jobs_to_submit + + def place_job_and_manage_queues(self, job, queue, running, current_time, node_id): + self.resource_manager.assign_nodes_to_job(job, current_time, node_id) + running.append(job) + queue.remove(job) + if self.debug: + scheduled_nodes = summarize_ranges(job.scheduled_nodes) + print(f"t={current_time}: Scheduled job {job.id} with wall time {job.wall_time} on nodes {scheduled_nodes}") + + def check_available_nodes(self, job): + """Checks if there are available resources (CPU cores, GPU units) for the job on any node.""" + # Iterate through all nodes managed by the ResourceManager + for node in self.resource_manager.nodes: + if self.debug: + print( + f"[DEBUG] Checking node {node['id']}: " + f"Available CPU: {node['available_cpu_cores']}, " + f"Available GPU: {node['available_gpu_units']}. " + f"Job needs CPU: {job.cpu_cores_required}, GPU: {job.gpu_units_required}") + # Skip if the node is down + if node['is_down']: + continue + + # Check if the node has enough available CPU cores and GPU units + if (node['available_cpu_cores'] >= job.cpu_cores_required and + node['available_gpu_units'] >= job.gpu_units_required): + # If a suitable node is found, return its ID + return node['id'] + # If no suitable node is found, return None + return None + + def backfill(self, queue: List, running: List, current_time): + # Try to find a backfill candidate from the entire queue. + while queue: + backfill_job, node_id = self.find_backfill_job(queue, running, current_time) + if backfill_job is not None and node_id is not None: + # Instead of placing here, return the job and node_id to the caller + return backfill_job, node_id + else: + break + return None, None + + def find_backfill_job(self, queue, running, current_time): + """Finds a backfill job based on available nodes and estimated completion times. + + Loosely based on pseudocode from Leonenkov and Zhumatiy, 'Introducing new backfill-based + scheduler for slurm resource manager.' Procedia computer science 66 (2015): 661-669. + """ + if not queue: + return None, None + + # Identify when the nex job in the queue could run as a time limit: + # first_job = queue[0] # Unused + # For multitenancy, we need to check if the first job can fit on any node + # based on its core/GPU requirements, not just nodes_required. + # This is a simplification; a more complex backfill might consider + # if the job can fit by combining resources from multiple nodes. + # For now, we assume it needs to fit on a single node. 
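+        # Illustrative example of the per-node fit check (values made up):
+        #   node = {'available_cpu_cores': 16, 'available_gpu_units': 2, 'is_down': False}
+        #   a job with cpu_cores_required=8,  gpu_units_required=1 fits on this node,
+        #   while a job with cpu_cores_required=32, gpu_units_required=1 does not.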
+ + # We need to know the total available resources if all running jobs finish by shadow_time_end + # This is complex with multitenancy, so for now, we'll simplify the backfill logic + # to just check if a job can fit on *any* node, not necessarily the one + # that will be freed up by the first job in line. + + # The original logic for shadow_time_end and shadow_nodes_avail is based on whole nodes. + # With multitenancy, this needs a more sophisticated resource projection. + # For now, we will make `time_limit` effectively infinite for backfill candidates + # if the job can fit on *any* node, and rely on `check_available_nodes`. + + # Revert to a simpler time_limit for now, or remove it if not applicable + # For now, let's assume time_limit is not strictly tied to node availability + # in the same way as before, and focus on resource availability. + time_limit = float('inf') # Effectively no time limit for backfill candidates + + # We now have the time_limit after which no backfilled job should end + # as the next job in line has the necessary resrouces after this time limit. + + # Find and return the first job that fits + if self.bfpolicy == BackfillType.NONE: + pass + elif self.bfpolicy == BackfillType.EASY: + queue[:] = sorted(queue, key=lambda job: job.submit_time) + return self.return_first_fit(queue, time_limit) + elif self.bfpolicy == BackfillType.FIRSTFIT: + pass # Stay with the prioritization! + return self.return_first_fit(queue, time_limit) + elif self.bfpolicy in [BackfillType.BESTFIT, + BackfillType.GREEDY, + BackfillType.CONSERVATIVE, + ]: + raise NotImplementedError(f"{self.bfpolicy} not implemented! Please implement!") + else: + raise NotImplementedError(f"{self.bfpolicy} not implemented.") + return None, None + + def return_first_fit(self, queue, time_limit): + for job in queue: + # Check if the job can fit on any node based on its resource requirements + node_id = self.check_available_nodes(job) + if node_id is not None: + # If a suitable node is found, return the job and the node_id + return job, node_id + return None, None diff --git a/raps/schedulers/rl.py b/raps/schedulers/rl.py new file mode 100644 index 0000000000000000000000000000000000000000..2272e9d7eb7cb6beca0b12142c1c9a4c71a8f9d1 --- /dev/null +++ b/raps/schedulers/rl.py @@ -0,0 +1,35 @@ +from raps.schedulers.default import Scheduler as DefaultScheduler + + +class Scheduler(DefaultScheduler): + """ + Scheduler driven by RL agent actions. + RAPSEnv.step(action) sets env.pending_action, + then RLScheduler.schedule() reads it and acts. 
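+
+    Illustrative example (job names made up): with queue == [jobA, jobB, jobC]
+    and pending_action == 1, schedule() tries to place jobB; if jobB needs more
+    nodes than are currently free, the action is skipped. Either way,
+    pending_action is reset to None afterwards.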
+ """ + + def __init__(self, config, policy, resource_manager, env=None, *args, **kwargs): + super().__init__(config=config, policy=policy, resource_manager=resource_manager, *args, **kwargs) + self.env = env + self.pending_action = None + + def schedule(self, queue, running, current_time, **kwargs): + if not queue or self.pending_action is None: + return + + action = self.pending_action + if action >= len(queue): + return + + job = queue[action] + + # Check feasibility + if job.nodes_required <= len(self.resource_manager.available_nodes): + self.place_job_and_manage_queues(job, queue, running, current_time) + else: + # Invalid action → skip or log + if self.config.args.get("debug", False): + print(f"[t={current_time}] RL chose invalid job {job.id} (needs {job.nodes_required})") + + # Reset action after use + self.pending_action = None diff --git a/raps/schedulers/scheduleflow.py b/raps/schedulers/scheduleflow.py new file mode 100644 index 0000000000000000000000000000000000000000..2af694d4a8c3c50fefab473ebf7498d00cfd6105 --- /dev/null +++ b/raps/schedulers/scheduleflow.py @@ -0,0 +1,220 @@ +from third_party.ScheduleFlow import ScheduleFlow +from third_party.ScheduleFlow import _intScheduleFlow +from third_party.ScheduleFlow._intScheduleFlow import EventType + + +class Scheduler: + """ + Adapter for integrating ScheduleFlow into RAPS. + + This scheduler implements the same interface as the default RAPS scheduler. + It converts RAPS jobs into ScheduleFlow’s format, calls ScheduleFlow’s scheduling + routines, then updates the RAPS job objects accordingly. + """ + + def __init__(self, config, policy, bfpolicy, resource_manager, jobs): + self.sorted_priorities = sorted([x.priority for x in jobs]) + num_prios = len(self.sorted_priorities) + # self.sf_queue = [] + self.queue = [] # track submitted jobs + self.config = config + self.policy = policy + self.bfpolicy = bfpolicy + self.resource_manager = resource_manager + self.sf_scheduler = ScheduleFlow.Scheduler( + ScheduleFlow.System(config['TOTAL_NODES']), + priorityLevels=num_prios, + ) + self._sf_runtime = _intScheduleFlow.Runtime([]) + self._sf_runtime.scheduler = self.sf_scheduler + # self.sf_time = -1 + self.sf_submitted_list = [] # list of sf_apps + # self.sf_start_list = [] # list as returned from sf_scheduler.submit_job + # self.sf_end_list = [] # list as returned from sf_scheduler.start_job + # self.sf_action_list = [] # list as returned from sf_scheduler.stop_job + + def gif(self): + # logs = self._sf_runtime.get_stats() # Unused + # vis_hanlder = _intScheduleFlow.VizualizationEngine(self.sf_scheduler. + self._sf_runtime._Runtime__generate_gif() + + def sort_jobs(self, queue, accounts=None): + """ + Optionally, pre-sort jobs. + + For now, we can sort by submit_time (FCFS) as a default. + """ + return sorted(queue, key=lambda job: job.submit_time) + + def start_job_event(): + pass + + def end_job_event(): + pass + + def schedule(self, queue, running, current_time, accounts=None, sorted=False, debug=False): + + # self._sf_runtim + pass + # SECOND TRY + new_queue_items = list(filter(lambda x: x not in self.queue, queue)) + if new_queue_items: + self.queue += new_queue_items + # # Convert RAPS jobs to ScheduleFlow format + new_sf_jobs = [self._convert_to_sf(job) for job in new_queue_items] + self.sf_submitted_list += new_sf_jobs # This one only holds sf_jobs no timestamps + # Submit each job to the ScheduleFlow scheduler # This trigger schedule! 
+ if new_sf_jobs: + ret = self.sf_scheduler.submit_job(current_time, new_sf_jobs) + self._sf_runtime._Runtime__handle_scheduler_actions(ret) + self._sf_runtime._Runtime__trigger_schedule_event() + + if not self._sf_runtime._Runtime__events.empty(): + top = self._sf_runtime._Runtime__events.top() + if top[0] == current_time: + start_jobs = [] + end_jobs = [] + for event in self._sf_runtime._Runtime__events.pop_list(): + if event[1] == EventType.Submit: + raise ValueError(f"Didnt we already Submit above? {event}") + if event[1] == EventType.JobStart: + start_jobs.append(event[2]) + if event[1] == EventType.JobEnd: + end_jobs.append(event[2]) + if len(end_jobs) > 0: + self._sf_runtime._Runtime__job_end_event(end_jobs) + # End of jobs is handled by RAPS via prepare_timestep + pass + if len(start_jobs) > 0: + self._sf_runtime._Runtime__job_start_event(start_jobs) + for sf_app in start_jobs: + job = _match_sf_app_and_job(sf_app, queue, start_jobs) + queue.remove(job) + self.resource_manager.assign_nodes_to_job(job, current_time, self.policy) + running.append(job) + + # Keep track of: All jobs have been submitted empty the queue! + + # remove_list = [] + # job_list = [] + # for x in self.sf_start_list: + # sf_job_start_time,sf_app = x + # if sf_job_start_time <= current_time: + # job_list.append(sf_app) + # remove_list.append(x) + # job = _match_sf_app_and_job(sf_app,queue,self.sf_submitted_list) + # if current_time != sf_job_start_time: + # print("current_time != sf_job_start_time") + # print(f"{current_time} != {sf_job_start_time}") + # queue.remove(job) + # self.sf_submitted_list.remove(sf_app) + + # self.resource_manager.assign_nodes_to_job(job, current_time) + # running.append(job) + # if job_list: + # self.sf_end_list += self.sf_scheduler.start_job(current_time,job_list) + # for x in remove_list: + # self.sf_start_list.remove(x) + + # First TRY + # if self.sf_end_list: + # remove_list = [] + # job_list = [] + # for x in self.sf_end_list: + # if x[0] <= current_time: + # job_list.append(x[1]) + # remove_list.append(x) + # if job_list: + # self.sf_action_list += self.sf_scheduler.stop_job(current_time,job_list) + # for x in remove_list: + # self.sf_end_list.remove(x) + + # submit_jobs triggered the schedule calculation, sf_jobs returned the placed jobs. + # We need to flect this on the raps side. 
+ + # March the sf_scheduler forward based on the jobs + # end_jobs = self.sf_scheduler.start_job(current_time,sf_schedule[1]) + # self.sf_scheduler.end_job(current_time,end_jobs) + + # Add to running + + # Process the actions (each action is assumed to be (start_time, job_info)) + # for act in actions: + # start_time, sf_job = act + # # Find the corresponding RAPS job using its ID + # job = self._find_job(queue, sf_job['job_id']) + # if job: + # job.scheduled_nodes = sf_job.get('assigned_nodes', []) + # job.start_time = start_time + # job.end_time = start_time + job.wall_time + # job.state = JobState.RUNNING + # running.append(job) + # queue.remove(job) + # if debug: + # print(f"t={current_time}: Scheduled job {job.id} on nodes {summarize_ranges(job.scheduled_nodes)}") + + def _find_sf_in_queue(self, queue, sf_app): + # Remember we added four digits and an underscore in _convert_to_sf: + match = [x for x in queue if x.id == sf_app.name] + if len(match != 1): + raise ValueError(sf_app) + return match[0] + + def _convert_to_sf(self, job): + # Create an ScheduleFlow.Application from the job information: + sf_prio = self.sorted_priorities.index(job.priority) + # Use job_dict to create a dictionary from the RAPS job. + nodes = job.nodes_required + submission_time = job.submit_time + if submission_time < 0: + submission_time = 0 + walltime = job.wall_time + requested_walltimes = [job.wall_time] + priority = sf_prio + resubmit_factor = -1 + name = job.id # We use the ID as name to be able to match when unpacking! + return ScheduleFlow.Application(nodes, + submission_time, + walltime, + requested_walltimes, + priority, + resubmit_factor, + name) + + def _find_job(self, queue, job_id): + """ + Find the RAPS job in the queue that matches the given job_id. + """ + for job in queue: + if job.job_id == job_id: + return job + return None + + def find_backfill_job(self, queue, num_free_nodes, current_time): + """ + Optionally, implement backfill logic by delegating to ScheduleFlow's + mechanisms or by applying custom logic. + """ + # This is left as an exercise. You might use ScheduleFlow’s API to determine if a job can backfill. 
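+        # A minimal first-fit sketch on the RAPS side, shown only as a starting point
+        # (it ignores ScheduleFlow's own reservation/backfill bookkeeping):
+        #
+        #     for job in queue:
+        #         if job.nodes_required <= num_free_nodes:
+        #             return job
+        #     return None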
+ return None + + +def _match_sf_app_and_job(sf_app, queue, sf_queue): + match = [x for x in sf_queue if x.name == sf_app.name] + if len(match) != 1: + print("Multiple Matches") + raise ValueError(sf_app) + else: + match = match[0] + job = [x for x in queue if x.id == match.name] + if len(job) != 1: + print("Multiple submitted Jobs ") + raise ValueError(job) + else: + job = job[0] + return job + + +if __name__ == '__main__': + import unittest + unittest.main() diff --git a/raps/sim_config.py b/raps/sim_config.py new file mode 100644 index 0000000000000000000000000000000000000000..32cc043408670ab72cc5404ce44cdbc58b52451d --- /dev/null +++ b/raps/sim_config.py @@ -0,0 +1,514 @@ +import argparse +import abc +from pathlib import Path +import pandas as pd +from functools import cached_property +from datetime import timedelta +from typing import Literal, Annotated as A +from annotated_types import Len +import importlib +from raps.schedulers.default import PolicyType, BackfillType +from raps.utils import ( + parse_time_unit, convert_to_time_unit, infer_time_unit, ResolvedPath, create_casename, + RAPSBaseModel, AutoAwareDatetime, SmartTimedelta, yaml_dump, +) +from raps.system_config import ( + SystemConfig, get_partition_configs, get_system_config, list_systems, resolve_system_reference, +) +from pydantic import model_validator, Field, BeforeValidator + +Distribution = Literal['uniform', 'weibull', 'normal'] + + +class SimConfig(RAPSBaseModel, abc.ABC): + cooling: bool = False + """ Include the FMU cooling model """ + simulate_network: bool = False + """ Include network model """ + weather: bool = False + """ + Include weather information in the cooling model. + Defaults to True if replay, False otherwise. + """ + + # Simulation runtime options + start: AutoAwareDatetime | None = None + """ Start of simulation """ + # Exclude end from serialization as it is redundant with time + end: A[AutoAwareDatetime | None, Field(exclude=True)] = None + """ End of simulation. Pass either `time` or `end`, not both. """ + time: SmartTimedelta = timedelta(hours=1) + """ + Length of time to simulate (default seconds). + Can pass a string like 123, 27m, 3h, 7d + Pass either `time` or `end`, not both. + """ + fastforward: SmartTimedelta = timedelta(seconds=0) + """ + "Fast-forward" the simulation by time amount before starting. This is just a convenience + shortcut for setting --start without having to recall the exact start date of the dataset. + Can pass a string like 15s, 1m, 1h + """ + time_delta: SmartTimedelta = timedelta(seconds=1) + """ + Step size for the power simulation (default seconds). + Can pass a string like 15s, 1m, 1h, 1ms + """ + time_unit: A[timedelta, BeforeValidator(parse_time_unit)] = timedelta(seconds=1) + """ + The base unit of the simulation, determining how often it will tick the job scheduler. 
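+
+    For example (illustrative values): a time_unit of 1 minute with a simulated
+    time of 3 hours yields time_int == 180 scheduler ticks.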
+ """ + + @cached_property + def time_int(self) -> int: + """ Return time as an int of time_unit """ + return int(self.time / self.time_unit) + + @cached_property + def time_delta_int(self) -> int: + """ Return time_delta as an int of time_unit """ + return int(self.time_delta / self.time_unit) + + @cached_property + def downscale(self) -> int: + return int(timedelta(seconds=1) / self.time_unit) + + numjobs: int = 100 + """ Number of jobs to schedule """ + + uncertainties: bool = False + """ Use float-with-uncertainties (much slower) """ + + seed: int | None = None + """ Set RNG seed for deterministic simulation """ + + output: ResolvedPath | Literal['none'] | None = None + """ + Where to output power, cooling, and loss models for later analysis. + If omitted it will output to raps-output- by default. + Set to "none" to disable file output entirely. + """ + + _random_output: Path | None = None + + def get_output(self) -> Path | None: + if self.output is None: # by default, output to a random directory + if not self._random_output: + self._random_output = Path(create_casename("raps-output-")).resolve() + return self._random_output + elif self.output == "none": # allow explicitly disabling output with "none" + return None + else: + return self.output # return user defined output path + + debug: bool = False + """ Enable debug mode and disable rich layout """ + noui: bool = False + """ Run without UI """ + verbose: bool = False + """ Enable verbose output """ + layout: Literal["layout1", "layout2"] = "layout1" + """ UI layout """ + plot: list[Literal["power", "loss", "pue", "temp", "util", "net"]] | None = None + """ Plots to generate """ + + imtype: Literal["png", "svg", "jpg", "pdf", "eps"] = "png" + """ Plot image type """ + + replay: list[ResolvedPath] | None = None + """ Either: path/to/joblive path/to/jobprofile OR filename.npz """ + + dataloader: str | None = None + """ + Python module path to use as the dataloader when loading replay data. Only relevant if replay is + set. E.g. Defaults to "raps.dataloaders." but can be set to your own custom dataloader + as well. + """ + + encrypt: bool = False + """ Encrypt sensitive data in telemetry """ + + power_scope: Literal['node', 'chip'] = "chip" + """ node mode will use node power instead of CPU/GPU utilizations """ + + jid: str = "*" + """ Replay job id """ + + scale: int = 0 + """ Scale telemetry to a smaller target system, --scale 192 """ + + live: bool = False + """ Grab data from live system. """ + + # Workload arguments (TODO split into separate model) + workload: Literal['random', 'benchmark', 'peak', 'idle', 'synthetic', + 'multitenant', 'replay', 'randomAI', 'network_test', + 'inter_job_congestion', 'calculon', 'hpl'] = "random" + + """ Type of synthetic workload """ + multimodal: list[float] = [1.0] + """ + Percentage to draw from each distribution (list of floats). e.g. '0.2 0.8' percentages apply + in order to the list of the --distribution argument list. + """ + # Jobsize + jobsize_distribution: list[Distribution] | None = None + """ Distribution type """ + jobsize_normal_mean: float | None = None + """ Mean (mu) for Normal distribution """ + jobsize_normal_stddev: float | None = None + """ Standard deviation (sigma) for Normal distribution """ + jobsize_weibull_shape: float | None = None + """ Jobsize shape of weibull """ + jobsize_weibull_scale: float | None = None + """ Jobsize scale of weibull """ + jobsize_is_of_degree: int | None = None + """ Draw jobsizes from distribution of degree N (squared,cubed). 
""" + jobsize_is_power_of: int | None = None + """ Draw jobsizes from distribution of power of N (2->2^x,3->3^x). """ + + # Walltime + walltime_distribution: list[Distribution] | None = None + """ Distribution type """ + walltime_normal_mean: float | None = None + """ Walltime mean (mu) for Normal distribution """ + walltime_normal_stddev: float | None = None + """ Walltime standard deviation (sigma) for Normal distribution """ + walltime_weibull_shape: float | None = None + """ Walltime shape of weibull """ + walltime_weibull_scale: float | None = None + """ Walltime scale of weibull """ + # Utilizations (TODO should probably make a reusable "Distribution" submodel) + cpuutil_distribution: list[Distribution] = ['uniform'] + """ Distribution type """ + cpuutil_normal_mean: float | None = None + """ Walltime mean (mu) for Normal distribution """ + cpuutil_normal_stddev: float | None = None + """ Walltime standard deviation (sigma) for Normal distribution """ + cpuutil_weibull_shape: float | None = None + """ Walltime shape of weibull """ + cpuutil_weibull_scale: float | None = None + """ Walltime scale of weibull """ + gpuutil_distribution: list[Distribution] = ['uniform'] + """ Distribution type """ + gpuutil_normal_mean: float | None = None + """ Walltime mean (mu) for Normal distribution """ + gpuutil_normal_stddev: float | None = None + """ Walltime standard deviation (sigma) for Normal distribution """ + gpuutil_weibull_shape: float | None = None + """ Walltime shape of weibull """ + gpuutil_weibull_scale: float | None = None + """ Walltime scale of weibull """ + gantt_nodes: bool = False + """ Print Gannt with nodes required as line thickness (default false) """ + + # Synthetic workloads + scheduler: Literal[ + "default", + "experimental", + "fastsim", + "multitenant", + "scheduleflow", + ] = "default" + """ Scheduler name """ + policy: str | None = None + """ Schedule policy """ + backfill: str | None = None + """ Backfill policy """ + + # Arrival + arrival: Literal["prescribed", "poisson"] = "prescribed" + """ Modify arrival distribution (poisson) or use original submit times (prescribed) """ + job_arrival_time: int | None = None + """ Poisson arrival (seconds). Overrides system config scheduler.job_arrival_time """ + job_arrival_rate: float | None = None # TODO define default here + """ Modify Poisson rate (default 1) """ + + # Accounts + accounts: bool = False + accounts_json: ResolvedPath | None = None + """ Path to accounts JSON file from previous run """ + + # Downtime + downtime_first: SmartTimedelta | None = None + """ + First downtime. Can pass a string like 27m, 3h, 7d + """ + downtime_interval: SmartTimedelta | None = None + """ + Interval between downtimes. Can pass a string like 123, 27m, 3h, 7d + """ + downtime_length: SmartTimedelta | None = None + """ + Downtime length. 
Can pass a string like 123, 27m, 3h, 7d + """ + + @cached_property + def downtime_first_int(self) -> int | None: + return None if self.downtime_first is None else int(self.downtime_first / self.time_unit) + + @cached_property + def downtime_interval_int(self) -> int | None: + return None if self.downtime_interval is None else int(self.downtime_interval / self.time_unit) + + @cached_property + def downtime_length_int(self) -> int | None: + return None if self.downtime_length is None else int(self.downtime_length / self.time_unit) + + # Continous Job Generation + continuous_job_generation: bool = False + """ Activate continuous job generation """ + maxqueue: int = 50 + """ Specify the max queue length for continuous job generation """ + + filter: str | None = None + """job filter \"traffic > 1e8\" """ + + @model_validator(mode="before") + def _validate_before(cls, data): + # This is called with the raw input, before Pydantic parses it, so data is just a dict and + # contain any data types. + data = {**data} + + # infer time_unit + td_fields = [ + "time_delta", "time", "fastforward", + "downtime_first", "downtime_interval", "downtime_length", + ] + # infer time unit from other timedelta fields if it wasn't set explicitly + if data.get('time_unit') is None: + time_unit = min( + [infer_time_unit(data[f]) for f in td_fields if data.get(f)], + default=timedelta(seconds=1) + ) + else: + time_unit = parse_time_unit(data['time_unit']) + data['time_unit'] = time_unit + + return data + + @model_validator(mode="after") + def _validate_after(self): + # Allow setting either start/end or start/time for backwards compatibility and convenience + if self.start and self.fastforward: + raise ValueError("start and fastforward are mutually exclusive") + + if self.start: + self.start = pd.Timestamp(self.start).floor(self.time_unit).to_pydatetime() + if self.end: + self.end = pd.Timestamp(self.end).floor(self.time_unit).to_pydatetime() + + if self.end: + if not self.start: + raise ValueError("end requires start to be set") + if 'time' not in self.model_fields_set: # If time was not explicitly set + self.time = self.end - self.start + elif self.start: + self.end = self.start + self.time + + if self.start and self.start + self.time != self.end: + raise ValueError("time and end values don't match. You only need to specify one.") + + td_fields = [ + "time_delta", "time", "fastforward", + "downtime_first", "downtime_interval", "downtime_length", + ] + # Check time fields are divisible by time_unit. 
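+        # For example (illustrative): with a time_unit of 1 minute, a time_delta of
+        # 90 seconds is rejected, while 120 seconds converts cleanly to 2 units.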
+ for field in td_fields: + td = getattr(self, field) + if td is not None: + convert_to_time_unit(td, self.time_unit) # will throw if invalid + + if self.replay: + if "workload" not in self.model_fields_set: + self.workload = "replay" # default to replay if --replay is set + if not self.policy: + self.policy = "replay" + if self.workload != "replay" or self.policy != 'replay': + raise ValueError('workload & policy must be either omitted or "replay" when --replay is set') + if self.scheduler != 'default': + raise ValueError('scheduler must be omitted or set to default when --replay is set') + else: + if self.workload == "replay" or self.policy == "replay": + raise ValueError('--replay must be set when workload type is "replay"') + + if self.cooling: + self.layout = "layout2" + + if 'weather' not in self.model_fields_set: + self.weather = self.cooling and bool(self.replay) + + if self.jobsize_is_power_of is not None and self.jobsize_is_of_degree is not None: + raise ValueError("jobsize_is_power_of and jobsize_is_of_degree are mutually exclusive") + + if self.plot and self.output == "none": + raise ValueError("plot requires an output directory to be set") + + if self.live and not self.replay and self.time is None: + raise ValueError("--time must be set, specifing how long we want to predict") + + if self.policy or self.backfill: + try: + module = importlib.import_module(f"raps.schedulers.{self.scheduler}") + except ImportError as e: + raise ValueError(f"Scheduler '{self.scheduler}' could not be imported") from e + + if self.policy: + extended_policytypes = getattr(module, "ExtendedPolicyType", None) + + valid_policies = set(m.value for m in PolicyType) + if extended_policytypes is not None: + valid_policies |= {m.value for m in extended_policytypes} + + if self.policy not in valid_policies: + raise ValueError(f"policy {self.policy} not implemented by {self.scheduler}. " + f"Valid selections: {sorted(valid_policies)}") + + if self.backfill: + extended_backfilltypes = getattr(module, "ExtendedBackfillType", None) + + valid_backfilltypes = set(m.value for m in BackfillType) + if extended_backfilltypes is not None: + valid_backfilltypes |= {m.value for m in extended_backfilltypes} + + if self.backfill not in valid_backfilltypes: + raise ValueError(f"policy {self.backfill} not implemented by {self.scheduler}. " + f"Valid selections: {sorted(valid_backfilltypes)}") + + return self + + @property + @abc.abstractmethod + def system_name(self) -> str: + """ + Name of the system. + Note, this is different than system, as system can be a file, or there can be multiple systems + """ + pass + + @property + @abc.abstractmethod + def system_configs(self) -> list[SystemConfig]: + """ + Return the SystemConfigs for the selected systems. + Will be a single element array unless multiple `partitions` are selected. + """ + pass + + def get_system_config_by_name(self, name: str) -> SystemConfig: + for s in self.system_configs: + if s.system_name == name: + return s + raise ValueError(f"Partition {name} isn't in SimConfig") + + def get_legacy_args(self): + """ + Return as an argparse.Namespace object for backwards compatability + """ + return argparse.Namespace(**self.get_legacy_args_dict()) + + def get_legacy_args_dict(self): + """ + Return as a dict object. This is for backwards compatibility with the rest of RAPS code so + we can migrate to the new config gradually. The dict also has a "sim_config" key that + contains the SimConfig object itself. 
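+
+        Example (illustrative):
+            args = sim_config.get_legacy_args_dict()
+            args['system']      # system name string
+            args['time']        # simulation length as an int of time_unit
+            args['sim_config']  # this SimConfig instance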
+ """ + args_dict = self.model_dump(mode="json") + args_dict['system'] = self.system_name + # validate has been renamed to power_scope + args_dict['validate'] = self.power_scope == "node" + args_dict['downscale'] = self.downscale + + # Convert Path objects to str + if self.output: + args_dict['output'] = str(self.output) + if self.replay: + args_dict['replay'] = [str(p) for p in self.replay] + if self.accounts_json: + args_dict['accounts_json'] = str(self.accounts_json) + + args_dict["time"] = self.time_int + args_dict["time_delta"] = self.time_delta_int + args_dict["downtime_first"] = self.downtime_first_int + args_dict["downtime_interval"] = self.downtime_interval_int + args_dict["downtime_length"] = self.downtime_length_int + args_dict['start'] = self.start.astimezone().isoformat() if self.start else None + args_dict['end'] = self.end.astimezone().isoformat() if self.end else None + args_dict.pop("fastforward") # Remove fastforward from this to avoid confusion later + + args_dict['sim_config'] = self + return args_dict + + def dump_yaml(self, exclude_unset=True): + return yaml_dump(self.model_dump(mode="json", exclude_unset=exclude_unset)) + + +class SingleSimConfig(SimConfig, abc.ABC): + # Dynamic help string + system: A[SystemConfig | str, Field(description=f""" + Name of the system to simulate or a path to a yaml file containing the SystemConfig. + + You can also make modifications to the SystemConfig on the CLI using `--system.base`, e.g + `--system.base frontier --system.cooling.fmu-path path/to/my.fmu`. + + Built-in systems: {', '.join(list_systems())} + """)] = "frontier" + + @model_validator(mode="after") + def _validate_system(self, info): + self.system = resolve_system_reference(self.system, info) + try: + self._system_configs = [get_system_config(self.system)] + except FileNotFoundError as e: + raise ValueError(str(e)) + return self + + @property + def system_name(self) -> str: + return self.system_configs[0].system_name + + @property + def system_configs(self) -> list[SystemConfig]: + return self._system_configs + + +class MultiPartSimConfig(SimConfig): + partitions: A[list[SystemConfig | str], Len(min_length=1)] + """ + List of multiple systems/partitions to run. Can be names of preconfigured systems, or paths + to custom SystemConfig yaml files. + """ + + @model_validator(mode="after") + def _validate_partitions(self, info): + self.partitions = [resolve_system_reference(p, info) for p in self.partitions] + try: + self._multi_partition_system_config = get_partition_configs(self.partitions) + except FileNotFoundError as e: + raise ValueError(str(e)) + return self + + @property + def system_name(self) -> str: + return self._multi_partition_system_config.system_name + + @property + def system_configs(self) -> list[SystemConfig]: + return self._multi_partition_system_config.partitions + + +SIM_SHORTCUTS = { + "partitions": "x", + "cooling": "c", + "simulate-network": "net", + "time": "t", + "fastforward": "ff", + "debug": "d", + "numjobs": "n", + "verbose": "v", + "output": "o", + "uncertainties": "u", + "plot": "p", + "replay": "f", + "workload": "w", +} diff --git a/raps/stats.py b/raps/stats.py new file mode 100644 index 0000000000000000000000000000000000000000..6906e67b09995a546d4d63182d4cde5df1d08088 --- /dev/null +++ b/raps/stats.py @@ -0,0 +1,520 @@ +""" +This module provides functionality for generating statistics. 
+These are statistics on +the engine +the jobs + +Both could be part of the engine or jobs class, but as the are very verbose, +try to keep statistics consolidated in this file. +""" +import sys +from .utils import sum_values, min_value, max_value, convert_seconds_to_hhmmss + +from .engine import Engine + + +def get_engine_stats(engine: Engine): + """ + Return engine statistics + """ + timesteps = engine.current_timestep - engine.timestep_start + num_samples = len(engine.power_manager.history) if engine.power_manager else 0 + time_simulated = convert_seconds_to_hhmmss(timesteps / engine.downscale) + average_power_mw = sum_values(engine.power_manager.history) / num_samples / 1000 if num_samples else 0 + average_loss_mw = sum_values(engine.power_manager.loss_history) / num_samples / 1000 if num_samples else 0 + min_loss_mw = min_value(engine.power_manager.loss_history) / 1000 if num_samples else 0 + max_loss_mw = max_value(engine.power_manager.loss_history) / 1000 if num_samples else 0 + + loss_fraction = average_loss_mw / average_power_mw if average_power_mw else 0 + efficiency = 1 - loss_fraction if loss_fraction else 0 + total_energy_consumed = average_power_mw * timesteps / 3600 if timesteps else 0 # MW-hr + emissions = total_energy_consumed * 852.3 / 2204.6 / efficiency if efficiency else 0 + total_cost = total_energy_consumed * 1000 * engine.config.get('POWER_COST', 0) # Total cost in dollars + + stats = { + 'time_simulated': time_simulated, + 'num_samples': num_samples, + 'average_power': average_power_mw, + 'min_loss': min_loss_mw, + 'average_loss': average_loss_mw, + 'max_loss': max_loss_mw, + 'system_power_efficiency': efficiency * 100, + 'total_energy_consumed': total_energy_consumed, + 'carbon_emissions': emissions, + 'total_cost': total_cost, + } + + if engine.config['multitenant']: + # Multitenancy Stats + total_jobs_loaded = engine.total_initial_jobs # Assuming this is passed to __init__ + stats['total_jobs_loaded'] = total_jobs_loaded + if total_jobs_loaded > 0: + stats['jobs_completed_percentage'] = engine.jobs_completed / total_jobs_loaded * 100 + else: + stats['jobs_completed_percentage'] = 0 + + if engine.node_occupancy_history: + # Calculate average concurrent jobs per node (average density across all nodes and timesteps) + total_jobs_running_timesteps = 0 + max_concurrent_jobs_per_node = 0 + sum_jobs_per_active_node = 0 # New: Sum of (jobs / active_nodes) for each timestep + count_active_timesteps_for_avg_active = 0 # New: Count of timesteps with active nodes + + for occupancy_dict in engine.node_occupancy_history: + current_timestep_total_occupancy = sum(occupancy_dict.values()) + total_jobs_running_timesteps += current_timestep_total_occupancy + + # Find max concurrent jobs on any single node for this timestep + if occupancy_dict: + max_concurrent_jobs_per_node = max(max_concurrent_jobs_per_node, max(occupancy_dict.values())) + + # New: Calculate average jobs per *active* node for this timestep + active_nodes_in_timestep = [count for count in occupancy_dict.values() if count > 0] + if active_nodes_in_timestep: + sum_jobs_per_active_node += sum(active_nodes_in_timestep) / len(active_nodes_in_timestep) + count_active_timesteps_for_avg_active += 1 + + # Average jobs per *active* node (user's desired "1" type) + avg_jobs_per_active_node = (sum_jobs_per_active_node / count_active_timesteps_for_avg_active) \ + if count_active_timesteps_for_avg_active > 0 else 0 + + stats['avg_concurrent_jobs_per_active_node'] = avg_jobs_per_active_node + 
stats['max_concurrent_jobs_per_node'] = max_concurrent_jobs_per_node + else: + stats['avg_concurrent_jobs_per_node'] = None + stats['max_concurrent_jobs_per_node'] = None + + # network_stats = get_network_stats() + # stats.update(network_stats) + + return stats + + +def min_max_sum(value, min, max, sum): + if value < 0: + value = 0 + if value < min: + min = value + if value > max: + max = value + sum += value + return min, max, sum + + +def get_scheduler_stats(engine: Engine): + if len(engine.scheduler_queue_history) != 0: + average_queue = sum(engine.scheduler_queue_history) / len(engine.scheduler_queue_history) + else: + average_queue = 0 + if len(engine.scheduler_running_history) != 0: + average_running = sum(engine.scheduler_running_history) / len(engine.scheduler_running_history) + else: + average_running = 0 + + stats = { + 'average_queue': average_queue, + 'average_running': average_running, + } + return stats + + +def get_network_stats(engine: Engine): + stats = {} + + if engine.net_util_history: + mean_net_util = sum(engine.net_util_history) / len(engine.net_util_history) + else: + mean_net_util = 0.0 + + stats["avg_network_util"] = mean_net_util * 100 + + if engine.avg_slowdown_history: + avg_job_slow = sum(engine.avg_slowdown_history) / len(engine.avg_slowdown_history) + else: + avg_job_slow = 1.0 + stats["avg_per_job_slowdown"] = avg_job_slow + + if engine.max_slowdown_history: + max_job_slow = max(engine.max_slowdown_history) + else: + max_job_slow = 1.0 + stats["max_per_job_slowdown"] = max_job_slow + + if engine.net_congestion_history: + congestion_values = [c for t, c in engine.net_congestion_history] + stats['avg_inter_job_congestion'] = sum(congestion_values) / len(congestion_values) + stats['max_inter_job_congestion'] = max(congestion_values) + stats['min_inter_job_congestion'] = min(congestion_values) + else: + stats['avg_inter_job_congestion'] = 0.0 + stats['max_inter_job_congestion'] = 0.0 + stats['min_inter_job_congestion'] = 0.0 + + return stats + + +def get_job_stats(engine: Engine): + """ Return job statistics processed over the engine execution""" + # Information on Job-Mix + min_job_size, max_job_size, sum_job_size = sys.maxsize, -sys.maxsize - 1, 0 + min_runtime, max_runtime, sum_runtime = sys.maxsize, -sys.maxsize - 1, 0 + + min_energy, max_energy, sum_energy = sys.maxsize, -sys.maxsize - 1, 0 + min_edp, max_edp, sum_edp = sys.maxsize, -sys.maxsize - 1, 0 + min_edp2, max_edp2, sum_edp2 = sys.maxsize, -sys.maxsize - 1, 0 + + min_agg_node_hours, max_agg_node_hours, sum_agg_node_hours = sys.maxsize, -sys.maxsize - 1, 0 + # Completion statistics + throughput = engine.jobs_completed / (engine.current_timestep - engine.timestep_start) * 3600 if \ + (engine.current_timestep - engine.timestep_start != 0) else 0 # Jobs per hour + + min_wait_time, max_wait_time, sum_wait_time = sys.maxsize, -sys.maxsize - 1, 0 + min_turnaround_time, max_turnaround_time, sum_turnaround_time = sys.maxsize, -sys.maxsize - 1, 0 + min_psf_partial_num, max_psf_partial_num, sum_psf_partial_num = sys.maxsize, -sys.maxsize - 1, 0 + min_psf_partial_den, max_psf_partial_den, sum_psf_partial_den = sys.maxsize, -sys.maxsize - 1, 0 + min_awrt, max_awrt, sum_awrt = sys.maxsize, -sys.maxsize - 1, 0 + + min_cpu_u, max_cpu_u, sum_cpu_u = sys.maxsize, -sys.maxsize - 1, 0 + min_gpu_u, max_gpu_u, sum_gpu_u = sys.maxsize, -sys.maxsize - 1, 0 + min_ntx_u, max_ntx_u, sum_ntx_u = sys.maxsize, -sys.maxsize - 1, 0 + min_nrx_u, max_nrx_u, sum_nrx_u = sys.maxsize, -sys.maxsize - 1, 0 + + jobsSmall = 0 + jobsMedium 
= 0 + jobsLarge = 0 + jobsVLarge = 0 + jobsHuge = 0 + + # Information on Job-Mix + for job in engine.job_history_dict: + job_size = job['num_nodes'] + min_job_size, max_job_size, sum_job_size = \ + min_max_sum(job_size, min_job_size, max_job_size, sum_job_size) + + runtime = job['end_time'] - job['start_time'] + min_runtime, max_runtime, sum_runtime = \ + min_max_sum(runtime, min_runtime, max_runtime, sum_runtime) + + energy = job['energy'] + min_energy, max_energy, sum_energy = \ + min_max_sum(energy, min_energy, max_energy, sum_energy) + edp = energy * runtime + min_edp, max_edp, sum_edp = \ + min_max_sum(edp, min_edp, max_edp, sum_edp) + + edp2 = energy * runtime**2 + min_edp2, max_edp2, sum_edp2 = \ + min_max_sum(edp2, min_edp2, max_edp2, sum_edp2) + + agg_node_hours = runtime * job_size # Aggreagte node hours + min_agg_node_hours, max_agg_node_hours, sum_agg_node_hours = \ + min_max_sum(agg_node_hours, min_agg_node_hours, max_agg_node_hours, sum_agg_node_hours) + + # Completion statistics + wait_time = job["start_time"] - job["submit_time"] + min_wait_time, max_wait_time, sum_wait_time = \ + min_max_sum(wait_time, min_wait_time, max_wait_time, sum_wait_time) + + turnaround_time = job["end_time"] - job["submit_time"] + min_turnaround_time, max_turnaround_time, sum_turnaround_time = \ + min_max_sum(turnaround_time, min_turnaround_time, max_turnaround_time, sum_turnaround_time) + + # Area Weighted Average Response Time + awrt = agg_node_hours * turnaround_time # Area Weighted Response Time + min_awrt, max_awrt, sum_awrt = min_max_sum(awrt, min_awrt, max_awrt, sum_awrt) + + # Priority Weighted Specific Response Time + psf_partial_num = job_size * (turnaround_time**4 - wait_time**4) + psf_partial_den = job_size * (turnaround_time**3 - wait_time**3) + + min_psf_partial_num, max_psf_partial_num, sum_psf_partial_num = \ + min_max_sum(psf_partial_num, min_psf_partial_num, max_psf_partial_num, sum_psf_partial_num) + min_psf_partial_den, max_psf_partial_den, sum_psf_partial_den = \ + min_max_sum(psf_partial_den, min_psf_partial_den, max_psf_partial_den, sum_psf_partial_den) + + if job['avg_cpu_usage'] is not None: + min_cpu_u, max_cpu_u, sum_cpu_u = min_max_sum(job['avg_cpu_usage'], min_cpu_u, max_cpu_u, sum_cpu_u) + if job['avg_gpu_usage'] is not None: + min_gpu_u, max_gpu_u, sum_gpu_u = min_max_sum(job['avg_gpu_usage'], min_gpu_u, max_gpu_u, sum_gpu_u) + if job['avg_ntx_usage'] is not None: + min_ntx_u, max_ntx_u, sum_ntx_u = min_max_sum(job['avg_ntx_usage'], min_ntx_u, max_ntx_u, sum_ntx_u) + if job['avg_nrx_usage'] is not None: + min_nrx_u, max_nrx_u, sum_nrx_u = min_max_sum(job['avg_nrx_usage'], min_nrx_u, max_nrx_u, sum_nrx_u) + + if job['num_nodes'] <= 5: + jobsSmall += 1 + elif job['num_nodes'] <= 50: + jobsMedium += 1 + elif job['num_nodes'] <= 250: + jobsLarge += 1 + elif job['num_nodes'] <= 4500: + jobsVLarge += 1 + else: # job['nodes_required'] > 250: + jobsHuge += 1 + + if len(engine.job_history_dict) != 0: + avg_job_size = sum_job_size / len(engine.job_history_dict) + avg_runtime = sum_runtime / len(engine.job_history_dict) + avg_energy = sum_energy / len(engine.job_history_dict) + avg_edp = sum_edp / len(engine.job_history_dict) + avg_edp2 = sum_edp2 / len(engine.job_history_dict) + avg_agg_node_hours = sum_agg_node_hours / len(engine.job_history_dict) + avg_wait_time = sum_wait_time / len(engine.job_history_dict) + avg_turnaround_time = sum_turnaround_time / len(engine.job_history_dict) + + avg_cpu_u = sum_cpu_u / len(engine.job_history_dict) + avg_gpu_u = sum_gpu_u / 
len(engine.job_history_dict) + avg_ntx_u = sum_ntx_u / len(engine.job_history_dict) + avg_nrx_u = sum_nrx_u / len(engine.job_history_dict) + + if sum_agg_node_hours != 0: + avg_awrt = sum_awrt / sum_agg_node_hours + else: + avg_awrt = 0 + if sum_psf_partial_den != 0: + psf = (3 * sum_psf_partial_num) / (4 * sum_psf_partial_den) + else: + psf = 0 + else: + # Set these to -1 to indicate nothing ran + min_job_size, max_job_size, avg_job_size = -1, -1, -1 + min_runtime, max_runtime, avg_runtime = -1, -1, -1 + min_energy, max_energy, avg_energy = -1, -1, -1 + min_edp, max_edp, avg_edp = -1, -1, -1 + min_edp2, max_edp2, avg_edp2 = -1, -1, -1 + min_agg_node_hours, max_agg_node_hours, avg_agg_node_hours = -1, -1, -1 + min_wait_time, max_wait_time, avg_wait_time = -1, -1, -1 + min_turnaround_time, max_turnaround_time, avg_turnaround_time = -1, -1, -1 + min_awrt, max_awrt, avg_awrt = -1, -1, -1 + psf = -1 + + min_cpu_u, max_cpu_u, avg_cpu_u = -1, -1, -1 + min_gpu_u, max_gpu_u, avg_gpu_u = -1, -1, -1 + min_ntx_u, max_ntx_u, avg_ntx_u = -1, -1, -1 + min_nrx_u, max_nrx_u, avg_nrx_u = -1, -1, -1 + + if min_cpu_u == sys.maxsize and \ + max_cpu_u == -sys.maxsize - 1 and \ + sum_cpu_u == 0: + min_cpu_u, max_cpu_u, avg_cpu_u = -1, -1, -1 + + if min_gpu_u == sys.maxsize and \ + max_gpu_u == -sys.maxsize - 1 and \ + sum_gpu_u == 0: + min_gpu_u, max_gpu_u, avg_gpu_u = -1, -1, -1 + if min_ntx_u == sys.maxsize and \ + max_ntx_u == -sys.maxsize - 1 and \ + sum_ntx_u == 0: + min_ntx_u, max_ntx_u, avg_ntx_u = -1, -1, -1 + + if min_nrx_u == sys.maxsize and \ + max_nrx_u == -sys.maxsize - 1 and \ + sum_nrx_u == 0: + min_nrx_u, max_nrx_u, avg_nrx_u = -1, -1, -1 + + job_stats = { + 'jobs_total': engine.jobs_completed + len(engine.running) + len(engine.queue), + 'jobs_completed': engine.jobs_completed, + 'throughput': throughput, + 'jobs_still_running': [job.id for job in engine.running], + 'jobs_still_in_queue': [job.id for job in engine.queue], + 'jobs <= 5 nodes': jobsSmall, + 'jobs <= 50 nodes': jobsMedium, + 'jobs <= 250 nodes': jobsLarge, + 'jobs <= 4500 nodes': jobsVLarge, + 'jobs > 4500 nodes': jobsHuge, + # Information on job-mix executed + 'min_job_size': min_job_size, + 'max_job_size': max_job_size, + 'average_job_size': avg_job_size, + 'min_runtime': min_runtime, + 'max_runtime': max_runtime, + 'average_runtime': avg_runtime, + 'min_energy': min_energy, + 'max_energy': max_energy, + 'avg_energy': avg_energy, + 'min_edp': min_edp, + 'max_edp': max_edp, + 'avg_edp': avg_edp, + 'min_edp^2': min_edp2, + 'max_edp^2': max_edp2, + 'avg_edp^2': avg_edp2, + 'min_aggregate_node_hours': min_agg_node_hours, + 'max_aggregate_node_hours': max_agg_node_hours, + 'avg_aggregate_node_hours': avg_agg_node_hours, + # Utilization: + 'min_cpu_util': min_cpu_u, + 'max_cpu_util': max_cpu_u, + 'avg_cpu_util': avg_cpu_u, + 'min_gpu_util': min_gpu_u, + 'max_gpu_util': max_gpu_u, + 'avg_gpu_util': avg_gpu_u, + 'min_ntx_util': min_ntx_u, + 'max_ntx_util': max_ntx_u, + 'avg_ntx_util': avg_ntx_u, + 'min_nrx_util': min_nrx_u, + 'max_nrx_util': max_nrx_u, + 'avg_nrx_util': avg_nrx_u, + # Completion statistics + 'min_wait_time': min_wait_time, + 'max_wait_time': max_wait_time, + 'average_wait_time': avg_wait_time, + 'min_turnaround_time': min_turnaround_time, + 'max_turnaround_time': max_turnaround_time, + 'average_turnaround_time': avg_turnaround_time, + 'min_area_weighted_response_time': min_awrt, + 'max_area_weighted_response_time': max_awrt, + 'area_weighted_avg_response_time': avg_awrt, + 'priority_weighted_specific_response_time': psf 
+ } + return job_stats + + +def get_stats(engine: Engine): + return { + 'engine': get_engine_stats(engine), + 'job': get_job_stats(engine), + 'scheduler': get_scheduler_stats(engine), + 'network': get_network_stats(engine) if engine.simulate_network else {}, + } + + +def print_formatted_report(engine_stats=None, + job_stats=None, + scheduler_stats=None, + network_stats=None + ): + def print_report_section(name, data, templates): + if data: + rep_str = f"--- {name} ---" + print(rep_str) + for key, value in data.items(): + pretty_key = key.replace('_', ' ').title() + if key in templates: + pretty_value = templates[key].format(value) + elif isinstance(value, float): + pretty_value = f"{value:.2f}" + elif value is None: + pretty_value = "N/A" + else: + pretty_value = str(value) + print(f"{pretty_key}: {pretty_value}") + print(f"{'-' * len(rep_str)}\n") + print() + + # Print a formatted report + print() + print_report_section("Simulation Report", engine_stats, { + 'average_power': '{:.4f} MW', + 'min_loss': '{:.4f} MW', + 'average_loss': '{:.2f} MW', + 'max_loss': '{:.2f} MW', + 'system_power_efficiency': '{:.2f}%', + 'total_energy_consumed': '{:.2f} MW-hr', + 'carbon_emissions': '{:.4f} metric tons CO2', + 'total_cost': '${:.2f}', + }) + print_report_section("Job Stat Report", job_stats, { + 'throughput': '{:.2f} jobs/hour', + 'jobs_completed_percentage': "{:.2f}%", + }) + print_report_section("Scheduler Report", scheduler_stats, { + }) + print_report_section("Network Report", network_stats, { + "avg_network_util": "{:.2f}%", + "avg_per_job_slowdown": "{:.2f}x", + "max_per_job_slowdown": "{:.2f}x", + "avg_inter_job_congestion": "{:.2f}", + "max_inter_job_congestion": "{:.2f}", + "min_inter_job_congestion": "{:.2f}", + }) + + +def get_gauge_limits(engine: Engine): + """For setting max values in dashboard gauges""" + peak_flops = engine.flops_manager.get_rpeak() + peak_power = engine.power_manager.get_peak_power() + gflops_per_watt_max = peak_flops / 1E9 / peak_power + + return { + 'peak_flops': peak_flops, + 'peak_power': peak_power, + 'g_flops_w_peak': gflops_per_watt_max + } + + +class RunningStats: + """ + Calculate a subset of the stats in as "running totals" for each engine tick. This is much more + efficient than calling get_engine_stats() repeatedly. + """ + # TODO: maybe should combine this and get_engine_stats logic? 
+ @staticmethod + def _running_stats(engine: Engine): + # Infinite generator used for the RunningStats logic + def running_sum_values(values, last_value, last_index): + return last_value + sum_values(values[last_index:]) + + def running_min_value(values, last_value, last_index): + if last_index < len(values): + new_min = min_value(values[last_index:]) + rtrn = new_min if last_value is None else min(new_min, last_value) + else: + rtrn = last_value # No change + return rtrn + + def running_max_value(values, last_value, last_index): + if last_index < len(values): + new_max = max_value(values[last_index:]) + return new_max if last_value is None else max(new_max, last_value) + else: + return last_value # No change + + last_power_index = 0 + power_sum = 0 + last_loss_index = 0 + loss_sum = 0 + loss_min = None + loss_max = None + + while True: + timesteps = engine.current_timestep - engine.timestep_start + throughput = engine.jobs_completed / timesteps * 3600 if timesteps != 0 else 0 # Jobs per hour + num_samples = len(engine.power_manager.history) if engine.power_manager else 0 + + power_sum = running_sum_values(engine.power_manager.history, power_sum, last_power_index) + average_power_mw = power_sum / num_samples / 1000 if num_samples else 0 + last_power_index = len(engine.power_manager.history) + + loss_sum = running_sum_values(engine.power_manager.loss_history, loss_sum, last_loss_index) + average_loss_mw = loss_sum / num_samples / 1000 if num_samples else 0 + loss_min = running_min_value(engine.power_manager.loss_history, loss_min, last_loss_index) + min_loss_mw = loss_min / 1000 if num_samples else 0 + loss_max = running_max_value(engine.power_manager.loss_history, loss_max, last_loss_index) + max_loss_mw = loss_max / 1000 if num_samples else 0 + last_loss_index = len(engine.power_manager.loss_history) + + loss_fraction = average_loss_mw / average_power_mw if average_power_mw else 0 + efficiency = 1 - loss_fraction if loss_fraction else 0 + total_energy_consumed = average_power_mw * timesteps / 3600 if timesteps else 0 # MW-hr + carbon_emissions = total_energy_consumed * 852.3 / 2204.6 / efficiency if efficiency else 0 + total_cost = total_energy_consumed * 1000 * engine.config.get('POWER_COST', 0) # Total cost in dollars + + yield { + "throughput": throughput, + "num_samples": num_samples, + "average_power": average_power_mw, + "min_loss": min_loss_mw, + "average_loss": average_loss_mw, + "max_loss": max_loss_mw, + "system_power_efficiency": efficiency * 100, + "total_energy_consumed": total_energy_consumed, + "carbon_emissions": carbon_emissions, + "total_cost": total_cost, + } + + def __init__(self, engine: Engine): + self._gen = RunningStats._running_stats(engine) + + def get_stats(self) -> dict: + return next(self._gen) diff --git a/raps/system_config.py b/raps/system_config.py new file mode 100644 index 0000000000000000000000000000000000000000..5253e0b1e58d5697e7f3786fed3c3afa0ad0c5c7 --- /dev/null +++ b/raps/system_config.py @@ -0,0 +1,333 @@ +import glob +import fnmatch +import functools +from typing import Any, Literal +from pathlib import Path +from functools import cached_property +import yaml +from pydantic import ( + model_validator, field_validator, model_serializer, SerializationInfo, + SerializerFunctionWrapHandler, ValidationInfo, +) +from raps.utils import ( + RAPSBaseModel, deep_merge, deep_subtract_dicts, is_yaml_file, ResolvedPath, validate_resolved_path, +) +from raps.raps_config import raps_config + +# Define Pydantic models for the config to handle parsing and 
validation + + +class SystemSystemConfig(RAPSBaseModel): + num_cdus: int + racks_per_cdu: int + nodes_per_rack: int + chassis_per_rack: int + nodes_per_blade: int + switches_per_chassis: int + nics_per_node: int + rectifiers_per_chassis: int + nodes_per_rectifier: int + missing_racks: list[int] = [] + down_nodes: list[int] = [] + cpus_per_node: int + gpus_per_node: int + cpu_peak_flops: float + gpu_peak_flops: float + cpu_fp_ratio: float + gpu_fp_ratio: float + threads_per_core: int | None = None + cores_per_cpu: int | None = None + + @model_validator(mode='after') + def _update_down_nodes(self): + for rack in self.missing_racks: + start_node_id = rack * self.nodes_per_rack + end_node_id = start_node_id + self.nodes_per_rack + self.down_nodes.extend(range(start_node_id, end_node_id)) + self.down_nodes = sorted(set(self.down_nodes)) + return self + + @cached_property + def num_racks(self) -> int: + return self.num_cdus * self.racks_per_cdu - len(self.missing_racks) + + @cached_property + def sc_shape(self) -> list[int]: + return [self.num_cdus, self.racks_per_cdu, self.nodes_per_rack] + + @cached_property + def total_nodes(self) -> int: + return self.num_cdus * self.racks_per_cdu * self.nodes_per_rack + + @cached_property + def blades_per_chassis(self) -> int: + return int(self.nodes_per_rack / self.chassis_per_rack / self.nodes_per_blade) + + @cached_property + def power_df_header(self) -> list[str]: + power_df_header = ["CDU"] + for i in range(1, self.racks_per_cdu + 1): + power_df_header.append(f"Rack {i}") + power_df_header.append("Sum") + for i in range(1, self.racks_per_cdu + 1): + power_df_header.append(f"Loss {i}") + power_df_header.append("Loss") + return power_df_header + + @cached_property + def available_nodes(self) -> int: + return self.total_nodes - len(self.down_nodes) + + +class SystemPowerConfig(RAPSBaseModel): + power_gpu_idle: float + power_gpu_max: float + power_cpu_idle: float + power_cpu_max: float + power_mem: float + power_nic: float | None = None + power_nic_idle: float | None = None + power_nic_max: float | None = None + power_nvme: float + power_switch: float + power_cdu: float + power_update_freq: int + rectifier_peak_threshold: float + sivoc_loss_constant: float + sivoc_efficiency: float + rectifier_loss_constant: float + rectifier_efficiency: float + power_cost: float + + +class SystemUqConfig(RAPSBaseModel): + power_gpu_uncertainty: float + power_cpu_uncertainty: float + power_mem_uncertainty: float + power_nic_uncertainty: float + power_nvme_uncertainty: float + power_cdus_uncertainty: float + power_node_uncertainty: float + power_switch_uncertainty: float + rectifier_power_uncertainty: float + + +JobEndStates = Literal["COMPLETED", "FAILED", "CANCELLED", "TIMEOUT", "NODE_FAIL"] + + +class SystemSchedulerConfig(RAPSBaseModel): + job_arrival_time: int + mtbf: int + trace_quanta: int + min_wall_time: int + max_wall_time: int + ui_update_freq: int # TODO should be moved to raps_config + max_nodes_per_job: int + job_end_probs: dict[JobEndStates, float] + multitenant: bool = False + + +class SystemCoolingConfig(RAPSBaseModel): + cooling_efficiency: float + wet_bulb_temp: float + zip_code: str | None = None + country_code: str | None = None + fmu_path: ResolvedPath + fmu_column_mapping: dict[str, str] + w_htwps_key: str + w_ctwps_key: str + w_cts_key: str + temperature_keys: list[str] + + +class SystemNetworkConfig(RAPSBaseModel): + topology: Literal["capacity", "fat-tree", "dragonfly", "torus3d"] + network_max_bw: float + latency: float | None = None + + 
fattree_k: int | None = None + + dragonfly_d: int | None = None + dragonfly_a: int | None = None + dragonfly_p: int | None = None + + torus_x: int | None = None + torus_y: int | None = None + torus_z: int | None = None + torus_wrap: bool | None = None + torus_link_bw: float | None = None + torus_routing: str | None = None + + hosts_per_router: int | None = None + latency_per_hop: float | None = None + node_coords_csv: str | None = None + + +class SystemConfig(RAPSBaseModel): + system_name: str + """ Name of the system, defaults to the yaml file name """ + + base: str | None = None + """ + Optional, name or path to another SystemConfig to "inherit" from. Lets you make small modifications + to an existing system without having to copy the whole config. + """ + + system: SystemSystemConfig + power: SystemPowerConfig + scheduler: SystemSchedulerConfig + uq: SystemUqConfig | None = None + cooling: SystemCoolingConfig | None = None + network: SystemNetworkConfig | None = None + + @model_validator(mode="before") + def _load_base(cls, data, info: ValidationInfo): + if isinstance(data, dict) and data.get("base"): + data['base'] = resolve_system_reference(data['base'], info) + base_model = get_system_config(data['base']) + base_data = base_model.model_dump(mode='json', exclude_unset=True) + data = deep_merge(base_data, data) + return data + + @model_serializer(mode='wrap') + def model_serializer(self, handler: SerializerFunctionWrapHandler, info: SerializationInfo): + # don't include the base system data in the output + if self.base and (info.exclude_defaults or info.exclude_unset): + base = get_system_config(self.base) + return deep_subtract_dicts(handler(self), handler(base)) + else: + return handler(self) + + def get_legacy(self) -> dict[str, Any]: + """ + Return the system config as a flattened, uppercased dict. This is for backwards + compatibility with the rest of RAPS code so we can migrate to the new config format + gradually. The dict also as a "system_config" key that contains the SystemConfig object + itself. 
+ """ + dump = self.model_dump(mode="json", exclude_none=True) + + renames = { # fields that need to be renamed to something other than just .upper() + "system_name": "system_name", + "w_htwps_key": "W_HTWPs_KEY", + "w_ctwps_key": "W_CTWPs_KEY", + "w_cts_key": "W_CTs_KEY", + "multitenant": "multitenant", + } + + config_dict: dict[str, Any] = {} + for k, v in dump.items(): # flatten + if isinstance(v, dict): + config_dict.update(v) + else: + config_dict[k] = v + config_dict["num_racks"] = self.system.num_racks + config_dict["sc_shape"] = self.system.sc_shape + config_dict["total_nodes"] = self.system.total_nodes + config_dict["blades_per_chassis"] = self.system.blades_per_chassis + config_dict["power_df_header"] = self.system.power_df_header + config_dict["available_nodes"] = self.system.available_nodes + + # rename keys + config_dict = {renames.get(k, k.upper()): v for k, v in config_dict.items()} + config_dict['system_config'] = self + return config_dict + + +class MultiPartitionSystemConfig(RAPSBaseModel): + system_name: str + partitions: list[SystemConfig] + + @field_validator("partitions") + def _validate_partitions(cls, partitions: list[SystemConfig]): + partition_names = [c.system_name for c in partitions] + if len(set(partition_names)) != len(partition_names): + raise ValueError(f"Duplicate system names: {','.join(partition_names)}") + return partitions + + @property + def partition_names(self): + return [c.system_name for c in self.partitions] + + +@functools.cache +def list_systems() -> list[str]: + """ Lists all available systems """ + return sorted([ + str(p.relative_to(raps_config.system_config_dir)).removesuffix(".yaml") + for p in raps_config.system_config_dir.rglob("*.yaml") + ]) + + +def get_system_config(system: str | SystemConfig) -> SystemConfig: + """ + Returns the system config as a Pydantic object. + system can either be a path to a custom .yaml file, or the name of one of the pre-configured + systems defined in RAPS_SYSTEM_CONFIG_DIR. + """ + if isinstance(system, SystemConfig): # Just pass system through if its already parsed + return system + elif is_yaml_file(system): + config_path = Path(system) + system_name = config_path.stem + else: + config_path = raps_config.system_config_dir / f"{system}.yaml" + system_name = system + + if not config_path.is_file(): + raise FileNotFoundError(f'"{system}" not found. Valid systems are: {list_systems()}') + config = { + "system_name": system_name, # You can override system_name in the yaml as well + **yaml.safe_load(config_path.read_text()), + } + # Pass context so paths in the SystemConfig can be resolved relative to the yaml file + return SystemConfig.model_validate(config, context={'base_path': config_path.parent}) + + +def get_partition_configs(partitions: list[str | SystemConfig]) -> MultiPartitionSystemConfig: + """ + Resolves multiple partition config files. Can pass globs, or directories to include all yaml + files under the directory. 
+    """
+    systems = list_systems()
+    multi_partition_systems = set(s.split("/")[0] for s in systems if "/" in s)
+    combined_system_name = []
+
+    parsed_configs: list[SystemConfig] = []
+    for pat in partitions:
+        if isinstance(pat, SystemConfig):
+            parsed_configs.append(pat)
+            combined_system_name.append(pat.system_name)
+            continue  # already parsed, nothing to match on disk
+        elif pat in multi_partition_systems:
+            matched_systems = fnmatch.filter(systems, f"{pat}/*")
+            combined_system_name.append(pat)
+        elif fnmatch.filter(systems, pat):
+            matched_systems = fnmatch.filter(systems, pat)
+            combined_system_name.extend(s.split("/")[0] for s in matched_systems)
+        elif Path(pat).is_dir():
+            matched_systems = sorted([str(s) for s in Path(pat).glob("*.yaml")])
+            combined_system_name.append(Path(pat).name)
+        else:
+            matched_systems = sorted(glob.glob(pat))
+            combined_system_name.extend(Path(s).stem for s in matched_systems)
+
+        if not matched_systems:
+            raise FileNotFoundError(f'No config files match "{pat}"')
+        parsed_configs.extend(get_system_config(s) for s in sorted(matched_systems))
+
+    if len(parsed_configs) == 1:
+        combined_system_name = parsed_configs[0].system_name
+    else:
+        combined_system_name = "+".join(dict.fromkeys(combined_system_name))  # dedup, keep order
+    return MultiPartitionSystemConfig(
+        system_name=combined_system_name,
+        partitions=parsed_configs,
+    )
+
+
+def resolve_system_reference(system: str | SystemConfig, info: ValidationInfo):
+    """ If system is a yaml path, resolve it as a path. Otherwise leave it as a string """
+    if isinstance(system, str) and is_yaml_file(system):
+        return str(validate_resolved_path(system, info))
+    else:
+        return system
diff --git a/raps/telemetry.py b/raps/telemetry.py
index 2616bc78d7198bb1b961012f64317c821e9b77a5..0da391a5762ecfa6a57adae892fb001b1918e4b9 100644
--- a/raps/telemetry.py
+++ b/raps/telemetry.py
@@ -6,117 +6,390 @@ parsing parquet files, and generating job state information.
 The module defines a `Telemetry` class for managing telemetry data and several
 helper functions for data encryption and conversion between node name and index formats.
""" +from typing import Literal +import random +from pathlib import Path +from datetime import datetime +from typing import Optional +from types import ModuleType +import importlib +import numpy as np +import pandas as pd +from pydantic import model_validator +# from rich.progress import track -import argparse +from raps.sim_config import SimConfig +from raps.system_config import get_system_config +from raps.job import Job, job_dict +from raps.utils import AutoAwareDatetime +import matplotlib.pyplot as plt +from raps.plotting import ( + plot_jobs_gantt, + plot_nodes_gantt, + plot_network_histogram +) +from raps.utils import ( + next_arrival_byconfargs, pydantic_add_args, SubParsers, ResolvedPath, WorkloadData, RAPSBaseModel, +) -if __name__ == "__main__": - parser = argparse.ArgumentParser(description='Telemetry data validator') - parser.add_argument('--jid', type=str, default='*', help='Replay job id') - parser.add_argument('-f', '--replay', nargs='+', type=str, - help='Either: path/to/joblive path/to/jobprofile' + \ - ' -or- filename.npz (overrides --workload option)') - parser.add_argument('-p', '--plot', action='store_true', help='Output plots') - parser.add_argument('--system', type=str, default='frontier', help='System config to use') - parser.add_argument('--reschedule', action='store_true', help='Reschedule the telemetry workload') - parser.add_argument('-v', '--verbose', action='store_true', help='Enable verbose output') - args = parser.parse_args() -import importlib -import numpy as np -from tqdm import tqdm +# TODO: should reuse this model in SimConfig +class TelemetryArgs(RAPSBaseModel): + jid: str = '*' + """ Replay job id """ + replay: list[ResolvedPath] | None = None + """ path/to/joblive path/to/jobprofile -or- filename.npz (overrides --workload option) """ + plot: list[Literal["jobs", "nodes"]] | None = None + is_results_file: bool = False + """ Output plots """ + gantt_nodes: bool = False + """ Print Gannt with nodes required as line thickness (default false) """ + time: str | None = None + """ Length of time to simulate, e.g., 123, 123s, 27m, 3h, 7d """ + system: str = 'frontier' + """ System config to use """ + arrival: Literal['prescribed', 'poisson'] = "prescribed" + """ Modify arrival distribution ({choices[1]}) or use the original submit times """ + verbose: bool = False + output: str | None = None + """ Store output in --output file. """ + live: bool = False + """ Grab data from live system. 
""" + + @model_validator(mode="after") + def _validate_after(self): + if not self.live and not self.replay: + raise ValueError("Either --live or --replay is required") + return self -from .config import ConfigManager -from .job import Job -from .account import Accounts -from .plotting import plot_submit_times, plot_nodes_histogram -from .utils import next_arrival + @property + def system_name(self): + return self.system + + +shortcuts = { + "replay": "f", + "plot": "p", + "time": "t", + "verbose": "v", + "output": "o", +} class Telemetry: """A class for handling telemetry data, including reading/parsing job data, and loading/saving snapshots.""" + dataloader: Optional[ModuleType] def __init__(self, **kwargs): self.kwargs = kwargs - self.system = kwargs.get('system') + self.system = kwargs['system'] self.config = kwargs.get('config') + + if kwargs.get("dataloader"): + module = kwargs['dataloader'] + else: + module = f"raps.dataloaders.{self.system.split('/')[0]}" + try: - self.dataloader = importlib.import_module(f".dataloaders.{self.system}", package=__package__) - except: - print("WARNING: Failed to load dataloader") + self.dataloader = importlib.import_module(module, package=__package__) + except ImportError as e: + print(f"WARNING: Failed to load dataloader: {e}") + self.dataloader = None - def save_snapshot(self, jobs: list, accounts: dict, filename: str): + def save_snapshot(self, *, dest: str, result: WorkloadData, args: SimConfig | TelemetryArgs): """Saves a snapshot of the jobs to a compressed file. """ - np.savez_compressed(filename, jobs=jobs, accounts=accounts) + np.savez_compressed(dest, + jobs=[vars(j) for j in result.jobs], + telemetry_start=result.telemetry_start, + telemetry_end=result.telemetry_end, + start_date=result.start_date, + args=args, + ) + + def load_snapshot(self, snapshot: str | Path) -> tuple[WorkloadData, SimConfig | TelemetryArgs]: + """Reads a snapshot from a compressed file - def load_snapshot(self, snapshot: str) -> (list, dict): - """Reads a snapshot from a compressed file and returns the jobs.""" - jobs, accounts_dict = np.load(snapshot, allow_pickle=True, mmap_mode='r') - return jobs['jobs'].tolist(), Accounts.initialize_accounts_from_dict(accounts_dict) + :param str snapshot: Filename + :returns: + - job list + - timestep_start + - timestep_end + - args, which were used to generate the loaded snapshot + """ + data = np.load(snapshot, allow_pickle=True, mmap_mode='r') + jobs = [Job(j) for j in data['jobs']] + telemetry_start = data['telemetry_start'].item() + telemetry_end = data['telemetry_end'].item() + start_date = data['start_date'].item() + args = data['args'].item() + + result = WorkloadData( + jobs=jobs, + telemetry_start=telemetry_start, telemetry_end=telemetry_end, + start_date=start_date, + ) + + return result, args + + def load_csv_results(self, file): + jobs = [] + time_start = 0 + time_end = 0 + for line in pd.read_csv(file, chunksize=1): + job_info = job_dict(nodes_required=line.get('num_nodes').item(), + name=line.get('name').item(), + account=line.get('account').item(), + current_state=line.get('current_state').item(), + end_state=line.get('end_state').item(), + scheduled_nodes=line.get('scheduled_nodes').item(), + id=line.get('id').item(), + priority=line.get('priority').item(), + partition=line.get('partition').item(), + cpu_cores_required=line.get('cpu_cores_required').item(), + gpu_units_required=line.get('gpu_units_required').item(), + allocated_cpu_cores=line.get('allocated_cpu_cores').item(), + 
allocated_gpu_units=line.get('allocated_gpu_units').item(),
+
+                               cpu_trace=line.get('cpu_trace'),
+                               gpu_trace=line.get('gpu_trace'),
+                               ntx_trace=line.get('ntx_trace'),
+                               nrx_trace=line.get('nrx_trace'),
+                               submit_time=line.get('submit_time').item(),
+                               time_limit=line.get('time_limit').item(),
+                               start_time=line.get('start_time').item(),
+                               end_time=line.get('end_time').item(),
+                               expected_run_time=line.get('expected_run_time').item(),
+                               current_run_time=line.get('current_run_time').item(),
+                               trace_time=line.get('trace_time'),
+                               # trace_start_time=line.get('trace_start_time').item(),
+                               trace_start_time=line.get('trace_start_time'),
+                               # trace_end_time=line.get('trace_end_time').item(),
+                               trace_end_time=line.get('trace_end_time'),
+                               trace_quanta=line.get('trace_quanta').item(),
+                               trace_missing_values=line.get('trace_missing_values'),
+                               downscale=line.get('downscale'),
+                               )
+            job = Job(job_info)
+            jobs.append(job)
+        # if hasattr(data, 'args'):
+        #     args_from_file = data["args"].item()  # This should be empty as csv contains no args.
+        # else:
+        #     args_from_file = None
+
+        return jobs, time_start, time_end

     def load_data(self, files):
         """Load telemetry data using custom data loaders."""
+        assert self.dataloader
         return self.dataloader.load_data(files, **self.kwargs)

-    def load_data_from_df(self, *args, **kwargs):
+    def load_live_data(self):
         """Load telemetry data using custom data loaders."""
-        return self.dataloader.load_data_from_df(*args, **kwargs)
+        assert self.dataloader
+        return self.dataloader.load_live_data(**self.kwargs)
+
+    def download_data(self, dest: Path, start: datetime | None, end: datetime | None):
+        """Load telemetry data using custom data loaders."""
+        assert self.dataloader
+        if not hasattr(self.dataloader, "download"):
+            raise ValueError("Dataloader does not support download")
+        return self.dataloader.download(dest, start, end)

     def node_index_to_name(self, index: int):
         """ Convert node index into a name"""
+        assert self.dataloader
         return self.dataloader.node_index_to_name(index, config=self.config)

     def cdu_index_to_name(self, index: int):
         """ Convert cdu index into a name"""
+        assert self.dataloader
         return self.dataloader.cdu_index_to_name(index, config=self.config)

     def cdu_pos(self, index: int) -> tuple[int, int]:
         """ Return (row, col) tuple for a cdu index """
+        assert self.dataloader
         return self.dataloader.cdu_pos(index, config=self.config)

+    def load_from_live_system(self) -> WorkloadData:
+        result = self.load_live_data()
+        return result
+
+    def load_from_files(self, files) -> WorkloadData:
+        """ Load all files as combined jobs """
+        assert len(files) >= 1
+        files = [Path(f) for f in files]
+
+        if str(files[0]).endswith(".npz"):
+            data: WorkloadData | None = None
+            for file in files:
+                print(f"Loading {file}")
+                new_data, args_from_file = self.load_snapshot(file)
+                print(f"File was generated with: --system {args_from_file.system_name}")
+                if not data:
+                    data = new_data
+                else:
+                    data.jobs.extend(new_data.jobs)
+                    data.telemetry_start = min(data.telemetry_start, new_data.telemetry_start)
+                    data.telemetry_end = max(data.telemetry_end, new_data.telemetry_end)
+                    data.start_date = min(data.start_date, new_data.start_date)
+        else:  # custom data loader
+            data = self.load_data(files)
+        self.update_jobs(data.jobs)
+        return data

-if __name__ == "__main__":
+    def update_jobs(self, jobs: list[Job]):
+        """ Updates jobs with new scale or random start times """
+        if self.kwargs.get("scale"):
+            for job in jobs:
+                job.nodes_required = random.randint(1, self.kwargs['scale'])
+                job.scheduled_nodes = None  # Setting to None triggers scheduler to assign nodes

-    args_dict = vars(args)
-    config = ConfigManager(system_name=args.system).get_config()
+        if self.kwargs['arrival'] == "poisson":
+            # TODO: --arrival poisson distribution throws errors about start_time in some scenarios
+            # e.g. `python main.py run-parts experiments/mit-replay-24hrs.yaml --arrival poisson`
+            for job in jobs:
+                job.scheduled_nodes = None
+                job.submit_time = next_arrival_byconfargs(self.config, self.kwargs)
+                job.start_time = None
+                job.end_time = None
+
+
+def run_telemetry_add_parser(subparsers: SubParsers):
+    parser = subparsers.add_parser("telemetry", description="""
+        Telemetry data validator
+    """)
+    model_validate = pydantic_add_args(parser, TelemetryArgs, {
+        "cli_shortcuts": shortcuts,
+    })
+    parser.set_defaults(impl=lambda args: run_telemetry(model_validate(args, {})))
+
+
+def run_telemetry(args: TelemetryArgs):
+    args_dict = args.model_dump()
+    config = get_system_config(args.system).get_legacy()
     args_dict['config'] = config
     td = Telemetry(**args_dict)

-    if args.replay[0].endswith(".npz"):
-        print(f"Loading {args.replay[0]}...")
-        jobs = td.load_snapshot(args.replay[0])
-        if args.reschedule:
-            for job in tqdm(jobs, desc="Updating requested_nodes"):
-                job['requested_nodes'] = None
-                job['submit_time'] = next_arrival(1 / config['JOB_ARRIVAL_TIME'])
+    if args.is_results_file and args.replay:
+        file = str(args.replay[0])
+        jobs, timestep_start, timestep_end = td.load_csv_results(file)
+    elif args.live and not args.replay:
+        result = td.load_from_live_system()
+        jobs = result.jobs
+        timestep_start, timestep_end = result.telemetry_start, result.telemetry_end
     else:
-        jobs = td.load_data(args.replay)
+        result = td.load_from_files(args.replay)
+        jobs = result.jobs
+        timestep_start, timestep_end = result.telemetry_start, result.telemetry_end
+
+    if args.output:
+        td.save_snapshot(dest=args.output, result=result, args=args)

-    timesteps = int(max(job['wall_time'] + job['submit_time'] for job in jobs))
+    timesteps = timestep_end - timestep_start

-    dt_list = []
-    wt_list = []
-    nr_list = []
+    dt_list = []  # inter-arrival times
+    tl_list = []  # time limit
+    ert_list = []  # expected run time
+    nr_list = []  # nodes required
     submit_times = []
+    end_times = []
     last = 0
-    for job_vector in jobs:
-        job = Job(job_vector, 0)
-        wt_list.append(job.wall_time)
+    for job in jobs:
+        tl_list.append(job.time_limit)
+        ert_list.append(job.expected_run_time)
         nr_list.append(job.nodes_required)
         submit_times.append(job.submit_time)
+        end_times.append(job.submit_time + job.time_limit)
         if job.submit_time > 0:
             dt = job.submit_time - last
             dt_list.append(dt)
             last = job.submit_time
         if args.verbose:
             print(job)

+    dt_list = [item for item in dt_list if item is not None]
+    nr_list = [item for item in nr_list if item is not None]
+    tl_list = [item for item in tl_list if item is not None]
+    ert_list = [item for item in ert_list if item is not None]
+
+    print(f'Number of jobs: {len(jobs)}')
     print(f'Simulation will run for {timesteps} seconds')
-    print(f'Average job arrival time is: {np.mean(dt_list):.2f}s')
-    print(f'Average wall time is: {np.mean(wt_list):.2f}s')
-    print(f'Nodes required (avg): {np.mean(nr_list):.2f}')
-    print(f'Nodes required (max): {np.max(nr_list)}')
-    print(f'Nodes required (std): {np.std(nr_list):.2f}')
+    if dt_list:
+        print(f'Average job arrival time is: {np.mean(dt_list):.2f}s')
+    if tl_list:
+        print(f'Average time limit is: {np.mean(tl_list):.2f}s')
+    if ert_list:
+        print(f'Average expected runtime is: {np.mean(ert_list):.2f}s')
+
+    if nr_list:
+        print(f'Nodes required (avg): {np.mean(nr_list):.2f}')
+        print(f'Nodes required (max): {np.max(nr_list)}')
+        print(f'Nodes required (std): {np.std(nr_list):.2f}')
+
+    # --- compute avg network traces ---
+    ntx_means = []
+    nrx_means = []
+    for job in jobs:
+        job_vec = job.__dict__
+        # only if there's at least one valid sample
+        if 'ntx_trace' in job_vec:
+            ntx = np.array(job_vec.get('ntx_trace', []))
+            if ntx.size > 0 and not np.all(np.isnan(ntx)):
+                ntx_means.append(np.nanmean(ntx))
+        if 'nrx_trace' in job_vec:
+            nrx = np.array(job_vec.get('nrx_trace', []))
+            if nrx.size > 0 and not np.all(np.isnan(nrx)):
+                nrx_means.append(np.nanmean(nrx))
+
+    if ntx_means:
+        print(f'Average ntx_trace per job: {np.mean(ntx_means):.2f}')
+    else:
+        print('No valid ntx_trace data found.')
+
+    if nrx_means:
+        print(f'Average nrx_trace per job: {np.mean(nrx_means):.2f}')
+    else:
+        print('No valid nrx_trace data found.')

     if args.plot:
-        plot_nodes_histogram(nr_list)
-        plot_submit_times(submit_times, nr_list)
+        fig, ax = plt.subplots()
+        if args.plot == "jobs":
+            plot_jobs_gantt(ax=ax, jobs=jobs, bars_are_node_sized=args.gantt_nodes)
+            ax.invert_yaxis()
+        elif args.plot == "nodes":
+            plot_nodes_gantt(ax=ax, jobs=jobs)
+        elif args.plot == "network":
+            if ntx_means and nrx_means:
+                # combine into total per-job traffic
+                net_means = [tx + rx for tx, rx in zip(ntx_means, nrx_means)]
+                plot_network_histogram(ax=ax, data=net_means)
+        if args.output is not None:
+            if args.output == "":
+                filename = f"{args.output}.svg"
+            else:
+                filename = args.output
+            plt.savefig(f'{filename}')
+            print(f"Saved to: {filename}")
+        else:
+            plt.show()
+
+
+class DownloadArgs(RAPSBaseModel):
+    system: str
+    dest: ResolvedPath | None = None
+    start: AutoAwareDatetime | None = None
+    end: AutoAwareDatetime | None = None
+
+
+def run_download_add_parser(subparsers: SubParsers):
+    parser = subparsers.add_parser("download", description="""
+        Download telemetry data
+    """)
+    model_validate = pydantic_add_args(parser, DownloadArgs)
+    parser.set_defaults(impl=lambda args: run_download(model_validate(args, {})))
+
+
+def 
run_download(args: DownloadArgs): + config = get_system_config(args.system).get_legacy() + td = Telemetry(system=args.system, config=config) + dest = args.dest if args.dest else Path("./data").resolve() / args.system + td.download_data(dest, args.start, args.end) diff --git a/raps/train_rl.py b/raps/train_rl.py new file mode 100644 index 0000000000000000000000000000000000000000..d6ddd429f8ee33f3cba2a8930097c6ac9eca2d37 --- /dev/null +++ b/raps/train_rl.py @@ -0,0 +1,56 @@ +from raps.sim_config import SingleSimConfig, SIM_SHORTCUTS +from raps.utils import SubParsers, pydantic_add_args, read_yaml_parsed + + +def train_rl_add_parser(subparsers: SubParsers): + parser = subparsers.add_parser("train-rl", description=""" + Example usage: + raps train-rl --system mit_supercloud/part-gpu -f /opt/data/mit_supercloud/202201 + """) + parser.add_argument("config_file", nargs="?", default=None, help=""" + YAML sim config file, can be used to configure an experiment instead of using CLI + flags. Pass "-" to read from stdin. + """) + model_validate = pydantic_add_args(parser, SingleSimConfig, model_config={ + "cli_shortcuts": SIM_SHORTCUTS, + }) + + def impl(args): + model = model_validate(args, read_yaml_parsed(SingleSimConfig, args.config_file)) + model.scheduler = "rl" + train_rl(model) + parser.set_defaults(impl=impl) + + +def train_rl(rl_config: SingleSimConfig): + from stable_baselines3 import PPO + from raps.envs.raps_env import RAPSEnv + + args_dict = rl_config.get_legacy_args_dict() + config = rl_config.system_configs[0].get_legacy() + args_dict['config'] = config + args_dict['args'] = rl_config.get_legacy_args() + + env = RAPSEnv(rl_config) + + model = PPO( + "MlpPolicy", + env, + n_steps=512, # shorter rollouts (quicker feedback loop) + batch_size=128, # must divide n_steps evenly + n_epochs=10, # of minibatch passes per update + gamma=0.99, # discount (keeps long-term credit) + learning_rate=3e-4, # default Adam lr, can try 1e-4 if unstable + ent_coef=0.01, # encourage exploration + verbose=1, + tensorboard_log="./ppo_raps_logs/" + ) + + model.learn(total_timesteps=10000, tb_log_name="ppo_raps") + + # Output stats + stats = env.get_stats() + print(stats) + + # Save trained model + model.save("ppo_raps") diff --git a/raps/ui.py b/raps/ui.py index 1e7a53c5317c1a57ed098b8c2689e8bfa0f72ae7..cee033a7cf73df18d7d8b1ccc1de98acc37bd475 100644 --- a/raps/ui.py +++ b/raps/ui.py @@ -1,41 +1,84 @@ +import sys +import os import pandas as pd +import numpy as np +from datetime import datetime from rich.align import Align from rich.console import Console from rich.layout import Layout from rich.panel import Panel from rich.table import Table -from .utils import summarize_ranges, convert_seconds -from .constants import ELLIPSES -from .engine import TickData, Engine +from rich.live import Live +from rich.progress import ( + Progress, + TextColumn, + BarColumn, + TimeRemainingColumn, + TimeElapsedColumn, + MofNCompleteColumn +) + +from contextlib import nullcontext + +from raps.utils import summarize_ranges, convert_seconds_to_hhmmss, convert_seconds_to_hhmm +from raps.constants import ELLIPSES +from raps.engine import TickData, Engine + +MAX_ROWS = 30 class LayoutManager: - def __init__(self, layout_type, engine: Engine, debug, **config): + def __init__(self, layout_type, engine: Engine, total_timesteps=0, debug=None, args_dict=None, **config): + self.debug = debug + if args_dict is not None: + self.noui = args_dict.get("noui") + self.simulate_network = args_dict.get("simulate_network") + else: + self.noui = 
False + self.simulate_network = False self.engine = engine self.config = config - self.console = Console() - self.layout = Layout() + self.topology = self.engine.config.get("TOPOLOGY", "none") self.hascooling = layout_type == "layout2" - self.debug = debug - self.setup_layout(layout_type) self.power_df_header = self.config['POWER_DF_HEADER'] self.racks_per_cdu = self.config['RACKS_PER_CDU'] self.power_column = self.power_df_header[self.racks_per_cdu + 1] self.loss_column = self.power_df_header[-1] + if self.debug or self.noui: + return + + self.console = Console() + self.layout = Layout() + self.setup_layout(layout_type) + self.progress = Progress( + TextColumn("Progress: [progress.percentage]{task.percentage:>3.0f}%"), + BarColumn(bar_width=None), + TextColumn("•"), + MofNCompleteColumn(), + TextColumn("•"), + TimeElapsedColumn(), + TextColumn("•"), + TimeRemainingColumn() + ) + self.progress_task = self.progress.add_task("Progress", total=total_timesteps, name="Progress") + def setup_layout(self, layout_type): - if layout_type == "layout2": - self.layout.split_row(Layout(name="left", ratio=3), Layout(name="right", ratio=2)) - self.layout["left"].split_column( - Layout(name="pressflow", ratio=6), - Layout(name="powertemp", ratio=11), - Layout(name="totpower", ratio=3), - ) - self.layout["right"].split(Layout(name="scheduled", ratio=17), Layout(name="status", ratio=3)) - else: - self.layout.split_row(Layout(name="left", ratio=1), Layout(name="right", ratio=1)) - self.layout["left"].split_column(Layout(name="upper", ratio=8), Layout(name="lower", ratio=2)) - self.layout["right"].split_column(Layout(name="scheduled", ratio=8), Layout(name="status", ratio=2)) + if not self.debug: + self.layout.split_column(Layout(name="main"), Layout(name="progress", size=1)) + if layout_type == "layout2": + self.layout["main"].split_row(Layout(name="left", ratio=3), Layout(name="right", ratio=2)) + self.layout["main"]["left"].split_column( + Layout(name="pressflow", ratio=6), + Layout(name="powertemp", ratio=11), + Layout(name="totpower", ratio=3), + ) + self.layout["main"]["right"].split(Layout(name="scheduled", ratio=17), Layout(name="status", ratio=3)) + else: + self.layout["main"].split_row(Layout(name="left", ratio=1), Layout(name="right", ratio=1)) + self.layout["main"]["left"].split_column(Layout(name="upper", ratio=8), Layout(name="lower", ratio=2)) + self.layout["main"]["right"].split_column( + Layout(name="scheduled", ratio=8), Layout(name="status", ratio=2)) def create_table(self, title, columns, header_style="bold green"): """ @@ -71,7 +114,11 @@ class LayoutManager: total_power_mw = total_power_kw / 1000.0 total_loss_kw = df[self.loss_column].sum() total_loss_mw = total_loss_kw / 1000.0 - return total_power_mw, total_loss_mw, f"{total_loss_mw / total_power_mw * 100:.2f}%", total_power_kw, total_loss_kw + return \ + total_power_mw, \ + total_loss_mw, \ + f"{total_loss_mw / total_power_mw * 100:.2f}%", \ + total_power_kw, total_loss_kw def update_scheduled_jobs(self, jobs, show_nodes=False): """ @@ -84,45 +131,103 @@ class LayoutManager: show_nodes : bool, optional Flag indicating whether to display node information (default is False). 
""" - # Define columns with header styles - columns = ["JOBID", "WALL TIME", "NAME", "ACCOUNT", "ST", "NODES", "NODE SEGMENTS"] - if show_nodes: - columns.append("NODELIST") - columns.append("TIME") + + # Decide whether to show "SLOWDOWN" (if real topology) or "NODE SEGMENTS" (if capacity/none) + # show_slowdown = (self.topology in ("fat-tree", "dragonfly", "capacity")) + show_slowdown = self.simulate_network + + # Build the column headers + # columns = ["JOBID", "WALL TIME", "NAME", "ACCOUNT", "ST"] + columns = ["JOBID", "TIME LIMIT", "NAME", "ACCOUNT", "ST", "NODES"] + if show_slowdown: + columns.append("SLOW DOWN") + else: + if show_nodes: + columns.append("NODELIST") + else: + columns.append("SEGMENT") # NODE SEGMENTS + + columns.append("WALL TIME") # Create table with bold magenta headers table = Table(title="Job Queue", header_style="bold magenta", expand=True) for col in columns: table.add_column(col, justify="center") - # Add data rows with white values - for job in jobs: - node_segments = summarize_ranges(job.scheduled_nodes) - if show_nodes: + # Add data rows + for job in jobs[:MAX_ROWS]: + # Number of requested nodes as a string + # n_nodes = str(job.nodes_required) # Unused + + if show_slowdown: + # Each Job should have job.net_congestion set in Engine.tick() + slow = getattr(job, "slowdown_factor", 0.0) + # Format as "1.23×" (if ≤1.00 you will see "1.00×") + slowdown_str = f"{slow:.2f}×" + col_slow = slowdown_str + else: + # Fallback to original NODE SEGMENTS logic + node_segments = summarize_ranges(job.scheduled_nodes) + if show_nodes: + if len(node_segments) > 4: + nodes_display = ", ".join(node_segments[:2] + [ELLIPSES] + node_segments[-2:]) + else: + nodes_display = ", ".join(node_segments) + col_slow = nodes_display # reused variable name for simplicity + else: + # col_slow = str(len(node_segments)) + col_slow = str(len(node_segments)) + + # If show_nodes is True, we need to append NODELIST as well + if show_nodes and not show_slowdown: + # use the same node_segments variable to build the list of nodes if len(node_segments) > 4: nodes_display = ", ".join(node_segments[:2] + [ELLIPSES] + node_segments[-2:]) else: nodes_display = ", ".join(node_segments) + col_nodelist = nodes_display else: - nodes_display = str(len(node_segments)) + col_nodelist = col_slow # This logic is a bit flawed... 
+ nodes_display = col_nodelist + + if self.engine.downscale != 1: + running_time_str = convert_seconds_to_hhmmss(job.current_run_time // self.engine.downscale) + \ + f" +{job.current_run_time % self.engine.downscale}/{self.engine.downscale}s" + else: + running_time_str = convert_seconds_to_hhmm(job.current_run_time) row = [ str(job.id).zfill(5), - convert_seconds(job.wall_time), + convert_seconds_to_hhmm(job.time_limit // self.engine.downscale), + # str(job.wall_time), str(job.name), str(job.account), - job.state.value, + job.current_state.value, str(job.nodes_required), nodes_display, - convert_seconds(job.running_time) + running_time_str ] - # Add the row with the 'white' style applied to the whole row + + # If the job has been flagged as “dilated”, show its row in yellow + if getattr(job, "dilated", False): + row = [f"[yellow]{x}[/yellow]" for x in row] + table.add_row(*row, style="white") # Update the layout self.layout["scheduled"].update(Panel(Align(table, align="center"))) - def update_status(self, time, nrun, nqueue, active_nodes, free_nodes, down_nodes): + def update_status(self, + time, + nrun, + nqueue, + active_nodes, + free_nodes, + down_nodes, + avg_net_util, + slowdown, + time_delta, + timestep_start=0): """ Updates the status information table with the provided system status data. @@ -142,20 +247,46 @@ class LayoutManager: List of nodes that are down. """ # Define columns with header styles - columns = ["Time", "Jobs Running", "Jobs Queued", "Active Nodes", "Free Nodes", "Down Nodes"] + columns = [] + time_header = "Time" + if timestep_start != 0: # append time simulated + time_header += " (+Sim)" + columns.append(time_header) + columns.append("Jobs Running") + columns.append("Jobs Queued") + columns.append("Active Nodes") + columns.append("Free Nodes") + columns.append("Down Nodes") + columns.append("Speed") + + if self.simulate_network: + columns.extend(("Net Util (%)", "Slowdown per job")) table = Table(header_style="bold magenta", expand=True) for col in columns: table.add_column(col, justify="center") + row = [] # Add data row with white values - row = [ - convert_seconds(time), - str(nrun), - str(nqueue), - str(active_nodes), - str(free_nodes), - str(len(down_nodes)) - ] + time_in_s = time // self.engine.downscale + if (time_in_s < 946684800): # Introducing Y2K into our codebase! Kek + time_str = convert_seconds_to_hhmmss(time_in_s) + else: + # For the curious: If the simulation time in seconds is large than + # unix timestamp for Jan 2000 this is a unix timestamp, + time_str = f"{datetime.fromtimestamp(time_in_s).strftime('%Y-%m-%d %H:%M')}" + if timestep_start != 0: # append time simulated + time_str += f"\nSim: {convert_seconds_to_hhmmss(time_in_s - timestep_start)}" + + row.append(time_str) + row.append(str(nrun)) + row.append(str(nqueue)) + row.append(str(active_nodes)) + row.append(str(free_nodes)) + row.append(str(len(down_nodes))) + row.append(f"{time_delta}x") + if self.simulate_network: + row.append(f"{avg_net_util * 100:.0f}%") + row.append(f"{slowdown:.1f}x") # Add the row with the 'white' style applied to the whole row table.add_row(*row, style="white") @@ -212,8 +343,13 @@ class LayoutManager: return df - - def update_powertemp_array(self, power_df, cooling_outputs, pflops, gflop_per_watt, system_util, uncertainties=False): + def update_powertemp_array(self, + power_df, + cooling_outputs, + pflops, + gflop_per_watt, + system_util, + uncertainties=False): """ Updates the displayed power and temperature table with the provided data. 
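Aside: the `update_status` logic above distinguishes relative simulation seconds from absolute unix timestamps by comparing against 946684800 (2000-01-01 UTC). A minimal sketch of that heuristic as a standalone helper; the function name `format_sim_time` is hypothetical and not part of this patch:

    from datetime import datetime

    Y2K_EPOCH = 946684800  # 2000-01-01 UTC; smaller values are treated as elapsed seconds

    def format_sim_time(time_in_s: int) -> str:
        """Render small values as elapsed HH:MM:SS and large values as wall-clock dates."""
        if time_in_s < Y2K_EPOCH:
            hours, rem = divmod(time_in_s, 3600)
            minutes, seconds = divmod(rem, 60)
            return f"{hours:02d}:{minutes:02d}:{seconds:02d}"
        return datetime.fromtimestamp(time_in_s).strftime("%Y-%m-%d %H:%M")
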
@@ -225,8 +361,9 @@ class LayoutManager: DataFrame containing temperature and cooling data. """ # Define the specific columns for power - #power_columns = POWER_DF_HEADER[0:RACKS_PER_CDU + 2] + [POWER_DF_HEADER[-1]] # "CDU", "Rack 1", "Rack 2", "Rack 3", "Sum", "Loss" - power_columns = self.power_df_header[0:self.racks_per_cdu + 2] + [self.power_df_header[-1]] # "CDU", "Rack 1", "Rack 2", "Rack 3", "Sum", "Loss" + # power_columns = POWER_DF_HEADER[0:RACKS_PER_CDU + 2] + [POWER_DF_HEADER[-1]] + # "CDU", "Rack 1", "Rack 2", "Rack 3", "Sum", "Loss" + power_columns = self.power_df_header[0:self.racks_per_cdu + 2] + [self.power_df_header[-1]] fmu_cols = self.config['FMU_COLUMN_MAPPING'] # Updated cooling keys to include temperature instead of pressure @@ -253,6 +390,9 @@ class LayoutManager: if uncertainties: pass else: + power_df = power_df.replace([np.nan], 0.0) + power_df = power_df.replace([np.inf], sys.maxsize) + power_df = power_df.replace([-np.inf], -sys.maxsize - 1) power_df = power_df[power_columns].astype(int) # Populate the table with data from the DataFrame, applying the data styles @@ -262,7 +402,8 @@ class LayoutManager: ] cooling_values = [ - f"[{data_styles[i + len(power_columns)]}]{cooling_row[1][key]:.1f}[/]" for i, key in enumerate(cooling_keys) + f"[{data_styles[i + len(power_columns)]}]{cooling_row[1][key]:.1f}[/]" for + i, key in enumerate(cooling_keys) ] table.add_row(*(power_values + cooling_values)) @@ -288,7 +429,7 @@ class LayoutManager: total_power_str, str(f"{pflops:.2f}"), str(f"{gflop_per_watt:.1f}"), - total_loss_str + " (" + percent_loss_str+ ")", + total_loss_str + " (" + percent_loss_str + ")", f"{cooling_outputs['pue']:.2f}", style="white" # Apply white style to all elements in the row ) @@ -318,6 +459,9 @@ class LayoutManager: if uncertainties: pass else: + power_df = power_df.replace([np.nan], 0.0) + power_df = power_df.replace([np.inf], sys.maxsize) + power_df = power_df.replace([-np.inf], -sys.maxsize - 1) power_df = power_df[display_columns].round().astype(int) # Create table for displaying rack power and loss with styling @@ -345,7 +489,8 @@ class LayoutManager: percent_loss_str = f"{total_loss_mw / total_power_mw * 100:.2f}%" if not self.hascooling: - self.layout["upper"].update(Panel(Align(table, align="center"))) + self.layout["upper"].update(Panel(Align(table, align="center"), + title=self.engine.config["system_name"].capitalize())) # Create Total Power table with green headers and white data total_table = Table(show_header=True, header_style="bold green") @@ -359,9 +504,9 @@ class LayoutManager: total_table.add_row( f"{system_util:.1f}%", total_power_str, - str(f"{pflops:.2f}"), - str(f"{gflop_per_watt:.1f}"), - total_loss_str + " (" + percent_loss_str+ ")", + str(f"{pflops:.2f}" if pflops is not None else "None"), + str(f"{gflop_per_watt:.1f}" if gflop_per_watt is not None else "None"), + total_loss_str + " (" + percent_loss_str + ")", style="white" # Apply 'white' style to the entire row ) @@ -374,9 +519,16 @@ class LayoutManager: self.layout["lower"].update(Panel(Align(total_table, align="center"), title="Power and Performance")) - def update(self, data: TickData): + def update_progress_bar(self, timestamp): + self.progress.update(self.progress_task, description=f"{timestamp}", advance=timestamp, transient=True) + self.layout["progress"].update(self.progress.get_renderable()) + + def update_full_layout(self, data: TickData, time_delta=1, timestep_start=0): + if self.debug: + return uncertainties = self.engine.power_manager.uncertainties + # if 
data.current_time % self.config['UI_UPDATE_FREQ'] == 0: if self.engine.cooling_model: self.update_powertemp_array( data.power_df, data.fmu_outputs, data.p_flops, data.g_flops_w, data.system_util, @@ -385,27 +537,42 @@ class LayoutManager: self.update_pressflow_array(data.fmu_outputs) self.update_scheduled_jobs(data.running + data.queue) + self.update_status( - data.current_time, len(data.running), len(data.queue), data.num_active_nodes, - data.num_free_nodes, data.down_nodes, + data.current_timestep, + len(data.running), + len(data.queue), + data.num_active_nodes, + data.num_free_nodes, + data.down_nodes, + data.avg_net_util, + data.slowdown_per_job, + data.time_delta, + timestep_start=timestep_start ) + self.update_power_array( data.power_df, data.p_flops, data.g_flops_w, data.system_util, uncertainties=uncertainties, ) - def render(self): - if not self.debug: - self.console.clear() - self.console.print(self.layout) - - def run(self, jobs, timesteps): + def run(self): """ Runs the UI, blocking until the simulation is complete """ - for data in self.engine.run_simulation(jobs, timesteps): - if data.current_time % self.config['UI_UPDATE_FREQ'] == 0: - self.update(data) - self.render() - - def run_stepwise(self, jobs, timesteps): - """ Prepares the UI and returns a generator for the simulation """ - return self.engine.run_simulation(jobs, timesteps) + if not self.debug and not self.noui: + context = Live(self.layout, auto_refresh=True, refresh_per_second=3) + else: + context = nullcontext() + try: + with context: + # last_i = 0 + for i, data in enumerate(self.engine.run_simulation(autoshutdown=True)): + if data and (not self.debug and not self.noui): + self.update_full_layout(data, + self.engine.time_delta, + timestep_start=self.engine.timestep_start) + # self.update_progress_bar(i-last_i) + # last_i=i + if not self.debug and not self.noui: + self.update_progress_bar(1) + finally: + os.system("stty sane") diff --git a/raps/utils.py b/raps/utils.py index 5ead3d137cf431af851fa897ae7aeccacec7b0ec..0c77d3f02e8669091e90145bc796305032544b9f 100644 --- a/raps/utils.py +++ b/raps/utils.py @@ -6,26 +6,142 @@ generating random numbers, summarizing and expanding ranges, determining job sta """ -from datetime import timedelta - +from datetime import datetime, timedelta, timezone, date +from collections.abc import Iterable +from enum import Enum +import os import hashlib import math +import re import numpy as np import pandas as pd import random import sys import uuid import json +import argparse +from pathlib import Path +from typing import Annotated as A, TypeVar, TypeAlias, Protocol +from pydantic import ( + BaseModel, TypeAdapter, AfterValidator, BeforeValidator, ConfigDict, AwareDatetime, ValidationError, + ValidationInfo, +) +from pydantic_settings import BaseSettings, SettingsConfigDict, CliApp, CliSettingsSource, SettingsError +import yaml +from yaml import YAMLError +from raps.job import Job + + +def deep_merge(a: dict, b: dict): + a = {**a} + for key in b.keys(): + if key in a and isinstance(a[key], dict) and isinstance(b[key], dict): + a[key] = deep_merge(a[key], b[key]) + else: + a[key] = b[key] + return a + + +def deep_subtract_dicts(a: dict, b: dict): + """ + Remove all fields from a that are already in b, such that + deep_merge(deep_subtract_dicts(a, b), b) == a + a should contain a superset of b's keys. 
+ """ + a = {**a} + for key in b.keys(): + if key in a: + if a[key] == b[key]: + a.pop(key) + elif isinstance(a[key], dict) and isinstance(b[key], dict): + a[key] = deep_subtract_dicts(a[key], b[key]) + # otherwise keep key in a as is + return a + + +def to_dict(arg): + """ + Normalizes arg to a dictionary if necessary. Used to convert between legacy argparse.Namespace + objects and dictionaries. + """ + if isinstance(arg, dict): + return arg + elif isinstance(arg, argparse.Namespace): + return vars(arg) + else: + raise ValueError(f"Cannot convert {arg} to dict") + + +DateType = TypeVar("DateType", date, datetime) + + +def date_range(start: DateType, end: DateType, step=timedelta(days=1)) -> Iterable[DateType]: + window_start = start + while window_start < end: + yield window_start + window_start += step + + +def sum_values(values): + return sum(x[1] for x in values) if values else 0 -def convert_seconds(seconds): +def min_value(values): + return min(x[1] for x in values) if values else 0 + + +def max_value(values): + return max(x[1] for x in values) if values else 0 + + +def convert_seconds_to_hhmmss(seconds): + """Convert seconds to time format: 3661s -> 01:01""" + td = timedelta(seconds=seconds) + h, m, s = str(td).split(':') + return f"{h}:{m}:{s}" + + +def convert_seconds_to_hhmm(seconds): """Convert seconds to time format: 3661s -> 01:01""" + # if it's a NumPy scalar, extract the Python value + if hasattr(seconds, "item"): + seconds = seconds.item() td = timedelta(seconds=seconds) h, m, _ = str(td).split(':') return f"{h}:{m}" -def truncated_normalvariate(mu, sigma, lower, upper): +def truncated_normalvariate_int(mu, sigma, lower, upper): + """ + Generate a random number from a truncated normal distribution. + + Parameters + ---------- + mu : float + Mean of the distribution. + sigma : float + Standard deviation of the distribution. + lower : float + Lower bound of the truncated distribution. + upper : float + Upper bound of the truncated distribution. + + Returns + ------- + int + Random number from the truncated normal distribution. + """ + CUTOFF = 100000000 + i = 0 + while i < CUTOFF: + number = random.normalvariate(mu, sigma) + if lower < number < upper: + return round(number) + i += 1 + raise Exception(f"mu:{mu} sigma:{sigma}, not a single hit in {CUTOFF} tries.") + + +def truncated_normalvariate_float(mu, sigma, lower, upper): """ Generate a random number from a truncated normal distribution. @@ -45,10 +161,40 @@ def truncated_normalvariate(mu, sigma, lower, upper): float Random number from the truncated normal distribution. 
""" - while True: + CUTOFF = 100000000 + i = 0 + while i < CUTOFF: number = random.normalvariate(mu, sigma) if lower < number < upper: return number + i += 1 + raise Exception(f"mu:{mu} sigma:{sigma}, not a single hit in {CUTOFF} tries.") + + +def truncated_weibull(scale, shape, min, max): + while True: + number = random.weibullvariate(scale, shape) + if min < number <= max: + return int(number) + + +def truncated_weibull_float(scale, shape, min, max): + while True: + number = random.weibullvariate(scale, shape) + if min < number <= max: + return float(number) + + +def return_nearest_power_of(*, number, base): + if base == 1: + return number + else: + next_num = base ** math.ceil(math.log(number, base)) + prev_num = base ** math.floor(math.log(number, base)) + if next_num - number < number - prev_num: + return next_num + else: + return prev_num def linear_to_3d_index(linear_index, shape): @@ -242,6 +388,7 @@ def create_binary_array_numpy(max_time, trace_quanta, util): traces[i, :int(util * num_quanta / 100)] = 1 return traces + def extract_data_csv(fileName, skiprows, header): """ Read passed csv file path @ In, filename, dataframe, facility telemetry data @@ -254,16 +401,18 @@ def extract_data_csv(fileName, skiprows, header): df = df.dropna() return df + def resampledf(df, time_resampled): """ Match key and return idx @ In, None @ Out, CDU_names, list, list of CDU names """ - df.set_index('time',inplace =True) + df.set_index('time', inplace=True) df = df.reindex(df.index.union(time_resampled)).interpolate('values').loc[time_resampled] df = df.reset_index() return df + def output_dict(d, title='', output_file=sys.stdout): """ Write dictionary contents to a file. @@ -282,6 +431,7 @@ def output_dict(d, title='', output_file=sys.stdout): for key, value in d.items(): file.write(f"{key}: {value}\n") + def create_casename(prefix=''): """ Generate a unique case name. 
@@ -299,38 +449,140 @@ def create_casename(prefix=''): return prefix + str(uuid.uuid4())[:7] -def next_arrival(lambda_rate): - if not hasattr(next_arrival, 'next_time'): +def create_file_indexed(prefix: str, path: str = None, ending: str = None, create=True) -> str: + if path is not None: + os.makedirs(path, exist_ok=True) + else: + path = "./" + index = 1 + while True: + if ending: + filename = f"{prefix}_{index:03d}.{ending}" + else: + filename = f"{prefix}_{index:03d}" + filepath = os.path.join(path, filename) + if not os.path.exists(filepath): + if create: + open(filepath, "w").close() + return filepath + index += 1 + + +def create_dir_indexed(dir: str, path: str = None) -> str: + if dir is None: + raise ValueError("'dir' cannot be none") + if path is None: + path = os.getcwd() + index = 1 + while True: + dirname = f"{dir}_{index:03d}" + fullpath = os.path.join(path, dirname) + if not os.path.exists(fullpath): + os.makedirs(fullpath, exist_ok=False) + return fullpath + index += 1 + + +def next_arrival_byconfargs(config, args, reset=False): + args = to_dict(args) + arrival_rate = 1 + arrival_time = config['JOB_ARRIVAL_TIME'] + downscale = args['downscale'] + + if args['job_arrival_rate']: + arrival_rate = args['job_arrival_rate'] + if args['job_arrival_time']: + arrival_time = args['job_arrival_time'] + return next_arrival(arrival_rate / (arrival_time * downscale), reset) + + +def next_arrival_byconfkwargs(config, kwargs, reset=False): + arrival_rate = 1 + arrival_time = config['JOB_ARRIVAL_TIME'] + if kwargs['job_arrival_rate']: + arrival_rate = kwargs['job_arrival_rate'] + if kwargs['job_arrival_time']: + arrival_time = kwargs['job_arrival_time'] + return next_arrival(arrival_rate / arrival_time, reset) + + +def next_arrival(lambda_rate, reset=False, start_time=0): + if not hasattr(next_arrival, 'next_time') or reset is True: # Initialize the first time it's called - next_arrival.next_time = 0 + next_arrival.next_time = start_time else: next_arrival.next_time += \ -math.log(1.0 - random.random()) / lambda_rate return next_arrival.next_time -def convert_to_seconds(time_str): - # Define the conversion factors - time_factors = { - 'd': 86400, # 1 day = 86400 seconds - 'h': 3600, # 1 hour = 3600 seconds - 'm': 60, # 1 minute = 60 seconds - 's': 1 # 1 second = 1 second - } - - # Check if the input string ends with a unit or is purely numeric - if time_str[-1].isdigit(): - return int(time_str) # Directly return the number if it's purely numeric - - # Extract the numeric part and the time unit - num = int(time_str[:-1]) - unit = time_str[-1] - - # Convert to seconds using the conversion factors - if unit in time_factors: - return num * time_factors[unit] - else: - raise ValueError(f"Unknown time unit: {unit}") +TIME_UNITS = { + 'd': timedelta(days=1), + 'h': timedelta(hours=1), + 'm': timedelta(minutes=1), + 's': timedelta(seconds=1), + 'ds': timedelta(milliseconds=100), + 'cs': timedelta(milliseconds=10), + 'ms': timedelta(milliseconds=1), +} + + +def parse_time_unit(unit) -> timedelta: + parsed_unit = unit + if TypeAdapter(timedelta).validator.isinstance_python(unit): + parsed_unit = TypeAdapter(timedelta).validate_python(unit) + elif isinstance(unit, str): + parsed_unit = TIME_UNITS.get(unit) + if not isinstance(parsed_unit, timedelta): + raise ValueError(f"Invalid time unit {unit}") + if parsed_unit not in TIME_UNITS.values() or parsed_unit > TIME_UNITS['s']: + raise ValueError("Only time units of s, ds, cs, and ms are supported") + return parsed_unit + + +def parse_td(td, unit: str | 
timedelta = 's') -> timedelta: + """ Parse into a timedelta. Pass unit to interpret raw numbers as (default seconds) """ + unit = parse_time_unit(unit) + if TypeAdapter(int).validator.isinstance_python(td): + return unit * TypeAdapter(int).validate_python(td) + if TypeAdapter(timedelta).validator.isinstance_python(td): + return TypeAdapter(timedelta).validate_python(td) + if isinstance(td, str): + if not pd.isna(pd.to_timedelta(td, errors="coerce")): + return pd.to_timedelta(td) + # Special case parsing for ds and cs units which pandas doesn't support + re_match = re.fullmatch(r"(\d+)\s*(\w+)", td.strip()) + if re_match and re_match[2] in TIME_UNITS: + num_str, unit_str = re_match.groups() + return int(num_str) * TIME_UNITS[unit_str] + raise ValueError(f"Invalid timedelta: {td}") + + +def convert_to_time_unit(td, unit: str | timedelta = 's'): + """ + Converts to integer number of time unit + Throws if the given time is less than the unit + """ + num = parse_td(td, unit) / parse_time_unit(unit) + if (num != 0 and num < 1) or not num.is_integer(): + raise ValueError(f"{td} is not divisible by time unit {unit}") + return int(num) + + +def infer_time_unit(td) -> timedelta: + """ Infers the time unit the user meant for the input string """ + parsed_td = parse_td(td) + time_unit = None + if isinstance(td, str): # infer unit from string, e.g. 1s or 200ms + re_match = re.fullmatch(r"(\d+)\s*(\w+)", td.strip()) + if re_match and re_match[2] in TIME_UNITS: + time_unit = TIME_UNITS[re_match[2]] + if not time_unit: + for unit in sorted(TIME_UNITS.values(), reverse=True): + if (parsed_td % unit).total_seconds() == 0: + time_unit = unit + break + return min(TIME_UNITS['s'], time_unit or TIME_UNITS['s']) def encrypt(name): @@ -343,25 +595,82 @@ def encrypt(name): def write_dict_to_file(dictionary, file_path): """Function to write dictionary to a text file""" with open(file_path, 'w') as file: - for key, value in dictionary.items(): + file.write("{") + for j, (key, value) in enumerate(dictionary.items()): if isinstance(value, dict): - file.write(f"{key}: {{\n") - for subkey, subvalue in value.items(): - file.write(f" {subkey}: {subvalue}\n") - file.write("}\n") + file.write(f"\"{str(key)}\": {{\n") + for i, (subkey, subvalue) in enumerate(value.items()): + base_subvalue = convert_numpy_to_builtin(subvalue) + json_string = toJSON(base_subvalue) + file.write(f" \"{str(subkey)}\": {json_string}") + if i < len(value.items()) - 1: + file.write(", ") + file.write("}") else: - file.write(f"{key}: {value}\n") + file.write(f"\"{str(key)}\": {value}") + if j < len(dictionary.items()) - 1: + file.write(", ") + file.write("\n") + file.write("}") def toJSON(obj): """Function to dump a json string from object""" return json.dumps( obj, - default=lambda o:o.__dict__, + default=lambda o: o.__dict__, sort_keys=True, indent=4) +def convert_numpy_to_builtin(obj): + if isinstance(obj, dict): + tmp_obj = dict() + for k, v in obj.items(): + tmp_obj[k] = convert_numpy_to_builtin(v) + return tmp_obj + elif isinstance(obj, list): + return [convert_numpy_to_builtin(i) for i in obj] + elif isinstance(obj, np.ndarray): + tmplist = obj.tolist() + return convert_numpy_to_builtin(tmplist) + elif isinstance(obj, (np.integer, np.int64, np.int32)): + return int(obj) + elif isinstance(obj, (np.floating, np.float64, np.float32)): + return float(obj) + elif isinstance(obj, (np.bool_)): + return bool(obj) + else: + return obj + + +def get_current_utilization(trace, job: Job): + """Return utilization for a trace at the job's current running 
time. + Note: this should move to a trace.py and a Trace class! + """ + if (isinstance(trace, list) and trace) or \ + (isinstance(trace, np.ndarray) and trace.size != 0): + + if not job.trace_quanta: + raise ValueError("job.trace_quanta is not set; cannot compute utilization.") + + time_quanta_index = int((job.current_run_time - job.trace_start_time) // job.trace_quanta) + if time_quanta_index < 0: + time_quanta_index = 0 + + if time_quanta_index < len(trace): + util = get_utilization(trace, time_quanta_index) + else: + util = get_utilization(trace, max(0, len(trace) - 1)) + elif isinstance(trace, (float, int)): + util = trace + else: + raise ValueError(f"trace is of unexpected type: {type(trace)}.") + util = 0.0 + + return util + + def get_utilization(trace, time_quanta_index): """Retrieve utilization value for a given trace at a specific time quanta index.""" if isinstance(trace, (list, np.ndarray)): @@ -370,3 +679,283 @@ def get_utilization(trace, time_quanta_index): return float(trace) else: raise TypeError(f"Invalid type for utilization: {type(trace)}.") + + +class ValueComparableEnum(Enum): + def __eq__(self, other): + if isinstance(other, Enum): + return self.value == other.value + return self.value == other + + def __hash__(self): # required if you override __eq__ + return hash(self.value) + + +def normalize_tz(d: datetime): + """ Convert datetime to UTC. If naive, assume local time, then convert to UTC """ + if not d.tzinfo: + return d.astimezone().astimezone(timezone.utc) + else: + return d.astimezone(timezone.utc) + + +def validate_resolved_path(path: str | Path, info: ValidationInfo): + context = info.context or {} + path = Path(path).expanduser() + if context.get('base_path'): + base_path = Path(context["base_path"]).expanduser().resolve() + else: + base_path = Path.cwd() + path = (base_path / path).resolve() + # This is used on the simulation server to block reading arbitrary files + if context.get("force_under_base_path"): + if not path.is_relative_to(base_path): + raise ValueError(f"{path} is not under {base_path}") + return path + + +ResolvedPath = A[Path, AfterValidator(validate_resolved_path)] +""" +Resolve a path, and expand ~ in the path string. +Paths can be resolved relative to specific path instead of cwd by passing +`context={"base_path": "my/path"}` in model_validate(). +""" + + +AutoAwareDatetime = A[datetime, AfterValidator(normalize_tz)] +""" Datetime type wrapper, makes sure timezone is set """ + +SmartTimedelta = A[timedelta, BeforeValidator(parse_td)] +""" Can be passed as ISO 8601 format like PT5M, or a string like 9s, or a number of seconds """ + + +class RAPSBaseModel(BaseModel): + """ Base Pydantic model with shared config """ + model_config = ConfigDict( + use_attribute_docstrings=True, + ) + + +T = TypeVar("T", bound=BaseModel, covariant=True) + + +class ModelArgsValidator(Protocol[T]): + def __call__(self, args: argparse.Namespace, init_data: dict | None = None) -> T: + ... + + +def pydantic_add_args( + parser: argparse.ArgumentParser, model_cls: type[T], + model_config: SettingsConfigDict | None = None, +) -> ModelArgsValidator[T]: + """ + Add arguments to the parser from the model. Returns a function that can be used to parse the + model from the argparse args. + + Normally you'd just configure Pydantic to just automatically create a BaseSettings object from + sys.argv and/or env variables. But we want a bit more control over the cli parser, and to use + the SimConfig model as a regular non-settings model in the simulation server. 
So here we do + some hacks to apply the args manually. + """ + model_config_dict = SettingsConfigDict({ + "cli_implicit_flags": True, + "cli_kebab_case": True, + "title": model_cls.__name__, + **(model_config or {}), + "cli_parse_args": False, # Don't automatically parse args + }) + + class SettingsModel(model_cls, BaseSettings): + @classmethod + def settings_customise_sources(cls, settings_cls, + init_settings, env_settings, dotenv_settings, file_secret_settings, + ): + return (init_settings,) # Don't load from env vars or anything else + + model_config = model_config_dict + + cli_settings_source = CliSettingsSource(SettingsModel, root_parser=parser) + + def model_args_validator(args: argparse.Namespace, init_data: dict | None = None): + try: + model = CliApp.run(SettingsModel, + cli_args=args, + cli_settings_source=cli_settings_source, + **(init_data or {}), + ) + # Recreate model so we don't return the SettingsModel subclass + # use exclude_unset so that model_field_set is preserved as well + return model_cls.model_validate(model.model_dump(exclude_unset=True)) + except (ValidationError, SettingsError) as err: + print(err) + sys.exit(1) + return model_args_validator + + +SubParsers: TypeAlias = "argparse._SubParsersAction[argparse.ArgumentParser]" +""" Alias for the result of argparse parser.add_subparsers """ + + +def yaml_dump(data, header_comment=''): + """ Dumps yaml with pretty formatting """ + if header_comment: + header_comment = '\n'.join(f'# {ln}' for ln in header_comment.splitlines()) + "\n" + + class IndentDumper(yaml.Dumper): + def represent_data(self, data): + # Quote all strings with special characters to avoid confusion + if ( + isinstance(data, str) and + (not re.fullmatch(r"[\w-]+", data) or data.isdigit()) and + "\n" not in data + ): + return self.represent_scalar('tag:yaml.org,2002:str', data, style='"') + return super(IndentDumper, self).represent_data(data) + + def increase_indent(self, flow=False, indentless=False): + # Indent lists + return super(IndentDumper, self).increase_indent(flow, False) + + return header_comment + yaml.dump( + data, + Dumper=IndentDumper, + sort_keys=False, + indent=2, + allow_unicode=True, + ) + + +def read_yaml(config_file: str | None) -> dict: + """ Parses yaml file. Pass "-" to read from stdin """ + # Assume stdin if not terminal + if config_file == "-" or (not config_file and not sys.stdin.isatty()): + data = sys.stdin.read() + elif config_file: + data = Path(config_file).read_text() + else: + data = "" + if data.strip(): + result = yaml.safe_load(data) + else: + result = {} + if not isinstance(result, dict): + raise ValueError("Expected yaml document to contain a top-level mapping") + return result + + +def read_yaml_parsed(cls: type[T], config_file=None) -> dict: + """ + Like read_yaml, but parses the input to resolve paths etc. 
+    Exits on error after printing message (for use in the CLI)
+    """
+    try:
+        yaml_data = read_yaml(config_file)
+        if yaml_data:
+            # Resolve paths in yaml relative to the yaml file
+            base_path = Path(config_file).parent if config_file and config_file != "-" else None
+            model = cls.model_validate(yaml_data, context={"base_path": base_path})
+            yaml_data = model.model_dump(mode='json', exclude_unset=True)
+    except (ValidationError, ValueError, YAMLError) as err:
+        print(f'Failed to parse yaml "{config_file}"')
+        print(err)
+        sys.exit(1)
+    return yaml_data
+
+
+def is_yaml_file(path: str | Path):
+    """ Return true if the path is .yaml, .yml, or .json """
+    return Path(path).suffix in ['.yaml', '.yml', '.json']
+
+
+class WorkloadData(RAPSBaseModel):
+    """
+    Represents a workload, a list of jobs with some metadata. Returned by dataloaders' load_data()
+    function, and by Workload.generate_jobs().
+
+    jobs:
+        The list of parsed jobs.
+
+    telemetry_start
+        the first timestep in which the simulation can be executed.
+
+    telemetry_end
+        the last timestep in which the simulation can be executed.
+
+    start_date
+        The actual date that telemetry_start represents.
+    ----
+    Explanation regarding times:
+
+    The loaded dataframe contains
+    a first timestamp with associated data
+    and a last timestamp with associated data.
+
+    These form the maximum extent of the simulation time:
+    telemetry_start and telemetry_end.
+
+    [                                   ]
+    ^                                   ^
+    telemetry_start                     telemetry_end
+
+    These values form the maximum extent of the simulation.
+    telemetry_start is typically 0, but any int can be used as long as all the times in the
+    jobs are relative to the telemetry_start.
+
+    Next is the actual extent of the simulation:
+
+        [                          ]
+        ^                          ^
+        simulation_start           simulation_end
+
+    The simulation will start at telemetry_start by default, but the user can specify an explicit
+    simulation start time.
+
+    Additionally, jobs can have started before telemetry_start,
+    and can have a recorded ending after simulation_end:
+
+  [                                            ]
+  ^                                            ^
+  first_start_timestamp                        last_end_timestamp
+
+    This means that the time between first_start_timestamp and telemetry_start
+    has no associated values in the traces!
+    The missing values after simulation_end can be ignored, as the simulation
+    will have stopped before then.
+
+    However, the times before telemetry_start have to be padded to generate
+    correct offsets within their data!
+    Within the simulation a job's current time is specified as the difference
+    between its start_time and the current timestep of the simulation.
+
+    With this each job's
+    - submit_time
+    - time_limit
+    - start_time  # Maybe Null
+    - end_time  # Maybe Null
+    - expected_run_time (end_time - start_time)  # Maybe Null
+    - current_run_time (how long the job had already run when loading)  # Maybe zero
+    - trace_time (length of each trace in seconds)  # Maybe Null
+    - trace_start_time (time offset in seconds after which the trace starts)  # Maybe Null
+    - trace_end_time (time offset in seconds after which the trace ends)  # Maybe Null
+    - trace_quanta (job's associated trace quanta, to correctly replay with a different trace quanta)  # Maybe Null
+    has to be set for use within the simulation.
+
+    The values trace_start_time and trace_end_time are similar to telemetry_start and
+    telemetry_end but may differ, per job, due to missing data.
+
+    The returned values are these:
+    - The list of parsed jobs.
(as a Job object) + - telemetry_start: int (in seconds) + - telemetry_end: int (in seconds) + - start_date: datetime + """ + jobs: list[Job] + telemetry_start: int + telemetry_end: int + # TODO: It might make more sense to make start_timestep/end_timestep always unix time, then we + # wouldn't need this extra start_date field. + # Don't use AutoAwareDatetime here as we want to enforce dataloaders returning timezone info + start_date: A[AwareDatetime, AfterValidator(lambda d: d.astimezone(timezone.utc))] + + model_config = ConfigDict( + arbitrary_types_allowed=True, + ) diff --git a/raps/validators.py b/raps/validators.py new file mode 100644 index 0000000000000000000000000000000000000000..cb811dde244973bc6f779e76313dcc05135cb5fa --- /dev/null +++ b/raps/validators.py @@ -0,0 +1,12 @@ +def recompute_power(nodes, running_jobs, current_time): + node_power = {n['id']: 0.0 for n in nodes} + for j in running_jobs: + idx = max(0, current_time - j.start_time) + # Clamp index + idx = min(idx, len(j.cpu_trace)-1) + cpu_p = j.cpu_trace[idx] + gpu_p = j.gpu_trace[idx] if j.gpu_trace else 0 + nid = j.scheduled_nodes[0] + node_power[nid] += cpu_p + gpu_p + total = sum(node_power.values()) + return node_power, total diff --git a/raps/weather.py b/raps/weather.py index b31f88e2ab3888f57893d2c6ef5c6e1cc7611549..655e8f35e2bcbceef954df01a33a3275ec8156f1 100644 --- a/raps/weather.py +++ b/raps/weather.py @@ -7,7 +7,7 @@ urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) class Weather: - def __init__(self, iso_string, config): + def __init__(self, start: datetime | None, config): """ Initialize the Weather class with configuration loaded from a JSON file. If zip_code and country_code are provided, the coordinates (lat, lon) @@ -20,13 +20,7 @@ class Weather: self.lon = None self.weather_cache = {} # Cache for storing weather data for the entire day self.has_coords = False - self.start = None - - try: - # Convert the ISO 8601 string to a datetime object - self.start = datetime.fromisoformat(iso_string.replace("Z", "+00:00")) - except ValueError: - print("Invalid ISO 8601 datetime string specified for --start. Using default temperature instead.") + self.start = start # Retrieve coordinates if zip_code and country_code are provided if self.zip_code and self.country_code: @@ -52,13 +46,14 @@ class Weather: if not self.zip_code or not self.country_code: print("Error: ZIP code or country code is not specified.") return None, None - - geocoding_url = f'https://nominatim.openstreetmap.org/search?postalcode={self.zip_code}&country={self.country_code}&format=json' + + geocoding_url = "https://nominatim.openstreetmap.org/search?" + \ + f"postalcode={self.zip_code}&country={self.country_code}&format=json" headers = { 'User-Agent': 'ExaDigiT' # Custom User-Agent header } response = requests.get(geocoding_url, headers=headers, verify=False) # Disable SSL verification temporarily - + # Check for successful response if response.status_code == 200: try: @@ -82,10 +77,12 @@ class Weather: if self.lat is None or self.lon is None: print("Error: Latitude and longitude are not set. Please provide valid ZIP code and country code.") return - - weather_url = f'https://archive-api.open-meteo.com/v1/archive?latitude={self.lat}&longitude={self.lon}&start_date={date}&end_date={date}&temperature_unit=celsius&hourly=temperature_2m' + + weather_url = "https://archive-api.open-meteo.com/v1/archive?" 
+ \ + f"latitude={self.lat}&longitude={self.lon}&" + \ + f"start_date={date}&end_date={date}&temperature_unit=celsius&hourly=temperature_2m" response = requests.get(weather_url, verify=False) # Disable SSL verification temporarily - + # Check for successful response if response.status_code == 200: try: @@ -93,7 +90,7 @@ class Weather: if 'hourly' in data and 'temperature_2m' in data['hourly']: times = data['hourly']['time'] temperatures = data['hourly']['temperature_2m'] - + # Cache the weather data for fast lookup for i, time in enumerate(times): temp_celsius = temperatures[i] @@ -108,7 +105,6 @@ class Weather: else: print(f"Error fetching weather data. Status Code: {response.status_code}") - def get_temperature(self, target_datetime): """ Get temperature for a specific datetime from cached data. @@ -116,13 +112,13 @@ class Weather: if not self.has_coords: print("Error: Latitude and longitude are not set. Please provide valid ZIP code and country code.") return None - + # Round target_datetime to the nearest previous hour target_hour = target_datetime.replace(minute=0, second=0, microsecond=0) - + # Convert to string format without timezone info to match cache format target_hour_str = target_hour.isoformat(timespec='minutes').replace('+00:00', '') # Remove timezone information - + # Retrieve from cache if target_hour_str in self.weather_cache: return self.weather_cache[target_hour_str] diff --git a/raps/workload.py b/raps/workload.py deleted file mode 100644 index c5dc89845dc3248119ce808bec1401f232cf0242..0000000000000000000000000000000000000000 --- a/raps/workload.py +++ /dev/null @@ -1,227 +0,0 @@ -""" -Module for generating workload traces and jobs. - -This module provides functionality for generating random workload traces and -jobs for simulation and testing purposes. - -Attributes ----------- -TRACE_QUANTA : int - The time interval in seconds for tracing workload utilization. -MAX_NODES_PER_JOB : int - The maximum number of nodes required for a job. -JOB_NAMES : list - List of possible job names for random job generation. -CPUS_PER_NODE : int - Number of CPUs per node. -GPUS_PER_NODE : int - Number of GPUs per node. -MAX_WALL_TIME : int - Maximum wall time for a job in seconds. -MIN_WALL_TIME : int - Minimum wall time for a job in seconds. -JOB_END_PROBS : list - List of probabilities for different job end states. - -""" - -import random -import numpy as np - -from .job import job_dict - -JOB_NAMES = ["LAMMPS", "GROMACS", "VASP", "Quantum ESPRESSO", "NAMD",\ - "OpenFOAM", "WRF", "AMBER", "CP2K", "nek5000", "CHARMM",\ - "ABINIT", "Cactus", "Charm++", "NWChem", "STAR-CCM+",\ - "Gaussian", "ANSYS", "COMSOL", "PLUMED", "nekrs",\ - "TensorFlow", "PyTorch", "BLAST", "Spark", "GAMESS",\ - "ORCA", "Simulink", "MOOSE", "ELK"] - -ACCT_NAMES = ["ACT01", "ACT02", "ACT03", "ACT04", "ACT05", "ACT06", "ACT07",\ - "ACT08", "ACT09", "ACT10", "ACT11", "ACT12", "ACT13", "ACT14"] - -MAX_PRIORITY = 500000 - -from .utils import truncated_normalvariate, determine_state, next_arrival - - -class Workload: - def __init__(self, *configs): - """ Initialize Workload with multiple configurations. """ - self.partitions = [config['system_name'] for config in configs] - self.config_map = {config['system_name']: config for config in configs} - - def compute_traces(self, cpu_util: float, gpu_util: float, wall_time: int, trace_quanta: int) -> tuple[np.ndarray, np.ndarray]: - """ Compute CPU and GPU traces based on mean CPU & GPU utilizations and wall time. 
""" - cpu_trace = cpu_util * np.ones(int(wall_time) // trace_quanta) - gpu_trace = gpu_util * np.ones(int(wall_time) // trace_quanta) - return (cpu_trace, gpu_trace) - - def generate_random_jobs(self, num_jobs: int) -> list[list[any]]: - """ Generate random jobs with specified number of jobs. """ - jobs = [] - for job_index in range(num_jobs): - # Randomly select a partition - partition = random.choice(self.partitions) - # Get the corresponding config for the selected partition - config = self.config_map[partition] - - nodes_required = random.randint(1, config['MAX_NODES_PER_JOB']) - name = random.choice(JOB_NAMES) - account = random.choice(ACCT_NAMES) - cpu_util = random.random() * config['CPUS_PER_NODE'] - gpu_util = random.random() * config['GPUS_PER_NODE'] - mu = (config['MAX_WALL_TIME'] + config['MIN_WALL_TIME']) / 2 - sigma = (config['MAX_WALL_TIME'] - config['MIN_WALL_TIME']) / 6 - wall_time = truncated_normalvariate(mu, sigma, config['MIN_WALL_TIME'], config['MAX_WALL_TIME']) // 3600 * 3600 - end_state = determine_state(config['JOB_END_PROBS']) - cpu_trace, gpu_trace = self.compute_traces(cpu_util, gpu_util, wall_time, config['TRACE_QUANTA']) - priority = random.randint(0, MAX_PRIORITY) - net_tx, net_rx = [], [] - - # Jobs arrive according to Poisson process - time_to_next_job = next_arrival(1 / config['JOB_ARRIVAL_TIME']) - - jobs.append(job_dict(nodes_required, name, account, cpu_trace, gpu_trace, net_tx, net_rx, \ - wall_time, end_state, None, time_to_next_job, None, priority, partition)) - - return jobs - - def random(self, **kwargs): - """ Generate random workload """ - num_jobs = kwargs.get('num_jobs', 0) - return self.generate_random_jobs(num_jobs=num_jobs) - - def peak(self, **kwargs): - """Peak power test for multiple partitions""" - jobs = [] - - # Iterate through each partition and get its configuration - for partition in self.partitions: - # Fetch the config for the current partition - config = self.config_map[partition] - - # Generate traces based on partition-specific configuration - cpu_util = config['CPUS_PER_NODE'] - gpu_util = config['GPUS_PER_NODE'] - cpu_trace, gpu_trace = self.compute_traces(cpu_util, gpu_util, 10800, config['TRACE_QUANTA']) - net_tx, net_rx = [], [] - - # Create job info for this partition - job_info = job_dict( - config['AVAILABLE_NODES'], # Nodes required - f"Max Test {partition}", # Name with partition label - ACCT_NAMES[0], # User account - cpu_trace, # CPU trace - gpu_trace, # GPU trace - net_tx, # Network transmit trace - net_rx, # Network receive trace - len(gpu_trace) * config['TRACE_QUANTA'], # Wall time - 'COMPLETED', # End state - None, # Scheduled nodes - 0, # Time to next job - None, # Job ID - 100, # Priority - partition # Partition name - ) - print(job_info) - jobs.append(job_info) # Add job to the list - - return jobs - - def idle(self, **kwargs): - """Idle power test for multiple partitions""" - - # List to hold jobs for all partitions - jobs = [] - - # Iterate through each partition and get its configuration - for partition in self.partitions: - # Fetch partition-specific configuration - config = self.config_map[partition] - - # Generate traces based on partition-specific configuration - cpu_util, gpu_util = 0, 0 # Idle test has zero utilization - cpu_trace, gpu_trace = self.compute_traces(cpu_util, gpu_util, 43200, config['TRACE_QUANTA']) # 12 hours - net_tx, net_rx = [], [] - - # Create job info for this partition - job_info = job_dict( - config['AVAILABLE_NODES'], # Nodes required - f"Idle Test {partition}", # Name with 
partition label - ACCT_NAMES[0], # User account - cpu_trace, # CPU trace - gpu_trace, # GPU trace - net_tx, # Network transmit trace - net_rx, # Network receive trace - len(gpu_trace) * config['TRACE_QUANTA'], # Wall time - 'COMPLETED', # End state - None, # Scheduled nodes - 0, # Time to next job - None, # Job ID - 100, # Priority - partition # Partition name - ) - jobs.append(job_info) # Add job to the list - - return jobs - - def benchmark(self, **kwargs): - """Benchmark tests for multiple partitions""" - - # List to hold jobs for all partitions - jobs = [] - account = ACCT_NAMES[0] - - # Iterate through each partition and its config - for partition in self.partitions: - # Fetch partition-specific configuration - config = self.config_map[partition] - net_tx, net_rx = [], [] - - # Max test - cpu_util, gpu_util = 1, 4 - cpu_trace, gpu_trace = self.compute_traces(cpu_util, gpu_util, 10800, config['TRACE_QUANTA']) - job_info = job_dict( - config['AVAILABLE_NODES'], - f"Max Test {partition}", account, - cpu_trace, gpu_trace, net_tx, net_rx, - len(gpu_trace) * config['TRACE_QUANTA'], 'COMPLETED', None, 100, None, 0, partition - ) - jobs.append(job_info) - - # OpenMxP run - cpu_util, gpu_util = 0, 4 - cpu_trace, gpu_trace = self.compute_traces(cpu_util, gpu_util, 3600, config['TRACE_QUANTA']) - job_info = job_dict( - config['AVAILABLE_NODES'], - f"OpenMxP {partition}", account, - cpu_trace, gpu_trace, net_tx, net_rx, - len(gpu_trace) * config['TRACE_QUANTA'], 'COMPLETED', None, 300, None, 0, partition - ) - jobs.append(job_info) - - # HPL run - cpu_util, gpu_util = 0.33, 0.79 * 4 # based on 24-01-18 run - cpu_trace, gpu_trace = self.compute_traces(cpu_util, gpu_util, 3600, config['TRACE_QUANTA']) - job_info = job_dict( - config['AVAILABLE_NODES'], - f"HPL {partition}", account, - cpu_trace, gpu_trace, net_tx, net_rx, - len(gpu_trace) * config['TRACE_QUANTA'], 'COMPLETED', None, 200, None, 0, partition - ) - jobs.append(job_info) - - # Idle test - cpu_util, gpu_util = 0, 0 - cpu_trace, gpu_trace = self.compute_traces(cpu_util, gpu_util, 3600, config['TRACE_QUANTA']) - job_info = job_dict( - config['AVAILABLE_NODES'], - f"Idle Test {partition}", account, - cpu_trace, gpu_trace, net_tx, net_rx, - len(gpu_trace) * config['TRACE_QUANTA'], 'COMPLETED', None, 0, None, 0, partition - ) - jobs.append(job_info) - - return jobs - diff --git a/raps/workloads/__init__.py b/raps/workloads/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d7891964bcb6711f851c19de44e666c78e1fbd94 --- /dev/null +++ b/raps/workloads/__init__.py @@ -0,0 +1,112 @@ +"""Workloads package init.""" + +import math +import numpy as np +import pandas as pd + +from raps.utils import WorkloadData, SubParsers +from raps.utils import pydantic_add_args, create_file_indexed +from raps.sim_config import SingleSimConfig +from raps.telemetry import Telemetry + +from .basic import BasicWorkload +from .calculon import Calculon +from .constants import JOB_NAMES, ACCT_NAMES, MAX_PRIORITY +from .distribution import DistributionWorkload +from .hpl import HPL +from .live import continuous_job_generation +from .multitenant import MultitenantWorkload +from .network import NetworkTestWorkload +from .inter_job_congestion import InterJobCongestionWorkload +from .utils import plot_job_hist + + +class BaseWorkload: + """Base class with common workload logic.""" + + def __init__(self, args, *configs): + self.partitions = [c['system_name'] for c in configs] + self.config_map = {c['system_name']: c for c in configs} + self.args = 
args + + def generate_jobs(self): + jobs = getattr(self, self.args.workload)(args=self.args) + timestep_end = int(math.ceil(max([job.end_time for job in jobs]))) + now = pd.Timestamp.now('UTC').floor("min").to_pydatetime() + return WorkloadData( + jobs=jobs, + telemetry_start=0, + telemetry_end=timestep_end, + start_date=self.args.start if self.args.start else now, + ) + + def compute_traces(self, + cpu_util: float, + gpu_util: float, + expected_run_time: int, + trace_quanta: int + ) -> tuple[np.ndarray, np.ndarray]: + """ Compute CPU and GPU traces based on mean CPU & GPU utilizations and wall time. """ + cpu_trace = cpu_util * np.ones(int(expected_run_time) // trace_quanta) + gpu_trace = gpu_util * np.ones(int(expected_run_time) // trace_quanta) + return (cpu_trace, gpu_trace) + +class Workload( + BaseWorkload, + DistributionWorkload, + BasicWorkload, + MultitenantWorkload, + NetworkTestWorkload, + InterJobCongestionWorkload, + Calculon, + HPL +): + """Final workload class with all workload types.""" + pass + +__all__ = [ + "Workload", + "JOB_NAMES", "ACCT_NAMES", "MAX_PRIORITY", +] + + +def run_workload_add_parser(subparsers: SubParsers): + from raps.sim_config import SIM_SHORTCUTS + # TODO: Separate the arguments for this command + parser = subparsers.add_parser("workload", description=""" + Saves workload as a snapshot. + """) + parser.add_argument("config_file", nargs="?", default=None, help=""" + YAML sim config file, can be used to configure an experiment instead of using CLI + flags. Pass "-" to read from stdin. + """) + model_validate = pydantic_add_args(parser, SingleSimConfig, model_config={ + "cli_shortcuts": SIM_SHORTCUTS, + }) + parser.set_defaults(impl=lambda args: run_workload(model_validate(args, {}))) + + +def run_workload(sim_config: SingleSimConfig): + args = sim_config.get_legacy_args() + args_dict = sim_config.get_legacy_args() + config = sim_config.system_configs[0].get_legacy() + + if sim_config.replay: + td = Telemetry(**args_dict) + jobs = td.load_from_files(sim_config.replay).jobs + else: + workload = Workload(args, config) + jobs = getattr(workload, sim_config.workload)(args=sim_config.get_legacy_args()) + plot_job_hist(jobs, + config=config, + dist_split=sim_config.multimodal, + gantt_nodes=sim_config.gantt_nodes) + + out = sim_config.get_output() + if out: + timestep_start = min([x.submit_time for x in jobs]) + timestep_end = math.ceil(max([x.submit_time for x in jobs]) + max([x.expected_run_time for x in jobs])) + filename = create_file_indexed('wl', path=str(out), create=False, ending="npz").split(".npz")[0] + # savez_compressed add npz itself, but create_file_indexed needs to check for .npz to find existing files + np.savez_compressed(filename, jobs=jobs, timestep_start=timestep_start, timestep_end=timestep_end, args=args) + print(filename + ".npz") # To std-out to show which npz was created. 
diff --git a/raps/workloads/basic.py b/raps/workloads/basic.py new file mode 100644 index 0000000000000000000000000000000000000000..dbe2dfd9278feaa379345ba030d484c8acca0fe2 --- /dev/null +++ b/raps/workloads/basic.py @@ -0,0 +1,422 @@ +import math +import random + +from raps.job import Job, job_dict +from raps.utils import ( + truncated_normalvariate_int, + determine_state, + next_arrival, + next_arrival_byconfargs, +) + +from .constants import JOB_NAMES, ACCT_NAMES, MAX_PRIORITY + + +class BasicWorkload: + + # Test for random 'reasonable' AI jobs + def randomAI(self, **kwargs): + args = kwargs.get('args', None) + jobs = [] + for i in range(args.numjobs): + draw = random.randint(0, 10) + if draw != 0: + et = random.randint(7200, 28800) + nr = random.choice([128, 256, 512, 1024, 1280, 1792, 2048]) + new_job = Job(job_dict(nodes_required=nr, + name="LLM Production", + account="llmUser", + end_state="Success", + id=random.randint(1, 99999), + cpu_trace=0.1, + gpu_trace=(random.uniform(0.55, 0.8) + * self.config_map[self.args.system]['GPUS_PER_NODE']), + ntx_trace=None, + nrx_trace=None, + submit_time=0, + time_limit=random.randint(43200, 43200), + start_time=0, + end_time=et, + expected_run_time=et)) + else: + et = random.randint(300, 7200) + nr = random.choice([1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 128]) + new_job = Job(job_dict(nodes_required=nr, + name="User-Test LLM", + account="llmUser", + end_state="Success", + id=random.randint(1, 99999), + cpu_trace=1, + gpu_trace=(0.2 * self.config_map[self.args.system]['GPUS_PER_NODE']), + ntx_trace=None, + nrx_trace=None, + submit_time=0, + time_limit=43200, + start_time=0, + end_time=et, + expected_run_time=random.randint(60, 7200))) + jobs.append(new_job) + return jobs + + def synthetic(self, **kwargs): + args = kwargs.get('args', None) + print(args) + total_jobs = args.numjobs + orig_job_size_distribution = args.jobsize_distribution + orig_wall_time_distribution = args.walltime_distribution + orig_cpuutil_distribution = args.cpuutil_distribution + orig_gpuutil_distribution = args.gpuutil_distribution + jobs = [] + if len(args.jobsize_distribution) != 1 and sum(args.multimodal) != 1.0: + raise Exception(f"Sum of --multimodal != 1.0 : {args.multimodal} == {sum(args.multimodal)}") + for i, (jsdist, wtdist, cudist, gudist, percentage) in enumerate(zip(args.jobsize_distribution, + args.walltime_distribution, + args.cpuutil_distribution, + args.gpuutil_distribution, + args.multimodal)): + + args.numjobs = math.floor(total_jobs * percentage) + args.jobsize_distribution = jsdist + args.walltime_distribution = wtdist + args.cpuutil_distribution = cudist + args.gpuutil_distribution = gudist + + job_arrival_distribution_to_draw_from = self.job_arrival_distribution_draw_poisson + match args.jobsize_distribution: + case "uniform": + job_size_distribution_to_draw_from = self.job_size_distribution_draw_uniform + case "normal": + job_size_distribution_to_draw_from = self.job_size_distribution_draw_normal + case "weibull": + job_size_distribution_to_draw_from = self.job_size_distribution_draw_weibull + case _: + raise NotImplementedError(args.jobsize_distribution) + + match args.walltime_distribution: + case "weibull": + wall_time_distribution_to_draw_from = self.wall_time_distribution_draw_weibull + case "normal": + wall_time_distribution_to_draw_from = self.wall_time_distribution_draw_normal + case "uniform": + wall_time_distribution_to_draw_from = self.wall_time_distribution_draw_uniform + case _: + raise NotImplementedError(args.walltime_distribution) + + match 
args.cpuutil_distribution: + case "uniform": + cpu_util_distribution_to_draw_from = self.cpu_utilization_distribution_draw_uniform + case "normal": + cpu_util_distribution_to_draw_from = self.cpu_utilization_distribution_draw_normal + case "weibull": + cpu_util_distribution_to_draw_from = self.cpu_utilization_distribution_draw_weibull + case _: + raise NotImplementedError(args.cpuutil_distribution) + + match args.gpuutil_distribution: + case "uniform": + gpu_util_distribution_to_draw_from = self.gpu_utilization_distribution_draw_uniform + case "normal": + gpu_util_distribution_to_draw_from = self.gpu_utilization_distribution_draw_normal + case "weibull": + gpu_util_distribution_to_draw_from = self.gpu_utilization_distribution_draw_weibull + case _: + raise NotImplementedError(args.gpuutil_distribution) + + new_jobs = self.generate_jobs_from_distribution( + job_arrival_distribution_to_draw_from=job_arrival_distribution_to_draw_from, + job_size_distribution_to_draw_from=job_size_distribution_to_draw_from, + cpu_util_distribution_to_draw_from=cpu_util_distribution_to_draw_from, + gpu_util_distribution_to_draw_from=gpu_util_distribution_to_draw_from, + wall_time_distribution_to_draw_from=wall_time_distribution_to_draw_from, + args=args) + next_arrival(0, reset=True) + jobs.extend(new_jobs) + args.numjobs = total_jobs + args.jobsize_distribution = orig_job_size_distribution + args.cpuutil_distribution = orig_cpuutil_distribution + args.gpuutil_distribution = orig_gpuutil_distribution + args.walltime_distribution = orig_wall_time_distribution + return jobs + + def generate_random_jobs(self, args) -> list[list[any]]: + """ Generate random jobs with specified number of jobs. """ + + partition = random.choice(self.partitions) + config = self.config_map[partition] + + # time_delta = args.time_delta # Unused + downscale = args.downscale + + config['MIN_WALL_TIME'] = config['MIN_WALL_TIME'] * downscale + config['MAX_WALL_TIME'] = config['MAX_WALL_TIME'] * downscale + jobs = [] + for job_index in range(args.numjobs): + # Randomly select a partition + # Get the corresponding config for the selected partition + nodes_required = random.randint(1, config['MAX_NODES_PER_JOB']) + name = random.choice(JOB_NAMES) + account = random.choice(ACCT_NAMES) + cpu_util = random.random() * config['CPUS_PER_NODE'] + gpu_util = random.random() * config['GPUS_PER_NODE'] + mu = (config['MAX_WALL_TIME'] + config['MIN_WALL_TIME']) / 2 + sigma = (config['MAX_WALL_TIME'] - config['MIN_WALL_TIME']) / 6 + wall_time = (truncated_normalvariate_int( + mu, sigma, config['MIN_WALL_TIME'], config['MAX_WALL_TIME']) // (3600 * downscale) * (3600 * downscale)) + time_limit = (truncated_normalvariate_int(mu, sigma, wall_time, + config['MAX_WALL_TIME']) // (3600 * downscale) * (3600 * downscale)) + # print(f"wall_time: {wall_time//downscale}") + # print(f"time_limit: {time_limit//downscale}") + end_state = determine_state(config['JOB_END_PROBS']) + cpu_trace, gpu_trace = self.compute_traces(cpu_util, gpu_util, wall_time, config['TRACE_QUANTA']) + priority = random.randint(0, MAX_PRIORITY) + net_tx, net_rx = None, None + + # Jobs arrive according to Poisson process + time_to_next_job = int(next_arrival_byconfargs(config, args)) + # wall_time = wall_time * downscale + # time_limit = time_limit * downscale + + job_info = job_dict(nodes_required=nodes_required, name=name, + account=account, cpu_trace=cpu_trace, + gpu_trace=gpu_trace, ntx_trace=net_tx, + nrx_trace=net_rx, end_state=end_state, + id=job_index, priority=priority, + 
partition=partition, + submit_time=time_to_next_job - 100, + time_limit=time_limit, + start_time=time_to_next_job, + end_time=time_to_next_job + wall_time, + expected_run_time=wall_time, trace_time=wall_time, + trace_start_time=0, trace_end_time=wall_time, + trace_quanta=config['TRACE_QUANTA'] * downscale, + downscale=downscale + ) + job = Job(job_info) + jobs.append(job) + return jobs + + def random(self, **kwargs): + """ Generate random workload """ + args = kwargs.get('args', None) + return self.generate_random_jobs(args=args) + + def peak(self, **kwargs): + """Peak power test for multiple partitions""" + jobs = [] + + # Iterate through each partition and get its configuration + for partition in self.partitions: + # Fetch the config for the current partition + config = self.config_map[partition] + + # Generate traces based on partition-specific configuration + cpu_util = config['CPUS_PER_NODE'] + gpu_util = config['GPUS_PER_NODE'] + cpu_trace, gpu_trace = self.compute_traces(cpu_util, gpu_util, 10800, config['TRACE_QUANTA']) + net_tx, net_rx = None, None + + job_time = len(gpu_trace) * config['TRACE_QUANTA'] + # Create job info for this partition + job_info = job_dict(nodes_required=config['AVAILABLE_NODES'], + # Down nodes, therefore doesnt work list(range(config['AVAILABLE_NODES'])), + scheduled_nodes=[], + name=f"Max Test {partition}", + account=ACCT_NAMES[0], + cpu_trace=cpu_trace, + gpu_trace=gpu_trace, + ntx_trace=net_tx, + nrx_trace=net_rx, + end_state='COMPLETED', + id=None, + priority=100, + partition=partition, + time_limit=job_time + 1, + start_time=0, + end_time=job_time, + expected_run_time=job_time, + trace_time=job_time, + trace_start_time=0, + trace_end_time=job_time, + trace_quanta=config['TRACE_QUANTA'] + ) + job = Job(job_info) + jobs.append(job) # Add job to the list + + return jobs + + def idle(self, **kwargs): + jobs = [] + # Iterate through each partition and get its configuration + for partition in self.partitions: + # Fetch the config for the current partition + config = self.config_map[partition] + + # Generate traces based on partition-specific configuration + cpu_util, gpu_util = 0, 0 + cpu_trace, gpu_trace = self.compute_traces(cpu_util, gpu_util, 10800, config['TRACE_QUANTA']) + net_tx, net_rx = None, None + + job_time = len(gpu_trace) * config['TRACE_QUANTA'] + # Create job info for this partition + job_info = job_dict( + nodes_required=config['AVAILABLE_NODES'], + name=f"Idle Test {partition}", + account=ACCT_NAMES[0], + cpu_trace=cpu_trace, + gpu_trace=gpu_trace, + ntx_trace=net_tx, + nrx_trace=net_rx, + end_state='COMPLETED', + scheduled_nodes=[], # list(range(config['AVAILABLE_NODES'])), + id=None, + priority=100, + partition=partition, + time_limit=job_time + 1, + submit_time=0, + start_time=0, + end_time=job_time, + expected_run_time=job_time, + trace_time=job_time, + trace_start_time=0, + trace_end_time=job_time, + trace_quanta=config['TRACE_QUANTA']) + job = Job(job_info) + jobs.append(job) # Add job to the list + + return jobs + + def benchmark(self, **kwargs): + """Benchmark tests for multiple partitions""" + + # List to hold jobs for all partitions + jobs = [] + account = ACCT_NAMES[0] + # Iterate through each partition and its config + for partition in self.partitions: + # Fetch partition-specific configuration + config = self.config_map[partition] + net_tx, net_rx = None, None + + # Max test + cpu_util, gpu_util = 1, 4 + cpu_trace, gpu_trace = self.compute_traces(cpu_util, gpu_util, 10800, config['TRACE_QUANTA']) + + job_time = len(gpu_trace) 
* config['TRACE_QUANTA'] + + job_info = job_dict( + nodes_required=config['AVAILABLE_NODES'], + scheduled_nodes=[], # Explicit scheduled nodes will not work due to down nodes + name=f"Max Test {partition}", + account=account, + cpu_trace=cpu_trace, + gpu_trace=gpu_trace, + ntx_trace=net_tx, + nrx_trace=net_rx, + end_state='COMPLETED', + id=None, + priority=100, + partition=partition, + submit_time=0, + time_limit=job_time + 1, + start_time=0, + end_time=job_time, + expected_run_time=job_time, + trace_time=job_time, + trace_start_time=0, + trace_end_time=job_time, + trace_missing_values=False, + trace_quanta=config['TRACE_QUANTA']) + job = Job(job_info) + jobs.append(job) + + # OpenMxP run + cpu_util, gpu_util = 0, 4 + cpu_trace, gpu_trace = self.compute_traces(cpu_util, gpu_util, 3600, config['TRACE_QUANTA']) + job_time = len(gpu_trace) * config['TRACE_QUANTA'] + + job_info = job_dict( + nodes_required=config['AVAILABLE_NODES'], + scheduled_nodes=[], # Explicit scheduled nodes will not work due to down nodes + name=f"OpenMxP {partition}", + account=account, + cpu_trace=cpu_trace, + gpu_trace=gpu_trace, + ntx_trace=net_tx, + nrx_trace=net_rx, + end_state='COMPLETED', + id=None, + priority=100, + partition=partition, + submit_time=0, + time_limit=job_time + 1, + start_time=10800, + end_time=14200, + expected_run_time=job_time, + trace_time=job_time, + trace_start_time=0, + trace_end_time=job_time, + trace_missing_values=False, + trace_quanta=config['TRACE_QUANTA']) + job = Job(job_info) + jobs.append(job) + + # HPL run + cpu_util, gpu_util = 0.33, 0.79 * 4 # based on 24-01-18 run + cpu_trace, gpu_trace = self.compute_traces(cpu_util, gpu_util, 3600, config['TRACE_QUANTA']) + job_time = len(gpu_trace) * config['TRACE_QUANTA'] + job_info = job_dict( + nodes_required=config['AVAILABLE_NODES'], + scheduled_nodes=[], # Explicit scheduled nodes will not work due to down nodes + name=f"HPL {partition}", + account=account, + cpu_trace=cpu_trace, + gpu_trace=gpu_trace, + ntx_trace=net_tx, + nrx_trace=net_rx, + end_state='COMPLETED', + id=None, + priority=100, + partition=partition, + submit_time=0, + time_limit=job_time + 1, + start_time=14200, + end_time=17800, + expected_run_time=job_time, + trace_time=job_time, + trace_start_time=0, + trace_end_time=job_time, + trace_missing_values=False, + trace_quanta=config['TRACE_QUANTA']) + job = Job(job_info) + jobs.append(job) + + # Idle test + cpu_trace, gpu_trace = self.compute_traces(cpu_util, gpu_util, 3600, config['TRACE_QUANTA']) + job_time = len(gpu_trace) * config['TRACE_QUANTA'] + job_info = job_dict( + nodes_required=config['AVAILABLE_NODES'], + scheduled_nodes=[], # Explicit scheduled nodes will not work due to down nodes + name=f"Idle Test {partition}", + account=account, + cpu_trace=cpu_trace, + gpu_trace=gpu_trace, + ntx_trace=net_tx, + nrx_trace=net_rx, + end_state='COMPLETED', + id=None, + priority=100, + partition=partition, + submit_time=0, + time_limit=job_time + 1, + start_time=17800, + end_time=21400, + expected_run_time=job_time, + trace_time=job_time, + trace_start_time=0, + trace_end_time=job_time, + trace_missing_values=False, + trace_quanta=config['TRACE_QUANTA']) + job = Job(job_info) + jobs.append(job) + + return jobs diff --git a/raps/workloads/calculon.py b/raps/workloads/calculon.py new file mode 100644 index 0000000000000000000000000000000000000000..f843084eafa4e6a547288074922da29a1761165a --- /dev/null +++ b/raps/workloads/calculon.py @@ -0,0 +1,180 @@ +""" +Calculon is a analytical model for estimating LLM training times 
for given architectures +on particular hardware. It is described in the paper: + + Isaev, Mikhail, et al. "Calculon: a methodology and tool for high-level co-design of + systems and large language models." SC23 Proceedings + https://dl.acm.org/doi/pdf/10.1145/3581784.3607102 + +The code is available at https://github.com/calculon-ai/calculon +which this module assumes is already cloned into the third_party directory. + +Calculon requires installing `psutil`, which can be pip installed via: + + pip install psutil + +Since Calculon by default supports A100 GPUs, we are able to use the default files that +are already setup in Calculon, and therefore have added two systems which have A100 GPUs: +Selene and Perlmutter. Example run commands: + + python main.py run --system selene -w calculon + python main.py run --system perlmutter -w calculon + +This code is currently setup to generate synthetic traces for four different LLM models: +megatron-22B, gpt3-175B, turing-530B, and megatron-1T. These four tests can take a couple +**hours** to run. On first run, consider commenting out the last three models to only test +the smallest case, megatron-22B. The parameter `llm_models_tests` below defines which tests +are run. + +Finally, the code below is setup to uses previously cached results, so once the json +files are generated by Calculon, they can be rerun very quickly again and again. +The caveat to this is if you want to change some Calculon configurations, +you will need to delete the cached json files in the calculon/optimal_executions folder, +to force it to regenerate new files. + +""" +import math +import json +import os +import random +import subprocess +from pathlib import Path + +import numpy as np + +from raps.job import Job, job_dict + +from .constants import ACCT_NAMES + + +class Calculon: + """Calculon workload mixin for Workload class.""" + + def __init__(self, *args, **kwargs): + # NOTE: mixins usually accept (sim_config_args, system_config_dict) through Workload + super().__init__(*args, **kwargs) + + def calculon(self, **kwargs): + """Generate workload using Calculon backend + job trace synthesis.""" + jobs = [] + + llm_models_test = [ + ["megatron-22B", 8, 4], + ["gpt3-175B", 64, 64], + ["turing-530B", 280, 280], + ["megatron-1T", 512, 512], + ] + + for llm_model, num_nodes, max_batch_size in llm_models_test: + for partition in self.partitions: + config = self.config_map[partition] + gpu_system = "a100_80g" + data_type = "float16" + output = f"{llm_model}_{gpu_system}_{max_batch_size}_{data_type}_{num_nodes}.json" + + # call Calculon binary/subprocess to get MFU + batch time + mfu, total_batch_time = self._run_calculon( + llm_model, gpu_system, max_batch_size, num_nodes, data_type, output + ) + + # derive job stats + num_iters = 1000000 # realistic number is probably in the millions + trace_quanta = config["TRACE_QUANTA"] + + job_time = total_batch_time * num_iters + num_samples = math.ceil(job_time / trace_quanta) + 1 + end_time = num_samples * trace_quanta # align job to tick grid + + # use random CPU utilizations for now + cpu_util = random.random() * config["CPUS_PER_NODE"] + cpu_trace = np.full(num_samples, cpu_util) # same length + gpu_trace = np.full(num_samples, mfu) # length matches simulation steps + + net_tx, net_rx = [], [] + num_nodes = num_nodes // config["GPUS_PER_NODE"] + + epochs = 1 + wall_time = job_time + for i in range(epochs): + job_info = job_dict( + nodes_required=num_nodes, + scheduled_nodes=[], + name=f"{llm_model} training for {num_iters} iterations", + 
account=ACCT_NAMES[0], + cpu_trace=cpu_trace, + gpu_trace=gpu_trace, + ntx_trace=net_tx, + nrx_trace=net_rx, + end_state="COMPLETED", + id=None, + priority=100, + partition=partition, + time_limit=job_time + 1, + start_time=0, + end_time=end_time, + expected_run_time=end_time, + trace_quanta=trace_quanta, + trace_time=job_time, + trace_start_time=0, + trace_end_time=job_time, + ) + job = Job(job_info) + jobs.append(job) + wall_time += job_time + + return jobs + + def _run_calculon(self, model, system, max_batch_size, num_nodes, data_type, output): + """Internal: run Calculon subprocess and parse result.""" + base_path = Path("third_party/calculon") + output_dir = base_path / "optimal_executions" + output_dir.mkdir(exist_ok=True) + + # expected files + raw_file = output_dir / f"{output.replace('.json', '_raw.json')}" + exec_file = output_dir / f"{output.replace('.json', '_exec.json')}" + stats_file = output_dir / f"{output.replace('.json', '_stats.json')}" + + # if all three exist, skip running + if raw_file.exists() and exec_file.exists() and stats_file.exists(): + print(f"[INFO] Using cached Calculon results for {output}") + with open(raw_file) as f: + data = json.load(f) + first_key = list(data.keys())[0] + stats = data[first_key]["stats"] + mfu = stats.get("sample_rate", 0) # or compute MFU if you want + batch_time = stats.get("block_fw_time", 0) # example placeholder + return mfu, batch_time + + # otherwise, run Calculon + opt_cmd = [ + "./bin/calculon", "llm-optimal-execution", + f"models/{model}.json", + str(num_nodes), + str(max_batch_size), + data_type, + f"systems/{system}.json", + str(raw_file), + ] + + llm_cmd = [ + "./bin/calculon", "llm", + f"models/{model}.json", + str(exec_file), + f"systems/{system}.json", + str(stats_file), + ] + + subprocess.run(opt_cmd, check=True, cwd=base_path, env={**os.environ, "PYTHONPATH": "."}) + subprocess.run(llm_cmd, check=True, cwd=base_path, env={**os.environ, "PYTHONPATH": "."}) + + # parse output + with open(raw_file) as f: + data = json.load(f) + first_key = list(data.keys())[0] + stats = data[first_key]["stats"] + + mfu = stats.get("sample_rate", 0) + batch_time = stats.get("block_fw_time", 0) + + return mfu, batch_time diff --git a/raps/workloads/constants.py b/raps/workloads/constants.py new file mode 100644 index 0000000000000000000000000000000000000000..2ffb39e943b8b73faf271978b741f699875b8801 --- /dev/null +++ b/raps/workloads/constants.py @@ -0,0 +1,13 @@ +"""Shared constants for workloads.""" + +JOB_NAMES = [ + "LAMMPS", "GROMACS", "VASP", "Quantum ESPRESSO", "NAMD", + "OpenFOAM", "WRF", "AMBER", "CP2K", "nek5000", "CHARMM", + "ABINIT", "Cactus", "Charm++", "NWChem", "STAR-CCM+", + "Gaussian", "ANSYS", "COMSOL", "PLUMED", "nekrs", + "TensorFlow", "PyTorch", "BLAST", "Spark", "GAMESS", + "ORCA", "Simulink", "MOOSE", "ELK" +] + +ACCT_NAMES = [f"ACT{i:02d}" for i in range(1, 15)] +MAX_PRIORITY = 500000 diff --git a/raps/workloads/distribution.py b/raps/workloads/distribution.py new file mode 100644 index 0000000000000000000000000000000000000000..8f3cc6f127b603d040cac9c257467efa1cdb7783 --- /dev/null +++ b/raps/workloads/distribution.py @@ -0,0 +1,188 @@ +import math +import random + +from raps.job import Job, job_dict +from raps.utils import ( + truncated_normalvariate_int, + truncated_normalvariate_float, + truncated_weibull, + truncated_weibull_float, + determine_state, + next_arrival_byconfargs, +) + +from .constants import JOB_NAMES, ACCT_NAMES, MAX_PRIORITY + +class DistributionWorkload: + + def 
job_arrival_distribution_draw_poisson(self, args, config): + return next_arrival_byconfargs(config, args) + + + def job_size_distribution_draw_uniform(self, args, config): + min_v = 1 + max_v = config['MAX_NODES_PER_JOB'] + if (args.jobsize_is_power_of is not None): + base = args.jobsize_is_power_of + possible_jobsizes = [base ** exp for exp in range(min_v, int(math.floor(math.log(max_v, base))))] + selection = random.randint(0, len(possible_jobsizes) - 1) + number = possible_jobsizes[selection] + elif (args.jobsize_is_of_degree is not None): + exp = args.jobsize_is_of_degree + possible_jobsizes = [base ** exp for base in range(min_v, int(math.floor(pow(max_v, 1 / exp))))] + selection = random.randint(0, len(possible_jobsizes) - 1) + number = possible_jobsizes[selection] + else: + number = random.randint(1, config['MAX_NODES_PER_JOB']) + return number + + + def job_size_distribution_draw_weibull(self, args, config): + min_v = 1 + max_v = config['MAX_NODES_PER_JOB'] + if (args.jobsize_is_power_of is not None): + base = args.jobsize_is_power_of + possible_jobsizes = [base ** exp for exp in range(min_v, int(math.floor(math.log(max_v, base))))] + scale = math.log(args.jobsize_weibull_scale, base) + shape = math.log(args.jobsize_weibull_shape, base) + selection = truncated_weibull(scale, shape, 0, len(possible_jobsizes) - 1) + number = possible_jobsizes[selection] + elif (args.jobsize_is_of_degree is not None): + exp = args.jobsize_is_of_degree + possible_jobsizes = [base ** exp for base in range(min_v, int(math.floor(pow(max_v, 1 / exp))))] + scale = math.pow(args.jobsize_weibull_scale, 1 / exp) + shape = math.pow(args.jobsize_weibull_shape, 1 / exp) + selection = truncated_weibull(scale, shape, 0, len(possible_jobsizes) - 1) + number = possible_jobsizes[selection] + else: + number = truncated_weibull(args.jobsize_weibull_scale, args.jobsize_weibull_shape, + 1, config['MAX_NODES_PER_JOB']) + return number + + + def job_size_distribution_draw_normal(self, args, config): + min_v = 1 + max_v = config['MAX_NODES_PER_JOB'] + if (args.jobsize_is_power_of is not None): + base = args.jobsize_is_power_of + possible_jobsizes = [base ** exp for exp in range(min_v, int(math.floor(math.log(max_v, base))))] + mean = math.log(args.jobsize_normal_mean, base) + stddev = math.log(args.jobsize_normal_stddev, base) # (len(possible_jobsizes) / (max_v - min_v)) + selection = truncated_normalvariate_int(mean, stddev, 0, len(possible_jobsizes) - 1) + number = possible_jobsizes[selection - 1] + elif (args.jobsize_is_of_degree is not None): + exp = args.jobsize_is_of_degree + possible_jobsizes = [base ** exp for base in range(min_v, int(math.floor(pow(max_v, 1 / exp))))] + mean = math.pow(args.jobsize_normal_mean, 1 / exp) + stddev = math.pow(args.jobsize_normal_stddev, 1 / exp) + selection = truncated_weibull(mean, stddev, 0, len(possible_jobsizes) - 1) + number = possible_jobsizes[selection] + else: + number = truncated_normalvariate_int( + args.jobsize_normal_mean, args.jobsize_normal_stddev, 1, config['MAX_NODES_PER_JOB']) + return number + + + def cpu_utilization_distribution_draw_uniform(self, args, config): + return random.uniform(0.0, config['CPUS_PER_NODE']) + + + def cpu_utilization_distribution_draw_normal(self, args, config): + return truncated_normalvariate_float(args.cpuutil_normal_mean, + args.cpuutil_normal_stddev, + 0.0, config['CPUS_PER_NODE']) + + + def cpu_utilization_distribution_draw_weibull(self, args, config): + return truncated_weibull_float(args.cpuutil_weibull_scale, + 
args.cpuutil_weibull_shape, + 0.0, config['CPUS_PER_NODE']) + + + def gpu_utilization_distribution_draw_uniform(self, args, config): + return random.uniform(0.0, config['GPUS_PER_NODE']) + + + def gpu_utilization_distribution_draw_normal(self, args, config): + return truncated_normalvariate_float(args.gpuutil_normal_mean, + args.gpuutil_normal_stddev, + 0.0, config['GPUS_PER_NODE']) + + + def gpu_utilization_distribution_draw_weibull(self, args, config): + return truncated_weibull_float(args.gpuutil_weibull_scale, + args.gpuutil_weibull_shape, + 0.0, config['GPUS_PER_NODE']) + + + def wall_time_distribution_draw_uniform(self, args, config): + return random.uniform(config['MIN_WALL_TIME'], config['MAX_WALL_TIME']) + + + def wall_time_distribution_draw_normal(self, args, config): + return max(1, truncated_normalvariate_int(float(args.walltime_normal_mean), + float(args.walltime_normal_stddev), config['MIN_WALL_TIME'], + config['MAX_WALL_TIME']) / 3600 * 3600) + + + def wall_time_distribution_draw_weibull(self, args, config): + return truncated_weibull(args.walltime_weibull_scale, + args.walltime_weibull_shape, + config['MIN_WALL_TIME'], config['MAX_WALL_TIME']) + + + def generate_jobs_from_distribution(self, *, + job_arrival_distribution_to_draw_from, + job_size_distribution_to_draw_from, + cpu_util_distribution_to_draw_from, + gpu_util_distribution_to_draw_from, + wall_time_distribution_to_draw_from, + args + ) -> list[list[any]]: + jobs = [] + partition = random.choice(self.partitions) + config = self.config_map[partition] + for job_index in range(args.numjobs): + submit_time = int(job_arrival_distribution_to_draw_from(args, config)) + start_time = submit_time + nodes_required = job_size_distribution_to_draw_from(args, config) + name = random.choice(JOB_NAMES) + account = random.choice(ACCT_NAMES) + cpu_util = cpu_util_distribution_to_draw_from(args, config) + if "CORES_PER_CPU" in config: + cpu_cores_required = random.randint(0, config["CORES_PER_CPU"]) + else: + cpu_cores_required = None + gpu_util = gpu_util_distribution_to_draw_from(args, config) + if "GPUS_PER_NODE" in config: + if isinstance(gpu_util, list): + gpu_units_required = random.randint(0, max(config["GPUS_PER_NODE"], math.ceil(max(gpu_util)))) + else: + gpu_units_required = random.randint(0, max(config["GPUS_PER_NODE"], math.ceil(gpu_util))) + wall_time = wall_time_distribution_to_draw_from(args, config) + end_time = start_time + wall_time + time_limit = max(wall_time, wall_time_distribution_to_draw_from(args, config)) + end_state = determine_state(config['JOB_END_PROBS']) + cpu_trace = cpu_util # self.compute_traces(cpu_util, gpu_util, wall_time, config['TRACE_QUANTA']) + gpu_trace = gpu_util # self.compute_traces(cpu_util, gpu_util, wall_time, config['TRACE_QUANTA']) + priority = random.randint(0, MAX_PRIORITY) + net_tx, net_rx = None, None + job_info = job_dict(nodes_required=nodes_required, name=name, + account=account, cpu_trace=cpu_trace, + gpu_trace=gpu_trace, ntx_trace=net_tx, + nrx_trace=net_rx, end_state=end_state, + id=job_index, priority=priority, + partition=partition, + submit_time=submit_time, + time_limit=time_limit, + start_time=start_time, + end_time=end_time, + expected_run_time=wall_time, trace_time=wall_time, + trace_start_time=0, trace_end_time=wall_time, + cpu_cores_required=cpu_cores_required, + gpu_units_required=gpu_units_required, + trace_quanta=config['TRACE_QUANTA'] + ) + job = Job(job_info) + jobs.append(job) + return jobs diff --git a/raps/workloads/hpl.py b/raps/workloads/hpl.py new file mode 
100644 index 0000000000000000000000000000000000000000..e338061e5358a65d3cc7aef09032cea0acc360d1 --- /dev/null +++ b/raps/workloads/hpl.py @@ -0,0 +1,229 @@ +""" +Hao Lu’s analytical HPL model adapter for ExaDigiT. + +Usage: + python main.py run -w hpl -d +or: + python raps/workloads/hpl.py +""" + +from raps.job import Job, job_dict +import numpy as np +import math + + +class HPL: + """Analytical HPL workload generator for ExaDigiT.""" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # ------------------------------------------------------------------------- + # Public entry + # ------------------------------------------------------------------------- + def hpl(self, **kwargs): + jobs = [] + + # You can add more scenarios; comment out big ones while testing. + hpl_tests = [ + # Smaller grid (quick sanity check) + {"M": 2_097_152, "b": 576, "P": 16, "Q": 32, "Rtype": "1-ring", "f": 0.6}, + # Frontier-scale shape (comment in when ready) + {"M": 8_900_000, "b": 576, "P": 192, "Q": 384, "Rtype": "1-ring", "f": 0.6}, + ] + + for test in hpl_tests: + for partition in self.partitions: + cfg = self.config_map[partition] + trace_quanta = cfg["TRACE_QUANTA"] + + # Per-iteration timings (already concurrency-aware) + iterations = self._run_hpl_model(**test) + + # Convert iteration timings to sampled traces on TRACE_QUANTA grid + gpu_trace, cpu_trace = self._emit_traces_from_iters( + iterations, trace_quanta, cfg + ) + total_time = len(gpu_trace) * trace_quanta + + # Node count: ranks / (GPUs_per_node * GCDs_per_GPU) + gpus = cfg["GPUS_PER_NODE"] + gcds = cfg.get("GCDS_PER_GPU", 2) # Frontier MI250X default: 2 + ranks = test["P"] * test["Q"] + nodes_required = max(1, ranks // (gpus * gcds)) + + job_info = job_dict( + nodes_required=nodes_required, + scheduled_nodes=[], + name=f"HPL_{test['M']}x{test['M']}_P{test['P']}Q{test['Q']}", + account="benchmark", + cpu_trace=cpu_trace, + gpu_trace=gpu_trace, + ntx_trace=[], + nrx_trace=[], + id=None, + end_state="COMPLETED", + priority=100, + partition=partition, + time_limit=total_time, + start_time=0, + end_time=total_time, + expected_run_time=total_time, + trace_quanta=trace_quanta, + trace_time=total_time, + trace_start_time=0, + trace_end_time=total_time, + ) + jobs.append(Job(job_info)) + + return jobs + + # ------------------------------------------------------------------------- + # Analytical per-iteration model (concurrency-aware) + # ------------------------------------------------------------------------- + def _run_hpl_model(self, M, b, P, Q, Rtype="1-ring", f=0.6): + """ + Returns a list of dicts, one per iteration: + { + "T_iter": , + "gpu_active": , + "cpu_active": , + "net_active": , + } + + Concurrency-aware scaling: + - UPDATE (DGEMM) work is distributed over the full P*Q ranks → divide by (P*Q) + - PDFACT/LBCAST/RS* progress along process columns (Q) → divide by Q + This makes the per-iteration times reflect global wall-time. 
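+
+        Rough magnitude check (illustrative only): for the first entry in
+        hpl_tests (M = 2_097_152, b = 576, P = 16), Ml_0 = M/P = 131_072, so
+        TLBCAST_rank = 16*576*131_072 / 7.0e9 ≈ 0.17 s, while
+        TUPD1_rank = 2*576*131_072*Nl1_i / 24e12 ≈ 6.3e-6 * Nl1_i s;
+        the DGEMM update therefore dominates the broadcast once Nl1_i
+        exceeds roughly 27,000 local columns.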
+ """ + # Effective per-rank throughputs/bandwidths (empirical constants) + CAllgather = 6.3e9 # bytes/s + C1ring = 7.0e9 # bytes/s + Creduce = 46e6 # bytes/s + Fcpublas = 240e9 # FLOP/s + Fgemm = 24e12 # FLOP/s + + Ml = M / P + Nl = M / Q + nb = int(M / b) + iterations = [] + + for i in range(nb): + Ml_i = Ml - (i * b / P) + if Ml_i <= 0: + break + + # Local column partition sizes (A = [A1 | A2]), f is the split ratio + Nl1_i = max((1.0 - f) * Nl - (i * b / Q), 0.0) + Nl2_i = (f * Nl) if (i * b) < (f * Nl) else max(Nl - (i * b / Q), 0.0) + + # Component times (per-rank formulations) + # NOTE: units already account for bytes vs. elements (coeffs 16, 2/3, etc.) + TPDFACT_rank = (b**2) / Creduce + (2.0 / 3.0) * (b**2) * Ml_i / Fcpublas + TLBCAST_rank = 16.0 * b * Ml_i / C1ring + TUPD1_rank = 2.0 * b * Ml_i * Nl1_i / Fgemm + TUPD2_rank = 2.0 * b * Ml_i * Nl2_i / Fgemm + TRS1_rank = 16.0 * b * Nl1_i / CAllgather + TRS2_rank = 16.0 * b * Nl2_i / CAllgather + + # Concurrency: convert rank-local times to global wall-time contributions + # (coarse but effective partitioning of the communicators) + TPDFACT = TPDFACT_rank #/ Q + TLBCAST = TLBCAST_rank #/ Q + TRS1 = TRS1_rank #/ Q + TRS2 = TRS2_rank #/ Q + TUPD1 = TUPD1_rank #/ (P * Q) + TUPD2 = TUPD2_rank #/ (P * Q) + + # Two pipeline stages per iteration (HPL) + stage1 = max(TPDFACT + TLBCAST + TRS1, TUPD2) + stage2 = max(TRS2, TUPD1) + T_iter = stage1 + stage2 + + # Attribute activity (for utilization duty fractions) + gpu_active = max(TUPD1, TUPD2) + cpu_active = TPDFACT + net_active = TLBCAST + TRS1 + TRS2 + + iterations.append( + dict( + T_iter=T_iter, + gpu_active=gpu_active, + cpu_active=cpu_active, + net_active=net_active, + ) + ) + + return iterations + + def _emit_traces_from_iters(self, iterations, trace_quanta, cfg): + gpn = cfg["GPUS_PER_NODE"] + gpu_trace, cpu_trace = [], [] + acc_time = 0.0 + acc_gpu = 0.0 + acc_cpu = 0.0 + + for it in iterations: + T = it["T_iter"] + if T <= 0: + continue + + total_act = it["gpu_active"] + it["cpu_active"] + it["net_active"] + compute_ratio = it["gpu_active"] / total_act if total_act > 0 else 0.0 + cpu_ratio = it["cpu_active"] / total_act if total_act > 0 else 0.0 + fg = 0.8 + 0.2 * compute_ratio + fc = 0.6 + 0.3 * cpu_ratio + + acc_time += T + acc_gpu += gpn * fg * T + acc_cpu += fc * T + + # emit one sample each time we accumulate ≥ trace_quanta + while acc_time >= trace_quanta: + gpu_trace.append(acc_gpu / acc_time) + cpu_trace.append(acc_cpu / acc_time) + acc_time -= trace_quanta + acc_gpu = acc_cpu = 0.0 + + # flush remainder + if acc_time > 0: + gpu_trace.append(acc_gpu / acc_time) + cpu_trace.append(acc_cpu / acc_time) + + return np.array(gpu_trace), np.array(cpu_trace) + +# ----------------------------------------------------------------------------- +# Stand-alone test +# ----------------------------------------------------------------------------- +if __name__ == "__main__": + + class DummyHPL(HPL): + def __init__(self): + self.partitions = ["gpu"] + self.config_map = { + "gpu": { + "TRACE_QUANTA": 15.0, # seconds/sample + "GPUS_PER_NODE": 4, # Frontier physical GPUs/node + "GCDS_PER_GPU": 2, # MI250X logical ranks/GPU + "CPUS_PER_NODE": 64, + } + } + + hpl = DummyHPL() + jobs = hpl.hpl() + + print(f"Generated {len(jobs)} HPL job(s)\n") + for i, job in enumerate(jobs): + print(f"--- Job {i} ---") + print(f"Name: {job.name}") + print(f"Nodes required: {job.nodes_required}") + print(f"Wall time: {job.trace_time:.1f}s") + print(f"Trace samples: {len(job.gpu_trace)}") + print(f"Avg GPU util: 
{np.mean(job.gpu_trace):.2f} (0..{hpl.config_map['gpu']['GPUS_PER_NODE']})") + print(f"Avg CPU util: {np.mean(job.cpu_trace):.2f} (0..1)") + # Peek at starts/ends + print("GPU head:", np.round(job.gpu_trace[:8], 3)) + print("GPU tail:", np.round(job.gpu_trace[-8:], 3)) + print("CPU head:", np.round(job.cpu_trace[:8], 3)) + print("CPU tail:", np.round(job.cpu_trace[-8:], 3)) + print() diff --git a/raps/workloads/inter_job_congestion.py b/raps/workloads/inter_job_congestion.py new file mode 100644 index 0000000000000000000000000000000000000000..3fd569d61d36f5714a32ff2deac3eacf4f9a7951 --- /dev/null +++ b/raps/workloads/inter_job_congestion.py @@ -0,0 +1,141 @@ +import math +import random +from typing import List, Tuple + +from raps.job import Job, job_dict +from raps.network import max_throughput_per_tick + +class InterJobCongestionWorkload: + """ Workload generator for inter-job congestion test """ + def inter_job_congestion(self, args) -> List[Job]: + legacy_cfg = self.config_map[self.partitions[0]] + topology = legacy_cfg.get("TOPOLOGY", "").lower() + return generate_jobs( + legacy_cfg=legacy_cfg, + topology=topology, + J=args.numjobs, + trace_quanta=legacy_cfg.get("TRACE_QUANTA", 20), + tx_fraction_per_job=getattr(args, 'txfrac', 0.35), # Assuming txfrac might be an arg + seed=args.seed + ) + + +def infer_group_params(legacy_cfg: dict, topology: str) -> Tuple[int, int, str]: + """ + Infer (hosts_per_group, total_groups, group_label) + depending on network topology. + """ + total_nodes = int(legacy_cfg["TOTAL_NODES"]) + + if topology == "fat-tree": + k = int(legacy_cfg.get("FATTREE_K", 32)) + H = k // 2 # hosts per ToR + R = math.ceil(total_nodes / H) + return H, R, "rack" + + elif topology == "dragonfly": + routers_per_group = int(legacy_cfg.get("ROUTERS_PER_GROUP", 8)) + nodes_per_router = int(legacy_cfg.get("NODES_PER_ROUTER", 4)) + H = routers_per_group * nodes_per_router + R = max(1, total_nodes // H) + return H, R, "group" + + elif topology == "torus3d": + dims = ( + int(legacy_cfg.get("TORUS_X", 12)), + int(legacy_cfg.get("TORUS_Y", 12)), + int(legacy_cfg.get("TORUS_Z", 12)), + ) + R = math.prod(dims) + return 1, R, "torus" + + else: + return 1, 1, "flat" + + +def pick_two_distinct_groups(R: int) -> Tuple[int, int]: + """Pick two distinct group indices (far apart if possible).""" + if R <= 2: + return (0, 1 if R > 1 else 0) + a = random.randrange(0, R // 2) + b = random.randrange(R // 2, R) + if a == b: + b = (b + 1) % R + return a, b + + +def nodes_in_group(group_idx: int, H: int, total_nodes: int, n: int) -> List[int]: + """Pick n contiguous nodes from a group.""" + start = group_idx * H + end = min(start + H, total_nodes) + n = min(n, end - start) + base = random.randrange(start, end - n + 1) if (end - start - n) > 0 else start + return list(range(base, base + n)) + + +def generate_jobs( + legacy_cfg: dict, + topology: str, + J: int = 60, + trace_quanta: int = 20, + tx_fraction_per_job: float = 0.35, + seed: int = 42 +) -> List[Job]: + """Generate synthetic jobs spanning and overlapping local groups.""" + random.seed(seed) + total_nodes = int(legacy_cfg["TOTAL_NODES"]) + H, R, label = infer_group_params(legacy_cfg, topology) + per_tick_bw = max_throughput_per_tick(legacy_cfg, trace_quanta) + per_dir = tx_fraction_per_job * per_tick_bw + + print(f"[INFO] topology={topology}, {label}s={R}, hosts_per_{label}={H}") + print(f"[INFO] total_nodes={total_nodes}, per-dir={per_dir:.2e} B/tick") + + jobs: List[Job] = [] + jid = 1 + + # Roughly 60% cross-group, 25% intra-group, 15% 
multi-group + n_cross = int(J * 0.6) + n_intra = int(J * 0.25) + n_multi = J - n_cross - n_intra + + for _ in range(n_cross): + a, b = pick_two_distinct_groups(R) + nodes = nodes_in_group(a, H, total_nodes, 1) + nodes_in_group(b, H, total_nodes, 1) + jobs.append(make_job(jid, nodes, per_dir, trace_quanta)) + jid += 1 + + for _ in range(n_intra): + g = random.randrange(0, R) + nodes = nodes_in_group(g, H, total_nodes, 2) + jobs.append(make_job(jid, nodes, per_dir, trace_quanta)) + jid += 1 + + for _ in range(n_multi): + a, b = pick_two_distinct_groups(R) + nodes = nodes_in_group(a, H, total_nodes, 2) + nodes_in_group(b, H, total_nodes, 2) + jobs.append(make_job(jid, nodes, per_dir, trace_quanta)) + jid += 1 + + print(f"[INFO] jobs={len(jobs)} (cross={n_cross}, intra={n_intra}, multi={n_multi})") + return jobs + + +def make_job(jid: int, nodes: List[int], per_dir: float, trace_quanta: int) -> Job: + """Helper: create one synthetic Job object.""" + trace_len = 900 // trace_quanta + return Job(job_dict( + id=jid, + name=f"job_{jid}", + account="test", + nodes_required=len(nodes), + scheduled_nodes=nodes, + cpu_trace=[0] * trace_len, + gpu_trace=[0] * trace_len, + ntx_trace=[per_dir] * trace_len, + nrx_trace=[per_dir] * trace_len, + trace_quanta=trace_quanta, + expected_run_time=900, + time_limit=1800, + end_state="COMPLETED" + )) diff --git a/raps/workloads/live.py b/raps/workloads/live.py new file mode 100644 index 0000000000000000000000000000000000000000..4a468af631a420e0a823fef0bea91fd07b840758 --- /dev/null +++ b/raps/workloads/live.py @@ -0,0 +1,6 @@ +def continuous_job_generation(*, engine, timestep, jobs): + # print("if len(engine.queue) <= engine.continuous_workload.args.maxqueue:") + # print(f"if {len(engine.queue)} <= {engine.continuous_workload.args.maxqueue}:") + if len(engine.queue) <= engine.continuous_workload.args.maxqueue: + new_jobs = engine.continuous_workload.generate_jobs().jobs + jobs.extend(new_jobs) diff --git a/raps/workloads/multitenant.py b/raps/workloads/multitenant.py new file mode 100644 index 0000000000000000000000000000000000000000..61d7c327951c12a82b133058526382291ea09dd1 --- /dev/null +++ b/raps/workloads/multitenant.py @@ -0,0 +1,154 @@ +import random +from raps.job import Job, job_dict +from .constants import ACCT_NAMES, MAX_PRIORITY + +class MultitenantWorkload: + + def multitenant(self, **kwargs): + """ + Generate deterministic jobs to validate multitenant scheduling & power. + + usage example: + + python main.py run-multi-part -x mit_supercloud -w multitenant + + Parameters + ---------- + mode : str + One of: + - 'ONE_JOB_PER_NODE_ALL_CORES' + - 'TWO_JOBS_PER_NODE_SPLIT' + - 'STAGGERED_JOBS_PER_NODE' + wall_time : int + Duration (seconds) of each job (default: 3600) + trace_quanta : int + Sampling interval for traces; defaults to config['TRACE_QUANTA'] + + Returns + ------- + list[dict] + List of job_dict entries. 
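+
+        Notes
+        -----
+        Jobs generated per node by the loops below:
+        'ONE_JOB_PER_NODE_ALL_CORES' -> 1 job using all cores and GPUs;
+        'TWO_JOBS_PER_NODE_SPLIT'    -> 2 jobs with cores/GPUs split roughly in half;
+        'STAGGERED_JOBS_PER_NODE'    -> 3 jobs started at 0, wall_time/3 and
+        2*wall_time/3.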
+ """ + mode = kwargs.get('mode', 'TWO_JOBS_PER_NODE_SPLIT') + wall_time = kwargs.get('wall_time', 3600) + + jobs = [] + + for partition in self.partitions: + cfg = self.config_map[partition] + trace_quanta = kwargs.get('trace_quanta', cfg['TRACE_QUANTA']) + + cores_per_cpu = cfg.get('CORES_PER_CPU', 1) + cpus_per_node = cfg.get('CPUS_PER_NODE', 1) + cores_per_node = cores_per_cpu * cpus_per_node + gpus_per_node = cfg.get('GPUS_PER_NODE', 0) + + n_nodes = cfg['AVAILABLE_NODES'] + + def make_trace(cpu_util, gpu_util): + return self.compute_traces(cpu_util, gpu_util, wall_time, trace_quanta) + + job_id_ctr = 0 + + if mode == 'ONE_JOB_PER_NODE_ALL_CORES': + # Each node runs one job that consumes all cores/GPUs + for nid in range(n_nodes): + cpu_trace, gpu_trace = make_trace(cores_per_node, gpus_per_node) + jobs.append(Job(job_dict( + nodes_required=1, + cpu_cores_required=cores_per_node, + gpu_units_required=gpus_per_node, + name=f"MT_full_node_{partition}_{nid}", + account=random.choice(ACCT_NAMES), + cpu_trace=cpu_trace, + gpu_trace=gpu_trace, + ntx_trace=[], nrx_trace=[], + end_state='COMPLETED', + id=job_id_ctr, + priority=random.randint(0, MAX_PRIORITY), + partition=partition, + submit_time=0, + time_limit=wall_time, + start_time=0, + end_time=wall_time, + expected_run_time=wall_time, + trace_time=wall_time, + trace_start_time=0, + trace_end_time=wall_time, + trace_quanta=cfg['TRACE_QUANTA'] + ))) + job_id_ctr += 1 + + elif mode == 'TWO_JOBS_PER_NODE_SPLIT': + # Two jobs per node: split CPU/GPU roughly in half + for nid in range(n_nodes): + cpu_a = cores_per_node // 2 + cpu_b = cores_per_node - cpu_a + gpu_a = gpus_per_node // 2 + gpu_b = gpus_per_node - gpu_a + + for idx, (c_req, g_req, tag) in enumerate([(cpu_a, gpu_a, 'A'), + (cpu_b, gpu_b, 'B')]): + cpu_trace, gpu_trace = make_trace(c_req, g_req) + jobs.append(Job(job_dict( + nodes_required=1, # still one node; multitenant RM packs cores + cpu_cores_required=c_req, + gpu_units_required=g_req, + name=f"MT_split_node_{partition}_{nid}_{tag}", + account=random.choice(ACCT_NAMES), + cpu_trace=cpu_trace, + gpu_trace=gpu_trace, + ntx_trace=[], nrx_trace=[], + end_state='COMPLETED', + id=job_id_ctr, + priority=random.randint(0, MAX_PRIORITY), + partition=partition, + submit_time=0, + time_limit=wall_time, + start_time=0, + end_time=wall_time, + expected_run_time=wall_time, + trace_time=wall_time, + trace_start_time=0, + trace_end_time=wall_time, + trace_quanta=cfg['TRACE_QUANTA'] + ))) + job_id_ctr += 1 + + elif mode == 'STAGGERED_JOBS_PER_NODE': + # Three jobs per node, staggered starts: 0, wall_time/3, 2*wall_time/3 + offsets = [0, wall_time // 3, 2 * wall_time // 3] + cpu_each = cores_per_node // 3 or 1 + gpu_each = max(1, gpus_per_node // 3) if gpus_per_node else 0 + + for nid in range(n_nodes): + for k, offset in enumerate(offsets): + cpu_trace, gpu_trace = make_trace(cpu_each, gpu_each) + jobs.append(Job(job_dict( + nodes_required=1, + cpu_cores_required=cpu_each, + gpu_units_required=gpu_each, + name=f"MT_stagger_node_{partition}_{nid}_{k}", + account=random.choice(ACCT_NAMES), + cpu_trace=cpu_trace, + gpu_trace=gpu_trace, + ntx_trace=[], nrx_trace=[], + end_state='COMPLETED', + id=job_id_ctr, + priority=random.randint(0, MAX_PRIORITY), + partition=partition, + submit_time=offset, + time_limit=wall_time, + start_time=offset, + end_time=offset + wall_time, + expected_run_time=wall_time, + trace_time=wall_time, + trace_start_time=0, + trace_end_time=wall_time, + trace_quanta=cfg['TRACE_QUANTA'] + ))) + job_id_ctr += 1 + else: + raise 
ValueError(f"Unknown multitenant mode: {mode}") + + return jobs diff --git a/raps/workloads/network.py b/raps/workloads/network.py new file mode 100644 index 0000000000000000000000000000000000000000..e5302c89efc6bb986576b65bfa385f39cbe77a53 --- /dev/null +++ b/raps/workloads/network.py @@ -0,0 +1,62 @@ + +from raps.job import Job, job_dict + + +class NetworkTestWorkload: + def network_test(self, **kwargs): + """ + Synthetic workload to test network congestion. + Generates several jobs with varying sizes and bandwidths, + including overlapping node assignments to induce interference. + """ + jobs = [] + trace_len = 180 # 15 minutes with 20s quanta + + # -------------------------------------------------------- + # Hard-coded configuration + # -------------------------------------------------------- + # Define per-job properties + bw = 1e10 + job_configs = [ + # (job_id, node_list, bandwidth_bytes_per_tick) + (1, [0, 1], bw), # 2-node job +# (2, [1, 2], bw), # Job 2 overlaps node 1 (causes congestion) + (2, [128, 129], bw), # Job 2 on a distant rack (no shared link) + (3, [256], bw), # isolated single-node job + (4, [512, 513, 514], 5e11), # multi-node but separate + (5, [1020], bw), # distant single-node job + ] + + runtime = 900 # seconds + time_limit = 1800 # seconds + trace_quanta = 20 # seconds + + # -------------------------------------------------------- + # Job creation loop + # -------------------------------------------------------- + for job_id, node_list, bw in job_configs: + job_info = job_dict( + id=job_id, + name=f"net_job_{job_id}", + account="test", + nodes_required=len(node_list), + scheduled_nodes=node_list, + cpu_trace=[1] * trace_len, + gpu_trace=[1] * trace_len, + ntx_trace=[bw] * trace_len, + nrx_trace=[bw] * trace_len, + submit_time=0, + start_time=0, + expected_run_time=runtime, + time_limit=time_limit, + end_state="COMPLETED", + trace_quanta=trace_quanta, + ) + jobs.append(Job(job_info)) + print(f"[DEBUG] Created net_job_{job_id} nodes={node_list} bw={bw:.2e}") + + print("\n[DEBUG] Requested node assignments:") + for job in jobs: + print(f" Job {job.id}: nodes_required={job.nodes_required}, scheduled_nodes={job.scheduled_nodes}") + + return jobs diff --git a/raps/workloads/utils.py b/raps/workloads/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..d673ddf43c780e8d60104ff763d7356477040baa --- /dev/null +++ b/raps/workloads/utils.py @@ -0,0 +1,159 @@ +import math +import numpy as np +import matplotlib.pyplot as plt + + +def plot_job_hist(jobs, config=None, dist_split=None, gantt_nodes=False): + # put args.multimodal in dist_split! 
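+    # Figure layout built below:
+    #   upper grid - scatter of wall time (x) vs. nodes required (y), with a
+    #                wall-time histogram above, a node-count histogram to the
+    #                right, and a CPU/GPU utilization panel in the corner
+    #   lower axis - Gantt-style bars of submit time + wall time per job
+    #                (stacked by accumulated node count when gantt_nodes=True);
+    #                red dashed lines mark the dist_split boundaries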
+ split = [1.0] + num_dist = 1 + if dist_split: + num_dist = len(dist_split) + split = dist_split + + y = [y.nodes_required for y in jobs] + x = [x.expected_run_time for x in jobs] + x2 = [x.time_limit for x in jobs] + fig_m = plt.figure() + gs = fig_m.add_gridspec(30, 1) + gs0 = gs[0:20].subgridspec(500, 500, hspace=0, wspace=0) + gs1 = gs[24:].subgridspec(1, 1) + + ax_top = fig_m.add_subplot(gs0[:]) + ax_top.axis('off') + ax_top.set_title('Job Distribution') + + ax_bot = fig_m.add_subplot(gs1[:]) + ax_bot.axis('off') + ax_bot.set_title('Submit Time + Wall Time') + + # ax0 = fig_m.add_subplot(gs[:2,:]) + # ax1 = fig_m.add_subplot(gs[2:,:]) + + # gss = gridspec.GridSpec(5, 5, figure=ax0) + # fig, axs = plt.subplots(2, 2, gridspec_kw={'width_ratios': (4, 1), 'height_ratios': (1, 4)}) + axs = [] + col = [] + col.append(fig_m.add_subplot(gs0[:100, :433])) + col.append(fig_m.add_subplot(gs0[:100, 433:])) + axs.append(col.copy()) + col = [] + col.append(fig_m.add_subplot(gs0[100:, :433])) + col.append(fig_m.add_subplot(gs0[100:, 433:])) + axs.append(col.copy()) + + ax_b = fig_m.add_subplot(gs1[:, :]) + + # Create scatter plot + for i in range(len(x)): + axs[1][0].plot([x[i], x2[i]], [y[i], y[i]], color='lightblue', zorder=1) + axs[1][0].scatter(x2, y, marker='.', c='lightblue', zorder=2) + axs[1][0].scatter(x, y, zorder=3) + + cpu_util = [x.cpu_trace for x in jobs] + if isinstance(cpu_util[0], np.ndarray): + cpu_util = np.concatenate(cpu_util).ravel() + elif isinstance(cpu_util[0], list): + cpu_util = [sum(part) / len(part) for part in cpu_util] + gpu_util = [x.gpu_trace for x in jobs] + if isinstance(gpu_util[0], np.ndarray): + gpu_util = np.concatenate(gpu_util).ravel() + elif isinstance(gpu_util[0], list): + gpu_util = [sum(part) / len(part) for part in gpu_util] + if not all([x == 0 for x in gpu_util]): + axs[0][1].scatter(cpu_util, gpu_util, zorder=2, marker='.', s=0.2) + axs[0][1].hist(gpu_util, bins=100, orientation='horizontal', zorder=1, density=True, color='tab:purple') + axs[0][1].axhline(np.mean(gpu_util), color='r', linewidth=1, zorder=3) + axs[0][1].set(ylim=[0, config['GPUS_PER_NODE']]) + axs[0][1].set_ylabel("gpu util") + axs[0][1].yaxis.set_label_coords(1.15, 0.5) + axs[0][1].yaxis.set_label_position("right") + axs[0][1].yaxis.tick_right() + else: + axs[0][1].set_yticks([]) + axs[0][1].hist(cpu_util, bins=100, orientation='vertical', zorder=1, density=True, color='tab:cyan') + axs[0][1].axvline(np.mean(cpu_util), color='r', linewidth=1, zorder=3) + axs[0][1].set(xlim=[0, config['CPUS_PER_NODE']]) + axs[0][1].set_xlabel("cpu util") + axs[0][1].xaxis.set_label_coords(0.5, 1.30) + axs[0][1].xaxis.set_label_position("top") + axs[0][1].xaxis.tick_top() + axs[0][0].hist(x2, bins=max(1, math.ceil(min(100, (max(x2) - min(x))))), orientation='vertical', color='lightblue') + axs[0][0].hist(x, bins=max(1, math.ceil(min(100, (max(x2) - min(x))))), orientation='vertical') + axs[1][0].sharex(axs[0][0]) + axs[1][1].hist(y, bins=max(1, min(100, (max(y) - min(y)))), orientation='horizontal') + axs[1][0].sharey(axs[1][1]) + + # Remove ticks + axs[0][0].set_xticks([]) + axs[1][1].set_yticks([]) + axs[0][1].spines['top'].set_color('white') + axs[0][1].spines['right'].set_color('white') + axs[1][0].set_ylabel("nodes [N]") + axs[1][0].set_xlabel("wall time [hh:mm]") + minx_s = 0 + maxx_s = math.ceil(max(x2)) + x_label_mins = [n for n in np.arange(minx_s // 60, maxx_s // 60)] + x_label_ticks = [n * 60 for n in x_label_mins[0::60]] + x_label_str = [str(x1).zfill(2) + ":" + str(x2).zfill(2) for + (x1, 
x2) in [(n // 60, n % 60) for + n in x_label_mins[0::60]]] + axs[1][0].set_xticks(x_label_ticks, x_label_str) + miny = min(y) + maxy = max(y) + interval = max(1, maxy // 10) + y_ticks = np.arange(0, maxy, interval) + y_ticks[0] = miny + axs[1][0].set_yticks(y_ticks) + + axs[0][0].tick_params(axis="x", labelbottom=False) + axs[1][1].tick_params(axis="y", labelleft=False) + + # Submit_time and Wall_time + duration = [x.expected_run_time for x in jobs] + nodes_required = [x.nodes_required for x in jobs] + submit_t = [x.submit_time for x in jobs] + + offset = 0 + split_index = 0 + split_offset = math.floor(len(x) * split[split_index]) + if gantt_nodes: + if split[0] == 0.0: + ax_b.axhline(y=offset, color='red', linestyle='--', lw=0.5) + split_index += 1 + for i in range(len(x)): + # ax_b.barh(i,duration[i], height=1.0, left=submit_t[i]) + ax_b.barh(offset + nodes_required[i] / 2, duration[i], height=nodes_required[i], left=submit_t[i]) + offset += nodes_required[i] + if i != len(x) - 1 and i == split_offset - 1 and split_index < len(split): + ax_b.axhline(y=offset, color='red', linestyle='--', lw=0.5) + split_index += 1 + split_offset += math.floor(len(x) * split[split_index]) + # ax_b.axhline(y=(len(x)/num_dist * i)-0.5, color='red', linestyle='--',lw=0.5) + if split[-1] == 0.0: + ax_b.axhline(y=offset, color='red', linestyle='--', lw=0.5) + split_index += 1 + ax_b.set_ylabel("Jobs' acc. nodes") + else: + for i in range(len(x)): + ax_b.barh(i, duration[i], height=1.0, left=submit_t[i]) + for i in range(1, num_dist): + if num_dist == 1: + break + ax_b.axhline(y=(len(x) * split[split_index]) - 0.5, color='red', linestyle='--', lw=0.5) + split_index += 1 + ax_b.set_ylabel("Job ID") + # ax_b labels: + ax_b.set_xlabel("time [hh:mm]") + minx_s = 0 + maxx_s = math.ceil(max([x.expected_run_time for x in jobs]) + max([x.submit_time for x in jobs])) + x_label_mins = [n for n in np.arange(minx_s // 60, maxx_s // 60)] + x_label_ticks = [n * 60 for n in x_label_mins[0::60]] + x_label_str = [str(x1).zfill(2) + ":" + str(x2).zfill(2) for + (x1, x2) in [(n // 60, n % 60) for + n in x_label_mins[0::60]]] + + ax_b.set_xticks(x_label_ticks, x_label_str) + ax_b.yaxis.set_inverted(True) + + plt.show() diff --git a/scripts/get_cluster_v2_traces.sh b/scripts/get_cluster_v2_traces.sh new file mode 100755 index 0000000000000000000000000000000000000000..5cff60712069b0ce42127a18d1213b0fcc724430 --- /dev/null +++ b/scripts/get_cluster_v2_traces.sh @@ -0,0 +1,38 @@ +#!/bin/bash + +# Define the base GCS path for the 2011 (V2) dataset +GCS_BASE_PATH="gs://clusterdata-2011-2" +LOCAL_DIR="./google_cluster_data_2011_sample" # Local directory to save files +NUM_FILES_PER_TYPE=1 # Adjust this number: 1 is very small, 5-10 is a decent sample + +# Create the local base directory if it doesn't exist +mkdir -p "$LOCAL_DIR" + +# Define the event types present in the 2011 (V2) dataset +EVENT_TYPES=( + "machine_events" + "job_events" + "task_events" + "task_usage" +) + +echo "Starting download of Google Cluster Data V2 (2011) sample..." 
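+# Resulting layout (exact file names depend on what 'gsutil ls' returns first):
+#   ./google_cluster_data_2011_sample/<event_type>/part-*.csv.gz
+# with one subdirectory per entry in EVENT_TYPES above.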
+ +# Loop through each event type +for event_type in "${EVENT_TYPES[@]}"; do + echo "Processing event type: ${event_type}" + + # Create a local subdirectory for each event type + mkdir -p "${LOCAL_DIR}/${event_type}" + + # List files in the current event type's GCS directory, take the first N, and download them + # Added '2>/dev/null' to suppress BrokenPipeError messages from gsutil ls + gsutil ls "${GCS_BASE_PATH}/${event_type}/part-*.csv.gz" 2>/dev/null | head -n "${NUM_FILES_PER_TYPE}" | while read -r gcs_path; do + echo " Downloading $(basename "$gcs_path")..." + gsutil cp "$gcs_path" "${LOCAL_DIR}/${event_type}/" + done +done + +echo "---" +echo "Download complete. Files are in: $LOCAL_DIR" +echo "You've downloaded a sample of the 2011 (V2) Google Cluster Traces." diff --git a/scripts/get_data.sh b/scripts/get_data.sh deleted file mode 100755 index a4263fda3ac83e8b3148122e8c122b7db61adf65..0000000000000000000000000000000000000000 --- a/scripts/get_data.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash -# Note: -# recommend setting up ~/.ssh/config to specify User and HostName -# Host mymachine -# User jdoe -# HostName mymachine.com - -machine="mymachine" -mkdir -p jobprofile slurm/jobcomplete slurm/joblive - -if [ -n "$1" ]; then - DATE=$1 -else - DATE="2024-01-19" -fi - -DPATH=/path/to/data/lake - -/usr/bin/scp -r $machine:$DPATH/jobprofile/jobprofile/date=$DATE jobprofile -/usr/bin/scp -r $machine:$DPATH/slurm/joblive/date=$DATE slurm/joblive diff --git a/scripts/marconi100-day51.sh b/scripts/marconi100-day51.sh new file mode 100644 index 0000000000000000000000000000000000000000..77cbe459861a4cfcfc435908a3380282f52871f6 --- /dev/null +++ b/scripts/marconi100-day51.sh @@ -0,0 +1,4 @@ +./main.py run -f ~/data/marconi100/job_table.parquet --system marconi100 --ff 4381000 -t 61000 -o --policy replay +./main.py run -f ~/data/marconi100/job_table.parquet --system marconi100 --ff 4381000 -t 61000 -o --policy fcfs +./main.py run -f ~/data/marconi100/job_table.parquet --system marconi100 --ff 4381000 -t 61000 -o --policy fcfs --backfill easy +./main.py run -f ~/data/marconi100/job_table.parquet --system marconi100 --ff 4381000 -t 61000 -o --policy priority --backfill firstfit diff --git a/scripts/meta_run.sh b/scripts/meta_run.sh index 41f4831e043f64a1f1ca822874fcd8f6537f3d4e..0c60596090d60d0ab3be306b1136ff1d23b5e7ea 100755 --- a/scripts/meta_run.sh +++ b/scripts/meta_run.sh @@ -17,7 +17,7 @@ while [ $current_sec -le $end_sec ]; do DATEDIRS="date=$DATEDIR" # Construct the command with the formatted date - command="python main.py -d -o --plot power loss -f $DPATH/slurm/joblive/$DATEDIRS $DPATH/jobprofile/jobprofile/$DATEDIRS >& $DATEDIRS.out &" + command="python main.py -d -o --plot power loss -f $DPATH/slurm/joblive/$DATEDIRS,$DPATH/jobprofile/jobprofile/$DATEDIRS >& $DATEDIRS.out &" sleep 10 # Execute the command diff --git a/scripts/parse_philly_traces.py b/scripts/parse_philly_traces.py new file mode 100644 index 0000000000000000000000000000000000000000..57ce8c2bed61ef07dd49b4e06fb91fbc7112f31a --- /dev/null +++ b/scripts/parse_philly_traces.py @@ -0,0 +1,73 @@ +""" +See raps/dataloaders/philly.py for how to download philly traces. 
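+
+Input lines are assumed (by the parsing below) to have the form
+
+    <YYYY-MM-DD HH:MM:SS> <PST|PDT>,<machine_id>,<utilization>[,<utilization>...]
+
+cluster_cpu_util carries a single utilization column; cluster_gpu_util carries
+several, which this script averages over their non-zero entries.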
+ +Run following to parse philly traces into separate files for each day: + + python /path/to/raps/scripts/parse_philly_traces.py cluster_cpu_util + python /path/to/raps/scripts/parse_philly_traces.py cluster_gpu_util + +This will parse these two files into two directories, cpu_by_day and gpu_by_day, +creating one file for each day and adding the lines for that day into the files. +""" +import os +import sys +from datetime import datetime +from tqdm import tqdm + +if len(sys.argv) < 2: + print("Usage: python parse_by_day.py ") + sys.exit(1) + +input_file = sys.argv[1] + +with open(input_file) as f: + total_lines = sum(1 for _ in f) - 1 + +with open(input_file) as f: + header = f.readline().strip().split(",") + print("Header:", header) + + # detect file type from header + is_cpu = "cpu_util" in [h.lower() for h in header] + + # pick output dir name based on file type + output_dir = "cpu_by_day" if is_cpu else "gpu_by_day" + os.makedirs(output_dir, exist_ok=True) + + for line in tqdm(f, total=total_lines, desc="Processing lines"): + parts = line.strip().split(",") + + if len(parts) < 3: + continue + + raw_time = parts[0].replace(" PST", "").replace(" PDT", "") + try: + ts = datetime.strptime(raw_time, "%Y-%m-%d %H:%M:%S") + except ValueError: + continue + + machine_id = parts[1] + + if is_cpu: + try: + value = float(parts[2]) + except ValueError: + value = 0.0 + label = "cpu_util" + else: + utils = [] + for v in parts[2:]: + try: + utils.append(float(v)) + except ValueError: + pass + value = sum(utils) / max(1, len([u for u in utils if u > 0])) + label = "gpu_util" + + day_str = ts.strftime("%Y-%m-%d") + out_path = os.path.join(output_dir, f"{day_str}.csv") + + with open(out_path, "a") as out: + if out.tell() == 0: # only write header if file is new + out.write(f"time,machine_id,{label}\n") + out.write(f"{ts},{machine_id},{value:.3f}\n") diff --git a/scripts/plot3.py b/scripts/plot3.py new file mode 100644 index 0000000000000000000000000000000000000000..6470ff49c98739edf7d25242cb9c73338f626321 --- /dev/null +++ b/scripts/plot3.py @@ -0,0 +1,152 @@ +#!/bin/env python3 +import pandas as pd +import pyarrow.parquet as pq +import matplotlib.pyplot as plt + +style=['seaborn-v0_8', 'tableau-colorblind10'] + +for j in range(-1,len(style)): + if j in range(0,len(style)): + plt.style.use(style[j]) + + colors = plt.rcParams['axes.prop_cycle'].by_key()['color'] + # Revert to the default style + plt.style.use('default') + # Apply ggplot colors to default style + plt.rcParams['axes.prop_cycle'] = plt.cycler(color=colors) + + + import sys + + if len(sys.argv) > 1: + path = sys.argv[1] + else: + print(f"Usage: python {sys.argv[0]} ") + exit() + + # e.g. 
path = "$HOME/Repositories/exadigit/raps/simulation_results/b803010" + + policies = ['fcfs-nobf','fcfs-easy','priority-nobf','priority-easy','priority-ffbf','replay'] + files = ['cooling_model.parquet', 'loss_history.parquet', 'power_history.parquet', 'util.parquet'] + files = ['cooling_model.parquet', 'power_history.parquet', 'util.parquet'] + files = ['util.parquet', 'power_history.parquet', 'cooling_model.parquet'] + #files = ['loss_history.parquet', 'power_history.parquet', 'util.parquet'] + #files = ['power_history.parquet', 'util.parquet', 'cooling_model.parquet'] + + policy_path = {f"{policy}":f"{path}/{policy}" for policy in policies} + full_files = {f"{policy}":f"{path}/{policy}/{file}" for policy in policies for file in files} + + + def iter_to_seconds(i): + return i * 15 + + + fig, axs = plt.subplots(len(files),figsize=(12, 12)) + for i,file in enumerate(files): + policy_files = [f"{path}/{policy}/{file}" for policy in policies] + for policy_file in policy_files: + # df = pd.read_parquet(policy_file) + x = 'time' + policy = policy_file.split('/')[-2] + if file == "power_history.parquet": + y = 'power [kw]' + ylab = 'Power [kW]' + ylim = 29000 + axs[i].set_ylim(0,ylim) + + df = pd.read_parquet(policy_file) + df = df.rename(columns={0:'time',1:'power [kw]'}) + + elif file == "cooling_model.parquet": + y = 'pue' + ylab = 'PUE' + + df = pd.read_parquet(policy_file) + df['index'] = df.index + df[x] = df['index'].apply(iter_to_seconds) + ymax = max(df['pue']) + #axs[i].plot(df[x],df[y], label=ylab) + + elif file == "loss_history.parquet": + y = 'loss [kw]' + ylab = 'Loss [kW]' + ylim = 29000 + axs[i].set_ylim(0,ylim) + + df = pd.read_parquet(policy_file) + df = df.rename(columns={0:'time',1:'loss [kw]'}) + #axs[i].plot(df[x],df[y], label=ylab) + + elif file == "util.parquet": + y = 'utilization' + ylab = 'Utilization' + + df = pd.read_parquet(policy_file) + df = df.rename(columns={0:'time', 1:'utilization [%]'}) + df[y] = df['utilization [%]'] / 100 + #axs[i].plot(df[x], df[y], label=ylab) + + else: + raise KeyError + + axs[i].plot(df[x],df[y], label=policy) + axs[i].set_ylabel(ylab) + #$axs[i].plot(df[0],df[1],label=policy) + if file == "power_history.parquet": + axs[i].legend(loc='upper right') + axs[i].set_title('Power') + elif file == "util.parquet": + axs[i].set_title('Utilization') + axs[i].legend(loc='lower right') + elif file == "cooling_model.parquet": + axs[i].set_title('PUE') + axs[i].legend(loc='upper right') + elif file == "loss_history.parquet": + axs[i].set_title('Loss') + axs[i].legend(loc='upper right') + else: + raise KeyError() + #plt.show() + plt.savefig(f"Type{[j]}.png") + + + #for i in [1]: + # fig, ax1 = plt.subplots(figsize=(10, 6)) + # + # power = path + "/" + files[2] + # loss = path + "/" + files[1] + # util = path + "/" + files[3] + # + # df_power = pd.read_parquet(power) + # df_power = df_power.rename(columns={0:'time',1:'power [kw]'}) + # ax1.plot(df_power['time'],df_power['power [kw]'], color='black', label='Power kW]') + # + # #df_loss = pd.read_parquet(loss) + # #df_loss = df_loss.rename(columns={0:'time',1:'loss [kw]'}) + # #ax1.plot(df_loss['time'],df_loss['loss [kw]'], color='red', label='Loss [kW]') + # + # ax2 = ax1.twinx() + # + # #df_cooling = pd.read_parquet(cooling) + # #df_cooling['index'] = df_cooling.index + # #df_cooling['time'] = df_cooling['index'].apply(iter_to_seconds) + # #ymax = max(df_cooling['pue']) + # #ax2.plot(df_cooling['time'],df_cooling['pue'], color='blue', label='PUE') + # + # df_util = pd.read_parquet(util) + # df_util = 
df_util.rename(columns={0:'time', 1:'utilization [%]'}) + # df_util['utilization'] = df_util['utilization [%]'] / 100 + # ax2.plot(df_util['time'],df_util['utilization'], color='orange', label='Utilization') + # + # #ymax = max(max(df_cooling['pue']),max(df_util['utilization'])) + # ymax = max(0,max(df_util['utilization'])) + # ax2.set_ylim([0, ymax * 1.05]) + # + # ax1.set_xlabel('time [s]') + # ax1.set_ylabel('[kW]') + # ax2.set_ylabel('[%]') + # plt.title(path) + # ax1.legend(loc='upper left') + # ax2.legend(loc='upper right') + # plt.show() + # #plt.savefig("test.png") diff --git a/scripts/plot_p-util_t.py b/scripts/plot_p-util_t.py new file mode 100644 index 0000000000000000000000000000000000000000..f91fbd7099438c8d032507e1c75e4ffce601b3ef --- /dev/null +++ b/scripts/plot_p-util_t.py @@ -0,0 +1,83 @@ +#!/bin/env python3 +import pandas as pd +import pyarrow.parquet as pq +import matplotlib.pyplot as plt + +import sys + + + +if len(sys.argv) > 1: + path = sys.argv[1] +else: + print(f"Usage: python {sys.argv[0]} ") + exit() + +# e.g. path = "$HOME/Repositories/exadigit/raps/simulation_results/b803010" + +files = ['cooling_model.parquet', 'loss_history.parquet', 'power_history.parquet', 'util.parquet'] + +full_files = [f"{path}/{file}" for file in files] + + +def iter_to_seconds(i): + return i * 15 + + +SMALL_SIZE = 16 +MEDIUM_SIZE = 18 +BIGGER_SIZE = 22 + +plt.rc('font', size=SMALL_SIZE) # controls default text sizes +plt.rc('axes', titlesize=SMALL_SIZE) # fontsize of the axes title +plt.rc('axes', labelsize=MEDIUM_SIZE) # fontsize of the x and y labels +plt.rc('xtick', labelsize=SMALL_SIZE) # fontsize of the tick labels +plt.rc('ytick', labelsize=SMALL_SIZE) # fontsize of the tick labels +plt.rc('legend', fontsize=SMALL_SIZE) # legend fontsize +plt.rc('figure', titlesize=BIGGER_SIZE) # fontsize of the figure title + + + +for i in [1]: + fig, ax1 = plt.subplots(figsize=(10, 6)) + + power = path + "/" + files[2] + loss = path + "/" + files[1] + util = path + "/" + files[3] + cooling = path + "/" + files[0] + + df_power = pd.read_parquet(power) + df_power = df_power.rename(columns={0:'time',1:'power [kw]'}) + ax1.plot(df_power['time'],df_power['power [kw]'], color='black', label='Power kW]') + + df_loss = pd.read_parquet(loss) + df_loss = df_loss.rename(columns={0:'time',1:'loss [kw]'}) + ax1.plot(df_loss['time'],df_loss['loss [kw]'], color='red', label='Loss [kW]') + + ax2 = ax1.twinx() + + df_cooling = pd.read_parquet(cooling) + df_cooling['index'] = df_cooling.index + df_cooling['time'] = df_cooling['index'].apply(iter_to_seconds) + ymax = max(df_cooling['pue']) + ax2.plot(df_cooling['time'],df_cooling['pue'], color='blue', label='PUE') + + df_util = pd.read_parquet(util) + df_util = df_util.rename(columns={0:'time', 1:'utilization [%]'}) + df_util['utilization'] = df_util['utilization [%]'] / 100 + ax2.plot(df_util['time'],df_util['utilization'], color='orange', label='Utilization') + + ymax = max(max(df_cooling['pue']),max(df_util['utilization'])) + #ymax = max(0,max(df_util['utilization'])) + ax2.set_ylim([0, ymax * 1.05]) + + ax1.set_xlabel('time [s]') + ax1.set_ylabel('[kW]') + ax2.set_ylabel('[%]') + #path + #plt.title(path) + ax1.legend(loc='upper left') + ax2.legend(loc='upper right') + #plt.rcParams.update({'font.size': 30}) + #plt.show() + plt.savefig("test.png") diff --git a/scripts/plots/2in1-adastra.py b/scripts/plots/2in1-adastra.py new file mode 100644 index 0000000000000000000000000000000000000000..47647c989013743e46252ae61c4d9bfd86862653 --- /dev/null +++ 
b/scripts/plots/2in1-adastra.py @@ -0,0 +1,166 @@ +#!/bin/env python3 +import pandas as pd +import pyarrow.parquet as pq +import matplotlib.pyplot as plt + +import sys + +import matplotlib +matplotlib.rcParams['text.usetex'] = True + +plt.style.use("paper.mplstyle") + + +plt.rcParams.update({ + "text.usetex": True, + "font.family": "Libertine" +}) + +plt.rcParams['text.latex.preamble'] = r'\usepackage{libertine}' + +plt.rcParams.update({ + "text.usetex": True, + "font.family": "serif", + "font.serif": ["Linux Libertine O"], # Specify the font family +}) + + +pt = 1. / 72.27 +width = 1.2 * 241.14749 * pt +golden = (1 + 5**0.5) / 2 +height = width / golden * 3. / 5. + + +carray = ['tab:cyan','tab:orange','tab:brown','tab:blue'] + +if len(sys.argv) > 1: + path = sys.argv[1] +else: + print(f"Usage: python {sys.argv[0]} ") + exit() + +# e.g. path = "$HOME/Repositories/exadigit/raps/simulation_results/adastra/full" + +policies = ['fcfs-nobf','fcfs-easy','priority-nobf','priority-easy','priority-ffbf','replay'] +policies = ['fcfs-nobf','fcfs-easy','priority-ffbf','replay'] +files = ['cooling_model.parquet', 'loss_history.parquet', 'power_history.parquet', 'util.parquet'] +files = ['power_history.parquet', 'util.parquet'] +#files = ['util.parquet', 'power_history.parquet'] +#files = ['loss_history.parquet', 'power_history.parquet', 'util.parquet'] +#files = ['power_history.parquet', 'util.parquet', 'cooling_model.parquet'] + +policy_path = {f"{policy}":f"{path}/{policy}" for policy in policies} +full_files = {f"{policy}":f"{path}/{policy}/{file}" for policy in policies for file in files} + + +def iter_to_seconds(i): + return i * 15 + +c_cnt=0 +fig, axs = plt.subplots(len(files),figsize=(width,2 * height)) +for i,file in enumerate(files): + policy_files = [f"{path}/{policy}/{file}" for policy in policies] + for c,policy_file in enumerate(policy_files): + # df = pd.read_parquet(policy_file) + x = 'time' + xlab = 'Time [hours/days]' + policy = policy_file.split('/')[-2] + if file == "power_history.parquet": + y = 'power [kw]' + ylab = 'Power [kW]' + #ymax = 26000 + #ymin = 6500 + #axs[i].set_ylim(ymin,ymax) + df = pd.read_parquet(policy_file) + df = df.rename(columns={0:'time',1:'power [kw]'}) + + elif file == "cooling_model.parquet": + if c_cnt == 0: + y = 'pue' + ylab = 'PUE' + if c_cnt == 1: + y = 'simulator[1].centralEnergyPlant[1].coolingTowerLoop[1].summary.T_fac_ctw_r_C' + ylab = 'Temperature [°C]' + if c_cnt == 2: + y = 'simulator[1].centralEnergyPlant[1].coolingTowerLoop[1].summary.T_fac_ctw_s_C' + ylab = 'Temperature [°C]' + + df = pd.read_parquet(policy_file) + df['index'] = df.index + df[x] = df['index'].apply(iter_to_seconds) + ymax = max(df[y]) + #axs[i].plot(df[x],df[y], label=ylab) + #y = 'simulator[1].centralEnergyPlant[1].coolingTowerLoop[1].summary.T_fac_ctw_r_C' + + elif file == "loss_history.parquet": + y = 'loss [kw]' + ylab = 'Loss [kW]' + #ylim = 29000 + #axs[i].set_ylim(0,ylim) + + df = pd.read_parquet(policy_file) + df = df.rename(columns={0:'time',1:'loss [kw]'}) + #axs[i].plot(df[x],df[y], label=ylab) + + elif file == "util.parquet": + y = 'utilization' + ylab = r'Utilization [\%]' + + df = pd.read_parquet(policy_file) + df = df.rename(columns={0:'time', 1:'utilization [%]'}) + df[y] = df['utilization [%]'] / 100 + #axs[i].plot(df[x], df[y], label=ylab) + + else: + raise KeyError + + timeline_s = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16] + timeline_s = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16] + timeline_s = [24 * 60 * 60 * x for x in timeline_s] + timeline_text = 
['0:00','', '', 'day 3', '', '', 'day 6', '', '', 'day 9', '', '', 'day 12', '', '','day 15', ''] + axs[i].set_xticks(timeline_s,timeline_text) + if i == 1: + pass + else: + axs[i].set_xticklabels([]) # Remove x-axis labels + xlab = None + + + timeline_s = [0,21600,43200,64800,86400] + + #axs[i].set_xlim(timeline_s[0],timeline_s[-1]) + axs[i].set_xlabel(xlab) + axs[i].plot(df[x],df[y], label=policy, color=carray[c]) + axs[i].set_ylabel(ylab) + #$axs[i].plot(df[0],df[1],label=policy) + if file == "power_history.parquet": + axs[i].legend(loc='lower right',frameon=True) + axs[i].get_legend().get_frame().set_linewidth(0.0) + axs[i].set_title('Power',x=0.075, y=0.75,ha="left") + elif file == "util.parquet": + axs[i].set_title('Utilization',x=0.075, y=0.05,ha="left") + axs[i].legend(loc='lower right',frameon=True) + axs[i].get_legend().get_frame().set_linewidth(0.0) + elif file == "cooling_model.parquet": + if c_cnt == 0: + axs[i].set_title('PUE',x=0.05, y=0.8,ha="left") + axs[i].legend(loc='upper right') + elif c_cnt == 1: + axs[i].set_title('Cooling Tower\nReturn\nTemperature',x=0.05, y=0.5,ha="left") + axs[i].legend(loc='upper right') + else: + axs[i].set_title('Cooling Tower Supply Temperature',x=0.1, y=0.8) + axs[i].legend(loc='upper right') + c_cnt = c_cnt+1 + elif file == "loss_history.parquet": + axs[i].set_title('Loss') + axs[i].legend(loc='upper right') + else: + raise KeyError() +#plt.show() +#plt.savefig(f"3in1.png",bbox_inches='tight') +#plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0) +fig.subplots_adjust(hspace=0) +plt.tight_layout(pad=0,w_pad=0.0,h_pad=-0.08)#3) +plt.savefig(f"2in1-adastra.png",bbox_inches='tight',pad_inches = 0.02, dpi = 300) + diff --git a/scripts/plots/2in1-pm100day50.py b/scripts/plots/2in1-pm100day50.py new file mode 100644 index 0000000000000000000000000000000000000000..1f3e0df75bbae1a5e247a668b2720a4f3a8b2a23 --- /dev/null +++ b/scripts/plots/2in1-pm100day50.py @@ -0,0 +1,166 @@ +#!/bin/env python3 +import pandas as pd +import pyarrow.parquet as pq +import matplotlib.pyplot as plt + +import sys + +import matplotlib +matplotlib.rcParams['text.usetex'] = True + +plt.style.use("paper.mplstyle") + + +plt.rcParams.update({ + "text.usetex": True, + "font.family": "Libertine" +}) + +plt.rcParams['text.latex.preamble'] = r'\usepackage{libertine}' + +plt.rcParams.update({ + "text.usetex": True, + "font.family": "serif", + "font.serif": ["Linux Libertine O"], # Specify the font family +}) + + +pt = 1. / 72.27 +width = 1.2 * 241.14749 * pt +golden = (1 + 5**0.5) / 2 +height = width / golden * 3. / 5. + + +carray = ['tab:cyan','tab:orange','tab:brown','tab:blue'] + +if len(sys.argv) > 1: + path = sys.argv[1] +else: + print(f"Usage: python {sys.argv[0]} ") + exit() + +# e.g. 
path = "$HOME/Repositories/exadigit/raps/simulation_results/marconi100/day51" + +policies = ['fcfs-nobf','fcfs-easy','priority-nobf','priority-easy','priority-ffbf','replay'] +policies = ['fcfs-nobf','fcfs-easy','priority-ffbf','replay'] +files = ['cooling_model.parquet', 'loss_history.parquet', 'power_history.parquet', 'util.parquet'] +files = ['power_history.parquet', 'util.parquet'] +#files = ['util.parquet', 'power_history.parquet'] +#files = ['loss_history.parquet', 'power_history.parquet', 'util.parquet'] +#files = ['power_history.parquet', 'util.parquet', 'cooling_model.parquet'] + +policy_path = {f"{policy}":f"{path}/{policy}" for policy in policies} +full_files = {f"{policy}":f"{path}/{policy}/{file}" for policy in policies for file in files} + + +def iter_to_seconds(i): + return i * 15 + +c_cnt=0 +fig, axs = plt.subplots(len(files),figsize=(width,2 * height)) +for i,file in enumerate(files): + policy_files = [f"{path}/{policy}/{file}" for policy in policies] + for c,policy_file in enumerate(policy_files): + # df = pd.read_parquet(policy_file) + x = 'time' + xlab = 'Time [hours/days]' + policy = policy_file.split('/')[-2] + if file == "power_history.parquet": + y = 'power [kw]' + ylab = 'Power [kW]' + #ymax = 26000 + #ymin = 6500 + #axs[i].set_ylim(ymin,ymax) + df = pd.read_parquet(policy_file) + df = df.rename(columns={0:'time',1:'power [kw]'}) + + elif file == "cooling_model.parquet": + if c_cnt == 0: + y = 'pue' + ylab = 'PUE' + if c_cnt == 1: + y = 'simulator[1].centralEnergyPlant[1].coolingTowerLoop[1].summary.T_fac_ctw_r_C' + ylab = 'Temperature [°C]' + if c_cnt == 2: + y = 'simulator[1].centralEnergyPlant[1].coolingTowerLoop[1].summary.T_fac_ctw_s_C' + ylab = 'Temperature [°C]' + + df = pd.read_parquet(policy_file) + df['index'] = df.index + df[x] = df['index'].apply(iter_to_seconds) + ymax = max(df[y]) + #axs[i].plot(df[x],df[y], label=ylab) + #y = 'simulator[1].centralEnergyPlant[1].coolingTowerLoop[1].summary.T_fac_ctw_r_C' + + elif file == "loss_history.parquet": + y = 'loss [kw]' + ylab = 'Loss [kW]' + #ylim = 29000 + #axs[i].set_ylim(0,ylim) + + df = pd.read_parquet(policy_file) + df = df.rename(columns={0:'time',1:'loss [kw]'}) + #axs[i].plot(df[x],df[y], label=ylab) + + elif file == "util.parquet": + y = 'utilization' + ylab = r'Utilization [\%]' + + df = pd.read_parquet(policy_file) + df = df.rename(columns={0:'time', 1:'utilization [%]'}) + df[y] = df['utilization [%]'] / 100 + #axs[i].plot(df[x], df[y], label=ylab) + + else: + raise KeyError + + timeline_s = [] + timeline_text = [] + + timeline_s.extend([4320000,4330800,4341600,4352400,4363200,4374000,4384800,4395600]) + timeline_text.extend(['0:00\nDay 50','3:00','6:00','9:00','12:00','15:00','18:00','21:00']) + timeline_s.extend([4406400,4417200,4428000,4438800,4449600,4460400,4471200,4482000]) + timeline_text.extend(['0:00\nDay 51','3:00','6:00','9:00','12:00','15:00','18:00','21:00']) + + axs[i].set_xticks(timeline_s,timeline_text) + if i == 1: + pass + else: + axs[i].set_xticklabels([]) # Remove x-axis labels + xlab = None + + axs[i].set_xlabel(xlab) + axs[i].plot(df[x],df[y], label=policy, color=carray[c]) + axs[i].set_ylabel(ylab) + #$axs[i].plot(df[0],df[1],label=policy) + if file == "power_history.parquet": + axs[i].legend(loc='lower right',frameon=True) + axs[i].get_legend().get_frame().set_linewidth(0.0) + axs[i].set_title('Power',x=0.07, y=0.03,ha="left") + elif file == "util.parquet": + axs[i].set_title('Utilization',x=0.07, y=0.03,ha="left") + axs[i].legend(loc='lower right',frameon=True) + 
axs[i].get_legend().get_frame().set_linewidth(0.0) + elif file == "cooling_model.parquet": + if c_cnt == 0: + axs[i].set_title('PUE',x=0.05, y=0.8,ha="left") + axs[i].legend(loc='upper right') + elif c_cnt == 1: + axs[i].set_title('Cooling Tower\nReturn\nTemperature',x=0.05, y=0.5,ha="left") + axs[i].legend(loc='upper right') + else: + axs[i].set_title('Cooling Tower Supply Temperature',x=0.1, y=0.8) + axs[i].legend(loc='upper right') + c_cnt = c_cnt+1 + elif file == "loss_history.parquet": + axs[i].set_title('Loss') + axs[i].legend(loc='upper right') + else: + raise KeyError() +#plt.show() +#plt.savefig(f"3in1.png",bbox_inches='tight') +#plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0) +fig.subplots_adjust(hspace=0) +plt.tight_layout(pad=0,w_pad=0.0,h_pad=-0.08)#3) +plt.savefig(f"2in1-pm100day50.png",bbox_inches='tight',pad_inches = 0.02, dpi = 300) + diff --git a/scripts/plots/4in1-frontier-wC.py b/scripts/plots/4in1-frontier-wC.py new file mode 100644 index 0000000000000000000000000000000000000000..79ce23563cfa59247f362fa27ee80d8e7172c072 --- /dev/null +++ b/scripts/plots/4in1-frontier-wC.py @@ -0,0 +1,167 @@ +#!/bin/env python3 +import pandas as pd +import pyarrow.parquet as pq +import matplotlib.pyplot as plt + +import sys + +import matplotlib +matplotlib.rcParams['text.usetex'] = True + +plt.style.use("paper.mplstyle") + + +plt.rcParams.update({ + "text.usetex": True, + "font.family": "Libertine" +}) + +plt.rcParams['text.latex.preamble'] = r'\usepackage{libertine}' + +plt.rcParams.update({ + "text.usetex": True, + "font.family": "serif", + "font.serif": ["Linux Libertine O"], # Specify the font family +}) + + +pt = 1. / 72.27 +width = 1.2 * 241.14749 * pt +golden = (1 + 5**0.5) / 2 +height = width / golden * 3. / 5. + + +carray = ['tab:cyan','tab:orange','tab:brown','tab:blue'] + +if len(sys.argv) > 1: + path = sys.argv[1] +else: + print(f"Usage: python {sys.argv[0]} ") + exit() + +# e.g. 
path = "$HOME/Repositories/exadigit/raps/simulation_results/frontier" + +policies = ['fcfs-nobf','fcfs-easy','priority-nobf','priority-easy','priority-ffbf','replay'] +policies = ['fcfs-nobf','fcfs-easy','priority-ffbf','replay'] +files = ['cooling_model.parquet', 'loss_history.parquet', 'power_history.parquet', 'util.parquet'] +files = ['cooling_model.parquet', 'power_history.parquet', 'util.parquet'] +files = ['util.parquet', 'power_history.parquet', 'cooling_model.parquet', 'cooling_model.parquet'] #, 'cooling_model.parquet'] +#files = ['util.parquet', 'power_history.parquet'] +#files = ['loss_history.parquet', 'power_history.parquet', 'util.parquet'] +#files = ['power_history.parquet', 'util.parquet', 'cooling_model.parquet'] + +policy_path = {f"{policy}":f"{path}/{policy}" for policy in policies} +full_files = {f"{policy}":f"{path}/{policy}/{file}" for policy in policies for file in files} + + +def iter_to_seconds(i): + return i * 15 + +c_cnt=0 +fig, axs = plt.subplots(len(files),figsize=(width,4 * height)) +for i,file in enumerate(files): + policy_files = [f"{path}/{policy}/{file}" for policy in policies] + for c,policy_file in enumerate(policy_files): + # df = pd.read_parquet(policy_file) + x = 'time' + xlab = 'Time [hours]' + policy = policy_file.split('/')[-2] + if file == "power_history.parquet": + y = 'power [kw]' + ylab = 'Power [kW]' + #ymax = 26000 + #ymin = 6500 + #axs[i].set_ylim(ymin,ymax) + df = pd.read_parquet(policy_file) + df = df.rename(columns={0:'time',1:'power [kw]'}) + + elif file == "cooling_model.parquet": + if c_cnt == 0: + y = 'pue' + ylab = 'PUE' + if c_cnt == 1: + y = 'simulator[1].centralEnergyPlant[1].coolingTowerLoop[1].summary.T_fac_ctw_r_C' + ylab = 'Temperature [°C]' + if c_cnt == 2: + y = 'simulator[1].centralEnergyPlant[1].coolingTowerLoop[1].summary.T_fac_ctw_s_C' + ylab = 'Temperature [°C]' + + df = pd.read_parquet(policy_file) + df['index'] = df.index + df[x] = df['index'].apply(iter_to_seconds) + ymax = max(df[y]) + #axs[i].plot(df[x],df[y], label=ylab) + #y = 'simulator[1].centralEnergyPlant[1].coolingTowerLoop[1].summary.T_fac_ctw_r_C' + + elif file == "loss_history.parquet": + y = 'loss [kw]' + ylab = 'Loss [kW]' + #ylim = 29000 + #axs[i].set_ylim(0,ylim) + + df = pd.read_parquet(policy_file) + df = df.rename(columns={0:'time',1:'loss [kw]'}) + #axs[i].plot(df[x],df[y], label=ylab) + + elif file == "util.parquet": + y = 'utilization' + ylab = r'Utilization [\%]' + + df = pd.read_parquet(policy_file) + df = df.rename(columns={0:'time', 1:'utilization [%]'}) + df[y] = df['utilization [%]'] / 100 + #axs[i].plot(df[x], df[y], label=ylab) + + else: + raise KeyError + + + if i == 3: + timeline_s = [0,21600,43200,64800,86400] + timeline_h = ['0:00','6:00','12:00','18:00','24:00'] + axs[i].set_xticks(timeline_s,timeline_h) + else: + timeline_s = [0,21600,43200,64800,86400] + timeline_h = ['0:00','6:00','12:00','18:00','24:00'] + axs[i].set_xticks(timeline_s,timeline_h) + + axs[i].set_xticklabels([]) # Remove x-axis labels + xlab = None + + + timeline_s = [0,21600,43200,64800,86400] + + #axs[i].set_xlim(timeline_s[0],timeline_s[-1]) + axs[i].set_xlabel(xlab) + axs[i].plot(df[x],df[y], label=policy, color=carray[c]) + axs[i].set_ylabel(ylab) + #$axs[i].plot(df[0],df[1],label=policy) + if file == "power_history.parquet": + axs[i].legend(loc='upper right') + axs[i].set_title('Power',x=0.05, y=0.8,ha="left") + elif file == "util.parquet": + axs[i].set_title('Utilization',x=0.05, y=0.1,ha="left") + axs[i].legend(loc='lower right') + elif file == 
"cooling_model.parquet": + if c_cnt == 0: + axs[i].set_title('PUE',x=0.05, y=0.8,ha="left") + axs[i].legend(loc='upper right') + elif c_cnt == 1: + axs[i].set_title('Cooling Tower\nReturn\nTemperature',x=0.05, y=0.5,ha="left") + axs[i].legend(loc='upper right') + else: + axs[i].set_title('Cooling Tower Supply Temperature',x=0.1, y=0.8) + axs[i].legend(loc='upper right') + c_cnt = c_cnt+1 + elif file == "loss_history.parquet": + axs[i].set_title('Loss') + axs[i].legend(loc='upper right') + else: + raise KeyError() +#plt.show() +#plt.savefig(f"3in1.png",bbox_inches='tight') +#plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0) +fig.subplots_adjust(hspace=0) +plt.tight_layout(pad=0,w_pad=0.0,h_pad=-0.08)#3) +plt.savefig(f"4in1-frontier-wC.png",bbox_inches='tight',pad_inches = 0.02, dpi = 300) + diff --git a/scripts/plots/fgk_frontier.py b/scripts/plots/fgk_frontier.py new file mode 100644 index 0000000000000000000000000000000000000000..20c3deeee4e7230fa9631760cfa09a010c19b502 --- /dev/null +++ b/scripts/plots/fgk_frontier.py @@ -0,0 +1,169 @@ +#!/bin/env python3 +import pandas as pd +import pyarrow.parquet as pq +import matplotlib.pyplot as plt +import matplotlib +import numpy + +import sys + +matplotlib.rcParams['text.usetex'] = True + +plt.style.use("paper.mplstyle") + + +plt.rcParams.update({ + "text.usetex": True, + "font.family": "Libertine" +}) + +plt.rcParams['text.latex.preamble'] = r'\usepackage{libertine}' + +plt.rcParams.update({ + "text.usetex": True, + "font.family": "serif", + "font.serif": ["Linux Libertine O"], # Specify the font family +}) + + + +pt = 1. / 72.27 +width = 1.2*241.14749 * pt +golden = (1 + 5**0.5) / 2 +height = width / golden * 4./5. +# COLUMNWIDTH241.14749pt TEXTWIDTH506.295pt + + +carray = [] +t = plt.get_cmap('tab10').colors +for i in range(0,len(t)): + carray.append(t[i]) +g = carray[2] +carray[2] = carray[4] +carray[4] = g + +if len(sys.argv) > 1: + path = sys.argv[1] +else: + print(f"Usage: python {sys.argv[0]} ") + exit() +# e.g. 
path = "$HOME/Repositories/exadigit/raps/simulation_results/frontier/nnew_fkg_2024-01-18" + +policies = [ + 'replay', +# 'replay-ffbf', +# 'fcfs-ffbf', +# 'priority-ffbf', # on fcfs +# 'sjf-ffbf', +# 'ljf-ffbf', # on prio + 'acct_avg_power-ffbf', + 'acct_low_avg_power-ffbf', +# 'acct_avg_power_w4lj-ffbf', + 'acct_edp-ffbf', + 'acct_fugaku_pts-ffbf', + #'acct_ed2p-ffbf', #Sim to edp + #'acct_pdp-ffbf', + +] +#policies = ['fcfs-nobf','fcfs-easy','priority-nobf','priority-easy','priority-ffbf','replay'] +#policies = ['fcfs-nobf','fcfs-easy','priority-ffbf','replay'] +#policies = ['replay','prio-ffbf','fugaku_pts'] +#files = ['cooling_model.parquet', 'loss_history.parquet', 'power_history.parquet', 'util.parquet'] +#files = ['cooling_model.parquet', 'power_history.parquet', 'util.parquet'] +#files = ['util.parquet', 'power_history.parquet', 'cooling_model.parquet'] +files = ['util.parquet', 'power_history.parquet'] +files = ['power_history.parquet'] +#files = ['loss_history.parquet', 'power_history.parquet', 'util.parquet'] +#files = ['power_history.parquet', 'util.parquet', 'cooling_model.parquet'] + +prefix = "" + +policy_path = {f"{policy}":f"{path}/{prefix}{policy}" for policy in policies} +full_files = {f"{policy}":f"{path}/{prefix}{policy}/{file}" for policy in policies for file in files} + + +def iter_to_seconds(i): + return i * 15 + + +fig, axs = plt.subplots(len(files),figsize=(width, height*len(files)),sharex=True) +if isinstance(axs, matplotlib.axes._axes.Axes): + axs = [axs] +elif isinstance(axs, numpy.ndarray): + pass +else: + pass + +for i,file in enumerate(files): + policy_files = [f"{path}/{prefix}{policy}/{file}" for policy in policies] + for c,policy_file in enumerate(policy_files): + # df = pd.read_parquet(policy_file) + x = 'time' + xlab = 'Time [hours]' + policy = policy_file.split('/')[-2] + if file == "power_history.parquet": + y = 'power [kw]' + ylab = 'Power [kW]' + df = pd.read_parquet(policy_file) + df = df.rename(columns={0:'time',1:'power [kw]'}) + + + elif file == "cooling_model.parquet": + y = 'pue' + ylab = 'PUE' + + df = pd.read_parquet(policy_file) + df['index'] = df.index + df[x] = df['index'].apply(iter_to_seconds) + ymax = max(df['pue']) + + + elif file == "loss_history.parquet": + y = 'loss [kw]' + ylab = 'Loss [kW]' + + df = pd.read_parquet(policy_file) + df = df.rename(columns={0:'time',1:'loss [kw]'}) + + elif file == "util.parquet": + y = 'utilization' + ylab = 'Utilization' + + df = pd.read_parquet(policy_file) + df = df.rename(columns={0:'time', 1:'utilization [%]'}) + df[y] = df['utilization [%]'] / 100 + + + + else: + raise KeyError + + timeline_s = [0,21600,43200,64800,86400] + timeline_h = ['0:00','6:00','12:00','18:00','24:00'] + axs[i].set_xticks(timeline_s,timeline_h) + axs[i].set_xlabel(xlab) + + axs[i].plot(df[x],df[y], label=policy, + #linewidth=0.5, + marker='', color=carray[c]) + axs[i].set_ylabel(ylab) + #$axs[i].plot(df[0],df[1],label=policy) + if file == "power_history.parquet": + axs[i].legend(loc='center left',bbox_to_anchor=(0.02, 0.6)) + axs[i].set_title('Power',x=0.1,y=0.80) + elif file == "util.parquet": + axs[i].set_title('Utilization') + axs[i].legend(loc='lower left') + elif file == "cooling_model.parquet": + axs[i].set_title('PUE') + axs[i].legend(loc='upper left') + elif file == "loss_history.parquet": + axs[i].set_title('Loss') + axs[i].legend(loc='upper left') + else: + raise KeyError() +#plt.show() +#plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0) +fig.subplots_adjust(hspace=0) 
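+# Panels share the x-axis (sharex=True above); zero hspace together with the
+# slightly negative h_pad below removes the gap between stacked panels when
+# more than one file is plotted.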
+plt.tight_layout(pad=0,w_pad=0.0,h_pad=-0.08)#3) +plt.savefig(f"nnew_fkg_2024-01-18.png",bbox_inches='tight',pad_inches = 0.02, dpi = 300)#, bbox_inches='tight') diff --git a/scripts/plots/paper.mplstyle b/scripts/plots/paper.mplstyle new file mode 100644 index 0000000000000000000000000000000000000000..5f77db60eebc003b101e745c0e289477dc5c3ed8 --- /dev/null +++ b/scripts/plots/paper.mplstyle @@ -0,0 +1,86 @@ +## See https://matplotlib.org/stable/tutorials/introductory/customizing.html#a-sample-matplotlibrc-file + +text.usetex: True +text.latex.preamble: \usepackage{amsmath}\usepackage{amssymb} +font.family: serif +#font.serif: \T1/LinuxLibertineT-TLF/m/n/10 +savefig.bbox: tight +savefig.format: pdf + +lines.linewidth: .5 + +## *************************************************************************** +## * AXES * +## *************************************************************************** +axes.linewidth: 0.5 # edge line width +axes.labelsize: 10 # font size of the x and y labels +axes.labelpad: 3.0 # space between label and axis +#axes.labelweight: normal # weight of the x and y labels +axes.grid: True +axes.grid.axis: y + +grid.linewidth: 0.2 + +## *************************************************************************** +## * TICKS * +## *************************************************************************** +## See https://matplotlib.org/api/axis_api.html#matplotlib.axis.Tick +xtick.top: True # draw ticks on the top side +# xtick.bottom: True # draw ticks on the bottom side +# xtick.labeltop: False # draw label on the top +# xtick.labelbottom: True # draw label on the bottom +# xtick.major.size: 3 # major tick size in points +# xtick.minor.size: 1.5 # minor tick size in points +xtick.major.width: .3 # major tick width in points +# xtick.minor.width: .3 # minor tick width in points +# xtick.major.pad: 2 # distance to major tick label in points +# xtick.minor.pad: 2 # distance to the minor tick label in points +# xtick.color: black # color of the ticks +# xtick.labelcolor: inherit # color of the tick labels or inherit from xtick.color +xtick.labelsize: 8 # font size of the tick labels +xtick.direction: in # direction: {in, out, inout} +# xtick.minor.visible: True # visibility of minor ticks on x-axis +# xtick.major.top: True # draw x axis top major ticks +# xtick.major.bottom: True # draw x axis bottom major ticks +# xtick.minor.top: False # draw x axis top minor ticks +# xtick.minor.bottom: False # draw x axis bottom minor ticks +# xtick.alignment: center # alignment of xticks +# +# ytick.left: True # draw ticks on the left side +ytick.right: True # draw ticks on the right side +# ytick.labelleft: True # draw tick labels on the left side +# ytick.labelright: False # draw tick labels on the right side +# ytick.major.size: 3 # major tick size in points +# ytick.minor.size: 1.5 # minor tick size in points +# ytick.major.width: .3 # major tick width in points +# ytick.minor.width: .3 # minor tick width in points +# ytick.major.pad: 2 # distance to major tick label in points +# ytick.minor.pad: 2 # distance to the minor tick label in points +# ytick.color: black # color of the ticks +# ytick.labelcolor: inherit # color of the tick labels or inherit from ytick.color +ytick.labelsize: 8 # font size of the tick labels +ytick.direction: in # direction: {in, out, inout} +# ytick.minor.visible: True # visibility of minor ticks on y-axis +# ytick.major.left: True # draw y axis left major ticks +# ytick.major.right: True # draw y axis right major ticks +# ytick.minor.left: True # draw y 
axis left minor ticks +# ytick.minor.right: True # draw y axis right minor ticks +# ytick.alignment: center_baseline # alignment of yticks + +## *************************************************************************** +## * LEGEND * +## *************************************************************************** +legend.loc: upper right +legend.frameon: False # if True, draw the legend on a background patch +# legend.framealpha: 0.8 # legend patch transparency +# legend.fancybox: True # if True, use a rounded box for the + # legend background, else a rectangle +#legend.markerscale: 1.0 # the relative size of legend markers vs. original +legend.fontsize: 6 + +## *************************************************************************** +## * FIGURE * +## *************************************************************************** +# figure.figsize: 3.4, 2.55 # figure size in inches +figure.dpi: 300 # figure dots per inch +# figure.frameon: True # enable figure frame diff --git a/scripts/run_inter_job_congestion.py b/scripts/run_inter_job_congestion.py new file mode 100644 index 0000000000000000000000000000000000000000..9312fd43b97bb1362c91960aac96b406f7c835db --- /dev/null +++ b/scripts/run_inter_job_congestion.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python3 +""" +RAPS Network Congestion Test (Inter-Job Interference) +====================================================== + +This script is a wrapper that uses the integrated `inter_job_congestion` +workload from the RAPS library to run a standalone network simulation. + +It evaluates inter-job network congestion by simulating multiple jobs +running concurrently on the same network and finding the total congestion +on the most loaded link. + +Usage: + python scripts/run_inter_job_congestion.py --config config/lassen.yaml + +Example: + python scripts/run_inter_job_congestion.py --config config/lassen.yaml --jobs 80 --txfrac 0.35 -v +""" + +from __future__ import annotations +import argparse +from pathlib import Path + +from raps.system_config import get_system_config +from raps.network import ( + NetworkModel, + simulate_inter_job_congestion, +) +from raps.workloads import Workload + + +def print_verbose_stats(stats): + print("\n--- Detailed Network Congestion Stats ---") + print(f" Max Congestion (Worst Link): {stats['max']:.2f}") + print(f" Mean Link Congestion: {stats['mean']:.2f}") + print(f" Min Link Congestion: {stats['min']:.2f}") + print(f" Std Dev of Congestion: {stats['std_dev']:.2f}") + print("\n Top 10 Most Congested Links:") + for (link, congestion) in stats['top_links']: + print(f" - Link {link}: {congestion:.2f}") + print("---------------------------------------") + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- + +def main(): + parser = argparse.ArgumentParser(description="Standalone inter-job network congestion test for RAPS.") + parser.add_argument("--config", required=True, help="Path to system YAML (e.g., config/lassen.yaml)") + parser.add_argument("--jobs", type=int, default=60, help="Number of synthetic jobs") + parser.add_argument("--txfrac", type=float, default=0.35, help="Fraction of per-link bandwidth per job") + parser.add_argument("--debug", action="store_true", help="Enable network debug output") + parser.add_argument("--verbose", "-v", action="store_true", help="Print detailed statistics") + args = parser.parse_args() + + # --- Load config and detect topology --- + sys_cfg = 
get_system_config(args.config) + legacy = sys_cfg.get_legacy() + + topology = legacy.get("TOPOLOGY", "").lower() + if not topology: + raise ValueError(f"Could not infer topology from {args.config}. Found: {topology!r}") + + # --- Generate Jobs via Workload module --- + # The workload class expects specific attribute names, so we add them to the args object. + args.workload = 'inter_job_congestion' + args.numjobs = args.jobs + args.seed = 42 # Keep seed consistent for this test script + args.start = None + + workload_generator = Workload(args, legacy) + workload_data = workload_generator.generate_jobs() + jobs = workload_data.jobs + + print(f"[INFO] Detected topology: {topology}") + print(f"[INFO] Generated {len(jobs)} jobs for congestion test.") + + # --- Initialize network model --- + net = NetworkModel( + config=legacy, + available_nodes=list(range(legacy["TOTAL_NODES"])), + output_dir=Path(f"test-{Path(args.config).stem}"), + debug=args.debug, + ) + + # --- Simulate all jobs running concurrently --- + congestion_stats = simulate_inter_job_congestion(net, jobs, legacy, debug=args.debug) + + print(f"[RESULT] config={args.config}, topology={topology}, jobs={len(jobs)}, " + f"total_congestion={congestion_stats['max']:.2f}") + + if args.verbose: + print_verbose_stats(congestion_stats) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000000000000000000000000000000000000..0b7d8b3b9ef3fa4a76ef715f238f2279039eb26c --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,40 @@ +import pytest +import uuid +import shutil +from glob import glob +from pathlib import Path +import gc +import os +from .util import PROJECT_ROOT + + +def pytest_addoption(parser): + parser.addoption( + "--runlong", action="store_true", default=False, help="Run long-running tests" + ) + + +def pytest_runtest_setup(item): + if "long" in item.keywords and not item.config.getoption("--runlong"): + # reason = f"Skipping {item.nodeid} because it requires --runlong" + reason = "Skipping test because it requires --runlong" + pytest.skip(reason) + + +@pytest.fixture() +def sim_output(): + """ + Handles cleaning up output from the sim. + Can also be used even if you aren't outputting anything, to run garbage collection after the sim.
+ """ + os.chdir(PROJECT_ROOT) + out = f"test-output/test-{str(uuid.uuid4())[:8]}" + yield out + for file in glob(f"{out}*"): + if Path(file).is_dir(): + shutil.rmtree(file) + else: + Path(file).unlink() + + # Also force a garbage collection to clean up memory after running a simulation + gc.collect() diff --git a/tests/smoke.py b/tests/smoke.py index 9174b3c22409cb68873ddd860cac9aac946c703c..a2ea598f39d69a812f7180cf3cdc41a0b66939f8 100644 --- a/tests/smoke.py +++ b/tests/smoke.py @@ -3,21 +3,23 @@ import argparse import subprocess # Define the data path -DATAPATH = os.path.expanduser("~/data") +DATAPATH = os.getenv("RAPS_DATA_DIR", "/opt/data") # Standardize the time setting DEFAULT_TIME = "1h" +BENCH_TIME = "4h" # Define systems and their corresponding filenames SYSTEMS = { "frontier": "frontier/slurm/joblive/date=2024-01-18 frontier/jobprofile/date=2024-01-18", "marconi100": "marconi100/job_table.parquet", "lassen": "lassen/Lassen-Supercomputer-Job-Dataset", - "adastraMI250": "adastra/AdastaJobsMI250_15days.parquet" + "adastraMI250": "adastra/AdastaJobsMI250_15days.parquet", } VALID_CHOICES = set(SYSTEMS.keys()).union({"synthetic", "hetero"}) + def run_command(command): """Helper function to run a shell command.""" print(f"Running: {command}") @@ -26,10 +28,12 @@ def run_command(command): print(f"Error: Command failed with return code {result.returncode}") exit(-1) + def build_command(system, file_paths, additional_args=""): """Build the command string for the given system and file paths.""" full_paths = " ".join([os.path.join(DATAPATH, path) for path in file_paths.split()]) - return f"python main.py --system {system} -f {full_paths} -t {DEFAULT_TIME} {additional_args}".strip() + return f"python main.py run --system {system} -f {full_paths} -t {DEFAULT_TIME} -o none {additional_args}".strip() + def execute_system_tests(systems): """Execute tests for selected systems.""" @@ -37,18 +41,21 @@ def execute_system_tests(systems): command = build_command(system, SYSTEMS[system]) run_command(command) + def synthetic_workload_tests(): """Run synthetic workload tests.""" print("Starting synthetic workload tests...") - run_command(f"python main.py -t {DEFAULT_TIME}") - run_command(f"python main.py -w benchmark -t {DEFAULT_TIME}") - run_command(f"python main.py -w peak -t {DEFAULT_TIME}") - run_command(f"python main.py -w idle -t {DEFAULT_TIME}") + run_command(f"python main.py run -t {DEFAULT_TIME}") + run_command(f"python main.py run -w benchmark -t {BENCH_TIME}") + run_command(f"python main.py run -w peak -t {DEFAULT_TIME}") + run_command(f"python main.py run -w idle -t {DEFAULT_TIME}") + def hetero_tests(): """Run heterogeneous workload tests.""" print("Starting heterogeneous workload tests...") - run_command(f"python multi-part-sim.py -x setonix/part-cpu setonix/part-gpu -t {DEFAULT_TIME}") + run_command(f"python main.py run-parts -x setonix/part-cpu setonix/part-gpu -t {DEFAULT_TIME}") + def main(): """Main function to parse arguments and run tests.""" @@ -56,7 +63,8 @@ def main(): parser.add_argument( "tests", nargs="*", # Allow multiple test selections, including none - help="Run tests for one or more specific systems (e.g., 'frontier lassen'), 'synthetic' workloads, or 'hetero'. If omitted, all tests run.", + help="Run tests for one or more specific systems (e.g., 'frontier lassen')," + "'synthetic' workloads, or 'hetero'. If omitted, all tests run." 
) args = parser.parse_args() @@ -83,5 +91,6 @@ def main(): if system_tests: execute_system_tests(system_tests) + if __name__ == "__main__": main() diff --git a/tests/systems/__init__.py b/tests/systems/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tests/systems/conftest.py b/tests/systems/conftest.py new file mode 100644 index 0000000000000000000000000000000000000000..31fa936244e72532a13efa85702bf763dafbc1ce --- /dev/null +++ b/tests/systems/conftest.py @@ -0,0 +1,222 @@ +import pytest +from tests.util import DATA_PATH + + +SYSTEM_CONFIGS = { + "40frontiers": { + "marks": [pytest.mark.long], # All these tests are long running as the system is large. + "main": True, + "telemetry": False, + "workload": False, + "multi-part-sim": False, + "withdata": False, + "start": None, + "files": [], + "cooling": False, + "uncertainty": True, + "time": True, + "time_delta": True, + "net": False, + }, + "adastraMI250": { + "marks": [], + "main": True, + "telemetry": True, + "workload": True, + "multi-part-sim": False, + "withdata": True, + "start": "2024-09-01T02:00:00Z", + "files": ["adastraMI250/AdastaJobsMI250_15days.parquet"], + "cooling": False, + "uncertainty": True, + "time": True, + "time_delta": True, + "net": False, + }, + "bluewaters": { + "marks": [], + "main": True, + "telemetry": True, + "workload": True, + "multi-part-sim": False, + "withdata": True, + "start": "2017-03-28T02:00:00Z", + "files": ["bluewaters"], + "cooling": False, + "uncertainty": False, + "time": True, + "time_delta": True, + "net": False, + }, + "frontier": { + "marks": [], + "main": True, + "telemetry": True, + "workload": True, + "multi-part-sim": False, + "withdata": True, + "start": "2024-01-18T03:00:00Z", + "files": ["frontier/slurm/joblive/date=2024-01-18/", "frontier/jobprofile/date=2024-01-18/"], + "cooling": True, + "uncertainty": True, + "time": True, + "time_delta": True, + "net": False, + }, + "fugaku": { + "marks": [], + "main": True, + "telemetry": True, + "workload": True, + "multi-part-sim": False, + "withdata": True, + "start": "2021-04-03T02:00:00Z", + "files": ["fugaku/21_04.parquet"], + "cooling": False, + "uncertainty": False, + "time": True, + "time_delta": True, + "net": False, + }, + "gcloudv2": { + "marks": [], + "main": True, + "telemetry": True, + "workload": True, + "multi-part-sim": False, + "withdata": True, + "start": "2011-05-02T05:00:00Z", + "files": ["gcloud/v2/google_cluster_data_2011_sample"], + "cooling": False, + "uncertainty": False, + "time": True, + "time_delta": True, + "net": False, + }, + "lassen": { + "marks": [], + "main": True, + "telemetry": False, # Takes very long! 
+ "workload": False, + "multi-part-sim": False, + "withdata": True, + "start": "2019-08-22T00:00:00Z", + "files": ["lassen/Lassen-Supercomputer-Job-Dataset"], + "cooling": True, + "uncertainty": False, + "time": True, + "time_delta": True, + "net": True, + }, + "marconi100": { + "marks": [], + "main": True, + "telemetry": True, + "workload": True, + "multi-part-sim": False, + "withdata": True, + "start": "2020-05-06T07:30:00Z", + "files": ["marconi100/job_table.parquet"], + "cooling": True, + "uncertainty": False, + "time": True, + "time_delta": True, + "net": False, + }, + "mit_supercloud": { + "marks": [], + "main": False, + "telemetry": False, + "workload": False, + "multi-part-sim": True, + "withdata": True, + "start": "2021-05-22T00:00:00Z", + "files": ["mit_supercloud/202201"], + "cooling": False, + "uncertainty": False, + "time": False, + "time_delta": False, + "net": False, + "net-multi-sim": True, + }, + "setonix": { + "marks": [], + "main": False, + "telemetry": True, + "workload": False, + "multi-part-sim": True, + "withdata": False, + "files": [], + "start": None, + "cooling": False, + "uncertainty": False, + "time": False, + "time_delta": False, + "net": False, + }, + "summit": { + "marks": [], + "main": True, + "telemetry": False, + "workload": False, + "multi-part-sim": False, + "withdata": False, + "files": [], + "start": None, + "cooling": True, + "uncertainty": False, + "time": True, + "time_delta": True, + "net": False, + }, + "lumi": { + "marks": [], + "main": False, + "telemetry": False, + "workload": False, + "multi-part-sim": True, + "withdata": False, + "files": [], + "start": None, + "cooling": False, + "uncertainty": False, + "time": False, + "time_delta": False, + "net": False, + "net-multi-sim": False + }, +} + + +@pytest.fixture(params=[ + pytest.param(k, marks=v.get('marks', [])) for k, v in SYSTEM_CONFIGS.items() +]) +def system(request): + return request.param + + +# Add markers to each test for the system. +# Similar to pytest -m marker. +# These are explicitly defined in pytest.ini, to avoid warnings. +# This way you can run test with: pytest -m systemname +def pytest_collection_modifyitems(config, items): + for item in items: + system = item.callspec.params.get("system") if hasattr(item, "callspec") else None + if system: + item.add_marker(getattr(pytest.mark, system)) + + +# #Define tests to run here! +@pytest.fixture +def system_config(system): + return SYSTEM_CONFIGS[system] + + +@pytest.fixture +def system_files(system): + file_list = [DATA_PATH / f for f in SYSTEM_CONFIGS[system].get('files', [])] + for file in file_list: + assert file.exists(), \ + f"File `{file}' does not exist. does ./data exist or is RAPS_DATA_DIR set?" 
+ + return [str(f) for f in file_list] diff --git a/tests/systems/test_engine_basic.py b/tests/systems/test_engine_basic.py new file mode 100644 index 0000000000000000000000000000000000000000..32b4221b5f36b3895924b419cc5b3088d64769b6 --- /dev/null +++ b/tests/systems/test_engine_basic.py @@ -0,0 +1,48 @@ +import pytest +from ..util import run_engine +from raps.engine import Engine +from raps.sim_config import SingleSimConfig +from raps.stats import get_engine_stats, get_job_stats, RunningStats + +pytestmark = [ + pytest.mark.system, + pytest.mark.nodata +] + + +def test_engine_basic(system, system_config, sim_output): + if not system_config.get("main", False): + pytest.skip(f"{system} does not support basic main run.") + + engine, stats = run_engine({ + "system": system, + "time": "2m", + }) + + assert stats['tick_count'] == 120 + assert stats['engine']['time_simulated'] == '0:02:00' + + +def test_engine_stats(system, system_config, sim_output): + if not system_config.get("main", False): + pytest.skip(f"{system} does not support basic main run.") + + engine = Engine(SingleSimConfig.model_validate({ + "system": system, + "time": "2m", + })) + gen = engine.run_simulation() + running_stats = RunningStats(engine) + + for tick in gen: + stats = running_stats.get_stats() + stats = running_stats.get_stats() + + final_stats = { + **get_engine_stats(engine), + **get_job_stats(engine), + } + + # Confirm the running stats match up with the final stat computation + for stat in stats.keys(): + assert pytest.approx(stats[stat]) == final_stats[stat], f"stat {stat}" diff --git a/tests/systems/test_main_basic_run.py b/tests/systems/test_main_basic_run.py new file mode 100644 index 0000000000000000000000000000000000000000..37661f30ca384e6d8a8d181d0793c5ae703b3478 --- /dev/null +++ b/tests/systems/test_main_basic_run.py @@ -0,0 +1,24 @@ +import os +import subprocess +import pytest +from tests.util import PROJECT_ROOT + + +pytestmark = [ + pytest.mark.system, + pytest.mark.nodata +] + + +def test_main_basic_run(system, system_config, sim_output): + if not system_config.get("main", False): + pytest.skip(f"{system} does not support basic main run.") + + os.chdir(PROJECT_ROOT) + result = subprocess.run([ + "python", "main.py", "run", + "--time", "1m", + "--system", system, + "-o", sim_output, + ], capture_output=True, text=True, stdin=subprocess.DEVNULL) + assert result.returncode == 0, f"Failed on {system}: {result.stderr}" diff --git a/tests/systems/test_main_cooling_run.py b/tests/systems/test_main_cooling_run.py new file mode 100644 index 0000000000000000000000000000000000000000..da0c3a33015b483bbd29b109724dd2c41f920293 --- /dev/null +++ b/tests/systems/test_main_cooling_run.py @@ -0,0 +1,27 @@ +import os +import subprocess +import pytest +from tests.util import PROJECT_ROOT + + +pytestmark = [ + pytest.mark.system, + pytest.mark.nodata, + pytest.mark.cooling, +] + + +def test_main_cooling_run(system, system_config, sim_output): + if not system_config.get("cooling", False): + pytest.skip(f"{system} does not support cooling.") + + os.chdir(PROJECT_ROOT) + result = subprocess.run([ + "python", "main.py", "run", + "--time", "1h", + "--system", system, + "-c", + "--noui", + "-o", sim_output + ], capture_output=True, text=True, stdin=subprocess.DEVNULL) + assert result.returncode == 0, f"Failed on {system}: {result.stderr}" diff --git a/tests/systems/test_main_cooling_uncertainty_run.py b/tests/systems/test_main_cooling_uncertainty_run.py new file mode 100644 index 
0000000000000000000000000000000000000000..472771d2e663b8075c78f78fadd77ac0bfa2eef3 --- /dev/null +++ b/tests/systems/test_main_cooling_uncertainty_run.py @@ -0,0 +1,29 @@ +import os +import subprocess +import pytest +from tests.util import PROJECT_ROOT + +pytestmark = [ + pytest.mark.system, + pytest.mark.nodata, + pytest.mark.cooling, + pytest.mark.uncertainty +] + + +def test_main_cooling_uncertainty_run(request, system, system_config, sim_output): + print(f"Markexpr: {request.config.option.markexpr}") + if not system_config.get("uncertainty", False) or not system_config.get("cooling", False): + pytest.skip(f"{system} does not support cooling or uncertainty.") + + os.chdir(PROJECT_ROOT) + result = subprocess.run([ + "python", "main.py", "run", + "--time", "3m", + "--system", system, + "-c", + "-u", + "--noui", + "-o", sim_output + ], capture_output=True, text=True, stdin=subprocess.DEVNULL) + assert result.returncode == 0, f"Failed on {system}: {result.stderr}" diff --git a/tests/systems/test_main_fastforward_run.py b/tests/systems/test_main_fastforward_run.py new file mode 100644 index 0000000000000000000000000000000000000000..5a3b38fc77681b20ad2a064548b8ac64e0060d42 --- /dev/null +++ b/tests/systems/test_main_fastforward_run.py @@ -0,0 +1,22 @@ +import pytest +from ..util import run_engine + + +pytestmark = [ + pytest.mark.system, + pytest.mark.nodata, + pytest.mark.fastforward +] + + +@pytest.mark.parametrize("ff_arg", ["0s", "1s", "3600s", "60m"]) +def test_main_fastforward_run(system, system_config, ff_arg, sim_output): + if not system_config.get("main", False): + pytest.skip(f"{system} does not support basic main even without data.") + + engine, stats = run_engine({ + "system": system, + "fastforward": ff_arg, + "time": "10s", + }) + assert stats['engine']['time_simulated'] == '0:00:10' diff --git a/tests/systems/test_main_help.py b/tests/systems/test_main_help.py new file mode 100644 index 0000000000000000000000000000000000000000..97fabefc56ff11be1ff034c6c10fd63d8b990dfc --- /dev/null +++ b/tests/systems/test_main_help.py @@ -0,0 +1,24 @@ +import os +import subprocess +import pytest +from tests.util import PROJECT_ROOT + + +pytestmark = [ + pytest.mark.system, + pytest.mark.nodata +] + + +def test_main_help(system, system_config): + if not system_config.get("main", False): + pytest.skip(f"{system} does not support basic main run.") + + os.chdir(PROJECT_ROOT) + result = subprocess.run([ + "python", "main.py", "run", + "-h" + ], capture_output=True, text=True, stdin=subprocess.DEVNULL) + + assert result.returncode == 0, f"Failed on {system}: {result.stderr}" + assert "usage:" in result.stdout diff --git a/tests/systems/test_main_network_run.py b/tests/systems/test_main_network_run.py new file mode 100644 index 0000000000000000000000000000000000000000..5c139895a4559f71b39bd461317dad56c5e915e4 --- /dev/null +++ b/tests/systems/test_main_network_run.py @@ -0,0 +1,32 @@ +import os +import subprocess +import pytest +from tests.util import PROJECT_ROOT + + +pytestmark = [ + pytest.mark.system, + pytest.mark.nodata, + pytest.mark.network +] + + +def test_main_network_run(system, system_config, sim_output, pytestconfig): + if system == "lassen" and not pytestconfig.getoption("--runlong"): + pytest.skip("This test for \"lassen\" is very long; pass --runlong to run it") + + if not system_config.get("main", False): + pytest.skip(f"{system} does not support basic main run.") + + if not system_config.get("net", False): + pytest.skip(f"{system} does not support network run.") + + 
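+    # Drives the real CLI end-to-end with the network model enabled; the only assertion is a clean exit code, with stderr surfaced in the failure message.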
os.chdir(PROJECT_ROOT) + result = subprocess.run([ + "python", "main.py", "run", + "--time", "1m", + "--system", system, + "--net", + "-o", sim_output + ], capture_output=True, text=True, stdin=subprocess.DEVNULL) + assert result.returncode == 0, f"Failed on {system}: {result.stderr}" diff --git a/tests/systems/test_main_network_withdata_run.py b/tests/systems/test_main_network_withdata_run.py new file mode 100644 index 0000000000000000000000000000000000000000..1cbeae863542c3a48c32a77bdac6e6061fd7c5c9 --- /dev/null +++ b/tests/systems/test_main_network_withdata_run.py @@ -0,0 +1,29 @@ +import os +import subprocess +import pytest +from tests.util import PROJECT_ROOT + + +pytestmark = [ + pytest.mark.system, + pytest.mark.nodata, + pytest.mark.withdata, + pytest.mark.long, + pytest.mark.network +] + + +def test_main_network_withdata_run(system, system_config, system_files, sim_output): + if not system_config.get("net", False): + pytest.skip(f"{system} does not support basic net run.") + + os.chdir(PROJECT_ROOT) + result = subprocess.run([ + "python", "main.py", "run", + "--time", "1m", + "--system", system, + "-f", ','.join(system_files), + "--net", + "-o", sim_output + ], capture_output=True, text=True, stdin=subprocess.DEVNULL) + assert result.returncode == 0, f"Failed on {system}: {result.stderr}" diff --git a/tests/systems/test_main_noui_run.py b/tests/systems/test_main_noui_run.py new file mode 100644 index 0000000000000000000000000000000000000000..08e71898c51fba4813d39e5487aece2e62e9b42c --- /dev/null +++ b/tests/systems/test_main_noui_run.py @@ -0,0 +1,25 @@ +import os +import subprocess +import pytest +from tests.util import PROJECT_ROOT + + +pytestmark = [ + pytest.mark.system, + pytest.mark.nodata +] + + +def test_main_noui_run(system, system_config, sim_output): + if not system_config.get("main", False): + pytest.skip(f"{system} does not support basic main run.") + + os.chdir(PROJECT_ROOT) + result = subprocess.run([ + "python", "main.py", "run", + "--time", "1m", + "--system", system, + "--noui", + "-o", sim_output + ], capture_output=True, text=True, stdin=subprocess.DEVNULL) + assert result.returncode == 0, f"Failed on {system}: {result.stderr}" diff --git a/tests/systems/test_main_start_run.py b/tests/systems/test_main_start_run.py new file mode 100644 index 0000000000000000000000000000000000000000..19cc163ee7d4ffccc2377922222736ce5224d71c --- /dev/null +++ b/tests/systems/test_main_start_run.py @@ -0,0 +1,22 @@ +import pytest +from ..util import run_engine + + +pytestmark = [ + pytest.mark.system, + pytest.mark.nodata, +] + + +@pytest.mark.parametrize("start", [ + "2025-01-01", "2024-01-04T00:00Z", "1970-01-01T00:00:00+00:00", +]) +def test_main_start_run(system, system_config, sim_output, start): + if not system_config.get("main", False): + pytest.skip(f"{system} does not support basic main even without data.") + + engine, stats = run_engine({ + "system": system, + "time": "10s", + "start": start + }) diff --git a/tests/systems/test_main_time_delta_run.py b/tests/systems/test_main_time_delta_run.py new file mode 100644 index 0000000000000000000000000000000000000000..2ca8477f452fbfc0e0c8326301d28de5d7c8a1be --- /dev/null +++ b/tests/systems/test_main_time_delta_run.py @@ -0,0 +1,39 @@ +import os +import subprocess +import pytest +from tests.util import PROJECT_ROOT +from raps.utils import convert_to_time_unit, convert_seconds_to_hhmmss + + +pytestmark = [ + pytest.mark.system, + pytest.mark.nodata, + pytest.mark.time_delta +] + + +@pytest.mark.parametrize("time_arg, 
tdelta_arg", [ + ("100", "1"), + ("100", "1s"), + ("100", "10s"), + ("10m", "1m"), + ("10h", "1h"), + ("10h", "3h"), + ("3d", "1d") +], ids=["1", "1s", "10s", "1m", "1h", "3h", "1d"]) +def test_main_time_delta_run(system, system_config, time_arg, tdelta_arg, sim_output): + if not system_config.get("time_delta", False): + pytest.skip(f"{system} does not support time_delta run.") + + os.chdir(PROJECT_ROOT) + result = subprocess.run([ + "python", "main.py", "run", + "-t", time_arg, + "--time-delta", tdelta_arg, + "--system", system, + "--noui", + "-o", sim_output + ], capture_output=True, text=True, stdin=subprocess.DEVNULL) + assert result.returncode == 0, f"Failed on {system}: {result.stderr}" + time = convert_to_time_unit(time_arg) + assert f"Time Simulated: {convert_seconds_to_hhmmss(time)}" in result.stdout diff --git a/tests/systems/test_main_time_delta_sub_second_run.py b/tests/systems/test_main_time_delta_sub_second_run.py new file mode 100644 index 0000000000000000000000000000000000000000..db80105777545f110d25b1bfbe71aabafef2f929 --- /dev/null +++ b/tests/systems/test_main_time_delta_sub_second_run.py @@ -0,0 +1,44 @@ +import os +import subprocess +import gc +import pytest +from tests.util import PROJECT_ROOT +from raps.utils import convert_seconds_to_hhmmss, parse_td + + +pytestmark = [ + pytest.mark.system, + pytest.mark.nodata, + pytest.mark.time_delta +] + + +@pytest.mark.parametrize("time_arg, tdelta_arg", [ + ("10", "1ds"), + ("60", "3ds"), + ("1", "1cs"), + ("1", "1ms"), + ("10ds", "1cs"), + ("10cs", "1ms"), + ("100ms", "1ms"), + ("100ms", "1s"), +], ids=["1ds", "3ds", "1cs", "1ms", "1cs-for-10ds", "1ms-for-10cs", "1ms-for-100ms", "1s-for-100ms"]) +def test_main_time_delta_sub_second_run(system, system_config, time_arg, tdelta_arg, sim_output): + if not system_config.get("time_delta", False): + pytest.skip(f"{system} does not support time_delta run.") + + os.chdir(PROJECT_ROOT) + result = subprocess.run([ + "python", "main.py", "run", + "-t", time_arg, + "--time-delta", tdelta_arg, + "--system", system, + "--noui", + "-o", sim_output, + ], capture_output=True, text=True, stdin=subprocess.DEVNULL) + assert result.returncode == 0, f"Failed on {system}: {result.stderr}" + time = parse_td(time_arg).seconds + assert f"Time Simulated: {convert_seconds_to_hhmmss(time)}" in result.stdout + + del result + gc.collect() diff --git a/tests/systems/test_main_time_run.py b/tests/systems/test_main_time_run.py new file mode 100644 index 0000000000000000000000000000000000000000..0faa06cb121c1f444ea8e64d0e9e04dbf4e9d1c9 --- /dev/null +++ b/tests/systems/test_main_time_run.py @@ -0,0 +1,35 @@ +import os +import subprocess +import pytest +from tests.util import PROJECT_ROOT + + +pytestmark = [ + pytest.mark.system, + pytest.mark.nodata, + pytest.mark.time +] + + +@pytest.mark.parametrize("time_args", [ + "0", "1", "3600", "7200", + pytest.param("43200", marks=pytest.mark.long), # mark this one as long + "0s", "1s", "3600s", "7200s", + pytest.param("43200s", marks=pytest.mark.long), # mark this one as long + "0m", "1m", "60m", + "0h", "1h", + pytest.param("6h", marks=pytest.mark.long), # mark this one as long +]) +def test_main_time_run(system, system_config, time_args, sim_output): + if not system_config.get("main", False): + pytest.skip(f"{system} does not support basic main run.") + + os.chdir(PROJECT_ROOT) + result = subprocess.run([ + "python", "main.py", "run", + "--time", time_args, + "--system", system, + "--noui", + "-o", sim_output + ], capture_output=True, text=True, 
stdin=subprocess.DEVNULL) + assert result.returncode == 0, f"Failed on {system}: {result.stderr}" diff --git a/tests/systems/test_main_uncertainty_run.py b/tests/systems/test_main_uncertainty_run.py new file mode 100644 index 0000000000000000000000000000000000000000..f3d5bd070eb34a86ce0e698a8835320d4971329e --- /dev/null +++ b/tests/systems/test_main_uncertainty_run.py @@ -0,0 +1,28 @@ +import os +import subprocess +import pytest +from tests.util import PROJECT_ROOT + + +pytestmark = [ + pytest.mark.system, + pytest.mark.nodata, + pytest.mark.uncertainty, + pytest.mark.long +] + + +def test_main_uncertainty_run(system, system_config, sim_output): + if not system_config.get("uncertainty", False): + pytest.skip(f"{system} does not support uncertainty.") + + os.chdir(PROJECT_ROOT) + result = subprocess.run([ + "python", "main.py", "run", + "--time", "3m", + "--system", system, + "-u", + "--noui", + "-o", sim_output + ], capture_output=True, text=True, stdin=subprocess.DEVNULL) + assert result.returncode == 0, f"Failed on {system}: {result.stderr}" diff --git a/tests/systems/test_main_withdata_range_run.py b/tests/systems/test_main_withdata_range_run.py new file mode 100644 index 0000000000000000000000000000000000000000..63c3e8657d3d523b3ed32abf61c118987f4973bb --- /dev/null +++ b/tests/systems/test_main_withdata_range_run.py @@ -0,0 +1,27 @@ +import pytest +from ..util import run_engine + +pytestmark = [ + pytest.mark.system, + pytest.mark.withdata, + pytest.mark.long +] + + +def test_main_withdata_range_run(system, system_config, system_files, sim_output): + if not system_config.get("main", False): + pytest.skip(f"{system} does not support basic main even without data.") + if not system_config.get("withdata", False): + pytest.skip(f"{system} does not support basic main with data.") + + engine, stats = run_engine({ + "system": system, + "start": system_config['start'], + "time": "10m", + "replay": system_files, + }) + + # Check that it at least loaded some data + assert stats['tick_count'] == 10 * 60 + assert stats['job']['jobs_total'] > 0 + assert len(stats['job']['jobs_still_running']) + stats['job']['jobs_completed'] > 0 diff --git a/tests/systems/test_main_withdata_run.py b/tests/systems/test_main_withdata_run.py new file mode 100644 index 0000000000000000000000000000000000000000..1ca64134f47b31faffa852f7005832ef2a6abfdf --- /dev/null +++ b/tests/systems/test_main_withdata_run.py @@ -0,0 +1,26 @@ +import pytest +from ..util import run_engine + +pytestmark = [ + pytest.mark.system, + pytest.mark.withdata, + pytest.mark.long +] + + +def test_main_withdata_run(system, system_config, system_files, sim_output): + if not system_config.get("main", False): + pytest.skip(f"{system} does not support basic main even without data.") + if not system_config.get("withdata", False): + pytest.skip(f"{system} does not support basic main with data.") + + engine, stats = run_engine({ + "system": system, + "time": "20m", + "replay": system_files, + }) + + # Check that it at least loaded some data + assert stats['tick_count'] == 20 * 60 + assert stats['job']['jobs_total'] > 0 + assert len(stats['job']['jobs_still_running']) + stats['job']['jobs_completed'] > 0 diff --git a/tests/systems/test_multi_part_sim_basic_run.py b/tests/systems/test_multi_part_sim_basic_run.py new file mode 100644 index 0000000000000000000000000000000000000000..0edcc903b02330a93e37095a358d952cc8922344 --- /dev/null +++ b/tests/systems/test_multi_part_sim_basic_run.py @@ -0,0 +1,28 @@ +import os +import subprocess +import gc +import 
pytest +from tests.util import PROJECT_ROOT + + +pytestmark = [ + pytest.mark.system, + pytest.mark.nodata +] + + +def test_multi_part_sim_basic_run(system, system_config, sim_output): + + if not system_config.get("multi-part-sim", False): + pytest.skip(f"{system} does not support basic multi-part-sim run.") + + os.chdir(PROJECT_ROOT) + result = subprocess.run([ + "python", "main.py", "run-parts", + "--time", "1h", + "-x", f"{system}/*", + "-o", sim_output, + ], capture_output=True, text=True, stdin=subprocess.DEVNULL) + assert result.returncode == 0, f"Failed on {system}: {result.stderr}" + del result + gc.collect() diff --git a/tests/systems/test_multi_part_sim_network_run.py b/tests/systems/test_multi_part_sim_network_run.py new file mode 100644 index 0000000000000000000000000000000000000000..c5560148abb9d74bde8de7b44ddd6a1ca099cd58 --- /dev/null +++ b/tests/systems/test_multi_part_sim_network_run.py @@ -0,0 +1,28 @@ +import os +import subprocess +import pytest +from tests.util import PROJECT_ROOT + + +pytestmark = [ + pytest.mark.system, + pytest.mark.nodata +] + + +def test_multi_part_sim_network_run(system, system_config, sim_output): + if not system_config.get("multi-part-sim", False): + pytest.skip(f"{system} does not support basic multi-part-sim run.") + + if not system_config.get("net", False): + pytest.skip(f"{system} does not support network run.") + + os.chdir(PROJECT_ROOT) + result = subprocess.run([ + "python", "main.py", "run-parts", + "--time", "1h", + "-x", f"{system}/*", + "--net", + "-o", sim_output, + ], capture_output=True, text=True, stdin=subprocess.DEVNULL) + assert result.returncode == 0, f"Failed on {system}: {result.stderr}" diff --git a/tests/systems/test_multi_part_sim_withdata_run.py b/tests/systems/test_multi_part_sim_withdata_run.py new file mode 100644 index 0000000000000000000000000000000000000000..b35e8fe194cf31cc640c1698078b076d719a7211 --- /dev/null +++ b/tests/systems/test_multi_part_sim_withdata_run.py @@ -0,0 +1,26 @@ +import pytest +from tests.util import run_multi_part_engine + + +pytestmark = [ + pytest.mark.system, + pytest.mark.withdata, + pytest.mark.long +] + + +def test_multi_part_sim_withdata_run(system, system_config, system_files, sim_output): + if not system_config.get("multi-part-sim", False): + pytest.skip(f"{system} does not support basic multi-part-sim run even without data.") + if not system_config.get("withdata", False): + pytest.skip(f"{system} does not support multi-part-sim run with data.") + + engine, stats = run_multi_part_engine({ + "start": system_config['start'], + "time": "1h", + "partitions": [system], + "replay": system_files, + }) + + times = [s['engine']['time_simulated'] for s in stats['partitions'].values()] + assert len(set(times)) == 1 # All run the same time diff --git a/tests/systems/test_telemetry_withdata_run.py b/tests/systems/test_telemetry_withdata_run.py new file mode 100644 index 0000000000000000000000000000000000000000..43a218b045b226fba9d1ebfb469b196bcfe8d016 --- /dev/null +++ b/tests/systems/test_telemetry_withdata_run.py @@ -0,0 +1,26 @@ +import os +import subprocess +import pytest +from tests.util import PROJECT_ROOT + + +pytestmark = [ + pytest.mark.system, + pytest.mark.withdata +] + + +def test_telemetry_main_withdata_run(system, system_config, system_files, sim_output): + if not system_config.get("telemetry", False): + pytest.skip(f"{system} does not support telemetry run.") + if not system_config.get("withdata", False): + pytest.skip(f"{system} does not support telemetry run with data.") + + 
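+    # Exercises the `telemetry` dataset-analysis subcommand against the real data files; the test only checks for a clean exit.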
os.chdir(PROJECT_ROOT) + result = subprocess.run([ + "python", "main.py", "telemetry", + "--system", system, + "-f", ','.join(system_files), + "-o", sim_output, + ], capture_output=True, text=True, stdin=subprocess.DEVNULL) + assert result.returncode == 0, f"Failed on {system}: {result.stderr}" diff --git a/tests/systems/test_workload_synthetic.py b/tests/systems/test_workload_synthetic.py new file mode 100644 index 0000000000000000000000000000000000000000..107b01521e6205d1eb60945192fb73c79baf5234 --- /dev/null +++ b/tests/systems/test_workload_synthetic.py @@ -0,0 +1,110 @@ +import subprocess +import gc +import pytest + + +pytestmark = [ + pytest.mark.system, + pytest.mark.workload, +] + + +def flatten(dist): + name, args = dist + return [name, *args] + + +jobdist_case = [ + ("weibull", ["--jobsize-weibull-shape", "0.75", "--jobsize-weibull-scale", "16"]), + ("normal", ["--jobsize-normal-stddev", "100", "--jobsize-normal-mean", "16"]), + ("uniform", []), +] +cpudist_case = [ + ("weibull", ["--cpuutil-weibull-shape", "0.75", "--cpuutil-weibull-scale", "16"]), + ("normal", ["--cpuutil-normal-stddev", "100", "--cpuutil-normal-mean", "16"]), + ("uniform", []), +] +gpudist_case = [ + ("weibull", ["--gpuutil-weibull-shape", "0.75", "--gpuutil-weibull-scale", "16"]), + ("normal", ["--gpuutil-normal-stddev", "100", "--gpuutil-normal-mean", "16"]), + ("uniform", []), +] +wtimedist_case = [ + ("weibull", ["--walltime-weibull-shape", "0.75", "--walltime-weibull-scale", "16"]), + ("normal", ["--walltime-normal-stddev", "100", "--walltime-normal-mean", "16"]), + ("uniform", []), +] +additional_params_cases = [ + "", # nothing + ["--jobsize-is-of-degree", "2"], + ["--jobsize-is-of-degree", "3"], + ["--jobsize-is-power-of", "2"], + ["--jobsize-is-power-of", "3"], +] + + +@pytest.mark.parametrize( + "jobdist", jobdist_case, ids=lambda d: d[0] +) +@pytest.mark.parametrize( + "cpudist", cpudist_case, ids=lambda d: d[0] +) +@pytest.mark.parametrize( + "gpudist", gpudist_case, ids=lambda d: d[0] +) +@pytest.mark.parametrize( + "wtimedist", wtimedist_case, ids=lambda d: d[0] +) +@pytest.mark.parametrize( + "additional_params", additional_params_cases, ids=lambda p: (p or "none") +) +def test_workload_synthetic_run( + system, system_config, jobdist, cpudist, gpudist, wtimedist, additional_params +): + """Run the real synthetic workload generator with every combination + of job, CPU, GPU, wall‑time distributions and optional extra flags. + The test simply verifies that the script exits with status 0. + """ + + if not system_config.get("workload", False): + pytest.skip(f"{system} does not support workload run.") + + # Build the command line. Each distribution tuple expands into: + # dist_name, , , ... + cmd = [ + "python", "main.py", "workload", + "--system", system, + "-w", "synthetic", + "--jobsize-distribution", *flatten(jobdist), + "--cpuutil-distribution", *flatten(cpudist), + "--gpuutil-distribution", *flatten(gpudist), + "--walltime-distribution", *flatten(wtimedist), + ] + + # Add any extra parameters if present. + if additional_params: + # If the flag contains a space we keep it as a single string. 
+ cmd.extend(additional_params) + + cmd1 = ["python", "-c \"exit()\""] + result = subprocess.run(cmd1, capture_output=True, text=True, stdin=subprocess.DEVNULL) + try: + result = subprocess.run( + cmd, + capture_output=True, + text=True, + stdin=subprocess.DEVNULL, + timeout=1 + ) + except subprocess.TimeoutExpired: + result.returncode = 0 + + assert result.returncode == 0, ( + f"Failed on {system} with {jobdist[0]}, {cpudist[0]}, " + f"{gpudist[0]}, {wtimedist[0]}: {result.stderr}" + ) + + # Explicitly delete the result to avoid hitting + # “Too many open file descriptors” on slow CI machines. + del result + gc.collect() diff --git a/tests/test_main.py b/tests/test_main.py new file mode 100644 index 0000000000000000000000000000000000000000..4b09fa0515abb09703afb74c2b0e0acd19c2ddbc --- /dev/null +++ b/tests/test_main.py @@ -0,0 +1,41 @@ + +import subprocess +import os +from pathlib import Path + +import pytest +pytestmark = pytest.mark.nodata + +PROJECT_ROOT = Path(__file__).resolve().parent.parent # adjust if needed + + +@pytest.mark.order(1) +def test_main_withui(): + os.chdir(PROJECT_ROOT) + result = subprocess.run([ + "python", "main.py", "run", + "--time", "1h", + "-o", 'none', + ], capture_output=True, text=True) + assert result.returncode == 0 + + +@pytest.mark.order(2) +def test_main_noui(): + os.chdir(PROJECT_ROOT) + result = subprocess.run([ + "python", "main.py", "run", + "--time", "1h", + "--noui", "-o", 'none', + ], capture_output=True, text=True) + assert result.returncode == 0 + + +@pytest.mark.long +@pytest.mark.order(3) +def test_main_long(): + os.chdir(PROJECT_ROOT) + result = subprocess.run([ + "python", "main.py", "run", "-o", 'none', + ], capture_output=True, text=True) + assert result.returncode == 0 diff --git a/tests/unit/__init__.py b/tests/unit/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tests/unit/test_net_dragonfly.py b/tests/unit/test_net_dragonfly.py new file mode 100644 index 0000000000000000000000000000000000000000..b8711fd055fde30a86a51fc055b77d25ad80e064 --- /dev/null +++ b/tests/unit/test_net_dragonfly.py @@ -0,0 +1,48 @@ +from raps.network.dragonfly import build_dragonfly, dragonfly_node_id_to_host_name + + +def test_build_dragonfly(): + """Test building a small dragonfly network.""" + D = 2 # Routers per group + A = 2 # Global connections per router + P = 2 # Compute nodes per router + G = build_dragonfly(D, A, P) + + # Check number of nodes + num_routers = D * (A + 1) + num_hosts = num_routers * P + total_nodes = num_routers + num_hosts + assert len(G.nodes) == total_nodes + + # Check number of edges + routers_per_group = D + # Edges of the router clique: + router_clique_edges_per_group = ((routers_per_group * (routers_per_group - 1)) // 2) + # Edges for all router compute nodes: + compute_node_edges_per_router = P + # Total intra-group edges: + intra_group_edges = router_clique_edges_per_group + compute_node_edges_per_router * D + + # Inter-group edges + total_groups = A + 1 + inter_group_edges_simple_clique = ((total_groups * (total_groups-1)) // 2) + inter_group_edges = inter_group_edges_simple_clique * D + # Host to router edges + total_edges = intra_group_edges * total_groups + inter_group_edges + assert len(G.edges) == total_edges + + # Check node types + node_types = [data["layer"] for _, data in G.nodes(data=True)] + assert node_types.count("router") == num_routers + assert node_types.count("host") == num_hosts + + +def
test_dragonfly_node_id_to_host_name(): + """Test the dragonfly_node_id_to_host_name function.""" + D, A, P = 2, 2, 2 + # Test a few node IDs + assert dragonfly_node_id_to_host_name(0, D, A, P) == "h_0_0_0" + assert dragonfly_node_id_to_host_name(1, D, A, P) == "h_0_0_1" + assert dragonfly_node_id_to_host_name(2, D, A, P) == "h_0_1_0" + assert dragonfly_node_id_to_host_name(3, D, A, P) == "h_0_1_1" + assert dragonfly_node_id_to_host_name(4, D, A, P) == "h_1_0_0" diff --git a/tests/unit/test_net_fat_tree.py b/tests/unit/test_net_fat_tree.py new file mode 100644 index 0000000000000000000000000000000000000000..93750e0187b5afb4410438f9389f95163fdc5cba --- /dev/null +++ b/tests/unit/test_net_fat_tree.py @@ -0,0 +1,42 @@ +import pytest +from raps.network.fat_tree import build_fattree, node_id_to_host_name + +def test_build_fattree_k4(): + """Test building a k=4 fat-tree.""" + k = 4 + G = build_fattree(k, 16) + + # Check number of nodes + num_hosts = k * (k // 2) * (k // 2) + num_edge_switches = k * (k // 2) + num_agg_switches = k * (k // 2) + num_core_switches = (k // 2) ** 2 + total_nodes = num_hosts + num_edge_switches + num_agg_switches + num_core_switches + assert len(G.nodes) == total_nodes + + # Check number of edges + # Host to edge switch edges + host_edges = num_hosts + # Edge to agg switch edges + edge_agg_edges = k * (k // 2) * (k // 2) + # Agg to core switch edges + agg_core_edges = k * (k // 2) * (k // 2) + total_edges = host_edges + edge_agg_edges + agg_core_edges + assert len(G.edges) == total_edges + + # Check node types + node_types = [data["type"] for _, data in G.nodes(data=True)] + assert node_types.count("host") == num_hosts + assert node_types.count("edge") == num_edge_switches + assert node_types.count("agg") == num_agg_switches + assert node_types.count("core") == num_core_switches + +def test_node_id_to_host_name(): + """Test the node_id_to_host_name function.""" + k = 4 + # Test a few node IDs + assert node_id_to_host_name(0, k) == "h_0_0_0" + assert node_id_to_host_name(1, k) == "h_0_0_1" + assert node_id_to_host_name(2, k) == "h_0_1_0" + assert node_id_to_host_name(3, k) == "h_0_1_1" + assert node_id_to_host_name(4, k) == "h_1_0_0" diff --git a/tests/unit/test_net_torus3d.py b/tests/unit/test_net_torus3d.py new file mode 100644 index 0000000000000000000000000000000000000000..3e38eb420dd090b2109a30788b31a6c49cea5ebb --- /dev/null +++ b/tests/unit/test_net_torus3d.py @@ -0,0 +1,43 @@ +from raps.network.torus3d import build_torus3d, torus_route_xyz + + +def test_build_torus3d(): + """Test building a small 3D torus network.""" + dims = (2, 2, 2) + G, meta = build_torus3d(dims) + + # Check number of nodes + num_routers = dims[0] * dims[1] * dims[2] + hosts_per_router = 1 # Default! 
Assumption + num_hosts = num_routers * hosts_per_router + total_nodes = num_routers + num_hosts + assert len(G.nodes) == total_nodes + + # Check number of edges + # Router to router edges + router_edges = (num_routers * 3) # Each router has 3 neighbors in a 3D torus + # Host to router edges + host_router_edges = num_routers * hosts_per_router + total_edges = router_edges + host_router_edges + assert len(G.edges) == total_edges + + # Check node types + node_types = [data["type"] for _, data in G.nodes(data=True)] + assert node_types.count("router") == num_routers + assert node_types.count("host") == num_hosts + + +def test_torus_route_xyz(): + """Test the torus_route_xyz function.""" + dims = (4, 4, 4) + # Test a simple route + path = torus_route_xyz("r_0_0_0", "r_1_1_1", dims) + assert path == ["r_0_0_0", "r_1_0_0", "r_1_1_0", "r_1_1_1"] + + # Test a route with wrap-around + path = torus_route_xyz("r_3_3_3", "r_0_0_0", dims, wrap=True) + assert path == ["r_3_3_3", "r_0_3_3", "r_0_0_3", "r_0_0_0"] + + # Test a route without wrap-around + path = torus_route_xyz("r_0_0_0", "r_1_1_1", dims, wrap=False) + assert path == ["r_0_0_0", "r_1_0_0", "r_1_1_0", "r_1_1_1"] diff --git a/tests/unit/test_system_config.py b/tests/unit/test_system_config.py new file mode 100644 index 0000000000000000000000000000000000000000..e32b99f4b5aaddd71752f77f4aaf1d6b619a9901 --- /dev/null +++ b/tests/unit/test_system_config.py @@ -0,0 +1,26 @@ +import pytest +from raps.raps_config import raps_config +from raps.system_config import list_systems, get_system_config, get_partition_configs + + +@pytest.mark.parametrize("system_name", list_systems()) +def test_configs(system_name): + # Very basic test that all system configs are valid + config = get_system_config(system_name) + assert config.system_name == system_name + assert config.get_legacy()['system_name'] == system_name + assert config.get_legacy()['system_config'] == config + + +@pytest.mark.parametrize("input,expected_name,expected_configs", [ + (["lumi"], "lumi", ["lumi/lumi-c", "lumi/lumi-g"]), + (["lumi/*"], "lumi", ["lumi/lumi-c", "lumi/lumi-g"]), + (["frontier", "summit"], "frontier+summit", ["frontier", "summit"]), + # test passing arbitrary paths + ([str(raps_config.system_config_dir / "lumi")], "lumi", ["lumi-c", "lumi-g"]), + ([str(raps_config.system_config_dir / "lumi/lumi-*")], "lumi-c+lumi-g", ["lumi-c", "lumi-g"]), +]) +def test_get_partition_configs(input, expected_name, expected_configs): + result = get_partition_configs(input) + assert result.system_name == expected_name + assert result.partition_names == expected_configs diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..5eb7edb4a15f2a75adf87ded18b6bf8049c45878 --- /dev/null +++ b/tests/unit/test_utils.py @@ -0,0 +1,56 @@ +import pytest +from datetime import timedelta +from raps.utils import parse_td, convert_to_time_unit, infer_time_unit, TIME_UNITS, parse_time_unit + + +@pytest.mark.parametrize("input,expected", [ + ("1", timedelta(seconds=1)), + ("1m", timedelta(minutes=1)), + (timedelta(minutes=1), timedelta(minutes=1)), + (2, timedelta(seconds=2)), + ("PT2S", timedelta(seconds=2)), + ("+1 day", timedelta(days=1)), + ("2ds", timedelta(milliseconds=200)), + ("2cs", timedelta(milliseconds=20)), + ("2ms", timedelta(milliseconds=2)), +]) +def test_parse_td(input, expected): + assert parse_td(input) == expected + + +@pytest.mark.parametrize("input,expected", [ + ("s", timedelta(seconds=1)), + ("ms", 
timedelta(milliseconds=1)), + ("ds", timedelta(milliseconds=100)), + ("cs", timedelta(milliseconds=10)), +]) +def test_parse_time_unit(input, expected): + assert parse_time_unit(input) == expected + + +def test_parse_td_error(): + with pytest.raises(ValueError): + parse_td("1x") + + +@pytest.mark.parametrize("input,unit,expected", [ + ("1s", 's', 1), + ("1m", 's', 60), + (0, 'ms', 0), + (timedelta(seconds=6), 'ms', 6000), +]) +def test_convert_to_time_unit(input, unit, expected): + assert convert_to_time_unit(input, unit) == expected + + +@pytest.mark.parametrize("input,expected", [ + ("1s", 's'), + ("1000ms", 'ms'), + (0, 's'), + (timedelta(seconds=6), 's'), + (timedelta(days=6), 's'), + (timedelta(milliseconds=6), 'ms'), + (timedelta(milliseconds=60), 'cs'), +]) +def test_infer_time_unit(input, expected): + assert infer_time_unit(input) == TIME_UNITS[expected] diff --git a/tests/util.py b/tests/util.py new file mode 100644 index 0000000000000000000000000000000000000000..4082d4f80282a03cd5fb45d552850ae61b1d9a14 --- /dev/null +++ b/tests/util.py @@ -0,0 +1,97 @@ +import os +from typing import Any +from pathlib import Path +import shlex +import json +from raps.engine import Engine +from raps.stats import get_stats +from raps.multi_part_engine import MultiPartEngine +from raps.sim_config import SingleSimConfig, MultiPartSimConfig + + +def find_project_root(): + path = Path(__file__).resolve() + while not (path / "pyproject.toml").exists(): + if path.parent == path: + raise RuntimeError("Could not find project root.") + path = path.parent + return path + + +PROJECT_ROOT = find_project_root() +CONFIG_PATH = PROJECT_ROOT / "config" +DATA_PATH = Path(os.getenv("RAPS_DATA_DIR", PROJECT_ROOT / "data")).resolve() + +# Maybe useful, but for now all systems are listed explicitly! +system_list = [ + entry for entry in os.listdir(CONFIG_PATH) + if os.path.isfile(os.path.join(CONFIG_PATH, entry, 'system.json')) +] + + +def requires_all_markers(request, required_markers): + markexpr = getattr(request.config.option, "markexpr", "") + selected = set(part.strip() for part in markexpr.split("and")) + return required_markers.issubset(selected) + + +def _get_cmd(config, sub_cmd): + return f"echo {shlex.quote(json.dumps(config))} | python main.py {sub_cmd} - -o none" + + +def run_engine(sim_config, include_ticks=False) -> tuple[Engine, dict[str, Any]]: + """ + Run a simulation to completion. Returns the completed Engine and a dict containing the engine + stats. If include_ticks is True, the dict will also include a list of all the TickDatas (this + can be very large, especially if cooling is enabled!) + """ + # Log command to rerun the test manually for debugging convenience + print(f"Command to reproduce run:\n {_get_cmd(sim_config, "run")}") + + sim_config = SingleSimConfig.model_validate(sim_config) + engine = Engine(sim_config) + gen = engine.run_simulation() + + stats = { + "tick_count": 0, + "tick_datas": [] if include_ticks else None, + } + + for tick in gen: + stats['tick_count'] += 1 + if include_ticks: + stats['tick_datas'].append(tick) + + stats.update(get_stats(engine)) + + return engine, stats + + +def run_multi_part_engine(sim_config, include_ticks=False) -> tuple[MultiPartEngine, dict[str, dict[str, Any]]]: + """ + Run a multi-part simulation to completion. Returns the completed MultiPartEngine and a dict containing the engine + stats for each partition. If include_ticks is True, the dicts will also include a list of all the + TickDatas (this can be very large, especially if cooling is enabled!)
+ """ + # Log command to rerun the test manually for debugging convenience + print(f"Command to reproduce run:\n {_get_cmd(sim_config, "run-parts")}") + + sim_config = MultiPartSimConfig.model_validate(sim_config) + multi_engine = MultiPartEngine(sim_config) + gen = multi_engine.run_simulation() + + stats = { + "tick_count": 0, + "tick_datas": [] if include_ticks else None, + "partitions": {}, + } + + for tick in gen: + stats['tick_count'] += 1 + if include_ticks: + stats['tick_datas'].append(tick) + + for partition, engine in multi_engine.engines.items(): + stats['partitions'][partition] = get_stats(engine) + + return multi_engine, stats diff --git a/third_party/ScheduleFlow b/third_party/ScheduleFlow new file mode 160000 index 0000000000000000000000000000000000000000..3fdfd3675e68f0c2a0e68c1d7ce7205940d28216 --- /dev/null +++ b/third_party/ScheduleFlow @@ -0,0 +1 @@ +Subproject commit 3fdfd3675e68f0c2a0e68c1d7ce7205940d28216