From 136c252e0a0999ee9d85019bcad68042a3449252 Mon Sep 17 00:00:00 2001
From: Jesse Hines
Date: Mon, 15 Sep 2025 15:20:59 -0400
Subject: [PATCH 1/3] More consistent usage of entrypoint script in docs

---
 README.md                         |  7 ++++---
 experiments/mit-replay-24hrs.yaml |  2 +-
 experiments/mit-synthetic.yaml    |  2 +-
 raps/dataloaders/adastraMI250.py  |  6 +++---
 raps/dataloaders/bluewaters.py    |  2 +-
 raps/dataloaders/frontier.py      |  2 +-
 raps/dataloaders/fugaku.py        |  6 +++---
 raps/dataloaders/lassen.py        | 10 +++++-----
 raps/dataloaders/marconi100.py    |  8 ++++----
 raps/schedulers/fastsim.py        |  2 +-
 scripts/marconi100-day51.sh       |  8 ++++----
 tests/util.py                     |  2 +-
 12 files changed, 29 insertions(+), 28 deletions(-)

diff --git a/README.md b/README.md
index 90ab831..9c708af 100644
--- a/README.md
+++ b/README.md
@@ -75,7 +75,7 @@ For MIT Supercloud
     raps run-parts -x mit_supercloud -w multitenant
 
     # Reinforcement learning test case
-    python main.py train-rl --system mit_supercloud/part-cpu -f /opt/data/mit_supercloud/202201
+    raps train-rl --system mit_supercloud/part-cpu -f /opt/data/mit_supercloud/202201
 
 For Lumi
@@ -135,11 +135,12 @@ This will dump a .npz file with a randomized name, e.g. ac23db.npz. Let's rename
 
 There are three ways to modify replaying of telemetry data:
 
 1. `--arrival`. Changing the arrival time distribution - replay cases will default to `--arrival prescribed`, where the jobs will be submitted exactly as they were submitted on the physical machine. This can be changed to `--arrival poisson` to change when the jobs arrive, which is especially useful in cases where there may be gaps in time, e.g., when the system goes down for several days, or the system is underutilized.
-python main.py -f $DPATH/slurm/joblive/$DATEDIR,$DPATH/jobprofile/$DATEDIR --arrival poisson
+
+    raps run -f $DPATH/slurm/joblive/$DATEDIR,$DPATH/jobprofile/$DATEDIR --arrival poisson
 
 2. `--policy`. Changing the way the jobs are scheduled. The `--policy` flag will be set by default to `replay` in cases where a telemetry file is provided, in which case the jobs will be scheduled according to the start times provided. Changing the `--policy` to `fcfs` or `backfill` will use the internal scheduler, e.g.:
 
-    python main.py -f $DPATH/slurm/joblive/$DATEDIR,$DPATH/jobprofile/$DATEDIR --policy fcfs --backfill firstfit -t 12h
+    raps run -f $DPATH/slurm/joblive/$DATEDIR,$DPATH/jobprofile/$DATEDIR --policy fcfs --backfill firstfit -t 12h
 
 3. `--scale`. Changing the scale of each job in the telemetry data. The `--scale` flag specifies the maximum number of nodes for each job (generally set this to the max number of nodes of the smallest partition), and randomly selects the number of nodes for each job from one to max nodes. This flag is useful when replaying telemetry from a larger system onto a smaller system.
diff --git a/experiments/mit-replay-24hrs.yaml b/experiments/mit-replay-24hrs.yaml
index 1357886..6990069 100644
--- a/experiments/mit-replay-24hrs.yaml
+++ b/experiments/mit-replay-24hrs.yaml
@@ -1,4 +1,4 @@
-# python main.py run-multi-part experiments/mit-replay-24hrs.yaml
+# raps run-multi-part experiments/mit-replay-24hrs.yaml
 partitions: ["mit_supercloud/part-cpu", "mit_supercloud/part-gpu"]
 replay:
 - /opt/data/mit_supercloud/202201
diff --git a/experiments/mit-synthetic.yaml b/experiments/mit-synthetic.yaml
index 5f68cd1..6a24946 100644
--- a/experiments/mit-synthetic.yaml
+++ b/experiments/mit-synthetic.yaml
@@ -1,3 +1,3 @@
-# python main.py run-multi-part experiments/mit-synthetic.yaml
+# raps run-multi-part experiments/mit-synthetic.yaml
 partitions: ["mit_supercloud/part-cpu", "mit_supercloud/part-gpu"]
 workload: multitenant
diff --git a/raps/dataloaders/adastraMI250.py b/raps/dataloaders/adastraMI250.py
index 8cadbfb..ed60807 100644
--- a/raps/dataloaders/adastraMI250.py
+++ b/raps/dataloaders/adastraMI250.py
@@ -6,13 +6,13 @@
 
     # to simulate the dataset
-    python main.py -f /path/to/AdastaJobsMI250_15days.parquet --system adastraMI250
+    raps run -f /path/to/AdastaJobsMI250_15days.parquet --system adastraMI250
 
     # to replay with a different scheduling policy
-    python main.py -f /path/to/AdastaJobsMI250_15days.parquet --system adastraMI250 --policy priority --backfill easy
+    raps run -f /path/to/AdastaJobsMI250_15days.parquet --system adastraMI250 --policy priority --backfill easy
 
     # to run a specific time range
-    python main.py -f /path/to/AdastaJobsMI250_15days.parquet --system adastraMI250 \
+    raps run -f /path/to/AdastaJobsMI250_15days.parquet --system adastraMI250 \
         --start 2024-11-01T00:00:00Z --end 2024-11-02T00:00:00Z
 
     # to analyze dataset
diff --git a/raps/dataloaders/bluewaters.py b/raps/dataloaders/bluewaters.py
index 728e2bb..7b1ee1f 100644
--- a/raps/dataloaders/bluewaters.py
+++ b/raps/dataloaders/bluewaters.py
@@ -3,7 +3,7 @@
 Blue Waters dataloader
 
 Example test case:
-    python main.py -f /opt/data/bluewaters --start 20170328 --system bluewaters -net
+    raps run -f /opt/data/bluewaters --start 20170328 --system bluewaters -net
 
 To download the necessary datasets:
diff --git a/raps/dataloaders/frontier.py b/raps/dataloaders/frontier.py
index a6ac45e..23efd2f 100644
--- a/raps/dataloaders/frontier.py
+++ b/raps/dataloaders/frontier.py
@@ -4,7 +4,7 @@
     # To simulate
     DATEDIR="date=2024-01-18"
     DPATH=/path/to/data
-    python main.py -f $DPATH/slurm/joblive/$DATEDIR,$DPATH/jobprofile/$DATEDIR
+    raps run -f $DPATH/slurm/joblive/$DATEDIR,$DPATH/jobprofile/$DATEDIR
 
     # To analyze the data
     python -m raps.telemetry -f $DPATH/slurm/joblive/$DATEDIR,$DPATH/jobprofile/$DATEDIR
diff --git a/raps/dataloaders/fugaku.py b/raps/dataloaders/fugaku.py
index 4ccc885..5a531fa 100644
--- a/raps/dataloaders/fugaku.py
+++ b/raps/dataloaders/fugaku.py
@@ -10,9 +10,9 @@
 The '--arrival poisson' flag will compute submit times from a Poisson distribution,
 instead of using the submit times given in F-Data.
 
-    python main.py --system fugaku -f /path/to/21_04.parquet
-    python main.py --system fugaku -f /path/to/21_04.parquet --validate
-    python main.py --system fugaku -f /path/to/21_04.parquet --policy priority --backfill easy
+    raps run --system fugaku -f /path/to/21_04.parquet
+    raps run --system fugaku -f /path/to/21_04.parquet --validate
+    raps run --system fugaku -f /path/to/21_04.parquet --policy priority --backfill easy
 """
 import pandas as pd
 from tqdm import tqdm
diff --git a/raps/dataloaders/lassen.py b/raps/dataloaders/lassen.py
index 8bded75..db86513 100644
--- a/raps/dataloaders/lassen.py
+++ b/raps/dataloaders/lassen.py
@@ -14,19 +14,19 @@ Usage Instructions:
     git lfs pull
 
     # to analyze dataset and plot histograms
-    python -m raps.telemetry -f /path/to/LAST/Lassen-Supercomputer-Job-Dataset --system lassen --plot
+    raps telemetry -f /path/to/LAST/Lassen-Supercomputer-Job-Dataset --system lassen --plot
 
     # to simulate the dataset as submitted
-    python main.py -f /path/to/LAST/Lassen-Supercomputer-Job-Dataset --system lassen
+    raps run -f /path/to/LAST/Lassen-Supercomputer-Job-Dataset --system lassen
 
     # to modify the submit times of the telemetry according to a Poisson distribution
-    python main.py -f /path/to/LAST/Lassen-Supercomputer-Job-Dataset --system lassen --arrival poisson
+    raps run -f /path/to/LAST/Lassen-Supercomputer-Job-Dataset --system lassen --arrival poisson
 
     # to fast-forward 365 days and replay for 1 day. This one-day region has 2250 jobs, of which 1650 are executed.
-    python main.py -f /path/to/LAST/Lassen-Supercomputer-Job-Dataset --system lassen --start '2019-08-22T00:00:00+00:00' -t 1d
+    raps run -f /path/to/LAST/Lassen-Supercomputer-Job-Dataset --system lassen --start '2019-08-22T00:00:00+00:00' -t 1d
 
     # For the network replay this command gives suitable snapshots:
-    python main.py -f /path/to/LAST/Lassen-Supercomputer-Job-Dataset --system lassen --policy fcfs --backfill firstfit -t 12h --arrival poisson  # noqa
+    raps run -f /path/to/LAST/Lassen-Supercomputer-Job-Dataset --system lassen --policy fcfs --backfill firstfit -t 12h --arrival poisson  # noqa
 """
 
 import math
diff --git a/raps/dataloaders/marconi100.py b/raps/dataloaders/marconi100.py
index 6ff310b..4b3c5c6 100644
--- a/raps/dataloaders/marconi100.py
+++ b/raps/dataloaders/marconi100.py
@@ -9,14 +9,14 @@
 Download `job_table.parquet` from https://zenodo.org/records/10127767
 
     # to simulate the dataset
-    python main.py -f /path/to/job_table.parquet --system marconi100
+    raps run -f /path/to/job_table.parquet --system marconi100
 
     # to replay using different schedulers
-    python main.py -f /path/to/job_table.parquet --system marconi100 --policy fcfs --backfill easy
-    python main.py -f /path/to/job_table.parquet --system marconi100 --policy priority --backfill firstfit
+    raps run -f /path/to/job_table.parquet --system marconi100 --policy fcfs --backfill easy
+    raps run -f /path/to/job_table.parquet --system marconi100 --policy priority --backfill firstfit
 
     # to fast-forward 60 days and replay for 1 day
-    python main.py -f /path/to/job_table.parquet --system marconi100 --start 2020-07-05T00:00:00+00:00 -t 1d
+    raps run -f /path/to/job_table.parquet --system marconi100 --start 2020-07-05T00:00:00+00:00 -t 1d
 
     # to analyze dataset
     python -m raps.telemetry -f /path/to/job_table.parquet --system marconi100 -v
diff --git a/raps/schedulers/fastsim.py b/raps/schedulers/fastsim.py
index 855dcbd..e930a1c 100644
--- a/raps/schedulers/fastsim.py
+++ b/raps/schedulers/fastsim.py
@@ -10,7 +10,7 @@ from raps.sim_config import args
 from raps.system_config import get_system_config
 
 # Run with this command:
-# python main.py --system kestrel -f ../data/fastsim_jobs_output.parquet --scheduler fastsim --policy priority --start 2024-09-01T00:00 --end 2024-09-15T00:00
+# raps run --system kestrel -f ../data/fastsim_jobs_output.parquet --scheduler fastsim --policy priority --start 2024-09-01T00:00 --end 2024-09-15T00:00
 
 class Scheduler():
     """
diff --git a/scripts/marconi100-day51.sh b/scripts/marconi100-day51.sh
index 01da9a2..77cbe45 100644
--- a/scripts/marconi100-day51.sh
+++ b/scripts/marconi100-day51.sh
@@ -1,4 +1,4 @@
-python main.py -f ~/data/marconi100/job_table.parquet --system marconi100 --ff 4381000 -t 61000 -o --policy replay
-python main.py -f ~/data/marconi100/job_table.parquet --system marconi100 --ff 4381000 -t 61000 -o --policy fcfs
-python main.py -f ~/data/marconi100/job_table.parquet --system marconi100 --ff 4381000 -t 61000 -o --policy fcfs --backfill easy
-python main.py -f ~/data/marconi100/job_table.parquet --system marconi100 --ff 4381000 -t 61000 -o --policy priority --backfill firstfit
+./main.py run -f ~/data/marconi100/job_table.parquet --system marconi100 --ff 4381000 -t 61000 -o --policy replay
+./main.py run -f ~/data/marconi100/job_table.parquet --system marconi100 --ff 4381000 -t 61000 -o --policy fcfs
+./main.py run -f ~/data/marconi100/job_table.parquet --system marconi100 --ff 4381000 -t 61000 -o --policy fcfs --backfill easy
+./main.py run -f ~/data/marconi100/job_table.parquet --system marconi100 --ff 4381000 -t 61000 -o --policy priority --backfill firstfit
diff --git a/tests/util.py b/tests/util.py
index 4bbf8f8..46736b3 100644
--- a/tests/util.py
+++ b/tests/util.py
@@ -13,7 +13,7 @@ from raps.stats import (
 
 def find_project_root():
     path = Path(__file__).resolve()
-    while not (path / "main.py").exists():
+    while not (path / "pyproject.toml").exists():
         if path.parent == path:
             raise RuntimeError("Could not find project root.")
         path = path.parent
--
GitLab


From 66797338bfd51f50604f4710830241c05a2388fc Mon Sep 17 00:00:00 2001
From: Jesse Hines
Date: Mon, 15 Sep 2025 15:31:02 -0400
Subject: [PATCH 2/3] Add dependencies

---
 pyproject.toml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pyproject.toml b/pyproject.toml
index c009d2a..732315e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -32,6 +32,8 @@ dependencies = [
     "pydantic-settings>=2.10.1",
     "stable-baselines3==2.7.0",
     "gym==0.26.2",
+    "dill==0.4.0",
+    "argcomplete==3.6.2",
     "pre-commit"
 ]
--
GitLab


From 1090bf977b7212d409f671c8eddbbf1b04fa6c9e Mon Sep 17 00:00:00 2001
From: Jesse Hines
Date: Mon, 15 Sep 2025 17:35:33 -0400
Subject: [PATCH 3/3] Add shell completion

---
 .gitignore |  1 +
 main.py    | 74 ++++++++++++++++++++++++++++++++++++++++++++++++------
 2 files changed, 68 insertions(+), 7 deletions(-)

diff --git a/.gitignore b/.gitignore
index bf49923..5f7f2b5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,4 @@ venv
 *.prof
 simulation_results/
 models/*.fmu
+.shell-completion-cache
diff --git a/main.py b/main.py
index b2eae93..ab464b0 100755
--- a/main.py
+++ b/main.py
@@ -1,18 +1,77 @@
 #!/usr/bin/env python3
+# PYTHON_ARGCOMPLETE_OK
 """
 ExaDigiT Resource Allocator & Power Simulator (RAPS)
 """
 import argparse
-from raps.helpers import check_python_version
-from raps.run_sim import run_sim_add_parser, run_parts_sim_add_parser, show_add_parser
-from raps.workloads import run_workload_add_parser
-from raps.telemetry import run_telemetry_add_parser
-from raps.train_rl import train_rl_add_parser
+from pathlib import Path
+import os
+import textwrap
+import copy
+import gzip
+import dill
+import argcomplete
 
-check_python_version()
+# Implement shell completion using argcomplete.
+# Importing all of raps' dependencies like pandas etc. can be rather slow, often taking 1-2 seconds. So for snappy shell
+# completion we need to avoid imports on the shell completion path. We could do this by shuffling the code around to
+# create the parser without importing any heavy-weight libraries. But that would be a pain to maintain, and hard to
+# ensure that pandas or scipy aren't accidentally imported transitively. Pandas can also be convenient to use in
+# validating SimConfig etc., which is needed to build the argparser. So instead, we cache the generated argparser
+# object so that shell completion can run without importing the rest of raps.
+PARSER_CACHE = Path(__file__).parent / '.shell-completion-cache'
+
+
+def shell_completion_add_parser(subparsers):
+    parser = subparsers.add_parser("shell-completion", description=textwrap.dedent("""
+        Register shell completion for RAPS.
+    """).strip(), formatter_class=argparse.RawDescriptionHelpFormatter)
+
+    # Run the command from argcomplete; this edits ~/.bash_completion to register argcomplete
+    def impl(args):
+        os.system("activate-global-python-argcomplete")
+
+    parser.set_defaults(impl=impl)
+
+
+def shell_complete():
+    try:
+        parser = dill.loads(gzip.decompress(PARSER_CACHE.read_bytes()))
+    except Exception:
+        PARSER_CACHE.unlink(missing_ok=True)  # delete the cache if it is corrupted somehow
+        parser = argparse.ArgumentParser()
+        # Use a dummy parser so that autocomplete still handles sys.exit during tab completion if there's
+        # no cache. The cache will be created on the first run of `main.py`.
+
+    argcomplete.autocomplete(parser, always_complete_options=False)
+
+
+def cache_parser(parser: argparse.ArgumentParser):
+    parser = copy.deepcopy(parser)
+    subparsers = next(a for a in parser._actions if isinstance(a, argparse._SubParsersAction))
+    # Don't need to pickle the impl functions
+    for subparser in subparsers.choices.values():
+        subparser.set_defaults(impl=lambda args: None)
+
+    pickled = gzip.compress(dill.dumps(parser), compresslevel=4, mtime=0)
+    if not PARSER_CACHE.exists() or PARSER_CACHE.read_bytes() != pickled:
+        try:  # Ignore if there's some kind of write or permission error
+            PARSER_CACHE.write_bytes(pickled)
+        except Exception:
+            pass
 
 
 def main(cli_args: list[str] | None = None):
+    shell_complete()  # will output shell completion and sys.exit during tab complete
+
+    from raps.helpers import check_python_version
+    check_python_version()
+
+    from raps.run_sim import run_sim_add_parser, run_parts_sim_add_parser, show_add_parser
+    from raps.workloads import run_workload_add_parser
+    from raps.telemetry import run_telemetry_add_parser
+    from raps.train_rl import train_rl_add_parser
+
     parser = argparse.ArgumentParser(
         description="""
         ExaDigiT Resource Allocator & Power Simulator (RAPS)
@@ -27,8 +86,9 @@ def main(cli_args: list[str] | None = None):
     run_workload_add_parser(subparsers)
     run_telemetry_add_parser(subparsers)
     train_rl_add_parser(subparsers)
+    shell_completion_add_parser(subparsers)
 
-    # TODO: move other misc scripts into here
+    cache_parser(parser)
 
     args = parser.parse_args(cli_args)
     assert args.impl, "subparsers should add an impl function to args"
--
GitLab
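
Note: the parser-caching scheme in PATCH 3/3 can be exercised in isolation. Below is a minimal, self-contained sketch of the same idea — dill-pickle the built parser to a cache so that tab completion can answer without importing the heavy dependencies — assuming the dill==0.4.0 and argcomplete==3.6.2 pins from PATCH 2/3. The script name (demo), the 'run' subcommand, and its '--policy' choices are hypothetical illustrations, not part of RAPS:

    #!/usr/bin/env python3
    # PYTHON_ARGCOMPLETE_OK
    # Minimal sketch of the parser-cache idea from PATCH 3/3 (names are illustrative).
    import argparse
    import gzip
    from pathlib import Path

    import argcomplete
    import dill

    CACHE = Path(__file__).parent / '.shell-completion-cache'


    def build_parser() -> argparse.ArgumentParser:
        # In RAPS this is the step that pulls in the slow imports (pandas etc.),
        # which is exactly what the cache lets the completion path skip.
        parser = argparse.ArgumentParser(prog='demo')
        sub = parser.add_subparsers(dest='command')
        run = sub.add_parser('run')
        run.add_argument('--policy', choices=['replay', 'fcfs', 'priority'])
        return parser


    def main():
        # Fast path: in completion mode, answer from the cached parser and exit.
        try:
            parser = dill.loads(gzip.decompress(CACHE.read_bytes()))
        except Exception:
            parser = argparse.ArgumentParser()  # dummy until the first real run fills the cache
        argcomplete.autocomplete(parser, always_complete_options=False)

        # Normal path: build the real parser and refresh the cache for next time.
        parser = build_parser()
        CACHE.write_bytes(gzip.compress(dill.dumps(parser), mtime=0))
        args = parser.parse_args()
        print(args)


    if __name__ == '__main__':
        main()

As in the patch, argcomplete.autocomplete() is a no-op during a normal invocation and only prints completions (then exits) when the shell calls the script in completion mode, which requires argcomplete's hook to be registered first, e.g. via activate-global-python-argcomplete.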