diff --git a/.gitignore b/.gitignore index bf4992384792ca911820fbb085a4f85ba0969e84..5f7f2b5f827601e2c9e09c22b38c1210bd9b201c 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,4 @@ venv *.prof simulation_results/ models/*.fmu +.shell-completion-cache diff --git a/README.md b/README.md index 90ab831fbe21fe88ad180154a00ac2790780649c..9c708afd5e5f92aa574d3a6ca998781ff470df6e 100644 --- a/README.md +++ b/README.md @@ -75,7 +75,7 @@ For MIT Supercloud raps run-parts -x mit_supercloud -w multitenant # Reinforcement learning test case - python main.py train-rl --system mit_supercloud/part-cpu -f /opt/data/mit_supercloud/202201 + raps train-rl --system mit_supercloud/part-cpu -f /opt/data/mit_supercloud/202201 For Lumi @@ -135,11 +135,12 @@ This will dump a .npz file with a randomized name, e.g. ac23db.npz. Let's rename There are three ways to modify replaying of telemetry data: 1. `--arrival`. Changing the arrival time distribution - replay cases will default to `--arrival prescribed`, where the jobs will be submitted exactly as they were submitted on the physical machine. This can be changed to `--arrival poisson` to change when the jobs arrive, which is especially useful in cases where there may be gaps in time, e.g., when the system goes down for several days, or the system is is underutilized. -python main.py -f $DPATH/slurm/joblive/$DATEDIR,$DPATH/jobprofile/$DATEDIR --arrival poisson + + raps run -f $DPATH/slurm/joblive/$DATEDIR,$DPATH/jobprofile/$DATEDIR --arrival poisson 2. `--policy`. Changing the way the jobs are scheduled. The `--policy` flag will be set by default to `replay` in cases where a telemetry file is provided, in which case the jobs will be scheduled according to the start times provided. 
Changing the `--policy` to `fcfs` or `backfill` will use the internal scheduler, e.g.: - python main.py -f $DPATH/slurm/joblive/$DATEDIR,$DPATH/jobprofile/$DATEDIR --policy fcfs --backfill firstfit -t 12h + raps run -f $DPATH/slurm/joblive/$DATEDIR,$DPATH/jobprofile/$DATEDIR --policy fcfs --backfill firstfit -t 12h 3. `--scale`. Changing the scale of each job in the telemetry data. The `--scale` flag will specify the maximum number of nodes for each job (generally set this to the max number of nodes of the smallest partition), and randomly select the number of nodes for each job from one to max nodes. This flag is useful when replaying telemetry from a larger system onto a smaller system. diff --git a/experiments/mit-replay-24hrs.yaml b/experiments/mit-replay-24hrs.yaml index 1357886f9d02cdb61e915e1f56b5711a4bf0fde0..69900699af0d6b63d68720a1e4ad59ad9dcf4579 100644 --- a/experiments/mit-replay-24hrs.yaml +++ b/experiments/mit-replay-24hrs.yaml @@ -1,4 +1,4 @@ -# python main.py run-multi-part experiments/mit-replay-24hrs.yaml +# raps run-multi-part experiments/mit-replay-24hrs.yaml partitions: ["mit_supercloud/part-cpu", "mit_supercloud/part-gpu"] replay: - /opt/data/mit_supercloud/202201 diff --git a/experiments/mit-synthetic.yaml b/experiments/mit-synthetic.yaml index 5f68cd18030cc1a4ba1b3918b880083f9c275f94..6a24946f41f3b21ada3546a074c46fabaf10932c 100644 --- a/experiments/mit-synthetic.yaml +++ b/experiments/mit-synthetic.yaml @@ -1,3 +1,3 @@ -# python main.py run-multi-part experiments/mit-synthetic.yaml +# raps run-multi-part experiments/mit-synthetic.yaml partitions: ["mit_supercloud/part-cpu", "mit_supercloud/part-gpu"] workload: multitenant diff --git a/main.py b/main.py index b2eae9392b068eab22da2798631750808f7f0a32..ab464b058f3b616c9bfe37a553f350883148eeda 100755 --- a/main.py +++ b/main.py @@ -1,18 +1,77 @@ #!/usr/bin/env python3 +# PYTHON_ARGCOMPLETE_OK """ ExaDigiT Resource Allocator & Power Simulator (RAPS) """ import argparse -from raps.helpers import 
check_python_version -from raps.run_sim import run_sim_add_parser, run_parts_sim_add_parser, show_add_parser -from raps.workloads import run_workload_add_parser -from raps.telemetry import run_telemetry_add_parser -from raps.train_rl import train_rl_add_parser +from pathlib import Path +import os +import textwrap +import copy +import gzip +import dill +import argcomplete -check_python_version() +# Implement shell completion using argcomplete +# Importing all of raps' dependencies like pandas etc can be rather slow, often taking 1-2 seconds. So for snappy shell +# completion we need to avoid imports on the shell completion path. We could do this by shuffling the code around to +# create the parser without importing any heavy-weight libraries. But that would be a pain to maintain and track that +# pandas or scipy aren't accidentally imported transitively. Pandas can also be convenient to use in validating SimConfig +# etc, which is needed to build the argparser. So instead, we cache the generated argparser object so that shell +# completion can run without importing the rest of raps. +PARSER_CACHE = Path(__file__).parent / '.shell-completion-cache' + + +def shell_completion_add_parser(subparsers): + parser = subparsers.add_parser("shell-completion", description=textwrap.dedent(""" + Register shell completion for RAPS. + """).strip(), formatter_class=argparse.RawDescriptionHelpFormatter) + + # Run the command from argcomplete, this edits ~/.bash_completion to register argcomplete + def impl(args): + os.system("activate-global-python-argcomplete") + + parser.set_defaults(impl=impl) + + +def shell_complete(): + try: + parser = dill.loads(gzip.decompress(PARSER_CACHE.read_bytes())) + except Exception: + PARSER_CACHE.unlink(missing_ok=True) # delete cache if corrupted somehow + parser = argparse.ArgumentParser() + # Use a dummy parser so that autocomplete still handles sys.exit tab complete if there's no + # cache. 
Cache will be created on first run of `main.py` + + argcomplete.autocomplete(parser, always_complete_options=False) + + +def cache_parser(parser: argparse.ArgumentParser): + parser = copy.deepcopy(parser) + subparsers = next(a for a in parser._actions if isinstance(a, argparse._SubParsersAction)) + # Don't need to pickle the impl functions + for subparser in subparsers.choices.values(): + subparser.set_defaults(impl=lambda args: None) + + pickled = gzip.compress(dill.dumps(parser), compresslevel=4, mtime=0) + if not PARSER_CACHE.exists() or PARSER_CACHE.read_bytes() != pickled: + try: # Ignore if there's some kind of write or permission error + PARSER_CACHE.write_bytes(pickled) + except Exception: + pass def main(cli_args: list[str] | None = None): + shell_complete() # will output shell completion and sys.exit during tab complete + + from raps.helpers import check_python_version + check_python_version() + + from raps.run_sim import run_sim_add_parser, run_parts_sim_add_parser, show_add_parser + from raps.workloads import run_workload_add_parser + from raps.telemetry import run_telemetry_add_parser + from raps.train_rl import train_rl_add_parser + parser = argparse.ArgumentParser( description=""" ExaDigiT Resource Allocator & Power Simulator (RAPS) @@ -27,8 +86,9 @@ def main(cli_args: list[str] | None = None): run_workload_add_parser(subparsers) run_telemetry_add_parser(subparsers) train_rl_add_parser(subparsers) + shell_completion_add_parser(subparsers) - # TODO: move other misc scripts into here + cache_parser(parser) args = parser.parse_args(cli_args) assert args.impl, "subparsers should add an impl function to args" diff --git a/pyproject.toml b/pyproject.toml index c009d2aa23d3d1cdb5c45119bdcf1ada91eb6416..732315e5532577baf0fff52275ccd9b74c0f3230 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,6 +32,8 @@ dependencies = [ "pydantic-settings>=2.10.1", "stable-baselines3==2.7.0", "gym==0.26.2", + "dill==0.4.0", + "argcomplete==3.6.2", "pre-commit" ] diff 
--git a/raps/dataloaders/adastraMI250.py b/raps/dataloaders/adastraMI250.py index 8cadbfbf2bcbeef8a32f519f8aede6390c22ed7b..ed60807d0818c5209eaa7998109b0712e47dd661 100644 --- a/raps/dataloaders/adastraMI250.py +++ b/raps/dataloaders/adastraMI250.py @@ -6,13 +6,13 @@ # to simulate the dataset - python main.py -f /path/to/AdastaJobsMI250_15days.parquet --system adastraMI250 + raps run -f /path/to/AdastaJobsMI250_15days.parquet --system adastraMI250 # to replay with different scheduling policy - python main.py -f /path/to/AdastaJobsMI250_15days.parquet --system adastraMI250 --policy priority --backfill easy + raps run -f /path/to/AdastaJobsMI250_15days.parquet --system adastraMI250 --policy priority --backfill easy # to run a specific time range - python main.py -f /path/to/AdastaJobsMI250_15days.parquet --system adastraMI250 \ + raps run -f /path/to/AdastaJobsMI250_15days.parquet --system adastraMI250 \ --start 2024-11-01T00:00:00Z --end 2024-11-02T00:00:00Z # to analyze dataset diff --git a/raps/dataloaders/bluewaters.py b/raps/dataloaders/bluewaters.py index 728e2bb236bcb4bae520cab1b060eb4156bd309c..7b1ee1f943611674a1e67b1791c3c0d3bc2f40aa 100644 --- a/raps/dataloaders/bluewaters.py +++ b/raps/dataloaders/bluewaters.py @@ -3,7 +3,7 @@ Blue Waters dataloader Example test case: - python main.py -f /opt/data/bluewaters --start 20170328 --system bluewaters -net + raps run -f /opt/data/bluewaters --start 20170328 --system bluewaters -net To download the necessary datasets: diff --git a/raps/dataloaders/frontier.py b/raps/dataloaders/frontier.py index a6ac45e70e94fbfc08ffb54496a74a8955f80552..23efd2fbcfb30476e050a19c8ebe4094572e6305 100644 --- a/raps/dataloaders/frontier.py +++ b/raps/dataloaders/frontier.py @@ -4,7 +4,7 @@ # To simulate DATEDIR="date=2024-01-18" DPATH=/path/to/data - python main.py -f $DPATH/slurm/joblive/$DATEDIR,$DPATH/jobprofile/$DATEDIR + raps run -f $DPATH/slurm/joblive/$DATEDIR,$DPATH/jobprofile/$DATEDIR # To analyze the data python -m 
raps.telemetry -f $DPATH/slurm/joblive/$DATEDIR,$DPATH/jobprofile/$DATEDIR diff --git a/raps/dataloaders/fugaku.py b/raps/dataloaders/fugaku.py index 4ccc8859b9b345b4d2e1d912f25ccd6796350504..5a531fa19742e97c8209d89a86c94b2e26fa105d 100644 --- a/raps/dataloaders/fugaku.py +++ b/raps/dataloaders/fugaku.py @@ -10,9 +10,9 @@ The '--arrival poisson' will compute submit times from Poisson distribution, instead of using the submit times given in F-Data. - python main.py --system fugaku -f /path/to/21_04.parquet - python main.py --system fugaku -f /path/to/21_04.parquet --validate - python main.py --system fugaku -f /path/to/21_04.parquet --policy priority --backfill easy + raps run --system fugaku -f /path/to/21_04.parquet + raps run --system fugaku -f /path/to/21_04.parquet --validate + raps run --system fugaku -f /path/to/21_04.parquet --policy priority --backfill easy """ import pandas as pd from tqdm import tqdm diff --git a/raps/dataloaders/lassen.py b/raps/dataloaders/lassen.py index 8bded75f6ebc9e2d17a6136a8b38c266e1f0cbb2..db86513051cf22d86e6f9940bd0305ac6763b808 100644 --- a/raps/dataloaders/lassen.py +++ b/raps/dataloaders/lassen.py @@ -14,19 +14,19 @@ Usage Instructions: git lfs pull # to analyze dataset and plot histograms - python -m raps.telemetry -f /path/to/LAST/Lassen-Supercomputer-Job-Dataset --system lassen --plot + raps telemetry -f /path/to/LAST/Lassen-Supercomputer-Job-Dataset --system lassen --plot # to simulate the dataset as submitted - python main.py -f /path/to/LAST/Lassen-Supercomputer-Job-Dataset --system lassen + raps run -f /path/to/LAST/Lassen-Supercomputer-Job-Dataset --system lassen # to modify the submit times of the telemetry according to Poisson distribution - python main.py -f /path/to/LAST/Lassen-Supercomputer-Job-Dataset --system lassen --arrival poisson + raps run -f /path/to/LAST/Lassen-Supercomputer-Job-Dataset --system lassen --arrival poisson # to fast-forward 365 days and replay for 1 day. 
This region day has 2250 jobs with 1650 jobs executed. - python main.py -f /path/to/LAST/Lassen-Supercomputer-Job-Dataset --system lassen --start '2019-08-22T00:00:00+00:00' -t 1d + raps run -f /path/to/LAST/Lassen-Supercomputer-Job-Dataset --system lassen --start '2019-08-22T00:00:00+00:00' -t 1d # For the network replay this command gives suiteable snapshots: - python main.py -f /path/to/LAST/Lassen-Supercomputer-Job-Dataset --system lassen --policy fcfs --backfill firstfit -t 12h --arrival poisson # noqa + raps run -f /path/to/LAST/Lassen-Supercomputer-Job-Dataset --system lassen --policy fcfs --backfill firstfit -t 12h --arrival poisson # noqa """ import math diff --git a/raps/dataloaders/marconi100.py b/raps/dataloaders/marconi100.py index 6ff310b288579e6269643f0622c24387d84681c8..4b3c5c6ba9a5ef9fbbb6ce55843b07b336607766 100644 --- a/raps/dataloaders/marconi100.py +++ b/raps/dataloaders/marconi100.py @@ -9,14 +9,14 @@ Download `job_table.parquet` from https://zenodo.org/records/10127767 # to simulate the dataset - python main.py -f /path/to/job_table.parquet --system marconi100 + raps run -f /path/to/job_table.parquet --system marconi100 # to replay using differnt schedulers - python main.py -f /path/to/job_table.parquet --system marconi100 --policy fcfs --backfill easy - python main.py -f /path/to/job_table.parquet --system marconi100 --policy priority --backfill firstfit + raps run -f /path/to/job_table.parquet --system marconi100 --policy fcfs --backfill easy + raps run -f /path/to/job_table.parquet --system marconi100 --policy priority --backfill firstfit # to fast-forward 60 days and replay for 1 day - python main.py -f /path/to/job_table.parquet --system marconi100 --start 2020-07-05T00:00:00+00:00 -t 1d + raps run -f /path/to/job_table.parquet --system marconi100 --start 2020-07-05T00:00:00+00:00 -t 1d # to analyze dataset python -m raps.telemetry -f /path/to/job_table.parquet --system marconi100 -v diff --git a/raps/schedulers/fastsim.py 
b/raps/schedulers/fastsim.py index 855dcbdec7f8433332b3040eb9a72f1e4b3d292a..e930a1c5d552bd8acb580939aaa9ed8633cb7a23 100644 --- a/raps/schedulers/fastsim.py +++ b/raps/schedulers/fastsim.py @@ -10,7 +10,7 @@ from raps.sim_config import args from raps.system_config import get_system_config # Run with this command: -# python main.py --system kestrel -f ../data/fastsim_jobs_output.parquet --scheduler fastsim --policy priority --start 2024-09-01T00:00 --end 2024-09-15T00:00 +# raps run --system kestrel -f ../data/fastsim_jobs_output.parquet --scheduler fastsim --policy priority --start 2024-09-01T00:00 --end 2024-09-15T00:00 class Scheduler(): """ diff --git a/scripts/marconi100-day51.sh b/scripts/marconi100-day51.sh index 01da9a2bfc9a1e6930f3a52e22718982864f9cd2..77cbe459861a4cfcfc435908a3380282f52871f6 100644 --- a/scripts/marconi100-day51.sh +++ b/scripts/marconi100-day51.sh @@ -1,4 +1,4 @@ -python main.py -f ~/data/marconi100/job_table.parquet --system marconi100 --ff 4381000 -t 61000 -o --policy replay -python main.py -f ~/data/marconi100/job_table.parquet --system marconi100 --ff 4381000 -t 61000 -o --policy fcfs -python main.py -f ~/data/marconi100/job_table.parquet --system marconi100 --ff 4381000 -t 61000 -o --policy fcfs --backfill easy -python main.py -f ~/data/marconi100/job_table.parquet --system marconi100 --ff 4381000 -t 61000 -o --policy priority --backfill firstfit +./main.py run -f ~/data/marconi100/job_table.parquet --system marconi100 --ff 4381000 -t 61000 -o --policy replay +./main.py run -f ~/data/marconi100/job_table.parquet --system marconi100 --ff 4381000 -t 61000 -o --policy fcfs +./main.py run -f ~/data/marconi100/job_table.parquet --system marconi100 --ff 4381000 -t 61000 -o --policy fcfs --backfill easy +./main.py run -f ~/data/marconi100/job_table.parquet --system marconi100 --ff 4381000 -t 61000 -o --policy priority --backfill firstfit diff --git a/tests/util.py b/tests/util.py index 
4bbf8f8c18ff074d373dc31d727e7ae7b8ff7c19..46736b31a8f970df1db78863bc54e33a80c64251 100644 --- a/tests/util.py +++ b/tests/util.py @@ -13,7 +13,7 @@ from raps.stats import ( def find_project_root(): path = Path(__file__).resolve() - while not (path / "main.py").exists(): + while not (path / "pyproject.toml").exists(): if path.parent == path: raise RuntimeError("Could not find project root.") path = path.parent