.gitignore  +1 −0

```diff
@@ -9,3 +9,4 @@ models/fmu-models
 .shell-completion-cache
 raps-output-*
 ppo_raps_logs
+/data
```
config/selene.yaml  +1 −1

```diff
@@ -4,7 +4,7 @@ system:
   nodes_per_rack: 4
   rectifiers_per_rack: 32
   chassis_per_rack: 4
-  nodes_per_blade: 2
+  nodes_per_blade: 1
   switches_per_chassis: 4
   nics_per_node: 4
   rectifiers_per_chassis: 4
```
main.py  +2 −1

```diff
@@ -69,7 +69,7 @@ def main(cli_args: list[str] | None = None):
     from raps.run_sim import run_sim_add_parser, run_parts_sim_add_parser, show_add_parser
     from raps.workloads import run_workload_add_parser
-    from raps.telemetry import run_telemetry_add_parser
+    from raps.telemetry import run_telemetry_add_parser, run_download_add_parser
     from raps.train_rl import train_rl_add_parser
 
     parser = argparse.ArgumentParser(
@@ -85,6 +85,7 @@ def main(cli_args: list[str] | None = None):
     show_add_parser(subparsers)
     run_workload_add_parser(subparsers)
     run_telemetry_add_parser(subparsers)
+    run_download_add_parser(subparsers)
     train_rl_add_parser(subparsers)
     shell_completion_add_parser(subparsers)
```
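The diff wires a new `download` subcommand into the CLI but only shows the import and registration; the body of `run_download_add_parser` lives in `raps/telemetry.py` and is not part of this excerpt. Below is a minimal sketch of how such a subparser could dispatch to the per-dataloader `download(dest, start, end)` functions added later in this diff. The flag names, the `--output` default, and the `set_defaults(func=...)` wiring are assumptions for illustration, not the actual implementation.

```python
# Hypothetical sketch only -- the real run_download_add_parser is in raps/telemetry.py
# and may differ. Only the download(dest, start, end) signature comes from this diff.
import argparse
import importlib
from datetime import datetime
from pathlib import Path


def run_download_add_parser(subparsers) -> None:
    parser = subparsers.add_parser("download", help="download telemetry for a system")
    parser.add_argument("--system", required=True, help="e.g. adastraMI250 or frontier")
    parser.add_argument("--output", type=Path, default=Path("data"),
                        help="destination directory (illustrative default)")
    parser.add_argument("--start", type=datetime.fromisoformat, default=None)
    parser.add_argument("--end", type=datetime.fromisoformat, default=None)
    parser.set_defaults(func=run_download)


def run_download(args: argparse.Namespace) -> None:
    # Each dataloader module (raps/dataloaders/<system>.py) now exposes download().
    loader = importlib.import_module(f"raps.dataloaders.{args.system}")
    loader.download(args.output / args.system, args.start, args.end)
```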
raps/dataloaders/adastraMI250.py  +25 −15

````diff
@@ ... @@
 """
 # get the data
-Download `AdastaJobsMI250_15days.parquet` from https://zenodo.org/records/14007065/files/AdastaJobsMI250_15days.parquet
+```
+raps download --system adastraMI250
+```
+This will download the dataset from https://zenodo.org/records/14007065/files/AdastaJobsMI250_15days.parquet
 
 # to simulate the dataset
 raps run -f /path/to/AdastaJobsMI250_15days.parquet --system adastraMI250
@@ -17,12 +17,14 @@
 # to analyze dataset
 python -m raps.telemetry -f /path/to/AdastaJobsMI250_15days.parquet --system adastraMI250 -v
 """
 import uuid
 import numpy as np
 import pandas as pd
+from pathlib import Path
 from datetime import datetime
 from tqdm import tqdm
+import urllib.request
 
 from ..job import job_dict, Job
 from ..utils import WorkloadData
@@ -279,3 +281,11 @@ def cdu_pos(index: int, config: dict) -> tuple[int, int]:
     name = CDU_NAMES[index - 1]
     row, col = int(name[2]), int(name[3:5])
     return (row, col)
+
+
+def download(dest: Path, start: datetime | None, end: datetime | None):
+    dest.mkdir(parents=True)
+    filename = "AdastaJobsMI250_15days.parquet"
+    print(f"Downloading {filename}")
+    urllib.request.urlretrieve(f"https://zenodo.org/records/14007065/files/{filename}", dest / filename)
+    print("Done!")
````
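The new `download()` for adastraMI250 is a single unauthenticated `urlretrieve` from Zenodo with no progress reporting, even though the module already imports `tqdm`. If a progress bar were wanted, `urlretrieve`'s `reporthook` callback is enough. A sketch of that optional variation follows; it is not part of this PR, and it also passes `exist_ok=True` so re-runs don't fail on an existing directory.

```python
# Optional variation, not in the PR: same Zenodo fetch, but with a tqdm progress bar.
import urllib.request
from pathlib import Path
from tqdm import tqdm

URL = "https://zenodo.org/records/14007065/files/AdastaJobsMI250_15days.parquet"


def download_with_progress(dest: Path) -> None:
    dest.mkdir(parents=True, exist_ok=True)   # unlike the PR, tolerate an existing dir
    filename = URL.rsplit("/", 1)[-1]
    with tqdm(unit="B", unit_scale=True, desc=filename) as bar:
        def reporthook(blocknum: int, blocksize: int, totalsize: int) -> None:
            if totalsize > 0:
                bar.total = totalsize
            bar.update(blocknum * blocksize - bar.n)   # advance to bytes received so far
        urllib.request.urlretrieve(URL, dest / filename, reporthook=reporthook)
```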
raps/dataloaders/frontier.py  +51 −9

```diff
@@ -10,13 +10,16 @@
 python -m raps.telemetry -f $DPATH/slurm/joblive/$DATEDIR,$DPATH/jobprofile/$DATEDIR
 """
 import time
-from datetime import datetime, timezone
+from datetime import datetime, timezone, timedelta
+from zoneinfo import ZoneInfo
 import numpy as np
 import pandas as pd
+import subprocess
 from tqdm import tqdm
+from pathlib import Path
 
 from ..job import job_dict, Job
-from ..utils import power_to_utilization, encrypt, WorkloadData
+from ..utils import power_to_utilization, encrypt, WorkloadData, date_range
 
 
 def aging_boost(nnodes):
@@ -609,3 +612,42 @@ def cdu_pos(index: int, config: dict) -> tuple[int, int]:
     name = CDU_NAMES[index - 1]
     row, col = int(name[2]), int(name[3:5])
     return (row, col)
+
+
+def download(dest: Path, start: datetime | None, end: datetime | None):
+    HOST = "dtn.ccs.ornl.gov"
+    DATA_LAKE = "/lustre/orion/stf218/proj-shared/data/lake/frontier"
+
+    print("Downloading the Frontier dataset requires access permissions.")
+    print("If you have access you can download via SSH.")
+    USERNAME = input("NCCS Username: ")
+
+    # jobs are indexed by submission time so download a few extra days to make sure we get all that
+    # ran over start -> end
+    if start:
+        start = (start - timedelta(days=2)).astimezone(ZoneInfo("UTC"))
+    else:
+        start = datetime.fromisoformat("2023-09-01T00:00:00Z")
+    if end:
+        end = (end + timedelta(days=2)).astimezone(ZoneInfo("UTC"))
+    else:
+        end = datetime.now(ZoneInfo("UTC"))
+    days = list(date_range(start, end))
+
+    dest.mkdir(parents=True)
+    subprocess.run(["rsync", "-rvm",
+                    *[f"--include=date={d.date().isoformat()}/***" for d in days],
+                    "--exclude", '*',
+                    f"{USERNAME}@{HOST}:{DATA_LAKE}/jobprofile/jobprofile/",
+                    str(dest / "jobprofile")
+                    ], check=True, text=True)
+
+    (dest / 'slurm').mkdir(parents=True)
+    subprocess.run(["rsync", "-rvm",
+                    *[f"--include=date={d.date().isoformat()}/***" for d in days],
+                    "--exclude", '*',
+                    f"{USERNAME}@{HOST}:{DATA_LAKE}/slurm/joblive/",
+                    str(dest / "slurm/joblive")
+                    ], check=True, text=True)
+
+    print("Done!")
```
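frontier's `download()` leans on `date_range(start, end)`, which the diff imports from `raps.utils` but does not show. Each yielded day becomes an `--include=date=YYYY-MM-DD/***` rsync filter, and the trailing `--exclude '*'` drops everything else, so only the requested date partitions are pulled from the data lake. Since the code calls `d.date().isoformat()` on each element, the helper presumably yields one `datetime` per day over the interval; a sketch under that assumption:

```python
# Assumed behavior of raps.utils.date_range -- the real helper is not shown in this diff.
from datetime import datetime, timedelta
from typing import Iterator


def date_range(start: datetime, end: datetime) -> Iterator[datetime]:
    """Yield one datetime per day from start up to and including end (assumed inclusive)."""
    day = start
    while day <= end:
        yield day
        day += timedelta(days=1)
```

One portability note: `datetime.fromisoformat("2023-09-01T00:00:00Z")` in the fallback branch only accepts a trailing `Z` on Python 3.11+; older interpreters need the offset spelled out as `+00:00`.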