Commit 19421791 authored by Hines, Jesse's avatar Hines, Jesse
Browse files

Merge branch 'dataloader-changes' into 'develop'

More changes to dataloaders and raps config for the simulation server

See merge request exadigit/raps!133
parents c95115b2 a10d0e7d
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -9,3 +9,4 @@ models/fmu-models
.shell-completion-cache
raps-output-*
ppo_raps_logs
/data
+1 −1
Original line number Diff line number Diff line
@@ -4,7 +4,7 @@ system:
  nodes_per_rack: 4
  rectifiers_per_rack: 32
  chassis_per_rack: 4
  nodes_per_blade: 2
  nodes_per_blade: 1
  switches_per_chassis: 4
  nics_per_node: 4
  rectifiers_per_chassis: 4
+2 −1
Original line number Diff line number Diff line
@@ -69,7 +69,7 @@ def main(cli_args: list[str] | None = None):

    from raps.run_sim import run_sim_add_parser, run_parts_sim_add_parser, show_add_parser
    from raps.workloads import run_workload_add_parser
    from raps.telemetry import run_telemetry_add_parser
    from raps.telemetry import run_telemetry_add_parser, run_download_add_parser
    from raps.train_rl import train_rl_add_parser

    parser = argparse.ArgumentParser(
@@ -85,6 +85,7 @@ def main(cli_args: list[str] | None = None):
    show_add_parser(subparsers)
    run_workload_add_parser(subparsers)
    run_telemetry_add_parser(subparsers)
    run_download_add_parser(subparsers)
    train_rl_add_parser(subparsers)
    shell_completion_add_parser(subparsers)

+25 −15
Original line number Diff line number Diff line
"""

# get the data
    Download `AdastaJobsMI250_15days.parquet` from
    https://zenodo.org/records/14007065/files/AdastaJobsMI250_15days.parquet

```
raps download --system adastraMI250
```
This will download the dataset from https://zenodo.org/records/14007065/files/AdastaJobsMI250_15days.parquet

# to simulate the dataset
raps run -f /path/to/AdastaJobsMI250_15days.parquet --system adastraMI250
@@ -17,12 +17,14 @@

# to analyze dataset
python -m raps.telemetry -f /path/to/AdastaJobsMI250_15days.parquet --system adastraMI250 -v

"""
import uuid
import numpy as np
import pandas as pd
from pathlib import Path
from datetime import datetime
from tqdm import tqdm
import urllib.request

from ..job import job_dict, Job
from ..utils import WorkloadData
@@ -279,3 +281,11 @@ def cdu_pos(index: int, config: dict) -> tuple[int, int]:
    name = CDU_NAMES[index - 1]
    row, col = int(name[2]), int(name[3:5])
    return (row, col)


def download(dest: Path, start: datetime | None, end: datetime | None):
    """Download the Adastra MI250 15-day job trace from Zenodo into *dest*.

    ``start`` and ``end`` are accepted for interface parity with the other
    dataloaders' ``download`` hooks but are ignored here: the Zenodo record
    is a single fixed parquet file covering the whole 15-day window.

    Raises ``FileExistsError`` if *dest* already exists (deliberate — avoids
    silently clobbering a previous download).
    """
    dest.mkdir(parents=True)
    filename = "AdastaJobsMI250_15days.parquet"
    # BUG FIX: the f-strings previously interpolated a literal "(unknown)"
    # placeholder instead of the filename, producing a broken download URL.
    print(f"Downloading {filename}")
    urllib.request.urlretrieve(
        f"https://zenodo.org/records/14007065/files/{filename}",
        dest / filename,
    )
    print("Done!")
+51 −9
Original line number Diff line number Diff line
@@ -10,13 +10,16 @@
python -m raps.telemetry -f $DPATH/slurm/joblive/$DATEDIR,$DPATH/jobprofile/$DATEDIR
"""
import time
from datetime import datetime, timezone
from datetime import datetime, timezone, timedelta
from zoneinfo import ZoneInfo
import numpy as np
import pandas as pd
import subprocess
from tqdm import tqdm
from pathlib import Path

from ..job import job_dict, Job
from ..utils import power_to_utilization, encrypt, WorkloadData
from ..utils import power_to_utilization, encrypt, WorkloadData, date_range


def aging_boost(nnodes):
@@ -609,3 +612,42 @@ def cdu_pos(index: int, config: dict) -> tuple[int, int]:
    name = CDU_NAMES[index - 1]
    row, col = int(name[2]), int(name[3:5])
    return (row, col)


def download(dest: Path, start: datetime | None, end: datetime | None):
    """Pull the Frontier telemetry lake over rsync/SSH into *dest*.

    Requires NCCS access; prompts interactively for a username. ``start`` and
    ``end`` bound which daily partitions are fetched. Jobs are partitioned by
    submission date, so the requested window is widened by two days on each
    side to catch jobs that were submitted earlier but ran inside the window.

    Raises ``FileExistsError`` if *dest* already exists, and
    ``subprocess.CalledProcessError`` if either rsync transfer fails.
    """
    HOST = "dtn.ccs.ornl.gov"
    DATA_LAKE = "/lustre/orion/stf218/proj-shared/data/lake/frontier"

    print("Downloading the Frontier dataset requires access permissions.")
    print("If you have access you can download via SSH.")
    USERNAME = input("NCCS Username: ")

    # Widen the window (see docstring); default to the start of the dataset /
    # "now" when a bound was not given. All arithmetic is done in UTC.
    utc = ZoneInfo("UTC")
    pad = timedelta(days=2)
    start = (start - pad).astimezone(utc) if start else datetime.fromisoformat("2023-09-01T00:00:00Z")
    end = (end + pad).astimezone(utc) if end else datetime.now(utc)

    # One --include per daily partition; everything else is excluded below.
    includes = [f"--include=date={day.date().isoformat()}/***" for day in date_range(start, end)]

    def _pull(remote: str, local: Path):
        # -rvm: recursive, verbose, prune empty directories from the transfer.
        subprocess.run(["rsync", "-rvm",
                        *includes,
                        "--exclude", '*',
                        f"{USERNAME}@{HOST}:{DATA_LAKE}/{remote}/",
                        str(local)
                        ], check=True, text=True)

    dest.mkdir(parents=True)
    _pull("jobprofile/jobprofile", dest / "jobprofile")

    (dest / 'slurm').mkdir(parents=True)
    _pull("slurm/joblive", dest / "slurm/joblive")

    print("Done!")
Loading