Commit a63b246c authored by Brewer, Wes's avatar Brewer, Wes
Browse files

Add Aurora system support with dataloader, config, and experiment files

parent f87cb7a5
Loading
Loading
Loading
Loading

config/aurora.yaml

0 → 100644
+54 −0
Original line number Diff line number Diff line
# Aurora system configuration for the simulator.
# NOTE(review): units below are inferred from field names (watts for power_*,
# seconds for scheduler times, flops for *_peak_flops) — confirm against the
# config loader before relying on them.
system:
  num_cdus: 42                  # cooling distribution units; 42 x 4 racks = 168 rack slots
  racks_per_cdu: 4
  nodes_per_rack: 64
  chassis_per_rack: 8
  nodes_per_blade: 1
  switches_per_chassis: 4
  nics_per_node: 4
  rectifiers_per_chassis: 4
  nodes_per_rectifier: 4
  # 168 slots minus these 2 gives 166 populated racks — presumably 0-based
  # rack indices; TODO confirm indexing convention.
  missing_racks: [166, 167]
  down_nodes: []
  cpus_per_node: 2
  gpus_per_node: 6
  cpu_peak_flops: 3.9936E12     # per-CPU peak — TODO confirm per-device vs per-node
  gpu_peak_flops: 52.43E12      # per-GPU peak — TODO confirm per-device vs per-node
  cpu_fp_ratio: 0.511
  gpu_fp_ratio: 0.511
power:
  # Idle/max draws per device; presumably watts.
  power_gpu_idle: 88
  power_gpu_max: 600
  power_cpu_idle: 90
  power_cpu_max: 350
  power_mem: 74.26
  power_nic: 20
  power_nvme: 30
  power_switch: 250
  power_cdu: 8473.47
  power_update_freq: 15         # presumably seconds between power samples
  rectifier_peak_threshold: 13670
  # Conversion-loss model: constant offset plus efficiency factor.
  sivoc_loss_constant: 13
  sivoc_efficiency: 0.98
  rectifier_loss_constant: 17
  rectifier_efficiency: 0.96
  power_cost: 0.094             # presumably $/kWh — TODO confirm
scheduler:
  job_arrival_time: 15          # presumably mean inter-arrival time in seconds
  mtbf: 11                      # mean time between failures — units unclear; TODO confirm
  trace_quanta: 15
  min_wall_time: 60             # seconds
  max_wall_time: 43200          # 12 hours in seconds
  ui_update_freq: 900
  max_nodes_per_job: 3000
  # Terminal-state distribution for synthetic jobs; probabilities sum to 1.0.
  job_end_probs:
    COMPLETED: 0.63
    FAILED: 0.13
    CANCELLED: 0.12
    TIMEOUT: 0.11
    NODE_FAIL: 0.01
  queues:
    default: {}
    urgent:
      reserved_nodes: 64
      max_runtime: 172800  # 2 days in seconds
+3 −0
Original line number Diff line number Diff line
system: aurora
replay:
- /opt/data/aurora/ANL-ALCF-DJC-AURORA_20250127_20251031.csv
+1 −1
Original line number Diff line number Diff line
system: bluewaters
replay:
  - /opt/data/bluewaters
start: "20170328"
start: "2017-03-28"
simulate_network: True
filter: "traffic > 1e8"
+5 −0
Original line number Diff line number Diff line
partitions: ["philly/2-gpu", "philly/8-gpu"]
replay:
  - /opt/data/philly/trace-data
start: 2017-10-03T00:00
end: 2017-10-04T00:00
+107 −0
Original line number Diff line number Diff line
"""
Download DIM_JOB_COMPOSITE dataset from https://reports.alcf.anl.gov/data/aurora.html

Test case:

    raps run experiments/aurora.yaml -t 1 --noui --no-cooling

"""
import pandas as pd
from pathlib import Path
from datetime import datetime, timezone

from ..job import job_dict, Job
from ..utils import WorkloadData


def load_data(local_dataset_path, **kwargs):
    """
    Load the ALCF Aurora DIM_JOB_COMPOSITE job trace into a WorkloadData.

    Parameters
    ----------
    local_dataset_path : str | Path | list
        Path to the CSV export; if a list is given, only the first entry
        is used.
    **kwargs
        Unused; accepted for dataloader-interface compatibility.

    Returns
    -------
    WorkloadData
        Jobs with all times rebased to seconds relative to the earliest
        START_TIMESTAMP in the trace (telemetry_start=0).

    Raises
    ------
    FileNotFoundError
        If the CSV file does not exist.
    """
    if isinstance(local_dataset_path, list):
        filepath = Path(local_dataset_path[0])
    else:
        filepath = Path(local_dataset_path)

    if not filepath.is_file():
        raise FileNotFoundError(f"File not found: {filepath}")

    print(f"Reading data from {filepath}")

    # COBALT_JOBID is read but job ids are derived from JOB_NAME below
    # (its prefix before the first '.') — presumably intentional; confirm.
    usecols = [
        "COBALT_JOBID", "JOB_NAME", "QUEUED_TIMESTAMP", "START_TIMESTAMP", "END_TIMESTAMP",
        "NODES_REQUESTED", "CORES_REQUESTED", "WALLTIME_SECONDS", "RUNTIME_SECONDS",
        "USERNAME_GENID",
    ]
    df = pd.read_csv(filepath, usecols=usecols, low_memory=False, on_bad_lines='warn')

    # Parse timestamps vectorially; unparseable values become NaT and are
    # dropped by the filter below.
    df['START_TIMESTAMP'] = pd.to_datetime(df['START_TIMESTAMP'], errors='coerce')
    df['END_TIMESTAMP']   = pd.to_datetime(df['END_TIMESTAMP'],   errors='coerce')
    df['QUEUED_TIMESTAMP'] = pd.to_datetime(df['QUEUED_TIMESTAMP'], errors='coerce')

    # Keep only rows usable for replay: valid timestamps, at least one node,
    # a positive runtime, and a consistent start/end ordering.
    df = df.dropna(subset=['START_TIMESTAMP', 'END_TIMESTAMP', 'QUEUED_TIMESTAMP'])
    df = df[(df['NODES_REQUESTED'] > 0) & (df['RUNTIME_SECONDS'] > 0)]
    df = df[df['END_TIMESTAMP'] > df['START_TIMESTAMP']]
    df = df.sort_values('START_TIMESTAMP').reset_index(drop=True)

    if df.empty:
        return WorkloadData(jobs=[], telemetry_start=0, telemetry_end=0,
                            start_date=datetime.now(timezone.utc))

    # Rebase all times to seconds relative to the earliest job start.
    t0_ts = df['START_TIMESTAMP'].min()
    t0 = int(t0_ts.timestamp())

    start_times  = ((df['START_TIMESTAMP']  - t0_ts).dt.total_seconds()).astype(int).to_numpy()
    end_times    = ((df['END_TIMESTAMP']    - t0_ts).dt.total_seconds()).astype(int).to_numpy()
    # Queue times earlier than t0 are clamped to 0 so submit_time >= 0.
    submit_times = ((df['QUEUED_TIMESTAMP'] - t0_ts).dt.total_seconds()).clip(lower=0).astype(int).to_numpy()

    job_names    = df['JOB_NAME'].fillna('N/A').astype(str).to_numpy()
    job_ids      = [n.split('.')[0] for n in job_names]
    nodes        = df['NODES_REQUESTED'].astype(int).to_numpy()
    cores        = df['CORES_REQUESTED'].fillna(0).astype(int).to_numpy()
    # Fix: WALLTIME_SECONDS is not covered by the dropna/positivity filters
    # above, so it may still contain NaN; astype(int) on NaN raises.
    # Fill with 0 first, mirroring the CORES_REQUESTED handling.
    walltimes    = df['WALLTIME_SECONDS'].fillna(0).astype(int).to_numpy()
    runtimes     = df['RUNTIME_SECONDS'].astype(int).to_numpy()
    accounts     = df['USERNAME_GENID'].fillna('N/A').astype(str).to_numpy()

    print(f"Aurora: building {len(df)} jobs")

    jobs = []
    for i in range(len(df)):
        job = job_dict(
            id=job_ids[i],
            name=job_names[i],
            submit_time=int(submit_times[i]),
            start_time=int(start_times[i]),
            end_time=int(end_times[i]),
            time_limit=int(walltimes[i]),
            expected_run_time=int(runtimes[i]),
            nodes_required=int(nodes[i]),
            cpu_cores_required=int(cores[i]),
            account=accounts[i],
            scheduled_nodes=[],
            gpu_trace=0,
            cpu_trace=0,
            nrx_trace=[],
            ntx_trace=[],
            # The trace has no per-job exit code; all replayed jobs are
            # marked COMPLETED.
            end_state="COMPLETED",
            priority=0,
            current_run_time=0,
            trace_time=int(submit_times[i]),
            trace_start_time=int(start_times[i]),
            trace_end_time=int(end_times[i]),
            trace_quanta=1,
        )
        jobs.append(Job(job))

    telemetry_end = int(end_times.max())
    start_date = datetime.fromtimestamp(t0, timezone.utc)

    return WorkloadData(
        jobs=jobs,
        telemetry_start=0,
        telemetry_end=telemetry_end,
        start_date=start_date,
    )