Commit a63b246c authored by Brewer, Wes's avatar Brewer, Wes
Browse files

Add Aurora system support with dataloader, config, and experiment files

parent f87cb7a5
Loading
Loading
Loading
Loading

config/aurora.yaml

0 → 100644
+54 −0
Original line number Diff line number Diff line
# Aurora system configuration for the simulator.
# NOTE(review): units below are inferred from field names (watts for power_*,
# seconds for scheduler times, flops for *_peak_flops) — confirm against the
# config loader before relying on them.
system:
  num_cdus: 42                  # cooling distribution units; 42 x 4 racks = 168 rack slots
  racks_per_cdu: 4
  nodes_per_rack: 64
  chassis_per_rack: 8
  nodes_per_blade: 1
  switches_per_chassis: 4
  nics_per_node: 4
  rectifiers_per_chassis: 4
  nodes_per_rectifier: 4
  # 168 slots minus these 2 gives 166 populated racks — presumably 0-based
  # rack indices; TODO confirm indexing convention.
  missing_racks: [166, 167]
  down_nodes: []
  cpus_per_node: 2
  gpus_per_node: 6
  cpu_peak_flops: 3.9936E12     # per-CPU peak — TODO confirm per-device vs per-node
  gpu_peak_flops: 52.43E12      # per-GPU peak — TODO confirm per-device vs per-node
  cpu_fp_ratio: 0.511
  gpu_fp_ratio: 0.511
power:
  # Idle/max draws per device; presumably watts.
  power_gpu_idle: 88
  power_gpu_max: 600
  power_cpu_idle: 90
  power_cpu_max: 350
  power_mem: 74.26
  power_nic: 20
  power_nvme: 30
  power_switch: 250
  power_cdu: 8473.47
  power_update_freq: 15         # presumably seconds between power samples
  rectifier_peak_threshold: 13670
  # Conversion-loss model: constant offset plus efficiency factor.
  sivoc_loss_constant: 13
  sivoc_efficiency: 0.98
  rectifier_loss_constant: 17
  rectifier_efficiency: 0.96
  power_cost: 0.094             # presumably $/kWh — TODO confirm
scheduler:
  job_arrival_time: 15          # presumably mean inter-arrival time in seconds
  mtbf: 11                      # mean time between failures — units unclear; TODO confirm
  trace_quanta: 15
  min_wall_time: 60             # seconds
  max_wall_time: 43200          # 12 hours in seconds
  ui_update_freq: 900
  max_nodes_per_job: 3000
  # Terminal-state distribution for synthetic jobs; probabilities sum to 1.0.
  job_end_probs:
    COMPLETED: 0.63
    FAILED: 0.13
    CANCELLED: 0.12
    TIMEOUT: 0.11
    NODE_FAIL: 0.01
  queues:
    default: {}
    urgent:
      reserved_nodes: 64
      max_runtime: 172800  # 2 days in seconds
+3 −0
Original line number Diff line number Diff line
system: aurora
replay:
- /opt/data/aurora/ANL-ALCF-DJC-AURORA_20250127_20251031.csv
+1 −1
Original line number Diff line number Diff line
system: bluewaters
replay:
  - /opt/data/bluewaters
start: "20170328"
start: "2017-03-28"
simulate_network: True
filter: "traffic > 1e8"
+5 −0
Original line number Diff line number Diff line
partitions: ["philly/2-gpu", "philly/8-gpu"]
replay:
  - /opt/data/philly/trace-data
start: 2017-10-03T00:00
end: 2017-10-04T00:00
+107 −0
Original line number Diff line number Diff line
"""
Download DIM_JOB_COMPOSITE dataset from https://reports.alcf.anl.gov/data/aurora.html

Test case:

    raps run experiments/aurora.yaml -t 1 --noui --no-cooling

"""
import pandas as pd
from pathlib import Path
from datetime import datetime, timezone

from ..job import job_dict, Job
from ..utils import WorkloadData


def load_data(local_dataset_path, **kwargs):
    """
    Load the ALCF Aurora DIM_JOB_COMPOSITE job trace into a WorkloadData.

    Parameters
    ----------
    local_dataset_path : str | Path | list
        Path to the CSV export; if a list is given, only the first entry
        is used.
    **kwargs
        Unused; accepted for dataloader-interface compatibility.

    Returns
    -------
    WorkloadData
        Jobs with all times rebased to seconds relative to the earliest
        START_TIMESTAMP in the trace (telemetry_start=0).

    Raises
    ------
    FileNotFoundError
        If the CSV file does not exist.
    """
    if isinstance(local_dataset_path, list):
        filepath = Path(local_dataset_path[0])
    else:
        filepath = Path(local_dataset_path)

    if not filepath.is_file():
        raise FileNotFoundError(f"File not found: {filepath}")

    print(f"Reading data from {filepath}")

    # COBALT_JOBID is read but job ids are derived from JOB_NAME below
    # (its prefix before the first '.') — presumably intentional; confirm.
    usecols = [
        "COBALT_JOBID", "JOB_NAME", "QUEUED_TIMESTAMP", "START_TIMESTAMP", "END_TIMESTAMP",
        "NODES_REQUESTED", "CORES_REQUESTED", "WALLTIME_SECONDS", "RUNTIME_SECONDS",
        "USERNAME_GENID",
    ]
    df = pd.read_csv(filepath, usecols=usecols, low_memory=False, on_bad_lines='warn')

    # Parse timestamps vectorially; unparseable values become NaT and are
    # dropped by the filter below.
    df['START_TIMESTAMP'] = pd.to_datetime(df['START_TIMESTAMP'], errors='coerce')
    df['END_TIMESTAMP']   = pd.to_datetime(df['END_TIMESTAMP'],   errors='coerce')
    df['QUEUED_TIMESTAMP'] = pd.to_datetime(df['QUEUED_TIMESTAMP'], errors='coerce')

    # Keep only rows usable for replay: valid timestamps, at least one node,
    # a positive runtime, and a consistent start/end ordering.
    df = df.dropna(subset=['START_TIMESTAMP', 'END_TIMESTAMP', 'QUEUED_TIMESTAMP'])
    df = df[(df['NODES_REQUESTED'] > 0) & (df['RUNTIME_SECONDS'] > 0)]
    df = df[df['END_TIMESTAMP'] > df['START_TIMESTAMP']]
    df = df.sort_values('START_TIMESTAMP').reset_index(drop=True)

    if df.empty:
        return WorkloadData(jobs=[], telemetry_start=0, telemetry_end=0,
                            start_date=datetime.now(timezone.utc))

    # Rebase all times to seconds relative to the earliest job start.
    t0_ts = df['START_TIMESTAMP'].min()
    t0 = int(t0_ts.timestamp())

    start_times  = ((df['START_TIMESTAMP']  - t0_ts).dt.total_seconds()).astype(int).to_numpy()
    end_times    = ((df['END_TIMESTAMP']    - t0_ts).dt.total_seconds()).astype(int).to_numpy()
    # Queue times earlier than t0 are clamped to 0 so submit_time >= 0.
    submit_times = ((df['QUEUED_TIMESTAMP'] - t0_ts).dt.total_seconds()).clip(lower=0).astype(int).to_numpy()

    job_names    = df['JOB_NAME'].fillna('N/A').astype(str).to_numpy()
    job_ids      = [n.split('.')[0] for n in job_names]
    nodes        = df['NODES_REQUESTED'].astype(int).to_numpy()
    cores        = df['CORES_REQUESTED'].fillna(0).astype(int).to_numpy()
    # Fix: WALLTIME_SECONDS is not covered by the dropna/positivity filters
    # above, so it may still contain NaN; astype(int) on NaN raises.
    # Fill with 0 first, mirroring the CORES_REQUESTED handling.
    walltimes    = df['WALLTIME_SECONDS'].fillna(0).astype(int).to_numpy()
    runtimes     = df['RUNTIME_SECONDS'].astype(int).to_numpy()
    accounts     = df['USERNAME_GENID'].fillna('N/A').astype(str).to_numpy()

    print(f"Aurora: building {len(df)} jobs")

    jobs = []
    for i in range(len(df)):
        job = job_dict(
            id=job_ids[i],
            name=job_names[i],
            submit_time=int(submit_times[i]),
            start_time=int(start_times[i]),
            end_time=int(end_times[i]),
            time_limit=int(walltimes[i]),
            expected_run_time=int(runtimes[i]),
            nodes_required=int(nodes[i]),
            cpu_cores_required=int(cores[i]),
            account=accounts[i],
            scheduled_nodes=[],
            gpu_trace=0,
            cpu_trace=0,
            nrx_trace=[],
            ntx_trace=[],
            # The trace has no per-job exit code; all replayed jobs are
            # marked COMPLETED.
            end_state="COMPLETED",
            priority=0,
            current_run_time=0,
            trace_time=int(submit_times[i]),
            trace_start_time=int(start_times[i]),
            trace_end_time=int(end_times[i]),
            trace_quanta=1,
        )
        jobs.append(Job(job))

    telemetry_end = int(end_times.max())
    start_date = datetime.fromtimestamp(t0, timezone.utc)

    return WorkloadData(
        jobs=jobs,
        telemetry_start=0,
        telemetry_end=telemetry_end,
        start_date=start_date,
    )