raps/sim_config.py  +1 −1

@@ -136,7 +136,7 @@ class SimConfig(RAPSBaseModel, abc.ABC):
     # Workload arguments (TODO split into separate model)
     workload: Literal['random', 'benchmark', 'peak', 'idle', 'synthetic',
                       'multitenant', 'replay', 'randomAI', 'network_test',
-                      'inter_job_congestion', 'calculon'] = "random"
+                      'inter_job_congestion', 'calculon', 'hpl'] = "random"
     """ Type of synthetic workload """

     multimodal: list[float] = [1.0]

raps/workloads/__init__.py  +3 −1

@@ -13,6 +13,7 @@
 from .basic import BasicWorkload
 from .calculon import Calculon
 from .constants import JOB_NAMES, ACCT_NAMES, MAX_PRIORITY
 from .distribution import DistributionWorkload
+from .hpl import HPL
 from .live import continuous_job_generation
 from .multitenant import MultitenantWorkload
 from .network import NetworkTestWorkload

@@ -57,7 +58,8 @@ class Workload(
     MultitenantWorkload,
     NetworkTestWorkload,
     InterJobCongestionWorkload,
-    Calculon
+    Calculon,
+    HPL
 ):
     """Final workload class with all workload types."""
     pass

raps/workloads/hpl.py  0 → 100644  +140 −0 (new file)

"""
Test using:
    python main.py run -w hpl -d
    python raps/workloads/hpl.py
"""
import math

import numpy as np

from raps.job import Job, job_dict


class HPL:
    """Analytical HPL workload generator for ExaDigiT."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def hpl(self, **kwargs):
        jobs = []

        # Example: parameter sweep across node counts or block sizes
        hpl_tests = [
            # {"M": 131072, "b": 576, "P": 192, "Q": 384, "Rtype": "1-ring"},
            # {"M": 131072, "b": 576, "P": 16, "Q": 32, "Rtype": "1-ring"},
            {"M": 741455, "b": 576, "P": 16, "Q": 32, "Rtype": "1-ring"},
        ]
        # GCDS_PER_GPU = 2

        for test in hpl_tests:
            for partition in self.partitions:
                cfg = self.config_map[partition]
                trace_quanta = cfg["TRACE_QUANTA"]

                # --- Analytical model evaluation ---
                results = self._run_hpl_model(**test)
                total_time = results["T_total"]
                gpu_util = results["gpu_util"]
                cpu_util = results["cpu_util"]

                # Constant-utilization traces, one sample per trace quantum
                num_samples = math.ceil(total_time / trace_quanta) + 1
                gpu_trace = np.full(num_samples, gpu_util)
                cpu_trace = np.full(num_samples, cpu_util)

                job_info = job_dict(
                    # One process-grid rank per GPU; the GCD variant is kept
                    # commented out:
                    # nodes_required=test["P"] * test["Q"] // (cfg["GPUS_PER_NODE"] * GCDS_PER_GPU),
                    nodes_required=test["P"] * test["Q"] // cfg["GPUS_PER_NODE"],
                    scheduled_nodes=[],
                    name=f"HPL_{test['M']}x{test['M']}",
                    account="benchmark",
                    cpu_trace=cpu_trace,
                    gpu_trace=gpu_trace,
                    ntx_trace=[],
                    nrx_trace=[],
                    id=None,
                    end_state="COMPLETED",
                    priority=100,
                    partition=partition,
                    time_limit=total_time,
                    start_time=0,
                    end_time=total_time,
                    expected_run_time=total_time,
                    trace_quanta=trace_quanta,
                    trace_time=total_time,
                    trace_start_time=0,
                    trace_end_time=total_time,
                )
                jobs.append(Job(job_info))

        return jobs

    def _run_hpl_model(self, M, b, P, Q, Rtype="1-ring", f=0.6):
        # Model constants (Table II + Fig 2b): effective communication rates
        # (C*) and compute throughputs in flop/s (F*)
        CAllgather = 6.3e9
        C1ring = 7e9
        Creduce = 46e6
        Fcpublas = 240e9
        Fgemm = 24e12

        Ml = M / P       # local panel height per process column
        Nl = M / Q       # local width per process row
        nb = int(M / b)  # number of block columns (panels)
        total_T = 0.0

        for i in range(nb):
            Ml_i = Ml - (i * b / P)
            Nl1_i = max((1 - f) * Nl - i * b / Q, 0)
            Nl2_i = f * Nl if i * b < f * Nl else Nl - i * b / Q

            TPDFACT = b ** 2 / Creduce + (2 / 3) * b ** 2 * Ml_i / Fcpublas
            TLBCAST = 16 * b * Ml_i / C1ring
            TUPD1 = 2 * b * Ml_i * Nl1_i / Fgemm
            TUPD2 = 2 * b * Ml_i * Nl2_i / Fgemm
            TRS1 = 16 * b * Nl1_i / CAllgather
            TRS2 = 16 * b * Nl2_i / CAllgather

            total_T += max(TPDFACT + TLBCAST + TRS1, TUPD2) + max(TRS2, TUPD1)

        # Derive synthetic utilization (normalized ratios)
        gpu_util = min(1.0, Fgemm / 25e12)
        cpu_util = min(1.0, Fcpublas / 250e9)

        return {"T_total": total_T, "gpu_util": gpu_util, "cpu_util": cpu_util}


if __name__ == "__main__":
    # Mock minimal configuration values to mimic the ExaDigiT runtime
    class DummyHPL(HPL):
        def __init__(self):
            # Provide fake partitions and system config
            self.partitions = ["gpu"]
            self.config_map = {
                "gpu": {
                    "TRACE_QUANTA": 15.0,  # seconds per trace tick
                    "GPUS_PER_NODE": 4,
                    "CPUS_PER_NODE": 64,
                }
            }

    # Instantiate the dummy workload and run synthetic job generation
    workload = DummyHPL()
    jobs = workload.hpl()

    print(f"Generated {len(jobs)} HPL jobs:\n")
    for i, job in enumerate(jobs):
        print(f"--- Job {i} ---")
        print(f"Name: {job.name}")
        print(f"Nodes required: {job.nodes_required}")
        print(f"Wall time: {job.trace_time:.2f} s")
        print(f"CPU trace length: {len(job.cpu_trace)}")
        print(f"GPU trace length: {len(job.gpu_trace)}")
        print(f"Avg CPU util: {np.mean(job.cpu_trace):.3f}")
        print(f"Avg GPU util: {np.mean(job.gpu_trace):.3f}")
        print(f"Expected run time: {job.expected_run_time:.2f}")
        print()
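For reference, the per-panel time that the `_run_hpl_model` loop accumulates can be written out as follows (symbols mirror the code; the C*/F* constants are the rates the code attributes to "Table II + Fig 2b" of its source):

\begin{aligned}
T_{\text{PDFACT}}^{(i)} &= \frac{b^{2}}{C_{\text{reduce}}} + \frac{2}{3}\,\frac{b^{2} M_{l}^{(i)}}{F_{\text{cpublas}}}, &
T_{\text{LBCAST}}^{(i)} &= \frac{16\, b\, M_{l}^{(i)}}{C_{\text{1ring}}}, \\
T_{\text{UPD},k}^{(i)} &= \frac{2\, b\, M_{l}^{(i)} N_{l,k}^{(i)}}{F_{\text{gemm}}}, &
T_{\text{RS},k}^{(i)} &= \frac{16\, b\, N_{l,k}^{(i)}}{C_{\text{allgather}}}, \qquad k \in \{1, 2\}, \\
T_{\text{total}} &= \sum_{i=0}^{n_b - 1} \Bigl[ \max\!\bigl(T_{\text{PDFACT}}^{(i)} + T_{\text{LBCAST}}^{(i)} + T_{\text{RS},1}^{(i)},\; T_{\text{UPD},2}^{(i)}\bigr) + \max\!\bigl(T_{\text{RS},2}^{(i)},\; T_{\text{UPD},1}^{(i)}\bigr) \Bigr].
\end{aligned}

Here M_l^{(i)} and N_{l,k}^{(i)} are the shrinking local panel dimensions computed at the top of the loop, with the trailing matrix split by the look-ahead fraction f. The two max() terms express the overlap the code models: panel factorization, broadcast, and the first row swap run concurrently with one part of the trailing update, and the second row swap with the other.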
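As a quick sanity check of the sizing logic, a minimal sketch of the arithmetic the generator performs for the committed test case, using the dummy __main__ config (the 3600 s model time is hypothetical, purely for illustration):

import math

# Committed test case {"M": 741455, "b": 576, "P": 16, "Q": 32} with the
# dummy config's GPUS_PER_NODE = 4; illustrative numbers only.
P, Q = 16, 32
GPUS_PER_NODE = 4

ranks = P * Q                             # 512 process-grid ranks (one per GPU)
nodes_required = ranks // GPUS_PER_NODE   # 512 // 4 = 128 nodes

# Trace length for a hypothetical 3600 s model time at TRACE_QUANTA = 15 s:
num_samples = math.ceil(3600 / 15.0) + 1  # 241 constant-utilization samples

print(nodes_required, num_samples)        # -> 128 241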