From 1365689d905381a22bab2630683febb782be559b Mon Sep 17 00:00:00 2001
From: Wes Brewer <brewerwh@ornl.gov>
Date: Fri, 24 Oct 2025 20:39:29 -0400
Subject: [PATCH 1/7] Initial implementation of Hao's HPL analytical model

---
 raps/sim_config.py         |   2 +-
 raps/workloads/__init__.py |   4 +-
 raps/workloads/hpl.py      | 140 +++++++++++++++++++++++++++++++++++++
 3 files changed, 144 insertions(+), 2 deletions(-)
 create mode 100644 raps/workloads/hpl.py

diff --git a/raps/sim_config.py b/raps/sim_config.py
index 254859a..a12512f 100644
--- a/raps/sim_config.py
+++ b/raps/sim_config.py
@@ -136,7 +136,7 @@ class SimConfig(RAPSBaseModel, abc.ABC):
     # Workload arguments (TODO split into separate model)
     workload: Literal['random', 'benchmark', 'peak', 'idle', 'synthetic',
                       'multitenant', 'replay', 'randomAI', 'network_test',
-                      'inter_job_congestion', 'calculon'] = "random"
+                      'inter_job_congestion', 'calculon', 'hpl'] = "random"
 
     """ Type of synthetic workload """
     multimodal: list[float] = [1.0]
diff --git a/raps/workloads/__init__.py b/raps/workloads/__init__.py
index 9bcb41a..d789196 100644
--- a/raps/workloads/__init__.py
+++ b/raps/workloads/__init__.py
@@ -13,6 +13,7 @@ from .basic import BasicWorkload
 from .calculon import Calculon
 from .constants import JOB_NAMES, ACCT_NAMES, MAX_PRIORITY
 from .distribution import DistributionWorkload
+from .hpl import HPL
 from .live import continuous_job_generation
 from .multitenant import MultitenantWorkload
 from .network import NetworkTestWorkload
@@ -57,7 +58,8 @@ class Workload(
     MultitenantWorkload,
     NetworkTestWorkload,
     InterJobCongestionWorkload,
-    Calculon
+    Calculon,
+    HPL
 ):
     """Final workload class with all workload types."""
     pass
diff --git a/raps/workloads/hpl.py b/raps/workloads/hpl.py
new file mode 100644
index 0000000..bb6f18c
--- /dev/null
+++ b/raps/workloads/hpl.py
@@ -0,0 +1,140 @@
+"""
+Test using:
+
+    python main.py run -w hpl -d
+    python raps/workloads/hpl.py    
+"""
+from raps.job import Job, job_dict
+import numpy as np
+import math, random, json
+
+
+class HPL:
+    """Analytical HPL workload generator for ExaDigiT"""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def hpl(self, **kwargs):
+        jobs = []
+        # Example: parameter sweep across node counts or block sizes
+        hpl_tests = [
+            #{"M": 131072, "b": 576, "P": 192, "Q": 384, "Rtype": "1-ring"},
+            #{"M": 131072, "b": 576, "P": 16, "Q": 32, "Rtype": "1-ring"},
+            {"M": 741455, "b": 576, "P": 16, "Q": 32, "Rtype": "1-ring"},
+        ]
+
+        #GCDS_PER_GPU = 2
+
+        for test in hpl_tests:
+            for partition in self.partitions:
+                cfg = self.config_map[partition]
+                trace_quanta = cfg["TRACE_QUANTA"]
+
+                # --- Analytical model evaluation ---
+                results = self._run_hpl_model(**test)
+
+                total_time = results["T_total"]
+                gpu_util = results["gpu_util"]
+                cpu_util = results["cpu_util"]
+
+                num_samples = math.ceil(total_time / trace_quanta) + 1
+                gpu_trace = np.full(num_samples, gpu_util)
+                cpu_trace = np.full(num_samples, cpu_util)
+
+                job_info = job_dict(
+                    #nodes_required=test["P"] * test["Q"] // (cfg["GPUS_PER_NODE"] * GCDS_PER_GPU),
+                    nodes_required=test["P"] * test["Q"] // cfg["GPUS_PER_NODE"],
+                    scheduled_nodes=[],
+                    name=f"HPL_{test['M']}x{test['M']}",
+                    account="benchmark",
+                    cpu_trace=cpu_trace,
+                    gpu_trace=gpu_trace,
+                    ntx_trace=[], nrx_trace=[],
+                    id=None,
+                    end_state="COMPLETED",
+                    priority=100,
+                    partition=partition,
+                    time_limit=total_time,
+                    start_time=0,
+                    end_time=total_time,
+                    expected_run_time=total_time,
+                    trace_quanta=trace_quanta,
+                    trace_time=total_time,
+                    trace_start_time=0,
+                    trace_end_time=total_time,
+                )
+                jobs.append(Job(job_info))
+        return jobs
+
+    def _run_hpl_model(self, M, b, P, Q, Rtype="1-ring", f=0.6):
+        # constants (Table II + Fig 2b)
+        CAllgather = 6.3e9
+        C1ring = 7e9
+        Creduce = 46e6
+        Fcpublas = 240e9
+        Fgemm = 24e12
+
+        Ml = M / P
+        Nl = M / Q
+        nb = int(M / b)
+        total_T = 0.0
+
+        print("*** nb:", nb)
+        for i in range(nb):
+            Ml_i = Ml - (i * b / P)
+            Nl1_i = max((1 - f) * Nl - i * b / Q, 0)
+            Nl2_i = f * Nl if i * b < f * Nl else Nl - i * b / Q
+
+            TPDFACT = b ** 2 / Creduce + (2 / 3) * b ** 2 * Ml_i / Fcpublas
+            TLBCAST = 16 * b * Ml_i / C1ring
+            TUPD1 = 2 * b * Ml_i * Nl1_i / Fgemm
+            TUPD2 = 2 * b * Ml_i * Nl2_i / Fgemm
+            TRS1 = 16 * b * Nl1_i / CAllgather
+            TRS2 = 16 * b * Nl2_i / CAllgather
+
+            total_T += max(TPDFACT + TLBCAST + TRS1, TUPD2) + max(TRS2, TUPD1)
+
+        # derive synthetic utilization
+        gpu_util = min(1.0, (Fgemm / 25e12))      # normalized ratio
+        cpu_util = min(1.0, (Fcpublas / 250e9))
+
+        return {"T_total": total_T, "gpu_util": gpu_util, "cpu_util": cpu_util}
+
+if __name__ == "__main__":
+    import json
+    import numpy as np
+
+    # Mock minimal configuration values to mimic ExaDigiT runtime
+    class DummyHPL(HPL):
+        def __init__(self):
+            # Provide fake partitions and system config
+            self.partitions = ["gpu"]
+            self.config_map = {
+                "gpu": {
+                    "TRACE_QUANTA": 15.0,      # seconds per trace tick
+                    "GPUS_PER_NODE": 4,
+                    "CPUS_PER_NODE": 64,
+                }
+            }
+
+    # Instantiate dummy workload
+    workload = DummyHPL()
+
+    # Run synthetic job generation
+    jobs = workload.hpl()
+
+    print(f"Generated {len(jobs)} HPL jobs:\n")
+    for i, job in enumerate(jobs):
+        print(i, job)
+        print(f"--- Job {i} ---")
+        print(f"Name: {job.name}")
+        print(f"Nodes required: {job.nodes_required}")
+        print(f"Wall time: {job.trace_time:.2f} s")
+        print(f"CPU trace length: {len(job.cpu_trace)}")
+        print(f"GPU trace length: {len(job.gpu_trace)}")
+        print(f"Avg CPU util: {np.mean(job.cpu_trace):.3f}")
+        print(f"Avg GPU util: {np.mean(job.gpu_trace):.3f}")
+        print(f"Expected run time: {job.expected_run_time:.2f}")
+        print()
+
-- 
GitLab


From fea99646a9836a3b2ce5a969b21f579f3998d0c0 Mon Sep 17 00:00:00 2001
From: Wes Brewer <brewerwh@ornl.gov>
Date: Fri, 24 Oct 2025 20:49:52 -0400
Subject: [PATCH 2/7] Add ref to Hao's SC25 paper

---
 raps/workloads/hpl.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/raps/workloads/hpl.py b/raps/workloads/hpl.py
index bb6f18c..31469ae 100644
--- a/raps/workloads/hpl.py
+++ b/raps/workloads/hpl.py
@@ -1,8 +1,16 @@
 """
+Hao Lu's analytical HPL model. Ref:
+
+    Lu et al., "Insights from Optimizing HPL Performance on Exascale Systems: 
+    A Comparative Analysis of Panel Factorization", in SC'25 Proceedings.
+
 Test using:
 
     python main.py run -w hpl -d
+
+or:
     python raps/workloads/hpl.py    
+
 """
 from raps.job import Job, job_dict
 import numpy as np
@@ -101,6 +109,7 @@ class HPL:
 
         return {"T_total": total_T, "gpu_util": gpu_util, "cpu_util": cpu_util}
 
+
 if __name__ == "__main__":
     import json
     import numpy as np
@@ -137,4 +146,3 @@ if __name__ == "__main__":
         print(f"Avg GPU util: {np.mean(job.gpu_trace):.3f}")
         print(f"Expected run time: {job.expected_run_time:.2f}")
         print()
-
-- 
GitLab


From 4c539b51fa95ca5b32d676d0e321dea33b95c053 Mon Sep 17 00:00:00 2001
From: Matthias Maiterth <maiterthm@ornl.gov>
Date: Tue, 28 Oct 2025 11:08:37 -0400
Subject: [PATCH 3/7] Fixed old remnants of running_time. --> jobs have
 current_run_time

---
 raps/engine.py       | 12 ++++++------
 raps/job.py          |  6 +++---
 raps/network/base.py | 18 +++++++++++-------
 raps/power.py        |  3 +--
 raps/ui.py           |  6 +++---
 raps/utils.py        |  5 +++--
 6 files changed, 27 insertions(+), 23 deletions(-)

diff --git a/raps/engine.py b/raps/engine.py
index ac7e8c7..d502e45 100644
--- a/raps/engine.py
+++ b/raps/engine.py
@@ -499,7 +499,7 @@ class Engine:
         # update Running time
         for job in self.running:
             if job.current_state == JobState.RUNNING:
-                job.running_time = self.current_timestep - job.start_time
+                job.current_run_time = self.current_timestep - job.start_time
 
         # Stop the simulation if no more jobs are running or in the queue or in the job list.
         if autoshutdown and \
@@ -552,7 +552,7 @@ class Engine:
 
         for job in self.running:
 
-            job.running_time = self.current_timestep - job.start_time
+            job.current_run_time = self.current_timestep - job.start_time
 
             if job.current_state != JobState.RUNNING:
                 raise ValueError(
@@ -561,15 +561,15 @@ class Engine:
                 )
             else:  # if job.state == JobState.RUNNING:
                 # Error checks
-                if not replay and job.running_time > job.time_limit and job.end_time is not None:
+                if not replay and job.current_run_time > job.time_limit and job.end_time is not None:
                     raise Exception(f"Job exceded time limit! "
-                                    f"{job.running_time} > {job.time_limit}"
+                                    f"{job.current_run_time} > {job.time_limit}"
                                     f"\n{job}"
                                     f"\nCurrent timestep:{self.current_timestep - self.timestep_start} (rel)"
                                     )
-                if replay and job.running_time > job.expected_run_time:
+                if replay and job.current_run_time > job.expected_run_time:
                     raise Exception(f"Job should have ended in replay! "
-                                    f" {job.running_time} > {job.expected_run_time}"
+                                    f" {job.current_run_time} > {job.expected_run_time}"
                                     f"\n{job}"
                                     f"\nCurrent timestep:{self.current_timestep - self.timestep_start} (rel)"
                                     )
diff --git a/raps/job.py b/raps/job.py
index 05c455e..4d1dda0 100644
--- a/raps/job.py
+++ b/raps/job.py
@@ -180,7 +180,7 @@ class Job:
         self.trace_start_time = None  # Relative start time of the trace (to running time)
         self.trace_end_time = None    # Relative end time of the trace
         self.trace_quanta = None  # Trace quanta associated with the job # None means single value!
-        self.running_time = 0     # Current running time updated when simulating
+        self.current_run_time = 0     # Current running time updated when simulating
 
         # If a job dict was given, override the values from the job_dict:
         for key, value in job_dict.items():
@@ -232,7 +232,7 @@ class Job:
                 f"trace_start_time={self.trace_start_time}, "
                 f"trace_end_time={self.trace_end_time}, "
                 f"trace_quanta={self.trace_quanta}, "
-                f"running_time={self.running_time}, "
+                f"current_run_time={self.current_run_time}, "
                 f"power={self.power}, "
                 f"power_history={self.power_history})")
 
@@ -296,7 +296,7 @@ class JobStatistics:
         self.account = job.account
         self.num_nodes = len(job.scheduled_nodes)
         self.scheduled_nodes = job.scheduled_nodes
-        self.run_time = job.running_time
+        self.run_time = job.current_run_time
         self.submit_time = job.submit_time
         self.start_time = job.start_time
         self.end_time = job.end_time
diff --git a/raps/network/base.py b/raps/network/base.py
index bab2ec8..3f3daeb 100644
--- a/raps/network/base.py
+++ b/raps/network/base.py
@@ -4,6 +4,7 @@ from raps.utils import get_current_utilization
 from raps.network.fat_tree import node_id_to_host_name
 from raps.network.torus3d import link_loads_for_job_torus, torus_host_from_real_index
 
+
 def debug_print_trace(job, label: str = ""):
     """Print either the length (if iterable) or the value of job.gpu_trace."""
     if hasattr(job.gpu_trace, "__len__"):
@@ -138,6 +139,7 @@ def worst_link_util(loads, throughput):
             max_util = util
     return max_util
 
+
 def get_link_util_stats(loads, throughput, top_n=10):
     """
     Calculates a distribution of link utilization stats.
@@ -148,9 +150,9 @@ def get_link_util_stats(loads, throughput, top_n=10):
 
     # Calculate utilization for every link
     utilizations = {(edge): (byte_load * 8) / throughput for edge, byte_load in loads.items()}
-    
+
     util_values = list(utilizations.values())
-    
+
     stats = {
         'max': np.max(util_values),
         'mean': np.mean(util_values),
@@ -161,14 +163,16 @@ def get_link_util_stats(loads, throughput, top_n=10):
     # Get top N congested links
     sorted_links = sorted(utilizations.items(), key=lambda item: item[1], reverse=True)
     stats['top_links'] = sorted_links[:top_n]
-    
+
     return stats
 
+
 def max_throughput_per_tick(legacy_cfg: dict, trace_quanta: int) -> float:
     """Return bytes-per-tick throughput of a single link."""
     bw = legacy_cfg.get("NETWORK_MAX_BW") or 12.5e9
     return float(bw) * trace_quanta
 
+
 def simulate_inter_job_congestion(network_model, jobs, legacy_cfg, debug=False):
     """
     Simulates network congestion from a list of concurrently running jobs.
@@ -181,8 +185,8 @@ def simulate_inter_job_congestion(network_model, jobs, legacy_cfg, debug=False):
     trace_quanta = jobs[0].trace_quanta if jobs else 0
 
     for job in jobs:
-        # Assuming job.running_time is 0 for this static simulation
-        job.running_time = 0
+        # Assuming job.current_run_time is 0 for this static simulation
+        job.current_run_time = 0
         job.trace_start_time = 0
         net_tx = get_current_utilization(job.ntx_trace, job)
 
@@ -193,7 +197,7 @@ def simulate_inter_job_congestion(network_model, jobs, legacy_cfg, debug=False):
                 host_list = [node_id_to_host_name(n, k) for n in job.scheduled_nodes]
             else:  # dragonfly
                 host_list = [network_model.real_to_fat_idx[real_n] for real_n in job.scheduled_nodes]
-            
+
             job_loads = link_loads_for_job(network_model.net_graph, host_list, net_tx)
 
         elif network_model.topology == "torus3d":
@@ -214,5 +218,5 @@ def simulate_inter_job_congestion(network_model, jobs, legacy_cfg, debug=False):
 
     max_throughput = max_throughput_per_tick(legacy_cfg, trace_quanta)
     net_stats = get_link_util_stats(total_loads, max_throughput)
-    
+
     return net_stats
diff --git a/raps/power.py b/raps/power.py
index dd0745b..b1e6c9d 100644
--- a/raps/power.py
+++ b/raps/power.py
@@ -55,7 +55,7 @@ def compute_node_power(cpu_util, gpu_util, net_util, config):
     power_gpu = gpu_util * config['POWER_GPU_MAX'] + \
         (config['GPUS_PER_NODE'] - gpu_util) * config['POWER_GPU_IDLE']
 
-    if config.get("POWER_NIC_IDLE") != None and config.get("POWER_NIC_MAX") != None:
+    if config.get("POWER_NIC_IDLE") is not None and config.get("POWER_NIC_MAX") is not None:
         power_nic = config['POWER_NIC_IDLE'] + \
             (config['POWER_NIC_MAX'] - config['POWER_NIC_IDLE']) * net_util
     else:
@@ -432,7 +432,6 @@ class PowerManager:
         jobs_power = self.update_power_state(scheduled_nodes, cpu_utils, gpu_utils, net_utils)
 
         for i, job in enumerate(running_jobs):
-            # if job.running_time % self.config['TRACE_QUANTA'] == 0:
             job.power_history.append(jobs_power[i] * len(job.scheduled_nodes))
 
         # Update the power array UI component
diff --git a/raps/ui.py b/raps/ui.py
index 6330bc9..03ca136 100644
--- a/raps/ui.py
+++ b/raps/ui.py
@@ -191,10 +191,10 @@ class LayoutManager:
                 nodes_display = col_nodelist
 
             if self.engine.downscale != 1:
-                running_time_str = convert_seconds_to_hhmmss(job.running_time // self.engine.downscale) + \
-                    f" +{job.running_time % self.engine.downscale}/{self.engine.downscale}s"
+                running_time_str = convert_seconds_to_hhmmss(job.current_run_time // self.engine.downscale) + \
+                    f" +{job.current_run_time % self.engine.downscale}/{self.engine.downscale}s"
             else:
-                running_time_str = convert_seconds_to_hhmm(job.running_time)
+                running_time_str = convert_seconds_to_hhmm(job.current_run_time)
 
             row = [
                 str(job.id).zfill(5),
diff --git a/raps/utils.py b/raps/utils.py
index e232bce..d98be2a 100644
--- a/raps/utils.py
+++ b/raps/utils.py
@@ -640,7 +640,7 @@ def get_current_utilization(trace, job: Job):
     if not job.trace_quanta:
         raise ValueError("job.trace_quanta is not set; cannot compute utilization.")
 
-    time_quanta_index = int((job.running_time - job.trace_start_time) // job.trace_quanta)
+    time_quanta_index = int((job.current_run_time - job.trace_start_time) // job.trace_quanta)
     if time_quanta_index < 0:
         time_quanta_index = 0
 
@@ -700,6 +700,7 @@ def validate_resolved_path(path: str | Path, info: ValidationInfo):
             raise ValueError(f"{path} is not under {base_path}")
     return path
 
+
 ResolvedPath = A[Path, AfterValidator(validate_resolved_path)]
 """
 Resolve a path, and expand ~ in the path string.
@@ -829,7 +830,7 @@ def read_yaml(config_file: str | None) -> dict:
     return result
 
 
-def read_yaml_parsed(cls: type[T], config_file = None) -> dict:
+def read_yaml_parsed(cls: type[T], config_file=None) -> dict:
     """
     Like read_yaml, but parses the input to resolve paths etc.
     Exits on error after printing message (for use in the CLI)
-- 
GitLab


From 81070941ea676b9d77cf61621e47ff2a7c678828 Mon Sep 17 00:00:00 2001
From: Matthias Maiterth <maiterthm@ornl.gov>
Date: Tue, 28 Oct 2025 11:10:36 -0400
Subject: [PATCH 4/7] Scheduler stats displaying seconds again.

---
 raps/ui.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/raps/ui.py b/raps/ui.py
index 03ca136..cee033a 100644
--- a/raps/ui.py
+++ b/raps/ui.py
@@ -269,13 +269,13 @@ class LayoutManager:
         # Add data row with white values
         time_in_s = time // self.engine.downscale
         if (time_in_s < 946684800):  # Introducing Y2K into our codebase! Kek
-            time_str = convert_seconds_to_hhmm(time_in_s)
+            time_str = convert_seconds_to_hhmmss(time_in_s)
         else:
             # For the curious: If the simulation time in seconds is large than
             # unix timestamp for Jan 2000 this is a unix timestamp,
             time_str = f"{datetime.fromtimestamp(time_in_s).strftime('%Y-%m-%d %H:%M')}"
         if timestep_start != 0:  # append time simulated
-            time_str += f"\nSim: {convert_seconds_to_hhmm(time_in_s - timestep_start)}"
+            time_str += f"\nSim: {convert_seconds_to_hhmmss(time_in_s - timestep_start)}"
 
         row.append(time_str)
         row.append(str(nrun))
-- 
GitLab


From b633cf89869c8ef3341c1b887ad91befde850bd3 Mon Sep 17 00:00:00 2001
From: Wes Brewer <brewerwh@ornl.gov>
Date: Tue, 28 Oct 2025 11:29:06 -0400
Subject: [PATCH 5/7] Add more HPL test cases

---
 raps/workloads/hpl.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/raps/workloads/hpl.py b/raps/workloads/hpl.py
index 31469ae..e65b4d5 100644
--- a/raps/workloads/hpl.py
+++ b/raps/workloads/hpl.py
@@ -27,9 +27,9 @@ class HPL:
         jobs = []
         # Example: parameter sweep across node counts or block sizes
         hpl_tests = [
-            #{"M": 131072, "b": 576, "P": 192, "Q": 384, "Rtype": "1-ring"},
-            #{"M": 131072, "b": 576, "P": 16, "Q": 32, "Rtype": "1-ring"},
-            {"M": 741455, "b": 576, "P": 16, "Q": 32, "Rtype": "1-ring"},
+            {"M": 1482910, "b": 576, "P": 16, "Q": 32, "Rtype": "1-ring"},
+            {"M": 2965820, "b": 576, "P": 32, "Q": 32, "Rtype": "1-ring"},
+            {"M": 16777216, "b": 576, "P": 192, "Q": 192, "Rtype": "1-ring"},
         ]
 
         #GCDS_PER_GPU = 2
-- 
GitLab


From efe3dff3c7e2aaff0b033c8e0a0ff1af2d31e246 Mon Sep 17 00:00:00 2001
From: Matthias Maiterth <maiterthm@ornl.gov>
Date: Tue, 28 Oct 2025 13:01:36 -0400
Subject: [PATCH 6/7] Adjusted gpu_util to consider node utilization + linter
 fixes

---
 raps/workloads/hpl.py | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/raps/workloads/hpl.py b/raps/workloads/hpl.py
index e65b4d5..7378a44 100644
--- a/raps/workloads/hpl.py
+++ b/raps/workloads/hpl.py
@@ -1,7 +1,7 @@
 """
 Hao Lu's analytical HPL model. Ref:
 
-    Lu et al., "Insights from Optimizing HPL Performance on Exascale Systems: 
+    Lu et al., "Insights from Optimizing HPL Performance on Exascale Systems:
     A Comparative Analysis of Panel Factorization", in SC'25 Proceedings.
 
 Test using:
@@ -9,12 +9,12 @@ Test using:
     python main.py run -w hpl -d
 
 or:
-    python raps/workloads/hpl.py    
+    python raps/workloads/hpl.py
 
 """
 from raps.job import Job, job_dict
 import numpy as np
-import math, random, json
+import math
 
 
 class HPL:
@@ -32,7 +32,7 @@ class HPL:
             {"M": 16777216, "b": 576, "P": 192, "Q": 192, "Rtype": "1-ring"},
         ]
 
-        #GCDS_PER_GPU = 2
+        # GCDS_PER_GPU = 2
 
         for test in hpl_tests:
             for partition in self.partitions:
@@ -43,7 +43,7 @@ class HPL:
                 results = self._run_hpl_model(**test)
 
                 total_time = results["T_total"]
-                gpu_util = results["gpu_util"]
+                gpu_util = self.config_map[self.args.system]['GPUS_PER_NODE'] * results["gpu_util"]
                 cpu_util = results["cpu_util"]
 
                 num_samples = math.ceil(total_time / trace_quanta) + 1
@@ -51,7 +51,7 @@ class HPL:
                 cpu_trace = np.full(num_samples, cpu_util)
 
                 job_info = job_dict(
-                    #nodes_required=test["P"] * test["Q"] // (cfg["GPUS_PER_NODE"] * GCDS_PER_GPU),
+                    # nodes_required=test["P"] * test["Q"] // (cfg["GPUS_PER_NODE"] * GCDS_PER_GPU),
                     nodes_required=test["P"] * test["Q"] // cfg["GPUS_PER_NODE"],
                     scheduled_nodes=[],
                     name=f"HPL_{test['M']}x{test['M']}",
@@ -111,8 +111,6 @@ class HPL:
 
 
 if __name__ == "__main__":
-    import json
-    import numpy as np
 
     # Mock minimal configuration values to mimic ExaDigiT runtime
     class DummyHPL(HPL):
-- 
GitLab


From 7162992814a8e784d554bc023cbe1e6ecaf19bda Mon Sep 17 00:00:00 2001
From: Wes Brewer <brewerwh@ornl.gov>
Date: Tue, 28 Oct 2025 17:10:25 -0400
Subject: [PATCH 7/7] Update HPL to call Hao's model for each iteration

---
 raps/workloads/hpl.py | 225 +++++++++++++++++++++++++++++-------------
 1 file changed, 154 insertions(+), 71 deletions(-)

diff --git a/raps/workloads/hpl.py b/raps/workloads/hpl.py
index 7378a44..e338061 100644
--- a/raps/workloads/hpl.py
+++ b/raps/workloads/hpl.py
@@ -1,64 +1,66 @@
 """
-Hao Lu's analytical HPL model. Ref:
-
-    Lu et al., "Insights from Optimizing HPL Performance on Exascale Systems:
-    A Comparative Analysis of Panel Factorization", in SC'25 Proceedings.
-
-Test using:
+Hao Lu’s analytical HPL model adapter for ExaDigiT.
 
+Usage:
     python main.py run -w hpl -d
-
 or:
     python raps/workloads/hpl.py
-
 """
+
 from raps.job import Job, job_dict
 import numpy as np
 import math
 
 
 class HPL:
-    """Analytical HPL workload generator for ExaDigiT"""
+    """Analytical HPL workload generator for ExaDigiT."""
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
 
+    # -------------------------------------------------------------------------
+    # Public entry
+    # -------------------------------------------------------------------------
     def hpl(self, **kwargs):
         jobs = []
-        # Example: parameter sweep across node counts or block sizes
+
+        # You can add more scenarios; comment out big ones while testing.
         hpl_tests = [
-            {"M": 1482910, "b": 576, "P": 16, "Q": 32, "Rtype": "1-ring"},
-            {"M": 2965820, "b": 576, "P": 32, "Q": 32, "Rtype": "1-ring"},
-            {"M": 16777216, "b": 576, "P": 192, "Q": 192, "Rtype": "1-ring"},
+            # Smaller grid (quick sanity check)
+            {"M": 2_097_152, "b": 576, "P": 16, "Q": 32, "Rtype": "1-ring", "f": 0.6},
+            # Frontier-scale shape (~15k panel iterations; comment out for quick runs)
+            {"M": 8_900_000, "b": 576, "P": 192, "Q": 384, "Rtype": "1-ring", "f": 0.6},
         ]
 
-        # GCDS_PER_GPU = 2
-
         for test in hpl_tests:
             for partition in self.partitions:
                 cfg = self.config_map[partition]
                 trace_quanta = cfg["TRACE_QUANTA"]
 
-                # --- Analytical model evaluation ---
-                results = self._run_hpl_model(**test)
+                # Per-iteration timings (already concurrency-aware)
+                iterations = self._run_hpl_model(**test)
 
-                total_time = results["T_total"]
-                gpu_util = self.config_map[self.args.system]['GPUS_PER_NODE'] * results["gpu_util"]
-                cpu_util = results["cpu_util"]
+                # Convert iteration timings to sampled traces on TRACE_QUANTA grid
+                gpu_trace, cpu_trace = self._emit_traces_from_iters(
+                    iterations, trace_quanta, cfg
+                )
+                total_time = len(gpu_trace) * trace_quanta
 
-                num_samples = math.ceil(total_time / trace_quanta) + 1
-                gpu_trace = np.full(num_samples, gpu_util)
-                cpu_trace = np.full(num_samples, cpu_util)
+                # Node count: ranks / (GPUs_per_node * GCDs_per_GPU)
+                gpus = cfg["GPUS_PER_NODE"]
+                gcds = cfg.get("GCDS_PER_GPU", 2)  # Frontier MI250X default: 2
+                ranks = test["P"] * test["Q"]
+                nodes_required = max(1, ranks // (gpus * gcds))
 
                 job_info = job_dict(
-                    # nodes_required=test["P"] * test["Q"] // (cfg["GPUS_PER_NODE"] * GCDS_PER_GPU),
-                    nodes_required=test["P"] * test["Q"] // cfg["GPUS_PER_NODE"],
+                    nodes_required=nodes_required,
                     scheduled_nodes=[],
-                    name=f"HPL_{test['M']}x{test['M']}",
+                    name=f"HPL_{test['M']}x{test['M']}_P{test['P']}Q{test['Q']}",
                     account="benchmark",
                     cpu_trace=cpu_trace,
                     gpu_trace=gpu_trace,
-                    ntx_trace=[], nrx_trace=[],
+                    ntx_trace=[],
+                    nrx_trace=[],
                     id=None,
                     end_state="COMPLETED",
                     priority=100,
@@ -73,74 +75,155 @@ class HPL:
                     trace_end_time=total_time,
                 )
                 jobs.append(Job(job_info))
+
         return jobs
 
+    # -------------------------------------------------------------------------
+    # Analytical per-iteration model (concurrency-aware)
+    # -------------------------------------------------------------------------
     def _run_hpl_model(self, M, b, P, Q, Rtype="1-ring", f=0.6):
-        # constants (Table II + Fig 2b)
-        CAllgather = 6.3e9
-        C1ring = 7e9
-        Creduce = 46e6
-        Fcpublas = 240e9
-        Fgemm = 24e12
+        """
+        Returns a list of dicts, one per iteration:
+        {
+            "T_iter": <iteration wall time (s)>,
+            "gpu_active": <seconds in iteration attributable to GPU UPDATE>,
+            "cpu_active": <seconds in iteration attributable to CPU PDFACT>,
+            "net_active": <seconds in iteration attributable to collectives>,
+        }
+
+        Concurrency-aware scaling (intended; the divisors below are commented out):
+          - UPDATE (DGEMM) work is distributed over the full P*Q ranks  → would divide by (P*Q)
+          - PDFACT/LBCAST/RS* progress along process columns (Q)         → would divide by Q
+        Until that is enabled, the per-rank component times are used directly as wall-time.
+        """
+        # Effective per-rank throughputs/bandwidths (empirical constants)
+        CAllgather = 6.3e9     # bytes/s
+        C1ring     = 7.0e9     # bytes/s
+        Creduce    = 46e6      # bytes/s
+        Fcpublas   = 240e9     # FLOP/s
+        Fgemm      = 24e12     # FLOP/s
 
         Ml = M / P
         Nl = M / Q
         nb = int(M / b)
-        total_T = 0.0
+        iterations = []
 
-        print("*** nb:", nb)
         for i in range(nb):
             Ml_i = Ml - (i * b / P)
-            Nl1_i = max((1 - f) * Nl - i * b / Q, 0)
-            Nl2_i = f * Nl if i * b < f * Nl else Nl - i * b / Q
-
-            TPDFACT = b ** 2 / Creduce + (2 / 3) * b ** 2 * Ml_i / Fcpublas
-            TLBCAST = 16 * b * Ml_i / C1ring
-            TUPD1 = 2 * b * Ml_i * Nl1_i / Fgemm
-            TUPD2 = 2 * b * Ml_i * Nl2_i / Fgemm
-            TRS1 = 16 * b * Nl1_i / CAllgather
-            TRS2 = 16 * b * Nl2_i / CAllgather
-
-            total_T += max(TPDFACT + TLBCAST + TRS1, TUPD2) + max(TRS2, TUPD1)
-
-        # derive synthetic utilization
-        gpu_util = min(1.0, (Fgemm / 25e12))      # normalized ratio
-        cpu_util = min(1.0, (Fcpublas / 250e9))
-
-        return {"T_total": total_T, "gpu_util": gpu_util, "cpu_util": cpu_util}
-
-
+            if Ml_i <= 0:
+                break
+
+            # Local column partition sizes (A = [A1 | A2]), f is the split ratio
+            Nl1_i = max((1.0 - f) * Nl - (i * b / Q), 0.0)
+            Nl2_i = (f * Nl) if (i * b) < (f * Nl) else max(Nl - (i * b / Q), 0.0)
+
+            # Component times (per-rank formulations)
+            # NOTE: units already account for bytes vs. elements (coeffs 16, 2/3, etc.)
+            TPDFACT_rank = (b**2) / Creduce + (2.0 / 3.0) * (b**2) * Ml_i / Fcpublas
+            TLBCAST_rank = 16.0 * b * Ml_i / C1ring
+            TUPD1_rank   = 2.0 * b * Ml_i * Nl1_i / Fgemm
+            TUPD2_rank   = 2.0 * b * Ml_i * Nl2_i / Fgemm
+            TRS1_rank    = 16.0 * b * Nl1_i / CAllgather
+            TRS2_rank    = 16.0 * b * Nl2_i / CAllgather
+
+            # Concurrency scaling is disabled for now: the divisors are commented out,
+            # so each rank-local time is used unchanged as its wall-time contribution.
+            TPDFACT = TPDFACT_rank #/ Q
+            TLBCAST = TLBCAST_rank #/ Q
+            TRS1    = TRS1_rank #/ Q
+            TRS2    = TRS2_rank #/ Q
+            TUPD1   = TUPD1_rank #/ (P * Q)
+            TUPD2   = TUPD2_rank #/ (P * Q)
+
+            # Two pipeline stages per iteration (HPL)
+            stage1 = max(TPDFACT + TLBCAST + TRS1, TUPD2)
+            stage2 = max(TRS2, TUPD1)
+            T_iter = stage1 + stage2
+
+            # Attribute activity (for utilization duty fractions)
+            gpu_active = max(TUPD1, TUPD2)
+            cpu_active = TPDFACT
+            net_active = TLBCAST + TRS1 + TRS2
+
+            iterations.append(
+                dict(
+                    T_iter=T_iter,
+                    gpu_active=gpu_active,
+                    cpu_active=cpu_active,
+                    net_active=net_active,
+                )
+            )
+
+        return iterations
+
+    def _emit_traces_from_iters(self, iterations, trace_quanta, cfg):
+        """Resample per-iteration utilization onto a fixed trace_quanta grid.
+
+        Each sample is the time-weighted mean over one trace_quanta window; an
+        iteration that straddles a window boundary is split across the windows
+        so no busy time is dropped. A trailing partial window becomes one last
+        sample. Returns (gpu_trace, cpu_trace); GPU values scale with GPUS_PER_NODE.
+        """
+        gpn = cfg["GPUS_PER_NODE"]
+        gpu_trace, cpu_trace = [], []
+        acc_time = acc_gpu = acc_cpu = 0.0
+
+        for it in iterations:
+            remaining = it["T_iter"]
+            if remaining <= 0:
+                continue
+            total_act = it["gpu_active"] + it["cpu_active"] + it["net_active"]
+            compute_ratio = it["gpu_active"] / total_act if total_act > 0 else 0.0
+            cpu_ratio = it["cpu_active"] / total_act if total_act > 0 else 0.0
+            fg = gpn * (0.8 + 0.2 * compute_ratio)  # heuristic GPU duty, in GPUs/node
+            fc = 0.6 + 0.3 * cpu_ratio  # heuristic CPU duty
+            while remaining > 0:
+                dt = min(remaining, trace_quanta - acc_time)  # time left in window
+                acc_time += dt
+                acc_gpu += fg * dt
+                acc_cpu += fc * dt
+                remaining -= dt
+                if acc_time >= trace_quanta:  # window full: emit and start afresh
+                    gpu_trace.append(acc_gpu / acc_time)
+                    cpu_trace.append(acc_cpu / acc_time)
+                    acc_time = acc_gpu = acc_cpu = 0.0
+        if acc_time > 0:  # flush the trailing partial window
+            gpu_trace.append(acc_gpu / acc_time)
+            cpu_trace.append(acc_cpu / acc_time)
+        return np.array(gpu_trace), np.array(cpu_trace)
+
+# -----------------------------------------------------------------------------
+# Stand-alone test
+# -----------------------------------------------------------------------------
 if __name__ == "__main__":
 
-    # Mock minimal configuration values to mimic ExaDigiT runtime
     class DummyHPL(HPL):
         def __init__(self):
-            # Provide fake partitions and system config
             self.partitions = ["gpu"]
             self.config_map = {
                 "gpu": {
-                    "TRACE_QUANTA": 15.0,      # seconds per trace tick
-                    "GPUS_PER_NODE": 4,
+                    "TRACE_QUANTA": 15.0,   # seconds/sample
+                    "GPUS_PER_NODE": 4,     # Frontier physical GPUs/node
+                    "GCDS_PER_GPU": 2,      # MI250X logical ranks/GPU
                     "CPUS_PER_NODE": 64,
                 }
             }
 
-    # Instantiate dummy workload
-    workload = DummyHPL()
-
-    # Run synthetic job generation
-    jobs = workload.hpl()
+    hpl = DummyHPL()
+    jobs = hpl.hpl()
 
-    print(f"Generated {len(jobs)} HPL jobs:\n")
+    print(f"Generated {len(jobs)} HPL job(s)\n")
     for i, job in enumerate(jobs):
-        print(i, job)
         print(f"--- Job {i} ---")
         print(f"Name: {job.name}")
         print(f"Nodes required: {job.nodes_required}")
-        print(f"Wall time: {job.trace_time:.2f} s")
-        print(f"CPU trace length: {len(job.cpu_trace)}")
-        print(f"GPU trace length: {len(job.gpu_trace)}")
-        print(f"Avg CPU util: {np.mean(job.cpu_trace):.3f}")
-        print(f"Avg GPU util: {np.mean(job.gpu_trace):.3f}")
-        print(f"Expected run time: {job.expected_run_time:.2f}")
+        print(f"Wall time: {job.trace_time:.1f}s")
+        print(f"Trace samples: {len(job.gpu_trace)}")
+        print(f"Avg GPU util: {np.mean(job.gpu_trace):.2f} (0..{hpl.config_map['gpu']['GPUS_PER_NODE']})")
+        print(f"Avg CPU util: {np.mean(job.cpu_trace):.2f} (0..1)")
+        # Peek at starts/ends
+        print("GPU head:", np.round(job.gpu_trace[:8], 3))
+        print("GPU tail:", np.round(job.gpu_trace[-8:], 3))
+        print("CPU head:", np.round(job.cpu_trace[:8], 3))
+        print("CPU tail:", np.round(job.cpu_trace[-8:], 3))
         print()
-- 
GitLab