Loading README.md +15 −5 Original line number Diff line number Diff line Loading @@ -29,7 +29,7 @@ Note: Requires python3.12 or greater. # Frontier DATEDIR="date=2024-01-18" DPATH=~/data/frontier-sample-2024-01-18 DPATH=/opt/data/frontier raps run -f $DPATH/slurm/joblive/$DATEDIR,$DPATH/jobprofile/$DATEDIR ## Open Telemetry dataset Loading @@ -37,7 +37,7 @@ Note: Requires python3.12 or greater. For Marconi supercomputer, download `job_table.parquet` from https://zenodo.org/records/10127767 # Marconi100 raps run --system marconi100 -f ~/data/marconi100/job_table.parquet raps run --system marconi100 -f /opt/data/marconi100/job_table.parquet For Adastra MI250 supercomputer, download 'AdastaJobsMI250_15days.parquet' from https://zenodo.org/records/14007065 Loading @@ -46,10 +46,10 @@ For Adastra MI250 supercomputer, download 'AdastaJobsMI250_15days.parquet' from For Google cluster trace v2 raps run --system gcloudv2 -f ~/data/gcloud/v2/google_cluster_data_2011_sample --start '2011-05-02T00:10:00Z' raps run --system gcloudv2 -f /opt/data/gcloud/v2/google_cluster_data_2011_sample --start '2011-05-02T00:10:00Z' # analyze dataset raps telemetry --system gcloudv2 -f ~/data/gcloud/v2/google_cluster_data_2011_sample -v raps telemetry --system gcloudv2 -f /opt/data/gcloud/v2/google_cluster_data_2011_sample -v For MIT Supercloud Loading Loading @@ -95,7 +95,17 @@ For Lumi Lassen is one of the few datasets that has networking data. See `raps/dataloaders/lassen.py` for how to get the datasets. 
To run a network simulation, use the following command: raps run -f ~/data/lassen/Lassen-Supercomputer-Job-Dataset --system lassen --policy fcfs --backfill firstfit --start '2019-08-22T00:00:00+00:00' -t 12h --arrival poisson --net raps run -f /opt/data/lassen/Lassen-Supercomputer-Job-Dataset --system lassen --policy fcfs --backfill firstfit --start '2019-08-22T00:00:00+00:00' -t 12h --arrival poisson --net To simulate synthetic network tests: raps run --system lassen -w network_test --net -t 15m raps run --system lassen -w inter_job_congestion --net -t 15m Run network congestion tests outside of RAPS: python scripts/run_inter_job_congestion.py --config config/lassen.yaml -v ## Snapshot of extracted workload data Loading config/lassen.yaml +8 −2 Original line number Diff line number Diff line Loading @@ -120,9 +120,15 @@ cooling: w_cts_key: "simulator[1].centralEnergyPlant[1].coolingTowerLoop[1].summary.W_flow_CT_kW" network: topology: fat-tree network_max_bw: 1000000000.0 fattree_k: 16 network_max_bw: 12.5e9 fattree_k: 32 dragonfly_d: 11 dragonfly_a: 9 dragonfly_p: 8 latency: 1 torus_x: 17 torus_y: 17 torus_z: 8 torus_wrap: true hosts_per_router: 2 torus_routing: DOR_XYZ raps/engine.py +16 −2 Original line number Diff line number Diff line Loading @@ -30,7 +30,8 @@ from raps.power import ( from raps.network import ( NetworkModel, apply_job_slowdown, compute_system_network_stats compute_system_network_stats, simulate_inter_job_congestion ) from raps.telemetry import Telemetry from raps.cooling import ThermoFluidsModel Loading Loading @@ -292,6 +293,7 @@ class Engine: self.avg_net_tx = [] self.avg_net_rx = [] self.net_util_history = [] self.net_congestion_history = [] self.avg_slowdown_history = [] self.max_slowdown_history = [] self.node_occupancy_history = [] Loading Loading @@ -328,7 +330,7 @@ class Engine: available_nodes = self.resource_manager.available_nodes self.network_model = NetworkModel( available_nodes=available_nodes, config=self.config, 
config=self.config ) else: self.network_model = None Loading Loading @@ -621,6 +623,18 @@ class Engine: system_util = self.num_active_nodes / self.config['AVAILABLE_NODES'] * 100 self.record_util_stats(system_util=system_util) # --- Inter-Job Network Congestion --- if self.simulate_network and self.network_model and self.running: congestion_stats = simulate_inter_job_congestion( self.network_model, self.running, self.config, self.debug ) if isinstance(congestion_stats, dict): total_congestion = congestion_stats['mean'] else: total_congestion = congestion_stats self.net_congestion_history.append((self.current_timestep, total_congestion)) # --- # System Power if self.power_manager: # Power is always simulated power_df, rack_power, total_power_kw, total_loss_kw, jobs_power = \ Loading raps/network/__init__.py +70 −15 Original line number Diff line number Diff line import os import warnings from .base import ( all_to_all_paths, apply_job_slowdown, Loading @@ -7,11 +10,16 @@ from .base import ( network_slowdown, network_utilization, worst_link_util, get_link_util_stats, simulate_inter_job_congestion, max_throughput_per_tick, ) from .fat_tree import build_fattree, node_id_to_host_name from .torus3d import build_torus3d, link_loads_for_job_torus from .dragonfly import build_dragonfly, dragonfly_node_id_to_host_name from .fat_tree import build_fattree, node_id_to_host_name, subsample_hosts from .torus3d import build_torus3d, link_loads_for_job_torus, torus_host_from_real_index from .dragonfly import build_dragonfly, dragonfly_node_id_to_host_name, build_dragonfly_idx_map from raps.plotting import plot_fattree_hierarchy, plot_dragonfly, plot_torus2d, plot_torus3d from raps.utils import get_current_utilization __all__ = [ Loading @@ -28,6 +36,9 @@ __all__ = [ "build_torus3d", "build_dragonfly", "dragonfly_node_id_to_host_name", "simulate_inter_job_congestion", "max_throughput_per_tick", "get_link_util_stats", ] Loading @@ -39,8 +50,11 @@ class NetworkModel: 
self.real_to_fat_idx = kwargs.get("real_to_fat_idx", {}) if self.topology == "fat-tree": total_nodes = config['TOTAL_NODES'] - len(config['DOWN_NODES']) self.fattree_k = config.get("FATTREE_K") self.net_graph = build_fattree(self.fattree_k) self.net_graph = build_fattree(self.fattree_k, total_nodes) # TODO: future testing of subsampling feature #self.net_graph = subsample_hosts(self.net_graph, num_hosts=4626) elif self.topology == "torus3d": dims = ( Loading @@ -67,11 +81,22 @@ class NetworkModel: nid += 1 elif self.topology == "dragonfly": self.net_graph = build_dragonfly( int(config["DRAGONFLY_D"]), int(config["DRAGONFLY_A"]), int(config.get("DRAGONFLY_P", 1)) ) D = self.config["DRAGONFLY_D"] A = self.config["DRAGONFLY_A"] P = self.config["DRAGONFLY_P"] self.net_graph = build_dragonfly(D, A, P) # total nodes seen by scheduler or job trace total_real_nodes = getattr(self, "available_nodes", None) if total_real_nodes is None: total_real_nodes = 4626 # fallback for Lassen # if available_nodes is a list, take its length if not isinstance(total_real_nodes, int): total_real_nodes = len(total_real_nodes) self.real_to_fat_idx = build_dragonfly_idx_map(D, A, P, total_real_nodes) print(f"[DEBUG] Dragonfly mapping: {len(self.real_to_fat_idx)} entries") elif self.topology == "capacity": # Capacity-only model: no explicit graph Loading Loading @@ -100,18 +125,28 @@ class NetworkModel: print(" fat-tree hosts:", host_list) elif self.topology == "dragonfly": D, A, P = self.config["DRAGONFLY_D"], self.config["DRAGONFLY_A"], self.config["DRAGONFLY_P"] host_list = [ dragonfly_node_id_to_host_name(self.real_to_fat_idx[real_n], D, A, P) for real_n in job.scheduled_nodes ] D = self.config["DRAGONFLY_D"] A = self.config["DRAGONFLY_A"] P = self.config["DRAGONFLY_P"] # Directly use mapped host names host_list = [self.real_to_fat_idx[real_n] for real_n in job.scheduled_nodes] if debug: print(" dragonfly hosts:", host_list) print("Example nodes in graph:", list(self.net_graph.nodes)[:10]) 
print("Contains h_0_9_0?", "h_0_9_0" in self.net_graph) loads = link_loads_for_job(self.net_graph, host_list, net_tx) net_cong = worst_link_util(loads, max_throughput) elif self.topology == "torus3d": host_list = [self.id_to_host[n] for n in job.scheduled_nodes] X = self.config["TORUS_X"] Y = self.config["TORUS_Y"] Z = self.config["TORUS_Z"] hosts_per_router = self.config["HOSTS_PER_ROUTER"] #host_list = [self.id_to_host[n] for n in job.scheduled_nodes] host_list = [ torus_host_from_real_index(n, X, Y, Z, hosts_per_router) for n in job.scheduled_nodes ] loads = link_loads_for_job_torus(self.net_graph, self.meta, host_list, net_tx) net_cong = worst_link_util(loads, max_throughput) if debug: Loading @@ -124,3 +159,23 @@ class NetworkModel: raise ValueError(f"Unsupported topology: {self.topology}") return net_util, net_cong, net_tx, net_rx, max_throughput def plot_topology(self, output_dir): """Plot network topology - save as png file in output_dir.""" if output_dir: if self.topology == "fat-tree": save_path = output_dir / "net-fat-tree.png" plot_fattree_hierarchy(self.net_graph, k=self.fattree_k, save_path=save_path) elif self.topology == "dragonfly": save_path = output_dir / "net-dragonfly.png" plot_dragonfly(self.net_graph, save_path=save_path) elif self.topology == "torus3d": save_path = output_dir / "net-torus2d.png" plot_torus2d(self.net_graph, save_path=save_path) save_path = output_dir / "net-torus3d.png" plot_torus3d(self.net_graph, save_path=save_path) else: warnings.warn( f"plotting not supported for {self.topology} topology", UserWarning ) raps/network/base.py +83 −1 Original line number Diff line number Diff line import networkx as nx import numpy as np from raps.utils import get_current_utilization from raps.network.fat_tree import node_id_to_host_name from raps.network.torus3d import link_loads_for_job_torus, torus_host_from_real_index def debug_print_trace(job, label: str = ""): """Print either the length (if iterable) or the value of job.gpu_trace.""" 
# [diff context, hunk @@ -134,3 +137,82 @@ -- definition elided by the diff view]
# def worst_link_util(loads, throughput): ... if util > max_util: max_util = util; return max_util


def _empty_link_stats():
    """Return the all-zero statistics dict used when there is nothing to measure."""
    return {'max': 0, 'mean': 0, 'min': 0, 'std_dev': 0, 'top_links': []}


def get_link_util_stats(loads, throughput, top_n=10):
    """Summarise the distribution of per-link utilisation.

    Args:
        loads: Mapping of link (edge key) to the byte load placed on that link.
        throughput: Per-link capacity used as the utilisation denominator.
        top_n: How many of the most utilised links to report.

    Returns:
        A dict with keys ``'max'``, ``'mean'``, ``'min'``, ``'std_dev'`` and
        ``'top_links'``; ``'top_links'`` is a list of ``(edge, utilisation)``
        pairs ordered from most to least utilised.  When ``loads`` is empty
        every statistic is 0 and ``'top_links'`` is empty.

    Raises:
        ValueError: If ``loads`` is non-empty and ``throughput`` is not
            positive (previously this surfaced as a bare ZeroDivisionError
            or as silent inf/nan values).
    """
    if not loads:
        return _empty_link_stats()
    if throughput <= 0:
        raise ValueError(
            f"throughput must be positive to compute link utilisation, got {throughput!r}"
        )

    # NOTE(review): the load is multiplied by 8 (presumably bytes -> bits)
    # while max_throughput_per_tick documents its result as bytes-per-tick;
    # confirm the intended units.
    utilizations = {
        edge: (byte_load * 8) / throughput for edge, byte_load in loads.items()
    }

    # Convert once instead of re-building an array for every statistic.
    values = np.fromiter(utilizations.values(), dtype=float, count=len(utilizations))
    stats = {
        'max': values.max(),
        'mean': values.mean(),
        'min': values.min(),
        'std_dev': values.std(),
    }

    # Most congested links first.
    ranked = sorted(utilizations.items(), key=lambda item: item[1], reverse=True)
    stats['top_links'] = ranked[:top_n]
    return stats


def max_throughput_per_tick(legacy_cfg: dict, trace_quanta: int) -> float:
    """Return bytes-per-tick throughput of a single link.

    Uses ``legacy_cfg["NETWORK_MAX_BW"]`` and falls back to 12.5e9 when the
    key is missing or falsy (``None`` or 0); the result is that bandwidth
    multiplied by ``trace_quanta``.
    """
    bw = legacy_cfg.get("NETWORK_MAX_BW") or 12.5e9
    return float(bw) * trace_quanta


def _static_net_tx(job):
    """Look up the job's network-tx value with its clock pinned to zero.

    The congestion model is a static snapshot, so the trace lookup is made
    with ``running_time`` and ``trace_start_time`` set to 0.  The job objects
    belong to the caller, so the previous values are restored afterwards
    instead of being left clobbered.
    """
    saved = {
        name: getattr(job, name)
        for name in ("running_time", "trace_start_time")
        if hasattr(job, name)
    }
    job.running_time = 0
    job.trace_start_time = 0
    try:
        return get_current_utilization(job.ntx_trace, job)
    finally:
        for name, value in saved.items():
            setattr(job, name, value)


def _job_link_loads(network_model, job, legacy_cfg, net_tx):
    """Return the per-link loads one job contributes on the model's topology.

    Topologies other than fat-tree, dragonfly and torus3d contribute nothing.
    """
    topology = network_model.topology
    if topology == "fat-tree":
        k = int(legacy_cfg.get("FATTREE_K", 32))
        host_list = [node_id_to_host_name(n, k) for n in job.scheduled_nodes]
        return link_loads_for_job(network_model.net_graph, host_list, net_tx)
    if topology == "dragonfly":
        host_list = [network_model.real_to_fat_idx[real_n] for real_n in job.scheduled_nodes]
        return link_loads_for_job(network_model.net_graph, host_list, net_tx)
    if topology == "torus3d":
        X = int(legacy_cfg.get("TORUS_X", 12))
        Y = int(legacy_cfg.get("TORUS_Y", 12))
        Z = int(legacy_cfg.get("TORUS_Z", 12))
        hosts_per_router = int(legacy_cfg.get("HOSTS_PER_ROUTER", 1))
        host_list = [
            torus_host_from_real_index(n, X, Y, Z, hosts_per_router)
            for n in job.scheduled_nodes
        ]
        return link_loads_for_job_torus(
            network_model.net_graph, network_model.meta, host_list, net_tx
        )
    return {}


def simulate_inter_job_congestion(network_model, jobs, legacy_cfg, debug=False):
    """Simulate network congestion from a list of concurrently running jobs.

    The link loads of every job are summed per (order-normalised) edge of
    ``network_model.net_graph`` and summarised with ``get_link_util_stats``.

    Args:
        network_model: Object exposing ``net_graph``, ``topology`` and, per
            topology, ``real_to_fat_idx`` (dragonfly) or ``meta`` (torus3d).
        jobs: Sequence of jobs; ``trace_quanta`` is taken from the first one.
        legacy_cfg: Config mapping read for topology and bandwidth settings.
        debug: Accepted for interface compatibility; currently unused.

    Returns:
        ``0.0`` when the model has no (or an empty) graph, otherwise the
        statistics dict produced by ``get_link_util_stats``.  With no jobs
        the all-zero statistics dict is returned.
    """
    if not network_model.net_graph:
        print("[WARN] Network graph is not defined. Skipping congestion simulation.")
        return 0.0

    if not jobs:
        # No traffic at all: avoid deriving a zero throughput from a missing
        # trace quanta and then dividing by it.
        return _empty_link_stats()

    total_loads = {tuple(sorted(edge)): 0.0 for edge in network_model.net_graph.edges()}
    trace_quanta = jobs[0].trace_quanta

    for job in jobs:
        net_tx = _static_net_tx(job)
        job_loads = _job_link_loads(network_model, job, legacy_cfg, net_tx)
        for edge, load in job_loads.items():
            edge_key = tuple(sorted(edge))
            # Only edges that exist in the graph are accumulated.
            if edge_key in total_loads:
                total_loads[edge_key] += load

    max_throughput = max_throughput_per_tick(legacy_cfg, trace_quanta)
    return get_link_util_stats(total_loads, max_throughput)
README.md +15 −5 Original line number Diff line number Diff line Loading @@ -29,7 +29,7 @@ Note: Requires python3.12 or greater. # Frontier DATEDIR="date=2024-01-18" DPATH=~/data/frontier-sample-2024-01-18 DPATH=/opt/data/frontier raps run -f $DPATH/slurm/joblive/$DATEDIR,$DPATH/jobprofile/$DATEDIR ## Open Telemetry dataset Loading @@ -37,7 +37,7 @@ Note: Requires python3.12 or greater. For Marconi supercomputer, download `job_table.parquet` from https://zenodo.org/records/10127767 # Marconi100 raps run --system marconi100 -f ~/data/marconi100/job_table.parquet raps run --system marconi100 -f /opt/data/marconi100/job_table.parquet For Adastra MI250 supercomputer, download 'AdastaJobsMI250_15days.parquet' from https://zenodo.org/records/14007065 Loading @@ -46,10 +46,10 @@ For Adastra MI250 supercomputer, download 'AdastaJobsMI250_15days.parquet' from For Google cluster trace v2 raps run --system gcloudv2 -f ~/data/gcloud/v2/google_cluster_data_2011_sample --start '2011-05-02T00:10:00Z' raps run --system gcloudv2 -f /opt/data/gcloud/v2/google_cluster_data_2011_sample --start '2011-05-02T00:10:00Z' # analyze dataset raps telemetry --system gcloudv2 -f ~/data/gcloud/v2/google_cluster_data_2011_sample -v raps telemetry --system gcloudv2 -f /opt/data/gcloud/v2/google_cluster_data_2011_sample -v For MIT Supercloud Loading Loading @@ -95,7 +95,17 @@ For Lumi Lassen is one of the few datasets that has networking data. See `raps/dataloaders/lassen.py` for how to get the datasets. 
To run a network simulation, use the following command: raps run -f ~/data/lassen/Lassen-Supercomputer-Job-Dataset --system lassen --policy fcfs --backfill firstfit --start '2019-08-22T00:00:00+00:00' -t 12h --arrival poisson --net raps run -f /opt/data/lassen/Lassen-Supercomputer-Job-Dataset --system lassen --policy fcfs --backfill firstfit --start '2019-08-22T00:00:00+00:00' -t 12h --arrival poisson --net To simulate synthetic network tests: raps run --system lassen -w network_test --net -t 15m raps run --system lassen -w inter_job_congestion --net -t 15m Run network congestion tests outside of RAPS: python scripts/run_inter_job_congestion.py --config config/lassen.yaml -v ## Snapshot of extracted workload data Loading
config/lassen.yaml +8 −2 Original line number Diff line number Diff line Loading @@ -120,9 +120,15 @@ cooling: w_cts_key: "simulator[1].centralEnergyPlant[1].coolingTowerLoop[1].summary.W_flow_CT_kW" network: topology: fat-tree network_max_bw: 1000000000.0 fattree_k: 16 network_max_bw: 12.5e9 fattree_k: 32 dragonfly_d: 11 dragonfly_a: 9 dragonfly_p: 8 latency: 1 torus_x: 17 torus_y: 17 torus_z: 8 torus_wrap: true hosts_per_router: 2 torus_routing: DOR_XYZ
raps/engine.py +16 −2 Original line number Diff line number Diff line Loading @@ -30,7 +30,8 @@ from raps.power import ( from raps.network import ( NetworkModel, apply_job_slowdown, compute_system_network_stats compute_system_network_stats, simulate_inter_job_congestion ) from raps.telemetry import Telemetry from raps.cooling import ThermoFluidsModel Loading Loading @@ -292,6 +293,7 @@ class Engine: self.avg_net_tx = [] self.avg_net_rx = [] self.net_util_history = [] self.net_congestion_history = [] self.avg_slowdown_history = [] self.max_slowdown_history = [] self.node_occupancy_history = [] Loading Loading @@ -328,7 +330,7 @@ class Engine: available_nodes = self.resource_manager.available_nodes self.network_model = NetworkModel( available_nodes=available_nodes, config=self.config, config=self.config ) else: self.network_model = None Loading Loading @@ -621,6 +623,18 @@ class Engine: system_util = self.num_active_nodes / self.config['AVAILABLE_NODES'] * 100 self.record_util_stats(system_util=system_util) # --- Inter-Job Network Congestion --- if self.simulate_network and self.network_model and self.running: congestion_stats = simulate_inter_job_congestion( self.network_model, self.running, self.config, self.debug ) if isinstance(congestion_stats, dict): total_congestion = congestion_stats['mean'] else: total_congestion = congestion_stats self.net_congestion_history.append((self.current_timestep, total_congestion)) # --- # System Power if self.power_manager: # Power is always simulated power_df, rack_power, total_power_kw, total_loss_kw, jobs_power = \ Loading
raps/network/__init__.py +70 −15 Original line number Diff line number Diff line import os import warnings from .base import ( all_to_all_paths, apply_job_slowdown, Loading @@ -7,11 +10,16 @@ from .base import ( network_slowdown, network_utilization, worst_link_util, get_link_util_stats, simulate_inter_job_congestion, max_throughput_per_tick, ) from .fat_tree import build_fattree, node_id_to_host_name from .torus3d import build_torus3d, link_loads_for_job_torus from .dragonfly import build_dragonfly, dragonfly_node_id_to_host_name from .fat_tree import build_fattree, node_id_to_host_name, subsample_hosts from .torus3d import build_torus3d, link_loads_for_job_torus, torus_host_from_real_index from .dragonfly import build_dragonfly, dragonfly_node_id_to_host_name, build_dragonfly_idx_map from raps.plotting import plot_fattree_hierarchy, plot_dragonfly, plot_torus2d, plot_torus3d from raps.utils import get_current_utilization __all__ = [ Loading @@ -28,6 +36,9 @@ __all__ = [ "build_torus3d", "build_dragonfly", "dragonfly_node_id_to_host_name", "simulate_inter_job_congestion", "max_throughput_per_tick", "get_link_util_stats", ] Loading @@ -39,8 +50,11 @@ class NetworkModel: self.real_to_fat_idx = kwargs.get("real_to_fat_idx", {}) if self.topology == "fat-tree": total_nodes = config['TOTAL_NODES'] - len(config['DOWN_NODES']) self.fattree_k = config.get("FATTREE_K") self.net_graph = build_fattree(self.fattree_k) self.net_graph = build_fattree(self.fattree_k, total_nodes) # TODO: future testing of subsampling feature #self.net_graph = subsample_hosts(self.net_graph, num_hosts=4626) elif self.topology == "torus3d": dims = ( Loading @@ -67,11 +81,22 @@ class NetworkModel: nid += 1 elif self.topology == "dragonfly": self.net_graph = build_dragonfly( int(config["DRAGONFLY_D"]), int(config["DRAGONFLY_A"]), int(config.get("DRAGONFLY_P", 1)) ) D = self.config["DRAGONFLY_D"] A = self.config["DRAGONFLY_A"] P = self.config["DRAGONFLY_P"] self.net_graph = build_dragonfly(D, A, P) # 
total nodes seen by scheduler or job trace total_real_nodes = getattr(self, "available_nodes", None) if total_real_nodes is None: total_real_nodes = 4626 # fallback for Lassen # if available_nodes is a list, take its length if not isinstance(total_real_nodes, int): total_real_nodes = len(total_real_nodes) self.real_to_fat_idx = build_dragonfly_idx_map(D, A, P, total_real_nodes) print(f"[DEBUG] Dragonfly mapping: {len(self.real_to_fat_idx)} entries") elif self.topology == "capacity": # Capacity-only model: no explicit graph Loading Loading @@ -100,18 +125,28 @@ class NetworkModel: print(" fat-tree hosts:", host_list) elif self.topology == "dragonfly": D, A, P = self.config["DRAGONFLY_D"], self.config["DRAGONFLY_A"], self.config["DRAGONFLY_P"] host_list = [ dragonfly_node_id_to_host_name(self.real_to_fat_idx[real_n], D, A, P) for real_n in job.scheduled_nodes ] D = self.config["DRAGONFLY_D"] A = self.config["DRAGONFLY_A"] P = self.config["DRAGONFLY_P"] # Directly use mapped host names host_list = [self.real_to_fat_idx[real_n] for real_n in job.scheduled_nodes] if debug: print(" dragonfly hosts:", host_list) print("Example nodes in graph:", list(self.net_graph.nodes)[:10]) print("Contains h_0_9_0?", "h_0_9_0" in self.net_graph) loads = link_loads_for_job(self.net_graph, host_list, net_tx) net_cong = worst_link_util(loads, max_throughput) elif self.topology == "torus3d": host_list = [self.id_to_host[n] for n in job.scheduled_nodes] X = self.config["TORUS_X"] Y = self.config["TORUS_Y"] Z = self.config["TORUS_Z"] hosts_per_router = self.config["HOSTS_PER_ROUTER"] #host_list = [self.id_to_host[n] for n in job.scheduled_nodes] host_list = [ torus_host_from_real_index(n, X, Y, Z, hosts_per_router) for n in job.scheduled_nodes ] loads = link_loads_for_job_torus(self.net_graph, self.meta, host_list, net_tx) net_cong = worst_link_util(loads, max_throughput) if debug: Loading @@ -124,3 +159,23 @@ class NetworkModel: raise ValueError(f"Unsupported topology: {self.topology}") 
return net_util, net_cong, net_tx, net_rx, max_throughput def plot_topology(self, output_dir): """Plot network topology - save as png file in output_dir.""" if output_dir: if self.topology == "fat-tree": save_path = output_dir / "net-fat-tree.png" plot_fattree_hierarchy(self.net_graph, k=self.fattree_k, save_path=save_path) elif self.topology == "dragonfly": save_path = output_dir / "net-dragonfly.png" plot_dragonfly(self.net_graph, save_path=save_path) elif self.topology == "torus3d": save_path = output_dir / "net-torus2d.png" plot_torus2d(self.net_graph, save_path=save_path) save_path = output_dir / "net-torus3d.png" plot_torus3d(self.net_graph, save_path=save_path) else: warnings.warn( f"plotting not supported for {self.topology} topology", UserWarning )
# raps/network/base.py  (diff view: +83 -1)
import networkx as nx
import numpy as np

from raps.utils import get_current_utilization
from raps.network.fat_tree import node_id_to_host_name
from raps.network.torus3d import link_loads_for_job_torus, torus_host_from_real_index

# [diff context -- body elided by the diff view]
# def debug_print_trace(job, label: str = ""):
#     """Print either the length (if iterable) or the value of job.gpu_trace."""
#
# [diff context, hunk @@ -134,3 +137,82 @@ -- definition elided by the diff view]
# def worst_link_util(loads, throughput): ... if util > max_util: max_util = util; return max_util


def _empty_link_stats():
    """Return the all-zero statistics dict used when there is nothing to measure."""
    return {'max': 0, 'mean': 0, 'min': 0, 'std_dev': 0, 'top_links': []}


def get_link_util_stats(loads, throughput, top_n=10):
    """Summarise the distribution of per-link utilisation.

    Args:
        loads: Mapping of link (edge key) to the byte load placed on that link.
        throughput: Per-link capacity used as the utilisation denominator.
        top_n: How many of the most utilised links to report.

    Returns:
        A dict with keys ``'max'``, ``'mean'``, ``'min'``, ``'std_dev'`` and
        ``'top_links'``; ``'top_links'`` is a list of ``(edge, utilisation)``
        pairs ordered from most to least utilised.  When ``loads`` is empty
        every statistic is 0 and ``'top_links'`` is empty.

    Raises:
        ValueError: If ``loads`` is non-empty and ``throughput`` is not
            positive (previously this surfaced as a bare ZeroDivisionError
            or as silent inf/nan values).
    """
    if not loads:
        return _empty_link_stats()
    if throughput <= 0:
        raise ValueError(
            f"throughput must be positive to compute link utilisation, got {throughput!r}"
        )

    # NOTE(review): the load is multiplied by 8 (presumably bytes -> bits)
    # while max_throughput_per_tick documents its result as bytes-per-tick;
    # confirm the intended units.
    utilizations = {
        edge: (byte_load * 8) / throughput for edge, byte_load in loads.items()
    }

    # Convert once instead of re-building an array for every statistic.
    values = np.fromiter(utilizations.values(), dtype=float, count=len(utilizations))
    stats = {
        'max': values.max(),
        'mean': values.mean(),
        'min': values.min(),
        'std_dev': values.std(),
    }

    # Most congested links first.
    ranked = sorted(utilizations.items(), key=lambda item: item[1], reverse=True)
    stats['top_links'] = ranked[:top_n]
    return stats


def max_throughput_per_tick(legacy_cfg: dict, trace_quanta: int) -> float:
    """Return bytes-per-tick throughput of a single link.

    Uses ``legacy_cfg["NETWORK_MAX_BW"]`` and falls back to 12.5e9 when the
    key is missing or falsy (``None`` or 0); the result is that bandwidth
    multiplied by ``trace_quanta``.
    """
    bw = legacy_cfg.get("NETWORK_MAX_BW") or 12.5e9
    return float(bw) * trace_quanta


def _static_net_tx(job):
    """Look up the job's network-tx value with its clock pinned to zero.

    The congestion model is a static snapshot, so the trace lookup is made
    with ``running_time`` and ``trace_start_time`` set to 0.  The job objects
    belong to the caller, so the previous values are restored afterwards
    instead of being left clobbered.
    """
    saved = {
        name: getattr(job, name)
        for name in ("running_time", "trace_start_time")
        if hasattr(job, name)
    }
    job.running_time = 0
    job.trace_start_time = 0
    try:
        return get_current_utilization(job.ntx_trace, job)
    finally:
        for name, value in saved.items():
            setattr(job, name, value)


def _job_link_loads(network_model, job, legacy_cfg, net_tx):
    """Return the per-link loads one job contributes on the model's topology.

    Topologies other than fat-tree, dragonfly and torus3d contribute nothing.
    """
    topology = network_model.topology
    if topology == "fat-tree":
        k = int(legacy_cfg.get("FATTREE_K", 32))
        host_list = [node_id_to_host_name(n, k) for n in job.scheduled_nodes]
        return link_loads_for_job(network_model.net_graph, host_list, net_tx)
    if topology == "dragonfly":
        host_list = [network_model.real_to_fat_idx[real_n] for real_n in job.scheduled_nodes]
        return link_loads_for_job(network_model.net_graph, host_list, net_tx)
    if topology == "torus3d":
        X = int(legacy_cfg.get("TORUS_X", 12))
        Y = int(legacy_cfg.get("TORUS_Y", 12))
        Z = int(legacy_cfg.get("TORUS_Z", 12))
        hosts_per_router = int(legacy_cfg.get("HOSTS_PER_ROUTER", 1))
        host_list = [
            torus_host_from_real_index(n, X, Y, Z, hosts_per_router)
            for n in job.scheduled_nodes
        ]
        return link_loads_for_job_torus(
            network_model.net_graph, network_model.meta, host_list, net_tx
        )
    return {}


def simulate_inter_job_congestion(network_model, jobs, legacy_cfg, debug=False):
    """Simulate network congestion from a list of concurrently running jobs.

    The link loads of every job are summed per (order-normalised) edge of
    ``network_model.net_graph`` and summarised with ``get_link_util_stats``.

    Args:
        network_model: Object exposing ``net_graph``, ``topology`` and, per
            topology, ``real_to_fat_idx`` (dragonfly) or ``meta`` (torus3d).
        jobs: Sequence of jobs; ``trace_quanta`` is taken from the first one.
        legacy_cfg: Config mapping read for topology and bandwidth settings.
        debug: Accepted for interface compatibility; currently unused.

    Returns:
        ``0.0`` when the model has no (or an empty) graph, otherwise the
        statistics dict produced by ``get_link_util_stats``.  With no jobs
        the all-zero statistics dict is returned.
    """
    if not network_model.net_graph:
        print("[WARN] Network graph is not defined. Skipping congestion simulation.")
        return 0.0

    if not jobs:
        # No traffic at all: avoid deriving a zero throughput from a missing
        # trace quanta and then dividing by it.
        return _empty_link_stats()

    total_loads = {tuple(sorted(edge)): 0.0 for edge in network_model.net_graph.edges()}
    trace_quanta = jobs[0].trace_quanta

    for job in jobs:
        net_tx = _static_net_tx(job)
        job_loads = _job_link_loads(network_model, job, legacy_cfg, net_tx)
        for edge, load in job_loads.items():
            edge_key = tuple(sorted(edge))
            # Only edges that exist in the graph are accumulated.
            if edge_key in total_loads:
                total_loads[edge_key] += load

    max_throughput = max_throughput_per_tick(legacy_cfg, trace_quanta)
    return get_link_util_stats(total_loads, max_throughput)