Add adaptive routing support for Fat-tree topology (a121e132) · Commits · ExaDigiT / sim-raps

config/lassen.yaml

+2 −10

Original line number	Diff line number	Diff line
		@@ -120,15 +120,7 @@ cooling:
		w_cts_key: "simulator[1].centralEnergyPlant[1].coolingTowerLoop[1].summary.W_flow_CT_kW"
		network:
		topology: fat-tree
		network_max_bw: 12.5e9
		network_max_bw: 12.5e9 # InfiniBand EDR 100 Gbps = 12.5 GB/s
		routing_algorithm: adaptive # InfiniBand Adaptive Routing
		fattree_k: 32
		dragonfly_d: 11
		dragonfly_a: 9
		dragonfly_p: 8
		latency: 1
		torus_x: 17
		torus_y: 17
		torus_z: 8
		torus_wrap: true
		hosts_per_router: 2
		torus_routing: DOR_XYZ

raps/network/init.py

+24 −2

Original line number	Diff line number	Diff line
		@@ -78,6 +78,12 @@ class NetworkModel:
		# TODO: future testing of subsampling feature
		#self.net_graph = subsample_hosts(self.net_graph, num_hosts=4626)

		# Initialize global link loads for adaptive routing
		self.global_link_loads = {tuple(sorted(edge)): 0.0 for edge in self.net_graph.edges()}

		routing_info = f"routing={self.routing_algorithm}"
		print(f"[DEBUG] Fat-tree k={self.fattree_k}: {total_nodes} nodes, {routing_info}")

		elif self.topology == "torus3d":
		dims = (
		int(config["TORUS_X"]),
		@@ -169,10 +175,26 @@ class NetworkModel:

		if self.topology == "fat-tree":
		host_list = [node_id_to_host_name(n, self.fattree_k) for n in job.scheduled_nodes]
		loads = link_loads_for_pattern(self.net_graph, host_list, effective_tx, comm_pattern)
		net_cong = worst_link_util(loads, max_throughput)
		if debug:
		print(" fat-tree hosts:", host_list)
		print(f" routing: {self.routing_algorithm}")

		loads = link_loads_for_pattern(
		self.net_graph,
		host_list,
		effective_tx,
		comm_pattern,
		routing_algorithm=self.routing_algorithm,
		link_loads=self.global_link_loads,
		)
		net_cong = worst_link_util(loads, max_throughput)

		# Update global link loads for adaptive routing decisions
		if self.routing_algorithm in ('ecmp', 'adaptive'):
		for edge, load in loads.items():
		edge_key = tuple(sorted(edge))
		if edge_key in self.global_link_loads:
		self.global_link_loads[edge_key] += load

		elif self.topology == "dragonfly":
		D = self.config["DRAGONFLY_D"]

raps/network/base.py

+17 −2

Original line number	Diff line number	Diff line
		@@ -354,6 +354,7 @@ def link_loads_for_pattern(
		*,
		routing_algorithm: str \| None = None,
		dragonfly_params: dict \| None = None,
		fattree_params: dict \| None = None,
		link_loads: dict \| None = None,
		):
		"""
		@@ -365,19 +366,23 @@ def link_loads_for_pattern(
		job_hosts: List of host names
		tx_volume_bytes: Total transmit volume per host
		comm_pattern: CommunicationPattern enum value
		routing_algorithm: Routing algorithm for Dragonfly ('minimal', 'ugal', 'valiant')
		routing_algorithm: Routing algorithm
		- Dragonfly: 'minimal', 'ugal', 'valiant'
		- Fat-tree: 'minimal', 'ecmp', 'adaptive'
		dragonfly_params: Dict with 'd', 'a', 'ugal_threshold', 'valiant_bias' for Dragonfly
		fattree_params: Dict with 'k' for fat-tree (optional, for future use)
		link_loads: Current global link loads (for adaptive routing decisions)

		Returns:
		dict {(u,v): bytes, ...} of link loads
		"""
		from raps.network.dragonfly import link_loads_for_job_dragonfly_adaptive
		from raps.network.fat_tree import link_loads_for_job_fattree_adaptive

		comm_pattern = normalize_comm_pattern(comm_pattern)

		# Handle adaptive routing for Dragonfly
		if routing_algorithm and dragonfly_params and routing_algorithm != 'minimal':
		if routing_algorithm and dragonfly_params and routing_algorithm in ('ugal', 'valiant'):
		return link_loads_for_job_dragonfly_adaptive(
		G,
		job_hosts,
		@@ -390,6 +395,16 @@ def link_loads_for_pattern(
		valiant_bias=dragonfly_params.get('valiant_bias', 0.0),
		)

		# Handle adaptive routing for Fat-tree
		if routing_algorithm and routing_algorithm in ('ecmp', 'adaptive'):
		return link_loads_for_job_fattree_adaptive(
		G,
		job_hosts,
		tx_volume_bytes,
		algorithm=routing_algorithm,
		link_loads=link_loads,
		)

		# Standard routing (shortest path)
		if comm_pattern == CommunicationPattern.STENCIL_3D:
		return link_loads_for_job_stencil_3d(G, job_hosts, tx_volume_bytes)

raps/network/fat_tree.py

+213 −0

Original line number	Diff line number	Diff line
		import random
		from typing import Tuple, List
		import networkx as nx


		@@ -80,3 +81,215 @@ def subsample_hosts(G, num_hosts):
		remove = [n for n in hosts if n not in keep]
		G.remove_nodes_from(remove)
		return G


		# =============================================================================
		# Adaptive Routing Functions for Fat-Tree
		# =============================================================================

		def parse_fattree_host(name: str) -> Tuple[int, int, int]:
		    """Split a fat-tree host name into its three numeric coordinates.

		    Only the underscore-separated fields at positions 1, 2 and 3 are read;
		    the leading prefix field and any trailing fields are ignored.

		    Args:
		        name: Host name in format "h_{pod}_{edge}_{host}"

		    Returns:
		        Tuple of (pod, edge_switch_idx, host_idx)

		    Raises:
		        IndexError: If a required field is missing from the name.
		        ValueError: If a required field is not an integer literal.
		    """
		    fields = name.split("_")
		    # Convert strictly left to right so the first malformed field is the
		    # one that raises.
		    pod = int(fields[1])
		    edge_switch_idx = int(fields[2])
		    host_idx = int(fields[3])
		    return pod, edge_switch_idx, host_idx


		def get_host_edge_switch(host: str) -> str:
		    """Name the edge switch that a fat-tree host hangs off.

		    The host name is fully parsed (pod, edge and host index) via
		    ``parse_fattree_host``; only the pod and edge indices end up in the
		    result.

		    Args:
		        host: Host name in format "h_{pod}_{edge}_{host}"

		    Returns:
		        Edge switch name "e_{pod}_{edge}"
		    """
		    coords = parse_fattree_host(host)
		    pod, edge = coords[0], coords[1]
		    return f"e_{pod}_{edge}"


		def fattree_all_shortest_paths(G: nx.Graph, src: str, dst: str) -> List[List[str]]:
		    """Enumerate every shortest path between two nodes of a fat-tree graph.

		    A fat-tree can offer several equal-cost routes between two hosts,
		    particularly when they sit in different pods and traffic may cross
		    different aggregation and core switches.

		    Args:
		        G: Fat-tree graph
		        src: Source host name
		        dst: Destination host name

		    Returns:
		        List of all shortest paths, each a list of node names. ``[[src]]``
		        when ``src == dst``; an empty list when NetworkX reports that no
		        path exists.
		    """
		    # A node trivially reaches itself; skip the graph search entirely.
		    if src == dst:
		        return [[src]]

		    try:
		        # Materialise inside the ``try`` so a NetworkXNoPath raised while
		        # the paths are being produced is still caught here.
		        equal_cost_paths = [*nx.all_shortest_paths(G, src, dst)]
		    except nx.NetworkXNoPath:
		        return []
		    return equal_cost_paths


		def fattree_ecmp_select_path(G: nx.Graph, src: str, dst: str) -> List[str]:
		    """Pick one shortest path at random (ECMP, Equal-Cost Multi-Path).

		    The candidates come from ``fattree_all_shortest_paths`` and one of them
		    is drawn with ``random.choice``, so the result depends on the state of
		    the module-level ``random`` generator.

		    Args:
		        G: Fat-tree graph
		        src: Source host name
		        dst: Destination host name

		    Returns:
		        Selected path as a list of node names, or an empty list when no
		        candidate path exists.
		    """
		    candidates = fattree_all_shortest_paths(G, src, dst)
		    return random.choice(candidates) if candidates else []


		def estimate_path_load(path: List[str], link_loads: dict) -> float:
		    """Add up the recorded load of every link along a path.

		    Each hop is looked up under a direction-independent key (the two node
		    names sorted into a tuple). Links missing from ``link_loads`` count
		    as 0.0.

		    Args:
		        path: List of node names forming the path
		        link_loads: Dictionary mapping (node1, node2) tuples to load values

		    Returns:
		        Sum of loads on all links in the path (0.0 for paths with fewer
		        than two nodes)
		    """
		    accumulated = 0.0
		    # Walk consecutive node pairs; a plain running total keeps the float
		    # additions in hop order.
		    for near, far in zip(path, path[1:]):
		        link_key = tuple(sorted((near, far)))
		        accumulated += link_loads.get(link_key, 0.0)
		    return accumulated


		def fattree_adaptive_select_path(
		    G: nx.Graph,
		    src: str,
		    dst: str,
		    link_loads: dict,
		) -> List[str]:
		    """Choose the least-loaded shortest path (Adaptive ECMP routing).

		    Models InfiniBand-style Adaptive Routing (AR): among all equal-cost
		    shortest paths, the one whose links carry the smallest summed load
		    (per ``estimate_path_load``) wins. On a tie the path enumerated first
		    is kept.

		    Args:
		        G: Fat-tree graph
		        src: Source host name
		        dst: Destination host name
		        link_loads: Dictionary mapping edge tuples to current load values

		    Returns:
		        Selected path as a list of node names (least congested), or an
		        empty list when no path exists.
		    """
		    candidates = fattree_all_shortest_paths(G, src, dst)
		    if not candidates:
		        return []
		    # With a single candidate there is nothing to compare, so the load
		    # table is never consulted.
		    if len(candidates) == 1:
		        return candidates[0]

		    # ``min`` keeps the earliest candidate among equals, matching a
		    # strict "<" scan over the list.
		    return min(candidates, key=lambda route: estimate_path_load(route, link_loads))


		def fattree_route(
		    G: nx.Graph,
		    src: str,
		    dst: str,
		    algorithm: str = 'minimal',
		    link_loads: dict = None,
		) -> List[str]:
		    """Route one src->dst flow on a fat-tree with the requested algorithm.

		    Args:
		        G: Fat-tree graph
		        src: Source host name
		        dst: Destination host name
		        algorithm: Routing algorithm ('minimal', 'ecmp', or 'adaptive')
		        link_loads: Current link loads, consulted only by 'adaptive';
		            ``None`` is treated as an empty table

		    Returns:
		        Selected path as a list of node names. ``[src]`` when
		        ``src == dst``; an empty list when no path exists.

		    Raises:
		        ValueError: If ``algorithm`` is not a recognised name. This is
		            only checked when ``src != dst``.
		    """
		    # Self-traffic short-circuits before the algorithm is even inspected.
		    if src == dst:
		        return [src]

		    if algorithm == 'minimal':
		        # A single shortest path, whichever one NetworkX returns.
		        try:
		            return nx.shortest_path(G, src, dst)
		        except nx.NetworkXNoPath:
		            return []

		    if algorithm == 'ecmp':
		        return fattree_ecmp_select_path(G, src, dst)

		    if algorithm == 'adaptive':
		        known_loads = {} if link_loads is None else link_loads
		        return fattree_adaptive_select_path(G, src, dst, known_loads)

		    raise ValueError(f"Unknown fat-tree routing algorithm: {algorithm}")


		def link_loads_for_job_fattree_adaptive(
		    G: nx.Graph,
		    job_hosts: List[str],
		    tx_volume_bytes: float,
		    algorithm: str = 'minimal',
		    link_loads: dict = None,
		) -> dict:
		    """Compute the link loads one job adds to a fat-tree (all-to-all).

		    Every ordered pair of distinct positions in ``job_hosts`` is routed
		    with ``fattree_route`` and ``tx_volume_bytes`` is added to each link
		    of the chosen path.

		    For ``algorithm='adaptive'`` each routing decision sees the global
		    load snapshot *plus* the traffic this job has already placed, so the
		    job's own flows spread over the equal-cost paths instead of all
		    landing on the first-enumerated one. The caller's ``link_loads`` dict
		    is never mutated; the working view is a private copy.

		    Args:
		        G: Fat-tree network graph
		        job_hosts: List of host names assigned to the job
		        tx_volume_bytes: Volume added to every link of each routed
		            src->dst path.
		            NOTE(review): no division by the number of peers happens
		            here; confirm callers pass a per-pair volume rather than a
		            per-host total.
		        algorithm: Routing algorithm ('minimal', 'ecmp', or 'adaptive')
		        link_loads: Current global link loads for adaptive decisions,
		            keyed by sorted (node1, node2) tuples; ``None`` means no
		            prior load

		    Returns:
		        Dictionary mapping sorted edge tuples to the load contributed by
		        this job only (global loads are not included). Empty when the job
		        has fewer than two hosts.

		    Raises:
		        ValueError: Propagated from ``fattree_route`` for an unknown
		            ``algorithm``.
		    """
		    loads = {}
		    if len(job_hosts) <= 1:
		        return loads

		    if link_loads is None:
		        link_loads = {}

		    # Only the adaptive algorithm reads the load table, so only it pays
		    # for a private copy that can absorb this job's in-flight traffic.
		    is_adaptive = algorithm == 'adaptive'
		    routing_view = dict(link_loads) if is_adaptive else link_loads

		    # All-to-all traffic pattern over ordered pairs of distinct positions.
		    for i, src in enumerate(job_hosts):
		        for j, dst in enumerate(job_hosts):
		            if i == j:
		                continue

		            path = fattree_route(G, src, dst, algorithm, routing_view)

		            # Accumulate loads on path edges under direction-independent
		            # keys.
		            for near, far in zip(path, path[1:]):
		                edge = tuple(sorted((near, far)))
		                loads[edge] = loads.get(edge, 0.0) + tx_volume_bytes
		                if is_adaptive:
		                    # Let later pairs of this job see this flow.
		                    routing_view[edge] = routing_view.get(edge, 0.0) + tx_volume_bytes

		    return loads

raps/policy.py

+9 −0

Original line number	Diff line number	Diff line
		@@ -46,6 +46,7 @@ class RoutingAlgorithm(ValueComparableEnum):
		"Study of Workload Interference with Intelligent Routing on Dragonfly"
		(Kang et al., SC22)

		Dragonfly algorithms:
		MINIMAL: Always use shortest/minimal path routing.
		For Dragonfly: at most 3 hops (local-global-local).
		VALIANT: Valiant load balancing - route via random intermediate group.
		@@ -55,7 +56,15 @@ class RoutingAlgorithm(ValueComparableEnum):
		Dynamically chooses minimal or non-minimal based on congestion.
		Uses threshold comparison: if min_latency < threshold * nonmin_latency,
		use minimal path; otherwise use non-minimal.

		Fat-tree algorithms:
		ECMP: Equal-Cost Multi-Path routing. Randomly selects among all
		shortest paths between source and destination.
		ADAPTIVE: Adaptive ECMP routing (InfiniBand AR). Selects the least
		congested path among all equal-cost shortest paths.
		"""
		MINIMAL = 'minimal'
		VALIANT = 'valiant'
		UGAL = 'ugal'
		ECMP = 'ecmp'
		ADAPTIVE = 'adaptive'