Loading config/lassen/power.json +2 −1 Original line number Diff line number Diff line Loading @@ -4,7 +4,8 @@ "POWER_CPU_IDLE": 47.25, "POWER_CPU_MAX": 252, "POWER_MEM": 74.26, "POWER_NIC": 21, "POWER_NIC_IDLE": 10, "POWER_NIC_MAX": 50, "POWER_NVME": 45, "POWER_SWITCH": 250, "POWER_CDU": 0, Loading raps/dataloaders/lassen.py +1 −1 Original line number Diff line number Diff line Loading @@ -49,7 +49,7 @@ def load_data(path, **kwargs): """ Loads data from the given file paths and returns job info. """ nrows = 1E5 nrows = 1E4 alloc_df = pd.read_csv(os.path.join(path[0], 'final_csm_allocation_history_hashed.csv'), nrows=nrows) node_df = pd.read_csv(os.path.join(path[0], 'final_csm_allocation_node_history.csv'), nrows=nrows) step_df = pd.read_csv(os.path.join(path[0], 'final_csm_step_history.csv'), nrows=nrows) Loading raps/network.py 0 → 100644 +8 −0 Original line number Diff line number Diff line TX_MAX = 10000 RX_MAX = 20000 def network_utilization(tx, rx): """Compute average network utilization""" tx_util = min(tx / TX_MAX, 1.0) # Clamp to 1.0 rx_util = min(rx / RX_MAX, 1.0) # Clamp to 1.0 return (tx_util + rx_util) / 2.0 raps/power.py +11 −9 Original line number Diff line number Diff line Loading @@ -37,7 +37,8 @@ load_config_variables([ 'POWER_CPU_UNCERTAINTY', 'POWER_MEM', 'POWER_MEM_UNCERTAINTY', 'POWER_NIC', 'POWER_NIC_IDLE', 'POWER_NIC_MAX', 'POWER_NIC_UNCERTAINTY', 'POWER_NVME', 'POWER_NVME_UNCERTAINTY', Loading Loading @@ -90,7 +91,7 @@ def rectifier_loss(p_out): return p_in def compute_node_power(cpu_util, gpu_util, verbose=False): def compute_node_power(cpu_util, gpu_util, net_util, verbose=False): """ Calculate the total power consumption for given CPU and GPU utilization. 
Loading @@ -101,8 +102,9 @@ def compute_node_power(cpu_util, gpu_util, verbose=False): """ power_cpu = cpu_util * POWER_CPU_MAX + (CPUS_PER_NODE - cpu_util) * POWER_CPU_IDLE power_gpu = gpu_util * POWER_GPU_MAX + (GPUS_PER_NODE - gpu_util) * POWER_GPU_IDLE power_nic = POWER_NIC_IDLE + (POWER_NIC_MAX - POWER_NIC_IDLE) * net_util power_total = power_cpu + power_gpu + POWER_MEM + NICS_PER_NODE * POWER_NIC + POWER_NVME power_total = power_cpu + power_gpu + POWER_MEM + NICS_PER_NODE * power_nic + POWER_NVME # Apply power loss due to Sivoc and Rectifier power_with_sivoc_loss = sivoc_loss(power_total) Loading Loading @@ -259,17 +261,17 @@ class PowerManager: def initialize_power_state(self): """Initialize the power state array with idle power consumption values.""" initial_power, _ = self.power_func(0, 0) initial_power, _ = self.power_func(0, 0, 0) return np.full(self.sc_shape, initial_power) def initialize_sivoc_loss(self): """Initialize the Sivoc loss array with idle power consumption values.""" _, initial_sivoc_loss = self.power_func(0, 0) _, initial_sivoc_loss = self.power_func(0, 0, 0) return np.full(self.sc_shape, initial_sivoc_loss) def initialize_rectifier_loss(self): """ Initialize the power state array """ initial_power, _ = self.power_func(0, 0) initial_power, _ = self.power_func(0, 0, 0) # Rectifier loss curvefit is done at rectifier level, so we simply # approximate by scaling up to number of rectifiers, applying loss # and then dividing by number of rectifiers. Loading @@ -295,9 +297,9 @@ class PowerManager: """ node_indices = linear_to_3d_index(node_indices, self.sc_shape) self.power_state[node_indices], self.sivoc_loss[node_indices] \ = compute_node_power(0, 0) = compute_node_power(0, 0, 0) def update_power_state(self, scheduled_nodes, cpu_util, gpu_util): def update_power_state(self, scheduled_nodes, cpu_util, gpu_util, net_util): """ Update the power state of scheduled nodes based on CPU and GPU utilization. 
Note: this is only used to test smart load-sharing "what-if" scenario Loading @@ -315,7 +317,7 @@ class PowerManager: Total power consumption of the scheduled nodes. """ node_indices = linear_to_3d_index(scheduled_nodes, self.sc_shape) power_value, sivoc_loss = self.power_func(cpu_util, gpu_util) power_value, sivoc_loss = self.power_func(cpu_util, gpu_util, net_util) self.power_state[node_indices] = power_value self.sivoc_loss[node_indices] = sivoc_loss return power_value * len(scheduled_nodes) Loading raps/scheduler.py +9 −1 Original line number Diff line number Diff line Loading @@ -48,6 +48,7 @@ import pandas as pd from .config import load_config_variables from .job import Job, JobState from .network import network_utilization from .policy import Policy, PolicyType from .utils import summarize_ranges, expand_ranges Loading Loading @@ -260,9 +261,16 @@ class Scheduler: cpu_util = get_utilization(job.cpu_trace, time_quanta_index) gpu_util = get_utilization(job.gpu_trace, time_quanta_index) if len(job.ntx_trace) and len(job.nrx_trace): net_tx = get_utilization(job.ntx_trace, time_quanta_index) net_rx = get_utilization(job.nrx_trace, time_quanta_index) net_util = network_utilization(net_tx, net_rx) else: net_util = 0 self.flops_manager.update_flop_state(job.scheduled_nodes, cpu_util, gpu_util) job.power = self.power_manager.update_power_state(job.scheduled_nodes, cpu_util, gpu_util) cpu_util, gpu_util, net_util) if job.running_time % TRACE_QUANTA == 0: job.power_history.append(job.power) Loading Loading
config/lassen/power.json +2 −1 Original line number Diff line number Diff line Loading @@ -4,7 +4,8 @@ "POWER_CPU_IDLE": 47.25, "POWER_CPU_MAX": 252, "POWER_MEM": 74.26, "POWER_NIC": 21, "POWER_NIC_IDLE": 10, "POWER_NIC_MAX": 50, "POWER_NVME": 45, "POWER_SWITCH": 250, "POWER_CDU": 0, Loading
raps/dataloaders/lassen.py +1 −1 Original line number Diff line number Diff line Loading @@ -49,7 +49,7 @@ def load_data(path, **kwargs): """ Loads data from the given file paths and returns job info. """ nrows = 1E5 nrows = 1E4 alloc_df = pd.read_csv(os.path.join(path[0], 'final_csm_allocation_history_hashed.csv'), nrows=nrows) node_df = pd.read_csv(os.path.join(path[0], 'final_csm_allocation_node_history.csv'), nrows=nrows) step_df = pd.read_csv(os.path.join(path[0], 'final_csm_step_history.csv'), nrows=nrows) Loading
# Nominal per-node link capacities used to normalize raw traffic counters.
# assumes tx/rx are in the same units as these maxima — TODO confirm units
TX_MAX = 10000
RX_MAX = 20000


def network_utilization(tx, rx):
    """Return the mean of transmit and receive utilization, each in [0, 1].

    Each direction is normalized against its nominal maximum (TX_MAX /
    RX_MAX) and capped at 1.0 so oversubscribed counters cannot push the
    result above full utilization.
    """
    # Normalize each direction, saturating at 100% utilization.
    clamped = [min(raw / cap, 1.0) for raw, cap in ((tx, TX_MAX), (rx, RX_MAX))]
    return sum(clamped) / 2.0
raps/power.py +11 −9 Original line number Diff line number Diff line Loading @@ -37,7 +37,8 @@ load_config_variables([ 'POWER_CPU_UNCERTAINTY', 'POWER_MEM', 'POWER_MEM_UNCERTAINTY', 'POWER_NIC', 'POWER_NIC_IDLE', 'POWER_NIC_MAX', 'POWER_NIC_UNCERTAINTY', 'POWER_NVME', 'POWER_NVME_UNCERTAINTY', Loading Loading @@ -90,7 +91,7 @@ def rectifier_loss(p_out): return p_in def compute_node_power(cpu_util, gpu_util, verbose=False): def compute_node_power(cpu_util, gpu_util, net_util, verbose=False): """ Calculate the total power consumption for given CPU and GPU utilization. Loading @@ -101,8 +102,9 @@ def compute_node_power(cpu_util, gpu_util, verbose=False): """ power_cpu = cpu_util * POWER_CPU_MAX + (CPUS_PER_NODE - cpu_util) * POWER_CPU_IDLE power_gpu = gpu_util * POWER_GPU_MAX + (GPUS_PER_NODE - gpu_util) * POWER_GPU_IDLE power_nic = POWER_NIC_IDLE + (POWER_NIC_MAX - POWER_NIC_IDLE) * net_util power_total = power_cpu + power_gpu + POWER_MEM + NICS_PER_NODE * POWER_NIC + POWER_NVME power_total = power_cpu + power_gpu + POWER_MEM + NICS_PER_NODE * power_nic + POWER_NVME # Apply power loss due to Sivoc and Rectifier power_with_sivoc_loss = sivoc_loss(power_total) Loading Loading @@ -259,17 +261,17 @@ class PowerManager: def initialize_power_state(self): """Initialize the power state array with idle power consumption values.""" initial_power, _ = self.power_func(0, 0) initial_power, _ = self.power_func(0, 0, 0) return np.full(self.sc_shape, initial_power) def initialize_sivoc_loss(self): """Initialize the Sivoc loss array with idle power consumption values.""" _, initial_sivoc_loss = self.power_func(0, 0) _, initial_sivoc_loss = self.power_func(0, 0, 0) return np.full(self.sc_shape, initial_sivoc_loss) def initialize_rectifier_loss(self): """ Initialize the power state array """ initial_power, _ = self.power_func(0, 0) initial_power, _ = self.power_func(0, 0, 0) # Rectifier loss curvefit is done at rectifier level, so we simply # approximate by scaling up to number of 
rectifiers, applying loss # and then dividing by number of rectifiers. Loading @@ -295,9 +297,9 @@ class PowerManager: """ node_indices = linear_to_3d_index(node_indices, self.sc_shape) self.power_state[node_indices], self.sivoc_loss[node_indices] \ = compute_node_power(0, 0) = compute_node_power(0, 0, 0) def update_power_state(self, scheduled_nodes, cpu_util, gpu_util): def update_power_state(self, scheduled_nodes, cpu_util, gpu_util, net_util): """ Update the power state of scheduled nodes based on CPU and GPU utilization. Note: this is only used to test smart load-sharing "what-if" scenario Loading @@ -315,7 +317,7 @@ class PowerManager: Total power consumption of the scheduled nodes. """ node_indices = linear_to_3d_index(scheduled_nodes, self.sc_shape) power_value, sivoc_loss = self.power_func(cpu_util, gpu_util) power_value, sivoc_loss = self.power_func(cpu_util, gpu_util, net_util) self.power_state[node_indices] = power_value self.sivoc_loss[node_indices] = sivoc_loss return power_value * len(scheduled_nodes) Loading
raps/scheduler.py +9 −1 Original line number Diff line number Diff line Loading @@ -48,6 +48,7 @@ import pandas as pd from .config import load_config_variables from .job import Job, JobState from .network import network_utilization from .policy import Policy, PolicyType from .utils import summarize_ranges, expand_ranges Loading Loading @@ -260,9 +261,16 @@ class Scheduler: cpu_util = get_utilization(job.cpu_trace, time_quanta_index) gpu_util = get_utilization(job.gpu_trace, time_quanta_index) if len(job.ntx_trace) and len(job.nrx_trace): net_tx = get_utilization(job.ntx_trace, time_quanta_index) net_rx = get_utilization(job.nrx_trace, time_quanta_index) net_util = network_utilization(net_tx, net_rx) else: net_util = 0 self.flops_manager.update_flop_state(job.scheduled_nodes, cpu_util, gpu_util) job.power = self.power_manager.update_power_state(job.scheduled_nodes, cpu_util, gpu_util) cpu_util, gpu_util, net_util) if job.running_time % TRACE_QUANTA == 0: job.power_history.append(job.power) Loading