Loading config/lassen/power.json +2 −1 Original line number Diff line number Diff line Loading @@ -4,7 +4,8 @@ "POWER_CPU_IDLE": 47.25, "POWER_CPU_MAX": 252, "POWER_MEM": 74.26, "POWER_NIC": 21, "POWER_NIC_IDLE": 10, "POWER_NIC_MAX": 50, "POWER_NVME": 45, "POWER_SWITCH": 250, "POWER_CDU": 0, Loading raps/dataloaders/lassen.py +1 −1 Original line number Diff line number Diff line Loading @@ -49,7 +49,7 @@ def load_data(path, **kwargs): """ Loads data from the given file paths and returns job info. """ nrows = 1E5 nrows = 1E4 alloc_df = pd.read_csv(os.path.join(path[0], 'final_csm_allocation_history_hashed.csv'), nrows=nrows) node_df = pd.read_csv(os.path.join(path[0], 'final_csm_allocation_node_history.csv'), nrows=nrows) step_df = pd.read_csv(os.path.join(path[0], 'final_csm_step_history.csv'), nrows=nrows) Loading raps/network.py 0 → 100644 +8 −0 Original line number Diff line number Diff line TX_MAX = 10000 RX_MAX = 20000 def network_utilization(tx, rx): """Compute average network utilization""" tx_util = min(tx / TX_MAX, 1.0) # Clamp to 1.0 rx_util = min(rx / RX_MAX, 1.0) # Clamp to 1.0 return (tx_util + rx_util) / 2.0 raps/power.py +11 −9 Original line number Diff line number Diff line Loading @@ -37,7 +37,8 @@ load_config_variables([ 'POWER_CPU_UNCERTAINTY', 'POWER_MEM', 'POWER_MEM_UNCERTAINTY', 'POWER_NIC', 'POWER_NIC_IDLE', 'POWER_NIC_MAX', 'POWER_NIC_UNCERTAINTY', 'POWER_NVME', 'POWER_NVME_UNCERTAINTY', Loading Loading @@ -90,7 +91,7 @@ def rectifier_loss(p_out): return p_in def compute_node_power(cpu_util, gpu_util, verbose=False): def compute_node_power(cpu_util, gpu_util, net_util, verbose=False): """ Calculate the total power consumption for given CPU and GPU utilization. 
Loading @@ -101,8 +102,9 @@ def compute_node_power(cpu_util, gpu_util, verbose=False): """ power_cpu = cpu_util * POWER_CPU_MAX + (CPUS_PER_NODE - cpu_util) * POWER_CPU_IDLE power_gpu = gpu_util * POWER_GPU_MAX + (GPUS_PER_NODE - gpu_util) * POWER_GPU_IDLE power_nic = POWER_NIC_IDLE + (POWER_NIC_MAX - POWER_NIC_IDLE) * net_util power_total = power_cpu + power_gpu + POWER_MEM + NICS_PER_NODE * POWER_NIC + POWER_NVME power_total = power_cpu + power_gpu + POWER_MEM + NICS_PER_NODE * power_nic + POWER_NVME # Apply power loss due to Sivoc and Rectifier power_with_sivoc_loss = sivoc_loss(power_total) Loading Loading @@ -259,17 +261,17 @@ class PowerManager: def initialize_power_state(self): """Initialize the power state array with idle power consumption values.""" initial_power, _ = self.power_func(0, 0) initial_power, _ = self.power_func(0, 0, 0) return np.full(self.sc_shape, initial_power) def initialize_sivoc_loss(self): """Initialize the Sivoc loss array with idle power consumption values.""" _, initial_sivoc_loss = self.power_func(0, 0) _, initial_sivoc_loss = self.power_func(0, 0, 0) return np.full(self.sc_shape, initial_sivoc_loss) def initialize_rectifier_loss(self): """ Initialize the power state array """ initial_power, _ = self.power_func(0, 0) initial_power, _ = self.power_func(0, 0, 0) # Rectifier loss curvefit is done at rectifier level, so we simply # approximate by scaling up to number of rectifiers, applying loss # and then dividing by number of rectifiers. Loading @@ -295,9 +297,9 @@ class PowerManager: """ node_indices = linear_to_3d_index(node_indices, self.sc_shape) self.power_state[node_indices], self.sivoc_loss[node_indices] \ = compute_node_power(0, 0) = compute_node_power(0, 0, 0) def update_power_state(self, scheduled_nodes, cpu_util, gpu_util): def update_power_state(self, scheduled_nodes, cpu_util, gpu_util, net_util): """ Update the power state of scheduled nodes based on CPU and GPU utilization. 
Note: this is only used to test smart load-sharing "what-if" scenario Loading @@ -315,7 +317,7 @@ class PowerManager: Total power consumption of the scheduled nodes. """ node_indices = linear_to_3d_index(scheduled_nodes, self.sc_shape) power_value, sivoc_loss = self.power_func(cpu_util, gpu_util) power_value, sivoc_loss = self.power_func(cpu_util, gpu_util, net_util) self.power_state[node_indices] = power_value self.sivoc_loss[node_indices] = sivoc_loss return power_value * len(scheduled_nodes) Loading raps/scheduler.py +9 −1 Original line number Diff line number Diff line Loading @@ -48,6 +48,7 @@ import pandas as pd from .config import load_config_variables from .job import Job, JobState from .network import network_utilization from .policy import Policy, PolicyType from .utils import summarize_ranges, expand_ranges Loading Loading @@ -260,9 +261,16 @@ class Scheduler: cpu_util = get_utilization(job.cpu_trace, time_quanta_index) gpu_util = get_utilization(job.gpu_trace, time_quanta_index) if len(job.ntx_trace) and len(job.nrx_trace): net_tx = get_utilization(job.ntx_trace, time_quanta_index) net_rx = get_utilization(job.nrx_trace, time_quanta_index) net_util = network_utilization(net_tx, net_rx) else: net_util = 0 self.flops_manager.update_flop_state(job.scheduled_nodes, cpu_util, gpu_util) job.power = self.power_manager.update_power_state(job.scheduled_nodes, cpu_util, gpu_util) cpu_util, gpu_util, net_util) if job.running_time % TRACE_QUANTA == 0: job.power_history.append(job.power) Loading Loading
config/lassen/power.json +2 −1 Original line number Diff line number Diff line Loading @@ -4,7 +4,8 @@ "POWER_CPU_IDLE": 47.25, "POWER_CPU_MAX": 252, "POWER_MEM": 74.26, "POWER_NIC": 21, "POWER_NIC_IDLE": 10, "POWER_NIC_MAX": 50, "POWER_NVME": 45, "POWER_SWITCH": 250, "POWER_CDU": 0, Loading
raps/dataloaders/lassen.py +1 −1 Original line number Diff line number Diff line Loading @@ -49,7 +49,7 @@ def load_data(path, **kwargs): """ Loads data from the given file paths and returns job info. """ nrows = 1E5 nrows = 1E4 alloc_df = pd.read_csv(os.path.join(path[0], 'final_csm_allocation_history_hashed.csv'), nrows=nrows) node_df = pd.read_csv(os.path.join(path[0], 'final_csm_allocation_node_history.csv'), nrows=nrows) step_df = pd.read_csv(os.path.join(path[0], 'final_csm_step_history.csv'), nrows=nrows) Loading
# Nominal per-node link capacities used to normalize raw traffic counters.
# assumes tx/rx are in the same units as these maxima — TODO confirm units
TX_MAX = 10000
RX_MAX = 20000


def network_utilization(tx, rx):
    """Return the mean of transmit and receive utilization, each in [0, 1].

    Each direction is normalized against its nominal maximum (TX_MAX /
    RX_MAX) and capped at 1.0 so oversubscribed counters cannot push the
    result above full utilization.
    """
    # Normalize each direction, saturating at 100% utilization.
    clamped = [min(raw / cap, 1.0) for raw, cap in ((tx, TX_MAX), (rx, RX_MAX))]
    return sum(clamped) / 2.0
raps/power.py +11 −9 Original line number Diff line number Diff line Loading @@ -37,7 +37,8 @@ load_config_variables([ 'POWER_CPU_UNCERTAINTY', 'POWER_MEM', 'POWER_MEM_UNCERTAINTY', 'POWER_NIC', 'POWER_NIC_IDLE', 'POWER_NIC_MAX', 'POWER_NIC_UNCERTAINTY', 'POWER_NVME', 'POWER_NVME_UNCERTAINTY', Loading Loading @@ -90,7 +91,7 @@ def rectifier_loss(p_out): return p_in def compute_node_power(cpu_util, gpu_util, verbose=False): def compute_node_power(cpu_util, gpu_util, net_util, verbose=False): """ Calculate the total power consumption for given CPU and GPU utilization. Loading @@ -101,8 +102,9 @@ def compute_node_power(cpu_util, gpu_util, verbose=False): """ power_cpu = cpu_util * POWER_CPU_MAX + (CPUS_PER_NODE - cpu_util) * POWER_CPU_IDLE power_gpu = gpu_util * POWER_GPU_MAX + (GPUS_PER_NODE - gpu_util) * POWER_GPU_IDLE power_nic = POWER_NIC_IDLE + (POWER_NIC_MAX - POWER_NIC_IDLE) * net_util power_total = power_cpu + power_gpu + POWER_MEM + NICS_PER_NODE * POWER_NIC + POWER_NVME power_total = power_cpu + power_gpu + POWER_MEM + NICS_PER_NODE * power_nic + POWER_NVME # Apply power loss due to Sivoc and Rectifier power_with_sivoc_loss = sivoc_loss(power_total) Loading Loading @@ -259,17 +261,17 @@ class PowerManager: def initialize_power_state(self): """Initialize the power state array with idle power consumption values.""" initial_power, _ = self.power_func(0, 0) initial_power, _ = self.power_func(0, 0, 0) return np.full(self.sc_shape, initial_power) def initialize_sivoc_loss(self): """Initialize the Sivoc loss array with idle power consumption values.""" _, initial_sivoc_loss = self.power_func(0, 0) _, initial_sivoc_loss = self.power_func(0, 0, 0) return np.full(self.sc_shape, initial_sivoc_loss) def initialize_rectifier_loss(self): """ Initialize the power state array """ initial_power, _ = self.power_func(0, 0) initial_power, _ = self.power_func(0, 0, 0) # Rectifier loss curvefit is done at rectifier level, so we simply # approximate by scaling up to number of 
rectifiers, applying loss # and then dividing by number of rectifiers. Loading @@ -295,9 +297,9 @@ class PowerManager: """ node_indices = linear_to_3d_index(node_indices, self.sc_shape) self.power_state[node_indices], self.sivoc_loss[node_indices] \ = compute_node_power(0, 0) = compute_node_power(0, 0, 0) def update_power_state(self, scheduled_nodes, cpu_util, gpu_util): def update_power_state(self, scheduled_nodes, cpu_util, gpu_util, net_util): """ Update the power state of scheduled nodes based on CPU and GPU utilization. Note: this is only used to test smart load-sharing "what-if" scenario Loading @@ -315,7 +317,7 @@ class PowerManager: Total power consumption of the scheduled nodes. """ node_indices = linear_to_3d_index(scheduled_nodes, self.sc_shape) power_value, sivoc_loss = self.power_func(cpu_util, gpu_util) power_value, sivoc_loss = self.power_func(cpu_util, gpu_util, net_util) self.power_state[node_indices] = power_value self.sivoc_loss[node_indices] = sivoc_loss return power_value * len(scheduled_nodes) Loading
raps/scheduler.py +9 −1 Original line number Diff line number Diff line Loading @@ -48,6 +48,7 @@ import pandas as pd from .config import load_config_variables from .job import Job, JobState from .network import network_utilization from .policy import Policy, PolicyType from .utils import summarize_ranges, expand_ranges Loading Loading @@ -260,9 +261,16 @@ class Scheduler: cpu_util = get_utilization(job.cpu_trace, time_quanta_index) gpu_util = get_utilization(job.gpu_trace, time_quanta_index) if len(job.ntx_trace) and len(job.nrx_trace): net_tx = get_utilization(job.ntx_trace, time_quanta_index) net_rx = get_utilization(job.nrx_trace, time_quanta_index) net_util = network_utilization(net_tx, net_rx) else: net_util = 0 self.flops_manager.update_flop_state(job.scheduled_nodes, cpu_util, gpu_util) job.power = self.power_manager.update_power_state(job.scheduled_nodes, cpu_util, gpu_util) cpu_util, gpu_util, net_util) if job.running_time % TRACE_QUANTA == 0: job.power_history.append(job.power) Loading