Commit cbaf7db9 authored by Brewer, Wes
Browse files

Add support for dynamic NIC power

parent 2d14d54a
Loading
Loading
Loading
Loading
+2 −1
Original line number Diff line number Diff line
@@ -4,7 +4,8 @@
    "POWER_CPU_IDLE": 47.25,
    "POWER_CPU_MAX": 252,
    "POWER_MEM": 74.26,
    "POWER_NIC": 21,
    "POWER_NIC_IDLE": 10,
    "POWER_NIC_MAX": 50,
    "POWER_NVME": 45,
    "POWER_SWITCH": 250,
    "POWER_CDU": 0,
+1 −1
Original line number Diff line number Diff line
@@ -49,7 +49,7 @@ def load_data(path, **kwargs):
    """
    Loads data from the given file paths and returns job info.
    """
    nrows = 1E5
    nrows = 1E4
    alloc_df = pd.read_csv(os.path.join(path[0], 'final_csm_allocation_history_hashed.csv'), nrows=nrows)
    node_df = pd.read_csv(os.path.join(path[0], 'final_csm_allocation_node_history.csv'), nrows=nrows)
    step_df = pd.read_csv(os.path.join(path[0], 'final_csm_step_history.csv'), nrows=nrows)

raps/network.py

0 → 100644
+8 −0
Original line number Diff line number Diff line
# Per-direction ceilings used to normalise raw tx/rx values into a fraction.
# NOTE(review): units are not stated here; they must match whatever the job
# ntx/nrx traces report -- confirm against the trace loader.
TX_MAX = 10000
RX_MAX = 20000


def network_utilization(tx, rx):
    """Compute the average network utilization for one tx/rx sample.

    Each direction is normalised by its ceiling (``TX_MAX`` / ``RX_MAX``) and
    clamped to the closed interval [0.0, 1.0]; the two fractions are then
    averaged.

    Args:
        tx: Transmit load, on the same scale as ``TX_MAX``.
        rx: Receive load, on the same scale as ``RX_MAX``.

    Returns:
        float: Mean of the clamped tx and rx fractions, in [0.0, 1.0].
        NaN inputs propagate to the result unchanged.
    """
    # Clamp on both sides: a negative sample (e.g. a missing-data sentinel)
    # must not yield a negative utilization, which would push the derived NIC
    # power below its idle floor. ``max`` is applied first so that a NaN
    # input still propagates exactly as it did with the upper-only clamp.
    tx_util = min(max(tx / TX_MAX, 0.0), 1.0)
    rx_util = min(max(rx / RX_MAX, 0.0), 1.0)
    return (tx_util + rx_util) / 2.0
+11 −9
Original line number Diff line number Diff line
@@ -37,7 +37,8 @@ load_config_variables([
    'POWER_CPU_UNCERTAINTY',
    'POWER_MEM',
    'POWER_MEM_UNCERTAINTY',
    'POWER_NIC',
    'POWER_NIC_IDLE',
    'POWER_NIC_MAX',
    'POWER_NIC_UNCERTAINTY',
    'POWER_NVME',
    'POWER_NVME_UNCERTAINTY',
@@ -90,7 +91,7 @@ def rectifier_loss(p_out):
    return p_in


def compute_node_power(cpu_util, gpu_util, verbose=False):
def compute_node_power(cpu_util, gpu_util, net_util, verbose=False):
    """
    Calculate the total power consumption for given CPU, GPU, and network utilization.

@@ -101,8 +102,9 @@ def compute_node_power(cpu_util, gpu_util, verbose=False):
    """
    power_cpu = cpu_util * POWER_CPU_MAX + (CPUS_PER_NODE - cpu_util) * POWER_CPU_IDLE
    power_gpu = gpu_util * POWER_GPU_MAX + (GPUS_PER_NODE - gpu_util) * POWER_GPU_IDLE
    power_nic = POWER_NIC_IDLE + (POWER_NIC_MAX - POWER_NIC_IDLE) * net_util

    power_total = power_cpu + power_gpu + POWER_MEM + NICS_PER_NODE * POWER_NIC + POWER_NVME
    power_total = power_cpu + power_gpu + POWER_MEM + NICS_PER_NODE * power_nic + POWER_NVME

    # Apply power loss due to Sivoc and Rectifier
    power_with_sivoc_loss = sivoc_loss(power_total)
@@ -259,17 +261,17 @@ class PowerManager:

    def initialize_power_state(self):
        """Initialize the power state array with idle power consumption values."""
        initial_power, _ = self.power_func(0, 0)
        initial_power, _ = self.power_func(0, 0, 0)
        return np.full(self.sc_shape, initial_power)

    def initialize_sivoc_loss(self):
        """Initialize the Sivoc loss array with idle power consumption values."""
        _, initial_sivoc_loss = self.power_func(0, 0)
        _, initial_sivoc_loss = self.power_func(0, 0, 0)
        return np.full(self.sc_shape, initial_sivoc_loss)

    def initialize_rectifier_loss(self):
        """ Initialize the power state array """
        initial_power, _ = self.power_func(0, 0)
        initial_power, _ = self.power_func(0, 0, 0)
        # Rectifier loss curvefit is done at rectifier level, so we simply
        # approximate by scaling up to number of rectifiers, applying loss
        # and then dividing by number of rectifiers.
@@ -295,9 +297,9 @@ class PowerManager:
        """
        node_indices = linear_to_3d_index(node_indices, self.sc_shape)
        self.power_state[node_indices], self.sivoc_loss[node_indices] \
            = compute_node_power(0, 0)
            = compute_node_power(0, 0, 0)

    def update_power_state(self, scheduled_nodes, cpu_util, gpu_util):
    def update_power_state(self, scheduled_nodes, cpu_util, gpu_util, net_util):
        """
        Update the power state of scheduled nodes based on CPU, GPU, and network utilization.
        Note: this is only used to test smart load-sharing "what-if" scenario
@@ -315,7 +317,7 @@ class PowerManager:
            Total power consumption of the scheduled nodes.
        """
        node_indices = linear_to_3d_index(scheduled_nodes, self.sc_shape)
        power_value, sivoc_loss = self.power_func(cpu_util, gpu_util)
        power_value, sivoc_loss = self.power_func(cpu_util, gpu_util, net_util)
        self.power_state[node_indices] = power_value
        self.sivoc_loss[node_indices] = sivoc_loss
        return power_value * len(scheduled_nodes)
+9 −1
Original line number Diff line number Diff line
@@ -48,6 +48,7 @@ import pandas as pd

from .config import load_config_variables
from .job import Job, JobState
from .network import network_utilization
from .policy import Policy, PolicyType
from .utils import summarize_ranges, expand_ranges

@@ -260,9 +261,16 @@ class Scheduler:
                cpu_util = get_utilization(job.cpu_trace, time_quanta_index)
                gpu_util = get_utilization(job.gpu_trace, time_quanta_index)

                if len(job.ntx_trace) and len(job.nrx_trace):
                    net_tx = get_utilization(job.ntx_trace, time_quanta_index)
                    net_rx = get_utilization(job.nrx_trace, time_quanta_index)
                    net_util = network_utilization(net_tx, net_rx)
                else:
                    net_util = 0

                self.flops_manager.update_flop_state(job.scheduled_nodes, cpu_util, gpu_util)
                job.power = self.power_manager.update_power_state(job.scheduled_nodes,
                                                                  cpu_util, gpu_util)
                                                                  cpu_util, gpu_util, net_util)

                if job.running_time % TRACE_QUANTA == 0:
                    job.power_history.append(job.power)