Commit ebb5e8d5 authored by Brewer, Wes
Browse files

Remove rest of globals in raps/*.py (still some remain in raps/dataloaders)

parent ad387085
Loading
Loading
Loading
Loading
+9 −15
Original line number Diff line number Diff line
@@ -2,29 +2,23 @@ import numpy as np
from .utils import linear_to_3d_index


def compute_node_flops(cpu_util, gpu_util, config=None):
    """Return the FLOPS of one node for the given CPU and GPU utilization.

    The result is the sum of a CPU term and a GPU term, each computed as
    ``<FP_RATIO> * utilization * <PEAK_FLOPS>``.

    :param cpu_util: CPU utilization factor.
    :param gpu_util: GPU utilization factor.
    :param config: Optional mapping providing ``CPU_FP_RATIO``,
        ``CPU_PEAK_FLOPS``, ``GPU_FP_RATIO`` and ``GPU_PEAK_FLOPS``. When
        omitted, the same names are read from module globals, which is the
        legacy behavior and requires them to have been injected beforehand.
    :return: Combined CPU + GPU FLOPS for the node.
    """
    if config is None:
        # Legacy path: constants must already exist as module-level globals.
        return CPU_FP_RATIO * cpu_util * CPU_PEAK_FLOPS + GPU_FP_RATIO * gpu_util * GPU_PEAK_FLOPS

    cpu_flops = config['CPU_FP_RATIO'] * cpu_util * config['CPU_PEAK_FLOPS']
    gpu_flops = config['GPU_FP_RATIO'] * gpu_util * config['GPU_PEAK_FLOPS']
    return cpu_flops + gpu_flops


class FLOPSManager():
    """Track per-node FLOPS for a system described by a configuration mapping.

    All system constants are read from ``self.config``; nothing is written to
    module globals, so several managers with different configurations can
    coexist in one process.
    """

    def __init__(self, **config):
        """Store the configuration and allocate a zeroed FLOPS state array.

        :param config: System configuration passed as keyword arguments.
            ``SC_SHAPE`` is required here; the other methods read the keys
            they need when called.
        """
        self.config = config
        self.flop_state = np.zeros(self.config['SC_SHAPE'])

    def update_flop_state(self, scheduled_nodes, cpu_util, gpu_util):
        """Set the FLOPS of the scheduled nodes from their utilization.

        :param scheduled_nodes: Linear node indices; converted to indices into
            ``flop_state`` by ``linear_to_3d_index`` using ``SC_SHAPE``.
        :param cpu_util: CPU utilization factor.
        :param gpu_util: GPU utilization factor.
        """
        cfg = self.config
        node_indices = linear_to_3d_index(scheduled_nodes, cfg['SC_SHAPE'])
        self.flop_state[node_indices] = (
            cfg['CPU_FP_RATIO'] * cpu_util * cfg['CPU_PEAK_FLOPS']
            + cfg['GPU_FP_RATIO'] * gpu_util * cfg['GPU_PEAK_FLOPS']
        )

    def get_rpeak(self):
        """Return the peak FLOPS of the system.

        Computed as ``AVAILABLE_NODES`` times the per-node peak, where the
        per-node peak is ``CPUS_PER_NODE * CPU_PEAK_FLOPS`` plus
        ``GPUS_PER_NODE * GPU_PEAK_FLOPS``.
        """
        cfg = self.config
        node_peak_flops = cfg['CPUS_PER_NODE'] * cfg['CPU_PEAK_FLOPS'] \
                        + cfg['GPUS_PER_NODE'] * cfg['GPU_PEAK_FLOPS']
        return cfg['AVAILABLE_NODES'] * node_peak_flops

    def get_system_performance(self):
        """Return the sum of the current FLOPS state over all nodes."""
        return np.sum(self.flop_state)


if __name__ == "__main__":
    # Smoke test: build a manager and print the shape of its state array.
    # The class is FLOPSManager and its constructor only accepts keyword
    # configuration. The shape below is illustrative; real values come from
    # the system configuration.
    fm = FLOPSManager(SC_SHAPE=(1, 1, 1))
    print(fm.flop_state.shape)
+63 −79
Original line number Diff line number Diff line
@@ -6,8 +6,7 @@ Classes:
- PowerManager: Manages power consumption and loss calculations in the system.

Functions:
- sivoc_loss: Calculate the power input required considering Sivoc power loss.
- rectifier_loss: Calculate the power input required considering Rectifier power loss.
- compute_loss: Linear loss model
- compute_node_power: Calculate the total power consumption for given CPU and GPU utilization.
- compute_node_power_validate: Calculate the total power consumption for a given mean and standard deviation of node power.
"""
@@ -36,19 +35,11 @@ uf.Variable.__repr__ = custom_repr_uncertainties
uf.Variable.__format__ = custom_format_uncertainties


def sivoc_loss(p_out, config=None):
    """Calculate the power input required considering Sivoc power loss.

    :param p_out: Output power.
    :param config: Optional mapping providing ``SIVOC_LOSS_CONSTANT`` and
        ``SIVOC_EFFICIENCY``. When omitted, the same names are read from
        module globals, which is the legacy behavior and requires them to
        have been injected beforehand.
    :return: ``(p_out + SIVOC_LOSS_CONSTANT) / SIVOC_EFFICIENCY``.
    """
    if config is None:
        # Legacy path: constants must already exist as module-level globals.
        return (p_out + SIVOC_LOSS_CONSTANT) / SIVOC_EFFICIENCY
    return (p_out + config['SIVOC_LOSS_CONSTANT']) / config['SIVOC_EFFICIENCY']
def compute_loss(p_out, loss_constant, efficiency):
    """Linear loss model: input power needed to deliver ``p_out``.

    :param p_out: Output power.
    :param loss_constant: Constant loss added to the output power.
    :param efficiency: Efficiency the padded output is divided by.
    :return: ``(p_out + loss_constant) / efficiency``.
    """
    padded_output = p_out + loss_constant
    p_in = padded_output / efficiency
    return p_in


def rectifier_loss(p_out, config=None):
    """Calculate the power input required considering Rectifier power loss.

    :param p_out: Output power.
    :param config: Optional mapping providing ``RECTIFIER_LOSS_CONSTANT`` and
        ``RECTIFIER_EFFICIENCY``. When omitted, the same names are read from
        module globals, which is the legacy behavior and requires them to
        have been injected beforehand.
    :return: ``(p_out + RECTIFIER_LOSS_CONSTANT) / RECTIFIER_EFFICIENCY``.
    """
    if config is None:
        # Legacy path: constants must already exist as module-level globals.
        return (p_out + RECTIFIER_LOSS_CONSTANT) / RECTIFIER_EFFICIENCY
    return (p_out + config['RECTIFIER_LOSS_CONSTANT']) / config['RECTIFIER_EFFICIENCY']


def compute_node_power(cpu_util, gpu_util, net_util, verbose=False):
def compute_node_power(cpu_util, gpu_util, net_util, config):
    """
    Calculate the total power consumption for given CPU and GPU utilization.

@@ -57,29 +48,30 @@ def compute_node_power(cpu_util, gpu_util, net_util, verbose=False):
    :param verbose: Flag for verbose output.
    :return: Total power consumption after accounting for power loss.
    """
    power_cpu = cpu_util * POWER_CPU_MAX + (CPUS_PER_NODE - cpu_util) * POWER_CPU_IDLE
    power_gpu = gpu_util * POWER_GPU_MAX + (GPUS_PER_NODE - gpu_util) * POWER_GPU_IDLE
    power_cpu = cpu_util * config['POWER_CPU_MAX'] + \
                (config['CPUS_PER_NODE'] - cpu_util) * config['POWER_CPU_IDLE']

    power_gpu = gpu_util * config['POWER_GPU_MAX'] + \
                (config['GPUS_PER_NODE'] - gpu_util) * config['POWER_GPU_IDLE']

    try: 
        power_nic = POWER_NIC_IDLE + (POWER_NIC_MAX - POWER_NIC_IDLE) * net_util
        power_nic = config['POWER_NIC_IDLE'] + \
                    (config['POWER_NIC_MAX'] - config['POWER_NIC_IDLE']) * net_util
    except:
        power_nic = POWER_NIC
        power_nic = config['POWER_NIC']

    power_total = power_cpu + power_gpu + POWER_MEM + NICS_PER_NODE * power_nic + POWER_NVME
    power_total = power_cpu + power_gpu + config['POWER_MEM'] + \
                  config['NICS_PER_NODE'] * power_nic + config['POWER_NVME']

    # Apply power loss due to Sivoc and Rectifier
    power_with_sivoc_loss = sivoc_loss(power_total)
    power_with_sivoc_loss = compute_loss(power_total, config['SIVOC_LOSS_CONSTANT'], \
                                                      config['SIVOC_EFFICIENCY'])
    power_sivoc_loss_only = power_with_sivoc_loss - power_total

    if verbose:
        print(f"*** Power (CPU + GPU + MEM + NICS): {power_total}")
        print(f"*** SIVOC loss: {power_sivoc_loss_only}")
        print(f"*** Power before SIVOC loss: {power_with_sivoc_loss}")

    return power_with_sivoc_loss, power_sivoc_loss_only


def compute_node_power_uncertainties(cpu_util, gpu_util, verbose=False):
def compute_node_power_uncertainties(cpu_util, gpu_util, net_util, config):
    """
    Calculate the total power consumption for given CPU and GPU utilization.

@@ -89,32 +81,27 @@ def compute_node_power_uncertainties(cpu_util, gpu_util, verbose=False):
    :return: Total power consumption after accounting for power loss.
    """
    power_cpu = cpu_util \
                * uf.ufloat(POWER_CPU_MAX, POWER_CPU_MAX * POWER_CPU_UNCERTAINTY) \
                + (CPUS_PER_NODE - cpu_util) \
                * uf.ufloat(POWER_CPU_IDLE, POWER_CPU_IDLE * POWER_CPU_UNCERTAINTY)
                * uf.ufloat(config['POWER_CPU_MAX'], config['POWER_CPU_MAX'] * config['POWER_CPU_UNCERTAINTY']) \
                + (config['CPUS_PER_NODE'] - cpu_util) \
                * uf.ufloat(config['POWER_CPU_IDLE'], config['POWER_CPU_IDLE'] * config['POWER_CPU_UNCERTAINTY'])
    power_gpu = gpu_util \
                * uf.ufloat(POWER_GPU_MAX, POWER_GPU_MAX * POWER_GPU_UNCERTAINTY) \
                + (GPUS_PER_NODE - gpu_util) \
                * uf.ufloat(POWER_GPU_IDLE, POWER_GPU_IDLE * POWER_GPU_UNCERTAINTY)
                * uf.ufloat(config['POWER_GPU_MAX'], config['POWER_GPU_MAX'] * config['POWER_GPU_UNCERTAINTY']) \
                + (config['GPUS_PER_NODE'] - gpu_util) \
                * uf.ufloat(config['POWER_GPU_IDLE'], config['POWER_GPU_IDLE'] * config['POWER_GPU_UNCERTAINTY'])

    power_total = power_cpu + power_gpu \
                  + uf.ufloat(POWER_MEM, POWER_MEM * POWER_MEM_UNCERTAINTY) \
                  + NICS_PER_NODE * uf.ufloat(POWER_NIC, POWER_NIC * POWER_NIC_UNCERTAINTY) \
                  + uf.ufloat(POWER_NVME, POWER_NVME * POWER_NVME_UNCERTAINTY)
                  + uf.ufloat(config['POWER_MEM'], config['POWER_MEM'] * config['POWER_MEM_UNCERTAINTY']) \
                  + config['NICS_PER_NODE'] * uf.ufloat(config['POWER_NIC'], config['POWER_NIC'] * config['POWER_NIC_UNCERTAINTY']) \
                  + uf.ufloat(config['POWER_NVME'], config['POWER_NVME'] * config['POWER_NVME_UNCERTAINTY'])

    # Apply power loss due to Sivoc and Rectifier
    power_with_sivoc_loss = sivoc_loss(power_total)
    power_with_sivoc_loss = compute_loss(power_total, config['SIVOC_LOSS_CONSTANT'], config['SIVOC_EFFICIENCY'])
    power_sivoc_loss_only = power_with_sivoc_loss - power_total

    if verbose:
        print(f"*** Power (CPU + GPU + MEM + NICS): {power_total}")
        print(f"*** SIVOC loss: {power_sivoc_loss_only}")
        print(f"*** Power before SIVOC loss: {power_with_sivoc_loss}")

    return power_with_sivoc_loss, power_sivoc_loss_only


def compute_node_power_validate(mean_node_power, stddev_node_power, verbose=False):
def compute_node_power_validate(mean_node_power, stddev_node_power, net_util, config):
    """
    Calculate the total power consumption for given mean and standard deviation of node power.

@@ -131,16 +118,12 @@ def compute_node_power_validate(mean_node_power, stddev_node_power, verbose=Fals
        Total power consumption after accounting for power loss and Sivoc loss.
    """
    power_total = mean_node_power
    power_with_sivoc_loss = sivoc_loss(power_total)
    power_with_sivoc_loss = compute_loss(power_total, config['SIVOC_LOSS_CONSTANT'], config['SIVOC_EFFICIENCY'])
    power_sivoc_loss_only = power_with_sivoc_loss - power_total
    if verbose:
        print(f"*** Power (CPU + GPU + MEM + NICS): {power_total}")
        print(f"*** SIVOC loss: {power_sivoc_loss_only}")
        print(f"*** Power before SIVOC loss: {power_with_sivoc_loss}")
    return power_with_sivoc_loss, power_sivoc_loss_only


def compute_node_power_validate_uncertainties(mean_node_power, stddev_node_power, verbose=False):
def compute_node_power_validate_uncertainties(mean_node_power, stddev_node_power, net_util, config):
    """
    Calculate the total power consumption for given mean and standard deviation of node power.

@@ -156,13 +139,9 @@ def compute_node_power_validate_uncertainties(mean_node_power, stddev_node_power
    tuple
        Total power consumption after accounting for power loss and Sivoc loss.
    """
    power_total = uf.ufloat(mean_node_power, mean_node_power * POWER_NODE_UNCERTAINTY)
    power_with_sivoc_loss = sivoc_loss(power_total)
    power_total = uf.ufloat(mean_node_power, mean_node_power * config['POWER_NODE_UNCERTAINTY'])
    power_with_sivoc_loss = compute_loss(power_total, config['SIVOC_LOSS_CONSTANT'], config['SIVOC_EFFICIENCY'])
    power_sivoc_loss_only = power_with_sivoc_loss - power_total
    if verbose:
        print(f"*** Power (CPU + GPU + MEM + NICS): {power_total}")
        print(f"*** SIVOC loss: {power_sivoc_loss_only}")
        print(f"*** Power before SIVOC loss: {power_with_sivoc_loss}")
    return power_with_sivoc_loss, power_sivoc_loss_only


@@ -196,7 +175,7 @@ class PowerManager:
        """
        self.sc_shape = config.get('SC_SHAPE')
        self.down_nodes = config.get('DOWN_NODES')
        globals().update(config)
        self.config = config
        self.power_func = power_func
        self.power_state = self.initialize_power_state()
        self.rectifier_loss = self.initialize_rectifier_loss()
@@ -211,35 +190,38 @@ class PowerManager:

    def get_peak_power(self):
        """Estimate peak power of system for setting max value of gauges in dashboard.

        Starts from the power of one node with every CPU and GPU fully
        utilized, scales it up to the load on one rectifier, applies the
        rectifier loss, and then aggregates to chassis, rack and system
        level, adding switch and CDU power. All constants are read from
        ``self.config``.

        :return: Estimated total peak power of the system.
        """
        cfg = self.config

        # compute_node_power requires the configuration mapping; element [0]
        # of its result is the node power including the SIVOC loss.
        node_power = compute_node_power(cfg['CPUS_PER_NODE'], cfg['GPUS_PER_NODE'],
                                        net_util=0, config=cfg)[0]

        blades_per_rectifier = cfg['BLADES_PER_CHASSIS'] / cfg['RECTIFIERS_PER_CHASSIS']
        rectifier_load = blades_per_rectifier * cfg['NODES_PER_BLADE'] * node_power

        # With AC-DC conversion losses
        rectifier_power = compute_loss(rectifier_load,
                                       cfg['RECTIFIER_LOSS_CONSTANT'],
                                       cfg['RECTIFIER_EFFICIENCY'])

        chassis_power = cfg['BLADES_PER_CHASSIS'] * rectifier_power / blades_per_rectifier \
                      + cfg['SWITCHES_PER_CHASSIS'] * cfg['POWER_SWITCH']
        rack_power = chassis_power * cfg['CHASSIS_PER_RACK']
        total_power = rack_power * cfg['NUM_RACKS'] + cfg['POWER_CDU'] * cfg['NUM_CDUS']
        return total_power

    def initialize_power_state(self):
        """Initialize the power state array with idle power consumption values.

        Calls ``self.power_func`` once with zero CPU, GPU and network
        utilization plus ``self.config`` and fills an array of shape
        ``self.sc_shape`` with the first element of its result.
        """
        idle_power, _ = self.power_func(0, 0, 0, self.config)
        return np.full(self.sc_shape, idle_power)

    def initialize_sivoc_loss(self):
        """Initialize the Sivoc loss array with idle power consumption values.

        Calls ``self.power_func`` once with zero CPU, GPU and network
        utilization plus ``self.config`` and fills an array of shape
        ``self.sc_shape`` with the second element of its result.
        """
        _, idle_sivoc_loss = self.power_func(0, 0, 0, self.config)
        return np.full(self.sc_shape, idle_sivoc_loss)

    def initialize_rectifier_loss(self):
        """Initialize the rectifier array from idle node power.

        The returned array has shape ``self.sc_shape`` and holds the idle
        node power after the rectifier loss model has been applied, i.e.
        power including the loss rather than the loss alone.
        """
        cfg = self.config
        initial_power, _ = self.power_func(0, 0, 0, cfg)

        # Rectifier loss curvefit is done at rectifier level, so we simply
        # approximate by scaling up to number of rectifiers, applying loss
        # and then dividing by number of rectifiers.
        # For Frontier there are four nodes per rectifier.
        nodes_per_rectifier = cfg['NODES_PER_RECTIFIER']
        power_with_loss = compute_loss(initial_power * nodes_per_rectifier,
                                       cfg['RECTIFIER_LOSS_CONSTANT'],
                                       cfg['RECTIFIER_EFFICIENCY']) \
                          / nodes_per_rectifier
        return np.full(self.sc_shape, power_with_loss)

    def apply_down_nodes(self):
@@ -259,7 +241,7 @@ class PowerManager:
        """
        node_indices = linear_to_3d_index(node_indices, self.sc_shape)
        self.power_state[node_indices], self.sivoc_loss[node_indices] \
            = compute_node_power(0, 0, 0)
            = compute_node_power(0, 0, 0, self.config)

    def update_power_state(self, scheduled_nodes, cpu_util, gpu_util, net_util):
        """
@@ -279,7 +261,7 @@ class PowerManager:
            Total power consumption of the scheduled nodes.
        """
        node_indices = linear_to_3d_index(scheduled_nodes, self.sc_shape)
        power_value, sivoc_loss = self.power_func(cpu_util, gpu_util, net_util)
        power_value, sivoc_loss = self.power_func(cpu_util, gpu_util, net_util, self.config)
        self.power_state[node_indices] = power_value
        self.sivoc_loss[node_indices] = sivoc_loss
        return power_value * len(scheduled_nodes)
@@ -296,8 +278,8 @@ class PowerManager:
        int
            Number of rectifiers needed.
        """
        value = int((power_state_summed - 1) // RECTIFIER_PEAK_THRESHOLD + 1)
        return min(value, RECTIFIERS_PER_CHASSIS)
        value = int((power_state_summed - 1) // self.config['RECTIFIER_PEAK_THRESHOLD'] + 1)
        return min(value, self.config['RECTIFIERS_PER_CHASSIS'])

    def compute_rack_power(self, smart_load_sharing=False):
        """
@@ -311,12 +293,12 @@ class PowerManager:
        tuple
            Tuple containing rack power (kW) and rectifier losses (kW).
        """
        shape = (self.sc_shape[0], self.sc_shape[1], CHASSIS_PER_RACK, -1)
        shape = (self.sc_shape[0], self.sc_shape[1], self.config['CHASSIS_PER_RACK'], -1)
        power_state_reshaped = np.reshape(self.power_state, shape)
        chassis_power = np.sum(power_state_reshaped, axis=-1)

        # Add in switch power
        chassis_power += SWITCHES_PER_CHASSIS * POWER_SWITCH
        chassis_power += self.config['SWITCHES_PER_CHASSIS'] * self.config['POWER_SWITCH']

        # Divide the power by the number of rectifiers and apply losses per rectifier
        # Smart load sharing dynamically stages rectifers as needed, e.g., when
@@ -327,7 +309,7 @@ class PowerManager:
            num_rectifiers_array = vectorized_function(chassis_power)

            # Initialize the array to hold the divided powers, using NaN for unused elements
            rectifier_power = np.full((*chassis_power.shape, RECTIFIERS_PER_CHASSIS), np.nan)
            rectifier_power = np.full((*chassis_power.shape, self.config['RECTIFIERS_PER_CHASSIS']), np.nan)
            power_with_losses = np.copy(rectifier_power)

            # Chassis_power.shape for Frontier is (25, 3, 8)
@@ -345,7 +327,9 @@ class PowerManager:
        else:
            divisor = np.array([4, 4, 4, 4]).reshape(1, 1, 1, 4)
            rectifier_power = chassis_power[:, :, :, np.newaxis] / divisor
            power_with_losses = rectifier_loss(rectifier_power)
            power_with_losses = compute_loss(rectifier_power, \
                                             self.config['RECTIFIER_LOSS_CONSTANT'], \
                                             self.config['RECTIFIER_EFFICIENCY'])

        # Compute just the losses
        rect_losses = power_with_losses - rectifier_power
@@ -353,9 +337,9 @@ class PowerManager:
        # Sum to 75 racks
        summed_power_with_losses = np.sum(power_with_losses/1000, axis=(2, 3))
        # Zero out power for missing racks
        for rack in MISSING_RACKS:
            cdu = rack // RACKS_PER_CDU
            rack2d = (cdu, rack % RACKS_PER_CDU)
        for rack in self.config['MISSING_RACKS']:
            cdu = rack // self.config['RACKS_PER_CDU']
            rack2d = (cdu, rack % self.config['RACKS_PER_CDU'])
            summed_power_with_losses[rack2d] = 0
        summed_rect_losses = np.sum(rect_losses/1000, axis=(2, 3))

@@ -400,7 +384,7 @@ class PowerManager:
    
    def get_power_df(self, rack_power, rack_loss):
        # Initialize the columns for power_df
        power_columns = POWER_DF_HEADER
        power_columns = self.config['POWER_DF_HEADER']
        power_data = []

        # Generate power_df
+22 −19
Original line number Diff line number Diff line
@@ -78,12 +78,11 @@ def get_utilization(trace, time_quanta_index):
class Scheduler:
    """Job scheduler and simulation manager."""
    def __init__(self, power_manager, flops_manager, layout_manager, cooling_model=None, **kwargs):
        config = kwargs.get('config')
        globals().update(config)
        self.down_nodes = summarize_ranges(DOWN_NODES)
        self.available_nodes = list(set(range(TOTAL_NODES)) - set(DOWN_NODES))
        self.config = kwargs.get('config')
        self.down_nodes = summarize_ranges(self.config['DOWN_NODES'])
        self.available_nodes = list(set(range(self.config['TOTAL_NODES'])) - set(self.config['DOWN_NODES']))
        self.num_free_nodes = len(self.available_nodes)
        self.num_active_nodes = TOTAL_NODES - self.num_free_nodes - len(DOWN_NODES)
        self.num_active_nodes = self.config['TOTAL_NODES'] - self.num_free_nodes - len(self.config['DOWN_NODES'])
        self.running = []
        self.queue = []
        self.jobs_completed = 0
@@ -177,11 +176,11 @@ class Scheduler:
                          is not None and job.end_time <= self.current_time]

        # Simulate node failure
        newly_downed_nodes = self.node_failure(MTBF)
        newly_downed_nodes = self.node_failure(self.config['MTBF'])

        # Update active/free nodes
        self.num_free_nodes = len(self.available_nodes)
        self.num_active_nodes = TOTAL_NODES - self.num_free_nodes \
        self.num_active_nodes = self.config['TOTAL_NODES'] - self.num_free_nodes \
                              - len(expand_ranges(self.down_nodes))

        # Update running time for all running jobs
@@ -209,7 +208,8 @@ class Scheduler:

                job.running_time = self.current_time - job.start_time

                time_quanta_index = (self.current_time - job.start_time) // TRACE_QUANTA
                time_quanta_index = (self.current_time - job.start_time) \
                                  // self.config['TRACE_QUANTA']

                cpu_util = get_utilization(job.cpu_trace, time_quanta_index)
                gpu_util = get_utilization(job.gpu_trace, time_quanta_index)
@@ -225,7 +225,7 @@ class Scheduler:
                job.power = self.power_manager.update_power_state(job.scheduled_nodes,
                                                                  cpu_util, gpu_util, net_util)

                if job.running_time % TRACE_QUANTA == 0:
                if job.running_time % self.config['TRACE_QUANTA'] == 0:
                    job.power_history.append(job.power)

        for job in completed_jobs:
@@ -269,7 +269,7 @@ class Scheduler:
        rack_loss = rect_losses + sivoc_losses

        # Update system utilization
        system_util = self.num_active_nodes / AVAILABLE_NODES * 100
        system_util = self.num_active_nodes / self.config['AVAILABLE_NODES'] * 100
        self.sys_util_history.append((self.current_time, system_util))

        # Render the updated layout
@@ -277,8 +277,8 @@ class Scheduler:
        cooling_inputs, cooling_outputs = None, None

        # Update power history every 15s
        if self.current_time % POWER_UPDATE_FREQ == 0:
            total_power_kw = sum(row[-1] for row in rack_power) + NUM_CDUS * POWER_CDU / 1000.0
        if self.current_time % self.config['POWER_UPDATE_FREQ'] == 0:
            total_power_kw = sum(row[-1] for row in rack_power) + self.config['NUM_CDUS'] * self.config['POWER_CDU'] / 1000.0
            total_loss_kw = sum(row[-1] for row in rack_loss)
            self.power_manager.history.append((self.current_time, total_power_kw))
            self.power_manager.loss_history.append((self.current_time, total_loss_kw))
@@ -290,7 +290,7 @@ class Scheduler:

        if self.cooling_model:

            if self.current_time % FMU_UPDATE_FREQ == 0:
            if self.current_time % self.config['FMU_UPDATE_FREQ'] == 0:
                # Power for NUM_CDUS (25 for Frontier)
                cdu_power = rack_power.T[-1] * 1000
                runtime_values = self.cooling_model.generate_runtime_values(cdu_power, self)
@@ -299,7 +299,7 @@ class Scheduler:
                fmu_inputs = self.cooling_model.generate_fmu_inputs(runtime_values, \
                             uncertainties=self.power_manager.uncertainties)
                cooling_inputs, cooling_outputs =\
                    self.cooling_model.step(self.current_time, fmu_inputs, FMU_UPDATE_FREQ)
                    self.cooling_model.step(self.current_time, fmu_inputs, self.config['FMU_UPDATE_FREQ'])
                
                # Get a dataframe of the power data
                power_df = self.power_manager.get_power_df(rack_power, rack_loss)
@@ -310,7 +310,7 @@ class Scheduler:
                               system_util, uncertainties=self.power_manager.uncertainties)
                    self.layout_manager.update_pressflow_array(cooling_outputs)

        if self.current_time % UI_UPDATE_FREQ == 0:
        if self.current_time % self.config['UI_UPDATE_FREQ'] == 0:
            # Get a dataframe of the power data
            power_df = self.power_manager.get_power_df(rack_power, rack_loss)

@@ -376,7 +376,7 @@ class Scheduler:
                print("stopping simulation at time", self.current_time)
                break
            if self.debug:
                if _ % UI_UPDATE_FREQ == 0:
                if _ % self.config['UI_UPDATE_FREQ'] == 0:
                    print(".", end="", flush=True)

    def run_simulation_blocking(self, jobs, timesteps):
@@ -408,7 +408,7 @@ class Scheduler:
        # From https://www.epa.gov/energy/greenhouse-gases-equivalencies-\
        #      calculator-calculations-and-references
        emissions = total_energy_consumed * 852.3 / 2204.6 / efficiency
        total_cost = total_energy_consumed * 1000 * POWER_COST # total cost in dollars
        total_cost = total_energy_consumed * 1000 * self.config['POWER_COST'] # total cost in dollars

        stats = {
            'num_samples': num_samples,
@@ -435,10 +435,13 @@ class Scheduler:

        # Create a NumPy array of node indices, excluding down nodes
        down_nodes = expand_ranges(self.down_nodes)
        all_nodes = np.setdiff1d(np.arange(TOTAL_NODES), np.array(down_nodes, dtype=int))
        all_nodes = np.setdiff1d(np.arange(self.config['TOTAL_NODES']), 
                                 np.array(down_nodes, dtype=int))

        # Sample the Weibull distribution for all nodes at once
        random_values = weibull_min.rvs(shape_parameter, scale=scale_parameter, size=all_nodes.size)
        random_values = weibull_min.rvs(shape_parameter, 
                                        scale=scale_parameter, 
                                        size=all_nodes.size)

        # Identify nodes that have failed
        failure_threshold = 0.1
+5 −6
Original line number Diff line number Diff line
@@ -21,15 +21,13 @@ if __name__ == "__main__":
    parser.add_argument('-v', '--verbose', action='store_true', help='Enable verbose output')
    args = parser.parse_args()

    from .config import is_config_initialized, initialize_config
    if not is_config_initialized():
        initialize_config(args.system)

import importlib
import numpy as np
import re
from datetime import datetime
from tqdm import tqdm

from .config import ConfigManager
from .scheduler import Job
from .plotting import plot_submit_times, plot_nodes_histogram
from .utils import next_arrival
@@ -42,7 +40,6 @@ class Telemetry:
        self.kwargs = kwargs
        self.system = kwargs.get('system')
        config = kwargs.get('config')
        globals().update(config)


    def save_snapshot(self, jobs: list, filename: str):
@@ -71,7 +68,9 @@ class Telemetry:
if __name__ == "__main__":

    args_dict = vars(args)
    args_dict['config'] = ConfigManager(system_name=args.system).get_config()
    td = Telemetry(**args_dict)
    JOB_ARRIVAL_TIME = 900

    if args.replay[0].endswith(".npz"):
        print(f"Loading {args.replay[0]}...")
@@ -79,7 +78,7 @@ if __name__ == "__main__":
        if args.reschedule:
            for job in tqdm(jobs, desc="Updating requested_nodes"):
                job['requested_nodes'] = None
                job['submit_time'] = next_arrival(1/JOB_ARRIVAL_TIME)
                job['submit_time'] = next_arrival(1/config['JOB_ARRIVAL_TIME'])
    else:
        jobs = td.load_data(args.replay)

+9 −7

File changed.

Preview size limit exceeded, changes collapsed.

Loading