Commit 84f32bea authored by Brewer, Wes
Browse files

Move node_failure() to resmgr.py

parent c9b49c74
Loading
Loading
Loading
Loading
+1 −24
Original line number Diff line number Diff line
@@ -77,7 +77,7 @@ class Engine:
        completed_job_stats = []
        
        # Simulate node failure
        newly_downed_nodes = self.node_failure(self.config['MTBF'])
        newly_downed_nodes = self.resource_manager.node_failure(self.config['MTBF'])

        # Update active/free nodes
        self.num_free_nodes = len(self.resource_manager.available_nodes)
@@ -275,26 +275,3 @@ class Engine:
        }

        return stats


    def node_failure(self, mtbf):
        """Pick the operational nodes that fail now, via Weibull sampling.

        Every node index in ``range(self.config['TOTAL_NODES'])`` that is not
        already listed in ``self.down_nodes`` gets one Weibull sample; a node
        fails when its sample is below the fixed threshold. Failed nodes are
        dropped from the resource manager's available list, recorded (as
        strings) in ``self.down_nodes`` and set idle in the power manager.

        Args:
            mtbf: Mean time between failures; multiplied by 3600 to obtain
                the Weibull scale in seconds (presumably given in hours).

        Returns:
            list: Indices of the nodes that went down during this call.
        """
        from scipy.stats import weibull_min

        # Candidates are all node indices minus the ones already down.
        already_down = np.array(expand_ranges(self.down_nodes), dtype=int)
        candidates = np.setdiff1d(
            np.arange(self.config['TOTAL_NODES']),
            already_down,
        )

        # One draw per candidate: shape 1.5, scale = MTBF converted to seconds.
        draws = weibull_min.rvs(1.5, scale=mtbf * 3600, size=candidates.size)

        # A candidate fails when its draw falls under the 0.1 threshold.
        newly_downed_nodes = candidates[draws < 0.1]

        for failed in newly_downed_nodes:
            if failed in self.resource_manager.available_nodes:
                self.resource_manager.available_nodes.remove(failed)
            self.down_nodes.append(str(failed))
            self.power_manager.set_idle(failed)

        return newly_downed_nodes.tolist()
+34 −0
Original line number Diff line number Diff line
import numpy as np
from .job import JobState
from .utils import expand_ranges
from scipy.stats import weibull_min


class ResourceManager:
    def __init__(self, total_nodes, down_nodes):
@@ -48,3 +52,33 @@ class ResourceManager:
        utilization = (num_active_nodes / total_operational) * 100 if total_operational else 0
        self.sys_util_history.append((current_time, utilization))
        return utilization

    def node_failure(self, mtbf):
        """Simulate node failure using Weibull distribution."""
        shape_parameter = 1.5
        scale_parameter = mtbf * 3600  # Convert to seconds

        # Create a NumPy array of node indices, excluding down nodes
        #print(self.down_nodes)
        #down_nodes = expand_ranges(self.down_nodes)
        #all_nodes = np.setdiff1d(np.arange(self.config['TOTAL_NODES']), np.array(self.down_nodes, dtype=int))
        all_nodes = np.array(sorted(set(range(self.total_nodes)) - set(self.down_nodes)))

        # Sample the Weibull distribution for all nodes at once
        random_values = weibull_min.rvs(shape_parameter, scale=scale_parameter, size=all_nodes.size)
        failure_threshold = 0.1
        failed_nodes = [node for node, r in zip(all_nodes, random_values) if r < failure_threshold]

        # Identify nodes that have failed
        failure_threshold = 0.1
        failed_nodes_mask = random_values < failure_threshold
        newly_downed_nodes = all_nodes[failed_nodes_mask]

        # Update available and down nodes
        for node_index in newly_downed_nodes:
            if node_index in self.available_nodes:
                self.available_nodes.remove(node_index)
            self.down_nodes.append(str(node_index))
            self.power_manager.set_idle(node_index)

        return newly_downed_nodes.tolist()
+0 −1
Original line number Diff line number Diff line
@@ -11,7 +11,6 @@ DEFAULT_TIME = "1h"
# Define systems and their corresponding filenames
SYSTEMS = {
    "frontier": "frontier/slurm/joblive/date=2024-01-18 frontier/jobprofile/date=2024-01-18",
    "fugaku": "fugaku/21_04.parquet",
    "marconi100": "marconi100/job_table.parquet",
    "lassen": "lassen/Lassen-Supercomputer-Job-Dataset",
    "adastraMI250": "adastra/AdastaJobsMI250_15days.parquet"