Commit 728fc33c authored by Maiterth, Matthias
Browse files

Move node_failure() to resmgr.py

parent 5f0d7a64
Loading
Loading
Loading
Loading
+0 −1
Original line number Diff line number Diff line
@@ -3,7 +3,6 @@ import dataclasses
import pandas as pd

from .job import Job, JobState
from .account import Accounts
from .network import network_utilization
from .utils import summarize_ranges, expand_ranges, get_utilization
from .resmgr import ResourceManager
+8 −1
Original line number Diff line number Diff line
import numpy as np
from .job import JobState
from .utils import expand_ranges
from scipy.stats import weibull_min


@@ -58,10 +59,15 @@ class ResourceManager:
        scale_parameter = mtbf * 3600  # Convert to seconds

        # Create a NumPy array of node indices, excluding down nodes
        #print(self.down_nodes)
        #down_nodes = expand_ranges(self.down_nodes)
        #all_nodes = np.setdiff1d(np.arange(self.config['TOTAL_NODES']), np.array(self.down_nodes, dtype=int))
        all_nodes = np.array(sorted(set(range(self.total_nodes)) - set(self.down_nodes)))

        # Sample the Weibull distribution for all nodes at once
        random_values = weibull_min.rvs(shape_parameter, scale=scale_parameter, size=all_nodes.size)
        failure_threshold = 0.1
        failed_nodes = [node for node, r in zip(all_nodes, random_values) if r < failure_threshold]

        # Identify nodes that have failed
        failure_threshold = 0.1
@@ -72,6 +78,7 @@ class ResourceManager:
        for node_index in newly_downed_nodes:
            if node_index in self.available_nodes:
                self.available_nodes.remove(node_index)
            self.down_nodes.add(str(node_index))
            self.down_nodes.append(str(node_index))
            self.power_manager.set_idle(node_index)

        return newly_downed_nodes.tolist()