Loading raps/flops.py +9 −15 Original line number Diff line number Diff line Loading @@ -2,29 +2,23 @@ import numpy as np from .utils import linear_to_3d_index def compute_node_flops(cpu_util, gpu_util): return CPU_FP_RATIO * cpu_util * CPU_PEAK_FLOPS + GPU_FP_RATIO * gpu_util * GPU_PEAK_FLOPS class FLOPSManager(): def __init__(self, **config): globals().update(config) self.flop_state = np.zeros(SC_SHAPE) self.config = config self.flop_state = np.zeros(self.config['SC_SHAPE']) def update_flop_state(self, scheduled_nodes, cpu_util, gpu_util): node_indices = linear_to_3d_index(scheduled_nodes, SC_SHAPE) self.flop_state[node_indices] = compute_node_flops(cpu_util, gpu_util) node_indices = linear_to_3d_index(scheduled_nodes, self.config['SC_SHAPE']) self.flop_state[node_indices] = \ self.config['CPU_FP_RATIO'] * cpu_util * self.config['CPU_PEAK_FLOPS'] + \ self.config['GPU_FP_RATIO'] * gpu_util * self.config['GPU_PEAK_FLOPS'] def get_rpeak(self): node_peak_flops = CPUS_PER_NODE*CPU_PEAK_FLOPS + GPUS_PER_NODE*GPU_PEAK_FLOPS system_peak_flops = AVAILABLE_NODES * node_peak_flops node_peak_flops = self.config['CPUS_PER_NODE'] * self.config['CPU_PEAK_FLOPS'] \ + self.config['GPUS_PER_NODE'] * self.config['GPU_PEAK_FLOPS'] system_peak_flops = self.config['AVAILABLE_NODES'] * node_peak_flops return system_peak_flops def get_system_performance(self): return np.sum(self.flop_state) if __name__ == "__main__": fm = FLOPManager(SC_SHAPE) print(fm.flop_state.shape) raps/power.py +63 −79 Original line number Diff line number Diff line Loading @@ -6,8 +6,7 @@ Classes: - PowerManager: Manages power consumption and loss calculations in the system. Functions: - sivoc_loss: Calculate the power input required considering Sivoc power loss. - rectifier_loss: Calculate the power input required considering Rectifier power loss. - compute_loss: Linear loss model - compute_node_power: Calculate the total power consumption for given CPU and GPU utilization. - compute_node_power_validate: Calculate the total power consumption for a given mean and standard deviation of node power. """ Loading Loading @@ -36,19 +35,11 @@ uf.Variable.__repr__ = custom_repr_uncertainties uf.Variable.__format__ = custom_format_uncertainties def sivoc_loss(p_out): """Calculate the power input required considering Sivoc power loss.""" p_in = (p_out + SIVOC_LOSS_CONSTANT) / SIVOC_EFFICIENCY return p_in def compute_loss(p_out, loss_constant, efficiency): return (p_out + loss_constant) / efficiency def rectifier_loss(p_out): """Calculate the power input required considering Rectifier power loss.""" p_in = (p_out + RECTIFIER_LOSS_CONSTANT) / RECTIFIER_EFFICIENCY return p_in def compute_node_power(cpu_util, gpu_util, net_util, verbose=False): def compute_node_power(cpu_util, gpu_util, net_util, config): """ Calculate the total power consumption for given CPU and GPU utilization. Loading @@ -57,29 +48,30 @@ def compute_node_power(cpu_util, gpu_util, net_util, verbose=False): :param verbose: Flag for verbose output. :return: Total power consumption after accounting for power loss. """ power_cpu = cpu_util * POWER_CPU_MAX + (CPUS_PER_NODE - cpu_util) * POWER_CPU_IDLE power_gpu = gpu_util * POWER_GPU_MAX + (GPUS_PER_NODE - gpu_util) * POWER_GPU_IDLE power_cpu = cpu_util * config['POWER_CPU_MAX'] + \ (config['CPUS_PER_NODE'] - cpu_util) * config['POWER_CPU_IDLE'] power_gpu = gpu_util * config['POWER_GPU_MAX'] + \ (config['GPUS_PER_NODE'] - gpu_util) * config['POWER_GPU_IDLE'] try: power_nic = POWER_NIC_IDLE + (POWER_NIC_MAX - POWER_NIC_IDLE) * net_util power_nic = config['POWER_NIC_IDLE'] + \ (config['POWER_NIC_MAX'] - config['POWER_NIC_IDLE']) * net_util except: power_nic = POWER_NIC power_nic = config['POWER_NIC'] power_total = power_cpu + power_gpu + POWER_MEM + NICS_PER_NODE * power_nic + POWER_NVME power_total = power_cpu + power_gpu + config['POWER_MEM'] + \ config['NICS_PER_NODE'] * power_nic + config['POWER_NVME'] # Apply power loss due to Sivoc and Rectifier power_with_sivoc_loss = sivoc_loss(power_total) power_with_sivoc_loss = compute_loss(power_total, config['SIVOC_LOSS_CONSTANT'], \ config['SIVOC_EFFICIENCY']) power_sivoc_loss_only = power_with_sivoc_loss - power_total if verbose: print(f"*** Power (CPU + GPU + MEM + NICS): {power_total}") print(f"*** SIVOC loss: {power_sivoc_loss_only}") print(f"*** Power before SIVOC loss: {power_with_sivoc_loss}") return power_with_sivoc_loss, power_sivoc_loss_only def compute_node_power_uncertainties(cpu_util, gpu_util, verbose=False): def compute_node_power_uncertainties(cpu_util, gpu_util, net_util, config): """ Calculate the total power consumption for given CPU and GPU utilization. Loading @@ -89,32 +81,27 @@ def compute_node_power_uncertainties(cpu_util, gpu_util, verbose=False): :return: Total power consumption after accounting for power loss. """ power_cpu = cpu_util \ * uf.ufloat(POWER_CPU_MAX, POWER_CPU_MAX * POWER_CPU_UNCERTAINTY) \ + (CPUS_PER_NODE - cpu_util) \ * uf.ufloat(POWER_CPU_IDLE, POWER_CPU_IDLE * POWER_CPU_UNCERTAINTY) * uf.ufloat(config['POWER_CPU_MAX'], config['POWER_CPU_MAX'] * config['POWER_CPU_UNCERTAINTY']) \ + (config['CPUS_PER_NODE'] - cpu_util) \ * uf.ufloat(config['POWER_CPU_IDLE'], config['POWER_CPU_IDLE'] * config['POWER_CPU_UNCERTAINTY']) power_gpu = gpu_util \ * uf.ufloat(POWER_GPU_MAX, POWER_GPU_MAX * POWER_GPU_UNCERTAINTY) \ + (GPUS_PER_NODE - gpu_util) \ * uf.ufloat(POWER_GPU_IDLE, POWER_GPU_IDLE * POWER_GPU_UNCERTAINTY) * uf.ufloat(config['POWER_GPU_MAX'], config['POWER_GPU_MAX'] * config['POWER_GPU_UNCERTAINTY']) \ + (config['GPUS_PER_NODE'] - gpu_util) \ * uf.ufloat(config['POWER_GPU_IDLE'], config['POWER_GPU_IDLE'] * config['POWER_GPU_UNCERTAINTY']) power_total = power_cpu + power_gpu \ + uf.ufloat(POWER_MEM, POWER_MEM * POWER_MEM_UNCERTAINTY) \ + NICS_PER_NODE * uf.ufloat(POWER_NIC, POWER_NIC * POWER_NIC_UNCERTAINTY) \ + uf.ufloat(POWER_NVME, POWER_NVME * POWER_NVME_UNCERTAINTY) + uf.ufloat(config['POWER_MEM'], config['POWER_MEM'] * config['POWER_MEM_UNCERTAINTY']) \ + config['NICS_PER_NODE'] * uf.ufloat(config['POWER_NIC'], config['POWER_NIC'] * config['POWER_NIC_UNCERTAINTY']) \ + uf.ufloat(config['POWER_NVME'], config['POWER_NVME'] * config['POWER_NVME_UNCERTAINTY']) # Apply power loss due to Sivoc and Rectifier power_with_sivoc_loss = sivoc_loss(power_total) power_with_sivoc_loss = compute_loss(power_total, config['SIVOC_LOSS_CONSTANT'], config['SIVOC_EFFICIENCY']) power_sivoc_loss_only = power_with_sivoc_loss - power_total if verbose: print(f"*** Power (CPU + GPU + MEM + NICS): {power_total}") print(f"*** SIVOC loss: {power_sivoc_loss_only}") print(f"*** Power before SIVOC loss: {power_with_sivoc_loss}") return power_with_sivoc_loss, power_sivoc_loss_only def compute_node_power_validate(mean_node_power, stddev_node_power, verbose=False): def compute_node_power_validate(mean_node_power, stddev_node_power, net_util, config): """ Calculate the total power consumption for given mean and standard deviation of node power. Loading @@ -131,16 +118,12 @@ def compute_node_power_validate(mean_node_power, stddev_node_power, verbose=Fals Total power consumption after accounting for power loss and Sivoc loss. """ power_total = mean_node_power power_with_sivoc_loss = sivoc_loss(power_total) power_with_sivoc_loss = compute_loss(power_total, config['SIVOC_LOSS_CONSTANT'], config['SIVOC_EFFICIENCY']) power_sivoc_loss_only = power_with_sivoc_loss - power_total if verbose: print(f"*** Power (CPU + GPU + MEM + NICS): {power_total}") print(f"*** SIVOC loss: {power_sivoc_loss_only}") print(f"*** Power before SIVOC loss: {power_with_sivoc_loss}") return power_with_sivoc_loss, power_sivoc_loss_only def compute_node_power_validate_uncertainties(mean_node_power, stddev_node_power, verbose=False): def compute_node_power_validate_uncertainties(mean_node_power, stddev_node_power, net_util, config): """ Calculate the total power consumption for given mean and standard deviation of node power. Loading @@ -156,13 +139,9 @@ def compute_node_power_validate_uncertainties(mean_node_power, stddev_node_power tuple Total power consumption after accounting for power loss and Sivoc loss. """ power_total = uf.ufloat(mean_node_power, mean_node_power * POWER_NODE_UNCERTAINTY) power_with_sivoc_loss = sivoc_loss(power_total) power_total = uf.ufloat(mean_node_power, mean_node_power * config['POWER_NODE_UNCERTAINTY']) power_with_sivoc_loss = compute_loss(power_total, config['SIVOC_LOSS_CONSTANT'], config['SIVOC_EFFICIENCY']) power_sivoc_loss_only = power_with_sivoc_loss - power_total if verbose: print(f"*** Power (CPU + GPU + MEM + NICS): {power_total}") print(f"*** SIVOC loss: {power_sivoc_loss_only}") print(f"*** Power before SIVOC loss: {power_with_sivoc_loss}") return power_with_sivoc_loss, power_sivoc_loss_only Loading Loading @@ -196,7 +175,7 @@ class PowerManager: """ self.sc_shape = config.get('SC_SHAPE') self.down_nodes = config.get('DOWN_NODES') globals().update(config) self.config = config self.power_func = power_func self.power_state = self.initialize_power_state() self.rectifier_loss = self.initialize_rectifier_loss() Loading @@ -211,35 +190,38 @@ class PowerManager: def get_peak_power(self): """Estimate peak power of system for setting max value of gauges in dashboard""" node_power = compute_node_power(CPUS_PER_NODE, GPUS_PER_NODE, net_util=0)[0] blades_per_rectifier = BLADES_PER_CHASSIS / RECTIFIERS_PER_CHASSIS rectifier_load = blades_per_rectifier * NODES_PER_BLADE * node_power rectifier_power = rectifier_loss(rectifier_load) # with AC-DC conversion losses chassis_power = BLADES_PER_CHASSIS * rectifier_power / blades_per_rectifier \ + SWITCHES_PER_CHASSIS * POWER_SWITCH rack_power = chassis_power * CHASSIS_PER_RACK total_power = rack_power * NUM_RACKS + POWER_CDU * NUM_CDUS node_power = compute_node_power(self.config['CPUS_PER_NODE'], self.config['GPUS_PER_NODE'], net_util=0)[0] blades_per_rectifier = self.config['BLADES_PER_CHASSIS'] / self.config['RECTIFIERS_PER_CHASSIS'] rectifier_load = blades_per_rectifier * self.config['NODES_PER_BLADE'] * node_power rectifier_power = compute_loss(rectifier_load, self.config['RECTIFIER_LOSS_CONSTANT'], \ self.config['RECTIFIER_EFFICIENCY']) # with AC-DC conversion losses chassis_power = self.config['BLADES_PER_CHASSIS'] * rectifier_power / blades_per_rectifier \ + self.config['SWITCHES_PER_CHASSIS'] * self.config['POWER_SWITCH'] rack_power = chassis_power * self.config['CHASSIS_PER_RACK'] total_power = rack_power * self.config['NUM_RACKS'] + self.config['POWER_CDU'] * self.config['NUM_CDUS'] return total_power def initialize_power_state(self): """Initialize the power state array with idle power consumption values.""" initial_power, _ = self.power_func(0, 0, 0) initial_power, _ = self.power_func(0, 0, 0, self.config) return np.full(self.sc_shape, initial_power) def initialize_sivoc_loss(self): """Initialize the Sivoc loss array with idle power consumption values.""" _, initial_sivoc_loss = self.power_func(0, 0, 0) _, initial_sivoc_loss = self.power_func(0, 0, 0, self.config) return np.full(self.sc_shape, initial_sivoc_loss) def initialize_rectifier_loss(self): """ Initialize the power state array """ initial_power, _ = self.power_func(0, 0, 0) initial_power, _ = self.power_func(0, 0, 0, self.config) # Rectifier loss curvefit is done at rectifier level, so we simply # approximate by scaling up to number of rectifiers, applying loss # and then dividing by number of rectifiers. # For Frontier there are four nodes per rectifier. power_with_loss = rectifier_loss(initial_power * NODES_PER_RECTIFIER) \ / NODES_PER_RECTIFIER power_with_loss = compute_loss(initial_power * self.config['NODES_PER_RECTIFIER'], \ self.config['RECTIFIER_LOSS_CONSTANT'], \ self.config['RECTIFIER_EFFICIENCY']) \ / self.config['NODES_PER_RECTIFIER'] return np.full(self.sc_shape, power_with_loss) def apply_down_nodes(self): Loading @@ -259,7 +241,7 @@ class PowerManager: """ node_indices = linear_to_3d_index(node_indices, self.sc_shape) self.power_state[node_indices], self.sivoc_loss[node_indices] \ = compute_node_power(0, 0, 0) = compute_node_power(0, 0, 0, self.config) def update_power_state(self, scheduled_nodes, cpu_util, gpu_util, net_util): """ Loading @@ -279,7 +261,7 @@ class PowerManager: Total power consumption of the scheduled nodes. """ node_indices = linear_to_3d_index(scheduled_nodes, self.sc_shape) power_value, sivoc_loss = self.power_func(cpu_util, gpu_util, net_util) power_value, sivoc_loss = self.power_func(cpu_util, gpu_util, net_util, self.config) self.power_state[node_indices] = power_value self.sivoc_loss[node_indices] = sivoc_loss return power_value * len(scheduled_nodes) Loading @@ -296,8 +278,8 @@ class PowerManager: int Number of rectifiers needed. """ value = int((power_state_summed - 1) // RECTIFIER_PEAK_THRESHOLD + 1) return min(value, RECTIFIERS_PER_CHASSIS) value = int((power_state_summed - 1) // self.config['RECTIFIER_PEAK_THRESHOLD'] + 1) return min(value, self.config['RECTIFIERS_PER_CHASSIS']) def compute_rack_power(self, smart_load_sharing=False): """ Loading @@ -311,12 +293,12 @@ class PowerManager: tuple Tuple containing rack power (kW) and rectifier losses (kW). """ shape = (self.sc_shape[0], self.sc_shape[1], CHASSIS_PER_RACK, -1) shape = (self.sc_shape[0], self.sc_shape[1], self.config['CHASSIS_PER_RACK'], -1) power_state_reshaped = np.reshape(self.power_state, shape) chassis_power = np.sum(power_state_reshaped, axis=-1) # Add in switch power chassis_power += SWITCHES_PER_CHASSIS * POWER_SWITCH chassis_power += self.config['SWITCHES_PER_CHASSIS'] * self.config['POWER_SWITCH'] # Divide the power by the number of rectifiers and apply losses per rectifier # Smart load sharing dynamically stages rectifers as needed, e.g., when Loading @@ -327,7 +309,7 @@ class PowerManager: num_rectifiers_array = vectorized_function(chassis_power) # Initialize the array to hold the divided powers, using NaN for unused elements rectifier_power = np.full((*chassis_power.shape, RECTIFIERS_PER_CHASSIS), np.nan) rectifier_power = np.full((*chassis_power.shape, self.config['RECTIFIERS_PER_CHASSIS']), np.nan) power_with_losses = np.copy(rectifier_power) # Chassis_power.shape for Frontier is (25, 3, 8) Loading @@ -345,7 +327,9 @@ class PowerManager: else: divisor = np.array([4, 4, 4, 4]).reshape(1, 1, 1, 4) rectifier_power = chassis_power[:, :, :, np.newaxis] / divisor power_with_losses = rectifier_loss(rectifier_power) power_with_losses = compute_loss(rectifier_power, \ self.config['RECTIFIER_LOSS_CONSTANT'], \ self.config['RECTIFIER_EFFICIENCY']) # Compute just the losses rect_losses = power_with_losses - rectifier_power Loading @@ -353,9 +337,9 @@ class PowerManager: # Sum to 75 racks summed_power_with_losses = np.sum(power_with_losses/1000, axis=(2, 3)) # Zero out power for missing racks for rack in MISSING_RACKS: cdu = rack // RACKS_PER_CDU rack2d = (cdu, rack % RACKS_PER_CDU) for rack in self.config['MISSING_RACKS']: cdu = rack // self.config['RACKS_PER_CDU'] rack2d = (cdu, rack % self.config['RACKS_PER_CDU']) summed_power_with_losses[rack2d] = 0 summed_rect_losses = np.sum(rect_losses/1000, axis=(2, 3)) Loading Loading @@ -400,7 +384,7 @@ class PowerManager: def get_power_df(self, rack_power, rack_loss): # Initialize the columns for power_df power_columns = POWER_DF_HEADER power_columns = self.config['POWER_DF_HEADER'] power_data = [] # Generate power_df Loading raps/scheduler.py +22 −19 Original line number Diff line number Diff line Loading @@ -78,12 +78,11 @@ def get_utilization(trace, time_quanta_index): class Scheduler: """Job scheduler and simulation manager.""" def __init__(self, power_manager, flops_manager, layout_manager, cooling_model=None, **kwargs): config = kwargs.get('config') globals().update(config) self.down_nodes = summarize_ranges(DOWN_NODES) self.available_nodes = list(set(range(TOTAL_NODES)) - set(DOWN_NODES)) self.config = kwargs.get('config') self.down_nodes = summarize_ranges(self.config['DOWN_NODES']) self.available_nodes = list(set(range(self.config['TOTAL_NODES'])) - set(self.config['DOWN_NODES'])) self.num_free_nodes = len(self.available_nodes) self.num_active_nodes = TOTAL_NODES - self.num_free_nodes - len(DOWN_NODES) self.num_active_nodes = self.config['TOTAL_NODES'] - self.num_free_nodes - len(self.config['DOWN_NODES']) self.running = [] self.queue = [] self.jobs_completed = 0 Loading Loading @@ -177,11 +176,11 @@ class Scheduler: is not None and job.end_time <= self.current_time] # Simulate node failure newly_downed_nodes = self.node_failure(MTBF) newly_downed_nodes = self.node_failure(self.config['MTBF']) # Update active/free nodes self.num_free_nodes = len(self.available_nodes) self.num_active_nodes = TOTAL_NODES - self.num_free_nodes \ self.num_active_nodes = self.config['TOTAL_NODES'] - self.num_free_nodes \ - len(expand_ranges(self.down_nodes)) # Update running time for all running jobs Loading Loading @@ -209,7 +208,8 @@ class Scheduler: job.running_time = self.current_time - job.start_time time_quanta_index = (self.current_time - job.start_time) // TRACE_QUANTA time_quanta_index = (self.current_time - job.start_time) \ // self.config['TRACE_QUANTA'] cpu_util = get_utilization(job.cpu_trace, time_quanta_index) gpu_util = get_utilization(job.gpu_trace, time_quanta_index) Loading @@ -225,7 +225,7 @@ class Scheduler: job.power = self.power_manager.update_power_state(job.scheduled_nodes, cpu_util, gpu_util, net_util) if job.running_time % TRACE_QUANTA == 0: if job.running_time % self.config['TRACE_QUANTA'] == 0: job.power_history.append(job.power) for job in completed_jobs: Loading Loading @@ -269,7 +269,7 @@ class Scheduler: rack_loss = rect_losses + sivoc_losses # Update system utilization system_util = self.num_active_nodes / AVAILABLE_NODES * 100 system_util = self.num_active_nodes / self.config['AVAILABLE_NODES'] * 100 self.sys_util_history.append((self.current_time, system_util)) # Render the updated layout Loading @@ -277,8 +277,8 @@ class Scheduler: cooling_inputs, cooling_outputs = None, None # Update power history every 15s if self.current_time % POWER_UPDATE_FREQ == 0: total_power_kw = sum(row[-1] for row in rack_power) + NUM_CDUS * POWER_CDU / 1000.0 if self.current_time % self.config['POWER_UPDATE_FREQ'] == 0: total_power_kw = sum(row[-1] for row in rack_power) + self.config['NUM_CDUS'] * self.config['POWER_CDU'] / 1000.0 total_loss_kw = sum(row[-1] for row in rack_loss) self.power_manager.history.append((self.current_time, total_power_kw)) self.power_manager.loss_history.append((self.current_time, total_loss_kw)) Loading @@ -290,7 +290,7 @@ class Scheduler: if self.cooling_model: if self.current_time % FMU_UPDATE_FREQ == 0: if self.current_time % self.config['FMU_UPDATE_FREQ'] == 0: # Power for NUM_CDUS (25 for Frontier) cdu_power = rack_power.T[-1] * 1000 runtime_values = self.cooling_model.generate_runtime_values(cdu_power, self) Loading @@ -299,7 +299,7 @@ class Scheduler: fmu_inputs = self.cooling_model.generate_fmu_inputs(runtime_values, \ uncertainties=self.power_manager.uncertainties) cooling_inputs, cooling_outputs =\ self.cooling_model.step(self.current_time, fmu_inputs, FMU_UPDATE_FREQ) self.cooling_model.step(self.current_time, fmu_inputs, self.config['FMU_UPDATE_FREQ']) # Get a dataframe of the power data power_df = self.power_manager.get_power_df(rack_power, rack_loss) Loading @@ -310,7 +310,7 @@ class Scheduler: system_util, uncertainties=self.power_manager.uncertainties) self.layout_manager.update_pressflow_array(cooling_outputs) if self.current_time % UI_UPDATE_FREQ == 0: if self.current_time % self.config['UI_UPDATE_FREQ'] == 0: # Get a dataframe of the power data power_df = self.power_manager.get_power_df(rack_power, rack_loss) Loading Loading @@ -376,7 +376,7 @@ class Scheduler: print("stopping simulation at time", self.current_time) break if self.debug: if _ % UI_UPDATE_FREQ == 0: if _ % self.config['UI_UPDATE_FREQ'] == 0: print(".", end="", flush=True) def run_simulation_blocking(self, jobs, timesteps): Loading Loading @@ -408,7 +408,7 @@ class Scheduler: # From https://www.epa.gov/energy/greenhouse-gases-equivalencies-\ # calculator-calculations-and-references emissions = total_energy_consumed * 852.3 / 2204.6 / efficiency total_cost = total_energy_consumed * 1000 * POWER_COST # total cost in dollars total_cost = total_energy_consumed * 1000 * self.config['POWER_COST'] # total cost in dollars stats = { 'num_samples': num_samples, Loading @@ -435,10 +435,13 @@ class Scheduler: # Create a NumPy array of node indices, excluding down nodes down_nodes = expand_ranges(self.down_nodes) all_nodes = np.setdiff1d(np.arange(TOTAL_NODES), np.array(down_nodes, dtype=int)) all_nodes = np.setdiff1d(np.arange(self.config['TOTAL_NODES']), np.array(down_nodes, dtype=int)) # Sample the Weibull distribution for all nodes at once random_values = weibull_min.rvs(shape_parameter, scale=scale_parameter, size=all_nodes.size) random_values = weibull_min.rvs(shape_parameter, scale=scale_parameter, size=all_nodes.size) # Identify nodes that have failed failure_threshold = 0.1 Loading raps/telemetry.py +5 −6 Original line number Diff line number Diff line Loading @@ -21,15 +21,13 @@ if __name__ == "__main__": parser.add_argument('-v', '--verbose', action='store_true', help='Enable verbose output') args = parser.parse_args() from .config import is_config_initialized, initialize_config if not is_config_initialized(): initialize_config(args.system) import importlib import numpy as np import re from datetime import datetime from tqdm import tqdm from .config import ConfigManager from .scheduler import Job from .plotting import plot_submit_times, plot_nodes_histogram from .utils import next_arrival Loading @@ -42,7 +40,6 @@ class Telemetry: self.kwargs = kwargs self.system = kwargs.get('system') config = kwargs.get('config') globals().update(config) def save_snapshot(self, jobs: list, filename: str): Loading Loading @@ -71,7 +68,9 @@ class Telemetry: if __name__ == "__main__": args_dict = vars(args) args_dict['config'] = ConfigManager(system_name=args.system).get_config() td = Telemetry(**args_dict) JOB_ARRIVAL_TIME = 900 if args.replay[0].endswith(".npz"): print(f"Loading {args.replay[0]}...") Loading @@ -79,7 +78,7 @@ if __name__ == "__main__": if args.reschedule: for job in tqdm(jobs, desc="Updating requested_nodes"): job['requested_nodes'] = None job['submit_time'] = next_arrival(1/JOB_ARRIVAL_TIME) job['submit_time'] = next_arrival(1/config['JOB_ARRIVAL_TIME']) else: jobs = td.load_data(args.replay) Loading raps/ui.py +9 −7 File changed.Preview size limit exceeded, changes collapsed. Show changes Loading
raps/flops.py +9 −15 Original line number Diff line number Diff line Loading @@ -2,29 +2,23 @@ import numpy as np from .utils import linear_to_3d_index def compute_node_flops(cpu_util, gpu_util): return CPU_FP_RATIO * cpu_util * CPU_PEAK_FLOPS + GPU_FP_RATIO * gpu_util * GPU_PEAK_FLOPS class FLOPSManager(): def __init__(self, **config): globals().update(config) self.flop_state = np.zeros(SC_SHAPE) self.config = config self.flop_state = np.zeros(self.config['SC_SHAPE']) def update_flop_state(self, scheduled_nodes, cpu_util, gpu_util): node_indices = linear_to_3d_index(scheduled_nodes, SC_SHAPE) self.flop_state[node_indices] = compute_node_flops(cpu_util, gpu_util) node_indices = linear_to_3d_index(scheduled_nodes, self.config['SC_SHAPE']) self.flop_state[node_indices] = \ self.config['CPU_FP_RATIO'] * cpu_util * self.config['CPU_PEAK_FLOPS'] + \ self.config['GPU_FP_RATIO'] * gpu_util * self.config['GPU_PEAK_FLOPS'] def get_rpeak(self): node_peak_flops = CPUS_PER_NODE*CPU_PEAK_FLOPS + GPUS_PER_NODE*GPU_PEAK_FLOPS system_peak_flops = AVAILABLE_NODES * node_peak_flops node_peak_flops = self.config['CPUS_PER_NODE'] * self.config['CPU_PEAK_FLOPS'] \ + self.config['GPUS_PER_NODE'] * self.config['GPU_PEAK_FLOPS'] system_peak_flops = self.config['AVAILABLE_NODES'] * node_peak_flops return system_peak_flops def get_system_performance(self): return np.sum(self.flop_state) if __name__ == "__main__": fm = FLOPManager(SC_SHAPE) print(fm.flop_state.shape)
raps/power.py +63 −79 Original line number Diff line number Diff line Loading @@ -6,8 +6,7 @@ Classes: - PowerManager: Manages power consumption and loss calculations in the system. Functions: - sivoc_loss: Calculate the power input required considering Sivoc power loss. - rectifier_loss: Calculate the power input required considering Rectifier power loss. - compute_loss: Linear loss model - compute_node_power: Calculate the total power consumption for given CPU and GPU utilization. - compute_node_power_validate: Calculate the total power consumption for a given mean and standard deviation of node power. """ Loading Loading @@ -36,19 +35,11 @@ uf.Variable.__repr__ = custom_repr_uncertainties uf.Variable.__format__ = custom_format_uncertainties def sivoc_loss(p_out): """Calculate the power input required considering Sivoc power loss.""" p_in = (p_out + SIVOC_LOSS_CONSTANT) / SIVOC_EFFICIENCY return p_in def compute_loss(p_out, loss_constant, efficiency): return (p_out + loss_constant) / efficiency def rectifier_loss(p_out): """Calculate the power input required considering Rectifier power loss.""" p_in = (p_out + RECTIFIER_LOSS_CONSTANT) / RECTIFIER_EFFICIENCY return p_in def compute_node_power(cpu_util, gpu_util, net_util, verbose=False): def compute_node_power(cpu_util, gpu_util, net_util, config): """ Calculate the total power consumption for given CPU and GPU utilization. Loading @@ -57,29 +48,30 @@ def compute_node_power(cpu_util, gpu_util, net_util, verbose=False): :param verbose: Flag for verbose output. :return: Total power consumption after accounting for power loss. """ power_cpu = cpu_util * POWER_CPU_MAX + (CPUS_PER_NODE - cpu_util) * POWER_CPU_IDLE power_gpu = gpu_util * POWER_GPU_MAX + (GPUS_PER_NODE - gpu_util) * POWER_GPU_IDLE power_cpu = cpu_util * config['POWER_CPU_MAX'] + \ (config['CPUS_PER_NODE'] - cpu_util) * config['POWER_CPU_IDLE'] power_gpu = gpu_util * config['POWER_GPU_MAX'] + \ (config['GPUS_PER_NODE'] - gpu_util) * config['POWER_GPU_IDLE'] try: power_nic = POWER_NIC_IDLE + (POWER_NIC_MAX - POWER_NIC_IDLE) * net_util power_nic = config['POWER_NIC_IDLE'] + \ (config['POWER_NIC_MAX'] - config['POWER_NIC_IDLE']) * net_util except: power_nic = POWER_NIC power_nic = config['POWER_NIC'] power_total = power_cpu + power_gpu + POWER_MEM + NICS_PER_NODE * power_nic + POWER_NVME power_total = power_cpu + power_gpu + config['POWER_MEM'] + \ config['NICS_PER_NODE'] * power_nic + config['POWER_NVME'] # Apply power loss due to Sivoc and Rectifier power_with_sivoc_loss = sivoc_loss(power_total) power_with_sivoc_loss = compute_loss(power_total, config['SIVOC_LOSS_CONSTANT'], \ config['SIVOC_EFFICIENCY']) power_sivoc_loss_only = power_with_sivoc_loss - power_total if verbose: print(f"*** Power (CPU + GPU + MEM + NICS): {power_total}") print(f"*** SIVOC loss: {power_sivoc_loss_only}") print(f"*** Power before SIVOC loss: {power_with_sivoc_loss}") return power_with_sivoc_loss, power_sivoc_loss_only def compute_node_power_uncertainties(cpu_util, gpu_util, verbose=False): def compute_node_power_uncertainties(cpu_util, gpu_util, net_util, config): """ Calculate the total power consumption for given CPU and GPU utilization. Loading @@ -89,32 +81,27 @@ def compute_node_power_uncertainties(cpu_util, gpu_util, verbose=False): :return: Total power consumption after accounting for power loss. """ power_cpu = cpu_util \ * uf.ufloat(POWER_CPU_MAX, POWER_CPU_MAX * POWER_CPU_UNCERTAINTY) \ + (CPUS_PER_NODE - cpu_util) \ * uf.ufloat(POWER_CPU_IDLE, POWER_CPU_IDLE * POWER_CPU_UNCERTAINTY) * uf.ufloat(config['POWER_CPU_MAX'], config['POWER_CPU_MAX'] * config['POWER_CPU_UNCERTAINTY']) \ + (config['CPUS_PER_NODE'] - cpu_util) \ * uf.ufloat(config['POWER_CPU_IDLE'], config['POWER_CPU_IDLE'] * config['POWER_CPU_UNCERTAINTY']) power_gpu = gpu_util \ * uf.ufloat(POWER_GPU_MAX, POWER_GPU_MAX * POWER_GPU_UNCERTAINTY) \ + (GPUS_PER_NODE - gpu_util) \ * uf.ufloat(POWER_GPU_IDLE, POWER_GPU_IDLE * POWER_GPU_UNCERTAINTY) * uf.ufloat(config['POWER_GPU_MAX'], config['POWER_GPU_MAX'] * config['POWER_GPU_UNCERTAINTY']) \ + (config['GPUS_PER_NODE'] - gpu_util) \ * uf.ufloat(config['POWER_GPU_IDLE'], config['POWER_GPU_IDLE'] * config['POWER_GPU_UNCERTAINTY']) power_total = power_cpu + power_gpu \ + uf.ufloat(POWER_MEM, POWER_MEM * POWER_MEM_UNCERTAINTY) \ + NICS_PER_NODE * uf.ufloat(POWER_NIC, POWER_NIC * POWER_NIC_UNCERTAINTY) \ + uf.ufloat(POWER_NVME, POWER_NVME * POWER_NVME_UNCERTAINTY) + uf.ufloat(config['POWER_MEM'], config['POWER_MEM'] * config['POWER_MEM_UNCERTAINTY']) \ + config['NICS_PER_NODE'] * uf.ufloat(config['POWER_NIC'], config['POWER_NIC'] * config['POWER_NIC_UNCERTAINTY']) \ + uf.ufloat(config['POWER_NVME'], config['POWER_NVME'] * config['POWER_NVME_UNCERTAINTY']) # Apply power loss due to Sivoc and Rectifier power_with_sivoc_loss = sivoc_loss(power_total) power_with_sivoc_loss = compute_loss(power_total, config['SIVOC_LOSS_CONSTANT'], config['SIVOC_EFFICIENCY']) power_sivoc_loss_only = power_with_sivoc_loss - power_total if verbose: print(f"*** Power (CPU + GPU + MEM + NICS): {power_total}") print(f"*** SIVOC loss: {power_sivoc_loss_only}") print(f"*** Power before SIVOC loss: {power_with_sivoc_loss}") return power_with_sivoc_loss, power_sivoc_loss_only def compute_node_power_validate(mean_node_power, stddev_node_power, verbose=False): def compute_node_power_validate(mean_node_power, stddev_node_power, net_util, config): """ Calculate the total power consumption for given mean and standard deviation of node power. Loading @@ -131,16 +118,12 @@ def compute_node_power_validate(mean_node_power, stddev_node_power, verbose=Fals Total power consumption after accounting for power loss and Sivoc loss. """ power_total = mean_node_power power_with_sivoc_loss = sivoc_loss(power_total) power_with_sivoc_loss = compute_loss(power_total, config['SIVOC_LOSS_CONSTANT'], config['SIVOC_EFFICIENCY']) power_sivoc_loss_only = power_with_sivoc_loss - power_total if verbose: print(f"*** Power (CPU + GPU + MEM + NICS): {power_total}") print(f"*** SIVOC loss: {power_sivoc_loss_only}") print(f"*** Power before SIVOC loss: {power_with_sivoc_loss}") return power_with_sivoc_loss, power_sivoc_loss_only def compute_node_power_validate_uncertainties(mean_node_power, stddev_node_power, verbose=False): def compute_node_power_validate_uncertainties(mean_node_power, stddev_node_power, net_util, config): """ Calculate the total power consumption for given mean and standard deviation of node power. Loading @@ -156,13 +139,9 @@ def compute_node_power_validate_uncertainties(mean_node_power, stddev_node_power tuple Total power consumption after accounting for power loss and Sivoc loss. """ power_total = uf.ufloat(mean_node_power, mean_node_power * POWER_NODE_UNCERTAINTY) power_with_sivoc_loss = sivoc_loss(power_total) power_total = uf.ufloat(mean_node_power, mean_node_power * config['POWER_NODE_UNCERTAINTY']) power_with_sivoc_loss = compute_loss(power_total, config['SIVOC_LOSS_CONSTANT'], config['SIVOC_EFFICIENCY']) power_sivoc_loss_only = power_with_sivoc_loss - power_total if verbose: print(f"*** Power (CPU + GPU + MEM + NICS): {power_total}") print(f"*** SIVOC loss: {power_sivoc_loss_only}") print(f"*** Power before SIVOC loss: {power_with_sivoc_loss}") return power_with_sivoc_loss, power_sivoc_loss_only Loading Loading @@ -196,7 +175,7 @@ class PowerManager: """ self.sc_shape = config.get('SC_SHAPE') self.down_nodes = config.get('DOWN_NODES') globals().update(config) self.config = config self.power_func = power_func self.power_state = self.initialize_power_state() self.rectifier_loss = self.initialize_rectifier_loss() Loading @@ -211,35 +190,38 @@ class PowerManager: def get_peak_power(self): """Estimate peak power of system for setting max value of gauges in dashboard""" node_power = compute_node_power(CPUS_PER_NODE, GPUS_PER_NODE, net_util=0)[0] blades_per_rectifier = BLADES_PER_CHASSIS / RECTIFIERS_PER_CHASSIS rectifier_load = blades_per_rectifier * NODES_PER_BLADE * node_power rectifier_power = rectifier_loss(rectifier_load) # with AC-DC conversion losses chassis_power = BLADES_PER_CHASSIS * rectifier_power / blades_per_rectifier \ + SWITCHES_PER_CHASSIS * POWER_SWITCH rack_power = chassis_power * CHASSIS_PER_RACK total_power = rack_power * NUM_RACKS + POWER_CDU * NUM_CDUS node_power = compute_node_power(self.config['CPUS_PER_NODE'], self.config['GPUS_PER_NODE'], net_util=0)[0] blades_per_rectifier = self.config['BLADES_PER_CHASSIS'] / self.config['RECTIFIERS_PER_CHASSIS'] rectifier_load = blades_per_rectifier * self.config['NODES_PER_BLADE'] * node_power rectifier_power = compute_loss(rectifier_load, self.config['RECTIFIER_LOSS_CONSTANT'], \ self.config['RECTIFIER_EFFICIENCY']) # with AC-DC conversion losses chassis_power = self.config['BLADES_PER_CHASSIS'] * rectifier_power / blades_per_rectifier \ + self.config['SWITCHES_PER_CHASSIS'] * self.config['POWER_SWITCH'] rack_power = chassis_power * self.config['CHASSIS_PER_RACK'] total_power = rack_power * self.config['NUM_RACKS'] + self.config['POWER_CDU'] * self.config['NUM_CDUS'] return total_power def initialize_power_state(self): """Initialize the power state array with idle power consumption values.""" initial_power, _ = self.power_func(0, 0, 0) initial_power, _ = self.power_func(0, 0, 0, self.config) return np.full(self.sc_shape, initial_power) def initialize_sivoc_loss(self): """Initialize the Sivoc loss array with idle power consumption values.""" _, initial_sivoc_loss = self.power_func(0, 0, 0) _, initial_sivoc_loss = self.power_func(0, 0, 0, self.config) return np.full(self.sc_shape, initial_sivoc_loss) def initialize_rectifier_loss(self): """ Initialize the power state array """ initial_power, _ = self.power_func(0, 0, 0) initial_power, _ = self.power_func(0, 0, 0, self.config) # Rectifier loss curvefit is done at rectifier level, so we simply # approximate by scaling up to number of rectifiers, applying loss # and then dividing by number of rectifiers. # For Frontier there are four nodes per rectifier. power_with_loss = rectifier_loss(initial_power * NODES_PER_RECTIFIER) \ / NODES_PER_RECTIFIER power_with_loss = compute_loss(initial_power * self.config['NODES_PER_RECTIFIER'], \ self.config['RECTIFIER_LOSS_CONSTANT'], \ self.config['RECTIFIER_EFFICIENCY']) \ / self.config['NODES_PER_RECTIFIER'] return np.full(self.sc_shape, power_with_loss) def apply_down_nodes(self): Loading @@ -259,7 +241,7 @@ class PowerManager: """ node_indices = linear_to_3d_index(node_indices, self.sc_shape) self.power_state[node_indices], self.sivoc_loss[node_indices] \ = compute_node_power(0, 0, 0) = compute_node_power(0, 0, 0, self.config) def update_power_state(self, scheduled_nodes, cpu_util, gpu_util, net_util): """ Loading @@ -279,7 +261,7 @@ class PowerManager: Total power consumption of the scheduled nodes. """ node_indices = linear_to_3d_index(scheduled_nodes, self.sc_shape) power_value, sivoc_loss = self.power_func(cpu_util, gpu_util, net_util) power_value, sivoc_loss = self.power_func(cpu_util, gpu_util, net_util, self.config) self.power_state[node_indices] = power_value self.sivoc_loss[node_indices] = sivoc_loss return power_value * len(scheduled_nodes) Loading @@ -296,8 +278,8 @@ class PowerManager: int Number of rectifiers needed. """ value = int((power_state_summed - 1) // RECTIFIER_PEAK_THRESHOLD + 1) return min(value, RECTIFIERS_PER_CHASSIS) value = int((power_state_summed - 1) // self.config['RECTIFIER_PEAK_THRESHOLD'] + 1) return min(value, self.config['RECTIFIERS_PER_CHASSIS']) def compute_rack_power(self, smart_load_sharing=False): """ Loading @@ -311,12 +293,12 @@ class PowerManager: tuple Tuple containing rack power (kW) and rectifier losses (kW). """ shape = (self.sc_shape[0], self.sc_shape[1], CHASSIS_PER_RACK, -1) shape = (self.sc_shape[0], self.sc_shape[1], self.config['CHASSIS_PER_RACK'], -1) power_state_reshaped = np.reshape(self.power_state, shape) chassis_power = np.sum(power_state_reshaped, axis=-1) # Add in switch power chassis_power += SWITCHES_PER_CHASSIS * POWER_SWITCH chassis_power += self.config['SWITCHES_PER_CHASSIS'] * self.config['POWER_SWITCH'] # Divide the power by the number of rectifiers and apply losses per rectifier # Smart load sharing dynamically stages rectifers as needed, e.g., when Loading @@ -327,7 +309,7 @@ class PowerManager: num_rectifiers_array = vectorized_function(chassis_power) # Initialize the array to hold the divided powers, using NaN for unused elements rectifier_power = np.full((*chassis_power.shape, RECTIFIERS_PER_CHASSIS), np.nan) rectifier_power = np.full((*chassis_power.shape, self.config['RECTIFIERS_PER_CHASSIS']), np.nan) power_with_losses = np.copy(rectifier_power) # Chassis_power.shape for Frontier is (25, 3, 8) Loading @@ -345,7 +327,9 @@ class PowerManager: else: divisor = np.array([4, 4, 4, 4]).reshape(1, 1, 1, 4) rectifier_power = chassis_power[:, :, :, np.newaxis] / divisor power_with_losses = rectifier_loss(rectifier_power) power_with_losses = compute_loss(rectifier_power, \ self.config['RECTIFIER_LOSS_CONSTANT'], \ self.config['RECTIFIER_EFFICIENCY']) # Compute just the losses rect_losses = power_with_losses - rectifier_power Loading @@ -353,9 +337,9 @@ class PowerManager: # Sum to 75 racks summed_power_with_losses = np.sum(power_with_losses/1000, axis=(2, 3)) # Zero out power for missing racks for rack in MISSING_RACKS: cdu = rack // RACKS_PER_CDU rack2d = (cdu, rack % RACKS_PER_CDU) for rack in self.config['MISSING_RACKS']: cdu = rack // self.config['RACKS_PER_CDU'] rack2d = (cdu, rack % self.config['RACKS_PER_CDU']) summed_power_with_losses[rack2d] = 0 summed_rect_losses = np.sum(rect_losses/1000, axis=(2, 3)) Loading Loading @@ -400,7 +384,7 @@ class PowerManager: def get_power_df(self, rack_power, rack_loss): # Initialize the columns for power_df power_columns = POWER_DF_HEADER power_columns = self.config['POWER_DF_HEADER'] power_data = [] # Generate power_df Loading
raps/scheduler.py +22 −19 Original line number Diff line number Diff line Loading @@ -78,12 +78,11 @@ def get_utilization(trace, time_quanta_index): class Scheduler: """Job scheduler and simulation manager.""" def __init__(self, power_manager, flops_manager, layout_manager, cooling_model=None, **kwargs): config = kwargs.get('config') globals().update(config) self.down_nodes = summarize_ranges(DOWN_NODES) self.available_nodes = list(set(range(TOTAL_NODES)) - set(DOWN_NODES)) self.config = kwargs.get('config') self.down_nodes = summarize_ranges(self.config['DOWN_NODES']) self.available_nodes = list(set(range(self.config['TOTAL_NODES'])) - set(self.config['DOWN_NODES'])) self.num_free_nodes = len(self.available_nodes) self.num_active_nodes = TOTAL_NODES - self.num_free_nodes - len(DOWN_NODES) self.num_active_nodes = self.config['TOTAL_NODES'] - self.num_free_nodes - len(self.config['DOWN_NODES']) self.running = [] self.queue = [] self.jobs_completed = 0 Loading Loading @@ -177,11 +176,11 @@ class Scheduler: is not None and job.end_time <= self.current_time] # Simulate node failure newly_downed_nodes = self.node_failure(MTBF) newly_downed_nodes = self.node_failure(self.config['MTBF']) # Update active/free nodes self.num_free_nodes = len(self.available_nodes) self.num_active_nodes = TOTAL_NODES - self.num_free_nodes \ self.num_active_nodes = self.config['TOTAL_NODES'] - self.num_free_nodes \ - len(expand_ranges(self.down_nodes)) # Update running time for all running jobs Loading Loading @@ -209,7 +208,8 @@ class Scheduler: job.running_time = self.current_time - job.start_time time_quanta_index = (self.current_time - job.start_time) // TRACE_QUANTA time_quanta_index = (self.current_time - job.start_time) \ // self.config['TRACE_QUANTA'] cpu_util = get_utilization(job.cpu_trace, time_quanta_index) gpu_util = get_utilization(job.gpu_trace, time_quanta_index) Loading @@ -225,7 +225,7 @@ class Scheduler: job.power = self.power_manager.update_power_state(job.scheduled_nodes, cpu_util, gpu_util, net_util) if job.running_time % TRACE_QUANTA == 0: if job.running_time % self.config['TRACE_QUANTA'] == 0: job.power_history.append(job.power) for job in completed_jobs: Loading Loading @@ -269,7 +269,7 @@ class Scheduler: rack_loss = rect_losses + sivoc_losses # Update system utilization system_util = self.num_active_nodes / AVAILABLE_NODES * 100 system_util = self.num_active_nodes / self.config['AVAILABLE_NODES'] * 100 self.sys_util_history.append((self.current_time, system_util)) # Render the updated layout Loading @@ -277,8 +277,8 @@ class Scheduler: cooling_inputs, cooling_outputs = None, None # Update power history every 15s if self.current_time % POWER_UPDATE_FREQ == 0: total_power_kw = sum(row[-1] for row in rack_power) + NUM_CDUS * POWER_CDU / 1000.0 if self.current_time % self.config['POWER_UPDATE_FREQ'] == 0: total_power_kw = sum(row[-1] for row in rack_power) + self.config['NUM_CDUS'] * self.config['POWER_CDU'] / 1000.0 total_loss_kw = sum(row[-1] for row in rack_loss) self.power_manager.history.append((self.current_time, total_power_kw)) self.power_manager.loss_history.append((self.current_time, total_loss_kw)) Loading @@ -290,7 +290,7 @@ class Scheduler: if self.cooling_model: if self.current_time % FMU_UPDATE_FREQ == 0: if self.current_time % self.config['FMU_UPDATE_FREQ'] == 0: # Power for NUM_CDUS (25 for Frontier) cdu_power = rack_power.T[-1] * 1000 runtime_values = self.cooling_model.generate_runtime_values(cdu_power, self) Loading @@ -299,7 +299,7 @@ class Scheduler: fmu_inputs = self.cooling_model.generate_fmu_inputs(runtime_values, \ uncertainties=self.power_manager.uncertainties) cooling_inputs, cooling_outputs =\ self.cooling_model.step(self.current_time, fmu_inputs, FMU_UPDATE_FREQ) self.cooling_model.step(self.current_time, fmu_inputs, self.config['FMU_UPDATE_FREQ']) # Get a dataframe of the power data power_df = self.power_manager.get_power_df(rack_power, rack_loss) Loading @@ -310,7 +310,7 @@ class Scheduler: system_util, uncertainties=self.power_manager.uncertainties) self.layout_manager.update_pressflow_array(cooling_outputs) if self.current_time % UI_UPDATE_FREQ == 0: if self.current_time % self.config['UI_UPDATE_FREQ'] == 0: # Get a dataframe of the power data power_df = self.power_manager.get_power_df(rack_power, rack_loss) Loading Loading @@ -376,7 +376,7 @@ class Scheduler: print("stopping simulation at time", self.current_time) break if self.debug: if _ % UI_UPDATE_FREQ == 0: if _ % self.config['UI_UPDATE_FREQ'] == 0: print(".", end="", flush=True) def run_simulation_blocking(self, jobs, timesteps): Loading Loading @@ -408,7 +408,7 @@ class Scheduler: # From https://www.epa.gov/energy/greenhouse-gases-equivalencies-\ # calculator-calculations-and-references emissions = total_energy_consumed * 852.3 / 2204.6 / efficiency total_cost = total_energy_consumed * 1000 * POWER_COST # total cost in dollars total_cost = total_energy_consumed * 1000 * self.config['POWER_COST'] # total cost in dollars stats = { 'num_samples': num_samples, Loading @@ -435,10 +435,13 @@ class Scheduler: # Create a NumPy array of node indices, excluding down nodes down_nodes = expand_ranges(self.down_nodes) all_nodes = np.setdiff1d(np.arange(TOTAL_NODES), np.array(down_nodes, dtype=int)) all_nodes = np.setdiff1d(np.arange(self.config['TOTAL_NODES']), np.array(down_nodes, dtype=int)) # Sample the Weibull distribution for all nodes at once random_values = weibull_min.rvs(shape_parameter, scale=scale_parameter, size=all_nodes.size) random_values = weibull_min.rvs(shape_parameter, scale=scale_parameter, size=all_nodes.size) # Identify nodes that have failed failure_threshold = 0.1 Loading
raps/telemetry.py +5 −6 Original line number Diff line number Diff line Loading @@ -21,15 +21,13 @@ if __name__ == "__main__": parser.add_argument('-v', '--verbose', action='store_true', help='Enable verbose output') args = parser.parse_args() from .config import is_config_initialized, initialize_config if not is_config_initialized(): initialize_config(args.system) import importlib import numpy as np import re from datetime import datetime from tqdm import tqdm from .config import ConfigManager from .scheduler import Job from .plotting import plot_submit_times, plot_nodes_histogram from .utils import next_arrival Loading @@ -42,7 +40,6 @@ class Telemetry: self.kwargs = kwargs self.system = kwargs.get('system') config = kwargs.get('config') globals().update(config) def save_snapshot(self, jobs: list, filename: str): Loading Loading @@ -71,7 +68,9 @@ class Telemetry: if __name__ == "__main__": args_dict = vars(args) args_dict['config'] = ConfigManager(system_name=args.system).get_config() td = Telemetry(**args_dict) JOB_ARRIVAL_TIME = 900 if args.replay[0].endswith(".npz"): print(f"Loading {args.replay[0]}...") Loading @@ -79,7 +78,7 @@ if __name__ == "__main__": if args.reschedule: for job in tqdm(jobs, desc="Updating requested_nodes"): job['requested_nodes'] = None job['submit_time'] = next_arrival(1/JOB_ARRIVAL_TIME) job['submit_time'] = next_arrival(1/config['JOB_ARRIVAL_TIME']) else: jobs = td.load_data(args.replay) Loading