Commit ac22834f authored by Brewer, Wes
Browse files

Merge branch 'restructure-engine-scheduler' into 'main'

Restructure engine scheduler

See merge request !72
parents d2219d1d 086ce17b
Loading
Loading
Loading
Loading
+4 −2
Original line number Diff line number Diff line
import argparse
from raps.policy import PolicyType
from raps.schedulers.default import PolicyType

# Top-level command-line parser for the simulator.
parser = argparse.ArgumentParser(
    description='Resource Allocator & Power Simulator (RAPS)',
)
# -c/--cooling is a boolean switch: False unless the flag is given.
parser.add_argument(
    '-c', '--cooling',
    action='store_true',
    help='Include FMU cooling model',
)
@@ -28,8 +28,10 @@ choices = ['png', 'svg', 'jpg', 'pdf', 'eps']
# `choices` is rebound before each option that restricts its values; every
# such option defaults to the first entry of its list.
parser.add_argument('--imtype', type=str, choices=choices, default=choices[0], help='Plot image type')
parser.add_argument('--scale', type=int, default=0, help='Scale telemetry to max nodes specified in order to run telemetry on a smaller target system/partition, e.g., --scale 192')
parser.add_argument('--system', type=str, default='frontier', help='System config to use')
# Scheduler backend selection.
choices = ['default', 'nrel', 'anl', 'flux']
parser.add_argument('--scheduler', type=str, choices=choices, default=choices[0], help='Name of scheduler')
# Scheduling policy: the accepted values are the PolicyType enum values.
choices = [policy.value for policy in PolicyType]
# NOTE(review): `-s/--schedule` and `--policy` take the same choices, default
# and help text; `--schedule` appears to be the earlier spelling -- confirm
# whether it still needs to be accepted.
parser.add_argument('-s', '--schedule', type=str, choices=choices, default=choices[0], help='Schedule policy to use')
parser.add_argument('--policy', type=str, choices=choices, default=choices[0], help='Schedule policy to use')
choices = ['random', 'benchmark', 'peak', 'idle']
parser.add_argument('-w', '--workload', type=str, choices=choices, default=choices[0], help='Type of synthetic workload')
choices = ['layout1', 'layout2']
+4 −3
Original line number Diff line number Diff line
@@ -25,7 +25,8 @@ from raps.flops import FLOPSManager
from raps.plotting import Plotter
from raps.power import PowerManager, compute_node_power, compute_node_power_validate
from raps.power import compute_node_power_uncertainties, compute_node_power_validate_uncertainties
from raps.scheduler import Scheduler, Job
from raps.engine import Engine
from raps.job import Job
from raps.telemetry import Telemetry
from raps.workload import Workload
from raps.weather import Weather
@@ -61,13 +62,13 @@ else:
args_dict['config'] = config
flops_manager = FLOPSManager(**args_dict)

# Build the simulation engine from the power, FLOPS and cooling models; all
# remaining settings are forwarded from args_dict.
sc = Engine(
    power_manager=power_manager,
    flops_manager=flops_manager,
    cooling_model=cooling_model,
    **args_dict,
)
# The layout manager receives the engine through the `engine` keyword.
layout_manager = LayoutManager(args.layout, engine=sc, debug=args.debug, **config)

if args.replay:

+7 −7
Original line number Diff line number Diff line
@@ -8,9 +8,9 @@ import sys

from args import args
from raps.config import ConfigManager, CONFIG_PATH
from raps.policy import PolicyType
from raps.schedulers.default import PolicyType
from raps.ui import LayoutManager
from raps.scheduler import Scheduler
from raps.engine import Engine
from raps.flops import FLOPSManager
from raps.power import PowerManager, compute_node_power
from raps.telemetry import Telemetry
@@ -74,8 +74,8 @@ layout_managers = {}
for i, config in enumerate(configs):
    # Each system config gets its own power model, FLOPS model and engine;
    # args_dicts is indexed in parallel with configs.
    pm = PowerManager(compute_node_power, **config)
    fm = FLOPSManager(**args_dicts[i])
    sc = Engine(power_manager=pm, flops_manager=fm, cooling_model=None, **args_dicts[i])
    # Layout managers are keyed by system name so they can be reported on by name later.
    layout_managers[config['system_name']] = LayoutManager(args.layout, engine=sc, debug=args.debug, **config)

# Set simulation timesteps
if args.time:
@@ -96,9 +96,9 @@ for timestep in range(timesteps):
    if timestep % configs[0]['UI_UPDATE_FREQ'] == 0:  # Assuming same frequency for all partitions
        # Report per-partition status and accumulate the combined system power.
        sys_power = 0
        for name, lm in layout_managers.items():
            util_history = lm.engine.sys_util_history
            # History entries are indexed with [1] for the utilization value.
            # Before the first sample exists, report 0.0 directly instead of
            # indexing into a bare float (which raises TypeError).
            utilization = util_history[-1][1] if util_history else 0.0
            print(f"[DEBUG] {name} - Timestep {timestep} - Jobs running: {len(lm.engine.running)} - Utilization: {utilization:.2f}% - Power: {lm.engine.sys_power:.1f}kW")
            sys_power += lm.engine.sys_power
        print(f"system power: {sys_power:.1f}kW")

print("Simulation complete.")
+266 −0

File changed and moved.

Preview size limit exceeded, changes collapsed.

+26 −8
Original line number Diff line number Diff line
@@ -9,17 +9,35 @@ class FLOPSManager():
        self.flop_state = np.zeros(self.config['SC_SHAPE'])

    def update_flop_state(self, scheduled_nodes, cpu_util, gpu_util):
        """Write per-node FLOPS for the given jobs into ``self.flop_state``.

        Args:
            scheduled_nodes: Sequence with one entry per job; each entry is
                the sequence of linear node indices assigned to that job.
            cpu_util: One value per job. When ``self.validate`` is set this
                is node power in watts rather than a CPU utilization.
            gpu_util: One value per job; not used when ``self.validate`` is
                set.

        Each job's value is repeated once per node of that job, so every
        node of a job receives the same FLOPS figure.
        """
        # np.concatenate rejects an empty sequence; with no jobs there is
        # nothing to update. len() is used so array inputs work as well.
        if len(scheduled_nodes) == 0:
            return

        cpu_util = np.asarray(cpu_util)
        gpu_util = np.asarray(gpu_util)

        # Expand the per-job values to per-node values, in the same order as
        # the flattened node list.
        job_lengths = np.array([len(job) for job in scheduled_nodes])
        flattened_nodes = np.concatenate(scheduled_nodes, axis=0)
        cpu_util_flat = np.repeat(cpu_util, job_lengths)
        gpu_util_flat = np.repeat(gpu_util, job_lengths)

        node_indices = linear_to_3d_index(flattened_nodes, self.config['SC_SHAPE'])

        if self.validate:   # cpu_util is in fact node_Watts in this case
            # Scale the weighted peak FLOPS by measured node power divided by
            # the sum of the configured component power maxima.
            total_peak = (
                self.config['CPU_FP_RATIO'] * self.config['CPU_PEAK_FLOPS'] +
                self.config['GPU_FP_RATIO'] * self.config['GPU_PEAK_FLOPS']
            )
            denominator = (
                self.config['POWER_CPU_MAX'] * self.config['CPUS_PER_NODE'] +
                self.config['POWER_GPU_MAX'] * self.config['GPUS_PER_NODE'] +
                self.config['POWER_NIC'] * self.config['NICS_PER_NODE'] +
                self.config['POWER_NVME']
            )
            self.flop_state[node_indices] = total_peak * (cpu_util_flat / denominator)
        else:
            # Utilization-weighted sum of the CPU and GPU peak FLOPS.
            self.flop_state[node_indices] = (
                self.config['CPU_FP_RATIO'] * cpu_util_flat * self.config['CPU_PEAK_FLOPS'] +
                self.config['GPU_FP_RATIO'] * gpu_util_flat * self.config['GPU_PEAK_FLOPS']
            )

    def get_rpeak(self):
        node_peak_flops = self.config['CPUS_PER_NODE'] * self.config['CPU_PEAK_FLOPS'] \
                        + self.config['GPUS_PER_NODE'] * self.config['GPU_PEAK_FLOPS']
Loading