Commit 19f0189c authored by Brewer, Wes's avatar Brewer, Wes
Browse files

Initial attempts to support pluggable schedulers specified by --scheduler

parent d2219d1d
Loading
Loading
Loading
Loading
+4 −2
Original line number Diff line number Diff line
import argparse
from raps.policy import PolicyType
from raps.schedulers.default import PolicyType

parser = argparse.ArgumentParser(description='Resource Allocator & Power Simulator (RAPS)')
parser.add_argument('-c', '--cooling', action='store_true', help='Include FMU cooling model')
@@ -28,8 +28,10 @@ choices = ['png', 'svg', 'jpg', 'pdf', 'eps']
parser.add_argument('--imtype', type=str, choices=choices, default=choices[0], help='Plot image type')
parser.add_argument('--scale', type=int, default=0, help='Scale telemetry to max nodes specified in order to run telemetry on a smaller smaller target system/partition, e.g., --scale 192')
parser.add_argument('--system', type=str, default='frontier', help='System config to use')
choices = ['default', 'nrel', 'anl', 'flux']
parser.add_argument('--scheduler', type=str, choices=choices, default=choices[0], help='Name of scheduler')
choices = [policy.value for policy in PolicyType]
parser.add_argument('-s', '--schedule', type=str, choices=choices, default=choices[0], help='Schedule policy to use')
parser.add_argument('--policy', type=str, choices=choices, default=choices[0], help='Schedule policy to use')
choices = ['random', 'benchmark', 'peak', 'idle']
parser.add_argument('-w', '--workload', type=str, choices=choices, default=choices[0], help='Type of synthetic workload')
choices = ['layout1', 'layout2']
+4 −3
Original line number Diff line number Diff line
@@ -25,7 +25,8 @@ from raps.flops import FLOPSManager
from raps.plotting import Plotter
from raps.power import PowerManager, compute_node_power, compute_node_power_validate
from raps.power import compute_node_power_uncertainties, compute_node_power_validate_uncertainties
from raps.scheduler import Scheduler, Job
from raps.engine import Engine
from raps.job import Job
from raps.telemetry import Telemetry
from raps.workload import Workload
from raps.weather import Weather
@@ -61,13 +62,13 @@ else:
args_dict['config'] = config
flops_manager = FLOPSManager(**args_dict)

sc = Scheduler(
sc = Engine(
    power_manager=power_manager,
    flops_manager=flops_manager,
    cooling_model=cooling_model,
    **args_dict,
)
layout_manager = LayoutManager(args.layout, scheduler=sc, debug=args.debug, **config)
layout_manager = LayoutManager(args.layout, engine=sc, debug=args.debug, **config)

if args.replay:

+7 −7
Original line number Diff line number Diff line
@@ -8,9 +8,9 @@ import sys

from args import args
from raps.config import ConfigManager, CONFIG_PATH
from raps.policy import PolicyType
from raps.schedulers.default import PolicyType
from raps.ui import LayoutManager
from raps.scheduler import Scheduler
from raps.engine import Engine
from raps.flops import FLOPSManager
from raps.power import PowerManager, compute_node_power
from raps.telemetry import Telemetry
@@ -74,8 +74,8 @@ layout_managers = {}
for i, config in enumerate(configs):
    pm = PowerManager(compute_node_power, **configs[i])
    fm = FLOPSManager(**args_dicts[i])
    sc = Scheduler(power_manager=pm, flops_manager=fm, cooling_model=None, **args_dicts[i])
    layout_managers[config['system_name']] = LayoutManager(args.layout, scheduler=sc, debug=args.debug, **config)
    sc = Engine(power_manager=pm, flops_manager=fm, cooling_model=None, **args_dicts[i])
    layout_managers[config['system_name']] = LayoutManager(args.layout, engine=sc, debug=args.debug, **config)

# Set simulation timesteps
if args.time:
@@ -96,9 +96,9 @@ for timestep in range(timesteps):
    if timestep % configs[0]['UI_UPDATE_FREQ'] == 0:  # Assuming same frequency for all partitions
        sys_power = 0
        for name, lm in layout_managers.items():
            sys_util = lm.scheduler.sys_util_history[-1] if lm.scheduler.sys_util_history else 0.0
            print(f"[DEBUG] {name} - Timestep {timestep} - Jobs running: {len(lm.scheduler.running)} - Utilization: {sys_util[1]:.2f}% - Power: {lm.scheduler.sys_power:.1f}kW")
            sys_power += lm.scheduler.sys_power
            sys_util = lm.engine.sys_util_history[-1] if lm.engine.sys_util_history else 0.0
            print(f"[DEBUG] {name} - Timestep {timestep} - Jobs running: {len(lm.engine.running)} - Utilization: {sys_util[1]:.2f}% - Power: {lm.engine.sys_power:.1f}kW")
            sys_power += lm.engine.sys_power
        print(f"system power: {sys_power:.1f}kW")

print("Simulation complete.")
+272 −0

File changed and moved.

Preview size limit exceeded, changes collapsed.

raps/policy.py

deleted100644 → 0
+0 −62
Original line number Diff line number Diff line
from enum import Enum

class PolicyType(Enum):
    """Names of the scheduling policies accepted by `Policy`.

    Each member's value is the lowercase string used to select the policy
    (e.g. ``PolicyType('fcfs')``).
    """
    FCFS = 'fcfs'
    BACKFILL = 'backfill'
    PRIORITY = 'priority'
    SJF = 'sjf'
    # DEADLINE = 'deadline'  # not yet supported


class Policy:
    """Job ordering and backfill selection for a given scheduling policy."""

    def __init__(self, strategy):
        """Store the policy to apply.

        Args:
            strategy: A `PolicyType` member or one of its string values.

        Raises:
            ValueError: If `strategy` is not a valid `PolicyType` value
                (raised by the `PolicyType` lookup).
        """
        self.strategy = PolicyType(strategy)

    def sort_jobs(self, jobs):
        """Return a new list of `jobs` ordered according to the policy.

        FCFS and BACKFILL order by ascending ``submit_time``, SJF by
        ascending ``wall_time``, and PRIORITY by descending ``priority``.
        The input iterable is not modified.

        Raises:
            ValueError: If `self.strategy` is not a handled policy.
        """
        if self.strategy in (PolicyType.FCFS, PolicyType.BACKFILL):
            return sorted(jobs, key=lambda job: job.submit_time)
        if self.strategy == PolicyType.SJF:
            return sorted(jobs, key=lambda job: job.wall_time)
        if self.strategy == PolicyType.PRIORITY:
            return sorted(jobs, key=lambda job: job.priority, reverse=True)
        # Report the attribute that actually exists on this object; the
        # previous message referenced a non-existent `policy_type` attribute
        # and therefore raised AttributeError instead of ValueError.
        raise ValueError(f"Unknown policy type: {self.strategy}")

    def find_backfill_job(self, queue, num_free_nodes, current_time):
        """Pick the first job in `queue` that may be backfilled, if any.

        This implementation is based on pseudocode from Leonenkov and Zhumatiy.
        "Introducing new backfill-based scheduler for slurm resource manager."
        Procedia computer science 66 (2015): 661-669.

        Args:
            queue: Sequence of jobs exposing ``wall_time`` and
                ``nodes_required``; the first entry is treated as the job
                whose start must not be delayed.
            num_free_nodes: Number of nodes currently available.
            current_time: Current simulation time, in the same units as
                ``wall_time``.

        Returns:
            The first job in queue order that satisfies one of the backfill
            conditions, or ``None`` if the queue is empty or no job qualifies.

        Side effects:
            Sets ``end_time = current_time + wall_time`` on every job in
            `queue`.
        """
        # Nothing to backfill; avoids IndexError on queue[0].
        if not queue:
            return None

        first_job = queue[0]

        for job in queue:
            job.end_time = current_time + job.wall_time

        # Sort jobs according to their termination time (end_time)
        sorted_queue = sorted(queue, key=lambda job: job.end_time)

        # Compute shadow time - loop over the list and collect nodes until the
        # number of available nodes is sufficient for the first job in the queue
        # NOTE(review): the extra-node count subtracts the nodes of the job at
        # which the threshold was crossed, not those of first_job; kept as-is
        # to preserve scheduling behaviour - confirm against the reference.
        sum_nodes = 0
        shadow_time = None
        num_extra_nodes = 0
        for job in sorted_queue:
            sum_nodes += job.nodes_required
            if sum_nodes >= first_job.nodes_required:
                shadow_time = current_time + job.wall_time
                num_extra_nodes = sum_nodes - job.nodes_required
                break

        # Defensive: without a shadow time the conditions below are undefined.
        if shadow_time is None:
            return None

        # Find backfill job
        for job in queue:
            # condition1 checks that the job ends before first_job starts
            condition1 = (job.nodes_required <= num_free_nodes
                          and current_time + job.wall_time < shadow_time)
            # condition2 checks that the job does not interfere with first_job
            condition2 = job.nodes_required <= min(num_free_nodes, num_extra_nodes)

            if condition1 or condition2:
                return job

        return None
Loading