Commit de5c1c00 authored by Brewer, Wes's avatar Brewer, Wes
Browse files

Merge branch 'reschedule-on-submit-time' into 'main'

Reschedule on submit time

See merge request !70
parents 4ff726a2 37c4d5b9
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -70,7 +70,7 @@ This will simulate synthetic workloads on two partitions as defined in `config/s

This will dump a .npz file with a randomized name, e.g. ac23db.npz. Let's rename this file to pm100.npz for clarity. Note: you can press Ctrl-C once the simulation starts. Now, this pm100.npz file can be used with `multi-part-sim.py` as follows:

    python multi-part-sim.py -x setonix/* -f pm100.npz --reschedule --scale 192
    python multi-part-sim.py -x setonix/* -f pm100.npz --reschedule poisson --scale 192

The `--reschedule` flag will use the internal scheduler to determine which nodes to assign to each job, and the `--scale` flag will specify the maximum number of nodes for each job (generally, set this to the maximum number of nodes of the smallest partition).

+2 −1
Original line number Diff line number Diff line
@@ -14,7 +14,8 @@ parser.add_argument('-v', '--verbose', action='store_true', help='Enable verbose
parser.add_argument('--seed', action='store_true', help='Set random number seed for deterministic simulation')
parser.add_argument('-f', '--replay', nargs='+', type=str, help='Either: path/to/joblive path/to/jobprofile' + \
                                                                ' -or- filename.npz (overrides --workload option)')
parser.add_argument('--reschedule', action='store_true', help='Reschedule the telemetry workload')
choices = ['false','poisson', 'submit-time']
parser.add_argument('--reschedule', type=str, choices=choices, default=choices[0], help='Reschedule the telemetry workload')
parser.add_argument('-u', '--uncertainties', action='store_true',
                    help='Change from floating point units to floating point units with uncertainties.' + \
                                                                ' Very expensive w.r.t simulation time!')
+5 −2
Original line number Diff line number Diff line
@@ -95,11 +95,14 @@ if args.replay:
                job['nodes_required'] = random.randint(1, args.scale)
                job['requested_nodes'] = None # Setting to None triggers scheduler to assign nodes

        if args.reschedule:
        if args.reschedule == 'poisson':
            print("available nodes:", config['AVAILABLE_NODES'])
            for job in tqdm(jobs, desc="Rescheduling jobs"):
                job['requested_nodes'] = None
                job['submit_time'] = next_arrival(1 / config['JOB_ARRIVAL_TIME'])
        elif args.reschedule == 'submit-time':
            raise NotImplementedError


    else:  # custom data loader
        print(*args.replay)
+6 −4
Original line number Diff line number Diff line
@@ -49,12 +49,14 @@ if args.replay:
            job['nodes_required'] = random.randint(1, args.scale)
            job['requested_nodes'] = None # Setting to None triggers scheduler to assign nodes

    if args.reschedule:
    if args.reschedule == 'poisson':
        for job in tqdm(jobs, desc="Rescheduling jobs"):
            partition = job['partition']
            partition_config = configs[partition_names.index(partition)]
            job['requested_nodes'] = None
            job['submit_time'] = next_arrival(1 / partition_config['JOB_ARRIVAL_TIME'])
    elif args.reschedule == 'submit-time':
        raise NotImplementedError

else:  # Synthetic workload
    wl = Workload(*configs)
+19 −16
Original line number Diff line number Diff line
@@ -8,7 +8,7 @@
    python main.py -f /path/to/AdastaJobsMI250_15days.parquet --system adastra

    # to reschedule
    python main.py -f /path/to/AdastaJobsMI250_15days.parquet --system adastra --reschedule
    python main.py -f /path/to/AdastaJobsMI250_15days.parquet --system adastra --reschedule poisson

    # to fast-forward 60 days and replay for 1 day
    python main.py -f /path/to/AdastaJobsMI250_15days.parquet --system adastra -ff 60d -t 1d
@@ -154,11 +154,14 @@ def load_data_from_df(jobs_df: pd.DataFrame, **kwargs):
            # When extracting out a single job, run one iteration past the end of the job
            time_offset = config['UI_UPDATE_FREQ']

        if fastforward: time_offset -= fastforward
        if fastforward:
            time_offset -= fastforward

        if reschedule: # Let the scheduler reschedule the jobs
        if reschedule == 'poisson':  # Let the scheduler reschedule the jobs
            scheduled_nodes = None
            time_offset = next_arrival(1/config['JOB_ARRIVAL_TIME'])
        elif reschedule == 'submit-time':
            raise NotImplementedError
        else:  # Prescribed replay
            scheduled_nodes = (jobs_df.loc[jidx, 'nodes']).tolist()

Loading