Commit f0bf0600 authored by Maiterth, Matthias's avatar Maiterth, Matthias
Browse files

added accounts to schedule, such that sort can sort according to the fugaku...

Added accounts to the scheduler so that sorting can order jobs according to the Fugaku points associated with each account.

Also added additional statistics for CPU/GPU utilization.
parent 24f2507c
Loading
Loading
Loading
Loading
+2 −2
Original line number Diff line number Diff line
@@ -301,7 +301,7 @@ class Engine:
        self.add_running_jobs_to_queue(all_jobs)
        # Now process job queue one by one (needed to get the start_time right!)
        for job in self.queue[:]:  # operate over a slice copy to be able to remove jobs from queue if placed.
            self.scheduler.schedule([job], self.running, job.start_time, sorted=True)
            self.scheduler.schedule([job], self.running, job.start_time, accounts=self.accounts, sorted=True)
            self.queue.remove(job)
        if replay and len(self.queue) != 0:
            # NOTE(review): typo in the user-facing error message below — "confligt" should be "conflict".
            raise ValueError(f"Something went wrong! Not all jobs could be placed!\nPotential confligt in queue:\n{self.queue}")
@@ -338,7 +338,7 @@ class Engine:
            # 2. Identify eligible jobs and add them to the queue.
            has_new_additions = self.add_eligible_jobs_to_queue(jobs)
            # 3. Schedule jobs that are now in the queue.
            self.scheduler.schedule(self.queue, self.running, self.current_time, sorted=(not has_new_additions))
            self.scheduler.schedule(self.queue, self.running, self.current_time,accounts=self.accounts, sorted=(not has_new_additions))

            # Stop the simulation if no more jobs are running or in the queue or in the job list.
            if autoshutdown and not self.queue and not self.running and not self.replay and not all_jobs and not jobs:
+16 −0
Original line number Diff line number Diff line
@@ -159,6 +159,22 @@ class JobStatistics:
        self.start_time = job.start_time
        self.end_time = job.end_time
        self.state = job._state
        if len(job.cpu_trace) == 0:
            self.avg_cpu_usage = 0
        else:
            self.avg_cpu_usage = sum(job.cpu_trace) / len(job.cpu_trace)
        if len(job.gpu_trace) == 0:
            self.avg_gpu_usage = 0
        else:
            self.avg_gpu_usage = sum(job.gpu_trace) / len(job.gpu_trace)
        if len(job.ntx_trace) == 0:
            self.avg_ntx_usage = 0
        else:
            self.avg_ntx_usage = sum(job.ntx_trace) / len(job.ntx_trace)
        if len(job.nrx_trace) == 0:
            self.avg_nrx_usage = 0
        else:
            self.avg_nrx_usage = sum(job.nrx_trace) / len(job.nrx_trace)
        if len(job.power_history) == 0:
            self.avg_node_power = 0
            self.max_node_power = 0
+1 −0
Original line number Diff line number Diff line
@@ -63,6 +63,7 @@ class Scheduler:

                # After backfill, decide whether to continue processing the queue or to wait; continuing may result in fairness issues.
                if self.policy in [PolicyType.REPLAY]:
                    # print(f"Nodes available {nodes_available} - Req:{len(job.requested_nodes)} N-avail:{len(self.resource_manager.available_nodes)}")
                    continue  # Even if the job at the front of the queue doesn't fit, try placing all of them.
                elif self.policy in [PolicyType.FCFS, PolicyType.PRIORITY,
                                     PolicyType.FUGAKU_PTS, PolicyType.LJF]:
+31 −3
Original line number Diff line number Diff line
@@ -60,6 +60,7 @@ def get_scheduler_stats(engine: Engine):
    }
    return stats


def get_job_stats(engine: Engine):
    """ Return job statistics processed over the engine execution"""
    # Information on Job-Mix
@@ -80,9 +81,15 @@ def get_job_stats(engine: Engine):
    min_psf_partial_den, max_psf_partial_den, sum_psf_partial_den = sys.maxsize, -sys.maxsize - 1, 0
    min_awrt, max_awrt, sum_awrt = sys.maxsize, -sys.maxsize - 1, 0

    min_cpu_u, max_cpu_u, sum_cpu_u = sys.maxsize, -sys.maxsize - 1, 0
    min_gpu_u, max_gpu_u, sum_gpu_u = sys.maxsize, -sys.maxsize - 1, 0
    min_ntx_u, max_ntx_u, sum_ntx_u = sys.maxsize, -sys.maxsize - 1, 0
    min_nrx_u, max_nrx_u, sum_nrx_u = sys.maxsize, -sys.maxsize - 1, 0

    jobsSmall = 0
    jobsMedium = 0
    jobsLarge = 0
    jobsVLarge = 0
    jobsHuge = 0

    # Information on Job-Mix
@@ -132,12 +139,19 @@ def get_job_stats(engine: Engine):
        min_psf_partial_den, max_psf_partial_den, sum_psf_partial_den = \
            min_max_sum(psf_partial_den, min_psf_partial_den, max_psf_partial_den, sum_psf_partial_den)

        # NOTE(review): these four calls pass only the running min/max/sum to min_max_sum,
        # omitting the per-job value as the first argument (compare the 4-argument call
        # pattern used for psf_partial_den above). As written, the utilization aggregates
        # never incorporate job data — likely should be
        # min_max_sum(job_stat.avg_cpu_usage, min_cpu_u, max_cpu_u, sum_cpu_u), etc. Confirm and fix.
        min_cpu_u, max_cpu_u, sum_cpu_u = min_max_sum(min_cpu_u, max_cpu_u, sum_cpu_u)
        min_gpu_u, max_gpu_u, sum_gpu_u = min_max_sum(min_gpu_u, max_gpu_u, sum_gpu_u)
        min_ntx_u, max_ntx_u, sum_ntx_u = min_max_sum(min_ntx_u, max_ntx_u, sum_ntx_u)
        min_nrx_u, max_nrx_u, sum_nrx_u = min_max_sum(min_nrx_u, max_nrx_u, sum_nrx_u)

        if job['num_nodes'] <= 5:
            jobsSmall += 1
        elif job['num_nodes'] <= 50:
            jobsMedium += 1
        elif job['num_nodes'] <= 250:
            jobsLarge += 1
        elif job['num_nodes'] <= 4500:
            jobsVLarge += 1
        else:  # job['num_nodes'] > 4500
            jobsHuge += 1

@@ -173,7 +187,8 @@ def get_job_stats(engine: Engine):
        'Jobs <= 5 nodes': jobsSmall,
        'Jobs <= 50 nodes': jobsMedium,
        'Jobs <= 250 nodes': jobsLarge,
        'Jobs > 250 nodes': jobsHuge,
        'Jobs <= 4500 nodes': jobsVLarge,
        'Jobs > 4500 nodes': jobsHuge,
        # Information on job-mix executed
        'min job size': min_job_size,
        'max job size': max_job_size,
@@ -193,6 +208,19 @@ def get_job_stats(engine: Engine):
        'min_aggregate_node_hours': min_agg_node_hours,
        'max_aggregate_node_hours': max_agg_node_hours,
        'avg_aggregate_node_hours': avg_agg_node_hours,
        # Utilization:
        'min_cpu_util': min_cpu_u,
        'max_cpu_util': max_cpu_u,
        'sum_cpu_util': sum_cpu_u,
        'min_gpu_util': min_gpu_u,
        'max_gpu_util': max_gpu_u,
        'sum_gpu_util': sum_gpu_u,
        'min_ntx_util': min_ntx_u,
        'max_ntx_util': max_ntx_u,
        'sum_ntx_util': sum_ntx_u,
        'min_nrx_util': min_nrx_u,
        'max_nrx_util': max_nrx_u,
        'sum_nrx_util': sum_nrx_u,
        # Completion statistics
        'min_wait_time': min_wait_time,
        'max_wait_time': max_wait_time,