Loading raps/engine.py +2 −2 Original line number Diff line number Diff line Loading @@ -301,7 +301,7 @@ class Engine: self.add_running_jobs_to_queue(all_jobs) # Now process job queue one by one (needed to get the start_time right!) for job in self.queue[:]: # operate over a slice copy to be able to remove jobs from queue if placed. self.scheduler.schedule([job], self.running, job.start_time, sorted=True) self.scheduler.schedule([job], self.running, job.start_time, accounts=self.accounts, sorted=True) self.queue.remove(job) if replay and len(self.queue) != 0: raise ValueError(f"Something went wrong! Not all jobs could be placed!\nPotential confligt in queue:\n{self.queue}") Loading Loading @@ -338,7 +338,7 @@ class Engine: # 2. Identify eligible jobs and add them to the queue. has_new_additions = self.add_eligible_jobs_to_queue(jobs) # 3. Schedule jobs that are now in the queue. self.scheduler.schedule(self.queue, self.running, self.current_time, sorted=(not has_new_additions)) self.scheduler.schedule(self.queue, self.running, self.current_time,accounts=self.accounts, sorted=(not has_new_additions)) # Stop the simulation if no more jobs are running or in the queue or in the job list. if autoshutdown and not self.queue and not self.running and not self.replay and not all_jobs and not jobs: Loading raps/job.py +16 −0 Original line number Diff line number Diff line Loading @@ -159,6 +159,22 @@ class JobStatistics: self.start_time = job.start_time self.end_time = job.end_time self.state = job._state if len(job.cpu_trace) == 0: self.avg_cpu_usage = 0 else: self.avg_cpu_usage = sum(job.cpu_trace) / len(job.cpu_trace) if len(job.gpu_trace) == 0: self.avg_gpu_usage = 0 else: self.avg_gpu_usage = sum(job.gpu_trace) / len(job.gpu_trace) if len(job.ntx_trace) == 0: self.avg_ntx_usage = 0 else: self.avg_ntx_usage = sum(job.ntx_trace) / len(job.ntx_trace) if len(job.nrx_trace) == 0: self.avg_nrx_usage = 0 else: self.avg_nrx_usage = sum(job.nrx_trace) / len(job.nrx_trace) if len(job.power_history) == 0: self.avg_node_power = 0 self.max_node_power = 0 Loading raps/schedulers/default.py +1 −0 Original line number Diff line number Diff line Loading @@ -63,6 +63,7 @@ class Scheduler: # After backfill dedice continue processing the queue or wait, continuing may result in fairness issues. if self.policy in [PolicyType.REPLAY]: # print(f"Nodes available {nodes_available} - Req:{len(job.requested_nodes)} N-avail:{len(self.resource_manager.available_nodes)}") continue # Regardless if the job at the front of the queue doenst fit, try placing all of them. elif self.policy in [PolicyType.FCFS, PolicyType.PRIORITY, PolicyType.FUGAKU_PTS, PolicyType.LJF]: Loading raps/stats.py +31 −3 Original line number Diff line number Diff line Loading @@ -60,6 +60,7 @@ def get_scheduler_stats(engine: Engine): } return stats def get_job_stats(engine: Engine): """ Return job statistics processed over the engine execution""" # Information on Job-Mix Loading @@ -80,9 +81,15 @@ def get_job_stats(engine: Engine): min_psf_partial_den, max_psf_partial_den, sum_psf_partial_den = sys.maxsize, -sys.maxsize - 1, 0 min_awrt, max_awrt, sum_awrt = sys.maxsize, -sys.maxsize - 1, 0 min_cpu_u, max_cpu_u, sum_cpu_u = sys.maxsize, -sys.maxsize - 1, 0 min_gpu_u, max_gpu_u, sum_gpu_u = sys.maxsize, -sys.maxsize - 1, 0 min_ntx_u, max_ntx_u, sum_ntx_u = sys.maxsize, -sys.maxsize - 1, 0 min_nrx_u, max_nrx_u, sum_nrx_u = sys.maxsize, -sys.maxsize - 1, 0 jobsSmall = 0 jobsMedium = 0 jobsLarge = 0 jobsVLarge = 0 jobsHuge = 0 # Information on Job-Mix Loading Loading @@ -132,12 +139,19 @@ def get_job_stats(engine: Engine): min_psf_partial_den, max_psf_partial_den, sum_psf_partial_den = \ min_max_sum(psf_partial_den, min_psf_partial_den, max_psf_partial_den, sum_psf_partial_den) min_cpu_u, max_cpu_u, sum_cpu_u = min_max_sum(min_cpu_u, max_cpu_u, sum_cpu_u) min_gpu_u, max_gpu_u, sum_gpu_u = min_max_sum(min_gpu_u, max_gpu_u, sum_gpu_u) min_ntx_u, max_ntx_u, sum_ntx_u = min_max_sum(min_ntx_u, max_ntx_u, sum_ntx_u) min_nrx_u, max_nrx_u, sum_nrx_u = min_max_sum(min_nrx_u, max_nrx_u, sum_nrx_u) if job['num_nodes'] <= 5: jobsSmall += 1 elif job['num_nodes'] <= 50: jobsMedium += 1 elif job['num_nodes'] <= 250: jobsLarge += 1 elif job['num_nodes'] <= 4500: jobsVLarge += 1 else: # job['nodes_required'] > 250: jobsHuge += 1 Loading Loading @@ -173,7 +187,8 @@ def get_job_stats(engine: Engine): 'Jobs <= 5 nodes': jobsSmall, 'Jobs <= 50 nodes': jobsMedium, 'Jobs <= 250 nodes': jobsLarge, 'Jobs > 250 nodes': jobsHuge, 'Jobs <= 4500 nodes': jobsVLarge, 'Jobs > 4500 nodes': jobsHuge, # Information on job-mix executed 'min job size': min_job_size, 'max job size': max_job_size, Loading @@ -193,6 +208,19 @@ def get_job_stats(engine: Engine): 'min_aggregate_node_hours': min_agg_node_hours, 'max_aggregate_node_hours': max_agg_node_hours, 'avg_aggregate_node_hours': avg_agg_node_hours, # Utilization: 'min_cpu_util': min_cpu_u, 'max_cpu_util': max_cpu_u, 'sum_cpu_util': sum_cpu_u, 'min_gpu_util': min_gpu_u, 'max_gpu_util': max_gpu_u, 'sum_gpu_util': sum_gpu_u, 'min_ntx_util': min_ntx_u, 'max_ntx_util': max_ntx_u, 'sum_ntx_util': sum_ntx_u, 'min_nrx_util': min_nrx_u, 'max_nrx_util': max_nrx_u, 'sum_nrx_util': sum_nrx_u, # Completion statistics 'min_wait_time': min_wait_time, 'max_wait_time': max_wait_time, Loading Loading
raps/engine.py +2 −2 Original line number Diff line number Diff line Loading @@ -301,7 +301,7 @@ class Engine: self.add_running_jobs_to_queue(all_jobs) # Now process job queue one by one (needed to get the start_time right!) for job in self.queue[:]: # operate over a slice copy to be able to remove jobs from queue if placed. self.scheduler.schedule([job], self.running, job.start_time, sorted=True) self.scheduler.schedule([job], self.running, job.start_time, accounts=self.accounts, sorted=True) self.queue.remove(job) if replay and len(self.queue) != 0: raise ValueError(f"Something went wrong! Not all jobs could be placed!\nPotential confligt in queue:\n{self.queue}") Loading Loading @@ -338,7 +338,7 @@ class Engine: # 2. Identify eligible jobs and add them to the queue. has_new_additions = self.add_eligible_jobs_to_queue(jobs) # 3. Schedule jobs that are now in the queue. self.scheduler.schedule(self.queue, self.running, self.current_time, sorted=(not has_new_additions)) self.scheduler.schedule(self.queue, self.running, self.current_time,accounts=self.accounts, sorted=(not has_new_additions)) # Stop the simulation if no more jobs are running or in the queue or in the job list. if autoshutdown and not self.queue and not self.running and not self.replay and not all_jobs and not jobs: Loading
raps/job.py +16 −0 Original line number Diff line number Diff line Loading @@ -159,6 +159,22 @@ class JobStatistics: self.start_time = job.start_time self.end_time = job.end_time self.state = job._state if len(job.cpu_trace) == 0: self.avg_cpu_usage = 0 else: self.avg_cpu_usage = sum(job.cpu_trace) / len(job.cpu_trace) if len(job.gpu_trace) == 0: self.avg_gpu_usage = 0 else: self.avg_gpu_usage = sum(job.gpu_trace) / len(job.gpu_trace) if len(job.ntx_trace) == 0: self.avg_ntx_usage = 0 else: self.avg_ntx_usage = sum(job.ntx_trace) / len(job.ntx_trace) if len(job.nrx_trace) == 0: self.avg_nrx_usage = 0 else: self.avg_nrx_usage = sum(job.nrx_trace) / len(job.nrx_trace) if len(job.power_history) == 0: self.avg_node_power = 0 self.max_node_power = 0 Loading
raps/schedulers/default.py +1 −0 Original line number Diff line number Diff line Loading @@ -63,6 +63,7 @@ class Scheduler: # After backfill dedice continue processing the queue or wait, continuing may result in fairness issues. if self.policy in [PolicyType.REPLAY]: # print(f"Nodes available {nodes_available} - Req:{len(job.requested_nodes)} N-avail:{len(self.resource_manager.available_nodes)}") continue # Regardless if the job at the front of the queue doenst fit, try placing all of them. elif self.policy in [PolicyType.FCFS, PolicyType.PRIORITY, PolicyType.FUGAKU_PTS, PolicyType.LJF]: Loading
raps/stats.py +31 −3 Original line number Diff line number Diff line Loading @@ -60,6 +60,7 @@ def get_scheduler_stats(engine: Engine): } return stats def get_job_stats(engine: Engine): """ Return job statistics processed over the engine execution""" # Information on Job-Mix Loading @@ -80,9 +81,15 @@ def get_job_stats(engine: Engine): min_psf_partial_den, max_psf_partial_den, sum_psf_partial_den = sys.maxsize, -sys.maxsize - 1, 0 min_awrt, max_awrt, sum_awrt = sys.maxsize, -sys.maxsize - 1, 0 min_cpu_u, max_cpu_u, sum_cpu_u = sys.maxsize, -sys.maxsize - 1, 0 min_gpu_u, max_gpu_u, sum_gpu_u = sys.maxsize, -sys.maxsize - 1, 0 min_ntx_u, max_ntx_u, sum_ntx_u = sys.maxsize, -sys.maxsize - 1, 0 min_nrx_u, max_nrx_u, sum_nrx_u = sys.maxsize, -sys.maxsize - 1, 0 jobsSmall = 0 jobsMedium = 0 jobsLarge = 0 jobsVLarge = 0 jobsHuge = 0 # Information on Job-Mix Loading Loading @@ -132,12 +139,19 @@ def get_job_stats(engine: Engine): min_psf_partial_den, max_psf_partial_den, sum_psf_partial_den = \ min_max_sum(psf_partial_den, min_psf_partial_den, max_psf_partial_den, sum_psf_partial_den) min_cpu_u, max_cpu_u, sum_cpu_u = min_max_sum(min_cpu_u, max_cpu_u, sum_cpu_u) min_gpu_u, max_gpu_u, sum_gpu_u = min_max_sum(min_gpu_u, max_gpu_u, sum_gpu_u) min_ntx_u, max_ntx_u, sum_ntx_u = min_max_sum(min_ntx_u, max_ntx_u, sum_ntx_u) min_nrx_u, max_nrx_u, sum_nrx_u = min_max_sum(min_nrx_u, max_nrx_u, sum_nrx_u) if job['num_nodes'] <= 5: jobsSmall += 1 elif job['num_nodes'] <= 50: jobsMedium += 1 elif job['num_nodes'] <= 250: jobsLarge += 1 elif job['num_nodes'] <= 4500: jobsVLarge += 1 else: # job['nodes_required'] > 250: jobsHuge += 1 Loading Loading @@ -173,7 +187,8 @@ def get_job_stats(engine: Engine): 'Jobs <= 5 nodes': jobsSmall, 'Jobs <= 50 nodes': jobsMedium, 'Jobs <= 250 nodes': jobsLarge, 'Jobs > 250 nodes': jobsHuge, 'Jobs <= 4500 nodes': jobsVLarge, 'Jobs > 4500 nodes': jobsHuge, # Information on job-mix executed 'min job size': min_job_size, 'max job size': max_job_size, Loading @@ -193,6 +208,19 @@ def get_job_stats(engine: Engine): 'min_aggregate_node_hours': min_agg_node_hours, 'max_aggregate_node_hours': max_agg_node_hours, 'avg_aggregate_node_hours': avg_agg_node_hours, # Utilization: 'min_cpu_util': min_cpu_u, 'max_cpu_util': max_cpu_u, 'sum_cpu_util': sum_cpu_u, 'min_gpu_util': min_gpu_u, 'max_gpu_util': max_gpu_u, 'sum_gpu_util': sum_gpu_u, 'min_ntx_util': min_ntx_u, 'max_ntx_util': max_ntx_u, 'sum_ntx_util': sum_ntx_u, 'min_nrx_util': min_nrx_u, 'max_nrx_util': max_nrx_u, 'sum_nrx_util': sum_nrx_u, # Completion statistics 'min_wait_time': min_wait_time, 'max_wait_time': max_wait_time, Loading