main.py +8 −5

@@ -71,12 +71,14 @@ sc = Engine(
 )
 layout_manager = LayoutManager(args.layout, engine=sc, debug=args.debug, **config)
-if args.replay:
+timestep_start = 0
+if args.fastforward:
+    args.fastforward = convert_to_seconds(args.fastforward)
+    timestep_start = args.fastforward
+
+if args.replay:
     td = Telemetry(**args_dict)
     # Try to extract date from given name to use as case directory

@@ -106,8 +108,9 @@ if args.replay:
     else:  # custom data loader
         print(*args.replay)
-        jobs, timestep_start, timestep_end = td.load_data(args.replay)
+        jobs, timestep_start_from_data, timestep_end = td.load_data(args.replay)
         td.save_snapshot(jobs, filename=DIR_NAME)
+    timestep_start += timestep_start_from_data  # + timestep_start_from_data
     # Set number of timesteps based on the last job running which we assume
     # is the maximum value of submit_time + wall_time of all the jobs
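For context on the main.py change above, here is a minimal standalone sketch of how a fast-forward offset and the telemetry-derived start can combine into the starting timestep. The convert_to_seconds below is a hypothetical stand-in, not the helper from the RAPS codebase, and all numbers are made up.

import re

def convert_to_seconds(value: str) -> int:
    """Parse '90', '90s', '15m', '2h' or '1d' into seconds (assumed semantics)."""
    match = re.fullmatch(r"(\d+)([smhd]?)", value.strip())
    if not match:
        raise ValueError(f"Unrecognized duration: {value!r}")
    amount, unit = int(match.group(1)), match.group(2)
    return amount * {"": 1, "s": 1, "m": 60, "h": 3600, "d": 86400}[unit]

# Mirrors the flow in main.py: start at 0, apply --fastforward, then shift by
# the telemetry start returned from the data loader when replaying.
timestep_start = 0
fastforward = "30m"                 # stand-in for args.fastforward
if fastforward:
    timestep_start = convert_to_seconds(fastforward)

timestep_start_from_data = 900      # stand-in for the value returned by td.load_data(...)
timestep_start += timestep_start_from_data
print(timestep_start)               # 2700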
raps/dataloaders/frontier.py +17 −17

@@ -100,7 +100,8 @@ def load_data_from_df(jobs_df: pd.DataFrame, jobprofile_df: pd.DataFrame, **kwar
     This means that the time between first_start_timestamp and telemetry_start
     has no associated values in the traces!
-    The missing values after simulation_end can be ignored, as the simulatuion will have stoped before.
+    The missing values after simulation_end can be ignored, as the simulation will have stopped before.
+    However, the times before telemetry_start have to be padded to generate correct offsets within their data!

@@ -114,8 +115,13 @@ def load_data_from_df(jobs_df: pd.DataFrame, jobprofile_df: pd.DataFrame, **kwar
     - end_time
     - wall_time (end_time - start_time, actual runtime in seconds)
     - trace_time (length of each trace in seconds)
+    - trace_start_time (time offset in seconds after which the trace starts)
+    - trace_end_time (time offset in seconds after which the trace ends)
+      These have to be set for use within the simulation.
+      trace_start_time and trace_end_time are similar to telemetry_start and telemetry_stop, but job specific.
     The returned values are these three:
     - The list of parsed jobs (as a job_dict)
     - telemetry_start: int (in seconds)

@@ -157,7 +163,6 @@ def load_data_from_df(jobs_df: pd.DataFrame, jobprofile_df: pd.DataFrame, **kwar
     diff = first_start_timestamp - telemetry_start_timestamp
     first_start = int(diff.total_seconds())  # negative seconds or 0
     num_jobs = len(jobs_df)
     if debug:
         print("num_jobs:", num_jobs)

@@ -191,7 +196,7 @@ def load_data_from_df(jobs_df: pd.DataFrame, jobprofile_df: pd.DataFrame, **kwar
         cpu_power_array = cpu_power.values
         cpu_min_power = nodes_required * config['POWER_CPU_IDLE'] * config['CPUS_PER_NODE']
         cpu_max_power = nodes_required * config['POWER_CPU_MAX'] * config['CPUS_PER_NODE']
-        cpu_util = power_to_utilization(cpu_power_array, cpu_min_power, cpu_max_power)
+        cpu_util = power_to_utilization(cpu_power_array, cpu_min_power, cpu_max_power)  # Will be negative, as cpu_power_array[i] can be smaller than cpu_min_power
         cpu_trace = cpu_util * config['CPUS_PER_NODE']
         gpu_power = jobprofile_df[jobprofile_df['allocation_id'] \

@@ -222,28 +227,24 @@ def load_data_from_df(jobs_df: pd.DataFrame, jobprofile_df: pd.DataFrame, **kwar
         diff = end_time_timestamp - telemetry_start_timestamp
         end_time = diff.total_seconds()
         wall_time = end_time - start_time
         if np.isnan(wall_time):
             wall_time = 0
         trace_time = gpu_trace.size * config['TRACE_QUANTA']  # seconds
+        trace_start_time = 0
+        trace_end_time = trace_time
         if wall_time > trace_time:
-            missing_steps = int(wall_time - trace_time)
+            missing_trace_time = wall_time - trace_time
             if start_time < 0:
-                cpu_trace = np.concatenate((np.array([0] * missing_steps), cpu_trace))
-                gpu_trace = np.concatenate((np.array([0] * missing_steps), gpu_trace))
-                print(f"Job: {job_id} prepended {missing_steps} Values with idle power!")
-                print(f"{start_time} - {end_time}")
+                trace_start_time = missing_trace_time
+                trace_end_time = wall_time
             elif end_time > telemetry_end:
-                cpu_trace = np.concatenate((cpu_trace, np.array([0] * missing_steps)))
-                gpu_trace = np.concatenate((gpu_trace, np.array([0] * missing_steps)))
-                print(f"Job: {job_id} appended {missing_steps} Values with idle power!")
-                print(f"{start_time} - {end_time}")
+                trace_start_time = 0
+                trace_end_time = trace_time
             else:
                 print(f"Job: {job_id} {start_time} - {end_time}!")
                 raise ValueError("Missing values not at start nor end.")
             trace_time = wall_time  # Pretending to have a full trace, this may not be needed!
         xnames = jobs_df.loc[jidx, 'xnames']
         # Don't replay any job with an empty set of xnames

@@ -277,15 +278,14 @@ def load_data_from_df(jobs_df: pd.DataFrame, jobprofile_df: pd.DataFrame, **kwar
             print("Job starts after last recorded telemetry entry:", job_id, "start:", start_time, "end:", end_time, " Telemetry: ", len(gpu_trace), "entries.")
             continue  # SKIP!
         if gpu_trace.size > 0 and (jid == job_id or jid == '*'):  # and time_submit >= 0:
             job_info = job_dict(nodes_required, name, account, cpu_trace, gpu_trace,
                                 [], [], end_state, scheduled_nodes, job_id, priority,  # partition missing
                                 submit_time=submit_time, time_limit=time_limit,
                                 start_time=start_time, end_time=end_time,
-                                wall_time=wall_time, trace_time=trace_time)
+                                wall_time=wall_time, trace_time=trace_time,
+                                trace_start_time=trace_start_time, trace_end_time=trace_end_time)
             jobs.append(job_info)
     return jobs, telemetry_start, telemetry_end
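To illustrate the frontier.py change above: the loader now records a per-job trace window instead of padding the traces with idle samples. Below is a simplified, self-contained sketch of that window computation, with an assumed TRACE_QUANTA of 15 seconds and made-up inputs; it is not the loader code itself.

import numpy as np

TRACE_QUANTA = 15  # assumed seconds per telemetry sample

def trace_window(start_time, end_time, telemetry_end, gpu_trace):
    """Compute (trace_start_time, trace_end_time) relative to job running time,
    following the padding rules sketched above (simplified)."""
    wall_time = end_time - start_time
    if np.isnan(wall_time):
        wall_time = 0
    trace_time = gpu_trace.size * TRACE_QUANTA
    trace_start_time, trace_end_time = 0, trace_time
    if wall_time > trace_time:
        missing_trace_time = wall_time - trace_time
        if start_time < 0:
            # Job started before telemetry begins: values are missing at the front.
            trace_start_time, trace_end_time = missing_trace_time, wall_time
        elif end_time > telemetry_end:
            # Job ends after telemetry stops: values are missing at the back.
            trace_start_time, trace_end_time = 0, trace_time
        else:
            raise ValueError("Missing values not at start nor end.")
    return trace_start_time, trace_end_time

# A job that started 300 s before telemetry began and has 40 recorded samples:
print(trace_window(-300, 600, 86400, np.zeros(40)))  # (300, 900): first 300 s have no telemetry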
raps/engine.py +42 −19

@@ -148,16 +148,42 @@ class Engine:
         cpu_utils = []
         gpu_utils = []
         net_utils = []
+        if self.debug:
+            print(f"Current Time: {self.current_time}")
         for job in self.running:
+            if self.debug:
+                print(f"JobID: {job.id}")
             if job.state == JobState.RUNNING:
                 job.running_time = self.current_time - job.start_time
-                if job.running_time > job.trace_time:
-                    raise ValueError(f"Trace Ended before job ended!\n\
-                        {job.running_time} > {job.trace_time}\n\
-                        {len(job.cpu_trace)} vs. {self.running_time // self.config['TRACE_QUANTA']}\
+                if job.running_time > job.wall_time:
+                    raise Exception(f"Job should have ended already!\n\
+                        {job.running_time} > {job.wall_time}\n\
+                        {len(job.cpu_trace)} vs. {job.running_time // self.config['TRACE_QUANTA']}\
                         ")
-                time_quanta_index = int(job.running_time // self.config['TRACE_QUANTA'])
+                if job.running_time < job.trace_start_time or job.running_time > job.trace_end_time:
+                    cpu_util = 0  # No values available, therefore we assume IDLE == 0
+                    gpu_util = 0
+                    net_util = 0
+                    if self.debug:
+                        print("No values in trace, using IDLE.")
+                    if self.scheduler.policy == PolicyType.REPLAY:
+                        print(f"{job.running_time} < {job.trace_start_time} or {job.running_time} > {job.trace_end_time}")
+                        raise Exception("Replay is using IDLE values! Something is wrong!")
+                else:
+                    time_quanta_index = int((job.running_time - job.trace_start_time) // self.config['TRACE_QUANTA'])
+                    if time_quanta_index == len(job.cpu_trace):
+                        # If the running time is past the last time step in the
+                        # trace, use the last value in the trace. This can happen
+                        # if the last valid timestep is e.g. 17 with a quantum of
+                        # 15: the last trace value sits at 15 and the next possible
+                        # value at 30 was never recorded because the job ended
+                        # before. Every other error condition is handled via
+                        # trace_start_time and trace_end_time.
+                        time_quanta_index -= 1
+                    cpu_util = get_utilization(job.cpu_trace, time_quanta_index)
+                    gpu_util = get_utilization(job.gpu_trace, time_quanta_index)
+                    net_util = 0

@@ -188,7 +214,6 @@ class Engine:
                 job.power_history.append(jobs_power[i] * len(job.scheduled_nodes))
         del _running_jobs
         # Update the power array UI component
         rack_power, rect_losses = self.power_manager.compute_rack_power()
         sivoc_losses = self.power_manager.compute_sivoc_losses()

@@ -252,41 +277,39 @@ class Engine:
         self.current_time += 1
         return tick_data

-    def prepare_system_state(self, all_jobs: List, timestep_start):
+    def prepare_system_state(self, all_jobs: List, timestep_start, replay: bool):
         # Modifies Jobs object
         self.current_time = timestep_start
-        # keep only jobs that have not yet ended
+        # Keep only jobs that have not yet ended
         all_jobs[:] = [job for job in all_jobs if job['end_time'] >= timestep_start]
         all_jobs.sort(key=lambda j: j['submit_time'])
         self.add_running_jobs_to_queue(all_jobs)
         # Now process the job queue one by one (needed to get the start_time right!)
-        for job in self.queue:
+        for job in self.queue[:]:  # operate over a slice copy to be able to remove jobs from the queue if placed
             self.scheduler.schedule([job], self.running, job.start_time, sorted=True)
             if len(self.queue) != len(self.running):
                 self.queue.remove(job)
+        if replay and len(self.queue) != 0:
+            raise ValueError(f"Something went wrong! Not all jobs could be placed!\nPotential conflict in queue:\n{self.queue}")
         self.queue = []  # Empty queue needed, as adding jobs one by one does not empty the queue!

     def run_simulation(self, jobs, timestep_start, timestep_end, autoshutdown=False):
         """Generator that yields after each simulation tick."""
         self.timesteps = timestep_end - timestep_start  # Where is this used?
-        # Place jobs that are currently running, onto the system.
-        self.prepare_system_state(jobs, timestep_start)
+        if self.scheduler.policy == PolicyType.REPLAY:
+            replay = True
+        else:
+            replay = False
+
+        # Place jobs that are currently running onto the system.
+        self.prepare_system_state(jobs, timestep_start, replay)

         for timestep in range(timestep_start, timestep_end):
             completed_jobs, newly_downed_nodes = self.prepare_timestep(replay)
             # Identify eligible jobs and add them to the queue.
             #self.queue += self.eligible_jobs(jobs, self.current_time)
             #jobs = self.add_eligible_jobs_to_queue(jobs)
             self.add_eligible_jobs_to_queue(jobs)
             # Schedule jobs that are now in the queue.
             self.scheduler.schedule(self.queue, self.running, self.current_time, sorted=False)
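The engine change above only looks up utilization inside a job's trace window and falls back to idle outside it (or raises under REPLAY), clamping the index when the running time lands just past the last recorded sample. A standalone sketch of that lookup follows, assuming a 15 second quantum and plain lists instead of the engine's get_utilization helper.

def utilization_at(running_time, trace, trace_start_time, trace_end_time, quanta=15):
    """Return the utilization sample for the given running time (standalone sketch)."""
    if running_time < trace_start_time or running_time > trace_end_time:
        return 0.0  # no telemetry for this period, assume idle
    index = int((running_time - trace_start_time) // quanta)
    if index == len(trace):
        # Running time sits just past the last recorded sample (e.g. 17 with a
        # 15 s quantum): reuse the final value instead of reading out of bounds.
        index -= 1
    return trace[index]

trace = [0.2, 0.8, 0.9, 0.4]             # one value per 15 s quantum
print(utilization_at(10, trace, 0, 60))  # 0.2
print(utilization_at(60, trace, 0, 60))  # 0.4 (clamped to the last sample)
print(utilization_at(70, trace, 0, 60))  # 0.0 (outside the trace window)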
raps/job.py +8 −2

@@ -4,7 +4,7 @@ def job_dict(nodes_required, name, account, \
              cpu_trace, gpu_trace, ntx_trace, nrx_trace, \
              end_state, scheduled_nodes, job_id, priority=0, partition=0,
              submit_time=0, time_limit=0, start_time=0, end_time=0,
-             wall_time=0, trace_time=0):
+             wall_time=0, trace_time=0, trace_start_time=0, trace_end_time=0):
     """ Return job info dictionary """
     return {
         'nodes_required': nodes_required,

@@ -25,7 +25,9 @@ def job_dict(nodes_required, name, account, \
         'start_time': start_time,
         'end_time': end_time,
         'wall_time': wall_time,
-        'trace_time': trace_time
+        'trace_time': trace_time,
+        'trace_start_time': trace_start_time,
+        'trace_end_time': trace_end_time
     }

@@ -65,6 +67,8 @@ class Job:
         self.end_time = None          # Actual end time when executing or from telemetry
         self.wall_time = None         # end_time - start_time
         self.trace_time = None        # Time period for which traces are available
+        self.trace_start_time = None  # Relative start time of the trace (to running time)
+        self.trace_end_time = None    # Relative end time of the trace
         self.running_time = 0         # Current running time, updated when simulating
         # If a job dict was given, override the values from the job_dict:

@@ -87,6 +91,8 @@ class Job:
                 f"start_time={self.start_time}, end_time={self.end_time}, "
                 f"wall_time={self.wall_time}, "
                 f"trace_time={self.trace_time}, "
+                f"trace_start_time={self.trace_start_time}, "
+                f"trace_end_time={self.trace_end_time}, "
                 f"running_time={self.running_time}, state={self._state}, "
                 f"scheduled_nodes={self.scheduled_nodes}, power={self.power}, "
                 f"power_history={self.power_history})")
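As a quick illustration of the two new job_dict fields (an assumed usage example, not code from this PR), a loader could describe a job whose telemetry only covers the tail of its run like this, assuming raps.job is importable as a package module:

from raps.job import job_dict

# A 4-node job whose telemetry only covers the last 600 s of a 900 s run, so
# the first 300 s of running time fall outside the trace window.
info = job_dict(
    4, "example-job", "acct123",
    cpu_trace=[0.5] * 40, gpu_trace=[0.7] * 40, ntx_trace=[], nrx_trace=[],
    end_state="COMPLETED", scheduled_nodes=None, job_id=1001,
    submit_time=0, time_limit=3600,
    start_time=-300, end_time=600, wall_time=900, trace_time=600,
    trace_start_time=300, trace_end_time=900,
)
print(info["trace_start_time"], info["trace_end_time"])  # 300 900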
raps/workload.py +16 −7

@@ -85,7 +85,8 @@ class Workload:
             jobs.append(job_dict(nodes_required, name, account, cpu_trace, gpu_trace, net_tx, net_rx, \
                                  end_state, None, job_index, priority, partition,
-                                 time_to_next_job, time_limit, time_to_next_job, time_to_next_job + wall_time, wall_time, wall_time))
+                                 time_to_next_job, time_limit, time_to_next_job, time_to_next_job + wall_time,
+                                 wall_time, wall_time, 0, wall_time))
         return jobs

@@ -128,7 +129,9 @@ class Workload:
             0,                                        # Start time / or None
             len(gpu_trace) * config['TRACE_QUANTA'],  # End time / or None
             len(gpu_trace) * config['TRACE_QUANTA'],  # Wall time
-            len(gpu_trace) * config['TRACE_QUANTA']   # Trace time
+            len(gpu_trace) * config['TRACE_QUANTA'],  # Trace time
+            0,                                        # Trace start time
+            len(gpu_trace) * config['TRACE_QUANTA']   # Trace end time
         )
         print(job_info)
         jobs.append(job_info)  # Add job to the list

@@ -170,7 +173,9 @@ class Workload:
             0,                                        # Start time / or None
             len(gpu_trace) * config['TRACE_QUANTA'],  # End time / or None
             len(gpu_trace) * config['TRACE_QUANTA'],  # Wall time
-            len(gpu_trace) * config['TRACE_QUANTA']   # Trace time
+            len(gpu_trace) * config['TRACE_QUANTA'],  # Trace time
+            0,                                        # Trace start time
+            len(gpu_trace) * config['TRACE_QUANTA']   # Trace end time
         )
         jobs.append(job_info)  # Add job to the list

@@ -199,7 +204,8 @@ class Workload:
             f"Max Test {partition}", account, cpu_trace, gpu_trace, net_tx, net_rx,
             'COMPLETED', None, None, 100, partition,
             0, len(gpu_trace) * config['TRACE_QUANTA'] + 1,
-            0, 10800, len(gpu_trace) * config['TRACE_QUANTA'], len(gpu_trace) * config['TRACE_QUANTA']
+            0, 10800, len(gpu_trace) * config['TRACE_QUANTA'], len(gpu_trace) * config['TRACE_QUANTA'],
+            0, len(gpu_trace) * config['TRACE_QUANTA']
         )
         jobs.append(job_info)

@@ -211,7 +217,8 @@ class Workload:
             f"OpenMxP {partition}", account, cpu_trace, gpu_trace, net_tx, net_rx,
             'COMPLETED', None, None, 100, partition,
             0, len(gpu_trace) * config['TRACE_QUANTA'] + 1,
-            10800, 14200, len(gpu_trace) * config['TRACE_QUANTA'], len(gpu_trace) * config['TRACE_QUANTA']
+            10800, 14200, len(gpu_trace) * config['TRACE_QUANTA'], len(gpu_trace) * config['TRACE_QUANTA'],
+            0, len(gpu_trace) * config['TRACE_QUANTA']
         )
         jobs.append(job_info)

@@ -223,7 +230,8 @@ class Workload:
             f"HPL {partition}", account, cpu_trace, gpu_trace, net_tx, net_rx,
             'COMPLETED', None, None, 100, partition,
             0, len(gpu_trace) * config['TRACE_QUANTA'] + 1,
-            14200, 17800, len(gpu_trace) * config['TRACE_QUANTA'], len(gpu_trace) * config['TRACE_QUANTA']
+            14200, 17800, len(gpu_trace) * config['TRACE_QUANTA'], len(gpu_trace) * config['TRACE_QUANTA'],
+            0, len(gpu_trace) * config['TRACE_QUANTA']
        )
         jobs.append(job_info)

@@ -235,7 +243,8 @@ class Workload:
             f"Idle Test {partition}", account, cpu_trace, gpu_trace, net_tx, net_rx,
             'COMPLETED', None, None, 100, partition,
             0, len(gpu_trace) * config['TRACE_QUANTA'] + 1,
-            17800, 21400, len(gpu_trace) * config['TRACE_QUANTA'], len(gpu_trace) * config['TRACE_QUANTA']
+            17800, 21400, len(gpu_trace) * config['TRACE_QUANTA'], len(gpu_trace) * config['TRACE_QUANTA'],
+            0, len(gpu_trace) * config['TRACE_QUANTA']
         )
         jobs.append(job_info)
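The workload.py generators above all pass the same len(gpu_trace) * config['TRACE_QUANTA'] expression for end time, wall time, trace time and now trace end time, since a synthetic trace covers the whole run. A small hypothetical helper (not part of the PR) makes that pattern explicit, with an assumed 15 second quantum:

TRACE_QUANTA = 15  # assumed seconds per sample, i.e. config['TRACE_QUANTA']

def full_trace_times(gpu_trace, quanta=TRACE_QUANTA):
    """Timing fields for a synthetic job whose trace covers the whole run."""
    duration = len(gpu_trace) * quanta
    return {
        "end_time": duration,
        "wall_time": duration,
        "trace_time": duration,
        "trace_start_time": 0,
        "trace_end_time": duration,
    }

print(full_trace_times([0.9] * 240))  # one hour of samples at 15 s resolution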