Loading raps/engine.py +6 −6 Original line number Diff line number Diff line Loading @@ -499,7 +499,7 @@ class Engine: # update Running time for job in self.running: if job.current_state == JobState.RUNNING: job.running_time = self.current_timestep - job.start_time job.current_run_time = self.current_timestep - job.start_time # Stop the simulation if no more jobs are running or in the queue or in the job list. if autoshutdown and \ Loading Loading @@ -552,7 +552,7 @@ class Engine: for job in self.running: job.running_time = self.current_timestep - job.start_time job.current_run_time = self.current_timestep - job.start_time if job.current_state != JobState.RUNNING: raise ValueError( Loading @@ -561,15 +561,15 @@ class Engine: ) else: # if job.state == JobState.RUNNING: # Error checks if not replay and job.running_time > job.time_limit and job.end_time is not None: if not replay and job.current_run_time > job.time_limit and job.end_time is not None: raise Exception(f"Job exceded time limit! " f"{job.running_time} > {job.time_limit}" f"{job.current_run_time} > {job.time_limit}" f"\n{job}" f"\nCurrent timestep:{self.current_timestep - self.timestep_start} (rel)" ) if replay and job.running_time > job.expected_run_time: if replay and job.current_run_time > job.expected_run_time: raise Exception(f"Job should have ended in replay! " f" {job.running_time} > {job.expected_run_time}" f" {job.current_run_time} > {job.expected_run_time}" f"\n{job}" f"\nCurrent timestep:{self.current_timestep - self.timestep_start} (rel)" ) Loading raps/job.py +3 −3 Original line number Diff line number Diff line Loading @@ -180,7 +180,7 @@ class Job: self.trace_start_time = None # Relative start time of the trace (to running time) self.trace_end_time = None # Relative end time of the trace self.trace_quanta = None # Trace quanta associated with the job # None means single value! self.running_time = 0 # Current running time updated when simulating self.current_run_time = 0 # Current running time updated when simulating # If a job dict was given, override the values from the job_dict: for key, value in job_dict.items(): Loading Loading @@ -232,7 +232,7 @@ class Job: f"trace_start_time={self.trace_start_time}, " f"trace_end_time={self.trace_end_time}, " f"trace_quanta={self.trace_quanta}, " f"running_time={self.running_time}, " f"current_run_time={self.current_run_time}, " f"power={self.power}, " f"power_history={self.power_history})") Loading Loading @@ -296,7 +296,7 @@ class JobStatistics: self.account = job.account self.num_nodes = len(job.scheduled_nodes) self.scheduled_nodes = job.scheduled_nodes self.run_time = job.running_time self.run_time = job.current_run_time self.submit_time = job.submit_time self.start_time = job.start_time self.end_time = job.end_time Loading raps/network/base.py +11 −7 Original line number Diff line number Diff line Loading @@ -4,6 +4,7 @@ from raps.utils import get_current_utilization from raps.network.fat_tree import node_id_to_host_name from raps.network.torus3d import link_loads_for_job_torus, torus_host_from_real_index def debug_print_trace(job, label: str = ""): """Print either the length (if iterable) or the value of job.gpu_trace.""" if hasattr(job.gpu_trace, "__len__"): Loading Loading @@ -138,6 +139,7 @@ def worst_link_util(loads, throughput): max_util = util return max_util def get_link_util_stats(loads, throughput, top_n=10): """ Calculates a distribution of link utilization stats. Loading @@ -164,11 +166,13 @@ def get_link_util_stats(loads, throughput, top_n=10): return stats def max_throughput_per_tick(legacy_cfg: dict, trace_quanta: int) -> float: """Return bytes-per-tick throughput of a single link.""" bw = legacy_cfg.get("NETWORK_MAX_BW") or 12.5e9 return float(bw) * trace_quanta def simulate_inter_job_congestion(network_model, jobs, legacy_cfg, debug=False): """ Simulates network congestion from a list of concurrently running jobs. Loading @@ -181,8 +185,8 @@ def simulate_inter_job_congestion(network_model, jobs, legacy_cfg, debug=False): trace_quanta = jobs[0].trace_quanta if jobs else 0 for job in jobs: # Assuming job.running_time is 0 for this static simulation job.running_time = 0 # Assuming job.current_run_time is 0 for this static simulation job.current_run_time = 0 job.trace_start_time = 0 net_tx = get_current_utilization(job.ntx_trace, job) Loading raps/power.py +1 −2 Original line number Diff line number Diff line Loading @@ -55,7 +55,7 @@ def compute_node_power(cpu_util, gpu_util, net_util, config): power_gpu = gpu_util * config['POWER_GPU_MAX'] + \ (config['GPUS_PER_NODE'] - gpu_util) * config['POWER_GPU_IDLE'] if config.get("POWER_NIC_IDLE") != None and config.get("POWER_NIC_MAX") != None: if config.get("POWER_NIC_IDLE") is not None and config.get("POWER_NIC_MAX") is not None: power_nic = config['POWER_NIC_IDLE'] + \ (config['POWER_NIC_MAX'] - config['POWER_NIC_IDLE']) * net_util else: Loading Loading @@ -432,7 +432,6 @@ class PowerManager: jobs_power = self.update_power_state(scheduled_nodes, cpu_utils, gpu_utils, net_utils) for i, job in enumerate(running_jobs): # if job.running_time % self.config['TRACE_QUANTA'] == 0: job.power_history.append(jobs_power[i] * len(job.scheduled_nodes)) # Update the power array UI component Loading raps/ui.py +5 −5 Original line number Diff line number Diff line Loading @@ -191,10 +191,10 @@ class LayoutManager: nodes_display = col_nodelist if self.engine.downscale != 1: running_time_str = convert_seconds_to_hhmmss(job.running_time // self.engine.downscale) + \ f" +{job.running_time % self.engine.downscale}/{self.engine.downscale}s" running_time_str = convert_seconds_to_hhmmss(job.current_run_time // self.engine.downscale) + \ f" +{job.current_run_time % self.engine.downscale}/{self.engine.downscale}s" else: running_time_str = convert_seconds_to_hhmm(job.running_time) running_time_str = convert_seconds_to_hhmm(job.current_run_time) row = [ str(job.id).zfill(5), Loading Loading @@ -269,13 +269,13 @@ class LayoutManager: # Add data row with white values time_in_s = time // self.engine.downscale if (time_in_s < 946684800): # Introducing Y2K into our codebase! Kek time_str = convert_seconds_to_hhmm(time_in_s) time_str = convert_seconds_to_hhmmss(time_in_s) else: # For the curious: If the simulation time in seconds is large than # unix timestamp for Jan 2000 this is a unix timestamp, time_str = f"{datetime.fromtimestamp(time_in_s).strftime('%Y-%m-%d %H:%M')}" if timestep_start != 0: # append time simulated time_str += f"\nSim: {convert_seconds_to_hhmm(time_in_s - timestep_start)}" time_str += f"\nSim: {convert_seconds_to_hhmmss(time_in_s - timestep_start)}" row.append(time_str) row.append(str(nrun)) Loading Loading
raps/engine.py +6 −6 Original line number Diff line number Diff line Loading @@ -499,7 +499,7 @@ class Engine: # update Running time for job in self.running: if job.current_state == JobState.RUNNING: job.running_time = self.current_timestep - job.start_time job.current_run_time = self.current_timestep - job.start_time # Stop the simulation if no more jobs are running or in the queue or in the job list. if autoshutdown and \ Loading Loading @@ -552,7 +552,7 @@ class Engine: for job in self.running: job.running_time = self.current_timestep - job.start_time job.current_run_time = self.current_timestep - job.start_time if job.current_state != JobState.RUNNING: raise ValueError( Loading @@ -561,15 +561,15 @@ class Engine: ) else: # if job.state == JobState.RUNNING: # Error checks if not replay and job.running_time > job.time_limit and job.end_time is not None: if not replay and job.current_run_time > job.time_limit and job.end_time is not None: raise Exception(f"Job exceded time limit! " f"{job.running_time} > {job.time_limit}" f"{job.current_run_time} > {job.time_limit}" f"\n{job}" f"\nCurrent timestep:{self.current_timestep - self.timestep_start} (rel)" ) if replay and job.running_time > job.expected_run_time: if replay and job.current_run_time > job.expected_run_time: raise Exception(f"Job should have ended in replay! " f" {job.running_time} > {job.expected_run_time}" f" {job.current_run_time} > {job.expected_run_time}" f"\n{job}" f"\nCurrent timestep:{self.current_timestep - self.timestep_start} (rel)" ) Loading
raps/job.py +3 −3 Original line number Diff line number Diff line Loading @@ -180,7 +180,7 @@ class Job: self.trace_start_time = None # Relative start time of the trace (to running time) self.trace_end_time = None # Relative end time of the trace self.trace_quanta = None # Trace quanta associated with the job # None means single value! self.running_time = 0 # Current running time updated when simulating self.current_run_time = 0 # Current running time updated when simulating # If a job dict was given, override the values from the job_dict: for key, value in job_dict.items(): Loading Loading @@ -232,7 +232,7 @@ class Job: f"trace_start_time={self.trace_start_time}, " f"trace_end_time={self.trace_end_time}, " f"trace_quanta={self.trace_quanta}, " f"running_time={self.running_time}, " f"current_run_time={self.current_run_time}, " f"power={self.power}, " f"power_history={self.power_history})") Loading Loading @@ -296,7 +296,7 @@ class JobStatistics: self.account = job.account self.num_nodes = len(job.scheduled_nodes) self.scheduled_nodes = job.scheduled_nodes self.run_time = job.running_time self.run_time = job.current_run_time self.submit_time = job.submit_time self.start_time = job.start_time self.end_time = job.end_time Loading
raps/network/base.py +11 −7 Original line number Diff line number Diff line Loading @@ -4,6 +4,7 @@ from raps.utils import get_current_utilization from raps.network.fat_tree import node_id_to_host_name from raps.network.torus3d import link_loads_for_job_torus, torus_host_from_real_index def debug_print_trace(job, label: str = ""): """Print either the length (if iterable) or the value of job.gpu_trace.""" if hasattr(job.gpu_trace, "__len__"): Loading Loading @@ -138,6 +139,7 @@ def worst_link_util(loads, throughput): max_util = util return max_util def get_link_util_stats(loads, throughput, top_n=10): """ Calculates a distribution of link utilization stats. Loading @@ -164,11 +166,13 @@ def get_link_util_stats(loads, throughput, top_n=10): return stats def max_throughput_per_tick(legacy_cfg: dict, trace_quanta: int) -> float: """Return bytes-per-tick throughput of a single link.""" bw = legacy_cfg.get("NETWORK_MAX_BW") or 12.5e9 return float(bw) * trace_quanta def simulate_inter_job_congestion(network_model, jobs, legacy_cfg, debug=False): """ Simulates network congestion from a list of concurrently running jobs. Loading @@ -181,8 +185,8 @@ def simulate_inter_job_congestion(network_model, jobs, legacy_cfg, debug=False): trace_quanta = jobs[0].trace_quanta if jobs else 0 for job in jobs: # Assuming job.running_time is 0 for this static simulation job.running_time = 0 # Assuming job.current_run_time is 0 for this static simulation job.current_run_time = 0 job.trace_start_time = 0 net_tx = get_current_utilization(job.ntx_trace, job) Loading
raps/power.py +1 −2 Original line number Diff line number Diff line Loading @@ -55,7 +55,7 @@ def compute_node_power(cpu_util, gpu_util, net_util, config): power_gpu = gpu_util * config['POWER_GPU_MAX'] + \ (config['GPUS_PER_NODE'] - gpu_util) * config['POWER_GPU_IDLE'] if config.get("POWER_NIC_IDLE") != None and config.get("POWER_NIC_MAX") != None: if config.get("POWER_NIC_IDLE") is not None and config.get("POWER_NIC_MAX") is not None: power_nic = config['POWER_NIC_IDLE'] + \ (config['POWER_NIC_MAX'] - config['POWER_NIC_IDLE']) * net_util else: Loading Loading @@ -432,7 +432,6 @@ class PowerManager: jobs_power = self.update_power_state(scheduled_nodes, cpu_utils, gpu_utils, net_utils) for i, job in enumerate(running_jobs): # if job.running_time % self.config['TRACE_QUANTA'] == 0: job.power_history.append(jobs_power[i] * len(job.scheduled_nodes)) # Update the power array UI component Loading
raps/ui.py +5 −5 Original line number Diff line number Diff line Loading @@ -191,10 +191,10 @@ class LayoutManager: nodes_display = col_nodelist if self.engine.downscale != 1: running_time_str = convert_seconds_to_hhmmss(job.running_time // self.engine.downscale) + \ f" +{job.running_time % self.engine.downscale}/{self.engine.downscale}s" running_time_str = convert_seconds_to_hhmmss(job.current_run_time // self.engine.downscale) + \ f" +{job.current_run_time % self.engine.downscale}/{self.engine.downscale}s" else: running_time_str = convert_seconds_to_hhmm(job.running_time) running_time_str = convert_seconds_to_hhmm(job.current_run_time) row = [ str(job.id).zfill(5), Loading Loading @@ -269,13 +269,13 @@ class LayoutManager: # Add data row with white values time_in_s = time // self.engine.downscale if (time_in_s < 946684800): # Introducing Y2K into our codebase! Kek time_str = convert_seconds_to_hhmm(time_in_s) time_str = convert_seconds_to_hhmmss(time_in_s) else: # For the curious: If the simulation time in seconds is large than # unix timestamp for Jan 2000 this is a unix timestamp, time_str = f"{datetime.fromtimestamp(time_in_s).strftime('%Y-%m-%d %H:%M')}" if timestep_start != 0: # append time simulated time_str += f"\nSim: {convert_seconds_to_hhmm(time_in_s - timestep_start)}" time_str += f"\nSim: {convert_seconds_to_hhmmss(time_in_s - timestep_start)}" row.append(time_str) row.append(str(nrun)) Loading