Merge branch 'fix-running-time' into 'hpl' (c6d23273) · Commits · ExaDigiT / sim-raps

raps/engine.py

+6 −6

Original line number	Diff line number	Diff line
		@@ -499,7 +499,7 @@ class Engine:
		# update Running time
		for job in self.running:
		if job.current_state == JobState.RUNNING:
		job.running_time = self.current_timestep - job.start_time
		job.current_run_time = self.current_timestep - job.start_time

		# Stop the simulation if no more jobs are running or in the queue or in the job list.
		if autoshutdown and \
		@@ -552,7 +552,7 @@ class Engine:

		for job in self.running:

		job.running_time = self.current_timestep - job.start_time
		job.current_run_time = self.current_timestep - job.start_time

		if job.current_state != JobState.RUNNING:
		raise ValueError(
		@@ -561,15 +561,15 @@ class Engine:
		)
		else: # if job.state == JobState.RUNNING:
		# Error checks
		if not replay and job.running_time > job.time_limit and job.end_time is not None:
		if not replay and job.current_run_time > job.time_limit and job.end_time is not None:
		raise Exception(f"Job exceded time limit! "
		f"{job.running_time} > {job.time_limit}"
		f"{job.current_run_time} > {job.time_limit}"
		f"\n{job}"
		f"\nCurrent timestep:{self.current_timestep - self.timestep_start} (rel)"
		)
		if replay and job.running_time > job.expected_run_time:
		if replay and job.current_run_time > job.expected_run_time:
		raise Exception(f"Job should have ended in replay! "
		f" {job.running_time} > {job.expected_run_time}"
		f" {job.current_run_time} > {job.expected_run_time}"
		f"\n{job}"
		f"\nCurrent timestep:{self.current_timestep - self.timestep_start} (rel)"
		)

+3 −3

Original line number	Diff line number	Diff line
		@@ -180,7 +180,7 @@ class Job:
		self.trace_start_time = None # Relative start time of the trace (to running time)
		self.trace_end_time = None # Relative end time of the trace
		self.trace_quanta = None # Trace quanta associated with the job # None means single value!
		self.running_time = 0 # Current running time updated when simulating
		self.current_run_time = 0 # Current running time updated when simulating

		# If a job dict was given, override the values from the job_dict:
		for key, value in job_dict.items():
		@@ -232,7 +232,7 @@ class Job:
		f"trace_start_time={self.trace_start_time}, "
		f"trace_end_time={self.trace_end_time}, "
		f"trace_quanta={self.trace_quanta}, "
		f"running_time={self.running_time}, "
		f"current_run_time={self.current_run_time}, "
		f"power={self.power}, "
		f"power_history={self.power_history})")

		@@ -296,7 +296,7 @@ class JobStatistics:
		self.account = job.account
		self.num_nodes = len(job.scheduled_nodes)
		self.scheduled_nodes = job.scheduled_nodes
		self.run_time = job.running_time
		self.run_time = job.current_run_time
		self.submit_time = job.submit_time
		self.start_time = job.start_time
		self.end_time = job.end_time

+11 −7

Original line number	Diff line number	Diff line
		@@ -4,6 +4,7 @@ from raps.utils import get_current_utilization
		from raps.network.fat_tree import node_id_to_host_name
		from raps.network.torus3d import link_loads_for_job_torus, torus_host_from_real_index


		def debug_print_trace(job, label: str = ""):
		"""Print either the length (if iterable) or the value of job.gpu_trace."""
		if hasattr(job.gpu_trace, "__len__"):
		@@ -138,6 +139,7 @@ def worst_link_util(loads, throughput):
		max_util = util
		return max_util


		def get_link_util_stats(loads, throughput, top_n=10):
		"""
		Calculates a distribution of link utilization stats.
		@@ -164,11 +166,13 @@ def get_link_util_stats(loads, throughput, top_n=10):

		return stats


		def max_throughput_per_tick(legacy_cfg: dict, trace_quanta: int) -> float:
		"""Return bytes-per-tick throughput of a single link.

		Reads ``NETWORK_MAX_BW`` from ``legacy_cfg``; when the key is missing or
		its value is falsy (e.g. ``None`` or ``0``), 12.5e9 is used instead. The
		result is that bandwidth converted to ``float`` and multiplied by
		``trace_quanta``.

		NOTE(review): 12.5e9 presumably means bytes per second (100 Gbit/s) and
		``trace_quanta`` seconds per tick -- confirm against the config schema.
		"""
		bw = legacy_cfg.get("NETWORK_MAX_BW") or 12.5e9
		return float(bw) * trace_quanta


		def simulate_inter_job_congestion(network_model, jobs, legacy_cfg, debug=False):
		"""
		Simulates network congestion from a list of concurrently running jobs.
		@@ -181,8 +185,8 @@ def simulate_inter_job_congestion(network_model, jobs, legacy_cfg, debug=False):
		trace_quanta = jobs[0].trace_quanta if jobs else 0

		for job in jobs:
		# Assuming job.running_time is 0 for this static simulation
		job.running_time = 0
		# Assuming job.current_run_time is 0 for this static simulation
		job.current_run_time = 0
		job.trace_start_time = 0
		net_tx = get_current_utilization(job.ntx_trace, job)

+1 −2

Original line number	Diff line number	Diff line
		@@ -55,7 +55,7 @@ def compute_node_power(cpu_util, gpu_util, net_util, config):
		power_gpu = gpu_util * config['POWER_GPU_MAX'] + \
		(config['GPUS_PER_NODE'] - gpu_util) * config['POWER_GPU_IDLE']

		if config.get("POWER_NIC_IDLE") != None and config.get("POWER_NIC_MAX") != None:
		if config.get("POWER_NIC_IDLE") is not None and config.get("POWER_NIC_MAX") is not None:
		power_nic = config['POWER_NIC_IDLE'] + \
		(config['POWER_NIC_MAX'] - config['POWER_NIC_IDLE']) * net_util
		else:
		@@ -432,7 +432,6 @@ class PowerManager:
		jobs_power = self.update_power_state(scheduled_nodes, cpu_utils, gpu_utils, net_utils)

		for i, job in enumerate(running_jobs):
		# if job.running_time % self.config['TRACE_QUANTA'] == 0:
		job.power_history.append(jobs_power[i] * len(job.scheduled_nodes))

		# Update the power array UI component

+5 −5

Original line number	Diff line number	Diff line
		@@ -191,10 +191,10 @@ class LayoutManager:
		nodes_display = col_nodelist

		if self.engine.downscale != 1:
		running_time_str = convert_seconds_to_hhmmss(job.running_time // self.engine.downscale) + \
		f" +{job.running_time % self.engine.downscale}/{self.engine.downscale}s"
		running_time_str = convert_seconds_to_hhmmss(job.current_run_time // self.engine.downscale) + \
		f" +{job.current_run_time % self.engine.downscale}/{self.engine.downscale}s"
		else:
		running_time_str = convert_seconds_to_hhmm(job.running_time)
		running_time_str = convert_seconds_to_hhmm(job.current_run_time)

		row = [
		str(job.id).zfill(5),
		@@ -269,13 +269,13 @@ class LayoutManager:
		# Add data row with white values
		time_in_s = time // self.engine.downscale
		if (time_in_s < 946684800): # Introducing Y2K into our codebase! Kek
		time_str = convert_seconds_to_hhmm(time_in_s)
		time_str = convert_seconds_to_hhmmss(time_in_s)
		else:
		# For the curious: if the simulation time in seconds is larger than the
		# Unix timestamp for Jan 2000, it is treated as a Unix timestamp.
		time_str = f"{datetime.fromtimestamp(time_in_s).strftime('%Y-%m-%d %H:%M')}"
		if timestep_start != 0: # append time simulated
		time_str += f"\nSim: {convert_seconds_to_hhmm(time_in_s - timestep_start)}"
		time_str += f"\nSim: {convert_seconds_to_hhmmss(time_in_s - timestep_start)}"

		row.append(time_str)
		row.append(str(nrun))