update lassen data loader, deriving trace from utilization. (1d22fe20) · Commits · ExaDigiT / sim-raps

config/lassen/system.json

+1 −0

Original line number	Diff line number	Diff line
		@@ -12,6 +12,7 @@
		"MISSING_RACKS": [44],
		"DOWN_NODES": [],
		"CPUS_PER_NODE": 2,
		"CORES_PER_CPU": 22,
		"GPUS_PER_NODE": 4,
		"CPU_PEAK_FLOPS": 396.8E9,
		"GPU_PEAK_FLOPS": 7.8E12,

raps/dataloaders/lassen.py

+20 −20

Original line number	Diff line number	Diff line
		@@ -36,14 +36,15 @@ from tqdm import tqdm

		load_config_variables(['TRACE_QUANTA', 'CPUS_PER_NODE', 'GPUS_PER_NODE',
		'POWER_GPU_IDLE', 'POWER_GPU_MAX', 'POWER_CPU_IDLE',
		'POWER_CPU_MAX', 'POWER_MEM', 'POWER_NIC', 'POWER_NVME',
		'POWER_CPU_MAX', 'POWER_MEM', 'POWER_NIC',
		'POWER_NVME', 'POWER_CDU', 'POWER_SWITCH', 'CORES_PER_CPU',
		'NICS_PER_NODE'], globals())

		def load_data(path, **kwargs):
		"""
		Loads data from the given file paths and returns job info.
		"""
		nrows = 1E4
		nrows = None # 1E4
		alloc_df = pd.read_csv(os.path.join(path[0], 'final_csm_allocation_history_hashed.csv'), nrows=nrows)
		node_df = pd.read_csv(os.path.join(path[0], 'final_csm_allocation_node_history.csv'), nrows=nrows)
		step_df = pd.read_csv(os.path.join(path[0], 'final_csm_step_history.csv'), nrows=nrows)
		@@ -90,26 +91,25 @@ def load_data_from_df(allocation_df, node_df, step_df, **kwargs):
		#gpu_power = gpu_energy / wall_time
		gpu_power_array = np.array([gpu_power] * samples)

		#GPU power can be 0:
		gpu_min_power = nodes_required * POWER_GPU_IDLE
		gpu_max_power = nodes_required * POWER_GPU_MAX
		gpu_util = power_to_utilization(gpu_power_array, gpu_min_power, gpu_max_power)
		gpu_trace = gpu_util * GPUS_PER_NODE

		# Compute CPU power (assuming total energy minus gpu_energy is cpu_energy)
		total_energy = node_data['energy'].sum() # Joules
		cpu_energy = total_energy - gpu_energy

		cpu_usage = node_data['cpu_usage'].sum() / 1E9 / nodes_required # seconds
		cpu_power = cpu_energy / cpu_usage if cpu_usage > 0 else 0
		#cpu_power = cpu_energy / wall_time
		cpu_power -= nodes_required * (POWER_MEM + NICS_PER_NODE * POWER_NIC + POWER_NVME)
		cpu_power_array = np.array([cpu_power] * samples)

		cpu_min_power = nodes_required * POWER_CPU_IDLE
		cpu_max_power = nodes_required * POWER_CPU_MAX
		cpu_util = power_to_utilization(cpu_power_array, cpu_min_power, cpu_max_power)
		# Utilization is defined in the range of [0 to 1].
		# gpu_util will be negative if power reports 0, which is smaller than POWER_GPU_IDLE
		# Therefore: gpu_util should be set to zero if it is smaller than 0.
		gpu_trace = max(0,gpu_util) * GPUS_PER_NODE

		# Compute CPU utilization from CPU usage time
		# Only node power and GPU power are reported!
		cpu_usage = node_data['cpu_usage'].sum() / 1E9 / nodes_required / CORES_PER_CPU # seconds
		cpu_usage_array = np.array([cpu_usage] * samples)
		cpu_util = cpu_usage_array / wall_time
		cpu_trace = cpu_util * CPUS_PER_NODE

		# TODO use total energy for validation
		# total_energy = node_data['energy'].sum() # Joules

		if reschedule: # Let the scheduler reschedule the jobs
		scheduled_nodes = None
		time_offset = next_arrival()

raps/utils.py

+1 −1

Original line number	Diff line number	Diff line
		@@ -216,7 +216,7 @@ def power_to_utilization(power, pmin, pmax):
		float
		Utilization value.
		"""
		return (np.maximum(pmin,power) - pmin) / (np.minimum(power,pmax) - pmin)
		return (power - pmin) / (pmax - pmin)


		def create_binary_array_numpy(max_time, trace_quanta, util):