Commit 82f348a2 authored by Maiterth, Matthias's avatar Maiterth, Matthias
Browse files

OCIZettascale10: rough sketch of what ExaDigit can model.

This is a rough estimate according to
https://www.hpcwire.com/off-the-wire/oracle-unveils-next-gen-oracle-cloud-infrastructure-zettascale10-cluster-for-ai/
Getting the overall numbers in the ballpark, while
the Kyber + Kyber Side Car is not currently directly modelled.
Grains of salt included. The ballpark is ok.

Run with:
python main.py run --system OCIZettascale10 --workload randomAI --continuous-job-generation
parent 0e40c7ea
Loading
Loading
Loading
Loading
+59 −0
Original line number Diff line number Diff line
# OCIZettascale10: rough ExaDigit sketch of Oracle's announced OCI
# Zettascale10 AI cluster. All figures are ballpark estimates from press
# coverage; the Kyber + Kyber side-car design is not directly modelled.
system:
  num_cdus: 2778  # sized toward the announced "800,000 Vera Rubin total"; 2778 CDUs * 3 racks * 72 nodes = 600,048 nodes -- TODO confirm the intended total
  racks_per_cdu: 3
  nodes_per_rack: 72   # ~600kW rack, NV72-like -- assumption, verify
  chassis_per_rack: 1
  nodes_per_blade: 1
  switches_per_chassis: 72  # chassis concept is Cray-style; mapped here to NV72? -- assumption, verify
  nics_per_node: 1  # most likely 4 in the real system; modelled as 1 here
  rectifiers_per_chassis: 1  # placeholder: rectifier power/losses are zeroed below because real values are unknown
  nodes_per_rectifier: 1  # placeholder: rectifier power/losses are zeroed below because real values are unknown
  #missing_racks:  # intentionally unset: no racks excluded from the model
  down_nodes: []  # no nodes marked down at simulation start
  cpus_per_node: 1
  gpus_per_node: 4  # assumes 4 GPU chiplets per node -- TODO confirm
  cpu_peak_flops: 2048000000000.0  # ~2 TFLOP/s; insignificant next to GPU peak
  gpu_peak_flops: 15000000000000000000.0  # 15 EFLOP/s FP4 -- per-GPU vs per-node scope unclear, verify against total system target
  cpu_fp_ratio: 0.667
  gpu_fp_ratio: 0.667
power:
  # Per-component power figures (watts assumed unless noted). Rectifier and
  # SIVOC losses are deliberately zeroed out: real values are unknown.
  power_gpu_idle:  200 # per node: 4 GPUs * 50 W idle
  power_gpu_max:  2200  # ~2.2 kW per node; NOTE(review): the stated math 4*525 gives 2100, not 2200 -- confirm intended value
  power_cpu_idle: 90
  power_cpu_max: 280
  power_mem: 74.26
  power_nic: 20
  power_nvme: 30
  power_switch: 250
  power_cdu: 8473.47
  power_update_freq: 15  # presumably seconds between power samples -- verify units
  rectifier_peak_threshold: 13670
  sivoc_loss_constant: 0  # zero: SIVOC losses unknown, modelled as lossless
  sivoc_efficiency: 1.00  # lossless (see note above)
  rectifier_loss_constant: 0  # zero: rectifier losses unknown, modelled as lossless
  rectifier_efficiency: 1.00  # lossless (see note above)
  power_cost: 0.094  # presumably $/kWh -- verify units
scheduler:
  job_arrival_time: 1
  mtbf: 11  # mean time between failures -- units not stated here, verify
  trace_quanta: 15
  min_wall_time: 3600  # 1 hour (seconds)
  max_wall_time: 43200  # 12 hours (seconds)
  ui_update_freq: 900
  max_nodes_per_job: 9000
  job_end_probs:  # probabilities of job end states (values below sum to 1.0)
    COMPLETED: 0.63
    FAILED: 0.13
    CANCELLED: 0.12
    TIMEOUT: 0.11
    NODE_FAIL: 0.01
uq:
  # Relative uncertainties applied to the corresponding power readings for
  # uncertainty quantification.
  power_gpu_uncertainty: 0.05
  power_cpu_uncertainty: 0.05
  power_mem_uncertainty: 0.05
  power_nic_uncertainty: 0.05
  power_nvme_uncertainty: 0.05
  power_cdus_uncertainty: 0.05
  power_node_uncertainty: 0.002
  power_switch_uncertainty: 0.05
  rectifier_power_uncertainty: 0.05
+2 −1
Original line number Diff line number Diff line
@@ -134,7 +134,8 @@ class SimConfig(RAPSBaseModel, abc.ABC):
    """ Grab data from live system. """

    # Workload arguments (TODO split into separate model)
    workload: Literal['random', 'benchmark', 'peak', 'idle', 'synthetic', 'multitenant', 'replay'] = "random"
    workload: Literal['random', 'benchmark', 'peak', 'idle',
                      'synthetic', 'multitenant', 'replay', 'randomAI'] = "random"

    """ Type of synthetic workload """
    multimodal: list[float] = [1.0]
+10 −7
Original line number Diff line number Diff line
@@ -11,6 +11,7 @@ from raps.utils import (

from .constants import JOB_NAMES, ACCT_NAMES, MAX_PRIORITY


class BasicWorkload:

    # Test for random 'reasonable' AI jobs
@@ -19,17 +20,17 @@ class BasicWorkload:
        jobs = []
        for i in range(args.numjobs):
            draw = random.randint(0, 10)
            if draw == 0:
            if draw != 0:
                et = random.randint(7200, 28800)
                nr = random.choice([128, 256, 512, 1024, 1280, 1792, 2048])
                new_job = Job(job_dict(nodes_required=nr,
                                       name="LLM",
                                       name="LLM Production",
                                       account="llmUser",
                                       end_state="Success",
                                       id=random.randint(1, 99999),
                                       cpu_trace=0.1,
                                       gpu_trace=(random.uniform(0.55, 0.8) *
                                                  self.config_map[self.args.system]['GPUS_PER_NODE']),
                                       gpu_trace=(random.uniform(0.55, 0.8)
                                                  * self.config_map[self.args.system]['GPUS_PER_NODE']),
                                       ntx_trace=None,
                                       nrx_trace=None,
                                       submit_time=0,
@@ -38,8 +39,10 @@ class BasicWorkload:
                                       end_time=et,
                                       expected_run_time=et))
            else:
                new_job = Job(job_dict(nodes_required=1,
                                       name="LLM",
                et = random.randint(300, 7200)
                nr = random.choice([1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 128])
                new_job = Job(job_dict(nodes_required=nr,
                                       name="User-Test LLM",
                                       account="llmUser",
                                       end_state="Success",
                                       id=random.randint(1, 99999),
@@ -50,7 +53,7 @@ class BasicWorkload:
                                       submit_time=0,
                                       time_limit=43200,
                                       start_time=0,
                                       end_time=7200,
                                       end_time=et,
                                       expected_run_time=random.randint(60, 7200)))
            jobs.append(new_job)
        return jobs
+1 −1
Original line number Diff line number Diff line
def continuous_job_generation(self, *, engine, timestep, jobs):
def continuous_job_generation(*, engine, timestep, jobs):
    # print("if len(engine.queue) <= engine.continuous_workload.args.maxqueue:")
    # print(f"if {len(engine.queue)} <= {engine.continuous_workload.args.maxqueue}:")
    if len(engine.queue) <= engine.continuous_workload.args.maxqueue: