Commit 38895605 authored by Brewer, Wes's avatar Brewer, Wes
Browse files

Move args.scale implementation from within dataloader to main.py

parent 2c4a714a
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -25,7 +25,7 @@ parser.add_argument('-p', '--plot', nargs='+', choices=['power', 'loss', 'pue',
                    help='Specify one or more types of plots to generate: power, loss, pue, util, temp')
choices = ['png', 'svg', 'jpg', 'pdf', 'eps']
parser.add_argument('--imtype', type=str, choices=choices, default=choices[0], help='Plot image type')
parser.add_argument('--scale', type=int, default=0, help='Scale telemetry to fit on target system/partition (currently only supported for marconi100 data)')
parser.add_argument('--scale', type=int, default=0, help='Scale telemetry to max nodes specified in order to run telemetry on a smaller target system/partition, e.g., --scale 192')
parser.add_argument('--system', type=str, default='frontier', help='System config to use')
choices = [policy.value for policy in PolicyType]
parser.add_argument('-s', '--schedule', type=str, choices=choices, default=choices[0], help='Schedule policy to use')
+8 −1
Original line number Diff line number Diff line
@@ -86,11 +86,18 @@ if args.replay:
    if args.replay[0].endswith(".npz"):
        print(f"Loading {args.replay[0]}...")
        jobs = td.load_snapshot(args.replay[0])

        if args.scale:
            for job in tqdm(jobs, desc=f"Scaling jobs to {args.scale} nodes"):
                job['nodes_required'] = random.randint(1, args.scale)
                args.reschedule = True

        if args.reschedule:
            print("available nodes:", config['AVAILABLE_NODES'])
            for job in tqdm(jobs, desc="Updating requested_nodes"):
            for job in tqdm(jobs, desc="Rescheduling jobs"):
                job['requested_nodes'] = None
                job['submit_time'] = next_arrival(1 / config['JOB_ARRIVAL_TIME'])

    else:
        print(*args.replay)
        jobs = td.load_data(args.replay)
+6 −1
Original line number Diff line number Diff line
@@ -20,6 +20,11 @@ configs = [ConfigManager(system_name=partition).get_config() for partition in pa
args_dicts = [{**vars(args), 'config': config} for config in configs]

# Initialize Workload with all configurations
if args.replay:

    td = Telemetry(**args_dict)

else:
    wl = Workload(*configs)

# Generate jobs based on workload type
+0 −3
Original line number Diff line number Diff line
@@ -63,7 +63,6 @@ def load_data_from_df(jobs_df: pd.DataFrame, **kwargs):
    fastforward = kwargs.get('fastforward')
    validate = kwargs.get('validate')
    jid = kwargs.get('jid', '*')
    scale = kwargs.get('scale')

    if fastforward: print(f"fast-forwarding {fastforward} seconds")

@@ -150,8 +149,6 @@ def load_data_from_df(jobs_df: pd.DataFrame, **kwargs):
        else: # Prescribed replay
            scheduled_nodes = (jobs_df.loc[jidx, 'nodes']).tolist()
            
        if scale > 0: nodes_required = random.randint(1, scale)

        if gpu_trace.size > 0 and time_offset >= 0:
            job_info = job_dict(nodes_required, name, cpu_trace, gpu_trace, [], [], wall_time,
                                end_state, scheduled_nodes, time_offset, job_id, priority)