Commit ece07730 authored by Brewer, Wes's avatar Brewer, Wes
Browse files

Debug some issues in aurora dataloader

parent 2c8cdc9a
Loading
Loading
Loading
Loading
+31 −34
Original line number Diff line number Diff line
@@ -29,12 +29,11 @@ def load_data(local_dataset_path, **kwargs):
        "WALLTIME_SECONDS", "RUNTIME_SECONDS", "USERNAME_GENID", "LOCATION"
    ]

    for chunk in pd.read_csv(filepath, chunksize=chunksize, on_bad_lines='warn'):
    for chunk in pd.read_csv(filepath, chunksize=chunksize, on_bad_lines='warn', nrows=100):
        # Drop rows where essential timestamp data is missing
        chunk.dropna(subset=['QUEUED_TIMESTAMP', 'START_TIMESTAMP', 'END_TIMESTAMP'], inplace=True)

        for _, row in chunk.iterrows():
            try:
            submit_time = int(pd.to_datetime(row["QUEUED_TIMESTAMP"]).timestamp())
            start_time = int(pd.to_datetime(row["START_TIMESTAMP"]).timestamp())
            end_time = int(pd.to_datetime(row["END_TIMESTAMP"]).timestamp())
@@ -50,7 +49,8 @@ def load_data(local_dataset_path, **kwargs):
                nodes_required=int(row.get("NODES_REQUESTED", 0)),
                cpu_cores_required=int(row.get("CORES_REQUESTED", 0)),
                account=str(row.get("USERNAME_GENID", "N/A")),
                    scheduled_nodes=str(row.get("LOCATION", "")).split(','),
                #scheduled_nodes=str(row.get("LOCATION", "")).split(','),
                scheduled_nodes=[], #str(row.get("LOCATION", "")),
                # The following are placeholders as they are not in the CSV
                gpu_trace=0,
                cpu_trace=0,
@@ -65,9 +65,6 @@ def load_data(local_dataset_path, **kwargs):
                trace_quanta=1,
            )
            jobs.append(Job(job))
            except (ValueError, TypeError) as e:
                print(f"Skipping row due to parsing error: {e}. Row: {row}")
                continue

    if not jobs:
        return WorkloadData(jobs=[], telemetry_start=0, telemetry_end=0, start_date=datetime.now(timezone.utc))