Loading raps/dataloaders/aurora.py +31 −34 Original line number Diff line number Diff line Loading @@ -29,12 +29,11 @@ def load_data(local_dataset_path, **kwargs): "WALLTIME_SECONDS", "RUNTIME_SECONDS", "USERNAME_GENID", "LOCATION" ] for chunk in pd.read_csv(filepath, chunksize=chunksize, on_bad_lines='warn'): for chunk in pd.read_csv(filepath, chunksize=chunksize, on_bad_lines='warn', nrows=100): # Drop rows where essential timestamp data is missing chunk.dropna(subset=['QUEUED_TIMESTAMP', 'START_TIMESTAMP', 'END_TIMESTAMP'], inplace=True) for _, row in chunk.iterrows(): try: submit_time = int(pd.to_datetime(row["QUEUED_TIMESTAMP"]).timestamp()) start_time = int(pd.to_datetime(row["START_TIMESTAMP"]).timestamp()) end_time = int(pd.to_datetime(row["END_TIMESTAMP"]).timestamp()) Loading @@ -50,7 +49,8 @@ def load_data(local_dataset_path, **kwargs): nodes_required=int(row.get("NODES_REQUESTED", 0)), cpu_cores_required=int(row.get("CORES_REQUESTED", 0)), account=str(row.get("USERNAME_GENID", "N/A")), scheduled_nodes=str(row.get("LOCATION", "")).split(','), #scheduled_nodes=str(row.get("LOCATION", "")).split(','), scheduled_nodes=[], #str(row.get("LOCATION", "")), # The following are placeholders as they are not in the CSV gpu_trace=0, cpu_trace=0, Loading @@ -65,9 +65,6 @@ def load_data(local_dataset_path, **kwargs): trace_quanta=1, ) jobs.append(Job(job)) except (ValueError, TypeError) as e: print(f"Skipping row due to parsing error: {e}. Row: {row}") continue if not jobs: return WorkloadData(jobs=[], telemetry_start=0, telemetry_end=0, start_date=datetime.now(timezone.utc)) Loading Loading
raps/dataloaders/aurora.py +31 −34 Original line number Diff line number Diff line Loading @@ -29,12 +29,11 @@ def load_data(local_dataset_path, **kwargs): "WALLTIME_SECONDS", "RUNTIME_SECONDS", "USERNAME_GENID", "LOCATION" ] for chunk in pd.read_csv(filepath, chunksize=chunksize, on_bad_lines='warn'): for chunk in pd.read_csv(filepath, chunksize=chunksize, on_bad_lines='warn', nrows=100): # Drop rows where essential timestamp data is missing chunk.dropna(subset=['QUEUED_TIMESTAMP', 'START_TIMESTAMP', 'END_TIMESTAMP'], inplace=True) for _, row in chunk.iterrows(): try: submit_time = int(pd.to_datetime(row["QUEUED_TIMESTAMP"]).timestamp()) start_time = int(pd.to_datetime(row["START_TIMESTAMP"]).timestamp()) end_time = int(pd.to_datetime(row["END_TIMESTAMP"]).timestamp()) Loading @@ -50,7 +49,8 @@ def load_data(local_dataset_path, **kwargs): nodes_required=int(row.get("NODES_REQUESTED", 0)), cpu_cores_required=int(row.get("CORES_REQUESTED", 0)), account=str(row.get("USERNAME_GENID", "N/A")), scheduled_nodes=str(row.get("LOCATION", "")).split(','), #scheduled_nodes=str(row.get("LOCATION", "")).split(','), scheduled_nodes=[], #str(row.get("LOCATION", "")), # The following are placeholders as they are not in the CSV gpu_trace=0, cpu_trace=0, Loading @@ -65,9 +65,6 @@ def load_data(local_dataset_path, **kwargs): trace_quanta=1, ) jobs.append(Job(job)) except (ValueError, TypeError) as e: print(f"Skipping row due to parsing error: {e}. Row: {row}") continue if not jobs: return WorkloadData(jobs=[], telemetry_start=0, telemetry_end=0, start_date=datetime.now(timezone.utc)) Loading