Commit ece07730 authored by Brewer, Wes's avatar Brewer, Wes
Browse files

Debug some issues in aurora dataloader

parent 2c8cdc9a
Loading
Loading
Loading
Loading
+31 −34
Original line number Diff line number Diff line
@@ -29,12 +29,11 @@ def load_data(local_dataset_path, **kwargs):
        "WALLTIME_SECONDS", "RUNTIME_SECONDS", "USERNAME_GENID", "LOCATION"
    ]

    for chunk in pd.read_csv(filepath, chunksize=chunksize, on_bad_lines='warn'):
    for chunk in pd.read_csv(filepath, chunksize=chunksize, on_bad_lines='warn', nrows=100):
        # Drop rows where essential timestamp data is missing
        chunk.dropna(subset=['QUEUED_TIMESTAMP', 'START_TIMESTAMP', 'END_TIMESTAMP'], inplace=True)

        for _, row in chunk.iterrows():
            try:
            submit_time = int(pd.to_datetime(row["QUEUED_TIMESTAMP"]).timestamp())
            start_time = int(pd.to_datetime(row["START_TIMESTAMP"]).timestamp())
            end_time = int(pd.to_datetime(row["END_TIMESTAMP"]).timestamp())
@@ -50,7 +49,8 @@ def load_data(local_dataset_path, **kwargs):
                nodes_required=int(row.get("NODES_REQUESTED", 0)),
                cpu_cores_required=int(row.get("CORES_REQUESTED", 0)),
                account=str(row.get("USERNAME_GENID", "N/A")),
                    scheduled_nodes=str(row.get("LOCATION", "")).split(','),
                #scheduled_nodes=str(row.get("LOCATION", "")).split(','),
                scheduled_nodes=[], #str(row.get("LOCATION", "")),
                # The following are placeholders as they are not in the CSV
                gpu_trace=0,
                cpu_trace=0,
@@ -65,9 +65,6 @@ def load_data(local_dataset_path, **kwargs):
                trace_quanta=1,
            )
            jobs.append(Job(job))
            except (ValueError, TypeError) as e:
                print(f"Skipping row due to parsing error: {e}. Row: {row}")
                continue

    if not jobs:
        return WorkloadData(jobs=[], telemetry_start=0, telemetry_end=0, start_date=datetime.now(timezone.utc))