Commit 90fe8628 authored by Wes Brewer, committed by Brewer, Wes

Work towards support for reading philly traces that are already parsed into daily files
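The loader now prefers the preprocessed per-day CSVs under dist/gpu_by_day/ over scanning the monolithic cluster_gpu_util file. A minimal sketch of the intended call pattern (paths and dates are illustrative, not taken from the repository):

    import os
    from datetime import datetime

    # Hypothetical layout produced by the preprocessing step:
    #   <trace_dir>/dist/gpu_by_day/2017-10-03.csv
    #   <trace_dir>/dist/gpu_by_day/2017-10-04.csv
    gpu_dir = os.path.join("/path/to/philly-traces", "dist", "gpu_by_day")
    gpu_trace = load_gpu_traces_by_day(gpu_dir, datetime(2017, 10, 3), datetime(2017, 10, 4))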

parent de2aba18
+120 −48
import os
import glob
import json
import csv
import pandas as pd
import warnings

from datetime import datetime, timezone, timedelta
from tqdm import tqdm
from raps.job import job_dict, Job
from raps.utils import WorkloadData

DATE_FORMAT_STR = "%Y-%m-%d %H:%M:%S"
DEFAULT_START = "2017-10-03T00:00"
DEFAULT_END = "2017-10-04T00:00"

def to_epoch(ts_str):
    if ts_str is None:
        return None
    if isinstance(ts_str, (int, float)):
        return int(ts_str)
    if "T" in ts_str:
        dt = datetime.fromisoformat(ts_str)
    else:
        dt = datetime.strptime(ts_str, DATE_FORMAT_STR)
    return int(dt.timestamp())
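# Illustrative behaviour (values hypothetical; naive datetimes are converted
# to epoch seconds using the local timezone):
#   to_epoch("2017-10-03T00:00")     # ISO form, parsed via fromisoformat()
#   to_epoch("2017-10-03 00:00:00")  # legacy form, parsed via DATE_FORMAT_STR
#   to_epoch(1507014000)             # already epoch seconds -> returned as int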

def parse_timestamp(val):
    """
    Convert Philly job log timestamps to datetime.
    Handles integers (epoch) and strings with PST/PDT.
    Returns datetime or None.
    """
    if val is None or val == "None":
        return None
    if isinstance(val, (int, float)):
        return datetime.fromtimestamp(int(val), tz=timezone.utc).replace(tzinfo=None)
    if isinstance(val, str):
        val = val.replace(" PST", "").replace(" PDT", "")
        try:
            return datetime.strptime(val, DATE_FORMAT_STR).replace(tzinfo=None)
        except ValueError:
            return None
    return None
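# Illustrative usage (sample values, not from the trace):
#   parse_timestamp("2017-10-03 06:15:32 PDT")  -> datetime(2017, 10, 3, 6, 15, 32)
#   parse_timestamp(1507014932)                 -> tz-naive UTC datetime
#   parse_timestamp("None")                     -> None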

def load_gpu_traces_by_dayXX(gpu_trace_dir, machine_ids, job_start_dt, job_end_dt):
    """
    Load GPU utilization for specific machines and time range,
    using preprocessed per-day CSVs (gpu_by_day/).
    """
    dfs = []
    current = job_start_dt.date()
    while current <= job_end_dt.date():
        day_file = os.path.join(gpu_trace_dir, f"{current}.csv")
        if os.path.exists(day_file):
            df = pd.read_csv(
                day_file,
                names=["time", "machineId", "gpu_util"],
                parse_dates=["time"],
                on_bad_lines="skip"
            )
            df = df[df["machineId"].isin(machine_ids)]
            df = df[(df["time"] >= job_start_dt) & (df["time"] <= job_end_dt)]
            if not df.empty:
                dfs.append(df)
        current += timedelta(days=1)

    if dfs:
        return pd.concat(dfs, ignore_index=True)
    return pd.DataFrame(columns=["time", "machineId", "gpu_util"])


def load_gpu_traces_by_day(gpu_dir, start_dt, end_dt):
    """Load GPU traces only for the days between start_dt and end_dt.

    gpu_dir is the preprocessed per-day directory (<trace_dir>/dist/gpu_by_day);
    callers build this path themselves, so it is not re-joined here.
    """
    frames = []

    current = start_dt.date()
    while current <= end_dt.date():
        daily_file = os.path.join(gpu_dir, f"{current}.csv")
        if os.path.exists(daily_file):
            df = pd.read_csv(
                daily_file,
                names=["time", "machineId", "gpu_util"],  # no header in daily CSVs
                parse_dates=["time"]
            )
            frames.append(df)
        else:
            print(f"⚠ No trace file for {current}")
        current += timedelta(days=1)

    if not frames:
        return pd.DataFrame(columns=["time", "machineId", "gpu_util"])

    return pd.concat(frames, ignore_index=True)
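
# The per-day CSVs are assumed to be header-less, one reading per line,
# matching the names= list above, e.g. (illustrative row):
#   2017-10-03 00:01:00,m100,87.5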

def parse_date(s):
    if not s or s == "None":
@@ -28,8 +111,12 @@ def load_data(files, **kwargs):
    Returns:
        list[Job]
    """
    # extract --start from kwargs
    start_ts = to_epoch(kwargs.get("start", DEFAULT_START))
    end_ts = to_epoch(kwargs.get("end", DEFAULT_END))
    assert len(files) == 1, "Expecting a single directory path"
    trace_dir = files[0]
    gpu_trace_dir = os.path.join(files[0], "dist", "gpu_by_day")
    config = kwargs.get('config')
    gpus_per_node = config.get("GPUS_PER_NODE")
    if gpus_per_node is None:
@@ -62,46 +149,36 @@ def load_data(files, **kwargs):
    # --- 2. CPU util ---
    cpu_file = os.path.join(trace_dir, "cluster_cpu_util")
    cpu_util = pd.read_csv(cpu_file)
    # cpu_util has columns: time, machine_id, cpu_util
    cpu_util["time"] = cpu_util["time"].str.replace(" PST", "").str.replace(" PDT", "")
    #cpu_util["time"] = pd.to_datetime(cpu_util["time"], format="%Y-%m-%d %H:%M:%S")
    cpu_util["time"] = cpu_util["time"].apply(parse_date)
    # now cpu_util has: time (datetime), machine_id, cpu_util

    # --- 3. GPU util ---
    gpu_file = os.path.join(trace_dir, "cluster_gpu_util")

    with warnings.catch_warnings(record=True) as wlist:
        gpu_util = pd.read_csv(
            gpu_file,
            engine="python",
            on_bad_lines="warn"  # "warn" skips bad lines but emits ParserWarning, so wlist gets populated
        )
        if wlist:
            warnings.warn(
                f"cluster_gpu_util: skipped {len(wlist)} malformed lines while reading {gpu_file}",
                UserWarning
            )

    # Convert time to datetime
    gpu_util["time"] = pd.to_datetime(gpu_util["time"], errors="coerce").dt.tz_localize(None)

    # Identify GPU columns
    gpu_cols = [c for c in gpu_util.columns if c.startswith("gpu")]
    # Requested window as tz-naive datetimes (the trace timestamps are tz-naive)
    start_dt = datetime.fromtimestamp(start_ts, tz=timezone.utc).replace(tzinfo=None)
    end_dt   = datetime.fromtimestamp(end_ts, tz=timezone.utc).replace(tzinfo=None)

    # Collapse per row: sum all GPU utilizations and divide by 100
    gpu_util["gpu_util"] = gpu_util[gpu_cols].sum(axis=1) / 100.0
    gpu_trace_dir = os.path.join(trace_dir, "dist", "gpu_by_day")

    # Keep only collapsed util plus metadata
    gpu_util = gpu_util[["time", "machineId", "gpu_util"]]

    #print("Sample GPU util after preprocess:", gpu_util.head())
    gpu_trace = load_gpu_traces_by_day(gpu_trace_dir, start_dt, end_dt)

    # --- 4. Job log ---
    job_file = os.path.join(trace_dir, "cluster_job_log")
    with open(job_file) as f:
        job_log = json.load(f)

    # Filter job_log to only jobs matching the partition's gpus_per_node
    if gpus_per_node is not None:
        filtered_log = []
        for raw in job_log:
            attempts = raw.get("attempts", [])
            if attempts and "detail" in attempts[0]:
                # Count GPUs from the first detail
                gpus = sum(len(detail.get("gpus", [])) for detail in attempts[0]["detail"])
                if gpus > 0 and (gpus % gpus_per_node == 0):
                    filtered_log.append(raw)
        job_log = filtered_log
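    # e.g. with GPUS_PER_NODE = 4 (hypothetical value), an 8-GPU first attempt
    # is kept (8 % 4 == 0) while a 2-GPU attempt is dropped, since it cannot
    # correspond to whole nodes on this partition.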

    # --- First pass: find earliest submit time ---
    start_ts = None
    for raw in job_log:
@@ -176,25 +253,19 @@ def load_data(files, **kwargs):
            if len(machine_ids) > 1:
                job_cpu = job_cpu.groupby("time")["cpu_util"].mean().reset_index()

        #print("Job", jobid)
        #print("machine_ids from job:", machine_ids[:5])
        #print("gpu_util machineId sample:", gpu_util["machineId"].unique()[:5])
        #print("start, end:", start, end)
        #print("gpu_util time range:", gpu_util["time"].min(), gpu_util["time"].max())
        # --- absolute datetimes (used for filtering traces) ---
        submitted_dt = parse_timestamp(raw.get("submitted_time"))

        # GPU utilization traces
        mask = (
            (gpu_trace["machineId"].isin(machine_ids)) &
            (gpu_trace["time"] >= start_dt) &
            (gpu_trace["time"] <= end_dt)
        )
        job_gpu = gpu_trace.loc[mask].copy()

        # Aggregate across machines if >1 machine
        if len(machine_ids) > 1:
            job_gpu = job_gpu.groupby("time")["gpu_util"].sum().reset_index()

        print(f"  job_gpu shape after filtering: {job_gpu.shape}")
        if job_gpu.empty:
            print("  ⚠ No GPU rows matched this job")

        if machine_ids:
            # Shift times relative to start_ts
@@ -229,7 +300,7 @@ def load_data(files, **kwargs):
                    end_state=status,
                    scheduled_nodes=scheduled_nodes,

                    cpu_trace=0,
                    gpu_trace=job_gpu,
                    ntx_trace=None,
                    nrx_trace=None,
@@ -252,7 +323,8 @@ def load_data(files, **kwargs):
            print(job)

    # Find max end timestamp across jobs
    #end_ts = max(j.end_time for j in jobs_list if j.end_time is not None)
    end_ts = 3600

    return WorkloadData(
        jobs=jobs_list,