Commit 6cc8eb7c authored by Brewer, Wes
Browse files

feat(scripts): add Lassen inter-job network congestion analysis and plot



analyze_lassen_congestion.py: offline replay of measured IB TX/RX counters
from the Lassen CSM dataset through the RAPS fat-tree simulation to detect
inter-job network interference. Sweeps hourly snapshots over a configurable
time window, computes max/mean link utilisation per snapshot, and outputs
per-snapshot and per-job CSVs flagging victim/bully jobs.

plot_congestion_timeline.py: 3-panel timeline figure comparing all-to-all
vs stencil-3d communication pattern assumptions, with a daily heatmap inset.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
parent 270368d2
Loading
Loading
Loading
Loading
+888 −0

File added.

Preview size limit exceeded, changes collapsed.

+178 −0
Original line number Diff line number Diff line
#!/usr/bin/env python3
"""Plot inter-job congestion timeline comparing all-to-all vs stencil-3d."""

import argparse
from pathlib import Path

import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.ticker as ticker
import numpy as np
import pandas as pd


def load(prefix):
    """Read the two CSV outputs that share the path prefix *prefix*.

    Args:
        prefix: Path prefix; ``_snapshots.csv`` and ``_jobs.csv`` are
            appended to it to form the two file names.

    Returns:
        A ``(snapshots, jobs)`` tuple of DataFrames. The ``timestamp``
        column of the snapshots frame is parsed as datetimes.
    """
    snapshot_path = f"{prefix}_snapshots.csv"
    job_path = f"{prefix}_jobs.csv"

    snapshot_frame = pd.read_csv(snapshot_path, parse_dates=["timestamp"])
    job_frame = pd.read_csv(job_path)
    return snapshot_frame, job_frame


def _hour_by_weekday(snaps, value_col="max_link_util"):
    """Return an hour-of-day × weekday table of the mean of *value_col*.

    Rows are always hours 0..23 (hours absent from the data are NaN), so a
    row's position equals its hour. Columns are the weekdays present in the
    data, ordered by first appearance in time and labelled with the weekday
    each column actually holds.

    Args:
        snaps: DataFrame with a datetime ``timestamp`` column and *value_col*.
        value_col: Name of the column to average.

    Returns:
        DataFrame indexed by hour with weekday-abbreviation column labels.
    """
    # pandas' day_of_week numbers Monday as 0 and Sunday as 6.
    weekday_abbr = ("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")

    frame = (
        pd.DataFrame({"timestamp": snaps["timestamp"], "value": snaps[value_col]})
        .dropna(subset=["timestamp"])
        .sort_values("timestamp")
    )
    frame["weekday"] = frame["timestamp"].dt.day_of_week
    frame["hour"] = frame["timestamp"].dt.hour

    table = frame.pivot_table(index="hour", columns="weekday",
                              values="value", aggfunc="mean")

    # pivot_table sorts its columns numerically (Mon..Sun); restore the
    # chronological order of the window so the heatmap reads left to right
    # in time, and derive each label from the weekday number itself.
    order = [int(d) for d in frame["weekday"].unique() if d in table.columns]
    table = table.reindex(index=range(24), columns=order)
    table.columns = [weekday_abbr[d] for d in order]
    return table


def main():
    """Render the three-panel congestion timeline and save it as an image.

    Command-line options:
        --a2a:     path prefix of the all-to-all run outputs.
        --stencil: path prefix of the stencil-3d run outputs.
        --output:  destination image path (parent directories are created).

    The figure has a max-link-utilisation panel (with peak annotations,
    reference lines and an hour × weekday heatmap inset for the all-to-all
    run), a mean-link-utilisation panel, and a panel with concurrent job
    count and node utilisation taken from the all-to-all snapshots.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--a2a",     default="results/lassen_a2a",     help="All-to-all output prefix")
    p.add_argument("--stencil", default="results/lassen_stencil", help="Stencil-3d output prefix")
    p.add_argument("--output",  default="results/congestion_timeline.png")
    args = p.parse_args()

    # Only the per-snapshot frames are plotted; the per-job frames are unused.
    a2a_s, _ = load(args.a2a)
    stencil_s, _ = load(args.stencil)
    if a2a_s.empty or stencil_s.empty:
        p.error("snapshot CSVs must contain at least one row")

    # Each run is drawn against its own timestamps so the curves and peak
    # markers stay correct even if the two snapshot files are not row-aligned.
    t = a2a_s["timestamp"]
    t_stencil = stencil_s["timestamp"]

    # ── colour palette ──────────────────────────────────────────────────
    C_A2A     = "#E05C4B"   # warm red   – all-to-all
    C_STENCIL = "#4B87D0"   # steel blue – stencil-3d
    C_JOBS    = "#6BAF6B"   # muted green
    C_THRESH  = "#999999"

    fig = plt.figure(figsize=(14, 9))
    fig.patch.set_facecolor("#F7F7F7")

    gs = fig.add_gridspec(
        3, 1,
        height_ratios=[3, 1.4, 1],
        hspace=0.08,
        left=0.07, right=0.97,
        top=0.91, bottom=0.09,
    )
    ax_max  = fig.add_subplot(gs[0])
    ax_mean = fig.add_subplot(gs[1], sharex=ax_max)
    ax_jobs = fig.add_subplot(gs[2], sharex=ax_max)

    for ax in (ax_max, ax_mean, ax_jobs):
        ax.set_facecolor("#F0F0F0")
        ax.grid(axis="y", color="white", linewidth=0.8, zorder=0)
        ax.grid(axis="x", color="white", linewidth=0.4, zorder=0)
        ax.spines[["top", "right"]].set_visible(False)

    # ── panel 1: max_link_util ──────────────────────────────────────────
    ax_max.fill_between(t, a2a_s["max_link_util"],
                        alpha=0.18, color=C_A2A, zorder=1)
    ax_max.fill_between(t_stencil, stencil_s["max_link_util"],
                        alpha=0.18, color=C_STENCIL, zorder=1)
    ax_max.plot(t, a2a_s["max_link_util"],
                color=C_A2A,     lw=1.5, label="All-to-all",  zorder=3)
    ax_max.plot(t_stencil, stencil_s["max_link_util"],
                color=C_STENCIL, lw=1.5, label="Stencil-3D",  zorder=3)

    # annotate overall peaks (label offset in points: below for a2a, above for stencil)
    for s, c, yoff in [(a2a_s, C_A2A, -18), (stencil_s, C_STENCIL, 8)]:
        idx = s["max_link_util"].idxmax()
        peak = s["max_link_util"][idx]
        ax_max.annotate(
            f"{peak:.1f}×",
            # x comes from the same frame as the peak value
            xy=(s["timestamp"][idx], peak),
            xytext=(0, yoff), textcoords="offset points",
            ha="center", fontsize=8.5, color=c, fontweight="bold",
            arrowprops=dict(arrowstyle="-", color=c, lw=0.8),
        )

    ax_max.set_ylabel("Max link utilisation\n(× link capacity)", fontsize=10)
    ax_max.legend(loc="upper left", fontsize=9, framealpha=0.85)
    ax_max.set_ylim(bottom=0)
    ax_max.yaxis.set_major_formatter(ticker.FormatStrFormatter("%.0f×"))

    # capacity reference lines, labelled at the last all-to-all timestamp
    for y, label in [(1, "1× (full)"), (10, "10×"), (50, "50×")]:
        ax_max.axhline(y, color=C_THRESH, lw=0.7, ls="--", zorder=2)
        ax_max.text(t.iloc[-1], y, f" {label}", va="center",
                    fontsize=7.5, color=C_THRESH)

    # ── panel 2: mean_link_util ─────────────────────────────────────────
    ax_mean.fill_between(t, a2a_s["mean_link_util"],
                         alpha=0.25, color=C_A2A, zorder=1)
    ax_mean.fill_between(t_stencil, stencil_s["mean_link_util"],
                         alpha=0.25, color=C_STENCIL, zorder=1)
    ax_mean.plot(t, a2a_s["mean_link_util"],
                 color=C_A2A,     lw=1.4, zorder=3)
    ax_mean.plot(t_stencil, stencil_s["mean_link_util"],
                 color=C_STENCIL, lw=1.4, zorder=3)
    ax_mean.set_ylabel("Mean link util\n(× capacity)", fontsize=10)
    ax_mean.yaxis.set_major_formatter(ticker.FormatStrFormatter("%.2f×"))
    ax_mean.set_ylim(bottom=0)

    # ── panel 3: concurrent jobs & node utilisation ─────────────────────
    ax_jobs.fill_between(t, a2a_s["n_sim_jobs"],
                         alpha=0.35, color=C_JOBS, step="pre", zorder=1)
    ax_jobs.step(t, a2a_s["n_sim_jobs"],
                 color=C_JOBS, lw=1.3, where="pre",
                 label="Active IB jobs", zorder=3)

    # Denominator for the node-utilisation percentage.
    # NOTE(review): presumably the machine's total node count — confirm.
    total_nodes = 4608

    ax_jobs_r = ax_jobs.twinx()
    ax_jobs_r.spines[["top"]].set_visible(False)
    ax_jobs_r.plot(t, a2a_s["n_nodes_busy"] / total_nodes * 100,
                   color="#888888", lw=1.0, ls=":", zorder=4,
                   label="Node utilisation %")
    ax_jobs_r.set_ylabel("Node util (%)", fontsize=9, color="#888888")
    ax_jobs_r.tick_params(axis="y", colors="#888888", labelsize=8)
    # Fixed 0–30 % range; anything above 30 % falls outside the axis.
    ax_jobs_r.set_ylim(0, 30)

    ax_jobs.set_ylabel("Concurrent jobs", fontsize=10)
    ax_jobs.set_ylim(bottom=0)
    # Merge the handles of both y-axes into a single legend.
    lines_j, labels_j = ax_jobs.get_legend_handles_labels()
    lines_r, labels_r = ax_jobs_r.get_legend_handles_labels()
    ax_jobs.legend(lines_j + lines_r, labels_j + labels_r,
                   loc="upper left", fontsize=8.5, framealpha=0.85)

    # ── shared x-axis formatting ────────────────────────────────────────
    ax_jobs.xaxis.set_major_locator(mdates.DayLocator())
    ax_jobs.xaxis.set_major_formatter(mdates.DateFormatter("%b %d"))
    ax_jobs.xaxis.set_minor_locator(mdates.HourLocator(byhour=[6, 12, 18]))
    plt.setp(ax_max.get_xticklabels(),  visible=False)
    plt.setp(ax_mean.get_xticklabels(), visible=False)
    ax_jobs.tick_params(axis="x", labelsize=9)

    # ── title + summary stats box ───────────────────────────────────────
    # NOTE(review): the date range in the title and the snapshot count, job
    # count and topology in the first caption line are hard-coded literals,
    # not derived from the loaded CSVs.
    fig.suptitle(
        "Lassen – Simulated Inter-Job Network Congestion  (Aug 22–28, 2019)",
        fontsize=13, fontweight="bold", y=0.97,
    )

    # Per-snapshot stencil/all-to-all ratio. A snapshot with zero all-to-all
    # utilisation yields ±inf, which would make the mean infinite, so those
    # snapshots are excluded from the average.
    ratio = (stencil_s["max_link_util"] / a2a_s["max_link_util"]).replace(
        [np.inf, -np.inf], np.nan
    )

    stats_text = (
        "168 hourly snapshots  |  5 181 jobs ≥ 2 nodes  |  Fat-tree k=32, IB EDR 100 Gbps\n"
        f"All-to-all:   peak {a2a_s['max_link_util'].max():.1f}×,  mean {a2a_s['max_link_util'].mean():.1f}×    "
        f"Stencil-3D:  peak {stencil_s['max_link_util'].max():.1f}×,  mean {stencil_s['max_link_util'].mean():.1f}×    "
        f"Ratio: {ratio.mean():.2f}× avg"
    )
    fig.text(0.5, 0.935, stats_text, ha="center", va="top",
             fontsize=8.2, color="#444444",
             bbox=dict(boxstyle="round,pad=0.4", fc="white", ec="#cccccc", alpha=0.9))

    # ── inset: daily heatmap ────────────────────────────────────────────
    ax_inset = ax_max.inset_axes([0.72, 0.08, 0.27, 0.50])
    ax_inset.set_facecolor("#E8E8E8")

    # Built on a private copy so the loaded frame is not mutated; rows are
    # hours 0..23, so the hour tick positions below line up with the rows.
    pivot = _hour_by_weekday(a2a_s, "max_link_util")

    im = ax_inset.imshow(pivot.values, aspect="auto", cmap="YlOrRd",
                         origin="lower", vmin=0)
    ax_inset.set_xticks(range(len(pivot.columns)))
    ax_inset.set_xticklabels(pivot.columns, fontsize=6.5)
    ax_inset.set_yticks([0, 6, 12, 18, 23])
    ax_inset.set_yticklabels(["00:00", "06:00", "12:00", "18:00", "23:00"], fontsize=6)
    ax_inset.set_title("A2A max util\nhour × day", fontsize=6.5, pad=3)
    cb = plt.colorbar(im, ax=ax_inset, fraction=0.046, pad=0.04)
    cb.ax.tick_params(labelsize=5.5)

    out = Path(args.output)
    out.parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(out, dpi=160, bbox_inches="tight")
    print(f"Saved → {out}")
    plt.close(fig)


# Run the plotting entry point only when executed as a script, not on import.
if __name__ == "__main__":
    main()