`ci/eval/compare/cmp-stats.py` (deleted, 100644 → 0, +0 −141) — the entire file is removed; its former contents:

```python
import json
import os
from scipy.stats import ttest_rel
import pandas as pd
import numpy as np
from pathlib import Path

# Define metrics of interest (can be expanded as needed)
METRIC_PREFIXES = ("nr", "gc")


def flatten_data(json_data: dict) -> dict:
    """
    Extracts and flattens metrics from JSON data. This is needed because the
    JSON data can be nested. For example, the JSON data entry might look like this:

        "gc":{"cycles":13,"heapSize":5404549120,"totalBytes":9545876464}

    Flattened:

        "gc.cycles": 13
        "gc.heapSize": 5404549120
        ...

    Args:
        json_data (dict): JSON data containing metrics.

    Returns:
        dict: Flattened metrics with keys as metric names.
    """
    flat_metrics = {}
    for k, v in json_data.items():
        if isinstance(v, (int, float)):
            flat_metrics[k] = v
        elif isinstance(v, dict):
            for sub_k, sub_v in v.items():
                flat_metrics[f"{k}.{sub_k}"] = sub_v
    return flat_metrics


def load_all_metrics(directory: Path) -> dict:
    """
    Loads all stats JSON files in the specified directory and extracts metrics.

    Args:
        directory (Path): Directory containing JSON files.

    Returns:
        dict: Dictionary with filenames as keys and extracted metrics as values.
    """
    metrics = {}
    for system_dir in directory.iterdir():
        assert system_dir.is_dir()
        for chunk_output in system_dir.iterdir():
            with chunk_output.open() as f:
                data = json.load(f)
                metrics[f"{system_dir.name}/${chunk_output.name}"] = flatten_data(data)
    return metrics


def dataframe_to_markdown(df: pd.DataFrame) -> str:
    markdown_lines = []

    # Header (get column names and format them)
    header = '\n| ' + ' | '.join(df.columns) + ' |'
    markdown_lines.append(header)
    markdown_lines.append("| - " * (len(df.columns)) + "|")  # Separator line

    # Iterate over rows to build Markdown rows
    for _, row in df.iterrows():
        # TODO: define threshold for highlighting
        highlight = False
        fmt = lambda x: f"**{x}**" if highlight else f"{x}"

        # Check for no change and NaN in p_value/t_stat
        row_values = []
        for val in row:
            if isinstance(val, float) and np.isnan(val):
                # For NaN values in p-value or t-stat
                row_values.append("-")  # Custom symbol for NaN
            elif isinstance(val, float) and val == 0:
                # For no change (mean_diff == 0)
                row_values.append("-")  # Custom symbol for no change
            else:
                row_values.append(fmt(f"{val:.4f}" if isinstance(val, float) else str(val)))

        markdown_lines.append('| ' + ' | '.join(row_values) + ' |')

    return '\n'.join(markdown_lines)


def perform_pairwise_tests(before_metrics: dict, after_metrics: dict) -> pd.DataFrame:
    common_files = sorted(set(before_metrics) & set(after_metrics))
    all_keys = sorted({
        metric_keys
        for file_metrics in before_metrics.values()
        for metric_keys in file_metrics.keys()
    })

    results = []
    for key in all_keys:
        before_vals, after_vals = [], []
        for fname in common_files:
            if key in before_metrics[fname] and key in after_metrics[fname]:
                before_vals.append(before_metrics[fname][key])
                after_vals.append(after_metrics[fname][key])

        if len(before_vals) >= 2:
            before_arr = np.array(before_vals)
            after_arr = np.array(after_vals)

            diff = after_arr - before_arr
            pct_change = 100 * diff / before_arr
            t_stat, p_val = ttest_rel(after_arr, before_arr)

            results.append({
                "metric": key,
                "mean_before": np.mean(before_arr),
                "mean_after": np.mean(after_arr),
                "mean_diff": np.mean(diff),
                "mean_%_change": np.mean(pct_change),
                "p_value": p_val,
                "t_stat": t_stat
            })

    df = pd.DataFrame(results).sort_values("p_value")
    return df


if __name__ == "__main__":
    before_dir = os.environ.get("BEFORE_DIR")
    after_dir = os.environ.get("AFTER_DIR")

    if not before_dir or not after_dir:
        print("Error: Environment variables 'BEFORE_DIR' and 'AFTER_DIR' must be set.")
        exit(1)

    before_metrics = load_all_metrics(Path(before_dir) / "stats")
    after_metrics = load_all_metrics(Path(after_dir) / "stats")

    df1 = perform_pairwise_tests(before_metrics, after_metrics)
    markdown_table = dataframe_to_markdown(df1)
    print(markdown_table)
```
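For context, the statistical core of the deleted script is `scipy.stats.ttest_rel`, a paired t-test over per-chunk eval stats. Below is a minimal sketch of that step in isolation; the sample values are made up for illustration and stand in for one flattened metric (e.g. `gc.totalBytes`) measured on the same chunks before and after a change:

```python
import numpy as np
from scipy.stats import ttest_rel

# Hypothetical paired measurements of one metric, taken on the same
# eval chunks before and after a change (values invented for the example).
before = np.array([9.54e9, 9.61e9, 9.48e9, 9.58e9])
after = np.array([9.50e9, 9.55e9, 9.47e9, 9.52e9])

diff = after - before
pct_change = 100 * diff / before

# Paired t-test: is the mean per-chunk difference distinguishable from zero?
t_stat, p_val = ttest_rel(after, before)

print(f"mean diff: {diff.mean():.3e}")
print(f"mean % change: {pct_change.mean():.3f}")
print(f"t = {t_stat:.3f}, p = {p_val:.4f}")
```

The pairing matters: each chunk is compared against itself across the two revisions, so chunk-to-chunk variance does not drown out a small systematic shift.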
`ci/eval/compare/default.nix` (+4 −46):

```diff
@@ -3,7 +3,6 @@
   jq,
   runCommand,
   writeText,
-  python3,
   ...
 }:
 {
@@ -126,59 +125,18 @@ let
 in
 runCommand "compare"
   {
-    nativeBuildInputs = [
-      jq
-      (python3.withPackages (
-        ps: with ps; [
-          numpy
-          pandas
-          scipy
-        ]
-      ))
-    ];
+    nativeBuildInputs = [ jq ];
     maintainers = builtins.toJSON maintainers;
     passAsFile = [ "maintainers" ];
-    env = {
-      BEFORE_DIR = "${beforeResultDir}";
-      AFTER_DIR = "${afterResultDir}";
-    };
   }
   ''
     mkdir $out
     cp ${changed-paths} $out/changed-paths.json

-    if jq -e '(.attrdiff.added | length == 0) and (.attrdiff.removed | length == 0)' "${changed-paths}" > /dev/null; then
-      # Chunks have changed between revisions
-      # We cannot generate a performance comparison
-      {
-        echo
-        echo "# Performance comparison"
-        echo
-        echo "This compares the performance of this branch against its pull request base branch (e.g., 'master')"
-        echo
-        echo "For further help please refer to: [ci/README.md](https://github.com/NixOS/nixpkgs/blob/master/ci/README.md)"
-        echo
-      } >> $out/step-summary.md
-
-      python3 ${./cmp-stats.py} >> $out/step-summary.md
-    else
-      # Package chunks are the same in both revisions
-      # We can use the to generate a performance comparison
-      {
-        echo
-        echo "# Performance Comparison"
-        echo
-        echo "Performance stats were skipped because the package sets differ between the two revisions."
-        echo
-        echo "For further help please refer to: [ci/README.md](https://github.com/NixOS/nixpkgs/blob/master/ci/README.md)"
-      } >> $out/step-summary.md
-    fi
-
-    jq -r -f ${./generate-step-summary.jq} < ${changed-paths} >> $out/step-summary.md
+    jq -r -f ${./generate-step-summary.jq} < ${changed-paths} > $out/step-summary.md

     cp "$maintainersPath" "$out/maintainers.json"

     # TODO: Compare eval stats
   ''
```
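The removed shell snippet gated the comparison on whether the evaluated attribute set changed between revisions. As a rough illustration only, here is a Python equivalent of that `jq` gate, assuming a `changed-paths.json` whose `attrdiff` object carries `added` and `removed` lists as referenced in the diff above (the file path in `__main__` is hypothetical):

```python
import json
from pathlib import Path


def package_sets_match(changed_paths_file: Path) -> bool:
    """Mirror of the removed jq gate:
    (.attrdiff.added | length == 0) and (.attrdiff.removed | length == 0)"""
    data = json.loads(changed_paths_file.read_text())
    attrdiff = data["attrdiff"]
    return len(attrdiff["added"]) == 0 and len(attrdiff["removed"]) == 0


if __name__ == "__main__":
    if package_sets_match(Path("changed-paths.json")):
        print("package sets identical: a paired stats comparison is meaningful")
    else:
        print("package sets differ: skip the performance comparison")
```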
`ci/eval/default.nix` (+0 −2):

```diff
@@ -9,7 +9,6 @@
   nixVersions,
   jq,
   sta,
-  python3,
 }:
 let
@@ -271,7 +270,6 @@ let
       runCommand
       writeText
       supportedSystems
-      python3
       ;
   };
```