Unverified commit a3520e95 authored by Johannes Kirschbauer, committed by GitHub

Revert "ci/compare: Bring back nix stats comparison"

parent b22a35b0

ci/eval/compare/cmp-stats.py

deleted 100644 → 0
+0 −141
import json
import os
import sys
from pathlib import Path

import numpy as np
import pandas as pd
from scipy.stats import ttest_rel

# Define metrics of interest (can be expanded as needed)
METRIC_PREFIXES = ("nr", "gc")

def flatten_data(json_data: dict) -> dict:
    """
    Extracts and flattens metrics from JSON data.
    This is needed because the JSON data can be nested.
    For example, the JSON data entry might look like this:

    "gc":{"cycles":13,"heapSize":5404549120,"totalBytes":9545876464}

    Flattened:

    "gc.cycles": 13
    "gc.heapSize": 5404549120
    ...

    Args:
        json_data (dict): JSON data containing metrics.
    Returns:
        dict: Flattened metrics with keys as metric names.
    """
    flat_metrics = {}
    for k, v in json_data.items():
        if isinstance(v, (int, float)):
            flat_metrics[k] = v
        elif isinstance(v, dict):
            for sub_k, sub_v in v.items():
                flat_metrics[f"{k}.{sub_k}"] = sub_v
    return flat_metrics




def load_all_metrics(directory: Path) -> dict:
    """
    Loads all stats JSON files in the specified directory and extracts metrics.

    Args:
        directory (Path): Directory containing JSON files.
    Returns:
        dict: Dictionary with filenames as keys and extracted metrics as values.
    """
    metrics = {}
    for system_dir in directory.iterdir():
        assert system_dir.is_dir()

        for chunk_output in system_dir.iterdir():
            with chunk_output.open() as f:
                data = json.load(f)
            metrics[f"{system_dir.name}/{chunk_output.name}"] = flatten_data(data)

    return metrics

def dataframe_to_markdown(df: pd.DataFrame) -> str:
    markdown_lines = []

    # Header (get column names and format them)
    header = '\n| ' + ' | '.join(df.columns) + ' |'
    markdown_lines.append(header)
    markdown_lines.append("| - " * (len(df.columns)) + "|")  # Separator line

    # Iterate over rows to build Markdown rows
    for _, row in df.iterrows():
        # TODO: define threshold for highlighting
        highlight = False

        fmt = lambda x: f"**{x}**" if highlight else f"{x}"

        # Check for no change and NaN in p_value/t_stat
        row_values = []
        for val in row:
            if isinstance(val, float) and np.isnan(val):  # For NaN values in p-value or t-stat
                row_values.append("-")  # Custom symbol for NaN
            elif isinstance(val, float) and val == 0:  # For no change (mean_diff == 0)
                row_values.append("-")  # Custom symbol for no change
            else:
                row_values.append(fmt(f"{val:.4f}" if isinstance(val, float) else str(val)))

        markdown_lines.append('| ' + ' | '.join(row_values) + ' |')

    return '\n'.join(markdown_lines)
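# Rendering sketch with hypothetical data:
#   dataframe_to_markdown(pd.DataFrame([{"metric": "gc.cycles", "mean_diff": 1.5}]))
# returns (after the header's leading newline):
#   | metric | mean_diff |
#   | - | - |
#   | gc.cycles | 1.5000 |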


def perform_pairwise_tests(before_metrics: dict, after_metrics: dict) -> pd.DataFrame:
    common_files = sorted(set(before_metrics) & set(after_metrics))
    all_keys = sorted({ metric_keys for file_metrics in before_metrics.values() for metric_keys in file_metrics.keys() })

    results = []

    for key in all_keys:
        before_vals, after_vals = [], []

        for fname in common_files:
            if key in before_metrics[fname] and key in after_metrics[fname]:
                before_vals.append(before_metrics[fname][key])
                after_vals.append(after_metrics[fname][key])

        if len(before_vals) >= 2:
            before_arr = np.array(before_vals)
            after_arr = np.array(after_vals)

            diff = after_arr - before_arr
            pct_change = 100 * diff / before_arr
            t_stat, p_val = ttest_rel(after_arr, before_arr)

            results.append({
                "metric": key,
                "mean_before": np.mean(before_arr),
                "mean_after": np.mean(after_arr),
                "mean_diff": np.mean(diff),
                "mean_%_change": np.mean(pct_change),
                "p_value": p_val,
                "t_stat": t_stat
            })

    df = pd.DataFrame(results).sort_values("p_value")
    return df
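# Note: ttest_rel runs a paired t-test over the per-chunk values, so a metric is
# only reported when it appears in at least two chunks common to both runs;
# chunks present in only one of the runs are skipped by the pairing above.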


if __name__ == "__main__":
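    # Invocation sketch (paths are hypothetical; the surrounding compare
    # derivation sets these variables via `env`):
    #   BEFORE_DIR=./before AFTER_DIR=./after python3 cmp-stats.py >> step-summary.md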
    before_dir = os.environ.get("BEFORE_DIR")
    after_dir = os.environ.get("AFTER_DIR")

    if not before_dir or not after_dir:
        print("Error: Environment variables 'BEFORE_DIR' and 'AFTER_DIR' must be set.")
        exit(1)

    before_metrics = load_all_metrics(Path(before_dir) / "stats")
    after_metrics = load_all_metrics(Path(after_dir) / "stats")

    df1 = perform_pairwise_tests(before_metrics, after_metrics)
    markdown_table = dataframe_to_markdown(df1)
    print(markdown_table)
+4 −46
@@ -3,7 +3,6 @@
  jq,
  runCommand,
  writeText,
  python3,
  ...
}:
{
@@ -126,59 +125,18 @@ let
in
runCommand "compare"
  {
    nativeBuildInputs = [
      jq
      (python3.withPackages (
        ps: with ps; [
          numpy
          pandas
          scipy
        ]
      ))

    ];
    nativeBuildInputs = [ jq ];
    maintainers = builtins.toJSON maintainers;
    passAsFile = [ "maintainers" ];
    env = {
      BEFORE_DIR = "${beforeResultDir}";
      AFTER_DIR = "${afterResultDir}";
    };
  }
  ''
    mkdir $out

    cp ${changed-paths} $out/changed-paths.json


    if jq -e '(.attrdiff.added | length == 0) and (.attrdiff.removed | length == 0)' "${changed-paths}" > /dev/null; then
      # Package chunks are the same in both revisions
      # We can generate a performance comparison
      {
        echo
        echo "# Performance comparison"
        echo
        echo "This compares the performance of this branch against its pull request base branch (e.g., 'master')"
        echo
        echo "For further help please refer to: [ci/README.md](https://github.com/NixOS/nixpkgs/blob/master/ci/README.md)"
        echo
      } >> $out/step-summary.md

      python3 ${./cmp-stats.py} >> $out/step-summary.md

    else
      # Chunks have changed between revisions
      # We cannot generate a performance comparison
      {
        echo
        echo "# Performance Comparison"
        echo
        echo "Performance stats were skipped because the package sets differ between the two revisions."
        echo
        echo "For further help please refer to: [ci/README.md](https://github.com/NixOS/nixpkgs/blob/master/ci/README.md)"
      } >> $out/step-summary.md
    fi

    jq -r -f ${./generate-step-summary.jq} < ${changed-paths} >> $out/step-summary.md
    jq -r -f ${./generate-step-summary.jq} < ${changed-paths} > $out/step-summary.md

    cp "$maintainersPath" "$out/maintainers.json"

    # TODO: Compare eval stats
  ''
+0 −2
@@ -9,7 +9,6 @@
  nixVersions,
  jq,
  sta,
  python3,
}:

let
@@ -271,7 +270,6 @@ let
      runCommand
      writeText
      supportedSystems
      python3
      ;
  };