Unverified Commit d1376cc1 authored by Philip Taron, committed by GitHub

ci.eval.compare: extend the performance comparison script (#443620)

parents 75f3ca34 d80d4a77
+259 −92
import argparse
import json
import numpy as np
import pandas as pd
import warnings

from dataclasses import asdict, dataclass
from pathlib import Path
from scipy.stats import ttest_rel
from tabulate import tabulate
from typing import Final

# Define metrics of interest (can be expanded as needed)
METRIC_PREFIXES = ("nr", "gc")

def flatten_data(json_data: dict) -> dict:
    """
    Flattens the nested stats JSON into dot-separated metric names, e.g.

    "gc.heapSize": 5404549120
    ...

    See https://github.com/NixOS/nix/blob/187520ce88c47e2859064704f9320a2d6c97e56e/src/libexpr/eval.cc#L2846
    for the ultimate source of this data.

    Args:
        json_data (dict): JSON data containing metrics.
    Returns:
        dict: Flattened metrics with keys as metric names.
    """
    flat_metrics = {}
    for key, value in json_data.items():
        # This key is duplicated as `time.cpu`; we keep that copy.
        if key == "cpuTime":
            continue

        if isinstance(value, (int, float)):
            flat_metrics[key] = value
        elif isinstance(value, dict):
            for subkey, subvalue in value.items():
                assert isinstance(subvalue, (int, float)), subvalue
                flat_metrics[f"{key}.{subkey}"] = subvalue
        else:
            assert isinstance(value, (float, int, dict)), (
                f"Value `{value}` has unexpected type"
            )

    return flat_metrics
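
# A minimal sketch of the flattening, with hypothetical input values:
#
#     flatten_data({"cpuTime": 1.5, "nrThunks": 42, "time": {"cpu": 1.5, "gc": 0.2}})
#     == {"nrThunks": 42, "time.cpu": 1.5, "time.gc": 0.2}
#
# The top-level "cpuTime" is skipped in favor of its duplicate "time.cpu".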


def load_all_metrics(path: Path) -> dict:
    """
    Loads all stats JSON files in the specified file or directory and extracts metrics.
    These stats JSON files are created by Nix when the `NIX_SHOW_STATS` environment variable is set.

    If the provided path is a directory, it must have the structure $path/$system/$stats,
    where $path is the provided path, $system is some system from `lib.systems.doubles.*`,
    and $stats is a stats JSON file.

    If the provided path is a file, it is a stats JSON file.

    Args:
        path (Path): Directory containing JSON files or a stats JSON file.

    Returns:
        dict: Dictionary with filenames as keys and extracted metrics as values.
    """
    metrics = {}
    if path.is_dir():
        for system_dir in path.iterdir():
            assert system_dir.is_dir()

            for chunk_output in system_dir.iterdir():
                with chunk_output.open() as f:
                    data = json.load(f)

                metrics[f"{system_dir.name}/{chunk_output.name}"] = flatten_data(data)
    else:
        with path.open() as f:
            metrics[path.name] = flatten_data(json.load(f))

    return metrics
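
# Illustrative directory layout for the directory form (file names are hypothetical):
#
#     stats/x86_64-linux/chunk-0.json
#     stats/aarch64-darwin/chunk-1.json
#
# which yields metric dicts keyed as "x86_64-linux/chunk-0.json", etc.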

def metric_table_name(name: str, explain: bool) -> str:
    """
    Returns the name of the metric, plus a footnote reference to explain it if needed.
    """
    return f"{name}[^{name}]" if explain else name
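
# For example, metric_table_name("nrThunks", explain=True) returns
# "nrThunks[^nrThunks]", linking the table cell to the footnote defined below.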


METRIC_EXPLANATION_FOOTNOTE: Final[str] = """

[^time.cpu]: Number of seconds of CPU time accounted by the OS to the Nix evaluator process. On UNIX systems, this comes from [`getrusage(RUSAGE_SELF)`](https://man7.org/linux/man-pages/man2/getrusage.2.html).
[^time.gc]: Number of seconds of CPU time spent performing garbage collection, as accounted by the Boehm garbage collector.
[^time.gcFraction]: The fraction of total CPU time spent performing GC.
[^gc.cycles]: Number of times garbage collection has been performed.
[^gc.heapSize]: Size in bytes of the garbage collector heap.
[^gc.totalBytes]: Size in bytes of all allocations in the garbage collector.
[^envs.bytes]: Size in bytes of all `Env` objects allocated by the Nix evaluator. An `Env` is an evaluation environment (scope), created e.g. for function calls and `let` expressions.
[^list.bytes]: Size in bytes of all [lists](https://nix.dev/manual/nix/stable/language/syntax.html#list-literal) allocated by the Nix evaluator.
[^sets.bytes]: Size in bytes of all [attrsets](https://nix.dev/manual/nix/stable/language/syntax.html#attrs-literal) allocated by the Nix evaluator.
[^symbols.bytes]: Size in bytes of all items in the Nix evaluator symbol table.
[^values.bytes]: Size in bytes of all values allocated by the Nix evaluator.
[^envs.number]: The count of all `Env` objects allocated.
[^nrAvoided]: The number of thunk allocations that were avoided.
[^nrExprs]: The number of expression objects ever created.
[^nrFunctionCalls]: The number of function calls ever made.
[^nrLookups]: The number of lookups into an attrset ever made.
[^nrOpUpdateValuesCopied]: The number of attrset values copied in the process of merging attrsets.
[^nrOpUpdates]: The number of attrset merge operations (`//`) performed.
[^nrPrimOpCalls]: The number of function calls to primops (Nix builtins) ever made.
[^nrThunks]: The number of [thunks](https://nix.dev/manual/nix/latest/language/evaluation.html#laziness) ever made. A thunk is a delayed computation, represented by an expression reference and a closure.
[^sets.number]: The number of attrsets ever made.
[^symbols.number]: The number of symbols ever added to the symbol table.
[^values.number]: The number of values ever made.
[^envs.elements]: The total number of values contained in all `Env` objects.
[^list.concats]: The number of list concatenation operations (`++`) performed.
[^list.elements]: The total number of values contained in all lists.
[^sets.elements]: The total number of attributes contained in all attrsets.
[^sizes.Attr]: Size in bytes of the `Attr` type.
[^sizes.Bindings]: Size in bytes of the `Bindings` type.
[^sizes.Env]: Size in bytes of the `Env` type.
[^sizes.Value]: Size in bytes of the `Value` type.
"""


@dataclass(frozen=True)
class PairwiseTestResults:
    updated: pd.DataFrame
    equivalent: pd.DataFrame

    @staticmethod
    def tabulate(table, headers) -> str:
        return tabulate(
            table, headers, tablefmt="github", floatfmt=".4f", missingval="-"
        )

    def updated_to_markdown(self, explain: bool) -> str:
        assert not self.updated.empty
        return self.tabulate(
            headers=[str(column) for column in self.updated.columns],
            table=[
                [
                    # The metric acts as its own footnote name
                    metric_table_name(row["metric"], explain),
                    # Check for no change and NaN in p_value/t_stat;
                    # both render as the missing value ("-")
                    *[
                        None if np.isnan(val) or np.allclose(val, 0) else val
                        for val in row[1:]
                    ],
                ]
                for _, row in self.updated.iterrows()
            ],
        )

    def equivalent_to_markdown(self, explain: bool) -> str:
        assert not self.equivalent.empty
        return self.tabulate(
            headers=[str(column) for column in self.equivalent.columns],
            table=[
                [
                    # The metric acts as its own footnote name
                    metric_table_name(row["metric"], explain),
                    row["value"],
                ]
                for _, row in self.equivalent.iterrows()
            ],
        )

    def to_markdown(self, explain: bool) -> str:
        result = ""

        if not self.equivalent.empty:
            result += "## Unchanged values\n\n"
            result += self.equivalent_to_markdown(explain)

        if not self.updated.empty:
            result += ("\n\n" if result else "") + "## Updated values\n\n"
            result += self.updated_to_markdown(explain)

        if explain:
            result += METRIC_EXPLANATION_FOOTNOTE

        return result
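
# The rendered report is GitHub-flavored markdown, roughly (values illustrative):
#
#     ## Unchanged values
#     | metric | value |
#     ...
#     ## Updated values
#     | metric | mean_before | mean_after | mean_diff | mean_pct_change | p_value | t_stat |
#     ...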


@dataclass(frozen=True)
class Equivalent:
    metric: str
    value: float


@dataclass(frozen=True)
class Comparison:
    metric: str
    mean_before: float
    mean_after: float
    mean_diff: float
    mean_pct_change: float


@dataclass(frozen=True)
class ComparisonWithPValue(Comparison):
    p_value: float
    t_stat: float


def metric_sort_key(name: str) -> tuple[int, str]:
    if name in ("time.cpu", "time.gc", "time.gcFraction"):
        return (1, name)
    elif name.startswith("gc"):
        return (2, name)
    elif name.endswith(("bytes", "Bytes")):
        return (3, name)
    elif name.startswith("nr") or name.endswith("number"):
        return (4, name)
    else:
        return (5, name)
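
# Illustrative ordering: "time.cpu" sorts before "gc.heapSize", which sorts
# before "list.bytes", then "nrThunks", then everything else (e.g. "sizes.Value").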


def perform_pairwise_tests(
    before_metrics: dict, after_metrics: dict
) -> PairwiseTestResults:
    common_files = sorted(set(before_metrics) & set(after_metrics))

    all_keys = sorted(
        {
            metric_key
            for file_metrics in before_metrics.values()
            for metric_key in file_metrics.keys()
        },
        key=metric_sort_key,
    )

    updated = []
    equivalent = []

    for key in all_keys:
        before_vals = []
        after_vals = []

        for fname in common_files:
            if key in before_metrics[fname] and key in after_metrics[fname]:
                before_vals.append(before_metrics[fname][key])
                after_vals.append(after_metrics[fname][key])

        if len(before_vals) == 0:
            continue

        before_arr = np.array(before_vals)
        after_arr = np.array(after_vals)

        diff = after_arr - before_arr

        # If there's no difference, add it all to the equivalent output.
        if np.allclose(diff, 0):
            equivalent.append(Equivalent(metric=key, value=before_vals[0]))
        else:
            pct_change = 100 * diff / before_arr
            result = Comparison(
                metric=key,
                mean_before=np.mean(before_arr),
                mean_after=np.mean(after_arr),
                mean_diff=np.mean(diff),
                mean_pct_change=np.mean(pct_change),
            )

            # If there are enough values to perform a paired t-test, do so.
            if len(before_vals) > 1:
                t_stat, p_val = ttest_rel(after_arr, before_arr)
                result = ComparisonWithPValue(
                    **asdict(result), p_value=p_val, t_stat=t_stat
                )

            updated.append(result)

    return PairwiseTestResults(
        updated=pd.DataFrame(map(asdict, updated)),
        equivalent=pd.DataFrame(map(asdict, equivalent)),
    )
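
# Note: ttest_rel performs a *paired* t-test: each chunk's before/after values
# are compared against each other, so chunk-to-chunk variance largely cancels.
# A small p_value suggests the before/after difference is systematic rather
# than noise.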

def main():
    parser = argparse.ArgumentParser(
        description="Performance comparison of Nix evaluation statistics"
    )
    parser.add_argument(
        "--explain", action="store_true", help="Explain the evaluation statistics"
    )
    parser.add_argument(
        "before", help="File or directory containing baseline (data before)"
    )
    parser.add_argument(
        "after", help="File or directory containing comparison (data after)"
    )

    options = parser.parse_args()

    # Turn warnings into errors
    warnings.simplefilter("error")

    before_stats = Path(options.before)
    after_stats = Path(options.after)

    before_metrics = load_all_metrics(before_stats)
    after_metrics = load_all_metrics(after_stats)
    pairwise_test_results = perform_pairwise_tests(before_metrics, after_metrics)
    markdown_table = pairwise_test_results.to_markdown(explain=options.explain)
    print(markdown_table)


if __name__ == "__main__":
    main()
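
# Typical invocation (paths are illustrative):
#
#     cmp-stats --explain before/stats after/stats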
+41 −13
@@ -5,7 +5,46 @@
  runCommand,
  writeText,
  python3,
  stdenvNoCC,
  makeWrapper,
}:
let
  python = python3.withPackages (ps: [
    ps.numpy
    ps.pandas
    ps.scipy
    ps.tabulate
  ]);

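  # Package the comparison script above as a standalone `cmp-stats` command,
  # wrapped so that it runs with a Python that has numpy, pandas, scipy, and
  # tabulate available.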
  cmp-stats = stdenvNoCC.mkDerivation {
    pname = "cmp-stats";
    version = lib.trivial.release;

    dontUnpack = true;

    nativeBuildInputs = [ makeWrapper ];

    installPhase = ''
      runHook preInstall

      mkdir -p $out/share/cmp-stats

      cp ${./cmp-stats.py} "$out/share/cmp-stats/cmp-stats.py"

      makeWrapper ${python.interpreter} "$out/bin/cmp-stats" \
          --add-flags "$out/share/cmp-stats/cmp-stats.py"

      runHook postInstall
    '';

    meta = {
      description = "Performance comparison of Nix evaluation statistics";
      license = lib.licenses.mit;
      mainProgram = "cmp-stats";
      maintainers = with lib.maintainers; [ philiptaron ];
    };
  };
in
{
  combinedDir,
  touchedFilesJson,
@@ -140,21 +179,10 @@ runCommand "compare"
    # Don't depend on -dev outputs to reduce closure size for CI.
    nativeBuildInputs = map lib.getBin [
      jq
      cmp-stats
    ];
    maintainers = builtins.toJSON maintainers;
    passAsFile = [ "maintainers" ];
  }
  ''
    mkdir $out
@@ -181,7 +209,7 @@ runCommand "compare"
        echo
      } >> $out/step-summary.md

      cmp-stats --explain ${combined}/before/stats ${combined}/after/stats >> $out/step-summary.md

    else
      # Package chunks are the same in both revisions