Merge pull request #289593 (confinement + DynamicUser) (2bb1556b) · Commits · nix / nixpkgs

nixos/doc/manual/release-notes/rl-2405.section.md

+2 −0

Original line number	Diff line number	Diff line
		@@ -713,6 +713,8 @@ The pre-existing [services.ankisyncd](#opt-services.ankisyncd.enable) has been m
		- `documentation.man.mandoc` now by default uses `MANPATH` to set the directories where mandoc will search for manual pages.
		This enables mandoc to find manual pages in Nix profiles. To set the manual search paths via the `mandoc.conf` configuration file like before, use `documentation.man.mandoc.settings.manpath` instead.

		- The `systemd-confinement` module extension is now compatible with `DynamicUser=true` and thus `ProtectSystem=strict` too.

		- `grafana-loki` package was updated to 3.0.0 which includes [breaking changes](https://github.com/grafana/loki/releases/tag/v3.0.0).

		- `programs.fish.package` now allows you to override the package used in the `fish` module.

nixos/modules/security/systemd-confinement.nix

+22 −14

Original line number	Diff line number	Diff line
		@@ -79,13 +79,20 @@ in {
		description = ''
		The value `full-apivfs` (the default) sets up
		private {file}`/dev`, {file}`/proc`,
		{file}`/sys` and {file}`/tmp` file systems in a separate user
		name space.
		{file}`/sys`, {file}`/tmp` and {file}`/var/tmp` file systems
		in a separate user name space.

		If this is set to `chroot-only`, only the file
		system name space is set up along with the call to
		{manpage}`chroot(2)`.

		In all cases, unless `serviceConfig.PrivateTmp=true` is set,
		both {file}`/tmp` and {file}`/var/tmp` paths are added to `InaccessiblePaths=`.
		This is to overcome options like `DynamicUser=true`
		implying `PrivateTmp=true` without letting it being turned off.
		Beware however that giving processes the `CAP_SYS_ADMIN` and `@mount` privileges
		can let them undo the effects of `InaccessiblePaths=`.

		::: {.note}
		This doesn't cover network namespaces and is solely for
		file system level isolation.
		@@ -98,8 +105,12 @@ in {
		wantsAPIVFS = lib.mkDefault (config.confinement.mode == "full-apivfs");
		in lib.mkIf config.confinement.enable {
		serviceConfig = {
		RootDirectory = "/var/empty";
		TemporaryFileSystem = "/";
		ReadOnlyPaths = [ "+/" ];
		RuntimeDirectory = [ "confinement/${mkPathSafeName name}" ];
		RootDirectory = "/run/confinement/${mkPathSafeName name}";
		InaccessiblePaths = [
		"-+/run/confinement/${mkPathSafeName name}"
		];
		PrivateMounts = lib.mkDefault true;

		# https://github.com/NixOS/nixpkgs/issues/14645 is a future attempt
		@@ -148,16 +159,6 @@ in {
		+ " Please either define a separate service or find a way to run"
		+ " commands other than ExecStart within the chroot.";
		}
		{ assertion = !cfg.serviceConfig.DynamicUser or false;
		message = "${whatOpt "DynamicUser"}. Please create a dedicated user via"
		+ " the 'users.users' option instead as this combination is"
		+ " currently not supported.";
		}
		{ assertion = cfg.serviceConfig ? ProtectSystem -> cfg.serviceConfig.ProtectSystem == false;
		message = "${whatOpt "ProtectSystem"}. ProtectSystem is not compatible"
		+ " with service confinement as it fails to remount /usr within"
		+ " our chroot. Please disable the option.";
		}
		]) config.systemd.services);

		config.systemd.packages = lib.concatLists (lib.mapAttrsToList (name: cfg: let
		@@ -183,6 +184,13 @@ in {
		echo "BindReadOnlyPaths=$realprog:/bin/sh" >> "$serviceFile"
		''}

		# If DynamicUser= is enabled, PrivateTmp=true is implied (and cannot be turned off).
		# so disable them unless PrivateTmp=true is explicitely set.
		${lib.optionalString (!cfg.serviceConfig.PrivateTmp) ''
		echo "InaccessiblePaths=-+/tmp" >> "$serviceFile"
		echo "InaccessiblePaths=-+/var/tmp" >> "$serviceFile"
		''}

		while read storePath; do
		if [ -L "$storePath" ]; then
		# Currently, systemd can't cope with symlinks in Bind(ReadOnly)Paths,

nixos/tests/all-tests.nix

+1 −1

Original line number	Diff line number	Diff line
		@@ -885,7 +885,7 @@ in {
		systemd-binfmt = handleTestOn ["x86_64-linux"] ./systemd-binfmt.nix {};
		systemd-boot = handleTest ./systemd-boot.nix {};
		systemd-bpf = handleTest ./systemd-bpf.nix {};
		systemd-confinement = handleTest ./systemd-confinement.nix {};
		systemd-confinement = handleTest ./systemd-confinement {};
		systemd-coredump = handleTest ./systemd-coredump.nix {};
		systemd-cryptenroll = handleTest ./systemd-cryptenroll.nix {};
		systemd-credentials-tpm2 = handleTest ./systemd-credentials-tpm2.nix {};

nixos/tests/systemd-confinement.nix

deleted100644 → 0

+0 −184

Original line number	Diff line number	Diff line
		import ./make-test-python.nix {
		name = "systemd-confinement";

		nodes.machine = { pkgs, lib, ... }: let
		testServer = pkgs.writeScript "testserver.sh" ''
		#!${pkgs.runtimeShell}
		export PATH=${lib.escapeShellArg "${pkgs.coreutils}/bin"}
		${lib.escapeShellArg pkgs.runtimeShell} 2>&1
		echo "exit-status:$?"
		'';

		testClient = pkgs.writeScriptBin "chroot-exec" ''
		#!${pkgs.runtimeShell} -e
		output="$(echo "$@" \| nc -NU "/run/test$(< /teststep).sock")"
		ret="$(echo "$output" \| sed -nre '$s/^exit-status:([0-9]+)$/\1/p')"
		echo "$output" \| head -n -1
		exit "''${ret:-1}"
		'';

		mkTestStep = num: {
		testScript,
		config ? {},
		serviceName ? "test${toString num}",
		}: {
		systemd.sockets.${serviceName} = {
		description = "Socket for Test Service ${toString num}";
		wantedBy = [ "sockets.target" ];
		socketConfig.ListenStream = "/run/test${toString num}.sock";
		socketConfig.Accept = true;
		};

		systemd.services."${serviceName}@" = {
		description = "Confined Test Service ${toString num}";
		confinement = (config.confinement or {}) // { enable = true; };
		serviceConfig = (config.serviceConfig or {}) // {
		ExecStart = testServer;
		StandardInput = "socket";
		};
		} // removeAttrs config [ "confinement" "serviceConfig" ];

		__testSteps = lib.mkOrder num (''
		machine.succeed("echo ${toString num} > /teststep")
		'' + testScript);
		};

		in {
		imports = lib.imap1 mkTestStep [
		{ config.confinement.mode = "chroot-only";
		testScript = ''
		with subtest("chroot-only confinement"):
		paths = machine.succeed('chroot-exec ls -1 / \| paste -sd,').strip()
		assert_eq(paths, "bin,nix,run")
		uid = machine.succeed('chroot-exec id -u').strip()
		assert_eq(uid, "0")
		machine.succeed("chroot-exec chown 65534 /bin")
		'';
		}
		{ testScript = ''
		with subtest("full confinement with APIVFS"):
		machine.fail("chroot-exec ls -l /etc")
		machine.fail("chroot-exec chown 65534 /bin")
		assert_eq(machine.succeed('chroot-exec id -u').strip(), "0")
		machine.succeed("chroot-exec chown 0 /bin")
		'';
		}
		{ config.serviceConfig.BindReadOnlyPaths = [ "/etc" ];
		testScript = ''
		with subtest("check existence of bind-mounted /etc"):
		passwd = machine.succeed('chroot-exec cat /etc/passwd').strip()
		assert len(passwd) > 0, "/etc/passwd must not be empty"
		'';
		}
		{ config.serviceConfig.User = "chroot-testuser";
		config.serviceConfig.Group = "chroot-testgroup";
		testScript = ''
		with subtest("check if User/Group really runs as non-root"):
		machine.succeed("chroot-exec ls -l /dev")
		uid = machine.succeed('chroot-exec id -u').strip()
		assert uid != "0", "UID of chroot-testuser shouldn't be 0"
		machine.fail("chroot-exec touch /bin/test")
		'';
		}
		(let
		symlink = pkgs.runCommand "symlink" {
		target = pkgs.writeText "symlink-target" "got me\n";
		} "ln -s \"$target\" \"$out\"";
		in {
		config.confinement.packages = lib.singleton symlink;
		testScript = ''
		with subtest("check if symlinks are properly bind-mounted"):
		machine.fail("chroot-exec test -e /etc")
		text = machine.succeed('chroot-exec cat ${symlink}').strip()
		assert_eq(text, "got me")
		'';
		})
		{ config.serviceConfig.User = "chroot-testuser";
		config.serviceConfig.Group = "chroot-testgroup";
		config.serviceConfig.StateDirectory = "testme";
		testScript = ''
		with subtest("check if StateDirectory works"):
		machine.succeed("chroot-exec touch /tmp/canary")
		machine.succeed('chroot-exec "echo works > /var/lib/testme/foo"')
		machine.succeed('test "$(< /var/lib/testme/foo)" = works')
		machine.succeed("test ! -e /tmp/canary")
		'';
		}
		{ testScript = ''
		with subtest("check if /bin/sh works"):
		machine.succeed(
		"chroot-exec test -e /bin/sh",
		'test "$(chroot-exec \'/bin/sh -c "echo bar"\')" = bar',
		)
		'';
		}
		{ config.confinement.binSh = null;
		testScript = ''
		with subtest("check if suppressing /bin/sh works"):
		machine.succeed("chroot-exec test ! -e /bin/sh")
		machine.succeed('test "$(chroot-exec \'/bin/sh -c "echo foo"\')" != foo')
		'';
		}
		{ config.confinement.binSh = "${pkgs.hello}/bin/hello";
		testScript = ''
		with subtest("check if we can set /bin/sh to something different"):
		machine.succeed("chroot-exec test -e /bin/sh")
		machine.succeed('test "$(chroot-exec /bin/sh -g foo)" = foo')
		'';
		}
		{ config.environment.FOOBAR = pkgs.writeText "foobar" "eek\n";
		testScript = ''
		with subtest("check if only Exec* dependencies are included"):
		machine.succeed('test "$(chroot-exec \'cat "$FOOBAR"\')" != eek')
		'';
		}
		{ config.environment.FOOBAR = pkgs.writeText "foobar" "eek\n";
		config.confinement.fullUnit = true;
		testScript = ''
		with subtest("check if all unit dependencies are included"):
		machine.succeed('test "$(chroot-exec \'cat "$FOOBAR"\')" = eek')
		'';
		}
		{ serviceName = "shipped-unitfile";
		config.confinement.mode = "chroot-only";
		testScript = ''
		with subtest("check if shipped unit file still works"):
		machine.succeed(
		'chroot-exec \'kill -9 $$ 2>&1 \|\| :\' \| '
		'grep -q "Too many levels of symbolic links"'
		)
		'';
		}
		];

		options.__testSteps = lib.mkOption {
		type = lib.types.lines;
		description = "All of the test steps combined as a single script.";
		};

		config.environment.systemPackages = lib.singleton testClient;
		config.systemd.packages = lib.singleton (pkgs.writeTextFile {
		name = "shipped-unitfile";
		destination = "/etc/systemd/system/shipped-unitfile@.service";
		text = ''
		[Service]
		SystemCallFilter=~kill
		SystemCallErrorNumber=ELOOP
		'';
		});

		config.users.groups.chroot-testgroup = {};
		config.users.users.chroot-testuser = {
		isSystemUser = true;
		description = "Chroot Test User";
		group = "chroot-testgroup";
		};
		};

		testScript = { nodes, ... }: ''
		def assert_eq(a, b):
		assert a == b, f"{a} != {b}"

		machine.wait_for_unit("multi-user.target")
		'' + nodes.machine.config.__testSteps;
		}

nixos/tests/systemd-confinement/checkperms.py

0 → 100644

+187 −0

Original line number	Diff line number	Diff line
		import errno
		import os

		from enum import IntEnum
		from pathlib import Path


		class Accessibility(IntEnum):
		"""
		The level of accessibility we have on a file or directory.

		This is needed to assess the attack surface on the file system namespace we
		have within a confined service. Higher levels mean more permissions for the
		user and thus a bigger attack surface.
		"""
		NONE = 0

		# Directories can be listed or files can be read.
		READABLE = 1

		# This is for special file systems such as procfs and for stuff such as
		# FIFOs or character special files. The reason why this has a lower value
		# than WRITABLE is because those files are more restricted on what and how
		# they can be written to.
		SPECIAL = 2

		# Another special case are sticky directories, which do allow write access
		# but restrict deletion. This does not apply to sticky directories that
		# are read-only.
		STICKY = 3

		# Essentially full permissions, the kind of accessibility we want to avoid
		# in most cases.
		WRITABLE = 4

		def assert_on(self, path: Path) -> None:
		"""
		Raise an AssertionError if the given 'path' allows for more
		accessibility than 'self'.
		"""
		actual = self.NONE

		if path.is_symlink():
		actual = self.READABLE
		elif path.is_dir():
		writable = True

		dummy_file = path / 'can_i_write'
		try:
		dummy_file.touch()
		except OSError as e:
		if e.errno in [errno.EROFS, errno.EACCES]:
		writable = False
		else:
		raise
		else:
		dummy_file.unlink()

		if writable:
		# The reason why we test this after we made sure it's
		# writable is because we could have a sticky directory where
		# the current user doesn't have write access.
		if path.stat().st_mode & 0o1000 == 0o1000:
		actual = self.STICKY
		else:
		actual = self.WRITABLE
		else:
		actual = self.READABLE
		elif path.is_file():
		try:
		with path.open('rb') as fp:
		fp.read(1)
		actual = self.READABLE
		except PermissionError:
		pass

		writable = True
		try:
		with path.open('ab') as fp:
		fp.write('x')
		size = fp.tell()
		fp.truncate(size)
		except PermissionError:
		writable = False
		except OSError as e:
		if e.errno == errno.ETXTBSY:
		writable = os.access(path, os.W_OK)
		elif e.errno == errno.EROFS:
		writable = False
		else:
		raise

		# Let's always try to fail towards being writable, so if either
		# access(2) or a real write is successful it's writable. This is to
		# make sure we don't accidentally introduce no-ops if we have bugs
		# in the more complicated real write code above.
		if writable or os.access(path, os.W_OK):
		actual = self.WRITABLE
		else:
		# We need to be very careful when writing to or reading from
		# special files (eg. FIFOs), since they can possibly block. So if
		# it's not a file, just trust that access(2) won't lie.
		if os.access(path, os.R_OK):
		actual = self.READABLE

		if os.access(path, os.W_OK):
		actual = self.SPECIAL

		if actual > self:
		stat = path.stat()
		details = ', '.join([
		f'permissions: {stat.st_mode & 0o7777:o}',
		f'uid: {stat.st_uid}',
		f'group: {stat.st_gid}',
		])

		raise AssertionError(
		f'Expected at most {self!r} but got {actual!r} for path'
		f' {path} ({details}).'
		)


		def is_special_fs(path: Path) -> bool:
		"""
		Check whether the given path truly is a special file system such as procfs
		or sysfs.
		"""
		try:
		if path == Path('/proc'):
		return (path / 'version').read_text().startswith('Linux')
		elif path == Path('/sys'):
		return b'Linux' in (path / 'kernel' / 'notes').read_bytes()
		except FileNotFoundError:
		pass
		return False


		def is_empty_dir(path: Path) -> bool:
		try:
		next(path.iterdir())
		return False
		except (StopIteration, PermissionError):
		return True


		def _assert_permissions_in_directory(
		directory: Path,
		accessibility: Accessibility,
		subdirs: dict[Path, Accessibility],
		) -> None:
		accessibility.assert_on(directory)

		for file in directory.iterdir():
		if is_special_fs(file):
		msg = f'Got unexpected special filesystem at {file}.'
		assert subdirs.pop(file) == Accessibility.SPECIAL, msg
		elif not file.is_symlink() and file.is_dir():
		subdir_access = subdirs.pop(file, accessibility)
		if is_empty_dir(file):
		# Whenever we got an empty directory, we check the permission
		# constraints on the current directory (except if specified
		# explicitly in subdirs) because for example if we're non-root
		# (the constraints of the current directory are thus
		# Accessibility.READABLE), we really have to make sure that
		# empty directories are never writable.
		subdir_access.assert_on(file)
		else:
		_assert_permissions_in_directory(file, subdir_access, subdirs)
		else:
		subdirs.pop(file, accessibility).assert_on(file)


		def assert_permissions(subdirs: dict[str, Accessibility]) -> None:
		"""
		Recursively check whether the file system conforms to the accessibility
		specification we specified via 'subdirs'.
		"""
		root = Path('/')
		absolute_subdirs = {root / p: a for p, a in subdirs.items()}
		_assert_permissions_in_directory(
		root,
		Accessibility.WRITABLE if os.getuid() == 0 else Accessibility.READABLE,
		absolute_subdirs,
		)
		for file in absolute_subdirs.keys():
		msg = f'Expected {file} to exist, but it was nowwhere to be found.'
		raise AssertionError(msg)