Unverified Commit 2bb1556b authored by aszlig's avatar aszlig
Browse files

Merge pull request #289593 (confinement + DynamicUser)

This adds support for the systemd ProtectSystem and DynamicUser options
in conjunction with the systemd-confinement module, which has been a
limitation in the initial implementation and so far has thrown assertion
errors whenever those options were enabled.

Thanks to @ju1m, we now no longer need to resort to static users.

Review for this work took a little bit longer since I wanted to be
absolutely sure that we don't introduce any new regressions, which would
involve increasing the attack surface.

In the end however, we even managed to even lower the attack surface
even more since now the confined filesystem root is now read-only even
for the root user.
parents 0d793f31 e4bd1e8f
Loading
Loading
Loading
Loading
+2 −0
Original line number Diff line number Diff line
@@ -713,6 +713,8 @@ The pre-existing [services.ankisyncd](#opt-services.ankisyncd.enable) has been m
- `documentation.man.mandoc` now by default uses `MANPATH` to set the directories where mandoc will search for manual pages.
  This enables mandoc to find manual pages in Nix profiles. To set the manual search paths via the `mandoc.conf` configuration file like before, use `documentation.man.mandoc.settings.manpath` instead.

- The `systemd-confinement` module extension is now compatible with `DynamicUser=true` and thus `ProtectSystem=strict` too.

- `grafana-loki` package was updated to 3.0.0 which includes [breaking changes](https://github.com/grafana/loki/releases/tag/v3.0.0).

- `programs.fish.package` now allows you to override the package used in the `fish` module.
+22 −14
Original line number Diff line number Diff line
@@ -79,13 +79,20 @@ in {
        description = ''
          The value `full-apivfs` (the default) sets up
          private {file}`/dev`, {file}`/proc`,
          {file}`/sys` and {file}`/tmp` file systems in a separate user
          name space.
          {file}`/sys`, {file}`/tmp` and {file}`/var/tmp` file systems
          in a separate user name space.

          If this is set to `chroot-only`, only the file
          system name space is set up along with the call to
          {manpage}`chroot(2)`.

          In all cases, unless `serviceConfig.PrivateTmp=true` is set,
          both {file}`/tmp` and {file}`/var/tmp` paths are added to `InaccessiblePaths=`.
          This is to overcome options like `DynamicUser=true`
          implying `PrivateTmp=true` without letting it being turned off.
          Beware however that giving processes the `CAP_SYS_ADMIN` and `@mount` privileges
          can let them undo the effects of `InaccessiblePaths=`.

          ::: {.note}
          This doesn't cover network namespaces and is solely for
          file system level isolation.
@@ -98,8 +105,12 @@ in {
        wantsAPIVFS = lib.mkDefault (config.confinement.mode == "full-apivfs");
      in lib.mkIf config.confinement.enable {
        serviceConfig = {
          RootDirectory = "/var/empty";
          TemporaryFileSystem = "/";
          ReadOnlyPaths = [ "+/" ];
          RuntimeDirectory = [ "confinement/${mkPathSafeName name}" ];
          RootDirectory = "/run/confinement/${mkPathSafeName name}";
          InaccessiblePaths = [
            "-+/run/confinement/${mkPathSafeName name}"
          ];
          PrivateMounts = lib.mkDefault true;

          # https://github.com/NixOS/nixpkgs/issues/14645 is a future attempt
@@ -148,16 +159,6 @@ in {
              + " Please either define a separate service or find a way to run"
              + " commands other than ExecStart within the chroot.";
    }
    { assertion = !cfg.serviceConfig.DynamicUser or false;
      message = "${whatOpt "DynamicUser"}. Please create a dedicated user via"
              + " the 'users.users' option instead as this combination is"
              + " currently not supported.";
    }
    { assertion = cfg.serviceConfig ? ProtectSystem -> cfg.serviceConfig.ProtectSystem == false;
      message = "${whatOpt "ProtectSystem"}. ProtectSystem is not compatible"
              + " with service confinement as it fails to remount /usr within"
              + " our chroot. Please disable the option.";
    }
  ]) config.systemd.services);

  config.systemd.packages = lib.concatLists (lib.mapAttrsToList (name: cfg: let
@@ -183,6 +184,13 @@ in {
        echo "BindReadOnlyPaths=$realprog:/bin/sh" >> "$serviceFile"
      ''}

      # If DynamicUser= is enabled, PrivateTmp=true is implied (and cannot be turned off).
      # so disable them unless PrivateTmp=true is explicitely set.
      ${lib.optionalString (!cfg.serviceConfig.PrivateTmp) ''
        echo "InaccessiblePaths=-+/tmp" >> "$serviceFile"
        echo "InaccessiblePaths=-+/var/tmp" >> "$serviceFile"
      ''}

      while read storePath; do
        if [ -L "$storePath" ]; then
          # Currently, systemd can't cope with symlinks in Bind(ReadOnly)Paths,
+1 −1
Original line number Diff line number Diff line
@@ -885,7 +885,7 @@ in {
  systemd-binfmt = handleTestOn ["x86_64-linux"] ./systemd-binfmt.nix {};
  systemd-boot = handleTest ./systemd-boot.nix {};
  systemd-bpf = handleTest ./systemd-bpf.nix {};
  systemd-confinement = handleTest ./systemd-confinement.nix {};
  systemd-confinement = handleTest ./systemd-confinement {};
  systemd-coredump = handleTest ./systemd-coredump.nix {};
  systemd-cryptenroll = handleTest ./systemd-cryptenroll.nix {};
  systemd-credentials-tpm2 = handleTest ./systemd-credentials-tpm2.nix {};
+0 −184
Original line number Diff line number Diff line
import ./make-test-python.nix {
  name = "systemd-confinement";

  nodes.machine = { pkgs, lib, ... }: let
    testServer = pkgs.writeScript "testserver.sh" ''
      #!${pkgs.runtimeShell}
      export PATH=${lib.escapeShellArg "${pkgs.coreutils}/bin"}
      ${lib.escapeShellArg pkgs.runtimeShell} 2>&1
      echo "exit-status:$?"
    '';

    testClient = pkgs.writeScriptBin "chroot-exec" ''
      #!${pkgs.runtimeShell} -e
      output="$(echo "$@" | nc -NU "/run/test$(< /teststep).sock")"
      ret="$(echo "$output" | sed -nre '$s/^exit-status:([0-9]+)$/\1/p')"
      echo "$output" | head -n -1
      exit "''${ret:-1}"
    '';

    mkTestStep = num: {
      testScript,
      config ? {},
      serviceName ? "test${toString num}",
    }: {
      systemd.sockets.${serviceName} = {
        description = "Socket for Test Service ${toString num}";
        wantedBy = [ "sockets.target" ];
        socketConfig.ListenStream = "/run/test${toString num}.sock";
        socketConfig.Accept = true;
      };

      systemd.services."${serviceName}@" = {
        description = "Confined Test Service ${toString num}";
        confinement = (config.confinement or {}) // { enable = true; };
        serviceConfig = (config.serviceConfig or {}) // {
          ExecStart = testServer;
          StandardInput = "socket";
        };
      } // removeAttrs config [ "confinement" "serviceConfig" ];

      __testSteps = lib.mkOrder num (''
        machine.succeed("echo ${toString num} > /teststep")
      '' + testScript);
    };

  in {
    imports = lib.imap1 mkTestStep [
      { config.confinement.mode = "chroot-only";
        testScript = ''
          with subtest("chroot-only confinement"):
              paths = machine.succeed('chroot-exec ls -1 / | paste -sd,').strip()
              assert_eq(paths, "bin,nix,run")
              uid = machine.succeed('chroot-exec id -u').strip()
              assert_eq(uid, "0")
              machine.succeed("chroot-exec chown 65534 /bin")
        '';
      }
      { testScript = ''
          with subtest("full confinement with APIVFS"):
              machine.fail("chroot-exec ls -l /etc")
              machine.fail("chroot-exec chown 65534 /bin")
              assert_eq(machine.succeed('chroot-exec id -u').strip(), "0")
              machine.succeed("chroot-exec chown 0 /bin")
        '';
      }
      { config.serviceConfig.BindReadOnlyPaths = [ "/etc" ];
        testScript = ''
          with subtest("check existence of bind-mounted /etc"):
              passwd = machine.succeed('chroot-exec cat /etc/passwd').strip()
              assert len(passwd) > 0, "/etc/passwd must not be empty"
        '';
      }
      { config.serviceConfig.User = "chroot-testuser";
        config.serviceConfig.Group = "chroot-testgroup";
        testScript = ''
          with subtest("check if User/Group really runs as non-root"):
              machine.succeed("chroot-exec ls -l /dev")
              uid = machine.succeed('chroot-exec id -u').strip()
              assert uid != "0", "UID of chroot-testuser shouldn't be 0"
              machine.fail("chroot-exec touch /bin/test")
        '';
      }
      (let
        symlink = pkgs.runCommand "symlink" {
          target = pkgs.writeText "symlink-target" "got me\n";
        } "ln -s \"$target\" \"$out\"";
      in {
        config.confinement.packages = lib.singleton symlink;
        testScript = ''
          with subtest("check if symlinks are properly bind-mounted"):
              machine.fail("chroot-exec test -e /etc")
              text = machine.succeed('chroot-exec cat ${symlink}').strip()
              assert_eq(text, "got me")
        '';
      })
      { config.serviceConfig.User = "chroot-testuser";
        config.serviceConfig.Group = "chroot-testgroup";
        config.serviceConfig.StateDirectory = "testme";
        testScript = ''
          with subtest("check if StateDirectory works"):
              machine.succeed("chroot-exec touch /tmp/canary")
              machine.succeed('chroot-exec "echo works > /var/lib/testme/foo"')
              machine.succeed('test "$(< /var/lib/testme/foo)" = works')
              machine.succeed("test ! -e /tmp/canary")
        '';
      }
      { testScript = ''
          with subtest("check if /bin/sh works"):
              machine.succeed(
                  "chroot-exec test -e /bin/sh",
                  'test "$(chroot-exec \'/bin/sh -c "echo bar"\')" = bar',
              )
        '';
      }
      { config.confinement.binSh = null;
        testScript = ''
          with subtest("check if suppressing /bin/sh works"):
              machine.succeed("chroot-exec test ! -e /bin/sh")
              machine.succeed('test "$(chroot-exec \'/bin/sh -c "echo foo"\')" != foo')
        '';
      }
      { config.confinement.binSh = "${pkgs.hello}/bin/hello";
        testScript = ''
          with subtest("check if we can set /bin/sh to something different"):
              machine.succeed("chroot-exec test -e /bin/sh")
              machine.succeed('test "$(chroot-exec /bin/sh -g foo)" = foo')
        '';
      }
      { config.environment.FOOBAR = pkgs.writeText "foobar" "eek\n";
        testScript = ''
          with subtest("check if only Exec* dependencies are included"):
              machine.succeed('test "$(chroot-exec \'cat "$FOOBAR"\')" != eek')
        '';
      }
      { config.environment.FOOBAR = pkgs.writeText "foobar" "eek\n";
        config.confinement.fullUnit = true;
        testScript = ''
          with subtest("check if all unit dependencies are included"):
              machine.succeed('test "$(chroot-exec \'cat "$FOOBAR"\')" = eek')
        '';
      }
      { serviceName = "shipped-unitfile";
        config.confinement.mode = "chroot-only";
        testScript = ''
          with subtest("check if shipped unit file still works"):
              machine.succeed(
                  'chroot-exec \'kill -9 $$ 2>&1 || :\' | '
                  'grep -q "Too many levels of symbolic links"'
              )
        '';
      }
    ];

    options.__testSteps = lib.mkOption {
      type = lib.types.lines;
      description = "All of the test steps combined as a single script.";
    };

    config.environment.systemPackages = lib.singleton testClient;
    config.systemd.packages = lib.singleton (pkgs.writeTextFile {
      name = "shipped-unitfile";
      destination = "/etc/systemd/system/shipped-unitfile@.service";
      text = ''
        [Service]
        SystemCallFilter=~kill
        SystemCallErrorNumber=ELOOP
      '';
    });

    config.users.groups.chroot-testgroup = {};
    config.users.users.chroot-testuser = {
      isSystemUser = true;
      description = "Chroot Test User";
      group = "chroot-testgroup";
    };
  };

  testScript = { nodes, ... }: ''
    def assert_eq(a, b):
        assert a == b, f"{a} != {b}"

    machine.wait_for_unit("multi-user.target")
  '' + nodes.machine.config.__testSteps;
}
+187 −0
Original line number Diff line number Diff line
import errno
import os

from enum import IntEnum
from pathlib import Path


class Accessibility(IntEnum):
    """
    The level of accessibility we have on a file or directory.

    This is needed to assess the attack surface on the file system namespace we
    have within a confined service. Higher levels mean more permissions for the
    user and thus a bigger attack surface.
    """
    NONE = 0

    # Directories can be listed or files can be read.
    READABLE = 1

    # This is for special file systems such as procfs and for stuff such as
    # FIFOs or character special files. The reason why this has a lower value
    # than WRITABLE is because those files are more restricted on what and how
    # they can be written to.
    SPECIAL = 2

    # Another special case are sticky directories, which do allow write access
    # but restrict deletion. This does *not* apply to sticky directories that
    # are read-only.
    STICKY = 3

    # Essentially full permissions, the kind of accessibility we want to avoid
    # in most cases.
    WRITABLE = 4

    def assert_on(self, path: Path) -> None:
        """
        Raise an AssertionError if the given 'path' allows for more
        accessibility than 'self'.
        """
        actual = self.NONE

        if path.is_symlink():
            actual = self.READABLE
        elif path.is_dir():
            writable = True

            dummy_file = path / 'can_i_write'
            try:
                dummy_file.touch()
            except OSError as e:
                if e.errno in [errno.EROFS, errno.EACCES]:
                    writable = False
                else:
                    raise
            else:
                dummy_file.unlink()

            if writable:
                # The reason why we test this *after* we made sure it's
                # writable is because we could have a sticky directory where
                # the current user doesn't have write access.
                if path.stat().st_mode & 0o1000 == 0o1000:
                    actual = self.STICKY
                else:
                    actual = self.WRITABLE
            else:
                actual = self.READABLE
        elif path.is_file():
            try:
                with path.open('rb') as fp:
                    fp.read(1)
                actual = self.READABLE
            except PermissionError:
                pass

            writable = True
            try:
                with path.open('ab') as fp:
                    fp.write('x')
                    size = fp.tell()
                    fp.truncate(size)
            except PermissionError:
                writable = False
            except OSError as e:
                if e.errno == errno.ETXTBSY:
                    writable = os.access(path, os.W_OK)
                elif e.errno == errno.EROFS:
                    writable = False
                else:
                    raise

            # Let's always try to fail towards being writable, so if *either*
            # access(2) or a real write is successful it's writable. This is to
            # make sure we don't accidentally introduce no-ops if we have bugs
            # in the more complicated real write code above.
            if writable or os.access(path, os.W_OK):
                actual = self.WRITABLE
        else:
            # We need to be very careful when writing to or reading from
            # special files (eg.  FIFOs), since they can possibly block. So if
            # it's not a file, just trust that access(2) won't lie.
            if os.access(path, os.R_OK):
                actual = self.READABLE

            if os.access(path, os.W_OK):
                actual = self.SPECIAL

        if actual > self:
            stat = path.stat()
            details = ', '.join([
                f'permissions: {stat.st_mode & 0o7777:o}',
                f'uid: {stat.st_uid}',
                f'group: {stat.st_gid}',
            ])

            raise AssertionError(
                f'Expected at most {self!r} but got {actual!r} for path'
                f' {path} ({details}).'
            )


def is_special_fs(path: Path) -> bool:
    """
    Check whether the given path truly is a special file system such as procfs
    or sysfs.
    """
    try:
        if path == Path('/proc'):
            return (path / 'version').read_text().startswith('Linux')
        elif path == Path('/sys'):
            return b'Linux' in (path / 'kernel' / 'notes').read_bytes()
    except FileNotFoundError:
        pass
    return False


def is_empty_dir(path: Path) -> bool:
    try:
        next(path.iterdir())
        return False
    except (StopIteration, PermissionError):
        return True


def _assert_permissions_in_directory(
    directory: Path,
    accessibility: Accessibility,
    subdirs: dict[Path, Accessibility],
) -> None:
    accessibility.assert_on(directory)

    for file in directory.iterdir():
        if is_special_fs(file):
            msg = f'Got unexpected special filesystem at {file}.'
            assert subdirs.pop(file) == Accessibility.SPECIAL, msg
        elif not file.is_symlink() and file.is_dir():
            subdir_access = subdirs.pop(file, accessibility)
            if is_empty_dir(file):
                # Whenever we got an empty directory, we check the permission
                # constraints on the current directory (except if specified
                # explicitly in subdirs) because for example if we're non-root
                # (the constraints of the current directory are thus
                # Accessibility.READABLE), we really have to make sure that
                # empty directories are *never* writable.
                subdir_access.assert_on(file)
            else:
                _assert_permissions_in_directory(file, subdir_access, subdirs)
        else:
            subdirs.pop(file, accessibility).assert_on(file)


def assert_permissions(subdirs: dict[str, Accessibility]) -> None:
    """
    Recursively check whether the file system conforms to the accessibility
    specification we specified via 'subdirs'.
    """
    root = Path('/')
    absolute_subdirs = {root / p: a for p, a in subdirs.items()}
    _assert_permissions_in_directory(
        root,
        Accessibility.WRITABLE if os.getuid() == 0 else Accessibility.READABLE,
        absolute_subdirs,
    )
    for file in absolute_subdirs.keys():
        msg = f'Expected {file} to exist, but it was nowwhere to be found.'
        raise AssertionError(msg)
Loading