Unverified Commit 9b8ea589 authored by Leona Maroni's avatar Leona Maroni Committed by GitHub
Browse files

nixos/acme: improve scalability - reduce superfluous unit activations (#422076)

parents ecce792f 2c111ce3
Loading
Loading
Loading
Loading
+3 −0
Original line number Diff line number Diff line
@@ -1468,6 +1468,9 @@
  "module-security-acme-fix-jws": [
    "index.html#module-security-acme-fix-jws"
  ],
  "module-security-acme-reload-dependencies": [
    "index.html#module-security-acme-reload-dependencies"
  ],
  "module-programs-zsh-ohmyzsh": [
    "index.html#module-programs-zsh-ohmyzsh"
  ],
+15 −0
Original line number Diff line number Diff line
@@ -171,6 +171,21 @@

- `services.gitea` supports sending notifications with sendmail again. To do this, activate the parameter `services.gitea.mailerUseSendmail` and configure SMTP server.

- Revamp of the ACME certificate acquisication and renewal process to help scale systems with lots (100+) of certificates.

  Units and targets have been reshaped to better support more specific dependency propagation and avoid
  superfluously triggering unchanged units:

  If a service requires a syntactically valid certificate to start it should now depend on the `acme-{certname}.service` unit.

  We now always generate initial self-signed certificates as this drastically simplifies the dependency structure. As a result, the option `security.acme.preliminarySelfsigned` has been removed.

  Instead of the previous `acme-finished-{certname}.target`s there are now `acme-order-renew-{certname}.service`s that will be activated
  in a delayed fashion to ensure that bootstrapping with servers like nginx that take part in the acquisition/renewal process works
  smoothly. Dependencies on `acme-finished` units should move to `acme-order-renew`.

  Note that system activation will complete before all certificates may have been renewed or acquired.

- `libvirt` now supports using `nftables` backend.
  - The `virtualisation.libvirtd.firewallBackend` option can be used to configure the firewall backend used by libvirtd.

+9 −1
Original line number Diff line number Diff line
@@ -318,7 +318,7 @@ can be applied to any service.

  # Now you must augment OpenSMTPD's systemd service to load
  # the certificate files.
  systemd.services.opensmtpd.requires = [ "acme-finished-mail.example.com.target" ];
  systemd.services.opensmtpd.requires = [ "acme-mail.example.com.service" ];
  systemd.services.opensmtpd.serviceConfig.LoadCredential =
    let
      certDir = config.security.acme.certs."mail.example.com".directory;
@@ -376,3 +376,11 @@ systemd-tmpfiles --create
# Note: Do this for all certs that share the same account email address
systemctl start acme-example.com.service
```

## Ensuring dependencies for services that need to be reloaded when a certificate challenges {#module-security-acme-reload-dependencies}

Services that depend on ACME certificates and need to be reloaded can use one of two approaches to reload upon successfull certificate acquisition or renewal:

1. **Using the `security.acme.certs.<name>.reloadServices` option**: This will cause `systemctl try-reload-or-restart` to be run for the listed services.

2. **Using a separate reload unit**: if you need perform more complex actions you can implement a separate reload unit but need to ensure that it lists the `acme-renew-<name>.service` unit both as `wantedBy` AND `after`. See the nginx module implementation with its `nginx-config-reload` service.
+169 −194
Original line number Diff line number Diff line
@@ -24,56 +24,32 @@ let
  # Since that service is a oneshot with RemainAfterExit,
  # the folder will exist during all renewal services.
  lockdir = "/run/acme/";
  concurrencyLockfiles = map (n: "${toString n}.lock") (lib.range 1 cfg.maxConcurrentRenewals);
  # Assign elements of `baseList` to each element of `needAssignmentList`, until the latter is exhausted.
  # returns: [{fst = "element of baseList"; snd = "element of needAssignmentList"}]
  roundRobinAssign =
    baseList: needAssignmentList:
    if baseList == [ ] then [ ] else _rrCycler baseList baseList needAssignmentList;
  _rrCycler =
    with builtins;
    origBaseList: workingBaseList: needAssignmentList:
    if (workingBaseList == [ ] || needAssignmentList == [ ]) then
      [ ]
    else
      [
        {
          fst = head workingBaseList;
          snd = head needAssignmentList;
        }
      ]
      ++ _rrCycler origBaseList (
        if (tail workingBaseList == [ ]) then origBaseList else tail workingBaseList
      ) (tail needAssignmentList);
  attrsToList = lib.mapAttrsToList (
    attrname: attrval: {
      name = attrname;
      value = attrval;
    }
  );
  # for an AttrSet `funcsAttrs` having functions as values, apply single arguments from
  # `argsList` to them in a round-robin manner.
  # Returns an attribute set with the applied functions as values.
  roundRobinApplyAttrs =
    funcsAttrs: argsList:
    lib.listToAttrs (
      map (x: {
        inherit (x.snd) name;
        value = x.snd.value x.fst;
      }) (roundRobinAssign argsList (attrsToList funcsAttrs))
    );

  wrapInFlock =
    lockfilePath: script:
    script:
    # explainer: https://stackoverflow.com/a/60896531
    ''
      exec {LOCKFD}> ${lockfilePath}
      echo "Waiting to acquire lock ${lockfilePath}"
      ${pkgs.flock}/bin/flock ''${LOCKFD} || exit 1
      echo "Acquired lock ${lockfilePath}"
      maxConcurrentRenewals=${toString cfg.maxConcurrentRenewals}

      acquireLock() {
        echo "Waiting to acquire lock in ${lockdir}"
        while true; do
          for i in $(seq 1 $maxConcurrentRenewals); do
            exec {LOCKFD}> "${lockdir}/$i.lock"
            if ${pkgs.flock}/bin/flock -n ''${LOCKFD}; then
              return 0
            fi
            exec {LOCKFD}>&-
          done
          sleep 1;
        done
      }

      if [ "$maxConcurrentRenewals" -gt "0" ]; then
        acquireLock
      fi
    ''
    + script
    + "\n"
    + ''echo "Releasing lock ${lockfilePath}"  # only released after process exit'';
    + script;

  # There are many services required to make cert renewals work.
  # They all follow a common structure:
@@ -160,13 +136,19 @@ let
  );

  # This is defined with lib.mkMerge so that we can separate the config per function.
  setupService = lib.mkMerge [
    {
  setupService = {
    description = "Set up the ACME certificate renewal infrastructure";
    path = [ pkgs.minica ];

    script = lib.mkBefore ''
      ${lib.optionalString cfg.defaults.enableDebugLogs "set -x"}
      set -euo pipefail
      test -e ca/key.pem || minica \
        --ca-key ca/key.pem \
        --ca-cert ca/cert.pem \
        --domains selfsigned.local
    '';

    serviceConfig = commonServiceConfig // {
      # This script runs with elevated privileges, denoted by the +
      # ExecStartPre is used instead of ExecStart so that the `script` continues to work.
@@ -181,7 +163,9 @@ let
        "acme"
        "acme/.lego"
        "acme/.lego/accounts"
        "acme/.minica"
      ];
      BindPaths = "/var/lib/acme/.minica:/tmp/ca";
      StateDirectoryMode = "0755";

      # Creates ${lockdir}. Earlier RemainAfterExit=true means
@@ -194,24 +178,7 @@ let
      # under the acme user.
      UMask = "0077";
    };
    }

    # Avoid race conditions creating the CA for selfsigned certs
    (lib.mkIf cfg.preliminarySelfsigned {
      path = [ pkgs.minica ];
      # Working directory will be /tmp
      script = ''
        test -e ca/key.pem || minica \
          --ca-key ca/key.pem \
          --ca-cert ca/cert.pem \
          --domains selfsigned.local
      '';
      serviceConfig = {
        StateDirectory = [ "acme/.minica" ];
        BindPaths = "/var/lib/acme/.minica:/tmp/ca";
  };
    })
  ];

  certToConfig =
    cert: data:
@@ -219,7 +186,6 @@ let
      acmeServer = data.server;
      useDns = data.dnsProvider != null;
      destPath = "/var/lib/acme/${cert}";
      selfsignedDeps = lib.optionals (cfg.preliminarySelfsigned) [ "acme-selfsigned-${cert}.service" ];

      # Minica and lego have a "feature" which replaces * with _. We need
      # to make this substitution to reference the output files from both programs.
@@ -339,16 +305,18 @@ let
      certificateKey = if data.csrKey != null then "${data.csrKey}" else "certificates/${keyName}.key";
    in
    {
      inherit accountHash cert selfsignedDeps;
      inherit accountHash cert;

      group = data.group;

      renewTimer = {
        description = "Renew ACME Certificate for ${cert}";
        wantedBy = [ "timers.target" ];
        # Avoid triggering certificate renewals accidentally when running s-t-c.
        unitConfig."X-OnlyManualStart" = true;
        timerConfig = {
          OnCalendar = data.renewInterval;
          Unit = "acme-${cert}.service";
          Unit = "acme-order-renew-${cert}.service";
          Persistent = "yes";

          # Allow systemd to pick a convenient time within the day
@@ -364,15 +332,29 @@ let
        };
      };

      selfsignService = lockfileName: {
        description = "Generate self-signed certificate for ${cert}";
      baseService = {
        description = "Ensure certificate for ${cert}";

        wantedBy = [ "multi-user.target" ];

        after = [ "acme-setup.service" ];
        requires = [ "acme-setup.service" ];

        # Whenever this service starts (on boot, through dependencies, through
        # changes) we trigger the acme-order-renew service to give it a chance
        # to catch up with the potentially changed config.
        wants = [
          "acme-setup.service"
          "acme-order-renew-${cert}.service"
        ];
        before = [ "acme-order-renew-${cert}.service" ];

        restartTriggers = [
          config.systemd.services."acme-order-renew-${cert}".script
        ];

        path = [ pkgs.minica ];

        unitConfig = {
          ConditionPathExists = "!/var/lib/acme/${cert}/key.pem";
          StartLimitIntervalSec = 0;
        };

@@ -380,52 +362,83 @@ let
          Group = data.group;
          UMask = "0027";

          RemainAfterExit = true;

          StateDirectory = "acme/${cert}";

          BindPaths = [
            "/var/lib/acme/.minica:/tmp/ca"
            "/var/lib/acme/${cert}:/tmp/${keyName}"
            "/var/lib/acme/${cert}:/tmp/out"
          ];
        };

        # Working directory will be /tmp
        # minica will output to a folder sharing the name of the first domain
        # in the list, which will be ${data.domain}
        script = (if (lockfileName == null) then lib.id else wrapInFlock "${lockdir}${lockfileName}") ''
        script = wrapInFlock ''
          set -ex

          # Regenerate self-signed certificates (in case the SANs change) until we
          # have seen a succesfull ACME certificate at least once.
          if [ -e out/acme-success ]; then
            exit 0
          fi

          minica \
            --ca-key ca/key.pem \
            --ca-cert ca/cert.pem \
            --domains ${lib.escapeShellArg (builtins.concatStringsSep "," ([ data.domain ] ++ extraDomains))}

          # Create files to match directory layout for real certificates
          (
            cd '${keyName}'
          cp ../ca/cert.pem chain.pem
          cat cert.pem chain.pem > fullchain.pem
          cat key.pem fullchain.pem > full.pem
            cp -vp cert.pem ../out/cert.pem
            cp -vp key.pem ../out/key.pem
          )
          cat out/cert.pem ca/cert.pem > out/fullchain.pem
          cp ca/cert.pem out/chain.pem
          cat out/key.pem out/fullchain.pem > out/full.pem

          # Group might change between runs, re-apply it
          chown '${user}:${data.group}' -- *
          # Fix up the output files to adhere to the group and
          # have consistent permissions. This needs to be kept
          # consistent with the acme-setup script above.
          for fixpath in out certificates; do
            if [ -d "$fixpath" ]; then
              chmod -R u=rwX,g=rX,o= "$fixpath"
              chown -R ${user}:${data.group} "$fixpath"
            fi
          done

          # Default permissions make the files unreadable by group + anon
          # Need to be readable by group
          chmod 640 -- *
          ${lib.optionalString (data.webroot != null) ''
            # Ensure the webroot exists. Fixing group is required in case configuration was changed between runs.
            # Lego will fail if the webroot does not exist at all.
            (
              mkdir -p '${data.webroot}/.well-known/acme-challenge' \
              && chgrp '${data.group}' ${data.webroot}/.well-known/acme-challenge
            ) || (
              echo 'Please ensure ${data.webroot}/.well-known/acme-challenge exists and is writable by acme:${data.group}' \
              && exit 1
            )
          ''}
        '';
      };

      renewService = lockfileName: {
        description = "Renew ACME certificate for ${cert}";
      orderRenewService = {
        description = "Order (and renew) ACME certificate for ${cert}";
        after = [
          "network.target"
          "network-online.target"
          "acme-setup.service"
          "nss-lookup.target"
        ]
        ++ selfsignedDeps;
        wants = [ "network-online.target" ] ++ selfsignedDeps;
        requires = [ "acme-setup.service" ];

        # https://github.com/NixOS/nixpkgs/pull/81371#issuecomment-605526099
        wantedBy = lib.optionals (!config.boot.isContainer) [ "multi-user.target" ];
          "acme-${cert}.service"
        ];
        wants = [
          "network-online.target"
          "acme-setup.service"
          "acme-${cert}.service"
        ];
        # Ensure that certificates are generated if people use `security.acme.certs`
        # without having/declaring other systemd units that depend on the cert.

        path = with pkgs; [
          lego
@@ -491,7 +504,7 @@ let
              };

        # Working directory will be /tmp
        script = (if (lockfileName == null) then lib.id else wrapInFlock "${lockdir}${lockfileName}") ''
        script = wrapInFlock ''
          ${lib.optionalString data.enableDebugLogs "set -x"}
          set -euo pipefail

@@ -523,25 +536,12 @@ let
            [[ $expiration_days -gt ${toString data.validMinDays} ]]
          }

          ${lib.optionalString (data.webroot != null) ''
            # Ensure the webroot exists. Fixing group is required in case configuration was changed between runs.
            # Lego will fail if the webroot does not exist at all.
            (
              mkdir -p '${data.webroot}/.well-known/acme-challenge' \
              && chgrp '${data.group}' ${data.webroot}/.well-known/acme-challenge
            ) || (
              echo 'Please ensure ${data.webroot}/.well-known/acme-challenge exists and is writable by acme:${data.group}' \
              && exit 1
            )
          ''}

          echo '${domainHash}' > domainhash.txt

          # Check if we can renew.
          # Check if a new order is needed
          # We can only renew if the list of domains has not changed.
          # We also need an account key. Avoids #190493
          if cmp -s domainhash.txt certificates/domainhash.txt && [ -e '${certificateKey}' ] && [ -e 'certificates/${keyName}.crt' ] && [ -n "$(find accounts -name '${data.email}.key')" ]; then

            # Even if a cert is not expired, it may be revoked by the CA.
            # Try to renew, and silently fail if the cert is not expired.
            # Avoids #85794 and resolves #129838
@@ -553,13 +553,12 @@ let
                exit 11
              fi
            fi

          # Otherwise do a full run
          # Do a full run
          elif ! lego ${runOpts}; then
            # Produce a nice error for those doing their first nixos-rebuild with these certs
            echo Failed to fetch certificates. \
              This may mean your DNS records are set up incorrectly. \
              ${lib.optionalString (cfg.preliminarySelfsigned) "Selfsigned certs are in place and dependant services will still start."}
              Self-signed certs are in place and dependant services will still start.
            # Exit 10 so that users can potentially amend SuccessExitStatus to ignore this error.
            # High number to avoid Systemd reserved codes.
            exit 10
@@ -567,10 +566,12 @@ let

          mv domainhash.txt certificates/

          # Group might change between runs, re-apply it
          chown '${user}:${data.group}' certificates/*
          touch out/acme-success

          # Copy all certs to the "real" certs directory
          # lego has only an interesting subset of files available,
          # construct reasonably compatible files that clients can consume
          # as expected.
          if ! cmp -s 'certificates/${keyName}.crt' out/fullchain.pem; then
            touch out/renewed
            echo Installing new certificate
@@ -581,10 +582,13 @@ let
            cat out/key.pem out/fullchain.pem > out/full.pem
          fi

          # By default group will have no access to the cert files.
          # This chmod will fix that.
          chmod 640 out/*

          # Keep permissions consistent. Needs to be in sync with the other scripts.
          for fixpath in out certificates; do
            if [ -d "$fixpath" ]; then
              chmod -R u=rwX,g=rX,o= "$fixpath"
              chown -R ${user}:${data.group} "$fixpath"
            fi
          done
          # Also ensure safer permissions on the account directory.
          chmod -R u=rwX,g=,o= accounts/.
        '';
@@ -905,19 +909,6 @@ in

  options = {
    security.acme = {
      preliminarySelfsigned = lib.mkOption {
        type = lib.types.bool;
        default = true;
        description = ''
          Whether a preliminary self-signed certificate should be generated before
          doing ACME requests. This can be useful when certificates are required in
          a webserver, but ACME needs the webserver to make its requests.

          With preliminary self-signed certificate the webserver can be started and
          can later reload the correct ACME certificates.
        '';
      };

      acceptTerms = lib.mkOption {
        type = lib.types.bool;
        default = false;
@@ -1003,10 +994,13 @@ in
      "ACME Directory is now hardcoded to /var/lib/acme and its permissions are managed by systemd. See https://github.com/NixOS/nixpkgs/issues/53852 for more info."
    )
    (lib.mkRemovedOptionModule [ "security" "acme" "preDelay" ]
      "This option has been removed. If you want to make sure that something executes before certificates are provisioned, add a RequiredBy=acme-\${cert}.service to the service you want to execute before the cert renewal"
      "This option has been removed. If you want to make sure that something executes before certificates are provisioned, add a RequiredBy=acme-\${cert}.service and Before=acme-\${cert}.service to the service you want to execute before the cert renewal"
    )
    (lib.mkRemovedOptionModule [ "security" "acme" "activationDelay" ]
      "This option has been removed. If you want to make sure that something executes before certificates are provisioned, add a RequiredBy=acme-\${cert}.service to the service you want to execute before the cert renewal"
      "This option has been removed. If you want to make sure that something executes before certificates are provisioned, add a RequiredBy=acme-\${cert}.service and Before=acme-\${cert}.service to the service you want to execute before the cert renewal"
    )
    (lib.mkRemovedOptionModule [ "security" "acme" "preliminarySelfsigned" ]
      "This option has been removed. Preliminary self-signed certificates are now always generated to simplify the dependency structure."
    )
    (lib.mkChangedOptionModule
      [ "security" "acme" "validMin" ]
@@ -1161,45 +1155,25 @@ in

      systemd.services =
        let
          renewServiceFunctions = lib.mapAttrs' (
            cert: conf: lib.nameValuePair "acme-${cert}" conf.renewService
          orderRenewServices = lib.mapAttrs' (
            cert: conf: lib.nameValuePair "acme-order-renew-${cert}" conf.orderRenewService
          ) certConfigs;
          renewServices =
            if cfg.maxConcurrentRenewals > 0 then
              roundRobinApplyAttrs renewServiceFunctions concurrencyLockfiles
            else
              lib.mapAttrs (_: f: f null) renewServiceFunctions;
          selfsignServiceFunctions = lib.mapAttrs' (
            cert: conf: lib.nameValuePair "acme-selfsigned-${cert}" conf.selfsignService
          baseServices = lib.mapAttrs' (
            cert: conf: lib.nameValuePair "acme-${cert}" conf.baseService
          ) certConfigs;
          selfsignServices =
            if cfg.maxConcurrentRenewals > 0 then
              roundRobinApplyAttrs selfsignServiceFunctions concurrencyLockfiles
            else
              lib.mapAttrs (_: f: f null) selfsignServiceFunctions;
        in
        {
          acme-setup = setupService;
        }
        // renewServices
        // lib.optionalAttrs cfg.preliminarySelfsigned selfsignServices;
        // baseServices
        // orderRenewServices;

      systemd.timers = lib.mapAttrs' (
        cert: conf: lib.nameValuePair "acme-${cert}" conf.renewTimer
        cert: conf: lib.nameValuePair "acme-renew-${cert}" conf.renewTimer
      ) certConfigs;

      systemd.targets =
        let
          # Create some targets which can be depended on to be "active" after cert renewals
          finishedTargets = lib.mapAttrs' (
            cert: conf:
            lib.nameValuePair "acme-finished-${cert}" {
              wantedBy = [ "default.target" ];
              requires = [ "acme-${cert}.service" ];
              after = [ "acme-${cert}.service" ];
            }
          ) certConfigs;

          # Create targets to limit the number of simultaneous account creations
          # How it works:
          # - Pick a "leader" cert service, which will be in charge of creating the account,
@@ -1214,8 +1188,8 @@ in
            let
              dnsConfs = builtins.filter (conf: cfg.certs.${conf.cert}.dnsProvider != null) confs;
              leaderConf = if dnsConfs != [ ] then builtins.head dnsConfs else builtins.head confs;
              leader = "acme-${leaderConf.cert}.service";
              followers = map (conf: "acme-${conf.cert}.service") (
              leader = "acme-order-renew-${leaderConf.cert}.service";
              followers = map (conf: "acme-order-renew-${conf.cert}.service") (
                builtins.filter (conf: conf != leaderConf) confs
              );
            in
@@ -1224,10 +1198,11 @@ in
              before = followers;
              requires = [ leader ];
              after = [ leader ];
              unitConfig.RefuseManualStart = true;
            }
          ) (lib.groupBy (conf: conf.accountHash) (lib.attrValues certConfigs));
        in
        finishedTargets // accountTargets;
        accountTargets;
    })
  ];

+1 −1
Original line number Diff line number Diff line
@@ -156,7 +156,7 @@ in
        "network.target"
      ]
      ++ lib.optional (cfg.useACMEHost != null) "acme-${cfg.useACMEHost}.service";
      wants = lib.optional (cfg.useACMEHost != null) "acme-finished-${cfg.useACMEHost}.target";
      wants = lib.optional (cfg.useACMEHost != null) "acme-${cfg.useACMEHost}.service";
      wantedBy = [ "multi-user.target" ];
      serviceConfig = {
        AmbientCapabilities = "CAP_NET_BIND_SERVICE";
Loading