Commit ebdfdae1 authored by Martin Puppe's avatar Martin Puppe
Browse files

nixos/paperless: download NLTK data

Since version 1.10.0, paperless-ngx depends on the NLTK library, which is
used to pre-process data for machine learning. NLTK needs certain
data for stemming, stopword removal, etc. This data has to be downloaded
first. This commit introduces a new systemd service that does the
downloading.
parent fa7fbe56
Loading
Loading
Loading
Loading
+30 −0
Original line number Diff line number Diff line
@@ -6,6 +6,7 @@ let
  pkg = cfg.package;

  defaultUser = "paperless";
  # Directory holding the NLTK data. It is exported to paperless as
  # PAPERLESS_NLTK_DIR and is the download target of the
  # paperless-download-nltk-data service.
  nltkDir = "/var/cache/paperless/nltk";

  # Don't start a redis instance if the user sets a custom redis connection
  enableRedis = !hasAttr "PAPERLESS_REDIS" cfg.extraConfig;
@@ -15,6 +16,7 @@ let
    PAPERLESS_DATA_DIR = cfg.dataDir;
    PAPERLESS_MEDIA_ROOT = cfg.mediaDir;
    PAPERLESS_CONSUMPTION_DIR = cfg.consumptionDir;
    PAPERLESS_NLTK_DIR = nltkDir;
    GUNICORN_CMD_ARGS = "--bind=${cfg.address}:${toString cfg.port}";
  } // optionalAttrs (config.time.timeZone != null) {
    PAPERLESS_TIME_ZONE = config.time.timeZone;
@@ -49,6 +51,7 @@ let
      cfg.dataDir
      cfg.mediaDir
    ];
    CacheDirectory = "paperless";
    CapabilityBoundingSet = "";
    # ProtectClock adds DeviceAllow=char-rtc r
    DeviceAllow = "";
@@ -293,6 +296,33 @@ in
      };
    };

    # Download NLTK corpus data
    #
    # One-shot unit that fetches the NLTK data sets paperless needs (punkt,
    # snowball_data, stopwords) into nltkDir. It is pulled in by, and ordered
    # before, paperless-scheduler.service.
    systemd.services.paperless-download-nltk-data = {
      description = "Paperless NLTK data download";
      wantedBy = [ "paperless-scheduler.service" ];
      before = [ "paperless-scheduler.service" ];
      # `after` only orders units; it does not activate network-online.target.
      # Without `wants` the target may never be part of the transaction and
      # the download could start before the network is actually up.
      wants = [ "network-online.target" ];
      after = [ "network-online.target" ];
      serviceConfig = defaultServiceConfig // {
        User = cfg.user;
        # Runs to completion once; units ordered after it wait for it to exit.
        Type = "oneshot";
        # Enable internet access
        PrivateNetwork = false;
        # Restrict write access: override whatever writable bind mounts
        # defaultServiceConfig sets with an empty list.
        BindPaths = [];
        # Read-only view of the store plus the files needed for DNS
        # resolution and TLS verification. A leading "-" tells systemd to
        # ignore the entry if the path does not exist.
        BindReadOnlyPaths = [
          "/nix/store"
          "-/etc/resolv.conf"
          "-/etc/nsswitch.conf"
          "-/etc/ssl/certs"
          "-/etc/static/ssl/certs"
          "-/etc/hosts"
          "-/etc/localtime"
        ];
        # Use the same Python as the paperless package, extended with nltk,
        # so the downloader module is available.
        ExecStart = let pythonWithNltk = pkg.python.withPackages (ps: [ ps.nltk ]); in ''
          ${pythonWithNltk}/bin/python -m nltk.downloader -d '${nltkDir}' punkt snowball_data stopwords
        '';
      };
    };

    systemd.services.paperless-consumer = {
      description = "Paperless document consumer";
      # Bind to `paperless-scheduler` so that the consumer never runs