Unverified commit 56cfc87a authored by Christian Kögler, committed by GitHub

Merge pull request #307020 from ck3d/local-ai-2130

local-ai: 2.12.4 -> 2.13.0
parents adc9566b 729264e1
lib.nix (new file): +30 −0
{ lib
, writers
, writeText
, linkFarmFromDrvs
}: {
  genModels = configs:
    let
      name = lib.strings.sanitizeDerivationName
        (builtins.concatStringsSep "_" ([ "local-ai-models" ] ++ (builtins.attrNames configs)));

      genModelFiles = name: config:
        let
          templateName = type: name + "_" + type;

          config' = lib.recursiveUpdate config ({
            inherit name;
          } // lib.optionalAttrs (lib.isDerivation config.parameters.model) {
            parameters.model = config.parameters.model.name;
          } // lib.optionalAttrs (config ? template) {
            template = builtins.mapAttrs (n: _: templateName n) config.template;
          });
        in
        [ (writers.writeYAML "${name}.yaml" config') ]
        ++ lib.optional (lib.isDerivation config.parameters.model)
          config.parameters.model
        ++ lib.optionals (config ? template)
          (lib.mapAttrsToList (n: writeText "${templateName n}.tmpl") config.template);
    in
    linkFarmFromDrvs name (lib.flatten (lib.mapAttrsToList genModelFiles configs));
}
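A minimal usage sketch of the genModels helper above, assuming it is reached through the package's passthru.lib attribute; the attribute name, URL, and hash below are placeholders, not values from this change. genModels turns an attribute set of LocalAI model configurations into one linked directory that can serve as the models path.

# Hypothetical sketch; "example-model", the URL and lib.fakeHash are placeholders.
{ lib, fetchurl, local-ai }:
local-ai.lib.genModels {
  example-model = {
    backend = "bert-embeddings";
    # A derivation given here is linked into the result and replaced by its
    # file name inside the generated "example-model.yaml".
    parameters.model = fetchurl {
      url = "https://example.org/ggml-model-f16.bin";
      hash = lib.fakeHash;
    };
  };
}

The generated YAML files, model blobs, and template files all land in a single linkFarm, which is what the tests further below pass to services.local-ai.models.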
module.nix (new file): +56 −0
{ pkgs, config, lib, ... }:
let
  cfg = config.services.local-ai;
  inherit (lib) mkOption types;
in
{
  options.services.local-ai = {
    enable = lib.mkEnableOption "local-ai";

    package = lib.mkPackageOption pkgs "local-ai" { };

    extraArgs = mkOption {
      type = types.listOf types.str;
      default = [ ];
    };

    port = mkOption {
      type = types.port;
      default = 8080;
    };

    threads = mkOption {
      type = types.int;
      default = 1;
    };

    models = mkOption {
      type = types.either types.package types.str;
      default = "models";
    };
  };

  config = lib.mkIf cfg.enable {
    systemd.services.local-ai = {
      wantedBy = [ "multi-user.target" ];
      serviceConfig = {
        DynamicUser = true;
        ExecStart = lib.escapeShellArgs ([
          "${cfg.package}/bin/local-ai"
          "--debug"
          "--address"
          ":${toString cfg.port}"
          "--threads"
          (toString cfg.threads)
          "--localai-config-dir"
          "."
          "--models-path"
          (toString cfg.models)
        ]
        ++ cfg.extraArgs);
        RuntimeDirectory = "local-ai";
        WorkingDirectory = "%t/local-ai";
      };
    };
  };
}
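Roughly how the module above might be used in a NixOS configuration; a sketch only, with the import path and the models location as placeholders. The module is not added to the NixOS module list by this change, so in this sketch it is imported explicitly.

# Hypothetical configuration fragment; ./module.nix and the models path are placeholders.
{ pkgs, ... }:
{
  imports = [ ./module.nix ];
  services.local-ai = {
    enable = true;
    port = 8080;
    threads = 4;
    # Either a package (for example the output of local-ai.lib.genModels)
    # or a directory name, per the option type above.
    models = "/var/lib/local-ai/models";
  };
}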
package.nix: +24 −16
@@ -6,6 +6,8 @@
 , fetchpatch
 , fetchFromGitHub
 , protobuf
+, protoc-gen-go
+, protoc-gen-go-grpc
 , grpc
 , openssl
 , llama-cpp
@@ -61,8 +63,8 @@ let

   inherit (cudaPackages) libcublas cuda_nvcc cuda_cccl cuda_cudart cudatoolkit;
 
-  go-llama-ggml = effectiveStdenv.mkDerivation {
-    name = "go-llama-ggml";
+  go-llama = effectiveStdenv.mkDerivation {
+    name = "go-llama";
     src = fetchFromGitHub {
       owner = "go-skynet";
       repo = "go-llama.cpp";
@@ -98,8 +100,8 @@ let
     src = fetchFromGitHub {
       owner = "ggerganov";
       repo = "llama.cpp";
-      rev = "1b67731e184e27a465b8c5476061294a4af668ea";
-      hash = "sha256-0WWbsklpW6HhFRkvWpYh8Lhi8VIansS/zmyIKNQRkIs=";
+      rev = "784e11dea1f5ce9638851b2b0dddb107e2a609c8";
+      hash = "sha256-yAQAUo5J+a6O2kTqhFL1UH0tANxpQn3JhAd3MByaC6I=";
       fetchSubmodules = true;
     };
     postPatch = prev.postPatch + ''
@@ -252,8 +254,8 @@ let
     src = fetchFromGitHub {
       owner = "ggerganov";
       repo = "whisper.cpp";
-      rev = "8f253ef3af1c62c04316ba4afa7145fc4d701a8c";
-      hash = "sha256-yHHjhpQIn99A/hqFwAb7TfTf4Q9KnKat93zyXS70bT8=";
+      rev = "858452d58dba3acdc3431c9bced2bb8cfd9bf418";
+      hash = "sha256-2fT3RgGpBex1mF6GJsVDo4rb0F31YqxTymsXcrpQAZk=";
     };
 
     nativeBuildInputs = [ cmake pkg-config ]
@@ -371,18 +373,18 @@ let
       stdenv;
 
   pname = "local-ai";
-  version = "2.12.4";
+  version = "2.13.0";
   src = fetchFromGitHub {
     owner = "go-skynet";
     repo = "LocalAI";
     rev = "v${version}";
-    hash = "sha256-piu2B6u4ZfxiOd9SXrE7jiiiwL2SM8EqXo2s5qeKRl0=";
+    hash = "sha256-jZE8Ow9FFhnx/jvsURLYlYtSuKpE4UWBezxg/mpHs9g=";
   };
 
   self = buildGoModule.override { stdenv = effectiveStdenv; } {
     inherit pname version src;
 
-    vendorHash = "sha256-8Hu1y/PK21twnB7D22ltslFFzRrsB8d1R2hkgIFB/XY=";
+    vendorHash = "sha256-nWNK2YekQnBSLx4ouNSe6esIe0yFuo69E0HStYLQANg=";
 
     env.NIX_CFLAGS_COMPILE = lib.optionalString with_stablediffusion " -isystem ${opencv}/include/opencv4";
 
@@ -392,12 +394,12 @@ let
       in
       ''
         sed -i Makefile \
-          -e 's;git clone.*go-llama-ggml$;${cp} ${go-llama-ggml} sources/go-llama-ggml;' \
+          -e 's;git clone.*go-llama\.cpp$;${cp} ${go-llama} sources/go-llama\.cpp;' \
           -e 's;git clone.*gpt4all$;${cp} ${gpt4all} sources/gpt4all;' \
           -e 's;git clone.*go-piper$;${cp} ${if with_tts then go-piper else go-piper.src} sources/go-piper;' \
-          -e 's;git clone.*go-rwkv$;${cp} ${go-rwkv} sources/go-rwkv;' \
+          -e 's;git clone.*go-rwkv\.cpp$;${cp} ${go-rwkv} sources/go-rwkv\.cpp;' \
           -e 's;git clone.*whisper\.cpp$;${cp} ${whisper-cpp.src} sources/whisper\.cpp;' \
-          -e 's;git clone.*go-bert$;${cp} ${go-bert} sources/go-bert;' \
+          -e 's;git clone.*go-bert\.cpp$;${cp} ${go-bert} sources/go-bert\.cpp;' \
           -e 's;git clone.*diffusion$;${cp} ${if with_stablediffusion then go-stable-diffusion else go-stable-diffusion.src} sources/go-stable-diffusion;' \
           -e 's;git clone.*go-tiny-dream$;${cp} ${if with_tinydream then go-tiny-dream else go-tiny-dream.src} sources/go-tiny-dream;' \
           -e 's, && git checkout.*,,g' \
@@ -415,14 +417,19 @@ let
       ++ lib.optionals with_stablediffusion go-stable-diffusion.buildInputs
       ++ lib.optionals with_tts go-piper.buildInputs;
 
-    nativeBuildInputs = [ makeWrapper ]
+    nativeBuildInputs = [
+      protobuf
+      protoc-gen-go
+      protoc-gen-go-grpc
+      makeWrapper
+    ]
     ++ lib.optionals with_cublas [ cuda_nvcc ];
 
     enableParallelBuilding = false;
 
     modBuildPhase = ''
       mkdir sources
-      make prepare-sources
+      make prepare-sources protogen-go
       go mod tidy -v
     '';
 
@@ -486,7 +493,7 @@ let

     passthru.local-packages = {
       inherit
-        go-tiny-dream go-rwkv go-bert go-llama-ggml gpt4all go-piper
+        go-tiny-dream go-rwkv go-bert go-llama gpt4all go-piper
         llama-cpp-grpc whisper-cpp go-tiny-dream-ncnn espeak-ng' piper-phonemize
         piper-tts';
     };
@@ -498,6 +505,7 @@ let
     };
 
     passthru.tests = callPackages ./tests.nix { inherit self; };
+    passthru.lib = callPackages ./lib.nix { };
 
     meta = with lib; {
       description = "OpenAI alternative to run local LLMs, image and audio generation";
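The with_cublas, with_clblas, with_tts and related flags referenced above, and in the feature guards of the tests below, are package arguments; what follows is a hedged sketch of how a variant would typically be selected with override. Only the flag names appear in this diff, their defaults do not, and the chosen values are illustrative.

# Hypothetical override; flag names are taken from the expressions above,
# the values are illustrative only.
{ pkgs }:
pkgs.local-ai.override {
  with_cublas = true; # CUDA (cuBLAS) accelerated backends
  with_tts = true;    # piper text-to-speech backend
}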
tests.nix: +188 −100
@@ -5,156 +5,244 @@
, fetchurl
, writers
, symlinkJoin
, linkFarmFromDrvs
, jq
}:
let
  common-config = { config, ... }: {
    imports = [ ./module.nix ];
    services.local-ai = {
      enable = true;
      package = self;
      threads = config.virtualisation.cores;
    };
  };

  inherit (self.lib) genModels;
in
{
  version = testers.testVersion {
    package = self;
    version = "v" + self.version;
    command = "local-ai --help";
  };

  health =
  health = testers.runNixOSTest ({ config, ... }: {
    name = self.name + "-health";
    nodes.machine = common-config;
    testScript =
      let
        port = "8080";
      in
    testers.runNixOSTest {
      name = self.name + "-health";
      nodes.machine = {
        systemd.services.local-ai = {
          wantedBy = [ "multi-user.target" ];
          serviceConfig.ExecStart = "${self}/bin/local-ai --debug --localai-config-dir . --address :${port}";
        };
      };
      testScript = ''
      ''
        machine.wait_for_open_port(${port})
        machine.succeed("curl -f http://localhost:${port}/readyz")
      '';
    };
  });

  # https://localai.io/docs/getting-started/manual/
  llama =
  # https://localai.io/features/embeddings/#bert-embeddings
  bert =
    let
      port = "8080";
      gguf = fetchurl {
        url = "https://huggingface.co/TheBloke/Luna-AI-Llama2-Uncensored-GGUF/resolve/main/luna-ai-llama2-uncensored.Q4_K_M.gguf";
        sha256 = "6a9dc401c84f0d48996eaa405174999c3a33bf12c2bfd8ea4a1e98f376de1f15";
      model = "embedding";
      model-configs.${model} = {
        # Note: q4_0 and q4_1 models cannot be loaded
        parameters.model = fetchurl {
          url = "https://huggingface.co/skeskinen/ggml/resolve/main/all-MiniLM-L6-v2/ggml-model-f16.bin";
          sha256 = "9c195b2453a4fef60a4f6be3a88a39211366214df6498a4fe4885c9e22314f50";
        };
        backend = "bert-embeddings";
        embeddings = true;
      };

      models = genModels model-configs;

      requests.request = {
        inherit model;
        input = "Your text string goes here";
      };
      models = linkFarmFromDrvs "models" [
        gguf
      ];
    in
    testers.runNixOSTest {
      name = self.name + "-llama";
      nodes.machine =
      name = self.name + "-bert";
      nodes.machine = {
        imports = [ common-config ];
        virtualisation.cores = 2;
        virtualisation.memorySize = 2048;
        services.local-ai.models = models;
      };
      passthru = { inherit models requests; };
      testScript =
        let
          cores = 4;
          port = "8080";
        in
        {
          virtualisation = {
            inherit cores;
            memorySize = 8192;
        ''
          machine.wait_for_open_port(${port})
          machine.succeed("curl -f http://localhost:${port}/readyz")
          machine.succeed("curl -f http://localhost:${port}/v1/models --output models.json")
          machine.succeed("${jq}/bin/jq --exit-status 'debug | .data[].id == \"${model}\"' models.json")
          machine.succeed("curl -f http://localhost:${port}/embeddings --json @${writers.writeJSON "request.json" requests.request} --output embeddings.json")
          machine.succeed("${jq}/bin/jq --exit-status 'debug | .model == \"${model}\"' embeddings.json")
        '';
    };
          systemd.services.local-ai = {
            wantedBy = [ "multi-user.target" ];
            serviceConfig.ExecStart = "${self}/bin/local-ai --debug --threads ${toString cores} --models-path ${models} --localai-config-dir . --address :${port}";

} // lib.optionalAttrs (!self.features.with_cublas && !self.features.with_clblas) {
  # https://localai.io/docs/getting-started/manual/
  llama =
    let
      model = "gpt-3.5-turbo";

      # https://localai.io/advanced/#full-config-model-file-reference
      model-configs.${model} = rec {
        context_size = 8192;
        parameters = {
          # https://huggingface.co/lmstudio-community/Meta-Llama-3-8B-Instruct-GGUF
          # https://ai.meta.com/blog/meta-llama-3/
          model = fetchurl {
            url = "https://huggingface.co/lmstudio-community/Meta-Llama-3-8B-Instruct-GGUF/resolve/main/Meta-Llama-3-8B-Instruct-Q4_K_M.gguf";
            sha256 = "ab9e4eec7e80892fd78f74d9a15d0299f1e22121cea44efd68a7a02a3fe9a1da";
          };
          # defaults from:
          # https://deepinfra.com/meta-llama/Meta-Llama-3-8B-Instruct
          temperature = 0.7;
          top_p = 0.9;
          top_k = 0;
          # the following parameter leads to outputs like: !!!!!!!!!!!!!!!!!!!
          #repeat_penalty = 1;
          presence_penalty = 0;
          frequency_penalty = 0;
          max_tokens = 100;
        };
        stopwords = [ "<|eot_id|>" ];
        template = {
          # Templates implement the following specifications
          # https://github.com/meta-llama/llama3/tree/main?tab=readme-ov-file#instruction-tuned-models
          # ... and are inspired by:
          # https://github.com/mudler/LocalAI/blob/master/embedded/models/llama3-instruct.yaml
          #
          # The rules for template evaluation are defined here:
          # https://pkg.go.dev/text/template
          chat_message = ''
            <|start_header_id|>{{.RoleName}}<|end_header_id|>

            {{.Content}}${builtins.head stopwords}'';

          chat = "<|begin_of_text|>{{.Input}}<|start_header_id|>assistant<|end_header_id|>";
        };
      };
      testScript =
        let

      models = genModels model-configs;

      requests = {
        # https://localai.io/features/text-generation/#chat-completions
          request-chat-completions = {
            model = gguf.name;
            messages = [{ role = "user"; content = "Say this is a test!"; }];
            temperature = 0.7;
        chat-completions = {
          inherit model;
          messages = [{ role = "user"; content = "1 + 2 = ?"; }];
        };
        # https://localai.io/features/text-generation/#edit-completions
          request-edit-completions = {
            model = gguf.name;
        edit-completions = {
          inherit model;
          instruction = "rephrase";
          input = "Black cat jumped out of the window";
            temperature = 0.7;
          max_tokens = 50;
        };
        # https://localai.io/features/text-generation/#completions
          request-completions = {
            model = gguf.name;
        completions = {
          inherit model;
          prompt = "A long time ago in a galaxy far, far away";
            temperature = 0.7;
        };
      };
    in
    testers.runNixOSTest {
      name = self.name + "-llama";
      nodes.machine = {
        imports = [ common-config ];
        virtualisation.cores = 4;
        virtualisation.memorySize = 8192;
        services.local-ai.models = models;
      };
      passthru = { inherit models requests; };
      testScript =
        let
          port = "8080";
        in
        ''
          machine.wait_for_open_port(${port})
          machine.succeed("curl -f http://localhost:${port}/readyz")
          machine.succeed("curl -f http://localhost:${port}/v1/models --output models.json")
          machine.succeed("${jq}/bin/jq --exit-status 'debug | .data[].id == \"${gguf.name}\"' models.json")
          machine.succeed("curl -f http://localhost:${port}/v1/chat/completions --json @${writers.writeJSON "request-chat-completions.json" request-chat-completions} --output chat-completions.json")
          machine.succeed("${jq}/bin/jq --exit-status 'debug | .data[].id == \"${model}\"' models.json")

          machine.succeed("curl -f http://localhost:${port}/v1/chat/completions --json @${writers.writeJSON "request-chat-completions.json" requests.chat-completions} --output chat-completions.json")
          machine.succeed("${jq}/bin/jq --exit-status 'debug | .object == \"chat.completion\"' chat-completions.json")
          machine.succeed("curl -f http://localhost:${port}/v1/edits --json @${writers.writeJSON "request-edit-completions.json" request-edit-completions} --output edit-completions.json")
          machine.succeed("${jq}/bin/jq --exit-status 'debug | .choices | first.message.content | tonumber == 3' chat-completions.json")

          machine.succeed("curl -f http://localhost:${port}/v1/edits --json @${writers.writeJSON "request-edit-completions.json" requests.edit-completions} --output edit-completions.json")
          machine.succeed("${jq}/bin/jq --exit-status 'debug | .object == \"edit\"' edit-completions.json")
          machine.succeed("curl -f http://localhost:${port}/v1/completions --json @${writers.writeJSON "request-completions.json" request-completions} --output completions.json")
          machine.succeed("${jq}/bin/jq --exit-status '.usage.completion_tokens | debug == ${toString requests.edit-completions.max_tokens}' edit-completions.json")

          machine.succeed("curl -f http://localhost:${port}/v1/completions --json @${writers.writeJSON "request-completions.json" requests.completions} --output completions.json")
          machine.succeed("${jq}/bin/jq --exit-status 'debug | .object ==\"text_completion\"' completions.json")
          machine.succeed("${jq}/bin/jq --exit-status '.usage.completion_tokens | debug == ${toString model-configs.${model}.parameters.max_tokens}' completions.json")
        '';
    };

} // lib.optionalAttrs self.features.with_tts {
} // lib.optionalAttrs (self.features.with_tts && !self.features.with_cublas && !self.features.with_clblas) {
  # https://localai.io/features/text-to-audio/#piper
  tts =
    let
      port = "8080";
      voice-en-us = fetchzip {
        url = "https://github.com/rhasspy/piper/releases/download/v0.0.2/voice-en-us-danny-low.tar.gz";
        hash = "sha256-5wf+6H5HeQY0qgdqnAG1vSqtjIFM9lXH53OgouuPm0M=";
        stripRoot = false;
      };
      ggml-tiny-en = fetchurl {
      model-stt = "whisper-en";
      model-configs.${model-stt} = {
        backend = "whisper";
        parameters.model = fetchurl {
          url = "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny.en-q5_1.bin";
          hash = "sha256-x3xXZvHO8JtrfUfyG1Rsvd1BV4hrO11tT3CekeZsfCs=";
        };
      whisper-en = {
        name = "whisper-en";
        backend = "whisper";
        parameters.model = ggml-tiny-en.name;
      };
      models = symlinkJoin {
        name = "models";

      model-tts = "piper-en";
      model-configs.${model-tts} = {
        backend = "piper";
        parameters.model = "en-us-danny-low.onnx";
      };

      models =
        let
          models = genModels model-configs;
        in
        symlinkJoin {
          inherit (models) name;
          paths = [
          voice-en-us
          (linkFarmFromDrvs "whisper-en" [
            (writers.writeYAML "whisper-en.yaml" whisper-en)
            ggml-tiny-en
          ])
            models
            (fetchzip {
              url = "https://github.com/rhasspy/piper/releases/download/v0.0.2/voice-en-us-danny-low.tar.gz";
              hash = "sha256-5wf+6H5HeQY0qgdqnAG1vSqtjIFM9lXH53OgouuPm0M=";
              stripRoot = false;
            })
          ];
        };

      requests.request = {
        model = model-tts;
        input = "Hello, how are you?";
      };
    in
    testers.runNixOSTest {
      name = self.name + "-tts";
      nodes.machine =
        let
          cores = 2;
        in
        {
          virtualisation = {
            inherit cores;
          };
          systemd.services.local-ai = {
            wantedBy = [ "multi-user.target" ];
            serviceConfig.ExecStart = "${self}/bin/local-ai --debug --threads ${toString cores} --models-path ${models} --localai-config-dir . --address :${port}";
          };
      nodes.machine = {
        imports = [ common-config ];
        virtualisation.cores = 2;
        services.local-ai.models = models;
      };
      passthru = { inherit models requests; };
      testScript =
        let
          request = {
            model = "en-us-danny-low.onnx";
            backend = "piper";
            input = "Hello, how are you?";
          };
          port = "8080";
        in
        ''
          machine.wait_for_open_port(${port})
          machine.succeed("curl -f http://localhost:${port}/readyz")
          machine.succeed("curl -f http://localhost:${port}/tts --json @${writers.writeJSON "request.json" request} --output out.wav")
          machine.succeed("curl -f http://localhost:${port}/v1/audio/transcriptions --header 'Content-Type: multipart/form-data' --form file=@out.wav --form model=${whisper-en.name} --output transcription.json")
          machine.succeed("${jq}/bin/jq --exit-status 'debug | .segments | first.text == \"${request.input}\"' transcription.json")
          machine.succeed("curl -f http://localhost:${port}/v1/models --output models.json")
          machine.succeed("${jq}/bin/jq --exit-status 'debug' models.json")
          machine.succeed("curl -f http://localhost:${port}/tts --json @${writers.writeJSON "request.json" requests.request} --output out.wav")
          machine.succeed("curl -f http://localhost:${port}/v1/audio/transcriptions --header 'Content-Type: multipart/form-data' --form file=@out.wav --form model=${model-stt} --output transcription.json")
          machine.succeed("${jq}/bin/jq --exit-status 'debug | .segments | first.text == \"${requests.request.input}\"' transcription.json")
        '';
    };
}
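Finally, a small sketch of selecting the VM tests defined above from a nixpkgs checkout; the ./. import path is a placeholder. Note that llama and tts only exist under the feature conditions in the optionalAttrs guards.

# Hypothetical expression; ./. stands for a nixpkgs checkout.
let
  pkgs = import ./. { };
in
{
  inherit (pkgs.local-ai.tests) version health bert;
  # llama additionally requires a build without cuBLAS/CLBlast;
  # tts requires the same plus with_tts; include them only for such variants.
}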