Unverified Commit 4d404975 authored by j-k's avatar j-k Committed by GitHub
Browse files

llama-swap: 176 -> 182 (#477972)

parents 6d4db03a 7fe87ba2
Loading
Loading
Loading
Loading
+34 −31
Original line number Diff line number Diff line
@@ -55,6 +55,10 @@ in
                llama-server = lib.getExe' llama-cpp "llama-server";
              in
              {
                # more useful logging output
                logLevel = "debug";
                logToStdout = "both";

                hooks.on_startup.preload = [
                  "smollm2"
                ];
@@ -62,14 +66,14 @@ in
                models = {
                  "smollm2" = {
                    ttl = 10;
                    cmd = "${llama-server} --port \${PORT} -m ${smollm2-135m} --no-webui --temp 0.2 --top-k 9";
                    cmd = "${llama-server} --port \${PORT} -m ${smollm2-135m} --alias smollm2 --no-webui --temp 0.2 --top-k 9";
                  };
                  "smollm2-group-1" = {
                    cmd = "${llama-server} --port \${PORT} -m ${smollm2-135m} --no-webui --temp 0.2 --top-k 9 -c 1024";
                    cmd = "${llama-server} --port \${PORT} -m ${smollm2-135m} --alias smollm2-group-1 --no-webui --temp 0.2 --top-k 9 -c 1024";
                  };
                  "smollm2-group-2" = {
                    proxy = "http://127.0.0.1:5802";
                    cmd = "${llama-server} --port 5802 -m ${smollm2-135m} --no-webui --temp 0.2 --top-k 9 -c 1024";
                    cmd = "${llama-server} --port 5802 -m ${smollm2-135m} --alias smollm2-group-2 --no-webui --temp 0.2 --top-k 9 -c 1024";
                  };
                };
                groups = {
@@ -115,7 +119,6 @@ in
          '-s',
          '--fail',
          '-H "Content-Type: application/json"',
          '-H "Authorization: Bearer no-key"',
          "-d '{d}'".format(d=json.dumps(data))
        ]
        return json.loads(machine.succeed('curl {args} http://localhost:8080{route}'.format(args=" ".join(args), route=route)))
@@ -127,7 +130,8 @@ in
        machine.succeed('curl --fail http:/localhost:8080/ui/')

      with subtest('check is healthy'):
        machine.wait_until_succeeds('curl --silent --fail http://localhost:8080/health | grep "OK"')
        response_healthy = machine.wait_until_succeeds('curl --silent --fail http://localhost:8080/health')
        assert 'OK' in response_healthy, '/health was not OK'

    ''
    + lib.optionalString useSmollm2-135m ''
@@ -135,10 +139,10 @@ in
      with subtest('check `/running` for preloaded smollm2'):
        machine.wait_until_succeeds('curl --silent --fail http://localhost:8080/running | grep "smollm2"')
        running_response = get_json('/running')
        assert len(running_response['running']) == 1
        assert len(running_response['running']) == 1, f'running doesn\'t have one entry as expected: {running_response}'
        running_model = running_response['running'][0]
        assert running_model['model'] == 'smollm2'
        assert running_model['state'] == 'ready'
        assert running_model['model'] == 'smollm2', f'running model is not smollm2: {running_response}'
        assert running_model['state'] == 'ready', f'running smollm2 is not ready: {running_response}'

      with subtest('runs smollm2'):
        response = None
@@ -155,31 +159,30 @@ in
          response = post_json('/v1/chat/completions', data)

        with subtest('response is from smollm2'):
          assert response['model'] == 'smollm2'
          assert response['model'] == 'smollm2', f'response was not from smollm2: {response['model']}'

        with subtest('response contains at least one item in "choices"'):
          assert len(response['choices']) >= 1
          assert len(response['choices']) >= 1, f'response had no choices: {response}'

        assistant_choices = None
        with subtest('response contains at least one "assistant" message'):
          assistant_choices = [c for c in response['choices'] if c['message']['role'] == 'assistant']
          assert len(assistant_choices) >= 1
          assert len(assistant_choices) >= 1, f'response had no assistant message: {response}'

        with subtest('first message (lowercase) starts with "hello"'):
          assert assistant_choices[0]['message']['content'].lower()[:5] == 'hello'
          assert assistant_choices[0]['message']['content'].lower()[:5] == 'hello', f'response didn\'t start with hello: {response}'

        with subtest('check `/running` for just smollm2'):
          running_response = get_json('/running')
          assert len(running_response['running']) == 1
          assert len(running_response['running']) == 1, f'running doesn\'t have one entry as expected: {running_response}'
          running_model = running_response['running'][0]
          assert running_model['model'] == 'smollm2'
          assert running_model['state'] == 'ready'
          assert running_model['model'] == 'smollm2', f'running model is not smollm2: {running_response}'
          assert running_model['state'] == 'ready', f'running smollm2 is not ready: {running_response}'

      with subtest('check `/running` for smollm2 to timeout'):
        machine.succeed('curl --silent --fail http://localhost:8080/running | grep "smollm2"')
        machine.wait_until_succeeds('curl --silent --fail http://localhost:8080/running | grep -v "smollm2"', timeout=11)
        running_response = get_json('/running')
        assert len(running_response['running']) == 0
        assert len(running_response['running']) == 0, "smollm2 was still running after timeout"

      with subtest('runs smollm2-group-1 and smollm2-group-2'):
        response_1 = None
@@ -196,25 +199,25 @@ in
          response_1 = post_json('/v1/chat/completions', data)

        with subtest('response 1 is from smollm2-group-1'):
          assert response_1['model'] == 'smollm2-group-1'
          assert response_1['model'] == 'smollm2-group-1', f'response was not from smollm2-group-1: {response_1['model']}'

        with subtest('response 1 contains at least one item in "choices"'):
          assert len(response['choices']) >= 1
          assert len(response_1['choices']) >= 1, f'response had no choices: {response_1}'

        assistant_choices_1 = None
        with subtest('response 1 contains at least one "assistant" message'):
          assistant_choices_1 = [c for c in response_1['choices'] if c['message']['role'] == 'assistant']
          assert len(assistant_choices_1) >= 1
          assert len(assistant_choices_1) >= 1, f'response had no assistant message: {response_1}'

        with subtest('first message (lowercase) in response 1 starts with "hello"'):
          assert assistant_choices_1[0]['message']['content'].lower()[:5] == 'hello'
          assert assistant_choices_1[0]['message']['content'].lower()[:5] == 'hello', f'response didn\'t start with hello: {response_1}'

        with subtest('check `/running` for just smollm2-group-1'):
          running_response = get_json('/running')
          assert len(running_response['running']) == 1
          assert len(running_response['running']) == 1, f'running doesn\'t have one entry as expected: {running_response}'
          running_model = running_response['running'][0]
          assert running_model['model'] == 'smollm2-group-1'
          assert running_model['state'] == 'ready'
          assert running_model['model'] == 'smollm2-group-1', f'running model is not smollm2-group-1: {running_response}'
          assert running_model['state'] == 'ready', f'running smollm2-group-1 is not ready: {running_response}'

        response_2 = None
        with subtest('send request to smollm2-group-2'):
@@ -230,29 +233,29 @@ in
          response_2 = post_json('/v1/chat/completions', data)

        with subtest('response 2 is from smollm2-group-2'):
          assert response_2['model'] == 'smollm2-group-2'
          assert response_2['model'] == 'smollm2-group-2', f'response was not from smollm2-group-2: {response_2['model']}'

        with subtest('response 2 contains at least one item in "choices"'):
          assert len(response['choices']) >= 1
          assert len(response_2['choices']) >= 1, f'response had no choices: {response_2}'

        assistant_choices_2 = None
        with subtest('response 2 contains at least one "assistant" message'):
          assistant_choices_2 = [c for c in response_2['choices'] if c['message']['role'] == 'assistant']
          assert len(assistant_choices_2) >= 1
          assert len(assistant_choices_2) >= 1, f'response had no assistant message: {response_2}'

        with subtest('first message (lowercase) in response 1 starts with "hello"'):
          assert assistant_choices_2[0]['message']['content'].lower()[:5] == 'hello'
          assert assistant_choices_2[0]['message']['content'].lower()[:5] == 'hello', f'response didn\'t start with hello: {response_2}'

        with subtest('check `/running` for both smollm2-group-1 and smollm2-group-2'):
          running_response = get_json('/running')['running']
          assert len(running_response) == 2
          assert len(running_response) == 2, 'expected both in smollm2 group to be running'
          assert len([
            rm for rm in running_response
            if rm['state'] == 'ready' and rm['model'] == 'smollm2-group-1'
          ]) == 1
          ]) == 1, f'smollm2-group-1 not in running group and ready {running_response}'
          assert len([
            rm for rm in running_response
            if rm['state'] == 'ready' and rm['model'] == 'smollm2-group-2'
          ]) == 1
          ]) == 1, f'smollm2-group-2 not in running group and ready {running_response}'
    '';
}
+3 −3
Original line number Diff line number Diff line
@@ -17,7 +17,7 @@ let
in
buildGoModule (finalAttrs: {
  pname = "llama-swap";
  version = "176";
  version = "182";

  outputs = [
    "out"
@@ -28,7 +28,7 @@ buildGoModule (finalAttrs: {
    owner = "mostlygeek";
    repo = "llama-swap";
    tag = "v${finalAttrs.version}";
    hash = "sha256-nfkuaiEITOmpkiLft3iNW1VUexHwZ36c8gwcQKGANbQ=";
    hash = "sha256-w/VQS8uCpgniwLiJsH/8IG/AGasRxjCv7fADTfpvWLw=";
    # populate values that require us to use git. By doing this in postFetch we
    # can delete .git afterwards and maintain better reproducibility of the src.
    leaveDotGit = true;
@@ -41,7 +41,7 @@ buildGoModule (finalAttrs: {
    '';
  };

  vendorHash = "sha256-/EbFyuCVFxHTTO0UwSV3B/6PYUpudxB2FD8nNx1Bb+M=";
  vendorHash = "sha256-XiDYlw/byu8CWvg4KSPC7m8PGCZXtp08Y1velx4BR8U=";

  passthru.ui = callPackage ./ui.nix { llama-swap = finalAttrs.finalPackage; };