Unverified commit 6713c5e4, authored by Luna Nova
Browse files

rocmPackages.rocm-runtime: apply fix for segfault when queue allocation fails

Found by @06kellyjac, possibly triggered by a bug in linux-6.18.4

https://lore.kernel.org/linux-iommu/870872aa-28e9-412a-bac6-8020bf560e4f@amd.com/t/

Applying this patch *will not fix the underlying issue*, but it does turn the
segfault into an actual error result, for example:

ROCm error: out of memory
  current device: 0, in function stream at /build/source/ggml/src/ggml-hip/../ggml-cuda/common.cuh:1345
  hipStreamCreateWithFlags(&streams[device][stream], 0x01)
parent b17c41cc
Loading
Loading
Loading
Loading
+3 −0
Original line number Diff line number Diff line
@@ -53,6 +53,9 @@ stdenv.mkDerivation (finalAttrs: {
  ];

  patches = [
    # Vendored upstream PR that fixes a segfault when queue allocation fails
    # https://github.com/ROCm/rocm-systems/pull/2850
    ./queue-failure.patch
    (fetchpatch {
      # [PATCH] rocr: Extend HIP ISA compatibility check
      sha256 = "sha256-8r2Lb5lBfFaZC3knCxfXGcnkzNv6JxOKyJn2rD5gus4=";
+71 −0
Original line number Diff line number Diff line
From b7717446c85d08b2d7c0c60ba3ac0eff11ee6120 Mon Sep 17 00:00:00 2001
From: Luna Nova <git@lunnova.dev>
Date: Tue, 20 Jan 2026 12:55:45 -0800
Subject: [PATCH 1/2] rocm-runtime: fix crash in QueueCreate due to trying to
 free non allocated scratch

Check that scratch.main_queue_base != nullptr before calling ReleaseQueueMainScratch,
because ReleaseQueueMainScratch is only valid if main_queue_base is set
and the scope guard can also fire when allocating the queue itself fails.
---
 .../runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp        | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp b/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp
index 01b01fe869..83db40dacc 100644
--- a/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp
+++ b/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp
@@ -1792,7 +1792,9 @@ hsa_status_t GpuAgent::QueueCreate(size_t size, hsa_queue_type32_t queue_type, u
   scratch.main_queue_base = nullptr;
   scratch.main_queue_process_offset = 0;
 
-  MAKE_NAMED_SCOPE_GUARD(scratchGuard, [&]() { ReleaseQueueMainScratch(scratch); });
+  MAKE_NAMED_SCOPE_GUARD(scratchGuard, [&]() {
+    if (scratch.main_queue_base != nullptr) ReleaseQueueMainScratch(scratch);
+  });
 
   if (scratch.main_size != 0) {
     AcquireQueueMainScratch(scratch);
-- 
2.52.0


From 9c1746cd76a703e4d2321dc2ffe85fc61bfd2f21 Mon Sep 17 00:00:00 2001
From: Luna Nova <git@lunnova.dev>
Date: Tue, 20 Jan 2026 13:00:32 -0800
Subject: [PATCH 2/2] rocm-runtime: log for errors in QueueCreate

---
 .../runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp   | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp b/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp
index 83db40dacc..ae68732eb5 100644
--- a/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp
+++ b/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp
@@ -1799,6 +1799,9 @@ hsa_status_t GpuAgent::QueueCreate(size_t size, hsa_queue_type32_t queue_type, u
   if (scratch.main_size != 0) {
     AcquireQueueMainScratch(scratch);
     if (scratch.main_queue_base == nullptr) {
+      LogPrint(HSA_AMD_LOG_FLAG_INFO,
+               "Failed to allocate scratch memory for queue, size=%zu, node=%u",
+               scratch.main_size, node_id());
       return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
     }
   }
@@ -1827,7 +1830,11 @@ hsa_status_t GpuAgent::QueueCreate(size_t size, hsa_queue_type32_t queue_type, u
             node_id()));
   }
 
-  if (!shared_queue) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
+  if (!shared_queue) {
+    LogPrint(HSA_AMD_LOG_FLAG_INFO,
+             "Failed to allocate shared queue descriptor memory, node=%u", node_id());
+    return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
+  }
 
   auto aql_queue = new AqlQueue(shared_queue, this, size, node_id(), scratch, event_callback, data,
                                 flags);
-- 
2.52.0