Unverified Commit 108d0d4e authored by Luna Nova's avatar Luna Nova Committed by GitHub
Browse files

rocmPackages.{clr,migraphx,miopen,rocm-comgr}: fix runtime hiprtc failures (#498395)

parents 28f73fb6 c1c21ed5
Loading
Loading
Loading
Loading
+8 −3
Original line number Diff line number Diff line
@@ -207,12 +207,13 @@ stdenv.mkDerivation (finalAttrs: {
    ln -s ${hipClang} $out/llvm
  '';

  # libamdhip64.so dlopens its own bare name for hipGetProcAddress symbol resolution.
  # Add its own directory to its RPATH so it can find itself
  # libamdhip64.so dlopens its own bare name for hipGetProcAddress symbol resolution,
  # same pattern with libhiprtc.so, so add own lib directory to all .so's
  # RPATHs so they can find themselves and neighbouring libs
  # Must be in postFixup so it runs after patchelf --shrink-rpath which removes
  # the apparently useless rpath
  postFixup = ''
    patchelf --add-rpath "$out/lib" "$out/lib/libamdhip64.so"
    patchelf --add-rpath "$out/lib" "$out"/lib/*.so
  '';

  disallowedRequisites = [
@@ -294,6 +295,10 @@ stdenv.mkDerivation (finalAttrs: {
          "amdgcnspirv"
        ];
      };
      hiprtc-type-traits = callPackage ./test-hiprtc-type-traits.nix {
        clr = finalAttrs.finalPackage;
        inherit rocm-smi;
      };
    };

    selectGpuTargets =
+68 −0
Original line number Diff line number Diff line
#include <hip/hip_runtime.h>
#include <hip/hiprtc.h>
#include <iostream>
#include <string>

// Evaluates a HIP runtime call exactly once. If the result is anything other
// than hipSuccess, prints the failing expression followed by the runtime's
// description of the error (hipGetErrorString) to stderr and returns 1 from
// the enclosing function. Only usable inside a function returning int.
#define CHECK_HIP(expr) do { \
  const hipError_t _err = (expr); \
  if (_err != hipSuccess) { \
    std::cerr << #expr << " failed: " << hipGetErrorString(_err) << std::endl; \
    return 1; \
  } \
} while(0)

// Evaluates a hiprtc call exactly once. On any result other than
// HIPRTC_SUCCESS it prints the failing expression and hiprtcGetErrorString,
// then tries to fetch and print the program's compile log, and finally
// returns 1 from the enclosing function.
//
// Requirements on the call site: a `hiprtcProgram prog` and a writable
// `size_t log_size` must be in scope, and the enclosing function must
// return int.
//
// The log is only printed when both log queries succeed, and it is printed
// as a C string so that a terminator written into the buffer is not emitted.
// NOTE(review): assumes the size reported by hiprtcGetProgramLogSize counts
// the trailing NUL, as NVRTC documents - confirm against the hiprtc docs.
#define CHECK_HIPRTC(expr) do { \
  const hiprtcResult _res = (expr); \
  if (_res != HIPRTC_SUCCESS) { \
    std::cerr << #expr << " failed: " << hiprtcGetErrorString(_res) << std::endl; \
    log_size = 0; \
    if (hiprtcGetProgramLogSize(prog, &log_size) == HIPRTC_SUCCESS && log_size > 0) { \
      std::string log(log_size, '\0'); \
      if (hiprtcGetProgramLog(prog, log.data()) == HIPRTC_SUCCESS) { \
        std::cerr << "Compile log:\n" << log.c_str() << std::endl; \
      } \
    } \
    return 1; \
  } \
} while(0)

// HIP source handed to hiprtcCreateProgram in main() and compiled at runtime.
// The kernel includes <type_traits> and static_asserts on
// std::remove_const/std::is_same, so compilation only succeeds when the
// runtime compiler can resolve that header; at run time it writes 5 to out[0].
static const char* kernelSource = R"(
  #include <type_traits>

  extern "C" __global__ void test_kernel(int* out) {
    static_assert(std::is_same<int, std::remove_const<const int>::type>::value,
                  "type_traits not working");
    out[0] = 5;
  }
)";

int main() {
  hiprtcProgram prog;
  size_t log_size = 0;
  CHECK_HIPRTC(hiprtcCreateProgram(&prog, kernelSource, "test.hip", 0, nullptr, nullptr));
  CHECK_HIPRTC(hiprtcCompileProgram(prog, 0, nullptr));

  size_t code_size;
  CHECK_HIPRTC(hiprtcGetCodeSize(prog, &code_size));
  std::string code(code_size, '\0');
  CHECK_HIPRTC(hiprtcGetCode(prog, code.data()));
  hiprtcDestroyProgram(&prog);

  hipModule_t module;
  hipFunction_t kernel;
  CHECK_HIP(hipModuleLoadData(&module, code.data()));
  CHECK_HIP(hipModuleGetFunction(&kernel, module, "test_kernel"));

  int* d_out;
  int h_out = 0;
  CHECK_HIP(hipMalloc(&d_out, sizeof(int)));
  void* args[] = { &d_out };
  CHECK_HIP(hipModuleLaunchKernel(kernel, 1, 1, 1, 1, 1, 1, 0, nullptr, args, nullptr));
  CHECK_HIP(hipMemcpy(&h_out, d_out, sizeof(int), hipMemcpyDeviceToHost));

  if (h_out != 5) {
    std::cerr << "Kernel output mismatch: expected 5, got " << h_out << std::endl;
    return 1;
  }

  std::cout << "HIPRTC type_traits test passed (output=" << h_out << ")" << std::endl;
  return 0;
}
+57 −0
Original line number Diff line number Diff line
{
  lib,
  stdenv,
  makeImpureTest,
  clr,
  rocm-smi,
}:
# Minimal hiprtc test: compiles a kernel that uses <type_traits> at runtime
# and runs it. This mirrors what a migraphx workload does, but only requires
# building up to clr, which makes it quicker to iterate on and confirm fixes.
let
  # Small program built from test-hiprtc-type-traits.cpp with hipcc (taken from
  # clr) and linked against libhiprtc; installed as $out/bin/hiprtc-test.
  hiprtc-test = stdenv.mkDerivation {
    pname = "hiprtc-type-traits-test";
    version = "0";

    # The single source file is referenced by path in buildPhase, so there is
    # no source tree to unpack.
    dontUnpack = true;

    nativeBuildInputs = [ clr ];

    buildPhase = ''
      runHook preBuild
      hipcc -o hiprtc-test ${./test-hiprtc-type-traits.cpp} -lhiprtc
      runHook postBuild
    '';

    installPhase = ''
      runHook preInstall
      mkdir -p $out/bin
      cp hiprtc-test $out/bin/
      runHook postInstall
    '';
  };
in
makeImpureTest {
  name = "hiprtc-type-traits";
  testedPackage = "rocmPackages.clr";

  # Host paths made available to the test.
  # NOTE(review): presumably these expose the GPU device nodes and sysfs to the
  # sandbox - confirm against makeImpureTest.
  sandboxPaths = [
    "/sys"
    "/dev/dri"
    "/dev/kfd"
  ];

  nativeBuildInputs = [
    hiprtc-test
    rocm-smi
  ];

  # Runs rocm-smi first, then the hiprtc test binary built above.
  testScript = ''
    rocm-smi
    hiprtc-test
  '';

  meta = {
    teams = [ lib.teams.rocm ];
  };
}
+7 −0
Original line number Diff line number Diff line
{
  lib,
  stdenv,
  callPackage,
  fetchFromGitHub,
  rocmUpdateScript,
  pkg-config,
@@ -183,6 +184,12 @@ stdenv.mkDerivation (finalAttrs: {
      patchelf $test/bin/test_* --shrink-rpath --allowed-rpath-prefixes "$NIX_STORE"
    '';

  passthru.impureTests = {
    # NIXPKGS_ALLOW_UNFREE=1 bash $(nix-build -A rocmPackages.migraphx.impureTests.migraphx-driver)
    migraphx-driver = callPackage ./test-migraphx-driver.nix {
      migraphx = finalAttrs.finalPackage;
    };
  };
  passthru.updateScript = rocmUpdateScript {
    name = finalAttrs.pname;
    inherit (finalAttrs.src) owner;
+47 −0
Original line number Diff line number Diff line
{
  lib,
  fetchurl,
  makeImpureTest,
  writableTmpDirAsHomeHook,
  migraphx,
  clr,
  rocm-smi,
}:

# Impure test verifying that a ≈50MiB ResNet-18 ONNX model can be run with
# migraphx via `migraphx-driver verify`.
let
  # Model fetched from Hugging Face and pinned by hash. It is marked unfree.
  # NOTE(review): presumably this is why NIXPKGS_ALLOW_UNFREE=1 is needed to
  # build the test - confirm.
  resnet18 = fetchurl {
    url = "https://huggingface.co/onnxmodelzoo/resnet18_Opset18_timm/resolve/main/resnet18_Opset18_timm.onnx";
    hash = "sha256-u2Io20n72qoA9atRsFIWb0zHF1WdJYgHQdMWfJhJGHA=";
    meta.license = lib.licenses.unfree;
  };
in
makeImpureTest {
  name = "migraphx-driver";
  testedPackage = "rocmPackages.migraphx";

  # Host paths made available to the test.
  # NOTE(review): presumably these expose the GPU device nodes and sysfs to the
  # sandbox - confirm against makeImpureTest.
  sandboxPaths = [
    "/sys"
    "/dev/dri"
    "/dev/kfd"
  ];

  nativeBuildInputs = [
    writableTmpDirAsHomeHook
    migraphx
    clr
    rocm-smi
  ];

  # FIXME(@LunNova): the tolerance values are set too high - high divergence
  # was seen on an iGPU. For now this test is meant to verify that workloads
  # run at all; the accuracy problem still needs to be investigated.
  testScript = ''
    rocm-smi
    migraphx-driver verify -O --rms-tol 0.03 --atol 1.0 --rtol 0.01 ${resnet18}
  '';

  meta = {
    teams = [ lib.teams.rocm ];
  };
}
Loading