Commit fcfeb1e5 authored by Fabian Mora's avatar Fabian Mora
Browse files

[mlir][gpu] Add GPU target support to `gpu-to-llvm`.

**For an explanation of these patches see D154153.**

This patch modifies the lowering of `gpu.module` & `gpu.launch_func` in the `gpu-to-llvm` pass,
allowing the usage of the new GPU compilation mechanism in the patch series ending in D154153.

Instead of removing Modules, this patch preserves the module if it has target attributes so that the
`gpu-module-to-binary` pass can later serialize them.

Instead of lowering the kernel calls to the LLVM dialect, this patch primarily updates the operation's
arguments, leaving the job of converting the operation into LLVM instructions to the translation stage.
The reason for not lowering the operation to LLVM at this stage is that kernel launches do not have a
single one-to-one representation in LLVM. For example, a kernel launch can be represented by a call
to a kernel stub, like in CUDA or HIP.
Kernel launches are also intrinsically linked to the binary associated with the call, and the binaries are
converted during translation.

Depends on D154149

Reviewed By: mehdi_amini

Differential Revision: https://reviews.llvm.org/D154152
parent 43752a2a
Loading
Loading
Loading
Loading
+4 −4
Original line number Diff line number Diff line
@@ -66,10 +66,10 @@ struct FunctionCallBuilder {

/// Collect a set of patterns to convert from the GPU dialect to LLVM and
/// populate converter for gpu types.
void populateGpuToLLVMConversionPatterns(LLVMTypeConverter &converter,
                                         RewritePatternSet &patterns,
                                         StringRef gpuBinaryAnnotation = {},
                                         bool kernelBarePtrCallConv = false);
void populateGpuToLLVMConversionPatterns(
    LLVMTypeConverter &converter, RewritePatternSet &patterns,
    StringRef gpuBinaryAnnotation = {}, bool kernelBarePtrCallConv = false,
    SymbolTable *cachedModuleTable = nullptr);

/// A function that maps a MemorySpace enum to a target-specific integer value.
using MemorySpaceMapping = std::function<unsigned(gpu::AddressSpace)>;
+76 −8
Original line number Diff line number Diff line
@@ -450,10 +450,12 @@ class ConvertLaunchFuncOpToGpuRuntimeCallPattern
public:
  ConvertLaunchFuncOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter,
                                             StringRef gpuBinaryAnnotation,
                                             bool kernelBarePtrCallConv)
                                             bool kernelBarePtrCallConv,
                                             SymbolTable *cachedModuleTable)
      : ConvertOpToGpuRuntimeCallPattern<gpu::LaunchFuncOp>(typeConverter),
        gpuBinaryAnnotation(gpuBinaryAnnotation),
        kernelBarePtrCallConv(kernelBarePtrCallConv) {}
        kernelBarePtrCallConv(kernelBarePtrCallConv),
        cachedModuleTable(cachedModuleTable) {}

private:
  Value generateParamsArray(gpu::LaunchFuncOp launchOp, OpAdaptor adaptor,
@@ -467,6 +469,7 @@ private:

  llvm::SmallString<32> gpuBinaryAnnotation;
  bool kernelBarePtrCallConv;
  SymbolTable *cachedModuleTable;
};

class EraseGpuModuleOpPattern : public OpRewritePattern<gpu::GPUModuleOp> {
@@ -571,7 +574,23 @@ void GpuToLLVMConversionPass::runOnOperation() {
  RewritePatternSet patterns(&getContext());
  LLVMConversionTarget target(getContext());

  target.addIllegalDialect<gpu::GPUDialect>();
  SymbolTable symbolTable = SymbolTable(getOperation());
  // Preserve GPU modules if they have target attributes.
  target.addDynamicallyLegalOp<gpu::GPUModuleOp>(
      [](gpu::GPUModuleOp module) -> bool {
        return module.getTargetsAttr() != nullptr;
      });
  // Accept as legal LaunchFuncOps if they refer to GPU Modules with targets and
  // the operands have been lowered.
  target.addDynamicallyLegalOp<gpu::LaunchFuncOp>(
      [&](gpu::LaunchFuncOp op) -> bool {
        auto module =
            symbolTable.lookup<gpu::GPUModuleOp>(op.getKernelModuleName());
        return converter.isLegal(op->getOperandTypes()) &&
               converter.isLegal(op->getResultTypes()) &&
               (module && module.getTargetsAttr() &&
                module.getTargetsAttr().size());
      });

  mlir::arith::populateArithToLLVMConversionPatterns(converter, patterns);
  mlir::cf::populateControlFlowToLLVMConversionPatterns(converter, patterns);
@@ -581,7 +600,7 @@ void GpuToLLVMConversionPass::runOnOperation() {
  populateAsyncStructuralTypeConversionsAndLegality(converter, patterns,
                                                    target);
  populateGpuToLLVMConversionPatterns(converter, patterns, gpuBinaryAnnotation,
                                      kernelBarePtrCallConv);
                                      kernelBarePtrCallConv, &symbolTable);

  if (failed(
          applyPartialConversion(getOperation(), target, std::move(patterns))))
@@ -1069,10 +1088,58 @@ LogicalResult ConvertLaunchFuncOpToGpuRuntimeCallPattern::matchAndRewrite(

  // Create an LLVM global with CUBIN extracted from the kernel annotation and
  // obtain a pointer to the first byte in it.
  auto kernelModule = SymbolTable::lookupNearestSymbolFrom<gpu::GPUModuleOp>(
  gpu::GPUModuleOp kernelModule;
  if (cachedModuleTable)
    kernelModule = cachedModuleTable->lookup<gpu::GPUModuleOp>(
        launchOp.getKernelModuleName());
  else
    kernelModule = SymbolTable::lookupNearestSymbolFrom<gpu::GPUModuleOp>(
        launchOp, launchOp.getKernelModuleName());
  assert(kernelModule && "expected a kernel module");

  // If the module has Targets then just update the op operands.
  if (ArrayAttr targets = kernelModule.getTargetsAttr()) {
    Value stream = Value();
    if (adaptor.getAsyncDependencies().size())
      stream = adaptor.getAsyncDependencies().front();
    // If the async keyword is present and there are no dependencies, then a
    // stream must be created to pass to subsequent operations.
    else if (launchOp.getAsyncToken())
      stream = streamCreateCallBuilder.create(loc, rewriter, {}).getResult();

    // Lower the kernel operands to match kernel parameters.
    SmallVector<Value, 4> arguments;
    if (kernelBarePtrCallConv) {
      // Hack the bare pointer value on just for the argument promotion
      LLVMTypeConverter *converter = getTypeConverter();
      LowerToLLVMOptions options = converter->getOptions();
      LowerToLLVMOptions overrideToMatchKernelOpts = options;
      overrideToMatchKernelOpts.useBarePtrCallConv = true;
      converter->dangerousSetOptions(overrideToMatchKernelOpts);
      arguments =
          converter->promoteOperands(loc, launchOp.getKernelOperands(),
                                     adaptor.getKernelOperands(), rewriter);
      converter->dangerousSetOptions(options);
    } else {
      arguments = getTypeConverter()->promoteOperands(
          loc, launchOp.getKernelOperands(), adaptor.getKernelOperands(),
          rewriter);
    }

    rewriter.create<gpu::LaunchFuncOp>(
        launchOp.getLoc(), launchOp.getKernelAttr(),
        gpu::KernelDim3{adaptor.getGridSizeX(), adaptor.getGridSizeY(),
                        adaptor.getGridSizeZ()},
        gpu::KernelDim3{adaptor.getBlockSizeX(), adaptor.getBlockSizeY(),
                        adaptor.getBlockSizeZ()},
        adaptor.getDynamicSharedMemorySize(), arguments, stream);
    if (launchOp.getAsyncToken())
      rewriter.replaceOp(launchOp, {stream});
    else
      rewriter.eraseOp(launchOp);
    return success();
  }

  auto binaryAttr =
      kernelModule->getAttrOfType<StringAttr>(gpuBinaryAnnotation);
  if (!binaryAttr) {
@@ -1845,7 +1912,8 @@ LogicalResult ConvertSetCsrPointersOpToGpuRuntimeCallPattern::matchAndRewrite(
void mlir::populateGpuToLLVMConversionPatterns(LLVMTypeConverter &converter,
                                               RewritePatternSet &patterns,
                                               StringRef gpuBinaryAnnotation,
                                               bool kernelBarePtrCallConv) {
                                               bool kernelBarePtrCallConv,
                                               SymbolTable *cachedModuleTable) {
  addOpaquePointerConversion<gpu::AsyncTokenType>(converter);
  addOpaquePointerConversion<gpu::SparseDnTensorHandleType>(converter);
  addOpaquePointerConversion<gpu::SparseSpMatHandleType>(converter);
@@ -1881,6 +1949,6 @@ void mlir::populateGpuToLLVMConversionPatterns(LLVMTypeConverter &converter,
               ConvertSpGEMMGetSizeOpToGpuRuntimeCallPattern,
               ConvertSetCsrPointersOpToGpuRuntimeCallPattern>(converter);
  patterns.add<ConvertLaunchFuncOpToGpuRuntimeCallPattern>(
      converter, gpuBinaryAnnotation, kernelBarePtrCallConv);
      converter, gpuBinaryAnnotation, kernelBarePtrCallConv, cachedModuleTable);
  patterns.add<EraseGpuModuleOpPattern>(&converter.getContext());
}
+30 −0
Original line number Diff line number Diff line
// RUN: mlir-opt %s --gpu-to-llvm="use-bare-pointers-for-kernels=1" -split-input-file | FileCheck %s

module attributes {gpu.container_module} {
  gpu.module @kernels [#nvvm.target]  {
    llvm.func @kernel_1(%arg0: f32, %arg1: !llvm.ptr<1>) attributes {gpu.kernel, nvvm.kernel} {
      %0 = llvm.mlir.undef : !llvm.struct<(ptr<1>, ptr<1>, i64, array<1 x i64>, array<1 x i64>)>
      %1 = llvm.insertvalue %arg1, %0[0] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<1 x i64>, array<1 x i64>)>
      %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<1 x i64>, array<1 x i64>)>
      %3 = llvm.mlir.constant(0 : index) : i64
      %4 = llvm.insertvalue %3, %2[2] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<1 x i64>, array<1 x i64>)>
      %5 = llvm.mlir.constant(10 : index) : i64
      %6 = llvm.insertvalue %5, %4[3, 0] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<1 x i64>, array<1 x i64>)>
      %7 = llvm.mlir.constant(1 : index) : i64
      %8 = llvm.insertvalue %7, %6[4, 0] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<1 x i64>, array<1 x i64>)>
      llvm.return
    }
  }
  func.func @foo() {
    // CHECK: [[MEMREF:%.*]] = gpu.alloc () : memref<10xf32, 1>
    // CHECK: [[DESCRIPTOR:%.*]] = builtin.unrealized_conversion_cast [[MEMREF]] : memref<10xf32, 1> to !llvm.struct<(ptr<1>, ptr<1>, i64, array<1 x i64>, array<1 x i64>)>
    // CHECK: [[PTR:%.*]] = llvm.extractvalue [[DESCRIPTOR]][1] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<1 x i64>, array<1 x i64>)>
    // CHECK: gpu.launch_func  @kernels::@kernel_1 blocks in ({{.*}}) threads in ({{.*}}) : i64
    // CHECK: args(%{{.*}} : f32, [[PTR]] : !llvm.ptr<1>)
    %0 = arith.constant 0. : f32
    %1 = gpu.alloc () : memref<10xf32, 1>
    %c8 = arith.constant 8 : index
    gpu.launch_func  @kernels::@kernel_1 blocks in (%c8, %c8, %c8) threads in (%c8, %c8, %c8) args(%0 : f32, %1 : memref<10xf32, 1>)
    return
  }
}
+36 −2
Original line number Diff line number Diff line
// RUN: mlir-opt %s --gpu-to-llvm="gpu-binary-annotation=nvvm.cubin use-opaque-pointers=1" | FileCheck %s
// RUN: mlir-opt %s --gpu-to-llvm="gpu-binary-annotation=rocdl.hsaco use-opaque-pointers=1" | FileCheck %s --check-prefix=ROCDL
// RUN: mlir-opt %s --gpu-to-llvm="gpu-binary-annotation=nvvm.cubin use-opaque-pointers=1" -split-input-file | FileCheck %s
// RUN: mlir-opt %s --gpu-to-llvm="gpu-binary-annotation=rocdl.hsaco use-opaque-pointers=1" -split-input-file | FileCheck %s --check-prefix=ROCDL

module attributes {gpu.container_module} {

@@ -61,3 +61,37 @@ module attributes {gpu.container_module} {
  // CHECK: llvm.call @mgpuStreamDestroy
  // CHECK: llvm.call @mgpuModuleUnload
}

// -----

module attributes {gpu.container_module} {
  // CHECK: gpu.module
  // ROCDL: gpu.module
  gpu.module @kernel_module [#nvvm.target] {
    llvm.func @kernel(%arg0: i32, %arg1: !llvm.ptr<f32>,
        %arg2: !llvm.ptr<f32>, %arg3: i64, %arg4: i64,
        %arg5: i64) attributes {gpu.kernel} {
      llvm.return
    }
  }

  func.func @foo(%buffer: memref<?xf32>) {
  // CHECK: [[C8:%.*]] = llvm.mlir.constant(8 : index) : i64
  // CHECK: [[C32:%.*]] = llvm.mlir.constant(32 : i32) : i32
  // CHECK: [[C256:%.*]] = llvm.mlir.constant(256 : i32) : i32
    %c8 = arith.constant 8 : index
    %c32 = arith.constant 32 : i32
    %c256 = arith.constant 256 : i32

  // CHECK: gpu.launch_func @kernel_module::@kernel
  // CHECK: blocks in ([[C8]], [[C8]], [[C8]]) threads in ([[C8]], [[C8]], [[C8]]) : i64
  // CHECK: dynamic_shared_memory_size [[C256]]
  // CHECK: args([[C32]] : i32, %{{.*}} : !llvm.ptr, %{{.*}} : !llvm.ptr, %{{.*}} : i64, %{{.*}} : i64, %{{.*}} : i64)
    gpu.launch_func @kernel_module::@kernel
        blocks in (%c8, %c8, %c8)
        threads in (%c8, %c8, %c8)
        dynamic_shared_memory_size %c256
        args(%c32 : i32, %buffer : memref<?xf32>)
    return
  }
}