Commit 57eda9be authored by Uday Bondhugula's avatar Uday Bondhugula
Browse files

[MLIR][GPU] Add constant propagator for gpu.launch op

Add a constant propagator for gpu.launch op in cases where the
grid/thread IDs can be trivially determined to take a single constant
value of zero.

Differential Revision: https://reviews.llvm.org/D109994
parent 4121ac1e
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -523,6 +523,7 @@ def GPU_LaunchOp : GPU_Op<"launch">,
  let parser = [{ return parseLaunchOp(parser, result); }];
  let printer = [{ printLaunchOp(p, *this); }];
  let verifier = [{ return ::verify(*this); }];
  let hasCanonicalizer = 1;
}

def GPU_ReturnOp : GPU_Op<"return", [HasParent<"GPUFuncOp">, NoSideEffect,
+44 −0
Original line number Diff line number Diff line
@@ -21,6 +21,7 @@
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/DialectImplementation.h"
#include "mlir/IR/FunctionImplementation.h"
#include "mlir/IR/Matchers.h"
#include "mlir/IR/OpImplementation.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/IR/TypeUtilities.h"
@@ -530,6 +531,49 @@ static ParseResult parseLaunchOp(OpAsmParser &parser, OperationState &result) {
                 parser.parseOptionalAttrDict(result.attributes));
}

/// Simplify the gpu.launch when the range of a thread or block ID is
/// trivially known to be one.
///
/// For each of the six (block ID, grid size) / (thread ID, block size) pairs
/// of the launch, if the size operand is a constant equal to one, all uses of
/// the corresponding ID inside the launch body are replaced by a constant
/// zero index. The pattern succeeds only if at least one use was actually
/// replaced.
struct FoldLaunchArguments : public OpRewritePattern<LaunchOp> {
  using OpRewritePattern<LaunchOp>::OpRewritePattern;
  LogicalResult matchAndRewrite(LaunchOp op,
                                PatternRewriter &rewriter) const override {
    // Returns true if `size` is defined by a constant integer equal to one.
    auto isTriviallyOne = [](Value size) {
      IntegerAttr cst;
      return matchPattern(size, m_Constant(&cst)) && cst.getInt() == 1;
    };

    // If the range implies a single value for `id`, replace `id`'s uses by
    // zero.
    Value zero;
    bool simplified = false;
    auto constPropIdUses = [&](Value id, Value size) {
      if (!isTriviallyOne(size))
        return;
      // Nothing to rewrite when `id` has no uses. Without this check the
      // pattern would create a fresh, dead zero constant and report success
      // on every invocation (including right after it has already folded the
      // IDs), so a driver that reapplies patterns until nothing changes would
      // not converge.
      if (id.use_empty())
        return;
      if (!simplified) {
        // Create a zero value the first time; it is shared by all IDs folded
        // in this invocation. The guard restores the rewriter's insertion
        // point when leaving this scope.
        OpBuilder::InsertionGuard guard(rewriter);
        rewriter.setInsertionPointToStart(&op.body().front());
        zero = rewriter.create<ConstantIndexOp>(op.getLoc(), /*value=*/0);
      }
      id.replaceAllUsesWith(zero);
      simplified = true;
    };
    constPropIdUses(op.getBlockIds().x, op.gridSizeX());
    constPropIdUses(op.getBlockIds().y, op.gridSizeY());
    constPropIdUses(op.getBlockIds().z, op.gridSizeZ());
    constPropIdUses(op.getThreadIds().x, op.blockSizeX());
    constPropIdUses(op.getThreadIds().y, op.blockSizeY());
    constPropIdUses(op.getThreadIds().z, op.blockSizeZ());

    // Report a change only if some use was actually rewritten.
    return success(simplified);
  }
};

/// Registers the canonicalization patterns of gpu.launch into `rewrites`.
/// Currently this is only FoldLaunchArguments, which replaces block/thread
/// IDs whose range is trivially one by a constant zero.
void LaunchOp::getCanonicalizationPatterns(RewritePatternSet &rewrites,
                                           MLIRContext *context) {
  rewrites.add<FoldLaunchArguments>(context);
}

//===----------------------------------------------------------------------===//
// LaunchFuncOp
//===----------------------------------------------------------------------===//
+56 −0
Original line number Diff line number Diff line
@@ -31,3 +31,59 @@ func @gpu_dim_of_alloc(%size: index) -> index {
  %1 = memref.dim %0, %c0 : memref<?xindex>
  return %1 : index
}

// -----

// Test input for folding gpu.launch IDs. All three grid sizes and the y/z
// block sizes below are the constant %c1, so the block IDs (%arg0, %arg1,
// %arg2) and the thread IDs %arg4 and %arg5 each take a single value and are
// expected to be replaced by zero. The x block size is %c32, so thread ID
// %arg3 must be left untouched.
// CHECK-LABEL: func @simplify_gpu_launch
func @simplify_gpu_launch() attributes {llvm.emit_c_interface} {
  %cst = constant 0.000000e+00 : f32
  %c1 = constant 1 : index
  %c32 = constant 32 : index
  %c16 = constant 16 : index
  %c2 = constant 2 : index
  %c0 = constant 0 : index
  %0 = memref.alloc() : memref<2x16x16xf32>
  scf.for %arg0 = %c0 to %c2 step %c1 {
    scf.for %arg1 = %c0 to %c16 step %c1 {
      scf.for %arg2 = %c0 to %c16 step %c1 {
        memref.store %cst, %0[%arg0, %arg1, %arg2] : memref<2x16x16xf32>
      }
    }
  }
  %1 = gpu.wait async
  %memref, %asyncToken = gpu.alloc async [%1] () : memref<2x16x16xf32>
  %2 = gpu.memcpy async [%1] %memref, %0 : memref<2x16x16xf32>, memref<2x16x16xf32>
  gpu.wait [%1]
  gpu.launch blocks(%arg0, %arg1, %arg2) in (%arg6 = %c1, %arg7 = %c1, %arg8 = %c1)
    threads(%arg3, %arg4, %arg5) in (%arg9 = %c32, %arg10 = %c1, %arg11 = %c1) {
    %3 = muli %arg5, %c32 : index
    %4 = muli %arg4, %c32 : index
    %5 = addi %3, %4 : index
    %6 = addi %5, %arg3 : index
    %7 = divi_unsigned %6, %c32 : index
    %8 = muli %arg0, %c16 : index
    %9 = muli %arg1, %c2 : index
    %10 = muli %7, %c2 : index
    %11 = addi %9, %10 : index
    %12 = memref.load %memref[%11, %c0, %8] : memref<2x16x16xf32>
    %13 = addi %11, %c1 : index
    %14 = memref.load %memref[%13, %c0, %8] : memref<2x16x16xf32>
    memref.store %12, %memref[%11, %c0, %8] : memref<2x16x16xf32>
    memref.store %14, %memref[%13, %c0, %8] : memref<2x16x16xf32>
    gpu.terminator
  }
  return
}

// CHECK-DAG: %[[C1:.*]] = constant 1 : index
// CHECK-DAG: %[[C0:.*]] = constant 0 : index
// CHECK: gpu.launch blocks(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %[[C1]], %{{.*}} = %[[C1]], %{{.*}} = %[[C1]]) threads(%[[TIDX:.*]], %{{.*}}, %{{.*}}) in (%{{.*}} = %c32, %{{.*}} = %[[C1]], %{{.*}} = %[[C1]]) {
// CHECK-NEXT:  	divi_unsigned %[[TIDX]], %c32 : index
// CHECK-NEXT:  	muli %{{.*}}, %c2 : index
// CHECK-NEXT:    memref.load %memref[%{{.*}}, %[[C0]], %[[C0]]] : memref<2x16x16xf32>
// CHECK-NEXT:    addi %{{.*}}, %[[C1]] : index
// CHECK-NEXT:    memref.load %memref[%{{.*}}, %[[C0]], %[[C0]]] : memref<2x16x16xf32>
// CHECK-NEXT:    memref.store %{{.*}}, %memref[%{{.*}}, %[[C0]], %[[C0]]] : memref<2x16x16xf32>
// CHECK-NEXT:    memref.store %{{.*}}, %memref[%{{.*}}, %[[C0]], %[[C0]]] : memref<2x16x16xf32>
// CHECK-NEXT:    gpu.terminator
// CHECK-NEXT:  }