Commit 64871f77 authored by Andy Davis's avatar Andy Davis
Browse files

[mlir] Adds affine loop fusion transformation function to LoopFusionUtils.

Summary:
Adds affine loop fusion transformation function to LoopFusionUtils.
Updates TestLoopFusion utility to run loop fusion transformation until a fixed point is reached.
Adds unit tests to test the transformation.

Reviewers: bondhugula, dcaballe, nicolasvasilache

Reviewed By: bondhugula, dcaballe

Subscribers: Joonsoo, merge_guards_bot, mehdi_amini, rriddle, jpienaar, burmako, shauheen, antiagainst, arpith-jacob, mgester, lucyrfox, aartbik, liufengdb, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D73190
parent e4f4a6c0
Loading
Loading
Loading
Loading
+5 −0
Original line number Diff line number Diff line
@@ -51,6 +51,11 @@ FusionResult canFuseLoops(AffineForOp srcForOp, AffineForOp dstForOp,
                          unsigned dstLoopDepth,
                          ComputationSliceState *srcSlice);

/// Fuses 'srcForOp' into 'dstForOp' with destination loop block insertion point
/// and source slice loop bounds specified in 'srcSlice'.
/// A clone of 'srcForOp' is inserted at 'srcSlice->insertPoint', and the
/// bounds of the cloned loops are overwritten with the slice bounds held in
/// 'srcSlice'. 'srcSlice' is expected to have been filled in beforehand, e.g.
/// by a successful call to 'canFuseLoops'.
/// NOTE(review): the original 'srcForOp' appears to be left in place by this
/// function; callers decide whether it is safe to erase it -- confirm.
void fuseLoops(AffineForOp srcForOp, AffineForOp dstForOp,
               ComputationSliceState *srcSlice);

/// LoopNestStats aggregates various per-loop statistics (eg. loop trip count
/// and operation count) for a loop nest up until (and including) the innermost
/// loop body.
+29 −0
Original line number Diff line number Diff line
@@ -24,6 +24,7 @@
#include "mlir/IR/Builders.h"
#include "mlir/IR/Function.h"
#include "mlir/IR/Operation.h"
#include "mlir/Transforms/LoopUtils.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/Debug.h"
@@ -246,6 +247,34 @@ FusionResult mlir::canFuseLoops(AffineForOp srcForOp, AffineForOp dstForOp,
  return FusionResult::Success;
}

/// Fuses 'srcForOp' into 'dstForOp' with destination loop block insertion point
/// and source slice loop bounds specified in 'srcSlice'.
/// The source loop nest is cloned at 'srcSlice->insertPoint'; 'srcForOp'
/// itself is not modified or erased here.
void mlir::fuseLoops(AffineForOp srcForOp, AffineForOp dstForOp,
                     ComputationSliceState *srcSlice) {
  // Insert a copy of the source loop nest at the slice insertion point. The
  // mapping records, for each source value, its counterpart in the clone.
  BlockAndValueMapping srcToClone;
  OpBuilder builder(srcSlice->insertPoint->getBlock(), srcSlice->insertPoint);
  builder.clone(*srcForOp, srcToClone);

  // Rewrite the bounds of every cloned loop that corresponds to a slice IV.
  SmallVector<AffineForOp, 4> clonedSliceLoops;
  const unsigned numSliceIVs = srcSlice->ivs.size();
  for (unsigned idx = 0; idx != numSliceIVs; ++idx) {
    auto clonedIV = srcToClone.lookupOrNull(srcSlice->ivs[idx]);
    if (clonedIV) {
      auto clonedLoop = getForInductionVarOwner(clonedIV);
      clonedSliceLoops.push_back(clonedLoop);
      // A null map means the slice leaves that bound untouched.
      AffineMap sliceLb = srcSlice->lbs[idx];
      if (sliceLb)
        clonedLoop.setLowerBound(srcSlice->lbOperands[idx], sliceLb);
      AffineMap sliceUb = srcSlice->ubs[idx];
      if (sliceUb)
        clonedLoop.setUpperBound(srcSlice->ubOperands[idx], sliceUb);
    }
  }

  // Once all bounds are final, fold away slice loops that run exactly once.
  for (auto clonedLoop : clonedSliceLoops)
    promoteIfSingleIteration(clonedLoop);
}

/// Collect loop nest statistics (eg. loop trip count and operation count)
/// in 'stats' for loop nest rooted at 'forOp'. Returns true on success,
/// returns false otherwise.
+105 −0
Original line number Diff line number Diff line
// RUN: mlir-opt %s -test-loop-fusion -test-loop-fusion-transformation -split-input-file -canonicalize | FileCheck %s

// Tests that the producer loop storing to %0 (16 iterations) is sliced down
// to the 5 iterations read by the consumer loop, and that the sliced store is
// fused into the consumer loop body ahead of the load.
// CHECK-LABEL: func @slice_depth1_loop_nest() {
func @slice_depth1_loop_nest() {
  %0 = alloc() : memref<100xf32>
  %cst = constant 7.000000e+00 : f32
  affine.for %i0 = 0 to 16 {
    affine.store %cst, %0[%i0] : memref<100xf32>
  }
  affine.for %i1 = 0 to 5 {
    %1 = affine.load %0[%i1] : memref<100xf32>
  }
  // CHECK:      affine.for %[[IV0:.*]] = 0 to 5 {
  // CHECK-NEXT:   affine.store %{{.*}}, %{{.*}}[%[[IV0]]] : memref<100xf32>
  // CHECK-NEXT:   affine.load %{{.*}}[%[[IV0]]] : memref<100xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}

// -----

// Tests fusing a reduction loop nest (accumulating the rows of %a into %b)
// into the pointwise loop that copies %b into %c.
// CHECK-LABEL: func @should_fuse_reduction_to_pointwise() {
func @should_fuse_reduction_to_pointwise() {
  %a = alloc() : memref<10x10xf32>
  %b = alloc() : memref<10xf32>
  %c = alloc() : memref<10xf32>

  %cf7 = constant 7.0 : f32

  affine.for %i0 = 0 to 10 {
    affine.for %i1 = 0 to 10 {
      %v0 = affine.load %b[%i0] : memref<10xf32>
      %v1 = affine.load %a[%i0, %i1] : memref<10x10xf32>
      %v3 = addf %v0, %v1 : f32
      affine.store %v3, %b[%i0] : memref<10xf32>
    }
  }
  affine.for %i2 = 0 to 10 {
    %v4 = affine.load %b[%i2] : memref<10xf32>
    affine.store %v4, %c[%i2] : memref<10xf32>
  }

  // Match on the fused loop nest.
  // Should fuse in the entire inner loop on %i1 from the source loop nest, as
  // %i1 is not used in the access function of the store/load on %b.
  // CHECK:       affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT:    affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT:      affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT:      affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<10x10xf32>
  // CHECK-NEXT:      addf %{{.*}}, %{{.*}} : f32
  // CHECK-NEXT:      affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT:    }
  // CHECK-NEXT:    affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT:    affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT:  }
  // CHECK-NEXT:  return
  return
}

// -----

// Tests that running fusion to a fixed point fuses all three loops into one
// without creating a dependence cycle.
// CHECK-LABEL: func @should_fuse_avoiding_dependence_cycle() {
func @should_fuse_avoiding_dependence_cycle() {
  %a = alloc() : memref<10xf32>
  %b = alloc() : memref<10xf32>
  %c = alloc() : memref<10xf32>

  %cf7 = constant 7.0 : f32

  // Set up the following dependences:
  // 1) loop0 -> loop1 on memref '%a'
  // 2) loop0 -> loop2 on memref '%b'
  // 3) loop1 -> loop2 on memref '%c'
  affine.for %i0 = 0 to 10 {
    %v0 = affine.load %a[%i0] : memref<10xf32>
    affine.store %cf7, %b[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 10 {
    affine.store %cf7, %a[%i1] : memref<10xf32>
    %v1 = affine.load %c[%i1] : memref<10xf32>
  }
  affine.for %i2 = 0 to 10 {
    %v2 = affine.load %b[%i2] : memref<10xf32>
    affine.store %cf7, %c[%i2] : memref<10xf32>
  }
  // Fusing the first loop into the last would create a cycle:
  //   {1} <--> {0, 2}
  // However, we can avoid the dependence cycle if we first fuse loop0 into
  // loop1:
  //   {0, 1} --> {2}
  // Then fuse this loop nest with loop2:
  //   {0, 1, 2}
  //
  // CHECK:      affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT:   affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT:   affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT:   affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT:   affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT:   affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT:   affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}
+68 −20
Original line number Diff line number Diff line
@@ -41,6 +41,11 @@ static llvm::cl::opt<bool> clTestSliceComputation(
    llvm::cl::desc("Enable testing of loop fusion slice computation"),
    llvm::cl::cat(clOptionsCategory));

// Command-line flag '-test-loop-fusion-transformation'. When set, the pass
// applies the loop fusion transformation repeatedly until nothing changes,
// instead of running the dependence-check / slice-computation tests.
static llvm::cl::opt<bool> clTestLoopFusionTransformation(
    "test-loop-fusion-transformation",
    llvm::cl::desc("Enable testing of loop fusion transformation"),
    llvm::cl::cat(clOptionsCategory));

namespace {

struct TestLoopFusion : public FunctionPass<TestLoopFusion> {
@@ -69,11 +74,9 @@ gatherLoops(Block *block, unsigned currLoopDepth,
// Run fusion dependence check on 'loops[i]' and 'loops[j]' at loop depths
// in range ['loopDepth' + 1, 'maxLoopDepth'].
// Emits a remark on 'loops[i]' if a fusion-preventing dependence exists.
static void testDependenceCheck(SmallVector<AffineForOp, 2> &loops, unsigned i,
                                unsigned j, unsigned loopDepth,
static bool testDependenceCheck(AffineForOp srcForOp, AffineForOp dstForOp,
                                unsigned i, unsigned j, unsigned loopDepth,
                                unsigned maxLoopDepth) {
  AffineForOp srcForOp = loops[i];
  AffineForOp dstForOp = loops[j];
  mlir::ComputationSliceState sliceUnion;
  for (unsigned d = loopDepth + 1; d <= maxLoopDepth; ++d) {
    FusionResult result =
@@ -84,6 +87,7 @@ static void testDependenceCheck(SmallVector<AffineForOp, 2> &loops, unsigned i,
          << i << " into loop nest " << j << " at depth " << loopDepth;
    }
  }
  return false;
}

// Returns the index of 'op' in its block.
@@ -121,11 +125,9 @@ static std::string getSliceStr(const mlir::ComputationSliceState &sliceUnion) {
// Computes fusion slice union on 'loops[i]' and 'loops[j]' at loop depths
// in range ['loopDepth' + 1, 'maxLoopDepth'].
// Emits a string representation of the slice union as a remark on 'loops[j]'.
static void testSliceComputation(SmallVector<AffineForOp, 2> &loops, unsigned i,
                                 unsigned j, unsigned loopDepth,
static bool testSliceComputation(AffineForOp forOpA, AffineForOp forOpB,
                                 unsigned i, unsigned j, unsigned loopDepth,
                                 unsigned maxLoopDepth) {
  AffineForOp forOpA = loops[i];
  AffineForOp forOpB = loops[j];
  for (unsigned d = loopDepth + 1; d <= maxLoopDepth; ++d) {
    mlir::ComputationSliceState sliceUnion;
    FusionResult result = mlir::canFuseLoops(forOpA, forOpB, d, &sliceUnion);
@@ -135,31 +137,77 @@ static void testSliceComputation(SmallVector<AffineForOp, 2> &loops, unsigned i,
          << " : " << getSliceStr(sliceUnion) << ")";
    }
  }
  return false;
}

void TestLoopFusion::runOnFunction() {
  // Gather all AffineForOps by loop depth.
  DenseMap<unsigned, SmallVector<AffineForOp, 2>> depthToLoops;
  for (auto &block : getFunction()) {
    gatherLoops(&block, /*currLoopDepth=*/0, depthToLoops);
// Tries to fuse 'forOpA' into 'forOpB', probing loop depths in range
// ['loopDepth' + 1, 'maxLoopDepth'] in increasing order and fusing at the
// first depth for which 'canFuseLoops' reports success.
// Returns true if a fusion was performed (in which case 'forOpA' has been
// erased), returns false otherwise. The loop indices 'i' and 'j' are unused.
static bool testLoopFusionTransformation(AffineForOp forOpA, AffineForOp forOpB,
                                         unsigned i, unsigned j,
                                         unsigned loopDepth,
                                         unsigned maxLoopDepth) {
  for (unsigned depth = loopDepth + 1; depth <= maxLoopDepth; ++depth) {
    mlir::ComputationSliceState slice;
    FusionResult fusable = mlir::canFuseLoops(forOpA, forOpB, depth, &slice);
    if (fusable.value != FusionResult::Success)
      continue;

    mlir::fuseLoops(forOpA, forOpB, &slice);
    // Note: 'forOpA' is removed to simplify test output. A proper loop
    // fusion pass should check the data dependence graph and run memref
    // region analysis to ensure removing 'forOpA' is safe.
    forOpA.erase();
    return true;
  }
  return false;
}

// Callback applied to an ordered (source, destination) pair of loop nests.
// Arguments, in order: the two loops, their indices within the per-depth loop
// list, the loop depth at which they were gathered, and the maximum loop
// depth. Returns true if the callback changed the IR.
using LoopFunc = function_ref<bool(AffineForOp, AffineForOp, unsigned, unsigned,
                                   unsigned, unsigned)>;

// Runs 'fn' on ordered pairs (loops[j], loops[k]), j != k, of loop nests that
// were gathered at the same loop depth in 'depthToLoops'. 'fn' also receives
// the pair's indices, the loop depth, and 'depthToLoops.size()' as the
// maximum loop depth.
//
// Returns true as soon as 'fn' reports a change, and false if it never does.
// Iteration must stop at the first change: a callback that reports a change
// may have erased one of the loops it was given (the fusion transformation
// erases its source loop), which leaves stale handles in 'depthToLoops'.
// Callers that want a fixed point should regather the loops and call again.
// Callbacks that never report a change are still run on every pair.
static bool
iterateLoops(DenseMap<unsigned, SmallVector<AffineForOp, 2>> &depthToLoops,
             LoopFunc fn) {
  const unsigned maxLoopDepth = depthToLoops.size();
  for (auto &depthAndLoops : depthToLoops) {
    unsigned loopDepth = depthAndLoops.first;
    auto &loops = depthAndLoops.second;
    unsigned numLoops = loops.size();
    for (unsigned j = 0; j < numLoops; ++j) {
      for (unsigned k = 0; k < numLoops; ++k) {
        // A loop nest is never paired with itself.
        if (j == k)
          continue;
        if (fn(loops[j], loops[k], j, k, loopDepth, maxLoopDepth))
          return true;
      }
    }
  }
  return false;
}

void TestLoopFusion::runOnFunction() {
  DenseMap<unsigned, SmallVector<AffineForOp, 2>> depthToLoops;
  if (clTestLoopFusionTransformation) {
    // Run loop fusion until a fixed point is reached.
    bool changed = true;
    while (changed) {
      depthToLoops.clear();
      // Gather all AffineForOps by loop depth.
      for (auto &block : getFunction())
        gatherLoops(&block, /*currLoopDepth=*/0, depthToLoops);

      // Try to fuse all combinations of src/dst loop nests in 'depthToLoops'.
      changed = iterateLoops(depthToLoops, testLoopFusionTransformation);
    }
    return;
  }

  // Gather all AffineForOps by loop depth.
  for (auto &block : getFunction()) {
    gatherLoops(&block, /*currLoopDepth=*/0, depthToLoops);
  }
  // Run tests on all combinations of src/dst loop nests in 'depthToLoops'.
  if (clTestDependenceCheck)
    iterateLoops(depthToLoops, testDependenceCheck);
  if (clTestSliceComputation)
    iterateLoops(depthToLoops, testSliceComputation);
}

static PassRegistration<TestLoopFusion>