[mlir][Affine] Add support for multi-store producer fusion (c8fc5c03) · Commits · llvm-doe / llvm-project

mlir/include/mlir/Analysis/AffineStructures.h

+15 −0

Original line number	Diff line number	Diff line
		@@ -232,6 +232,21 @@ public:
		// TODO: add support for non-unit strides.
		LogicalResult addAffineForOpDomain(AffineForOp forOp);

		/// Adds constraints (lower and upper bounds) for each loop in the loop nest
		/// described by the bound maps 'lbMaps' and 'ubMaps' of a computation slice.
		/// Every pair ('lbMaps[i]', 'ubMaps[i]') describes the bounds of a loop in
		/// the nest, sorted outer-to-inner. 'operands' contains the bound operands
		/// for a single bound map. All the bound maps will use the same bound
		/// operands. Note that some loops described by a computation slice might not
		/// exist yet in the IR so the Value attached to those dimension identifiers
		/// might be empty. For that reason, this method doesn't perform Value
		/// look-ups to retrieve the dimension identifier positions. Instead, it
		/// assumes the position of the dim identifiers in the constraint system is
		/// the same as the position of the loop in the loop nest.
		LogicalResult addDomainFromSliceMaps(ArrayRef<AffineMap> lbMaps,
		ArrayRef<AffineMap> ubMaps,
		ArrayRef<Value> operands);

		/// Adds constraints imposed by the `affine.if` operation. These constraints
		/// are collected from the IntegerSet attached to the given `affine.if`
		/// instance argument (`ifOp`). It is asserted that:

mlir/include/mlir/Analysis/Utils.h

+16 −1

Original line number	Diff line number	Diff line
		@@ -83,10 +83,25 @@ struct ComputationSliceState {
		// Clears all bounds and operands in slice state.
		void clearBounds();

		/// Return true if the computation slice is empty.
		/// Returns true if the computation slice is empty.
		bool isEmpty() const { return ivs.empty(); }

		/// Returns true if the computation slice encloses all the iterations of the
		/// sliced loop nest. Returns false if it does not. Returns llvm::None if it
		/// cannot determine if the slice is maximal or not.
		// TODO: Cache 'isMaximal' so that we don't recompute it when the slice
		// information hasn't changed.
		Optional<bool> isMaximal() const;

		void dump() const;

		private:
		/// Fast check to determine if the computation slice is maximal. Returns true
		/// if each slice dimension maps to an existing dst dimension and both the src
		/// and the dst loops for those dimensions have the same bounds. Returns false
		/// if both the src and the dst loops don't have the same bounds. Returns
		/// llvm::None if none of the above can be proven.
		Optional<bool> isSliceMaximalFastCheck() const;
		};

		/// Computes the computation slice loop bounds for one loop nest as affine maps

mlir/include/mlir/Transforms/LoopFusionUtils.h

+38 −11

Original line number	Diff line number	Diff line
		@@ -50,7 +50,8 @@ struct FusionResult {
		// TODO: Generalize utilities so that producer-consumer and sibling fusion
		// strategies can be used without the assumptions made in the AffineLoopFusion
		// pass.
		struct FusionStrategy {
		class FusionStrategy {
		public:
		enum StrategyEnum {
		// Generic loop fusion: Arbitrary loops are considered for fusion. No
		// assumptions about a specific fusion strategy from AffineLoopFusion pass
		@@ -69,13 +70,34 @@ struct FusionStrategy {
		// implementation in AffineLoopFusion pass are made. See pass for specific
		// details.
		Sibling
		} strategy;
		};

		// Target memref for this fusion transformation.
		Value memref;
		/// Construct a generic or producer-consumer fusion strategy.
		FusionStrategy(StrategyEnum strategy) : strategy(strategy) {
		assert(strategy != Sibling &&
		"Sibling fusion strategy requires a specific memref");
		}

		/// Construct a sibling fusion strategy targeting 'memref'. This construct
		/// should only be used for sibling fusion.
		FusionStrategy(Value memref) : strategy(Sibling), memref(memref) {}

		/// Returns the fusion strategy.
		StrategyEnum getStrategy() const { return strategy; };

		FusionStrategy(StrategyEnum strategy, Value memref)
		: strategy(strategy), memref(memref) {}
		/// Returns the memref attached to this sibling fusion strategy.
		Value getSiblingFusionMemRef() const {
		assert(strategy == Sibling && "Memref is only valid for sibling fusion");
		return memref;
		}

		private:
		/// Fusion strategy.
		StrategyEnum strategy;

		/// Target memref for this fusion transformation. Only used for sibling
		/// fusion.
		Value memref;
		};

		/// Checks the feasibility of fusing the loop nest rooted at 'srcForOp' into the
		@@ -86,11 +108,10 @@ struct FusionStrategy {
		/// NOTE: This function is not feature complete and should only be used in
		/// testing.
		/// TODO: Update comments when this function is fully implemented.
		FusionResult canFuseLoops(AffineForOp srcForOp, AffineForOp dstForOp,
		unsigned dstLoopDepth,
		FusionResult
		canFuseLoops(AffineForOp srcForOp, AffineForOp dstForOp, unsigned dstLoopDepth,
		ComputationSliceState *srcSlice,
		FusionStrategy fusionStrategy = {
		FusionStrategy::Generic, Value()});
		FusionStrategy fusionStrategy = FusionStrategy::Generic);

		/// Fuses 'srcForOp' into 'dstForOp' with destination loop block insertion point
		/// and source slice loop bounds specified in 'srcSlice'.
		@@ -134,6 +155,12 @@ bool getFusionComputeCost(AffineForOp srcForOp, LoopNestStats &srcStats,
		const ComputationSliceState &slice,
		int64_t *computeCost);

		/// Returns in 'producerConsumerMemrefs' the memrefs involved in a
		/// producer-consumer dependence between write ops in 'srcOps' and read ops in
		/// 'dstOps'.
		void gatherProducerConsumerMemrefs(ArrayRef<Operation *> srcOps,
		ArrayRef<Operation *> dstOps,
		DenseSet<Value> &producerConsumerMemrefs);
		} // end namespace mlir

		#endif // MLIR_TRANSFORMS_LOOP_FUSION_UTILS_H

mlir/include/mlir/Transforms/Passes.td

+105 −0

Original line number	Diff line number	Diff line
		@@ -17,6 +17,111 @@ include "mlir/Pass/PassBase.td"

		def AffineLoopFusion : FunctionPass<"affine-loop-fusion"> {
		let summary = "Fuse affine loop nests";
		let description = [{
		This pass performs fusion of loop nests using a slicing-based approach. It
		combines two fusion strategies: producer-consumer fusion and sibling fusion.
		Producer-consumer fusion is aimed at fusing pairs of loops where the first
		one writes to a memref that the second reads. Sibling fusion targets pairs
		of loops that share no dependences between them but that load from the same
		memref. The fused loop nests, when possible, are rewritten to access
		significantly smaller local buffers instead of the original memref's, and
		the latter are often either completely optimized away or contracted. This
		transformation leads to enhanced locality and lower memory footprint through
		the elimination or contraction of temporaries/intermediate memref's. These
		benefits are sometimes achieved at the expense of redundant computation
		through a cost model that evaluates available choices such as the depth at
		which a source slice should be materialized in the designation slice.

		Example 1: Producer-consumer fusion.
		Input:
		```mlir
		func @producer_consumer_fusion(%arg0: memref<10xf32>, %arg1: memref<10xf32>) {
		%0 = alloc() : memref<10xf32>
		%1 = alloc() : memref<10xf32>
		%cst = constant 0.000000e+00 : f32
		affine.for %arg2 = 0 to 10 {
		affine.store %cst, %0[%arg2] : memref<10xf32>
		affine.store %cst, %1[%arg2] : memref<10xf32>
		}
		affine.for %arg2 = 0 to 10 {
		%2 = affine.load %0[%arg2] : memref<10xf32>
		%3 = addf %2, %2 : f32
		affine.store %3, %arg0[%arg2] : memref<10xf32>
		}
		affine.for %arg2 = 0 to 10 {
		%2 = affine.load %1[%arg2] : memref<10xf32>
		%3 = mulf %2, %2 : f32
		affine.store %3, %arg1[%arg2] : memref<10xf32>
		}
		return
		}
		```
		Output:
		```mlir
		func @producer_consumer_fusion(%arg0: memref<10xf32>, %arg1: memref<10xf32>) {
		%0 = alloc() : memref<1xf32>
		%1 = alloc() : memref<1xf32>
		%cst = constant 0.000000e+00 : f32
		affine.for %arg2 = 0 to 10 {
		affine.store %cst, %0[0] : memref<1xf32>
		affine.store %cst, %1[0] : memref<1xf32>
		%2 = affine.load %1[0] : memref<1xf32>
		%3 = mulf %2, %2 : f32
		affine.store %3, %arg1[%arg2] : memref<10xf32>
		%4 = affine.load %0[0] : memref<1xf32>
		%5 = addf %4, %4 : f32
		affine.store %5, %arg0[%arg2] : memref<10xf32>
		}
		return
		}
		```

		Example 2: Sibling fusion.
		Input:
		```mlir
		func @sibling_fusion(%arg0: memref<10x10xf32>, %arg1: memref<10x10xf32>,
		%arg2: memref<10x10xf32>, %arg3: memref<10x10xf32>,
		%arg4: memref<10x10xf32>) {
		affine.for %arg5 = 0 to 3 {
		affine.for %arg6 = 0 to 3 {
		%0 = affine.load %arg0[%arg5, %arg6] : memref<10x10xf32>
		%1 = affine.load %arg1[%arg5, %arg6] : memref<10x10xf32>
		%2 = mulf %0, %1 : f32
		affine.store %2, %arg3[%arg5, %arg6] : memref<10x10xf32>
		}
		}
		affine.for %arg5 = 0 to 3 {
		affine.for %arg6 = 0 to 3 {
		%0 = affine.load %arg0[%arg5, %arg6] : memref<10x10xf32>
		%1 = affine.load %arg2[%arg5, %arg6] : memref<10x10xf32>
		%2 = addf %0, %1 : f32
		affine.store %2, %arg4[%arg5, %arg6] : memref<10x10xf32>
		}
		}
		return
		}
		```
		Output:
		```mlir
		func @sibling_fusion(%arg0: memref<10x10xf32>, %arg1: memref<10x10xf32>,
		%arg2: memref<10x10xf32>, %arg3: memref<10x10xf32>,
		%arg4: memref<10x10xf32>) {
		affine.for %arg5 = 0 to 3 {
		affine.for %arg6 = 0 to 3 {
		%0 = affine.load %arg0[%arg5, %arg6] : memref<10x10xf32>
		%1 = affine.load %arg1[%arg5, %arg6] : memref<10x10xf32>
		%2 = mulf %0, %1 : f32
		affine.store %2, %arg3[%arg5, %arg6] : memref<10x10xf32>
		%3 = affine.load %arg0[%arg5, %arg6] : memref<10x10xf32>
		%4 = affine.load %arg2[%arg5, %arg6] : memref<10x10xf32>
		%5 = addf %3, %4 : f32
		affine.store %5, %arg4[%arg5, %arg6] : memref<10x10xf32>
		}
		}
		return
		}
		```
		}];
		let constructor = "mlir::createLoopFusionPass()";
		let options = [
		Option<"computeToleranceThreshold", "fusion-compute-tolerance", "double",

mlir/lib/Analysis/AffineStructures.cpp

+64 −0

Original line number	Diff line number	Diff line
		@@ -709,6 +709,70 @@ LogicalResult FlatAffineConstraints::addAffineForOpDomain(AffineForOp forOp) {
		/eq=/false, /lower=/false);
		}

		/// Adds constraints (lower and upper bounds) for each loop in the loop nest
		/// described by the bound maps 'lbMaps' and 'ubMaps' of a computation slice.
		/// Every pair ('lbMaps[i]', 'ubMaps[i]') describes the bounds of a loop in
		/// the nest, sorted outer-to-inner. 'operands' contains the bound operands
		/// for a single bound map. All the bound maps will use the same bound
		/// operands. Note that some loops described by a computation slice might not
		/// exist yet in the IR so the Value attached to those dimension identifiers
		/// might be empty. For that reason, this method doesn't perform Value
		/// look-ups to retrieve the dimension identifier positions. Instead, it
		/// assumes the position of the dim identifiers in the constraint system is
		/// the same as the position of the loop in the loop nest.
		LogicalResult
		FlatAffineConstraints::addDomainFromSliceMaps(ArrayRef<AffineMap> lbMaps,
		ArrayRef<AffineMap> ubMaps,
		ArrayRef<Value> operands) {
		assert(lbMaps.size() == ubMaps.size());
		assert(lbMaps.size() <= getNumDimIds());

		for (unsigned i = 0, e = lbMaps.size(); i < e; ++i) {
		AffineMap lbMap = lbMaps[i];
		AffineMap ubMap = ubMaps[i];
		assert(!lbMap \|\| lbMap.getNumInputs() == operands.size());
		assert(!ubMap \|\| ubMap.getNumInputs() == operands.size());

		// Check if this slice is just an equality along this dimension. If so,
		// retrieve the existing loop it equates to and add it to the system.
		if (lbMap && ubMap && lbMap.getNumResults() == 1 &&
		ubMap.getNumResults() == 1 &&
		lbMap.getResult(0) + 1 == ubMap.getResult(0) &&
		// The condition above will be true for maps describing a single
		// iteration (e.g., lbMap.getResult(0) = 0, ubMap.getResult(0) = 1).
		// Make sure we skip those cases by checking that the lb result is not
		// just a constant.
		!lbMap.getResult(0).isa<AffineConstantExpr>()) {
		// Limited support: we expect the lb result to be just a loop dimension.
		// Not supported otherwise for now.
		AffineDimExpr result = lbMap.getResult(0).dyn_cast<AffineDimExpr>();
		if (!result)
		return failure();

		AffineForOp loop =
		getForInductionVarOwner(operands[result.getPosition()]);
		if (!loop)
		return failure();

		if (failed(addAffineForOpDomain(loop)))
		return failure();
		continue;
		}

		// This slice refers to a loop that doesn't exist in the IR yet. Add its
		// bounds to the system assuming its dimension identifier position is the
		// same as the position of the loop in the loop nest.
		if (lbMap && failed(addLowerOrUpperBound(i, lbMap, operands, /eq=/false,
		/lower=/true)))
		return failure();

		if (ubMap && failed(addLowerOrUpperBound(i, ubMap, operands, /eq=/false,
		/lower=/false)))
		return failure();
		}
		return success();
		}

		void FlatAffineConstraints::addAffineIfOpDomain(AffineIfOp ifOp) {
		// Create the base constraints from the integer set attached to ifOp.
		FlatAffineConstraints cst(ifOp.getIntegerSet());