LoopDistribute/LAA: Respect convergent (2466ba97) · Commits · Cabrera, Anthony / llvm-project

llvm/include/llvm/Analysis/LoopAccessAnalysis.h

+6 −0

Original line number	Diff line number	Diff line
		@@ -522,6 +522,11 @@ public:
		/// no memory dependence cycles.
		bool canVectorizeMemory() const { return CanVecMem; }

		/// Return true if the analyzed loop contains a convergent operation.
		/// Runtime pointer checks may still be reported as required for such a
		/// loop, but it is not legal to insert them.
		bool hasConvergentOp() const { return HasConvergentOp; }

		/// Return a non-owning pointer to the runtime pointer checking state held
		/// by this analysis result.
		const RuntimePointerChecking *getRuntimePointerChecking() const {
		return PtrRtChecking.get();
		}
		@@ -642,6 +647,7 @@ private:

		/// Cache the result of analyzeLoop.
		bool CanVecMem;
		bool HasConvergentOp;

		/// Indicator that there are non vectorizable stores to a uniform address.
		bool HasDependenceInvolvingLoopInvariantAddress;

llvm/lib/Analysis/LoopAccessAnalysis.cpp

+55 −9

Original line number	Diff line number	Diff line
		@@ -1778,6 +1778,11 @@ void LoopAccessInfo::analyzeLoop(AliasAnalysis *AA, LoopInfo *LI,
		unsigned NumReads = 0;
		unsigned NumReadWrites = 0;

		bool HasComplexMemInst = false;

		// A runtime check is only legal to insert if there are no convergent calls.
		HasConvergentOp = false;

		PtrRtChecking->Pointers.clear();
		PtrRtChecking->Need = false;

		@@ -1785,8 +1790,25 @@ void LoopAccessInfo::analyzeLoop(AliasAnalysis *AA, LoopInfo *LI,

		// For each block.
		for (BasicBlock *BB : TheLoop->blocks()) {
		// Scan the BB and collect legal loads and stores.
		// Scan the BB and collect legal loads and stores. Also detect any
		// convergent instructions.
		for (Instruction &I : *BB) {
		if (auto *Call = dyn_cast<CallBase>(&I)) {
		if (Call->isConvergent())
		HasConvergentOp = true;
		}

		// With both a non-vectorizable memory instruction and a convergent
		// operation, found in this loop, no reason to continue the search.
		if (HasComplexMemInst && HasConvergentOp) {
		CanVecMem = false;
		return;
		}

		// Avoid hitting recordAnalysis multiple times.
		if (HasComplexMemInst)
		continue;

		// If this is a load, save it. If this instruction can read from memory
		// but is not a load, then we quit. Notice that we don't handle function
		// calls that read or write.
		@@ -1805,12 +1827,18 @@ void LoopAccessInfo::analyzeLoop(AliasAnalysis *AA, LoopInfo *LI,
		continue;

		auto *Ld = dyn_cast<LoadInst>(&I);
		if (!Ld || (!Ld->isSimple() && !IsAnnotatedParallel)) {
		if (!Ld) {
		recordAnalysis("CantVectorizeInstruction", Ld)
		<< "instruction cannot be vectorized";
		HasComplexMemInst = true;
		continue;
		}
		if (!Ld->isSimple() && !IsAnnotatedParallel) {
		recordAnalysis("NonSimpleLoad", Ld)
		<< "read with atomic ordering or volatile read";
		LLVM_DEBUG(dbgs() << "LAA: Found a non-simple load.\n");
		CanVecMem = false;
		return;
		HasComplexMemInst = true;
		continue;
		}
		NumLoads++;
		Loads.push_back(Ld);
		@@ -1826,15 +1854,15 @@ void LoopAccessInfo::analyzeLoop(AliasAnalysis *AA, LoopInfo *LI,
		if (!St) {
		recordAnalysis("CantVectorizeInstruction", St)
		<< "instruction cannot be vectorized";
		CanVecMem = false;
		return;
		HasComplexMemInst = true;
		continue;
		}
		if (!St->isSimple() && !IsAnnotatedParallel) {
		recordAnalysis("NonSimpleStore", St)
		<< "write with atomic ordering or volatile write";
		LLVM_DEBUG(dbgs() << "LAA: Found a non-simple store.\n");
		CanVecMem = false;
		return;
		HasComplexMemInst = true;
		continue;
		}
		NumStores++;
		Stores.push_back(St);
		@@ -1845,6 +1873,11 @@ void LoopAccessInfo::analyzeLoop(AliasAnalysis *AA, LoopInfo *LI,
		} // Next instr.
		} // Next block.

		if (HasComplexMemInst) {
		CanVecMem = false;
		return;
		}

		// Now we have two lists that hold the loads and the stores.
		// Next, we find the pointers that they use.

		@@ -1962,7 +1995,7 @@ void LoopAccessInfo::analyzeLoop(AliasAnalysis *AA, LoopInfo *LI,
		}

		LLVM_DEBUG(
		dbgs() << "LAA: We can perform a memory runtime check if needed.\n");
		dbgs() << "LAA: May be able to perform a memory runtime check if needed.\n");

		CanVecMem = true;
		if (Accesses.isDependencyCheckNeeded()) {
		@@ -1997,6 +2030,15 @@ void LoopAccessInfo::analyzeLoop(AliasAnalysis *AA, LoopInfo *LI,
		}
		}

		if (HasConvergentOp) {
		recordAnalysis("CantInsertRuntimeCheckWithConvergent")
		<< "cannot add control dependency to convergent operation";
		LLVM_DEBUG(dbgs() << "LAA: We can't vectorize because a runtime check "
		"would be needed with a convergent operation\n");
		CanVecMem = false;
		return;
		}

		if (CanVecMem)
		LLVM_DEBUG(
		dbgs() << "LAA: No unsafe dependent memory operations in loop. We"
		@@ -2285,6 +2327,7 @@ LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE,
		PtrRtChecking(llvm::make_unique<RuntimePointerChecking>(SE)),
		DepChecker(llvm::make_unique<MemoryDepChecker>(*PSE, L)), TheLoop(L),
		NumLoads(0), NumStores(0), MaxSafeDepDistBytes(-1), CanVecMem(false),
		HasConvergentOp(false),
		HasDependenceInvolvingLoopInvariantAddress(false) {
		if (canAnalyzeLoop())
		analyzeLoop(AA, LI, TLI, DT);
		@@ -2301,6 +2344,9 @@ void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const {
		OS << "\n";
		}

		if (HasConvergentOp)
		OS.indent(Depth) << "Has convergent operation in loop\n";

		if (Report)
		OS.indent(Depth) << "Report: " << Report->getMsg() << "\n";

llvm/lib/Transforms/Scalar/LoopDistribute.cpp

+14 −1

Original line number	Diff line number	Diff line
		@@ -766,8 +766,14 @@ public:
		"cannot isolate unsafe dependencies");
		}

		// Don't distribute the loop if we need too many SCEV run-time checks.
		// Don't distribute the loop if we need too many SCEV run-time checks, or
		// any if it's illegal.
		const SCEVUnionPredicate &Pred = LAI->getPSE().getUnionPredicate();
		if (LAI->hasConvergentOp() && !Pred.isAlwaysTrue()) {
		return fail("RuntimeCheckWithConvergent",
		"may not insert runtime check with convergent operation");
		}

		if (Pred.getComplexity() > (IsForced.getValueOr(false)
		? PragmaDistributeSCEVCheckThreshold
		: DistributeSCEVCheckThreshold))
		@@ -795,7 +801,14 @@ public:
		auto Checks = includeOnlyCrossPartitionChecks(AllChecks, PtrToPartition,
		RtPtrChecking);

		if (LAI->hasConvergentOp() && !Checks.empty()) {
		return fail("RuntimeCheckWithConvergent",
		"may not insert runtime check with convergent operation");
		}

		if (!Pred.isAlwaysTrue() || !Checks.empty()) {
		assert(!LAI->hasConvergentOp() && "inserting illegal loop versioning");

		MDNode *OrigLoopID = L->getLoopID();

		LLVM_DEBUG(dbgs() << "\nPointers:\n");

llvm/test/Analysis/LoopAccessAnalysis/unsafe-and-rt-checks-convergent.ll

0 → 100644

+73 −0

Original line number	Diff line number	Diff line
		; RUN: opt -loop-accesses -analyze < %s | FileCheck %s
		; RUN: opt -passes='require<scalar-evolution>,require<aa>,loop(print-access-info)' -disable-output < %s 2>&1 | FileCheck %s

		; Analyze this loop:
		; for (i = 0; i < n; i++)
		; A[i + 1] = A[i] * B[i] * C[i];

		target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"

		; CHECK: for.body:
		; CHECK: Has convergent operation in loop
		; CHECK: Report: cannot add control dependency to convergent operation
		; CHECK-NEXT: Dependences:
		; CHECK-NEXT: Backward:
		; CHECK-NEXT: %loadA = load i16, i16* %arrayidxA, align 2 ->
		; CHECK-NEXT: store i16 %mul1, i16* %arrayidxA_plus_2, align 2
		; CHECK: Run-time memory checks:
		; CHECK-NEXT: 0:
		; CHECK-NEXT: Comparing group
		; CHECK-NEXT: %arrayidxA = getelementptr inbounds i16, i16* %a, i64 %storemerge3
		; CHECK-NEXT: %arrayidxA_plus_2 = getelementptr inbounds i16, i16* %a, i64 %add
		; CHECK-NEXT: Against group
		; CHECK-NEXT: %arrayidxB = getelementptr inbounds i16, i16* %b, i64 %storemerge3
		; CHECK-NEXT: 1:
		; CHECK-NEXT: Comparing group
		; CHECK-NEXT: %arrayidxA = getelementptr inbounds i16, i16* %a, i64 %storemerge3
		; CHECK-NEXT: %arrayidxA_plus_2 = getelementptr inbounds i16, i16* %a, i64 %add
		; CHECK-NEXT: Against group
		; CHECK-NEXT: %arrayidxC = getelementptr inbounds i16, i16* %c, i64 %storemerge3

		@B = common global i16* null, align 8
		@A = common global i16* null, align 8
		@C = common global i16* null, align 8

		; Loop under analysis: for i in [0, 20): A[i + 1] = A[i] * B[i] * C[i],
		; with a call to a convergent function in the loop body.
		define void @f() #1 {
		entry:
		; @A, @B and @C are globals of type i16*, so loading from them yields the
		; array base pointers (i16*) that the GEPs below index.
		%a = load i16*, i16** @A, align 8
		%b = load i16*, i16** @B, align 8
		%c = load i16*, i16** @C, align 8
		br label %for.body

		for.body: ; preds = %for.body, %entry
		%storemerge3 = phi i64 [ 0, %entry ], [ %add, %for.body ]

		%arrayidxA = getelementptr inbounds i16, i16* %a, i64 %storemerge3
		%loadA = load i16, i16* %arrayidxA, align 2

		%arrayidxB = getelementptr inbounds i16, i16* %b, i64 %storemerge3
		%loadB = load i16, i16* %arrayidxB, align 2

		%arrayidxC = getelementptr inbounds i16, i16* %c, i64 %storemerge3
		%loadC = load i16, i16* %arrayidxC, align 2

		; The convergent call that the CHECK lines expect LAA to report.
		call void @llvm.convergent()

		%mul = mul i16 %loadB, %loadA
		%mul1 = mul i16 %mul, %loadC

		; Store to A[i + 1], one element ahead of the A[i] load above.
		%add = add nuw nsw i64 %storemerge3, 1
		%arrayidxA_plus_2 = getelementptr inbounds i16, i16* %a, i64 %add
		store i16 %mul1, i16* %arrayidxA_plus_2, align 2

		%exitcond = icmp eq i64 %add, 20
		br i1 %exitcond, label %for.end, label %for.body

		for.end: ; preds = %for.body
		ret void
		}

		declare void @llvm.convergent() #0

		attributes #0 = { nounwind readnone convergent }
		attributes #1 = { nounwind convergent }

llvm/test/Transforms/LoopDistribute/basic-with-memchecks.ll

+113 −0

Original line number	Diff line number	Diff line
		@@ -5,6 +5,9 @@
		; RUN: -verify-loop-info -verify-dom-info -S < %s | \
		; RUN: FileCheck --check-prefix=VECTORIZE %s

		; RUN: opt -basicaa -loop-distribute -enable-loop-distribute -verify-loop-info -verify-dom-info \
		; RUN: -loop-accesses -analyze < %s | FileCheck %s --check-prefix=ANALYSIS

		; The memcheck version of basic.ll. We should distribute and vectorize the
		; second part of this loop with 5 memchecks (A+1 x {C, D, E} + C x {A, B})
		;
		@@ -173,3 +176,113 @@ for.body:
		for.end:
		ret void
		}

		declare i32 @llvm.convergent(i32) #0

		; This is the same as f, and would require the same bounds
		; check. However, it is not OK to introduce new control dependencies
		; on the convergent call.

		; CHECK-LABEL: @f_with_convergent(
		; CHECK: call i32 @llvm.convergent
		; CHECK-NOT: call i32 @llvm.convergent

		; ANALYSIS: for.body:
		; ANALYSIS: Report: cannot add control dependency to convergent operation
		; Two-statement loop over i in [0, 20):
		;   A[i + 1] = A[i] * B[i]
		;   C[i]     = convergent(D[i]) * E[i]
		define void @f_with_convergent() #1 {
		entry:
		; Each global holds an array base pointer; the loaded values are used as
		; i32* GEP bases below, so the loads must produce i32*.
		%a = load i32*, i32** @A, align 8
		%b = load i32*, i32** @B, align 8
		%c = load i32*, i32** @C, align 8
		%d = load i32*, i32** @D, align 8
		%e = load i32*, i32** @E, align 8
		br label %for.body

		for.body: ; preds = %for.body, %entry
		%ind = phi i64 [ 0, %entry ], [ %add, %for.body ]

		%arrayidxA = getelementptr inbounds i32, i32* %a, i64 %ind
		%loadA = load i32, i32* %arrayidxA, align 4

		%arrayidxB = getelementptr inbounds i32, i32* %b, i64 %ind
		%loadB = load i32, i32* %arrayidxB, align 4

		%mulA = mul i32 %loadB, %loadA

		; Store to A[i + 1], one element ahead of the A[i] load above.
		%add = add nuw nsw i64 %ind, 1
		%arrayidxA_plus_4 = getelementptr inbounds i32, i32* %a, i64 %add
		store i32 %mulA, i32* %arrayidxA_plus_4, align 4

		%arrayidxD = getelementptr inbounds i32, i32* %d, i64 %ind
		%loadD = load i32, i32* %arrayidxD, align 4

		%arrayidxE = getelementptr inbounds i32, i32* %e, i64 %ind
		%loadE = load i32, i32* %arrayidxE, align 4

		; The convergent call; the CHECK lines expect it to appear exactly once.
		%convergentD = call i32 @llvm.convergent(i32 %loadD)
		%mulC = mul i32 %convergentD, %loadE

		%arrayidxC = getelementptr inbounds i32, i32* %c, i64 %ind
		store i32 %mulC, i32* %arrayidxC, align 4

		%exitcond = icmp eq i64 %add, 20
		br i1 %exitcond, label %for.end, label %for.body

		for.end: ; preds = %for.body
		ret void
		}

		; Make sure an explicit request for distribution is ignored if it
		; requires possibly illegal checks.

		; CHECK-LABEL: @f_with_convergent_forced_distribute(
		; CHECK: call i32 @llvm.convergent
		; CHECK-NOT: call i32 @llvm.convergent
		; Same loop body as @f_with_convergent, but the back-edge branch carries
		; !llvm.loop !0, which sets llvm.loop.distribute.enable to true.
		define void @f_with_convergent_forced_distribute() #1 {
		entry:
		; Each global holds an array base pointer; the loaded values are used as
		; i32* GEP bases below, so the loads must produce i32*.
		%a = load i32*, i32** @A, align 8
		%b = load i32*, i32** @B, align 8
		%c = load i32*, i32** @C, align 8
		%d = load i32*, i32** @D, align 8
		%e = load i32*, i32** @E, align 8
		br label %for.body

		for.body: ; preds = %for.body, %entry
		%ind = phi i64 [ 0, %entry ], [ %add, %for.body ]

		%arrayidxA = getelementptr inbounds i32, i32* %a, i64 %ind
		%loadA = load i32, i32* %arrayidxA, align 4

		%arrayidxB = getelementptr inbounds i32, i32* %b, i64 %ind
		%loadB = load i32, i32* %arrayidxB, align 4

		%mulA = mul i32 %loadB, %loadA

		; Store to A[i + 1], one element ahead of the A[i] load above.
		%add = add nuw nsw i64 %ind, 1
		%arrayidxA_plus_4 = getelementptr inbounds i32, i32* %a, i64 %add
		store i32 %mulA, i32* %arrayidxA_plus_4, align 4

		%arrayidxD = getelementptr inbounds i32, i32* %d, i64 %ind
		%loadD = load i32, i32* %arrayidxD, align 4

		%arrayidxE = getelementptr inbounds i32, i32* %e, i64 %ind
		%loadE = load i32, i32* %arrayidxE, align 4

		; The convergent call; the CHECK lines expect it to appear exactly once.
		%convergentD = call i32 @llvm.convergent(i32 %loadD)
		%mulC = mul i32 %convergentD, %loadE

		%arrayidxC = getelementptr inbounds i32, i32* %c, i64 %ind
		store i32 %mulC, i32* %arrayidxC, align 4

		%exitcond = icmp eq i64 %add, 20
		br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0

		for.end: ; preds = %for.body
		ret void
		}

		attributes #0 = { nounwind readnone convergent }
		attributes #1 = { nounwind convergent }

		!0 = distinct !{!0, !1}
		!1 = !{!"llvm.loop.distribute.enable", i1 true}