llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp  +6 −73

@@ -34,7 +34,6 @@
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Type.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
@@ -118,58 +117,24 @@ static bool isUniformlyReached(const LegacyDivergenceAnalysis &DA,
   return true;
 }
 
-static void removeDoneExport(Function &F) {
-  ConstantInt *BoolFalse = ConstantInt::getFalse(F.getContext());
-  for (BasicBlock &BB : F) {
-    for (Instruction &I : BB) {
-      if (IntrinsicInst *Intrin = llvm::dyn_cast<IntrinsicInst>(&I)) {
-        if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp) {
-          Intrin->setArgOperand(6, BoolFalse); // done
-        } else if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp_compr) {
-          Intrin->setArgOperand(4, BoolFalse); // done
-        }
-      }
-    }
-  }
-}
-
 static BasicBlock *unifyReturnBlockSet(Function &F,
                                        ArrayRef<BasicBlock *> ReturningBlocks,
-                                       bool InsertExport,
                                        const TargetTransformInfo &TTI,
                                        StringRef Name) {
   // Otherwise, we need to insert a new basic block into the function, add a PHI
   // nodes (if the function returns values), and convert all of the return
   // instructions into unconditional branches.
   BasicBlock *NewRetBlock = BasicBlock::Create(F.getContext(), Name, &F);
-  IRBuilder<> B(NewRetBlock);
-
-  if (InsertExport) {
-    // Ensure that there's only one "done" export in the shader by removing the
-    // "done" bit set on the original final export. More than one "done" export
-    // can lead to undefined behavior.
-    removeDoneExport(F);
-
-    Value *Undef = UndefValue::get(B.getFloatTy());
-    B.CreateIntrinsic(Intrinsic::amdgcn_exp, { B.getFloatTy() },
-                      {
-                        B.getInt32(9), // target, SQ_EXP_NULL
-                        B.getInt32(0), // enabled channels
-                        Undef, Undef, Undef, Undef, // values
-                        B.getTrue(), // done
-                        B.getTrue(), // valid mask
-                      });
-  }
 
   PHINode *PN = nullptr;
   if (F.getReturnType()->isVoidTy()) {
-    B.CreateRetVoid();
+    ReturnInst::Create(F.getContext(), nullptr, NewRetBlock);
   } else {
     // If the function doesn't return void... add a PHI node to the block...
-    PN = B.CreatePHI(F.getReturnType(), ReturningBlocks.size(),
-                     "UnifiedRetVal");
-    assert(!InsertExport);
-    B.CreateRet(PN);
+    PN = PHINode::Create(F.getReturnType(), ReturningBlocks.size(),
+                         "UnifiedRetVal");
+    NewRetBlock->getInstList().push_back(PN);
+    ReturnInst::Create(F.getContext(), PN, NewRetBlock);
   }
 
   // Loop over all of the blocks, replacing the return instruction with an
@@ -208,8 +173,6 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
   // Dummy return block for infinite loop.
   BasicBlock *DummyReturnBB = nullptr;
 
-  bool InsertExport = false;
-
   for (BasicBlock *BB : PDT.getRoots()) {
     if (isa<ReturnInst>(BB->getTerminator())) {
       if (!isUniformlyReached(DA, *BB))
@@ -225,36 +188,6 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
                                          "DummyReturnBlock", &F);
       Type *RetTy = F.getReturnType();
       Value *RetVal = RetTy->isVoidTy() ? nullptr : UndefValue::get(RetTy);
-
-      // For pixel shaders, the producer guarantees that an export is
-      // executed before each return instruction. However, if there is an
-      // infinite loop and we insert a return ourselves, we need to uphold
-      // that guarantee by inserting a null export. This can happen e.g. in
-      // an infinite loop with kill instructions, which is supposed to
-      // terminate. However, we don't need to do this if there is a non-void
-      // return value, since then there is an epilog afterwards which will
-      // still export.
-      //
-      // Note: In the case where only some threads enter the infinite loop,
-      // this can result in the null export happening redundantly after the
-      // original exports. However, The last "real" export happens after all
-      // the threads that didn't enter an infinite loop converged, which
-      // means that the only extra threads to execute the null export are
-      // threads that entered the infinite loop, and they only could've
-      // exited through being killed which sets their exec bit to 0.
-      // Therefore, unless there's an actual infinite loop, which can have
-      // invalid results, or there's a kill after the last export, which we
-      // assume the frontend won't do, this export will have the same exec
-      // mask as the last "real" export, and therefore the valid mask will be
-      // overwritten with the same value and will still be correct. Also,
-      // even though this forces an extra unnecessary export wait, we assume
-      // that this happens rare enough in practice to that we don't have to
-      // worry about performance.
-      if (F.getCallingConv() == CallingConv::AMDGPU_PS && RetTy->isVoidTy()) {
-        InsertExport = true;
-      }
-
       ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB);
       ReturningBlocks.push_back(DummyReturnBB);
     }
@@ -327,6 +260,6 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
   const TargetTransformInfo &TTI =
       getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
 
-  unifyReturnBlockSet(F, ReturningBlocks, InsertExport, TTI,
-                      "UnifiedReturnBlock");
+  unifyReturnBlockSet(F, ReturningBlocks, TTI, "UnifiedReturnBlock");
   return true;
 }
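For context, the code removed above cleared the "done" bit on every existing llvm.amdgcn.exp / llvm.amdgcn.exp.compr call (operand 6 and 4 respectively) and then appended one null export (target 9 = SQ_EXP_NULL, no enabled channels, done and valid-mask set) in the unified return block of a void AMDGPU_PS function. Below is a minimal IR sketch of that shape; it is not part of this patch, and the function name and block layout are made up for illustration.

```llvm
; Sketch only: the IR shape the removed InsertExport path produced for a void
; AMDGPU_PS shader. @ps_sketch and the block layout are illustrative.
define amdgpu_ps void @ps_sketch(float %r) {
entry:
  ; Original final export: its "done" flag (operand 6) was rewritten to false
  ; by the removed removeDoneExport() helper.
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r, float %r, float %r,
                                 float 1.000000e+00, i1 false, i1 true)
  br label %UnifiedReturnBlock

UnifiedReturnBlock:
  ; Null export appended by the removed code: target 9 (SQ_EXP_NULL), no
  ; enabled channels, undef values, done = true, valid mask = true.
  call void @llvm.amdgcn.exp.f32(i32 9, i32 0, float undef, float undef,
                                 float undef, float undef, i1 true, i1 true)
  ret void
}

declare void @llvm.amdgcn.exp.f32(i32 immarg, i32 immarg, float, float, float,
                                  float, i1 immarg, i1 immarg)
```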
llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll  deleted (file mode 100644 → 0)  +0 −68

@@ -1,68 +0,0 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope %s
-; Although it's modeled without any control flow in order to get better code
-; out of the structurizer, @llvm.amdgcn.kill actually ends the thread that calls
-; it with "true". In case it's called in a provably infinite loop, we still
-; need to successfully exit and export something, even if we can't know where
-; to jump to in the LLVM IR. Therefore we insert a null export ourselves in
-; this case right before the s_endpgm to avoid GPU hangs, which is what this
-; tests.
-
-; CHECK-LABEL: return_void
-; Make sure that we remove the done bit from the original export
-; CHECK: exp mrt0 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} vm
-; CHECK: exp null off, off, off, off done vm
-; CHECK-NEXT: s_endpgm
-define amdgpu_ps void @return_void(float %0) #0 {
-main_body:
-  %cmp = fcmp olt float %0, 1.000000e+01
-  br i1 %cmp, label %end, label %loop
-
-loop:
-  call void @llvm.amdgcn.kill(i1 false) #3
-  br label %loop
-
-end:
-  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float 0., float 0., float 0., float 1., i1 true, i1 true) #3
-  ret void
-}
-
-; Check that we also remove the done bit from compressed exports correctly.
-; CHECK-LABEL: return_void_compr
-; CHECK: exp mrt0 v{{[0-9]+}}, off, v{{[0-9]+}}, off compr vm
-; CHECK: exp null off, off, off, off done vm
-; CHECK-NEXT: s_endpgm
-define amdgpu_ps void @return_void_compr(float %0) #0 {
-main_body:
-  %cmp = fcmp olt float %0, 1.000000e+01
-  br i1 %cmp, label %end, label %loop
-
-loop:
-  call void @llvm.amdgcn.kill(i1 false) #3
-  br label %loop
-
-end:
-  call void @llvm.amdgcn.exp.compr.v2i16(i32 0, i32 5, <2 x i16> < i16 0, i16 0 >, <2 x i16> < i16 0, i16 0 >, i1 true, i1 true) #3
-  ret void
-}
-
-; In case there's an epilog, we shouldn't have to do this.
-; CHECK-LABEL: return_nonvoid
-; CHECK-NOT: exp null off, off, off, off done vm
-define amdgpu_ps float @return_nonvoid(float %0) #0 {
-main_body:
-  %cmp = fcmp olt float %0, 1.000000e+01
-  br i1 %cmp, label %end, label %loop
-
-loop:
-  call void @llvm.amdgcn.kill(i1 false) #3
-  br label %loop
-
-end:
-  ret float 0.
-}
-
-declare void @llvm.amdgcn.kill(i1) #0
-declare void @llvm.amdgcn.exp.f32(i32 immarg, i32 immarg, float, float, float, float, i1 immarg, i1 immarg) #0
-declare void @llvm.amdgcn.exp.compr.v2i16(i32 immarg, i32 immarg, <2 x i16>, <2 x i16>, i1 immarg, i1 immarg) #0
-attributes #0 = { nounwind }
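With this revert, the pass again only unifies the divergent exits at the IR level and leaves exports untouched. The sketch below shows roughly what that unification looks like for the @return_void case from the deleted test; the dummy conditional edge and the exact block contents are an assumption based on the pass's usual behavior (only the DummyReturnBlock and UnifiedReturnBlock names come from the code above), so the real output may differ in detail.

```llvm
; Sketch only: approximate IR after AMDGPUUnifyDivergentExitNodes, with no
; null export inserted. Exact output may differ.
define amdgpu_ps void @return_void(float %0) {
main_body:
  %cmp = fcmp olt float %0, 1.000000e+01
  br i1 %cmp, label %end, label %loop

loop:
  call void @llvm.amdgcn.kill(i1 false)
  ; The unconditional back edge is replaced with a constant-true conditional
  ; branch so the infinite loop gains an exit edge to the dummy return block.
  br i1 true, label %loop, label %DummyReturnBlock

DummyReturnBlock:
  br label %UnifiedReturnBlock

end:
  ; The original export keeps its done bit; nothing is rewritten anymore.
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float 0.000000e+00,
                                 float 0.000000e+00, float 0.000000e+00,
                                 float 1.000000e+00, i1 true, i1 true)
  br label %UnifiedReturnBlock

UnifiedReturnBlock:
  ret void
}

declare void @llvm.amdgcn.kill(i1)
declare void @llvm.amdgcn.exp.f32(i32 immarg, i32 immarg, float, float, float,
                                  float, i1 immarg, i1 immarg)
```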