llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp  +6 −73

@@ -34,7 +34,6 @@
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Type.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
@@ -118,58 +117,24 @@ static bool isUniformlyReached(const LegacyDivergenceAnalysis &DA,
   return true;
 }
 
-static void removeDoneExport(Function &F) {
-  ConstantInt *BoolFalse = ConstantInt::getFalse(F.getContext());
-  for (BasicBlock &BB : F) {
-    for (Instruction &I : BB) {
-      if (IntrinsicInst *Intrin = llvm::dyn_cast<IntrinsicInst>(&I)) {
-        if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp) {
-          Intrin->setArgOperand(6, BoolFalse); // done
-        } else if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp_compr) {
-          Intrin->setArgOperand(4, BoolFalse); // done
-        }
-      }
-    }
-  }
-}
-
 static BasicBlock *unifyReturnBlockSet(Function &F,
                                        ArrayRef<BasicBlock *> ReturningBlocks,
-                                       bool InsertExport,
                                        const TargetTransformInfo &TTI,
                                        StringRef Name) {
   // Otherwise, we need to insert a new basic block into the function, add a PHI
   // nodes (if the function returns values), and convert all of the return
   // instructions into unconditional branches.
   BasicBlock *NewRetBlock = BasicBlock::Create(F.getContext(), Name, &F);
-  IRBuilder<> B(NewRetBlock);
-
-  if (InsertExport) {
-    // Ensure that there's only one "done" export in the shader by removing the
-    // "done" bit set on the original final export. More than one "done" export
-    // can lead to undefined behavior.
-    removeDoneExport(F);
-
-    Value *Undef = UndefValue::get(B.getFloatTy());
-    B.CreateIntrinsic(Intrinsic::amdgcn_exp, { B.getFloatTy() },
-                      {
-                        B.getInt32(9), // target, SQ_EXP_NULL
-                        B.getInt32(0), // enabled channels
-                        Undef, Undef, Undef, Undef, // values
-                        B.getTrue(), // done
-                        B.getTrue(), // valid mask
-                      });
-  }
 
   PHINode *PN = nullptr;
   if (F.getReturnType()->isVoidTy()) {
-    B.CreateRetVoid();
+    ReturnInst::Create(F.getContext(), nullptr, NewRetBlock);
   } else {
     // If the function doesn't return void... add a PHI node to the block...
-    PN = B.CreatePHI(F.getReturnType(), ReturningBlocks.size(),
-                     "UnifiedRetVal");
-    assert(!InsertExport);
-    B.CreateRet(PN);
+    PN = PHINode::Create(F.getReturnType(), ReturningBlocks.size(),
+                         "UnifiedRetVal");
+    NewRetBlock->getInstList().push_back(PN);
+    ReturnInst::Create(F.getContext(), PN, NewRetBlock);
   }
 
   // Loop over all of the blocks, replacing the return instruction with an
@@ -208,8 +173,6 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
   // Dummy return block for infinite loop.
   BasicBlock *DummyReturnBB = nullptr;
 
-  bool InsertExport = false;
-
   for (BasicBlock *BB : PDT.getRoots()) {
     if (isa<ReturnInst>(BB->getTerminator())) {
       if (!isUniformlyReached(DA, *BB))
@@ -225,36 +188,6 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
                                          "DummyReturnBlock", &F);
       Type *RetTy = F.getReturnType();
       Value *RetVal = RetTy->isVoidTy() ? nullptr : UndefValue::get(RetTy);
-
-      // For pixel shaders, the producer guarantees that an export is
-      // executed before each return instruction. However, if there is an
-      // infinite loop and we insert a return ourselves, we need to uphold
-      // that guarantee by inserting a null export. This can happen e.g. in
-      // an infinite loop with kill instructions, which is supposed to
-      // terminate. However, we don't need to do this if there is a non-void
-      // return value, since then there is an epilog afterwards which will
-      // still export.
-      //
-      // Note: In the case where only some threads enter the infinite loop,
-      // this can result in the null export happening redundantly after the
-      // original exports. However, The last "real" export happens after all
-      // the threads that didn't enter an infinite loop converged, which
-      // means that the only extra threads to execute the null export are
-      // threads that entered the infinite loop, and they only could've
-      // exited through being killed which sets their exec bit to 0.
-      // Therefore, unless there's an actual infinite loop, which can have
-      // invalid results, or there's a kill after the last export, which we
-      // assume the frontend won't do, this export will have the same exec
-      // mask as the last "real" export, and therefore the valid mask will be
-      // overwritten with the same value and will still be correct. Also,
-      // even though this forces an extra unnecessary export wait, we assume
-      // that this happens rare enough in practice to that we don't have to
-      // worry about performance.
-      if (F.getCallingConv() == CallingConv::AMDGPU_PS && RetTy->isVoidTy()) {
-        InsertExport = true;
-      }
-
       ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB);
       ReturningBlocks.push_back(DummyReturnBB);
     }
@@ -327,6 +260,6 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
   const TargetTransformInfo &TTI =
       getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
 
-  unifyReturnBlockSet(F, ReturningBlocks, InsertExport, TTI,
-                      "UnifiedReturnBlock");
+  unifyReturnBlockSet(F, ReturningBlocks, TTI, "UnifiedReturnBlock");
   return true;
 }
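For context, the code removed above cleared the "done" bit on every existing llvm.amdgcn.exp / llvm.amdgcn.exp.compr call (operand 6 and 4 respectively) and then appended one null export (target 9 = SQ_EXP_NULL, no enabled channels, done and valid-mask set) in the unified return block of a void AMDGPU_PS function. Below is a minimal IR sketch of that shape; it is not part of this patch, and the function name and block layout are made up for illustration.

```llvm
; Sketch only: the IR shape the removed InsertExport path produced for a void
; AMDGPU_PS shader. @ps_sketch and the block layout are illustrative.
define amdgpu_ps void @ps_sketch(float %r) {
entry:
  ; Original final export: its "done" flag (operand 6) was rewritten to false
  ; by the removed removeDoneExport() helper.
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r, float %r, float %r,
                                 float 1.000000e+00, i1 false, i1 true)
  br label %UnifiedReturnBlock

UnifiedReturnBlock:
  ; Null export appended by the removed code: target 9 (SQ_EXP_NULL), no
  ; enabled channels, undef values, done = true, valid mask = true.
  call void @llvm.amdgcn.exp.f32(i32 9, i32 0, float undef, float undef,
                                 float undef, float undef, i1 true, i1 true)
  ret void
}

declare void @llvm.amdgcn.exp.f32(i32 immarg, i32 immarg, float, float, float,
                                  float, i1 immarg, i1 immarg)
```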
llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll  deleted (file mode 100644 → 0)  +0 −68

@@ -1,68 +0,0 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope %s
-; Although it's modeled without any control flow in order to get better code
-; out of the structurizer, @llvm.amdgcn.kill actually ends the thread that calls
-; it with "true". In case it's called in a provably infinite loop, we still
-; need to successfully exit and export something, even if we can't know where
-; to jump to in the LLVM IR. Therefore we insert a null export ourselves in
-; this case right before the s_endpgm to avoid GPU hangs, which is what this
-; tests.
-
-; CHECK-LABEL: return_void
-; Make sure that we remove the done bit from the original export
-; CHECK: exp mrt0 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} vm
-; CHECK: exp null off, off, off, off done vm
-; CHECK-NEXT: s_endpgm
-define amdgpu_ps void @return_void(float %0) #0 {
-main_body:
-  %cmp = fcmp olt float %0, 1.000000e+01
-  br i1 %cmp, label %end, label %loop
-
-loop:
-  call void @llvm.amdgcn.kill(i1 false) #3
-  br label %loop
-
-end:
-  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float 0., float 0., float 0., float 1., i1 true, i1 true) #3
-  ret void
-}
-
-; Check that we also remove the done bit from compressed exports correctly.
-; CHECK-LABEL: return_void_compr
-; CHECK: exp mrt0 v{{[0-9]+}}, off, v{{[0-9]+}}, off compr vm
-; CHECK: exp null off, off, off, off done vm
-; CHECK-NEXT: s_endpgm
-define amdgpu_ps void @return_void_compr(float %0) #0 {
-main_body:
-  %cmp = fcmp olt float %0, 1.000000e+01
-  br i1 %cmp, label %end, label %loop
-
-loop:
-  call void @llvm.amdgcn.kill(i1 false) #3
-  br label %loop
-
-end:
-  call void @llvm.amdgcn.exp.compr.v2i16(i32 0, i32 5, <2 x i16> < i16 0, i16 0 >, <2 x i16> < i16 0, i16 0 >, i1 true, i1 true) #3
-  ret void
-}
-
-; In case there's an epilog, we shouldn't have to do this.
-; CHECK-LABEL: return_nonvoid
-; CHECK-NOT: exp null off, off, off, off done vm
-define amdgpu_ps float @return_nonvoid(float %0) #0 {
-main_body:
-  %cmp = fcmp olt float %0, 1.000000e+01
-  br i1 %cmp, label %end, label %loop
-
-loop:
-  call void @llvm.amdgcn.kill(i1 false) #3
-  br label %loop
-
-end:
-  ret float 0.
-}
-
-declare void @llvm.amdgcn.kill(i1) #0
-declare void @llvm.amdgcn.exp.f32(i32 immarg, i32 immarg, float, float, float, float, i1 immarg, i1 immarg) #0
-declare void @llvm.amdgcn.exp.compr.v2i16(i32 immarg, i32 immarg, <2 x i16>, <2 x i16>, i1 immarg, i1 immarg) #0
-attributes #0 = { nounwind }
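With this revert, the pass again only unifies the divergent exits at the IR level and leaves exports untouched. The sketch below shows roughly what that unification looks like for the @return_void case from the deleted test; the dummy conditional edge and the exact block contents are an assumption based on the pass's usual behavior (only the DummyReturnBlock and UnifiedReturnBlock names come from the code above), so the real output may differ in detail.

```llvm
; Sketch only: approximate IR after AMDGPUUnifyDivergentExitNodes, with no
; null export inserted. Exact output may differ.
define amdgpu_ps void @return_void(float %0) {
main_body:
  %cmp = fcmp olt float %0, 1.000000e+01
  br i1 %cmp, label %end, label %loop

loop:
  call void @llvm.amdgcn.kill(i1 false)
  ; The unconditional back edge is replaced with a constant-true conditional
  ; branch so the infinite loop gains an exit edge to the dummy return block.
  br i1 true, label %loop, label %DummyReturnBlock

DummyReturnBlock:
  br label %UnifiedReturnBlock

end:
  ; The original export keeps its done bit; nothing is rewritten anymore.
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float 0.000000e+00,
                                 float 0.000000e+00, float 0.000000e+00,
                                 float 1.000000e+00, i1 true, i1 true)
  br label %UnifiedReturnBlock

UnifiedReturnBlock:
  ret void
}

declare void @llvm.amdgcn.kill(i1)
declare void @llvm.amdgcn.exp.f32(i32 immarg, i32 immarg, float, float, float,
                                  float, i1 immarg, i1 immarg)
```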