Commit a80ebd01 authored by Carl Ritson

[AMDGPU] Fix llvm.amdgcn.init.exec and frame materialization

Frame-base materialization may insert vector instructions before EXEC is initialised.
Fix this by moving the lowering of llvm.amdgcn.init.exec later in the backend.
Also remove the SI_INIT_EXEC_LO pseudo, as it is not necessary.
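For the wave64 case, the new init_exec_before_frame_materialize test below pins down
the intended ordering. Roughly (a sketch based on the test's CHECK lines, not verbatim
ISA output; exact VALU opcodes depend on the target):

    s_mov_b64 exec, -1    ; lowered llvm.amdgcn.init.exec, before any vector code
    v_mov_b32 ...         ; frame/scratch address materialization
    v_add ...             ; follows only after EXEC is set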

Reviewed By: ruiling

Differential Revision: https://reviews.llvm.org/D94645
parent afd483e5
llvm/include/llvm/IR/IntrinsicsAMDGPU.td  +2 −0
@@ -182,6 +182,8 @@ def int_amdgcn_init_exec : Intrinsic<[],
// Set EXEC according to a thread count packed in an SGPR input:
//    thread_count = (input >> bitoffset) & 0x7f;
// This is always moved to the beginning of the basic block.
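// For example, with bitoffset 8 an input value of 0x2000 yields
//    thread_count = (0x2000 >> 8) & 0x7f = 32,
// i.e. the first 32 lanes are enabled (illustrative values only).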
// Note: only inreg arguments to the parent function are valid as
// inputs to this intrinsic, computed values cannot be used.
def int_amdgcn_init_exec_from_input : Intrinsic<[],
  [llvm_i32_ty,       // 32-bit SGPR input
   llvm_i32_ty],      // bit offset of the thread count
llvm/lib/Target/AMDGPU/SIISelLowering.cpp  +0 −71
@@ -4021,77 +4021,6 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
    MI.eraseFromParent();
    return BB;
  }
  case AMDGPU::SI_INIT_EXEC:
    // This should be before all vector instructions.
    BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
            AMDGPU::EXEC)
        .addImm(MI.getOperand(0).getImm());
    MI.eraseFromParent();
    return BB;

  case AMDGPU::SI_INIT_EXEC_LO:
    // This should be before all vector instructions.
    BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B32),
            AMDGPU::EXEC_LO)
        .addImm(MI.getOperand(0).getImm());
    MI.eraseFromParent();
    return BB;

  case AMDGPU::SI_INIT_EXEC_FROM_INPUT: {
    // Extract the thread count from an SGPR input and set EXEC accordingly.
    // Since BFM can't shift by 64, handle that case with CMP + CMOV.
    //
    // S_BFE_U32 count, input, {shift, 7}
    // S_BFM_B64 exec, count, 0
    // S_CMP_EQ_U32 count, 64
    // S_CMOV_B64 exec, -1
    MachineInstr *FirstMI = &*BB->begin();
    MachineRegisterInfo &MRI = MF->getRegInfo();
    Register InputReg = MI.getOperand(0).getReg();
    Register CountReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    bool Found = false;

    // Move the COPY of the input reg to the beginning, so that we can use it.
    for (auto I = BB->begin(); I != &MI; I++) {
      if (I->getOpcode() != TargetOpcode::COPY ||
          I->getOperand(0).getReg() != InputReg)
        continue;

      if (I == FirstMI) {
        FirstMI = &*++BB->begin();
      } else {
        I->removeFromParent();
        BB->insert(FirstMI, &*I);
      }
      Found = true;
      break;
    }
    assert(Found);
    (void)Found;

    // This should be before all vector instructions.
    unsigned Mask = (getSubtarget()->getWavefrontSize() << 1) - 1;
    bool isWave32 = getSubtarget()->isWave32();
    unsigned Exec = isWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
    BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFE_U32), CountReg)
        .addReg(InputReg)
        .addImm((MI.getOperand(1).getImm() & Mask) | 0x70000);
    BuildMI(*BB, FirstMI, DebugLoc(),
            TII->get(isWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64),
            Exec)
        .addReg(CountReg)
        .addImm(0);
    BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMP_EQ_U32))
        .addReg(CountReg, RegState::Kill)
        .addImm(getSubtarget()->getWavefrontSize());
    BuildMI(*BB, FirstMI, DebugLoc(),
            TII->get(isWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64),
            Exec)
        .addImm(-1);
    MI.eraseFromParent();
    return BB;
  }

  case AMDGPU::GET_GROUPSTATICSIZE: {
    assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
           getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
llvm/lib/Target/AMDGPU/SIInstructions.td  +0 −19
@@ -399,32 +399,13 @@ def SI_INIT_EXEC : SPseudoInstSI <
  (outs), (ins i64imm:$src),
  [(int_amdgcn_init_exec (i64 timm:$src))]> {
  let Defs = [EXEC];
  let usesCustomInserter = 1;
  let isAsCheapAsAMove = 1;
  let WaveSizePredicate = isWave64;
}

// FIXME: Intrinsic should be mangled for wave size.
def SI_INIT_EXEC_LO : SPseudoInstSI <
  (outs), (ins i32imm:$src), []> {
  let Defs = [EXEC_LO];
  let usesCustomInserter = 1;
  let isAsCheapAsAMove = 1;
  let WaveSizePredicate = isWave32;
}

// FIXME: Wave32 version
def SI_INIT_EXEC_FROM_INPUT : SPseudoInstSI <
  (outs), (ins SSrc_b32:$input, i32imm:$shift),
  [(int_amdgcn_init_exec_from_input i32:$input, (i32 timm:$shift))]> {
  let Defs = [EXEC];
  let usesCustomInserter = 1;
}

def : GCNPat <
  (int_amdgcn_init_exec timm:$src),
  (SI_INIT_EXEC_LO (as_i32timm timm:$src))> {
  let WaveSizePredicate = isWave32;
}

// Return for returning shaders to a shader variant epilog.
llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp  +94 −0
@@ -93,6 +93,8 @@ private:

  MachineBasicBlock *emitEndCf(MachineInstr &MI);

  void lowerInitExec(MachineBasicBlock *MBB, MachineInstr &MI);

  void findMaskOperands(MachineInstr &MI, unsigned OpNo,
                        SmallVectorImpl<MachineOperand> &Src) const;

@@ -661,6 +663,90 @@ MachineBasicBlock *SILowerControlFlow::process(MachineInstr &MI) {
  return SplitBB;
}

void SILowerControlFlow::lowerInitExec(MachineBasicBlock *MBB,
                                       MachineInstr &MI) {
  MachineFunction &MF = *MBB->getParent();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  bool IsWave32 = ST.isWave32();

  if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) {
    // This should be before all vector instructions.
    BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
            TII->get(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64), Exec)
        .addImm(MI.getOperand(0).getImm());
    if (LIS)
      LIS->RemoveMachineInstrFromMaps(MI);
    MI.eraseFromParent();
    return;
  }

  // Extract the thread count from an SGPR input and set EXEC accordingly.
  // Since BFM can't shift by 64, handle that case with CMP + CMOV.
  //
  // S_BFE_U32 count, input, {shift, 7}
  // S_BFM_B64 exec, count, 0
  // S_CMP_EQ_U32 count, 64
  // S_CMOV_B64 exec, -1
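  // For example, with a packed thread count of 32 the BFE yields count = 32
  // and the BFM sets exec = (1 << 32) - 1 (the low 32 lanes); only when
  // count == 64, which BFM cannot encode, does the CMP/CMOV pair set exec = -1.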
  Register InputReg = MI.getOperand(0).getReg();
  MachineInstr *FirstMI = &*MBB->begin();
  if (InputReg.isVirtual()) {
    MachineInstr *DefInstr = MRI->getVRegDef(InputReg);
    assert(DefInstr && DefInstr->isCopy());
    if (DefInstr->getParent() == MBB) {
      if (DefInstr != FirstMI) {
        // If the `InputReg` is defined in current block, we also need to
        // move that instruction to the beginning of the block.
        DefInstr->removeFromParent();
        MBB->insert(FirstMI, DefInstr);
        if (LIS)
          LIS->handleMove(*DefInstr);
      } else {
        // If first instruction is definition then move pointer after it.
        FirstMI = &*std::next(FirstMI->getIterator());
      }
    }
  }

  // Insert instruction sequence at block beginning (before vector operations).
  const DebugLoc DL = MI.getDebugLoc();
  const unsigned WavefrontSize = ST.getWavefrontSize();
  const unsigned Mask = (WavefrontSize << 1) - 1;
  Register CountReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  auto BfeMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_BFE_U32), CountReg)
                   .addReg(InputReg)
                   .addImm((MI.getOperand(1).getImm() & Mask) | 0x70000);
  auto BfmMI =
      BuildMI(*MBB, FirstMI, DL,
              TII->get(IsWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64), Exec)
          .addReg(CountReg)
          .addImm(0);
  auto CmpMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
                   .addReg(CountReg, RegState::Kill)
                   .addImm(WavefrontSize);
  auto CmovMI =
      BuildMI(*MBB, FirstMI, DL,
              TII->get(IsWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64),
              Exec)
          .addImm(-1);

  if (!LIS) {
    MI.eraseFromParent();
    return;
  }

  LIS->RemoveMachineInstrFromMaps(MI);
  MI.eraseFromParent();

  LIS->InsertMachineInstrInMaps(*BfeMI);
  LIS->InsertMachineInstrInMaps(*BfmMI);
  LIS->InsertMachineInstrInMaps(*CmpMI);
  LIS->InsertMachineInstrInMaps(*CmovMI);

  LIS->removeInterval(InputReg);
  LIS->createAndComputeVirtRegInterval(InputReg);
  LIS->createAndComputeVirtRegInterval(CountReg);
}

bool SILowerControlFlow::removeMBBifRedundant(MachineBasicBlock &MBB) {
  auto GetFallThroughSucc = [=](MachineBasicBlock *B) -> MachineBasicBlock * {
    auto *S = B->getNextNode();
@@ -781,6 +867,14 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
          SplitMBB = process(MI);
        break;

      // FIXME: find a better place for this
      case AMDGPU::SI_INIT_EXEC:
      case AMDGPU::SI_INIT_EXEC_FROM_INPUT:
        lowerInitExec(MBB, MI);
        if (LIS)
          LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);
        break;

      default:
        break;
      }
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.ll  +111 −0
@@ -84,6 +84,117 @@ main_body:
  unreachable
}

; GCN-LABEL: {{^}}init_exec_before_frame_materialize:
; GCN-NOT: {{^}}v_
; GCN: s_mov_b64 exec, -1
; GCN: v_mov
; GCN: v_add
define amdgpu_ps float @init_exec_before_frame_materialize(i32 inreg %a, i32 inreg %b) {
main_body:
  %array0 = alloca [1024 x i32], align 16, addrspace(5)
  %array1 = alloca [20 x i32], align 16, addrspace(5)
  call void @llvm.amdgcn.init.exec(i64 -1)

  %ptr0 = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(5)* %array0, i32 0, i32 1
  store i32 %a, i32 addrspace(5)* %ptr0, align 4

  %ptr1 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 1
  store i32 %a, i32 addrspace(5)* %ptr1, align 4

  %ptr2 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 2
  store i32 %b, i32 addrspace(5)* %ptr2, align 4

  %ptr3 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 %b
  %v3 = load i32, i32 addrspace(5)* %ptr3, align 4

  %ptr4 = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(5)* %array0, i32 0, i32 %b
  %v4 = load i32, i32 addrspace(5)* %ptr4, align 4

  %v5 = add i32 %v3, %v4
  %v = bitcast i32 %v5 to float
  ret float %v
}

; GCN-LABEL: {{^}}init_exec_input_before_frame_materialize:
; GCN-NOT: {{^}}v_
; GCN: s_bfe_u32 s2, s2, 0x70008
; GCN-NEXT: s_bfm_b64 exec, s2, 0
; GCN-NEXT: s_cmp_eq_u32 s2, 64
; GCN-NEXT: s_cmov_b64 exec, -1
; GCN: v_mov
; GCN: v_add
define amdgpu_ps float @init_exec_input_before_frame_materialize(i32 inreg %a, i32 inreg %b, i32 inreg %count) {
main_body:
  %array0 = alloca [1024 x i32], align 16, addrspace(5)
  %array1 = alloca [20 x i32], align 16, addrspace(5)
  call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 8)

  %ptr0 = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(5)* %array0, i32 0, i32 1
  store i32 %a, i32 addrspace(5)* %ptr0, align 4

  %ptr1 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 1
  store i32 %a, i32 addrspace(5)* %ptr1, align 4

  %ptr2 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 2
  store i32 %b, i32 addrspace(5)* %ptr2, align 4

  %ptr3 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 %b
  %v3 = load i32, i32 addrspace(5)* %ptr3, align 4

  %ptr4 = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(5)* %array0, i32 0, i32 %b
  %v4 = load i32, i32 addrspace(5)* %ptr4, align 4

  %v5 = add i32 %v3, %v4
  %v = bitcast i32 %v5 to float
  ret float %v
}

; GCN-LABEL: {{^}}init_exec_input_before_frame_materialize_nonentry:
; GCN-NOT: {{^}}v_
; GCN: %endif
; GCN: s_bfe_u32 s3, s2, 0x70008
; GCN-NEXT: s_bfm_b64 exec, s3, 0
; GCN-NEXT: s_cmp_eq_u32 s3, 64
; GCN-NEXT: s_cmov_b64 exec, -1
; GCN: v_mov
; GCN: v_add
define amdgpu_ps float @init_exec_input_before_frame_materialize_nonentry(i32 inreg %a, i32 inreg %b, i32 inreg %count) {
main_body:
  ; ideally these alloca would be in %endif, but this causes problems on Windows GlobalISel
  %array0 = alloca [1024 x i32], align 16, addrspace(5)
  %array1 = alloca [20 x i32], align 16, addrspace(5)

  %cc = icmp uge i32 %count, 32
  br i1 %cc, label %endif, label %if

if:
  call void asm sideeffect "", ""()
  br label %endif

endif:
  call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 8)

  %ptr0 = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(5)* %array0, i32 0, i32 1
  store i32 %a, i32 addrspace(5)* %ptr0, align 4

  %ptr1 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 1
  store i32 %a, i32 addrspace(5)* %ptr1, align 4

  %ptr2 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 2
  store i32 %b, i32 addrspace(5)* %ptr2, align 4

  %ptr3 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 %b
  %v3 = load i32, i32 addrspace(5)* %ptr3, align 4

  %ptr4 = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(5)* %array0, i32 0, i32 %b
  %v4 = load i32, i32 addrspace(5)* %ptr4, align 4

  %v5 = add i32 %v3, %v4
  %v6 = add i32 %v5, %count
  %v = bitcast i32 %v6 to float
  ret float %v
}

declare void @llvm.amdgcn.init.exec(i64) #1
declare void @llvm.amdgcn.init.exec.from.input(i32, i32) #1