Loading llvm/include/llvm/IR/IntrinsicsAMDGPU.td +6 −0 Original line number Diff line number Diff line Loading @@ -1944,6 +1944,12 @@ def int_amdgcn_s_bitreplicate : def int_amdgcn_s_quadmask : DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyint_ty], [IntrNoMem, IntrConvergent]>; // Lowers to S_WQM_B{32,64} // The argument must be uniform; otherwise, the result is undefined. // Does not set WQM; merely calculates the bitmask. def int_amdgcn_s_wqm : DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyint_ty], [IntrNoMem, IntrConvergent]>; class AMDGPUWaveReduce<LLVMType data_ty = llvm_anyint_ty> : Intrinsic< [data_ty], [ Loading llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +3 −1 Original line number Diff line number Diff line Loading @@ -2996,6 +2996,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl( case Intrinsic::amdgcn_inverse_ballot: case Intrinsic::amdgcn_s_bitreplicate: case Intrinsic::amdgcn_s_quadmask: case Intrinsic::amdgcn_s_wqm: applyDefaultMapping(OpdMapper); constrainOpWithReadfirstlane(B, MI, 2); // Mask return; Loading Loading @@ -4541,7 +4542,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, MaskSize); break; } case Intrinsic::amdgcn_s_quadmask: { case Intrinsic::amdgcn_s_quadmask: case Intrinsic::amdgcn_s_wqm: { Register MaskReg = MI.getOperand(2).getReg(); unsigned MaskSize = MRI.getType(MaskReg).getSizeInBits(); unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID); Loading llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +4 −2 Original line number Diff line number Diff line Loading @@ -6484,10 +6484,12 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI, return CreatedBB; } // Legalize S_BITREPLICATE and S_QUADMASK // Legalize S_BITREPLICATE, S_QUADMASK and S_WQM if (MI.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32 || MI.getOpcode() == AMDGPU::S_QUADMASK_B32 || MI.getOpcode() == AMDGPU::S_QUADMASK_B64) { MI.getOpcode() == AMDGPU::S_QUADMASK_B64 || MI.getOpcode() == AMDGPU::S_WQM_B32 || MI.getOpcode() == AMDGPU::S_WQM_B64) { MachineOperand &Src = MI.getOperand(1); if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg()))) Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI)); Loading llvm/lib/Target/AMDGPU/SOPInstructions.td +4 −2 Original line number Diff line number Diff line Loading @@ -216,8 +216,10 @@ let Defs = [SCC] in { def S_NOT_B64 : SOP1_64 <"s_not_b64", [(set i64:$sdst, (UniformUnaryFrag<not> i64:$src0))] >; def S_WQM_B32 : SOP1_32 <"s_wqm_b32">; def S_WQM_B64 : SOP1_64 <"s_wqm_b64">; def S_WQM_B32 : SOP1_32 <"s_wqm_b32", [(set i32:$sdst, (int_amdgcn_s_wqm i32:$src0))]>; def S_WQM_B64 : SOP1_64 <"s_wqm_b64", [(set i64:$sdst, (int_amdgcn_s_wqm i64:$src0))]>; } // End Defs = [SCC] Loading llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.ll 0 → 100644 +87 −0 Original line number Diff line number Diff line ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s ; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s declare i32 @llvm.amdgcn.s.wqm.i32(i32) declare i64 @llvm.amdgcn.s.wqm.i64(i64) define i32 @test_s_wqm_constant_i32() { ; GFX11-LABEL: test_s_wqm_constant_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_wqm_b32 s0, 0x85fe3a92 ; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %br = call i32 @llvm.amdgcn.s.wqm.i32(i32 u0x85FE3A92) ret i32 %br } define amdgpu_cs void @test_s_wqm_sgpr_i32(i32 inreg %mask, ptr addrspace(1) %out) { ; GFX11-LABEL: test_s_wqm_sgpr_i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_wqm_b32 s0, s0 ; GFX11-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: %br = call i32 @llvm.amdgcn.s.wqm.i32(i32 %mask) store i32 %br, ptr addrspace(1) %out ret void } define i32 @test_s_wqm_vgpr_i32(i32 %mask) { ; GFX11-LABEL: test_s_wqm_vgpr_i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: s_wqm_b32 s0, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %br = call i32 @llvm.amdgcn.s.wqm.i32(i32 %mask) ret i32 %br } define i64 @test_s_wqm_constant_i64() { ; GFX11-LABEL: test_s_wqm_constant_i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s0, 0x85fe3a92 ; GFX11-NEXT: s_mov_b32 s1, 0x3a9285fe ; GFX11-NEXT: s_wqm_b64 s[0:1], s[0:1] ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %br = call i64 @llvm.amdgcn.s.wqm.i64(i64 u0x3A9285FE85FE3A92) ret i64 %br } define amdgpu_cs void @test_s_wqm_sgpr_i64(i64 inreg %mask, ptr addrspace(1) %out) { ; GFX11-LABEL: test_s_wqm_sgpr_i64: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_wqm_b64 s[0:1], s[0:1] ; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: %br = call i64 @llvm.amdgcn.s.wqm.i64(i64 %mask) store i64 %br, ptr addrspace(1) %out ret void } define i64 @test_s_wqm_vgpr_i64(i64 %mask) { ; GFX11-LABEL: test_s_wqm_vgpr_i64: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: v_readfirstlane_b32 s1, v1 ; GFX11-NEXT: s_wqm_b64 s[0:1], s[0:1] ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %br = call i64 @llvm.amdgcn.s.wqm.i64(i64 %mask) ret i64 %br } Loading
llvm/include/llvm/IR/IntrinsicsAMDGPU.td +6 −0 Original line number Diff line number Diff line Loading @@ -1944,6 +1944,12 @@ def int_amdgcn_s_bitreplicate : def int_amdgcn_s_quadmask : DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyint_ty], [IntrNoMem, IntrConvergent]>; // Lowers to S_WQM_B{32,64} // The argument must be uniform; otherwise, the result is undefined. // Does not set WQM; merely calculates the bitmask. def int_amdgcn_s_wqm : DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyint_ty], [IntrNoMem, IntrConvergent]>; class AMDGPUWaveReduce<LLVMType data_ty = llvm_anyint_ty> : Intrinsic< [data_ty], [ Loading
llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +3 −1 Original line number Diff line number Diff line Loading @@ -2996,6 +2996,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl( case Intrinsic::amdgcn_inverse_ballot: case Intrinsic::amdgcn_s_bitreplicate: case Intrinsic::amdgcn_s_quadmask: case Intrinsic::amdgcn_s_wqm: applyDefaultMapping(OpdMapper); constrainOpWithReadfirstlane(B, MI, 2); // Mask return; Loading Loading @@ -4541,7 +4542,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, MaskSize); break; } case Intrinsic::amdgcn_s_quadmask: { case Intrinsic::amdgcn_s_quadmask: case Intrinsic::amdgcn_s_wqm: { Register MaskReg = MI.getOperand(2).getReg(); unsigned MaskSize = MRI.getType(MaskReg).getSizeInBits(); unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID); Loading
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +4 −2 Original line number Diff line number Diff line Loading @@ -6484,10 +6484,12 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI, return CreatedBB; } // Legalize S_BITREPLICATE and S_QUADMASK // Legalize S_BITREPLICATE, S_QUADMASK and S_WQM if (MI.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32 || MI.getOpcode() == AMDGPU::S_QUADMASK_B32 || MI.getOpcode() == AMDGPU::S_QUADMASK_B64) { MI.getOpcode() == AMDGPU::S_QUADMASK_B64 || MI.getOpcode() == AMDGPU::S_WQM_B32 || MI.getOpcode() == AMDGPU::S_WQM_B64) { MachineOperand &Src = MI.getOperand(1); if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg()))) Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI)); Loading
llvm/lib/Target/AMDGPU/SOPInstructions.td +4 −2 Original line number Diff line number Diff line Loading @@ -216,8 +216,10 @@ let Defs = [SCC] in { def S_NOT_B64 : SOP1_64 <"s_not_b64", [(set i64:$sdst, (UniformUnaryFrag<not> i64:$src0))] >; def S_WQM_B32 : SOP1_32 <"s_wqm_b32">; def S_WQM_B64 : SOP1_64 <"s_wqm_b64">; def S_WQM_B32 : SOP1_32 <"s_wqm_b32", [(set i32:$sdst, (int_amdgcn_s_wqm i32:$src0))]>; def S_WQM_B64 : SOP1_64 <"s_wqm_b64", [(set i64:$sdst, (int_amdgcn_s_wqm i64:$src0))]>; } // End Defs = [SCC] Loading
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.ll 0 → 100644 +87 −0 Original line number Diff line number Diff line ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s ; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s declare i32 @llvm.amdgcn.s.wqm.i32(i32) declare i64 @llvm.amdgcn.s.wqm.i64(i64) define i32 @test_s_wqm_constant_i32() { ; GFX11-LABEL: test_s_wqm_constant_i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_wqm_b32 s0, 0x85fe3a92 ; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %br = call i32 @llvm.amdgcn.s.wqm.i32(i32 u0x85FE3A92) ret i32 %br } define amdgpu_cs void @test_s_wqm_sgpr_i32(i32 inreg %mask, ptr addrspace(1) %out) { ; GFX11-LABEL: test_s_wqm_sgpr_i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_wqm_b32 s0, s0 ; GFX11-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: %br = call i32 @llvm.amdgcn.s.wqm.i32(i32 %mask) store i32 %br, ptr addrspace(1) %out ret void } define i32 @test_s_wqm_vgpr_i32(i32 %mask) { ; GFX11-LABEL: test_s_wqm_vgpr_i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: s_wqm_b32 s0, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %br = call i32 @llvm.amdgcn.s.wqm.i32(i32 %mask) ret i32 %br } define i64 @test_s_wqm_constant_i64() { ; GFX11-LABEL: test_s_wqm_constant_i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s0, 0x85fe3a92 ; GFX11-NEXT: s_mov_b32 s1, 0x3a9285fe ; GFX11-NEXT: s_wqm_b64 s[0:1], s[0:1] ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %br = call i64 @llvm.amdgcn.s.wqm.i64(i64 u0x3A9285FE85FE3A92) ret i64 %br } define amdgpu_cs void @test_s_wqm_sgpr_i64(i64 inreg %mask, ptr addrspace(1) %out) { ; GFX11-LABEL: test_s_wqm_sgpr_i64: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_wqm_b64 s[0:1], s[0:1] ; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: %br = call i64 @llvm.amdgcn.s.wqm.i64(i64 %mask) store i64 %br, ptr addrspace(1) %out ret void } define i64 @test_s_wqm_vgpr_i64(i64 %mask) { ; GFX11-LABEL: test_s_wqm_vgpr_i64: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: v_readfirstlane_b32 s1, v1 ; GFX11-NEXT: s_wqm_b64 s[0:1], s[0:1] ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %br = call i64 @llvm.amdgcn.s.wqm.i64(i64 %mask) ret i64 %br }