Commit c80d8a8c authored by Stanislav Mekhanoshin

[AMDGPU] MachineLICM cannot hoist VALU

MachineLoop::isLoopInvariant() returns false for all VALU
instructions because of the implicit exec use. Check
TII::isIgnorableUse() to allow hoisting.
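
For reference, the AMDGPU override that pairs with this hook is
essentially a one-liner treating the implicit exec read on a VALU
instruction as ignorable. A minimal sketch, assuming it lives in
SIInstrInfo (the exact in-tree form may differ):

  // An implicit use of EXEC by a VALU instruction is not a real
  // register read, so loop-invariance analysis may disregard it.
  bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const {
    return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&
           isVALU(*MO.getParent());
  }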

That unfortunately results in higher register consumption, since
MachineLICM does not adequately estimate pressure. Therefore I
think it shall only be enabled after D107677, even though it does
not depend on it.

Differential Revision: https://reviews.llvm.org/D107859
parent 61858356
+6 −2
@@ -18,6 +18,7 @@
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/InitializePasses.h"
@@ -154,7 +155,9 @@ MachineLoopInfo::findLoopPreheader(MachineLoop *L, bool SpeculativePreheader,
bool MachineLoop::isLoopInvariant(MachineInstr &I) const {
  MachineFunction *MF = I.getParent()->getParent();
  MachineRegisterInfo *MRI = &MF->getRegInfo();
-  const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
+  const TargetSubtargetInfo &ST = MF->getSubtarget();
+  const TargetRegisterInfo *TRI = ST.getRegisterInfo();
+  const TargetInstrInfo *TII = ST.getInstrInfo();

  // The instruction is loop invariant if all of its operands are.
  for (const MachineOperand &MO : I.operands()) {
@@ -174,7 +177,8 @@ bool MachineLoop::isLoopInvariant(MachineInstr &I) const {
        // However, if the physreg is known to always be caller saved/restored
        // then this use is safe to hoist.
        if (!MRI->isConstantPhysReg(Reg) &&
-            !(TRI->isCallerPreservedPhysReg(Reg.asMCReg(), *I.getMF())))
+            !(TRI->isCallerPreservedPhysReg(Reg.asMCReg(), *I.getMF())) &&
+            !TII->isIgnorableUse(MO))
          return false;
        // Otherwise it's safe to move.
        continue;
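
The hook consulted above is a new TargetInstrInfo virtual; its
default treats no use as ignorable, so targets other than AMDGPU
keep the old behavior. A sketch of the declaration as added to
TargetInstrInfo.h (comment wording approximate):

  /// Given \p MO is a PhysReg use return if it can be ignored for
  /// the purpose of instruction rescheduling or register liveness.
  virtual bool isIgnorableUse(const MachineOperand &MO) const {
    return false;
  }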
+22 −22
@@ -120,25 +120,25 @@ define protected amdgpu_kernel void @nand(i32 addrspace(1)* %p, %S addrspace(1)*
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; CHECK-NEXT:    s_mov_b64 s[4:5], 0
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_load_dword s6, s[0:1], 0x0
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_mov_b32_e32 v0, s6
+; CHECK-NEXT:    v_mov_b32_e32 v1, s6
; CHECK-NEXT:  BB5_1: ; %atomicrmw.start
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    v_mov_b32_e32 v1, v0
-; CHECK-NEXT:    v_not_b32_e32 v0, v1
-; CHECK-NEXT:    v_mov_b32_e32 v2, 0
-; CHECK-NEXT:    v_or_b32_e32 v0, -2, v0
-; CHECK-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; CHECK-NEXT:    v_mov_b32_e32 v3, v1
+; CHECK-NEXT:    v_not_b32_e32 v1, v3
+; CHECK-NEXT:    v_or_b32_e32 v2, -2, v1
+; CHECK-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] glc
; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
; CHECK-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
; CHECK-NEXT:    s_andn2_b64 exec, exec, s[4:5]
; CHECK-NEXT:    s_cbranch_execnz BB5_1
; CHECK-NEXT:  ; %bb.2: ; %atomicrmw.end
; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
-; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[2:3]
+; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v1, 12, s[2:3]
; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
; CHECK-NEXT:    global_store_dword v[0:1], v2, off
; CHECK-NEXT:    s_endpgm
@@ -330,24 +330,24 @@ define protected amdgpu_kernel void @fadd(float addrspace(1)* %p, %S addrspace(1
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; CHECK-NEXT:    s_mov_b64 s[4:5], 0
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_load_dword s6, s[0:1], 0x0
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_mov_b32_e32 v0, s6
+; CHECK-NEXT:    v_mov_b32_e32 v1, s6
; CHECK-NEXT:  BB14_1: ; %atomicrmw.start
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    v_mov_b32_e32 v1, v0
-; CHECK-NEXT:    v_mov_b32_e32 v2, 0
-; CHECK-NEXT:    v_add_f32_e32 v0, 1.0, v1
-; CHECK-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; CHECK-NEXT:    v_mov_b32_e32 v3, v1
+; CHECK-NEXT:    v_add_f32_e32 v2, 1.0, v3
+; CHECK-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] glc
; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
; CHECK-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
; CHECK-NEXT:    s_andn2_b64 exec, exec, s[4:5]
; CHECK-NEXT:    s_cbranch_execnz BB14_1
; CHECK-NEXT:  ; %bb.2: ; %atomicrmw.end
; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
-; CHECK-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; CHECK-NEXT:    v_cvt_u32_f32_e32 v0, v1
; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[2:3]
; CHECK-NEXT:    global_store_dword v[0:1], v2, off
@@ -365,24 +365,24 @@ define protected amdgpu_kernel void @fsub(float addrspace(1)* %p, %S addrspace(1
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; CHECK-NEXT:    s_mov_b64 s[4:5], 0
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_load_dword s6, s[0:1], 0x0
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_mov_b32_e32 v0, s6
+; CHECK-NEXT:    v_mov_b32_e32 v1, s6
; CHECK-NEXT:  BB15_1: ; %atomicrmw.start
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    v_mov_b32_e32 v1, v0
-; CHECK-NEXT:    v_mov_b32_e32 v2, 0
-; CHECK-NEXT:    v_add_f32_e32 v0, -1.0, v1
-; CHECK-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; CHECK-NEXT:    v_mov_b32_e32 v3, v1
+; CHECK-NEXT:    v_add_f32_e32 v2, -1.0, v3
+; CHECK-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[0:1] glc
; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
; CHECK-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
; CHECK-NEXT:    s_andn2_b64 exec, exec, s[4:5]
; CHECK-NEXT:    s_cbranch_execnz BB15_1
; CHECK-NEXT:  ; %bb.2: ; %atomicrmw.end
; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
-; CHECK-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; CHECK-NEXT:    v_cvt_u32_f32_e32 v0, v1
; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[2:3]
; CHECK-NEXT:    global_store_dword v[0:1], v2, off
+3 −3
@@ -416,13 +416,13 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(double addrspace(1)*
; GFX90A:       ; %bb.0: ; %main_body
; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX90A-NEXT:    s_mov_b64 s[2:3], 0
+; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT:  BB24_1: ; %atomicrmw.start
; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_add_f64 v[0:1], v[2:3], 4.0
; GFX90A-NEXT:    buffer_wbl2
; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -464,13 +464,13 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(double addrsp
; GFX90A:       ; %bb.0: ; %main_body
; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX90A-NEXT:    s_mov_b64 s[2:3], 0
+; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT:  BB26_1: ; %atomicrmw.start
; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_add_f64 v[0:1], v[2:3], 4.0
; GFX90A-NEXT:    buffer_wbl2
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
@@ -626,13 +626,13 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(double ad
; GFX90A:       ; %bb.0: ; %main_body
; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX90A-NEXT:    s_mov_b64 s[2:3], 0
+; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT:  BB34_1: ; %atomicrmw.start
; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_add_f64 v[0:1], v[2:3], 4.0
; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+115 −115
File changed; preview size limit exceeded, changes collapsed.

+3 −1
@@ -318,12 +318,12 @@ define void @test_indirect_call_vgpr_ptr_arg(void(i32)* %fptr) {
; GCN-NEXT:    s_mov_b64 s[38:39], s[6:7]
; GCN-NEXT:    s_mov_b64 s[40:41], s[4:5]
; GCN-NEXT:    s_mov_b64 s[46:47], exec
+; GCN-NEXT:    v_mov_b32_e32 v2, 0x7b
; GCN-NEXT:  BB3_1: ; =>This Inner Loop Header: Depth=1
; GCN-NEXT:    v_readfirstlane_b32 s16, v0
; GCN-NEXT:    v_readfirstlane_b32 s17, v1
; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1]
; GCN-NEXT:    s_and_saveexec_b64 s[48:49], vcc
-; GCN-NEXT:    v_mov_b32_e32 v0, 0x7b
; GCN-NEXT:    s_mov_b64 s[4:5], s[40:41]
; GCN-NEXT:    s_mov_b64 s[6:7], s[38:39]
; GCN-NEXT:    s_mov_b64 s[8:9], s[36:37]
@@ -331,9 +331,11 @@ define void @test_indirect_call_vgpr_ptr_arg(void(i32)* %fptr) {
; GCN-NEXT:    s_mov_b32 s12, s44
; GCN-NEXT:    s_mov_b32 s13, s43
; GCN-NEXT:    s_mov_b32 s14, s42
+; GCN-NEXT:    v_mov_b32_e32 v0, v2
; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; GCN-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GCN-NEXT:    ; implicit-def: $vgpr31
+; GCN-NEXT:    ; implicit-def: $vgpr2
; GCN-NEXT:    s_xor_b64 exec, exec, s[48:49]
; GCN-NEXT:    s_cbranch_execnz BB3_1
; GCN-NEXT:  ; %bb.2: