Commit 67422612 authored by Carl Ritson's avatar Carl Ritson
Browse files

[AMDGPU] Apply pre-emit s_cbranch_vcc optimization to more patterns

Add handling of s_andn2 and mask of 0.
This eliminates redundant instructions from uniform control flow.

Reviewed By: rampitec

Differential Revision: https://reviews.llvm.org/D83641
parent 233af895
Loading
Loading
Loading
Loading
+61 −11
Original line number Diff line number Diff line
@@ -54,14 +54,14 @@ char &llvm::SIPreEmitPeepholeID = SIPreEmitPeephole::ID;

bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
  // Match:
  // sreg = -1
  // vcc = S_AND_B64 exec, sreg
  // sreg = -1 or 0
  // vcc = S_AND_B64 exec, sreg or S_ANDN2_B64 exec, sreg
  // S_CBRANCH_VCC[N]Z
  // =>
  // S_CBRANCH_EXEC[N]Z
  // We end up with this pattern sometimes after basic block placement.
  // It happens while combining a block which assigns -1 to a saved mask and
  // another block which consumes that saved mask and then a branch.
  // It happens while combining a block which assigns -1 or 0 to a saved mask
  // and another block which consumes that saved mask and then a branch.
  bool Changed = false;
  MachineBasicBlock &MBB = *MI.getParent();
  const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
@@ -69,6 +69,7 @@ bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
  const unsigned CondReg = TRI->getVCC();
  const unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  const unsigned And = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
  const unsigned AndN2 = IsWave32 ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64;

  MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(),
                                      E = MBB.rend();
@@ -80,7 +81,8 @@ bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
    if (A->modifiesRegister(ExecReg, TRI))
      return false;
    if (A->modifiesRegister(CondReg, TRI)) {
      if (!A->definesRegister(CondReg, TRI) || A->getOpcode() != And)
      if (!A->definesRegister(CondReg, TRI) ||
          (A->getOpcode() != And && A->getOpcode() != AndN2))
        return false;
      break;
    }
@@ -97,9 +99,10 @@ bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
  }
  if (Op1.getReg() != ExecReg)
    return Changed;
  if (Op2.isImm() && Op2.getImm() != -1)
  if (Op2.isImm() && !(Op2.getImm() == -1 || Op2.getImm() == 0))
    return Changed;

  int64_t MaskValue = 0;
  Register SReg;
  if (Op2.isReg()) {
    SReg = Op2.getReg();
@@ -113,28 +116,75 @@ bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
      ReadsSreg |= M->readsRegister(SReg, TRI);
    }
    if (M == E || !M->isMoveImmediate() || !M->getOperand(1).isImm() ||
        M->getOperand(1).getImm() != -1)
        (M->getOperand(1).getImm() != -1 && M->getOperand(1).getImm() != 0))
      return Changed;
    // First if sreg is only used in and instruction fold the immediate
    // into that and.
    MaskValue = M->getOperand(1).getImm();
    // First if sreg is only used in the AND instruction fold the immediate
    // into the AND.
    if (!ReadsSreg && Op2.isKill()) {
      A->getOperand(2).ChangeToImmediate(-1);
      A->getOperand(2).ChangeToImmediate(MaskValue);
      M->eraseFromParent();
    }
  } else if (Op2.isImm()) {
    MaskValue = Op2.getImm();
  } else {
    llvm_unreachable("Op2 must be register or immediate");
  }

  // Invert mask for s_andn2
  assert(MaskValue == 0 || MaskValue == -1);
  if (A->getOpcode() == AndN2)
    MaskValue = ~MaskValue;

  if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC) &&
      MI.killsRegister(CondReg, TRI))
    A->eraseFromParent();

  bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ;
  if (SReg == ExecReg) {
    // EXEC is updated directly
    if (IsVCCZ) {
      MI.eraseFromParent();
      return true;
    }
    MI.setDesc(TII->get(AMDGPU::S_BRANCH));
  } else if (IsVCCZ && MaskValue == 0) {
    // Will always branch
    // Remove all successors shadowed by new unconditional branch
    MachineBasicBlock *Parent = MI.getParent();
    SmallVector<MachineInstr *, 4> ToRemove;
    bool Found = false;
    for (MachineInstr &Term : Parent->terminators()) {
      if (Found) {
        if (Term.isBranch())
          ToRemove.push_back(&Term);
      } else {
        Found = Term.isIdenticalTo(MI);
      }
    }
    assert(Found && "conditional branch is not terminator");
    for (auto BranchMI : ToRemove) {
      MachineOperand &Dst = BranchMI->getOperand(0);
      assert(Dst.isMBB() && "destination is not basic block");
      Parent->removeSuccessor(Dst.getMBB());
      BranchMI->eraseFromParent();
    }

    if (MachineBasicBlock *Succ = Parent->getFallThrough()) {
      Parent->removeSuccessor(Succ);
    }

    // Rewrite to unconditional branch
    MI.setDesc(TII->get(AMDGPU::S_BRANCH));
  } else if (!IsVCCZ && MaskValue == 0) {
    // Will never branch
    MachineOperand &Dst = MI.getOperand(0);
    assert(Dst.isMBB() && "destination is not basic block");
    MI.getParent()->removeSuccessor(Dst.getMBB());
    MI.eraseFromParent();
    return true;
  } else if (MaskValue == -1) {
    // Depends only on EXEC
    MI.setDesc(
        TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ : AMDGPU::S_CBRANCH_EXECNZ));
  }
+1 −4
Original line number Diff line number Diff line
@@ -482,13 +482,10 @@ ret:
; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, [[LONG_BR_DEST0:BB[0-9]+_[0-9]+]]-(
; GCN-NEXT: s_addc_u32
; GCN-NEXT: s_setpc_b64

; GCN-NEXT: [[LONG_BR_0]]:
; GCN: s_setpc_b64

; GCN: [[LONG_BR_DEST0]]
; GCN: [[LONG_BR_DEST0]]:

; GCN: s_cbranch_vccnz
; GCN-DAG: v_cmp_lt_i32
; GCN-DAG: v_cmp_ge_i32

+1 −1
Original line number Diff line number Diff line
@@ -524,7 +524,7 @@ define amdgpu_kernel void @insertelement_v16f32_or_index(<16 x float> addrspace(

; GCN: {{^; %bb.[0-9]}}:
; GCN: s_mov_b64 exec,
; GCN: s_cbranch_vccnz [[BB2]]
; GCN: s_cbranch_execnz [[BB2]]

define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) #0 {
bb:
+1 −1
Original line number Diff line number Diff line
@@ -159,7 +159,7 @@ define amdgpu_kernel void @infinite_loop_nest_ret(i32 addrspace(1)* %out) {
; SI-NEXT:    ; in Loop: Header=BB3_2 Depth=1
; SI-NEXT:    s_or_b64 exec, exec, s[2:3]
; SI-NEXT:    s_and_b64 vcc, exec, 0
; SI-NEXT:    s_cbranch_vccz BB3_2
; SI-NEXT:    s_branch BB3_2
; SI-NEXT:  BB3_5: ; %UnifiedReturnBlock
; SI-NEXT:    s_endpgm
; IR-LABEL: @infinite_loop_nest_ret(
+77 −0
Original line number Diff line number Diff line
@@ -338,3 +338,80 @@ body: |
    S_CBRANCH_VCCZ %bb.1, implicit killed $vcc
    S_ENDPGM 0
...
---
# GCN-LABEL: name: andn2_execz_mov_vccz
# GCN-NOT: S_MOV_
# GCN-NOT: S_ANDN2_
# GCN: S_CBRANCH_EXECZ %bb.1, implicit $exec
name:            andn2_execz_mov_vccz
body:             |
  bb.0:
    S_NOP 0

  bb.1:
    S_NOP 0

  bb.2:
    $sgpr0_sgpr1 = S_MOV_B64 0
    $vcc = S_ANDN2_B64 $exec, killed $sgpr0_sgpr1, implicit-def dead $scc
    S_CBRANCH_VCCZ %bb.1, implicit killed $vcc
    S_ENDPGM 0
...
---
# GCN-LABEL: name: andn2_branch_mov_vccz
# GCN-NOT: S_MOV_
# GCN-NOT: S_ANDN2_
# GCN: S_BRANCH %bb.1
name:            andn2_branch_mov_vccz
body:             |
  bb.0:
    S_NOP 0

  bb.1:
    S_NOP 0

  bb.2:
    $sgpr0_sgpr1 = S_MOV_B64 -1
    $vcc = S_ANDN2_B64 $exec, killed $sgpr0_sgpr1, implicit-def dead $scc
    S_CBRANCH_VCCZ %bb.1, implicit killed $vcc
    S_ENDPGM 0
...
---
# GCN-LABEL: name: andn2_execnz_mov_vccnz
# GCN-NOT: S_MOV_
# GCN-NOT: S_ANDN2_
# GCN: S_CBRANCH_EXECNZ %bb.1, implicit $exec
name:            andn2_execnz_mov_vccnz
body:             |
  bb.0:
    S_NOP 0

  bb.1:
    S_NOP 0

  bb.2:
    $sgpr0_sgpr1 = S_MOV_B64 0
    $vcc = S_ANDN2_B64 $exec, killed $sgpr0_sgpr1, implicit-def dead $scc
    S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
    S_ENDPGM 0
...
---
# GCN-LABEL: name: andn2_no_branch_mov_vccnz
# GCN-NOT: S_MOV_
# GCN-NOT: S_ANDN2_
# GCN-NOT: S_CBRANCH
# GCN-NOT: S_BRANCH
name:            andn2_no_branch_mov_vccnz
body:             |
  bb.0:
    S_NOP 0

  bb.1:
    S_NOP 0

  bb.2:
    $sgpr0_sgpr1 = S_MOV_B64 -1
    $vcc = S_ANDN2_B64 $exec, killed $sgpr0_sgpr1, implicit-def dead $scc
    S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
    S_ENDPGM 0
...
Loading