Commit 4a331bea authored by Jay Foad
Browse files

[AMDGPU] Fix vccz after v_readlane/v_readfirstlane to vcc_lo/hi

Summary:
Up to gfx9, writes to vcc_lo and vcc_hi by instructions like
v_readlane and v_readfirstlane do not update vccz to reflect the new
value of vcc. Fix it by reusing part of the existing vccz bug handling
code, which inserts an "s_mov_b64 vcc, vcc" instruction to restore vccz
just before an instruction that needs the correct value.

Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D69661
parent 00efeae3
Loading
Loading
Loading
Loading
+5 −0
Original line number Diff line number Diff line
@@ -587,6 +587,11 @@ public:
    return getGeneration() <= SEA_ISLANDS;
  }

  /// \returns true if a write to only one half of VCC (VCC_LO or VCC_HI)
  /// also updates the VCCZ flag. This is the case when the subtarget
  /// generation is GFX10 or above; below that, VCCZ is only refreshed by a
  /// write to the full VCC register.
  bool partialVCCWritesUpdateVCCZ() const {
    const bool IsPreGFX10 = getGeneration() < GFX10;
    return !IsPreGFX10;
  }

  /// A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR
  /// was written by a VALU instruction.
  bool hasSMRDReadVALUDefHazard() const {
+35 −7
Original line number Diff line number Diff line
@@ -1383,6 +1383,10 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
    ScoreBrackets.dump();
  });

  // Assume VCCZ is correct at basic block boundaries, unless and until we need
  // to handle cases where that is not true.
  bool VCCZCorrect = true;

  // Walk over the instructions.
  MachineInstr *OldWaitcntInstr = nullptr;

@@ -1402,13 +1406,26 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
      continue;
    }

    bool VCCZBugWorkAround = false;
    // We might need to restore vccz to its correct value for either of two
    // different reasons; see ST->hasReadVCCZBug() and
    // ST->partialVCCWritesUpdateVCCZ().
    bool RestoreVCCZ = false;
    if (readsVCCZ(Inst)) {
      if (!VCCZCorrect)
        RestoreVCCZ = true;
      else if (ST->hasReadVCCZBug()) {
        // There is a hardware bug on CI/SI where an SMRD instruction may
        // corrupt the vccz bit, so when we detect that an instruction may read
        // from a corrupt vccz bit, we need to:
        // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD
        //    operations to complete.
        // 2. Restore the correct value of vccz by writing the current value
        //    of vcc back to vcc.
        if (ScoreBrackets.getScoreLB(LGKM_CNT) <
            ScoreBrackets.getScoreUB(LGKM_CNT) &&
            ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
        if (ST->hasReadVCCZBug())
          VCCZBugWorkAround = true;
          RestoreVCCZ = true;
        }
      }
    }

@@ -1419,6 +1436,16 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
      }
    }

    if (!ST->partialVCCWritesUpdateVCCZ()) {
      // Up to gfx9, writes to vcc_lo and vcc_hi don't update vccz.
      // Writes to vcc will fix it.
      if (Inst.definesRegister(AMDGPU::VCC_LO) ||
          Inst.definesRegister(AMDGPU::VCC_HI))
        VCCZCorrect = false;
      else if (Inst.definesRegister(AMDGPU::VCC))
        VCCZCorrect = true;
    }

    // Generate an s_waitcnt instruction to be placed before
    // cur_Inst, if needed.
    Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr);
@@ -1444,7 +1471,7 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,

    // TODO: Remove this work-around after fixing the scheduler and enable the
    // assert above.
    if (VCCZBugWorkAround) {
    if (RestoreVCCZ) {
      // Restore the vccz bit.  Any time a value is written to vcc, the vccz
      // bit is updated, so we can restore the bit by reading the value of
      // vcc and then writing it back to the register.
@@ -1452,6 +1479,7 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
              TII->get(ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
              TRI->getVCC())
          .addReg(TRI->getVCC());
      VCCZCorrect = true;
      Modified = true;
    }

+78 −0
Original line number Diff line number Diff line
@@ -85,3 +85,81 @@ body: |
    S_ENDPGM 0

...
---
# Test that after reloading vcc spilled to a vgpr, we insert any necessary
# instructions to fix vccz.

# CHECK-LABEL: name: reload_vcc_from_vgpr
# CHECK: $vcc_lo = V_READLANE_B32_vi $vgpr0, 8, implicit-def $vcc
# CHECK: $vcc_hi = V_READLANE_B32_vi $vgpr0, 9
# SI:    $vcc = S_MOV_B64 $vcc
# GFX9:  $vcc = S_MOV_B64 $vcc
# CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc

name: reload_vcc_from_vgpr
body: |
  bb.0:
    $vcc_lo = V_READLANE_B32_vi $vgpr0, 8, implicit-def $vcc
    $vcc_hi = V_READLANE_B32_vi $vgpr0, 9
    S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
  bb.1:

...
---
# Test that after reloading vcc spilled to memory, we insert any necessary
# instructions to fix vccz.

# CHECK-LABEL: name: reload_vcc_from_mem
# CHECK: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, 0, 0, 0, implicit $exec
# CHECK: $vcc_lo = V_READFIRSTLANE_B32 killed $vgpr0, implicit $exec, implicit-def $vcc
# CHECK: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, 0, 0, 0, implicit $exec
# CHECK: $vcc_hi = V_READFIRSTLANE_B32 killed $vgpr0, implicit $exec, implicit-def $vcc
# SI:    $vcc = S_MOV_B64 $vcc
# GFX9:  $vcc = S_MOV_B64 $vcc
# CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc

name: reload_vcc_from_mem
body: |
  bb.0:
    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, 0, 0, 0, implicit $exec
    $vcc_lo = V_READFIRSTLANE_B32 killed $vgpr0, implicit $exec, implicit-def $vcc
    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, 0, 0, 0, implicit $exec
    $vcc_hi = V_READFIRSTLANE_B32 killed $vgpr0, implicit $exec, implicit-def $vcc
    S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
  bb.1:

...
---
# Test that after inline asm that defines vcc_lo, we insert any necessary
# instructions to fix vccz.

# CHECK-LABEL: name: inlineasm_def_vcc_lo
# CHECK: INLINEASM &"; def vcc_lo", 1, 10, implicit-def $vcc_lo
# SI:    $vcc = S_MOV_B64 $vcc
# GFX9:  $vcc = S_MOV_B64 $vcc
# CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc

name: inlineasm_def_vcc_lo
body: |
  bb.0:
    INLINEASM &"; def vcc_lo", 1, 10, implicit-def $vcc_lo
    S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
  bb.1:

...
---
# Test that after inline asm that defines vcc, no unnecessary instructions are
# inserted to fix vccz.

# CHECK-LABEL: name: inlineasm_def_vcc
# CHECK: INLINEASM &"; def vcc", 1, 10, implicit-def $vcc
# CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc

name: inlineasm_def_vcc
body: |
  bb.0:
    INLINEASM &"; def vcc", 1, 10, implicit-def $vcc
    S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
  bb.1:

...