Unverified Commit 81d618b6 authored by xiaohuguo2023's avatar xiaohuguo2023 Committed by GitHub
Browse files

[AMDGPU][SIInsertWaitcnts] Fix iota_range assertion when OtherMarks is empty...

[AMDGPU][SIInsertWaitcnts] Fix iota_range assertion when OtherMarks is empty in mergeAsyncMarks() (#193499)

WaitcntBrackets::mergeAsyncMarks() asserts when merging async wait-count
state at a CFG join point where one predecessor has pending async memory
operations and the other does not.

  Problem:
  - The existing early-exit only handles the both-empty case
  - When OtherMarks is empty but AsyncMarks is not, MergeCount = min(0, N) = 0
  - seq_inclusive<unsigned>(1, 0) fires: "Assertion Begin <= End failed"

  Changes:
  - Add early return when MergeCount == 0 (OtherMarks is empty)
  - When the other predecessor contributed no async marks, our marks are
    unchanged and no stricter waits are needed
  - Add regression lit test: asyncmark-merge-empty-other.mir
parent 4f0b96f6
Loading
Loading
Loading
Loading
+6 −1
Original line number Diff line number Diff line
@@ -3025,7 +3025,7 @@ bool WaitcntBrackets::mergeAsyncMarks(ArrayRef<MergeInfo> MergeInfos,
  bool StrictDom = false;

  LLVM_DEBUG(dbgs() << "Merging async marks ...");
  // Early exit: both empty
  // Early exit: nothing to merge when both sides are empty.
  if (AsyncMarks.empty() && OtherMarks.empty()) {
    LLVM_DEBUG(dbgs() << " nothing to merge\n");
    return false;
@@ -3067,6 +3067,11 @@ bool WaitcntBrackets::mergeAsyncMarks(ArrayRef<MergeInfo> MergeInfos,
  unsigned OtherSize = OtherMarks.size();
  unsigned OurSize = AsyncMarks.size();
  unsigned MergeCount = std::min(OtherSize, OurSize);
  // OtherMarks is empty -> OtherSize == 0 -> MergeCount == 0.
  // Our existing marks are the conservative result; return early to avoid
  // passing MergeCount == 0 to seq_inclusive which asserts Begin <= End.
  if (MergeCount == 0)
    return StrictDom;
  for (auto Idx : seq_inclusive<unsigned>(1, MergeCount)) {
    for (auto T : inst_counter_types(Context->MaxCounter)) {
      StrictDom |= mergeScore(MergeInfos[T], AsyncMarks[OurSize - Idx][T],
+139 −0
Original line number Diff line number Diff line
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -run-pass=si-insert-waitcnts -o - %s | FileCheck %s

# Regression test for mergeAsyncMarks() asserting when OtherMarks is empty.
#
# At a CFG join point where one predecessor has an ASYNCMARK (non-empty
# AsyncMarks) and the other does not (empty OtherMarks), MergeCount becomes
# min(0, N) = 0. Before the fix, seq_inclusive<unsigned>(1, 0) asserted
# Begin <= End. After the fix the function returns early when either side
# is empty.
#
# GLOBAL_LOAD_ASYNC_TO_LDS_B32 is a GFX1250 async LDS DMA instruction tracked
# via ASYNC_CNT.  isAsync() returns true for it, so the score is recorded into
# AsyncScore before ASYNCMARK pushes it onto AsyncMarks.
#
# The join block contains WAIT_ASYNCMARK 0 to consume the pending mark.
# Before the fix, mergeAsyncMarks() asserted before reaching the wait.
# After the fix the pass completes without asserting.
#
# Two patterns are tested:
#   asyncmark_in_then - ASYNCMARK in the then-successor, else-successor is sync
#   asyncmark_in_else - ASYNCMARK in the else-successor, then-successor is sync

---
# Pattern 1: ASYNCMARK in then-successor, else-successor is sync.
name:            asyncmark_in_then
tracksRegLiveness: true
machineFunctionInfo:
  occupancy:       8
body:             |
  ; CHECK-LABEL: name: asyncmark_in_then
  ; CHECK: bb.0:
  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
  ; CHECK-NEXT:   liveins: $sgpr0, $sgpr1, $vgpr0_vgpr1, $vgpr2
  ; CHECK-NEXT: {{  $}}
  ; CHECK-NEXT:   S_WAIT_LOADCNT_DSCNT 0
  ; CHECK-NEXT:   S_WAIT_KMCNT 0
  ; CHECK-NEXT:   S_CMP_LG_U32 $sgpr0, $sgpr1, implicit-def $scc
  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
  ; CHECK-NEXT: {{  $}}
  ; CHECK-NEXT: bb.1:
  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
  ; CHECK-NEXT:   liveins: $vgpr0_vgpr1, $vgpr2
  ; CHECK-NEXT: {{  $}}
  ; CHECK-NEXT:   GLOBAL_LOAD_ASYNC_TO_LDS_B32 $vgpr2, $vgpr0_vgpr1, 0, 0, implicit-def $asynccnt, implicit $exec, implicit $asynccnt :: (load (s32), addrspace 1), (store (s32), addrspace 3)
  ; CHECK-NEXT:   ASYNCMARK
  ; CHECK-NEXT:   S_BRANCH %bb.3
  ; CHECK-NEXT: {{  $}}
  ; CHECK-NEXT: bb.2:
  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
  ; CHECK-NEXT: {{  $}}
  ; CHECK-NEXT:   S_BRANCH %bb.3
  ; CHECK-NEXT: {{  $}}
  ; CHECK-NEXT: bb.3:
  ; CHECK-NEXT:   WAIT_ASYNCMARK 0
  ; CHECK-NEXT:   S_WAIT_ASYNCCNT 0, implicit-def $asynccnt, implicit $asynccnt
  ; CHECK-NEXT:   S_ENDPGM 0
  bb.0:
    liveins: $sgpr0, $sgpr1, $vgpr0_vgpr1, $vgpr2

    S_CMP_LG_U32 $sgpr0, $sgpr1, implicit-def $scc
    S_CBRANCH_SCC1 %bb.2, implicit killed $scc

  ; then branch — issues async LDS DMA + ASYNCMARK
  bb.1:
    liveins: $vgpr0_vgpr1, $vgpr2

    GLOBAL_LOAD_ASYNC_TO_LDS_B32 $vgpr2, $vgpr0_vgpr1, 0, 0, implicit-def $asynccnt, implicit $exec, implicit $asynccnt :: (load (s32), addrspace 1), (store (s32), addrspace 3)
    ASYNCMARK
    S_BRANCH %bb.3

  ; else branch — sync path, no async operations; OtherMarks is empty at join
  bb.2:
    S_BRANCH %bb.3

  ; join — mergeAsyncMarks sees non-empty AsyncMarks (then) and empty OtherMarks (else).
  ; Before fix: assertion. After fix: returns early and the pass completes,
  ; emitting S_WAIT_ASYNCCNT 0 after WAIT_ASYNCMARK 0 (see the CHECK lines above).
  bb.3:
    WAIT_ASYNCMARK 0
    S_ENDPGM 0
...
---
# Pattern 2: ASYNCMARK in else-successor, then-successor is sync.
# Mirror of asyncmark_in_then — exercises the opposite predecessor ordering.
name:            asyncmark_in_else
tracksRegLiveness: true
machineFunctionInfo:
  occupancy:       8
body:             |
  ; CHECK-LABEL: name: asyncmark_in_else
  ; CHECK: bb.0:
  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
  ; CHECK-NEXT:   liveins: $sgpr0, $sgpr1, $vgpr0_vgpr1, $vgpr2
  ; CHECK-NEXT: {{  $}}
  ; CHECK-NEXT:   S_WAIT_LOADCNT_DSCNT 0
  ; CHECK-NEXT:   S_WAIT_KMCNT 0
  ; CHECK-NEXT:   S_CMP_LG_U32 $sgpr0, $sgpr1, implicit-def $scc
  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
  ; CHECK-NEXT: {{  $}}
  ; CHECK-NEXT: bb.1:
  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
  ; CHECK-NEXT: {{  $}}
  ; CHECK-NEXT:   S_BRANCH %bb.3
  ; CHECK-NEXT: {{  $}}
  ; CHECK-NEXT: bb.2:
  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
  ; CHECK-NEXT:   liveins: $vgpr0_vgpr1, $vgpr2
  ; CHECK-NEXT: {{  $}}
  ; CHECK-NEXT:   GLOBAL_LOAD_ASYNC_TO_LDS_B32 $vgpr2, $vgpr0_vgpr1, 0, 0, implicit-def $asynccnt, implicit $exec, implicit $asynccnt :: (load (s32), addrspace 1), (store (s32), addrspace 3)
  ; CHECK-NEXT:   ASYNCMARK
  ; CHECK-NEXT:   S_BRANCH %bb.3
  ; CHECK-NEXT: {{  $}}
  ; CHECK-NEXT: bb.3:
  ; CHECK-NEXT:   WAIT_ASYNCMARK 0
  ; CHECK-NEXT:   S_WAIT_ASYNCCNT 0, implicit-def $asynccnt, implicit $asynccnt
  ; CHECK-NEXT:   S_ENDPGM 0
  bb.0:
    liveins: $sgpr0, $sgpr1, $vgpr0_vgpr1, $vgpr2

    S_CMP_LG_U32 $sgpr0, $sgpr1, implicit-def $scc
    S_CBRANCH_SCC1 %bb.2, implicit killed $scc

  ; then branch — sync path, no async operations
  bb.1:
    S_BRANCH %bb.3

  ; else branch — issues async LDS DMA + ASYNCMARK
  bb.2:
    liveins: $vgpr0_vgpr1, $vgpr2

    GLOBAL_LOAD_ASYNC_TO_LDS_B32 $vgpr2, $vgpr0_vgpr1, 0, 0, implicit-def $asynccnt, implicit $exec, implicit $asynccnt :: (load (s32), addrspace 1), (store (s32), addrspace 3)
    ASYNCMARK
    S_BRANCH %bb.3

  ; join block
  bb.3:
    WAIT_ASYNCMARK 0
    S_ENDPGM 0
...