Commit 096cd991 authored by Matt Arsenault's avatar Matt Arsenault
Browse files

AMDGPU: Fix divergence analysis of control flow intrinsics

The mask results of these should be uniform. The trickier part is the
dummy booleans used as IR glue need to be treated as divergent. This
should make the divergence analysis results correct for the IR the DAG
is constructed from.

This should allow us to eliminate requiresUniformRegister, which has
an expensive, recursive scan over all users looking for control flow
intrinsics. This should avoid recent compile time regressions.
parent ca1fd460
Loading
Loading
Loading
Loading
+8 −0
Original line number Diff line number Diff line
@@ -270,5 +270,13 @@ def : SourceOfDivergence<int_amdgcn_mfma_i32_32x32x8i8>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x2bf16>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x4bf16>;

// The dummy boolean output is divergent from the IR's perspective,
// but the mask results are uniform. These produce a divergent and
// uniform result, so the returned struct is collectively divergent.
// isAlwaysUniform can override the extract of the uniform component.
def : SourceOfDivergence<int_amdgcn_if>;
def : SourceOfDivergence<int_amdgcn_else>;
def : SourceOfDivergence<int_amdgcn_loop>;

foreach intr = AMDGPUImageDimAtomicIntrinsics in
def : SourceOfDivergence<intr>;
+21 −6
Original line number Diff line number Diff line
@@ -706,6 +706,7 @@ bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
    case Intrinsic::amdgcn_readlane:
    case Intrinsic::amdgcn_icmp:
    case Intrinsic::amdgcn_fcmp:
    case Intrinsic::amdgcn_if_break:
      return true;
    }
  }
@@ -720,13 +721,27 @@ bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
  if (!ExtValue)
    return false;

  if (const CallInst *CI = dyn_cast<CallInst>(ExtValue->getOperand(0))) {
  const CallInst *CI = dyn_cast<CallInst>(ExtValue->getOperand(0));
  if (!CI)
    return false;

  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(CI)) {
    switch (Intrinsic->getIntrinsicID()) {
    default:
      return false;
    case Intrinsic::amdgcn_if:
    case Intrinsic::amdgcn_else: {
      ArrayRef<unsigned> Indices = ExtValue->getIndices();
      return Indices.size() == 1 && Indices[0] == 1;
    }
    }
  }

  // If we have inline asm returning mixed SGPR and VGPR results, we inferred
  // divergent for the overall struct return. We need to override it in the
  // case we're extracting an SGPR component here.
  if (isa<InlineAsm>(CI->getCalledValue()))
    return !isInlineAsmSourceOfDivergence(CI, ExtValue->getIndices());
  }

  return false;
}
+102 −0
Original line number Diff line number Diff line
; RUN: opt -mtriple=amdgcn-mesa-mesa3d -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s

; Tests divergence analysis of control flow intrinsics: the i64 mask results
; should be treated as uniform, while the dummy i1 "glue" outputs (and the
; overall returned structs) should be treated as divergent.

; CHECK: Printing analysis 'Legacy Divergence Analysis' for function 'test_if_break':
; CHECK: DIVERGENT: %cond = icmp eq i32 %arg0, 0
; CHECK-NOT: DIVERGENT
; CHECK: ret void
define amdgpu_ps void @test_if_break(i32 %arg0, i64 inreg %saved) {
entry:
  ; %arg0 is a non-inreg (VGPR) argument, so the compare is divergent.
  %cond = icmp eq i32 %arg0, 0
  ; The i64 mask produced by if.break must be uniform even though its i1
  ; condition operand is divergent (isAlwaysUniform overrides it).
  %break = call i64 @llvm.amdgcn.if.break.i64.i64(i1 %cond, i64 %saved)
  ; Volatile store keeps %break live so the analysis has to classify it.
  store volatile i64 %break, i64 addrspace(1)* undef
  ret void
}

; CHECK: Printing analysis 'Legacy Divergence Analysis' for function 'test_if':
; CHECK: DIVERGENT: %cond = icmp eq i32 %arg0, 0
; CHECK-NEXT: DIVERGENT: %if = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %cond)
; CHECK-NEXT: DIVERGENT: %if.bool = extractvalue { i1, i64 } %if, 0
; CHECK-NOT: DIVERGENT
; CHECK: DIVERGENT: %if.bool.ext = zext i1 %if.bool to i32
define void @test_if(i32 %arg0) {
entry:
  ; Divergent condition: %arg0 is an ordinary (VGPR) argument.
  %cond = icmp eq i32 %arg0, 0
  ; The struct return is collectively divergent (SourceOfDivergence).
  %if = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %cond)
  ; Element 0 is the dummy boolean glue: stays divergent.
  %if.bool = extractvalue { i1, i64 } %if, 0
  ; Element 1 is the exec mask: extract is overridden to uniform.
  %if.mask = extractvalue { i1, i64 } %if, 1
  %if.bool.ext = zext i1 %if.bool to i32
  ; Volatile stores keep both extracted values live.
  store volatile i32 %if.bool.ext, i32 addrspace(1)* undef
  store volatile i64 %if.mask, i64 addrspace(1)* undef
  ret void
}

; The result should still be treated as divergent, even with a uniform source.
; CHECK: Printing analysis 'Legacy Divergence Analysis' for function 'test_if_uniform':
; CHECK-NOT: DIVERGENT
; CHECK: DIVERGENT: %if = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %cond)
; CHECK-NEXT: DIVERGENT: %if.bool = extractvalue { i1, i64 } %if, 0
; CHECK-NOT: DIVERGENT
; CHECK: DIVERGENT: %if.bool.ext = zext i1 %if.bool to i32
define amdgpu_ps void @test_if_uniform(i32 inreg %arg0) {
entry:
  ; inreg argument => SGPR, so the compare itself is uniform.
  %cond = icmp eq i32 %arg0, 0
  ; Even with a uniform condition the intrinsic's struct result is still
  ; divergent, because the dummy boolean component must be divergent.
  %if = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %cond)
  ; Dummy boolean glue: divergent.
  %if.bool = extractvalue { i1, i64 } %if, 0
  ; Exec mask component: uniform via the isAlwaysUniform override.
  %if.mask = extractvalue { i1, i64 } %if, 1
  %if.bool.ext = zext i1 %if.bool to i32
  ; Volatile stores keep both extracted values live.
  store volatile i32 %if.bool.ext, i32 addrspace(1)* undef
  store volatile i64 %if.mask, i64 addrspace(1)* undef
  ret void
}

; CHECK: Printing analysis 'Legacy Divergence Analysis' for function 'test_loop_uniform':
; CHECK: DIVERGENT: %loop = call i1 @llvm.amdgcn.loop.i64(i64 %mask)
define amdgpu_ps void @test_loop_uniform(i64 inreg %mask) {
entry:
  ; amdgcn.loop returns only the dummy boolean, so even with a uniform
  ; (inreg/SGPR) mask operand the result is divergent.
  %loop = call i1 @llvm.amdgcn.loop.i64(i64 %mask)
  %loop.ext = zext i1 %loop to i32
  ; Volatile store keeps the result live for the analysis printout.
  store volatile i32 %loop.ext, i32 addrspace(1)* undef
  ret void
}

; CHECK: Printing analysis 'Legacy Divergence Analysis' for function 'test_else':
; CHECK: DIVERGENT: %else = call { i1, i64 } @llvm.amdgcn.else.i64.i64(i64 %mask)
; CHECK: DIVERGENT:       %else.bool = extractvalue { i1, i64 } %else, 0
; CHECK: {{^[ \t]+}}%else.mask = extractvalue { i1, i64 } %else, 1
define amdgpu_ps void @test_else(i64 inreg %mask) {
entry:
  ; Uniform (inreg) incoming mask; the struct result is still divergent
  ; because of the dummy boolean component.
  %else = call { i1, i64 } @llvm.amdgcn.else.i64.i64(i64 %mask)
  ; Dummy boolean glue: divergent.
  %else.bool = extractvalue { i1, i64 } %else, 0
  ; Mask component: uniform (the CHECK above requires no DIVERGENT prefix).
  %else.mask = extractvalue { i1, i64 } %else, 1
  %else.bool.ext = zext i1 %else.bool to i32
  ; Volatile stores keep both extracted values live.
  store volatile i32 %else.bool.ext, i32 addrspace(1)* undef
  store volatile i64 %else.mask, i64 addrspace(1)* undef
  ret void
}

; Passing a divergent mask to amdgcn.else violates the intrinsic's implicit
; contract (the mask should be a uniform exec value), so codegen for this
; case is probably always broken; the analysis results here just document
; the current behavior.
; CHECK: Printing analysis 'Legacy Divergence Analysis' for function 'test_else_divergent_mask':
; CHECK: DIVERGENT: %if = call { i1, i64 } @llvm.amdgcn.else.i64.i64(i64 %mask)
; CHECK-NEXT: DIVERGENT: %if.bool = extractvalue { i1, i64 } %if, 0
; CHECK-NOT: DIVERGENT
; CHECK: DIVERGENT: %if.bool.ext = zext i1 %if.bool to i32
define void @test_else_divergent_mask(i64 %mask) {
entry:
  ; %mask is a plain (VGPR) argument, i.e. a divergent mask input.
  ; The extract of element 1 is still reported uniform by the override,
  ; even though the incoming mask is divergent.
  %if = call { i1, i64 } @llvm.amdgcn.else.i64.i64(i64 %mask)
  ; Dummy boolean glue: divergent.
  %if.bool = extractvalue { i1, i64 } %if, 0
  ; Mask component: treated as uniform regardless of the divergent operand.
  %if.mask = extractvalue { i1, i64 } %if, 1
  %if.bool.ext = zext i1 %if.bool to i32
  ; Volatile stores keep both extracted values live.
  store volatile i32 %if.bool.ext, i32 addrspace(1)* undef
  store volatile i64 %if.mask, i64 addrspace(1)* undef
  ret void
}

declare { i1, i64 } @llvm.amdgcn.if.i64(i1) #0
declare { i1, i64 } @llvm.amdgcn.else.i64.i64(i64) #0
declare i64 @llvm.amdgcn.if.break.i64.i64(i1, i64) #1
declare i1 @llvm.amdgcn.loop.i64(i64) #1

attributes #0 = { convergent nounwind }
attributes #1 = { convergent nounwind readnone }