Commit 88aced1e authored by Matt Arsenault's avatar Matt Arsenault Committed by Matt Arsenault
Browse files

AMDGPU: Fix computation for getOccupancyWithLocalMemSize

The computation here didn't really make sense to me, and reported
wildly different results depending on the flat work group size
attribute.

I think this should really report a range derived from the possible
work group size bounds, and only allow an occupancy that is a multiple
of the group size.
parent 27a3ecee
Loading
Loading
Loading
Loading
+32 −9
Original line number Diff line number Diff line
@@ -328,18 +328,41 @@ unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

// Computes an occupancy figure (a wave count clamped to getMaxWavesPerEU())
// for function F when it uses Bytes of local memory, using the upper bound of
// F's flat work group size range.
//
// Returns 0 when getMaxWorkGroupsPerCU() reports 0 for that group size, and 1
// when Bytes exceeds getLocalMemorySize().
//
// NOTE(review): this listing looks like a rendered diff whose +/- markers were
// lost, so removed and added lines appear interleaved. The notes below mark
// the lines that are presumably the removed (pre-change) ones, judging by the
// "+32 -9" hunk summary -- confirm against the actual commit.
// FIXME: Should return min,max range.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  // NOTE(review): presumably removed lines (old names); superseded by the
  // MaxWorkGroupSize / MaxWorkGroupsPerCu declarations that follow.
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
  // Use the upper bound (.second) of the function's flat work group size range.
  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
  // No work group of this size can be resident: report zero occupancy.
  if (!MaxWorkGroupsPerCu)
    return 0;
  // NOTE(review): presumably removed lines -- the old computation, which
  // divided an LDS-derived limit by the group count and clamped the result to
  // the range [1, MaxWaves].
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;

  const unsigned WaveSize = getWavefrontSize();

  // FIXME: Do we need to account for alignment requirement of LDS rounding the
  // size up?
  // Compute restriction based on LDS usage
  // Number of groups whose Bytes of local memory fit in the total local memory
  // size; a Bytes of 0 is treated as 1 to avoid dividing by zero.
  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

  // This can be queried with more LDS than is possible, so just assume the
  // worst.
  if (NumGroups == 0)
    return 1;

  // The group count cannot exceed the per-CU work group limit either.
  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

  // Round to the number of waves.
  // Ceiling division: waves needed to hold one maximum-sized work group.
  const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;

  // Clamp to the maximum possible number of waves.
  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());

  // FIXME: Needs to be a multiple of the group size?
  //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);

  // NumGroups >= 1 and MaxGroupNumWaves >= 1 are expected here, so the result
  // should be a non-zero wave count no larger than the clamp above.
  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
         "computed invalid occupancy");
  return MaxWaves;
}

unsigned
+89 −0
Original line number Diff line number Diff line
@@ -283,6 +283,95 @@ define amdgpu_kernel void @used_lds_13112() {
  ret void
}

; Kernel using the 8252-byte addrspace(3) array @lds8252, with the flat work
; group size bounded to 1..64 by attribute group #3.
; GCN-LABEL: {{^}}used_lds_8252_max_group_size_64:
; GFX9:       ; Occupancy: 7{{$}}
; GFX1010W64:    ; Occupancy: 7{{$}}
; GFX1010W32:    ; Occupancy: 14{{$}}
@lds8252 = internal addrspace(3) global [8252 x i8] undef, align 4
define amdgpu_kernel void @used_lds_8252_max_group_size_64() #3 {
  ; The volatile store keeps @lds8252 referenced by this kernel.
  %p = bitcast [8252 x i8] addrspace(3)* @lds8252 to i8 addrspace(3)*
  store volatile i8 1, i8 addrspace(3)* %p
  ret void
}

; Kernel using the 8252-byte addrspace(3) array @lds8252, with the flat work
; group size bounded to 1..96 by attribute group #4.
; GCN-LABEL: {{^}}used_lds_8252_max_group_size_96:
; GFX9:       ; Occupancy: 10{{$}}
; GFX1010W64:    ; Occupancy: 14{{$}}
; GFX1010W32:    ; Occupancy: 20{{$}}
define amdgpu_kernel void @used_lds_8252_max_group_size_96() #4 {
  ; Address of the first byte of the array; the volatile store keeps it used.
  %lds.base = getelementptr inbounds [8252 x i8], [8252 x i8] addrspace(3)* @lds8252, i32 0, i32 0
  store volatile i8 1, i8 addrspace(3)* %lds.base
  ret void
}

; Kernel using the 8252-byte addrspace(3) array @lds8252, with the flat work
; group size bounded to 1..128 by attribute group #5.
; GCN-LABEL: {{^}}used_lds_8252_max_group_size_128:
; GFX9:       ; Occupancy: 10{{$}}
; GFX1010W64:    ; Occupancy: 14{{$}}
; GFX1010W32:    ; Occupancy: 20{{$}}
define amdgpu_kernel void @used_lds_8252_max_group_size_128() #5 {
  ; Address of the first byte of the array; the volatile store keeps it used.
  %lds.base = getelementptr inbounds [8252 x i8], [8252 x i8] addrspace(3)* @lds8252, i32 0, i32 0
  store volatile i8 1, i8 addrspace(3)* %lds.base
  ret void
}

; Kernel using the 8252-byte addrspace(3) array @lds8252, with the flat work
; group size bounded to 1..192 by attribute group #6.
; GCN-LABEL: {{^}}used_lds_8252_max_group_size_192:
; GFX9:       ; Occupancy: 10{{$}}
; GFX1010W64:    ; Occupancy: 20{{$}}
; GFX1010W32:    ; Occupancy: 20{{$}}
define amdgpu_kernel void @used_lds_8252_max_group_size_192() #6 {
  ; Address of the first byte of the array; the volatile store keeps it used.
  %lds.base = getelementptr inbounds [8252 x i8], [8252 x i8] addrspace(3)* @lds8252, i32 0, i32 0
  store volatile i8 1, i8 addrspace(3)* %lds.base
  ret void
}

; Kernel using the 8252-byte addrspace(3) array @lds8252, with the flat work
; group size bounded to 1..256 by attribute group #7.
; GCN-LABEL: {{^}}used_lds_8252_max_group_size_256:
; GFX9:       ; Occupancy: 10{{$}}
; GFX1010W64:    ; Occupancy: 20{{$}}
; GFX1010W32:    ; Occupancy: 20{{$}}
define amdgpu_kernel void @used_lds_8252_max_group_size_256() #7 {
  ; Address of the first byte of the array; the volatile store keeps it used.
  %lds.base = getelementptr inbounds [8252 x i8], [8252 x i8] addrspace(3)* @lds8252, i32 0, i32 0
  store volatile i8 1, i8 addrspace(3)* %lds.base
  ret void
}

; Kernel using the 8252-byte addrspace(3) array @lds8252, with the flat work
; group size bounded to 1..512 by attribute group #8.
; GCN-LABEL: {{^}}used_lds_8252_max_group_size_512:
; GFX9:       ; Occupancy: 10{{$}}
; GFX1010W64:    ; Occupancy: 20{{$}}
; GFX1010W32:    ; Occupancy: 20{{$}}
define amdgpu_kernel void @used_lds_8252_max_group_size_512() #8 {
  ; Address of the first byte of the array; the volatile store keeps it used.
  %lds.base = getelementptr inbounds [8252 x i8], [8252 x i8] addrspace(3)* @lds8252, i32 0, i32 0
  store volatile i8 1, i8 addrspace(3)* %lds.base
  ret void
}

; Kernel using the 8252-byte addrspace(3) array @lds8252, with the flat work
; group size bounded to 1..1024 by attribute group #9.
; GCN-LABEL: {{^}}used_lds_8252_max_group_size_1024:
; GFX9:       ; Occupancy: 10{{$}}
; GFX1010W64:    ; Occupancy: 20{{$}}
; GFX1010W32:    ; Occupancy: 20{{$}}
define amdgpu_kernel void @used_lds_8252_max_group_size_1024() #9 {
  ; Address of the first byte of the array; the volatile store keeps it used.
  %lds.base = getelementptr inbounds [8252 x i8], [8252 x i8] addrspace(3)* @lds8252, i32 0, i32 0
  store volatile i8 1, i8 addrspace(3)* %lds.base
  ret void
}

; Kernel using the 8252-byte addrspace(3) array @lds8252, with the flat work
; group size bounded to 1..32 by attribute group #10.
; GCN-LABEL: {{^}}used_lds_8252_max_group_size_32:
; GFX9:       ; Occupancy: 7{{$}}
; GFX1010W64:    ; Occupancy: 7{{$}}
; GFX1010W32:    ; Occupancy: 7{{$}}
define amdgpu_kernel void @used_lds_8252_max_group_size_32() #10 {
  ; Address of the first byte of the array; the volatile store keeps it used.
  %lds.base = getelementptr inbounds [8252 x i8], [8252 x i8] addrspace(3)* @lds8252, i32 0, i32 0
  store volatile i8 1, i8 addrspace(3)* %lds.base
  ret void
}

; Attribute groups referenced by the kernels in this file.
; #0-#2 set a "min,max" waves-per-EU request.
attributes #0 = { "amdgpu-waves-per-eu"="2,3" }
attributes #1 = { "amdgpu-waves-per-eu"="18,18" }
attributes #2 = { "amdgpu-waves-per-eu"="19,19" }
; #3-#10 set a "min,max" flat work group size of 1..N; each is used by the
; matching used_lds_8252_max_group_size_N kernel.
attributes #3 = { "amdgpu-flat-work-group-size"="1,64" }
attributes #4 = { "amdgpu-flat-work-group-size"="1,96" }
attributes #5 = { "amdgpu-flat-work-group-size"="1,128" }
attributes #6 = { "amdgpu-flat-work-group-size"="1,192" }
attributes #7 = { "amdgpu-flat-work-group-size"="1,256" }
attributes #8 = { "amdgpu-flat-work-group-size"="1,512" }
attributes #9 = { "amdgpu-flat-work-group-size"="1,1024" }
attributes #10 = { "amdgpu-flat-work-group-size"="1,32" }