AMDGPU: Fix computation for getOccupancyWithLocalMemSize (88aced1e) · Commits · llvm-doe / llvm-project

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp

+32 −9

Original line number	Diff line number	Diff line
		@@ -328,18 +328,41 @@ unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
		return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
		}

		// Estimates the occupancy, in waves, that function F can reach when it uses
		// Bytes of local memory (LDS).
		//
		// NOTE(review): this span is a rendered diff whose +/- markers were stripped,
		// so superseded and replacement lines are interleaved and the text does not
		// compile as shown. The hunk header reports 9 removed lines; they appear to
		// be the ones tagged "pre-change" below -- confirm against the repository.
		//
		// Behaviour of the added lines: returns 0 when getMaxWorkGroupsPerCU()
		// reports no work groups for the largest flat work group size of F; returns
		// 1 when not even one allocation of Bytes fits in getLocalMemorySize();
		// otherwise returns (work groups that fit in LDS, capped at
		// MaxWorkGroupsPerCu) * (waves per maximum-size work group), clamped to
		// getMaxWavesPerEU().
		// FIXME: Should return min,max range.
		unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
		const Function &F) const {
		// NOTE(review): pre-change lines (next three); superseded by the const
		// declarations and the guard that follow them.
		unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
		unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
		if (!WorkGroupsPerCu)
		// Upper bound of the flat work group size range of F, and the work group
		// count getMaxWorkGroupsPerCU() reports for a group of that size.
		const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
		const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
		if (!MaxWorkGroupsPerCu)
		return 0;
		// NOTE(review): pre-change computation (next six lines). It multiplied
		// getLocalMemorySize() by MaxWaves, divided by WorkGroupsPerCu and then by
		// Bytes (0 treated as 1), and clamped the result to [1, MaxWaves].
		unsigned MaxWaves = getMaxWavesPerEU();
		unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
		unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
		NumWaves = std::min(NumWaves, MaxWaves);
		NumWaves = std::max(NumWaves, 1u);
		return NumWaves;

		const unsigned WaveSize = getWavefrontSize();

		// FIXME: Do we need to account for alignment requirement of LDS rounding the
		// size up?
		// Compute restriction based on LDS usage
		// A Bytes value of 0 is replaced by 1 so the division cannot divide by zero.
		unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

		// This can be queried with more LDS than is possible, so just assume the
		// worst.
		if (NumGroups == 0)
		return 1;

		// LDS is not the only limit: never count more groups than
		// getMaxWorkGroupsPerCU() allowed above.
		NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

		// Round to the number of waves.
		// Ceiling division: a partially filled wave still counts as a whole wave.
		const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
		unsigned MaxWaves = NumGroups * MaxGroupNumWaves;

		// Clamp to the maximum possible number of waves.
		MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());

		// FIXME: Needs to be a multiple of the group size?
		//MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);

		// The early returns above handle the zero-group cases; the clamp enforces
		// the upper bound checked here.
		assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
		"computed invalid occupancy");
		return MaxWaves;
		}

		unsigned

llvm/test/CodeGen/AMDGPU/occupancy-levels.ll

+89 −0

Original line number	Diff line number	Diff line
		@@ -283,6 +283,95 @@ define amdgpu_kernel void @used_lds_13112() {
		ret void
		}

		; Kernel that touches the 8252-byte LDS array @lds8252 while its flat work
		; group size is capped at 64 (attribute group #3). The wave64 gfx1010 check
		; below uses the same prefix spelling as the sibling tests so that it is
		; actually matched instead of being silently ignored.
		; GCN-LABEL: {{^}}used_lds_8252_max_group_size_64:
		; GFX9: ; Occupancy: 7{{$}}
		; GFX1010W64: ; Occupancy: 7{{$}}
		; GFX1010W32: ; Occupancy: 14{{$}}
		@lds8252 = internal addrspace(3) global [8252 x i8] undef, align 4
		define amdgpu_kernel void @used_lds_8252_max_group_size_64() #3 {
		; The volatile store keeps the LDS array referenced by the kernel.
		%p = bitcast [8252 x i8] addrspace(3)* @lds8252 to i8 addrspace(3)*
		store volatile i8 1, i8 addrspace(3)* %p
		ret void
		}

		; Uses the 8252-byte LDS array @lds8252 from a kernel whose flat work group
		; size is capped at 96 (attribute group #4).
		; GCN-LABEL: {{^}}used_lds_8252_max_group_size_96:
		; GFX9: ; Occupancy: 10{{$}}
		; GFX1010W64: ; Occupancy: 14{{$}}
		; GFX1010W32: ; Occupancy: 20{{$}}
		define amdgpu_kernel void @used_lds_8252_max_group_size_96() #4 {
		; Address of the first byte of the array; the volatile store keeps it used.
		%lds.base = getelementptr inbounds [8252 x i8], [8252 x i8] addrspace(3)* @lds8252, i32 0, i32 0
		store volatile i8 1, i8 addrspace(3)* %lds.base
		ret void
		}

		; Uses the 8252-byte LDS array @lds8252 from a kernel whose flat work group
		; size is capped at 128 (attribute group #5).
		; GCN-LABEL: {{^}}used_lds_8252_max_group_size_128:
		; GFX9: ; Occupancy: 10{{$}}
		; GFX1010W64: ; Occupancy: 14{{$}}
		; GFX1010W32: ; Occupancy: 20{{$}}
		define amdgpu_kernel void @used_lds_8252_max_group_size_128() #5 {
		; Address of the first byte of the array; the volatile store keeps it used.
		%lds.base = getelementptr inbounds [8252 x i8], [8252 x i8] addrspace(3)* @lds8252, i32 0, i32 0
		store volatile i8 1, i8 addrspace(3)* %lds.base
		ret void
		}

		; Uses the 8252-byte LDS array @lds8252 from a kernel whose flat work group
		; size is capped at 192 (attribute group #6).
		; GCN-LABEL: {{^}}used_lds_8252_max_group_size_192:
		; GFX9: ; Occupancy: 10{{$}}
		; GFX1010W64: ; Occupancy: 20{{$}}
		; GFX1010W32: ; Occupancy: 20{{$}}
		define amdgpu_kernel void @used_lds_8252_max_group_size_192() #6 {
		; Address of the first byte of the array; the volatile store keeps it used.
		%lds.base = getelementptr inbounds [8252 x i8], [8252 x i8] addrspace(3)* @lds8252, i32 0, i32 0
		store volatile i8 1, i8 addrspace(3)* %lds.base
		ret void
		}

		; Uses the 8252-byte LDS array @lds8252 from a kernel whose flat work group
		; size is capped at 256 (attribute group #7).
		; GCN-LABEL: {{^}}used_lds_8252_max_group_size_256:
		; GFX9: ; Occupancy: 10{{$}}
		; GFX1010W64: ; Occupancy: 20{{$}}
		; GFX1010W32: ; Occupancy: 20{{$}}
		define amdgpu_kernel void @used_lds_8252_max_group_size_256() #7 {
		; Address of the first byte of the array; the volatile store keeps it used.
		%lds.base = getelementptr inbounds [8252 x i8], [8252 x i8] addrspace(3)* @lds8252, i32 0, i32 0
		store volatile i8 1, i8 addrspace(3)* %lds.base
		ret void
		}

		; Uses the 8252-byte LDS array @lds8252 from a kernel whose flat work group
		; size is capped at 512 (attribute group #8).
		; GCN-LABEL: {{^}}used_lds_8252_max_group_size_512:
		; GFX9: ; Occupancy: 10{{$}}
		; GFX1010W64: ; Occupancy: 20{{$}}
		; GFX1010W32: ; Occupancy: 20{{$}}
		define amdgpu_kernel void @used_lds_8252_max_group_size_512() #8 {
		; Address of the first byte of the array; the volatile store keeps it used.
		%lds.base = getelementptr inbounds [8252 x i8], [8252 x i8] addrspace(3)* @lds8252, i32 0, i32 0
		store volatile i8 1, i8 addrspace(3)* %lds.base
		ret void
		}

		; Uses the 8252-byte LDS array @lds8252 from a kernel whose flat work group
		; size is capped at 1024 (attribute group #9).
		; GCN-LABEL: {{^}}used_lds_8252_max_group_size_1024:
		; GFX9: ; Occupancy: 10{{$}}
		; GFX1010W64: ; Occupancy: 20{{$}}
		; GFX1010W32: ; Occupancy: 20{{$}}
		define amdgpu_kernel void @used_lds_8252_max_group_size_1024() #9 {
		; Address of the first byte of the array; the volatile store keeps it used.
		%lds.base = getelementptr inbounds [8252 x i8], [8252 x i8] addrspace(3)* @lds8252, i32 0, i32 0
		store volatile i8 1, i8 addrspace(3)* %lds.base
		ret void
		}

		; Uses the 8252-byte LDS array @lds8252 from a kernel whose flat work group
		; size is capped at 32 (attribute group #10).
		; GCN-LABEL: {{^}}used_lds_8252_max_group_size_32:
		; GFX9: ; Occupancy: 7{{$}}
		; GFX1010W64: ; Occupancy: 7{{$}}
		; GFX1010W32: ; Occupancy: 7{{$}}
		define amdgpu_kernel void @used_lds_8252_max_group_size_32() #10 {
		; Address of the first byte of the array; the volatile store keeps it used.
		%lds.base = getelementptr inbounds [8252 x i8], [8252 x i8] addrspace(3)* @lds8252, i32 0, i32 0
		store volatile i8 1, i8 addrspace(3)* %lds.base
		ret void
		}

		; Attribute groups referenced by the kernels in this file.
		; #0-#2 set "amdgpu-waves-per-eu"; the kernels that use them are outside this
		; excerpt.
		; #3-#10 set "amdgpu-flat-work-group-size" to "1,N" for the
		; used_lds_8252_max_group_size_N kernels; N is presumably the maximum flat
		; work group size (the second value of the pair) -- confirm against the
		; AMDGPU backend documentation.
		attributes #0 = { "amdgpu-waves-per-eu"="2,3" }
		attributes #1 = { "amdgpu-waves-per-eu"="18,18" }
		attributes #2 = { "amdgpu-waves-per-eu"="19,19" }
		attributes #3 = { "amdgpu-flat-work-group-size"="1,64" }
		attributes #4 = { "amdgpu-flat-work-group-size"="1,96" }
		attributes #5 = { "amdgpu-flat-work-group-size"="1,128" }
		attributes #6 = { "amdgpu-flat-work-group-size"="1,192" }
		attributes #7 = { "amdgpu-flat-work-group-size"="1,256" }
		attributes #8 = { "amdgpu-flat-work-group-size"="1,512" }
		attributes #9 = { "amdgpu-flat-work-group-size"="1,1024" }
		attributes #10 = { "amdgpu-flat-work-group-size"="1,32" }