Unverified commit 2ca30eb8, authored by Matt Arsenault, committed by GitHub
Browse files

AMDGPU/GlobalISel: Handle mubuf load/store for more types (#68268)

Fixes MUBUF path for most vectors and pointers, which unblocks fixing
the gfx6/7 run lines in assorted tests. Also fixes inconsistent behavior
for -flat-for-global.
parent ea71d2d0
Loading
Loading
Loading
Loading
+34 −10
Original line number Diff line number Diff line
@@ -574,11 +574,11 @@ class MUBUF_Store_Pseudo <string opName,

multiclass MUBUF_Pseudo_Store_Pats<string BaseInst, ValueType store_vt = i32, SDPatternOperator st = null_frag> {

  def _OFFSET : GCNPat <
  def : GCNPat <
    (st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset, i32:$offset)),
    (!cast<MUBUF_Pseudo>(BaseInst # _OFFSET) store_vt:$vdata, v4i32:$srsrc, i32:$soffset, i32:$offset)>;

  def _ADDR64 : GCNPat <
  def : GCNPat <
    (st store_vt:$vdata, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i32:$offset)),
    (!cast<MUBUF_Pseudo>(BaseInst # _ADDR64) store_vt:$vdata, i64:$vaddr, v4i32:$srsrc, i32:$soffset, i32:$offset)>;
}
@@ -912,10 +912,22 @@ defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_SBYTE", i32, sextloadi8_global>;
defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_USHORT", i32, extloadi16_global>;
defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_USHORT", i32, zextloadi16_global>;
defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_SSHORT", i32, sextloadi16_global>;
defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORD", i32, load_global>;
defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORDX2", v2i32, load_global>;
defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORDX3", v3i32, load_global>;
defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORDX4", v4i32, load_global>;

foreach vt = Reg32Types.types in {
defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORD", vt, load_global>;
}

foreach vt = VReg_64.RegTypes in {
defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORDX2", vt, load_global>;
}

foreach vt = VReg_96.RegTypes in {
defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORDX3", vt, load_global>;
}

foreach vt = VReg_128.RegTypes in {
defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORDX4", vt, load_global>;
}

defm BUFFER_STORE_BYTE : MUBUF_Pseudo_Stores <
  "buffer_store_byte", i32
@@ -938,10 +950,22 @@ defm BUFFER_STORE_DWORDX4 : MUBUF_Pseudo_Stores <

defm : MUBUF_Pseudo_Store_Pats<"BUFFER_STORE_BYTE", i32, truncstorei8_global>;
defm : MUBUF_Pseudo_Store_Pats<"BUFFER_STORE_SHORT", i32, truncstorei16_global>;
defm : MUBUF_Pseudo_Store_Pats<"BUFFER_STORE_DWORD", i32, store_global>;
defm : MUBUF_Pseudo_Store_Pats<"BUFFER_STORE_DWORDX2", v2i32, store_global>;
defm : MUBUF_Pseudo_Store_Pats<"BUFFER_STORE_DWORDX3", v3i32, store_global>;
defm : MUBUF_Pseudo_Store_Pats<"BUFFER_STORE_DWORDX4", v4i32, store_global>;

foreach vt = Reg32Types.types in {
defm : MUBUF_Pseudo_Store_Pats<"BUFFER_STORE_DWORD", vt, store_global>;
}

foreach vt = VReg_64.RegTypes in {
defm : MUBUF_Pseudo_Store_Pats<"BUFFER_STORE_DWORDX2", vt, store_global>;
}

foreach vt = VReg_96.RegTypes in {
defm : MUBUF_Pseudo_Store_Pats<"BUFFER_STORE_DWORDX3", vt, store_global>;
}

foreach vt = VReg_128.RegTypes in {
defm : MUBUF_Pseudo_Store_Pats<"BUFFER_STORE_DWORDX4", vt, store_global>;
}

defm BUFFER_ATOMIC_SWAP : MUBUF_Pseudo_Atomics <
  "buffer_atomic_swap", VGPR_32, i32
+24 −6
Original line number Diff line number Diff line
@@ -77,7 +77,10 @@ define amdgpu_ps i16 @extractelement_vgpr_v4i16_sgpr_idx(ptr addrspace(1) %ptr,
;
; GFX7-LABEL: extractelement_vgpr_v4i16_sgpr_idx:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; GFX7-NEXT:    s_mov_b32 s6, 0
; GFX7-NEXT:    s_mov_b32 s7, 0xf000
; GFX7-NEXT:    s_mov_b64 s[4:5], 0
; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
; GFX7-NEXT:    s_lshr_b32 s0, s2, 1
; GFX7-NEXT:    s_and_b32 s1, s2, 1
; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 1
@@ -150,7 +153,10 @@ define i16 @extractelement_vgpr_v4i16_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx)
; GFX7-LABEL: extractelement_vgpr_v4i16_vgpr_idx:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; GFX7-NEXT:    s_mov_b32 s6, 0
; GFX7-NEXT:    s_mov_b32 s7, 0xf000
; GFX7-NEXT:    s_mov_b64 s[4:5], 0
; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 1, v2
; GFX7-NEXT:    v_and_b32_e32 v2, 1, v2
; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
@@ -360,7 +366,10 @@ define i16 @extractelement_vgpr_v4i16_idx0(ptr addrspace(1) %ptr) {
; GFX7-LABEL: extractelement_vgpr_v4i16_idx0:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; GFX7-NEXT:    s_mov_b32 s6, 0
; GFX7-NEXT:    s_mov_b32 s7, 0xf000
; GFX7-NEXT:    s_mov_b64 s[4:5], 0
; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
@@ -402,7 +411,10 @@ define i16 @extractelement_vgpr_v4i16_idx1(ptr addrspace(1) %ptr) {
; GFX7-LABEL: extractelement_vgpr_v4i16_idx1:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; GFX7-NEXT:    s_mov_b32 s6, 0
; GFX7-NEXT:    s_mov_b32 s7, 0xf000
; GFX7-NEXT:    s_mov_b64 s[4:5], 0
; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT:    s_setpc_b64 s[30:31]
@@ -447,7 +459,10 @@ define i16 @extractelement_vgpr_v4i16_idx2(ptr addrspace(1) %ptr) {
; GFX7-LABEL: extractelement_vgpr_v4i16_idx2:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; GFX7-NEXT:    s_mov_b32 s6, 0
; GFX7-NEXT:    s_mov_b32 s7, 0xf000
; GFX7-NEXT:    s_mov_b64 s[4:5], 0
; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, v1
; GFX7-NEXT:    s_setpc_b64 s[30:31]
@@ -492,7 +507,10 @@ define i16 @extractelement_vgpr_v4i16_idx3(ptr addrspace(1) %ptr) {
; GFX7-LABEL: extractelement_vgpr_v4i16_idx3:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; GFX7-NEXT:    s_mov_b32 s6, 0
; GFX7-NEXT:    s_mov_b32 s7, 0xf000
; GFX7-NEXT:    s_mov_b64 s[4:5], 0
; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
; GFX7-NEXT:    s_setpc_b64 s[30:31]
+19 −19
Original line number Diff line number Diff line
@@ -318,6 +318,8 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_load_dwordx2 s[2:3], s[6:7], 0x0
; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
; CI-NEXT:    s_mov_b32 s6, -1
; CI-NEXT:    s_mov_b32 s7, 0xf000
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v0, s0
; CI-NEXT:    v_mov_b32_e32 v1, s1
@@ -334,9 +336,7 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1
; CI-NEXT:    v_div_fixup_f64 v[2:3], v[2:3], v[0:1], s[2:3]
; CI-NEXT:    v_trunc_f64_e32 v[2:3], v[2:3]
; CI-NEXT:    v_fma_f64 v[0:1], -v[2:3], v[0:1], s[2:3]
; CI-NEXT:    v_mov_b32_e32 v2, s4
; CI-NEXT:    v_mov_b32_e32 v3, s5
; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; CI-NEXT:    s_endpgm
;
; VI-LABEL: frem_f64:
@@ -381,6 +381,8 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1)
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_load_dwordx2 s[2:3], s[6:7], 0x0
; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
; CI-NEXT:    s_mov_b32 s6, -1
; CI-NEXT:    s_mov_b32 s7, 0xf000
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_rcp_f64_e32 v[0:1], s[0:1]
; CI-NEXT:    v_fma_f64 v[2:3], -s[0:1], v[0:1], 1.0
@@ -394,9 +396,7 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1)
; CI-NEXT:    v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5]
; CI-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
; CI-NEXT:    v_fma_f64 v[0:1], -v[0:1], s[0:1], v[2:3]
; CI-NEXT:    v_mov_b32_e32 v2, s4
; CI-NEXT:    v_mov_b32_e32 v3, s5
; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; CI-NEXT:    s_endpgm
;
; VI-LABEL: fast_frem_f64:
@@ -438,6 +438,8 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_load_dwordx2 s[2:3], s[6:7], 0x0
; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
; CI-NEXT:    s_mov_b32 s6, -1
; CI-NEXT:    s_mov_b32 s7, 0xf000
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_rcp_f64_e32 v[0:1], s[0:1]
; CI-NEXT:    v_fma_f64 v[2:3], -s[0:1], v[0:1], 1.0
@@ -451,9 +453,7 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(
; CI-NEXT:    v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5]
; CI-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
; CI-NEXT:    v_fma_f64 v[0:1], -v[0:1], s[0:1], v[2:3]
; CI-NEXT:    v_mov_b32_e32 v2, s4
; CI-NEXT:    v_mov_b32_e32 v3, s5
; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; CI-NEXT:    s_endpgm
;
; VI-LABEL: unsafe_frem_f64:
@@ -532,15 +532,15 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT:    v_fma_f32 v3, -v3, v6, v4
; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; CI-NEXT:    v_div_fmas_f32 v3, v3, v5, v6
; CI-NEXT:    s_mov_b32 s6, -1
; CI-NEXT:    s_mov_b32 s7, 0xf000
; CI-NEXT:    v_div_fixup_f32 v3, v3, v2, v1
; CI-NEXT:    v_trunc_f32_e32 v3, v3
; CI-NEXT:    v_fma_f32 v1, -v3, v2, v1
; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; CI-NEXT:    v_or_b32_e32 v2, v0, v1
; CI-NEXT:    v_mov_b32_e32 v0, s4
; CI-NEXT:    v_mov_b32_e32 v1, s5
; CI-NEXT:    flat_store_dword v[0:1], v2
; CI-NEXT:    v_or_b32_e32 v0, v0, v1
; CI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; CI-NEXT:    s_endpgm
;
; VI-LABEL: frem_v2f16:
@@ -669,15 +669,15 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT:    v_div_fmas_f32 v5, v5, v7, v8
; CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; CI-NEXT:    v_or_b32_e32 v0, v0, v1
; CI-NEXT:    s_mov_b32 s6, -1
; CI-NEXT:    s_mov_b32 s7, 0xf000
; CI-NEXT:    v_div_fixup_f32 v5, v5, v4, v3
; CI-NEXT:    v_trunc_f32_e32 v5, v5
; CI-NEXT:    v_fma_f32 v3, -v5, v4, v3
; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
; CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
; CI-NEXT:    v_or_b32_e32 v1, v2, v1
; CI-NEXT:    v_mov_b32_e32 v2, s4
; CI-NEXT:    v_mov_b32_e32 v3, s5
; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; CI-NEXT:    s_endpgm
;
; VI-LABEL: frem_v4f16:
@@ -1017,6 +1017,8 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT:    v_mov_b32_e32 v1, s9
; CI-NEXT:    v_div_scale_f64 v[2:3], s[6:7], v[0:1], v[0:1], s[0:1]
; CI-NEXT:    v_div_scale_f64 v[8:9], vcc, s[0:1], v[0:1], s[0:1]
; CI-NEXT:    s_mov_b32 s6, -1
; CI-NEXT:    s_mov_b32 s7, 0xf000
; CI-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
; CI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
; CI-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
@@ -1043,9 +1045,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT:    v_div_fixup_f64 v[4:5], v[4:5], v[2:3], s[2:3]
; CI-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
; CI-NEXT:    v_fma_f64 v[2:3], -v[4:5], v[2:3], s[2:3]
; CI-NEXT:    v_mov_b32_e32 v4, s4
; CI-NEXT:    v_mov_b32_e32 v5, s5
; CI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; CI-NEXT:    s_endpgm
;
; VI-LABEL: frem_v2f64:
+126 −103

File changed.

Preview size limit exceeded, changes collapsed.

+273 −35

File changed.

Preview size limit exceeded, changes collapsed.

Loading