Commit 2605adb6 authored by Austin Kerbow
Browse files

[AMDGPU][GlobalISel] Select 8-byte LDS Ops with 4-byte alignment

Reviewers: arsenm

Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, rovka, dstuttard, tpr, t-tye, hiraditya, Petar.Avramovic, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D73585
parent ce07cdea
Loading
Loading
Loading
Loading
+4 −0
Original line number Diff line number Diff line
@@ -88,6 +88,10 @@ def gi_ds_1addr_1offset :
    GIComplexOperandMatcher<s32, "selectDS1Addr1Offset">,
    GIComplexPatternEquiv<DS1Addr1Offset>;

// GlobalISel complex operand matcher for 64-bit DS accesses that are only
// 4-byte aligned. Matching is delegated to the instruction selector method
// named "selectDS64Bit4ByteAligned", and the matcher is declared equivalent to
// the DS64Bit4ByteAligned complex pattern so patterns written against that
// pattern can use it.
def gi_ds_64bit_4byte_aligned :
    GIComplexOperandMatcher<s64, "selectDS64Bit4ByteAligned">,
    GIComplexPatternEquiv<DS64Bit4ByteAligned>;

// GlobalISel complex operand matcher that delegates to the instruction
// selector method named "selectMUBUFAddr64" and is declared equivalent to the
// MUBUFAddr64 complex pattern.
def gi_mubuf_addr64 :
    GIComplexOperandMatcher<s64, "selectMUBUFAddr64">,
    GIComplexPatternEquiv<MUBUFAddr64>;
+44 −0
Original line number Diff line number Diff line
@@ -2399,6 +2399,50 @@ AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
    }};
}

/// Select the (ptr, offset0, offset1) operands for a 64-bit DS access that is
/// only 4-byte aligned and is therefore performed as two dword accesses.
///
/// \p Root is the address operand of the memory instruction. The returned
/// renderers add, in order: the base pointer, the dword offset of the first
/// half and the dword offset of the second half.
///
/// If \p Root is a pointer plus a constant byte offset that is positive, a
/// multiple of 4, and whose scaled second offset is accepted by
/// isDSOffsetLegal (8 offset bits), the constant is folded into the two
/// immediate offsets. In every other case the address is used unmodified with
/// offsets 0 and 1.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
  const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
  if (RootDef) {
    Register PtrBase;
    int64_t Offset;

    std::tie(PtrBase, Offset) =
      getPtrBaseWithConstantOffset(Root.getReg(), *MRI);

    // Only fold offsets that can be represented exactly. A negative offset
    // would yield a negative DWordOffset0 (only DWordOffset1 is range checked
    // below), and an offset that is not a multiple of 4 would lose its
    // remainder in the division, changing the accessed address.
    if (Offset > 0 && Offset % 4 == 0) {
      const int64_t DWordOffset0 = Offset / 4;
      const int64_t DWordOffset1 = DWordOffset0 + 1;
      // DWordOffset0 is non-negative and smaller than DWordOffset1, so
      // checking the larger of the two is sufficient.
      if (isDSOffsetLegal(PtrBase, DWordOffset1, 8)) {
        // (add n0, c0)
        return {{
            [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); },
            [=](MachineInstrBuilder &MIB) { MIB.addImm(DWordOffset0); },
            [=](MachineInstrBuilder &MIB) { MIB.addImm(DWordOffset1); }
          }};
      }
    }

    // TODO: Roots defined by G_SUB and roots that are constant addresses are
    // not folded yet; they use the default form below.
  }

  // Default: use the address as is and access dwords 0 and 1 relative to it.
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(1); }
    }};
}

/// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
/// the base value with the constant offset. There may be intervening copies
/// between \p Root and the identified constant. Returns \p Root, 0 if this does
+2 −0
Original line number Diff line number Diff line
@@ -179,6 +179,8 @@ private:

  InstructionSelector::ComplexRendererFns
  selectDS1Addr1Offset(MachineOperand &Root) const;
  /// Complex operand selector for 64-bit DS accesses with 4-byte alignment.
  /// Produces three renderers for \p Root: the base pointer followed by two
  /// immediate offsets (0 and 1 when no constant offset is folded).
  InstructionSelector::ComplexRendererFns
  selectDS64Bit4ByteAligned(MachineOperand &Root) const;

  std::pair<Register, int64_t>
  getPtrBaseWithConstantOffset(Register Root,
+20 −16
Original line number Diff line number Diff line
@@ -737,31 +737,35 @@ def : DSWritePat <DS_WRITE_B16_D16_HI, i32, store_hi16_local>;
def : DSWritePat <DS_WRITE_B8_D16_HI, i32, truncstorei8_hi16_local>;
}


class DS64Bit4ByteAlignedReadPat<DS_Pseudo inst, PatFrag frag> : GCNPat <
  (v2i32 (frag (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, i8:$offset1))),
class DS64Bit4ByteAlignedReadPat<DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat <
  (vt:$value (frag (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, i8:$offset1))),
  (inst $ptr, $offset0, $offset1, (i1 0))
>;

class DS64Bit4ByteAlignedWritePat<DS_Pseudo inst, PatFrag frag> : GCNPat<
  (frag v2i32:$value, (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, i8:$offset1)),
  (inst $ptr, (i32 (EXTRACT_SUBREG $value, sub0)),
              (i32 (EXTRACT_SUBREG $value, sub1)), $offset0, $offset1,
class DS64Bit4ByteAlignedWritePat<DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat<
  (frag vt:$value, (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, i8:$offset1)),
  (inst $ptr, (i32 (EXTRACT_SUBREG VReg_64:$value, sub0)),
              (i32 (EXTRACT_SUBREG VReg_64:$value, sub1)), $offset0, $offset1,
              (i1 0))
>;

// v2i32 loads are split into i32 loads on SI during lowering, due to a bug
// related to bounds checking.
multiclass DS64Bit4ByteAlignedPat_mc<ValueType vt> {
  let OtherPredicates = [LDSRequiresM0Init, isGFX7Plus] in {
def : DS64Bit4ByteAlignedReadPat<DS_READ2_B32, load_local_m0>;
def : DS64Bit4ByteAlignedWritePat<DS_WRITE2_B32, store_local_m0>;
    def : DS64Bit4ByteAlignedReadPat<DS_READ2_B32, vt, load_local_m0>;
    def : DS64Bit4ByteAlignedWritePat<DS_WRITE2_B32, vt, store_local_m0>;
  }

  let OtherPredicates = [NotLDSRequiresM0Init] in {
def : DS64Bit4ByteAlignedReadPat<DS_READ2_B32_gfx9, load_local>;
def : DS64Bit4ByteAlignedWritePat<DS_WRITE2_B32_gfx9, store_local>;
    def : DS64Bit4ByteAlignedReadPat<DS_READ2_B32_gfx9, vt, load_local>;
    def : DS64Bit4ByteAlignedWritePat<DS_WRITE2_B32_gfx9, vt, store_local>;
  }
}

// v2i32 loads are split into i32 loads on SI during lowering, due to a bug
// related to bounds checking.
// Instantiate the DS64Bit4ByteAlignedPat_mc patterns once for every value type
// listed in VReg_64.RegTypes.
foreach vt = VReg_64.RegTypes in {
defm : DS64Bit4ByteAlignedPat_mc<vt>;
}

let AddedComplexity = 100 in {

+98 −130
Original line number Diff line number Diff line
@@ -28,12 +28,6 @@ body: |
    ; GFX7: $m0 = S_MOV_B32 -1
    ; GFX7: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 4, addrspace 3)
    ; GFX7: $vgpr0 = COPY [[DS_READ_B32_]]
    ; GFX7-DS128-LABEL: name: load_local_s32_from_4
    ; GFX7-DS128: liveins: $vgpr0
    ; GFX7-DS128: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
    ; GFX7-DS128: $m0 = S_MOV_B32 -1
    ; GFX7-DS128: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 4, addrspace 3)
    ; GFX7-DS128: $vgpr0 = COPY [[DS_READ_B32_]]
    ; GFX9-LABEL: name: load_local_s32_from_4
    ; GFX9: liveins: $vgpr0
    ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
@@ -68,12 +62,6 @@ body: |
    ; GFX7: $m0 = S_MOV_B32 -1
    ; GFX7: [[DS_READ_U16_:%[0-9]+]]:vgpr_32 = DS_READ_U16 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 2, addrspace 3)
    ; GFX7: $vgpr0 = COPY [[DS_READ_U16_]]
    ; GFX7-DS128-LABEL: name: load_local_s32_from_2
    ; GFX7-DS128: liveins: $vgpr0
    ; GFX7-DS128: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
    ; GFX7-DS128: $m0 = S_MOV_B32 -1
    ; GFX7-DS128: [[DS_READ_U16_:%[0-9]+]]:vgpr_32 = DS_READ_U16 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 2, addrspace 3)
    ; GFX7-DS128: $vgpr0 = COPY [[DS_READ_U16_]]
    ; GFX9-LABEL: name: load_local_s32_from_2
    ; GFX9: liveins: $vgpr0
    ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
@@ -112,12 +100,6 @@ body: |
    ; GFX7: $m0 = S_MOV_B32 -1
    ; GFX7: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 1, addrspace 3)
    ; GFX7: $vgpr0 = COPY [[DS_READ_U8_]]
    ; GFX7-DS128-LABEL: name: load_local_s32_from_1
    ; GFX7-DS128: liveins: $vgpr0
    ; GFX7-DS128: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
    ; GFX7-DS128: $m0 = S_MOV_B32 -1
    ; GFX7-DS128: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 1, addrspace 3)
    ; GFX7-DS128: $vgpr0 = COPY [[DS_READ_U8_]]
    ; GFX9-LABEL: name: load_local_s32_from_1
    ; GFX9: liveins: $vgpr0
    ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
@@ -152,12 +134,6 @@ body: |
    ; GFX7: $m0 = S_MOV_B32 -1
    ; GFX7: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 8, addrspace 3)
    ; GFX7: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]]
    ; GFX7-DS128-LABEL: name: load_local_v2s32
    ; GFX7-DS128: liveins: $vgpr0
    ; GFX7-DS128: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
    ; GFX7-DS128: $m0 = S_MOV_B32 -1
    ; GFX7-DS128: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 8, addrspace 3)
    ; GFX7-DS128: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]]
    ; GFX9-LABEL: name: load_local_v2s32
    ; GFX9: liveins: $vgpr0
    ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
@@ -188,21 +164,15 @@ body: |
    ; GFX6: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>)
    ; GFX7-LABEL: name: load_local_v2s32_align4
    ; GFX7: liveins: $vgpr0
    ; GFX7: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
    ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
    ; GFX7: $m0 = S_MOV_B32 -1
    ; GFX7: [[LOAD:%[0-9]+]]:vreg_64(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3)
    ; GFX7: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>)
    ; GFX7-DS128-LABEL: name: load_local_v2s32_align4
    ; GFX7-DS128: liveins: $vgpr0
    ; GFX7-DS128: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
    ; GFX7-DS128: $m0 = S_MOV_B32 -1
    ; GFX7-DS128: [[LOAD:%[0-9]+]]:vreg_64(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3)
    ; GFX7-DS128: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>)
    ; GFX7: [[DS_READ2_B32_:%[0-9]+]]:vreg_64 = DS_READ2_B32 [[COPY]], 0, 1, 0, implicit $m0, implicit $exec :: (load 8, align 4, addrspace 3)
    ; GFX7: $vgpr0_vgpr1 = COPY [[DS_READ2_B32_]]
    ; GFX9-LABEL: name: load_local_v2s32_align4
    ; GFX9: liveins: $vgpr0
    ; GFX9: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
    ; GFX9: [[LOAD:%[0-9]+]]:vreg_64(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3)
    ; GFX9: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>)
    ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
    ; GFX9: [[DS_READ2_B32_gfx9_:%[0-9]+]]:vreg_64 = DS_READ2_B32_gfx9 [[COPY]], 0, 1, 0, implicit $exec :: (load 8, align 4, addrspace 3)
    ; GFX9: $vgpr0_vgpr1 = COPY [[DS_READ2_B32_gfx9_]]
    %0:vgpr(p3) = COPY $vgpr0
    %1:vgpr(<2 x s32>) = G_LOAD %0 :: (load 8, align 4, addrspace 3)
    $vgpr0_vgpr1 = COPY %1
@@ -232,12 +202,6 @@ body: |
    ; GFX7: $m0 = S_MOV_B32 -1
    ; GFX7: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 8, addrspace 3)
    ; GFX7: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]]
    ; GFX7-DS128-LABEL: name: load_local_s64
    ; GFX7-DS128: liveins: $vgpr0
    ; GFX7-DS128: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
    ; GFX7-DS128: $m0 = S_MOV_B32 -1
    ; GFX7-DS128: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 8, addrspace 3)
    ; GFX7-DS128: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]]
    ; GFX9-LABEL: name: load_local_s64
    ; GFX9: liveins: $vgpr0
    ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
@@ -268,21 +232,15 @@ body: |
    ; GFX6: $vgpr0_vgpr1 = COPY [[LOAD]](s64)
    ; GFX7-LABEL: name: load_local_s64_align4
    ; GFX7: liveins: $vgpr0
    ; GFX7: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
    ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
    ; GFX7: $m0 = S_MOV_B32 -1
    ; GFX7: [[LOAD:%[0-9]+]]:vreg_64(s64) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3)
    ; GFX7: $vgpr0_vgpr1 = COPY [[LOAD]](s64)
    ; GFX7-DS128-LABEL: name: load_local_s64_align4
    ; GFX7-DS128: liveins: $vgpr0
    ; GFX7-DS128: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
    ; GFX7-DS128: $m0 = S_MOV_B32 -1
    ; GFX7-DS128: [[LOAD:%[0-9]+]]:vreg_64(s64) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3)
    ; GFX7-DS128: $vgpr0_vgpr1 = COPY [[LOAD]](s64)
    ; GFX7: [[DS_READ2_B32_:%[0-9]+]]:vreg_64 = DS_READ2_B32 [[COPY]], 0, 1, 0, implicit $m0, implicit $exec :: (load 8, align 4, addrspace 3)
    ; GFX7: $vgpr0_vgpr1 = COPY [[DS_READ2_B32_]]
    ; GFX9-LABEL: name: load_local_s64_align4
    ; GFX9: liveins: $vgpr0
    ; GFX9: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
    ; GFX9: [[LOAD:%[0-9]+]]:vreg_64(s64) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3)
    ; GFX9: $vgpr0_vgpr1 = COPY [[LOAD]](s64)
    ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
    ; GFX9: [[DS_READ2_B32_gfx9_:%[0-9]+]]:vreg_64 = DS_READ2_B32_gfx9 [[COPY]], 0, 1, 0, implicit $exec :: (load 8, align 4, addrspace 3)
    ; GFX9: $vgpr0_vgpr1 = COPY [[DS_READ2_B32_gfx9_]]
    %0:vgpr(p3) = COPY $vgpr0
    %1:vgpr(s64) = G_LOAD %0 :: (load 8, align 4, addrspace 3)
    $vgpr0_vgpr1 = COPY %1
@@ -312,12 +270,6 @@ body: |
    ; GFX7: $m0 = S_MOV_B32 -1
    ; GFX7: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 4, addrspace 3)
    ; GFX7: $vgpr0 = COPY [[DS_READ_B32_]]
    ; GFX7-DS128-LABEL: name: load_local_p3_from_4
    ; GFX7-DS128: liveins: $vgpr0
    ; GFX7-DS128: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
    ; GFX7-DS128: $m0 = S_MOV_B32 -1
    ; GFX7-DS128: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 4, addrspace 3)
    ; GFX7-DS128: $vgpr0 = COPY [[DS_READ_B32_]]
    ; GFX9-LABEL: name: load_local_p3_from_4
    ; GFX9: liveins: $vgpr0
    ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
@@ -352,12 +304,6 @@ body: |
    ; GFX7: $m0 = S_MOV_B32 -1
    ; GFX7: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 4, addrspace 3)
    ; GFX7: $vgpr0 = COPY [[DS_READ_B32_]]
    ; GFX7-DS128-LABEL: name: load_local_p5_from_4
    ; GFX7-DS128: liveins: $vgpr0
    ; GFX7-DS128: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
    ; GFX7-DS128: $m0 = S_MOV_B32 -1
    ; GFX7-DS128: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 4, addrspace 3)
    ; GFX7-DS128: $vgpr0 = COPY [[DS_READ_B32_]]
    ; GFX9-LABEL: name: load_local_p5_from_4
    ; GFX9: liveins: $vgpr0
    ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
@@ -392,12 +338,6 @@ body: |
    ; GFX7: $m0 = S_MOV_B32 -1
    ; GFX7: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 8, addrspace 3)
    ; GFX7: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]]
    ; GFX7-DS128-LABEL: name: load_local_p1_align8
    ; GFX7-DS128: liveins: $vgpr0
    ; GFX7-DS128: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
    ; GFX7-DS128: $m0 = S_MOV_B32 -1
    ; GFX7-DS128: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 8, addrspace 3)
    ; GFX7-DS128: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]]
    ; GFX9-LABEL: name: load_local_p1_align8
    ; GFX9: liveins: $vgpr0
    ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
@@ -428,21 +368,15 @@ body: |
    ; GFX6: $vgpr0_vgpr1 = COPY [[LOAD]](p1)
    ; GFX7-LABEL: name: load_local_p1_align4
    ; GFX7: liveins: $vgpr0
    ; GFX7: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
    ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
    ; GFX7: $m0 = S_MOV_B32 -1
    ; GFX7: [[LOAD:%[0-9]+]]:vreg_64(p1) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3)
    ; GFX7: $vgpr0_vgpr1 = COPY [[LOAD]](p1)
    ; GFX7-DS128-LABEL: name: load_local_p1_align4
    ; GFX7-DS128: liveins: $vgpr0
    ; GFX7-DS128: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
    ; GFX7-DS128: $m0 = S_MOV_B32 -1
    ; GFX7-DS128: [[LOAD:%[0-9]+]]:vreg_64(p1) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3)
    ; GFX7-DS128: $vgpr0_vgpr1 = COPY [[LOAD]](p1)
    ; GFX7: [[DS_READ2_B32_:%[0-9]+]]:vreg_64 = DS_READ2_B32 [[COPY]], 0, 1, 0, implicit $m0, implicit $exec :: (load 8, align 4, addrspace 3)
    ; GFX7: $vgpr0_vgpr1 = COPY [[DS_READ2_B32_]]
    ; GFX9-LABEL: name: load_local_p1_align4
    ; GFX9: liveins: $vgpr0
    ; GFX9: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
    ; GFX9: [[LOAD:%[0-9]+]]:vreg_64(p1) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3)
    ; GFX9: $vgpr0_vgpr1 = COPY [[LOAD]](p1)
    ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
    ; GFX9: [[DS_READ2_B32_gfx9_:%[0-9]+]]:vreg_64 = DS_READ2_B32_gfx9 [[COPY]], 0, 1, 0, implicit $exec :: (load 8, align 4, addrspace 3)
    ; GFX9: $vgpr0_vgpr1 = COPY [[DS_READ2_B32_gfx9_]]
    %0:vgpr(p3) = COPY $vgpr0
    %1:vgpr(p1) = G_LOAD %0 :: (load 8, align 4, addrspace 3)
    $vgpr0_vgpr1 = COPY %1
@@ -472,12 +406,6 @@ body: |
    ; GFX7: $m0 = S_MOV_B32 -1
    ; GFX7: [[LOAD:%[0-9]+]]:vreg_64(p999) = G_LOAD [[COPY]](p3) :: (load 8, addrspace 3)
    ; GFX7: $vgpr0_vgpr1 = COPY [[LOAD]](p999)
    ; GFX7-DS128-LABEL: name: load_local_p999_from_8
    ; GFX7-DS128: liveins: $vgpr0
    ; GFX7-DS128: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
    ; GFX7-DS128: $m0 = S_MOV_B32 -1
    ; GFX7-DS128: [[LOAD:%[0-9]+]]:vreg_64(p999) = G_LOAD [[COPY]](p3) :: (load 8, addrspace 3)
    ; GFX7-DS128: $vgpr0_vgpr1 = COPY [[LOAD]](p999)
    ; GFX9-LABEL: name: load_local_p999_from_8
    ; GFX9: liveins: $vgpr0
    ; GFX9: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
@@ -512,12 +440,6 @@ body: |
    ; GFX7: $m0 = S_MOV_B32 -1
    ; GFX7: [[LOAD:%[0-9]+]]:vreg_64(<2 x p3>) = G_LOAD [[COPY]](p3) :: (load 8, addrspace 3)
    ; GFX7: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>)
    ; GFX7-DS128-LABEL: name: load_local_v2p3
    ; GFX7-DS128: liveins: $vgpr0
    ; GFX7-DS128: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
    ; GFX7-DS128: $m0 = S_MOV_B32 -1
    ; GFX7-DS128: [[LOAD:%[0-9]+]]:vreg_64(<2 x p3>) = G_LOAD [[COPY]](p3) :: (load 8, addrspace 3)
    ; GFX7-DS128: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>)
    ; GFX9-LABEL: name: load_local_v2p3
    ; GFX9: liveins: $vgpr0
    ; GFX9: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
@@ -552,12 +474,6 @@ body: |
    ; GFX7: $m0 = S_MOV_B32 -1
    ; GFX7: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 4, addrspace 3)
    ; GFX7: $vgpr0 = COPY [[DS_READ_B32_]]
    ; GFX7-DS128-LABEL: name: load_local_v2s16
    ; GFX7-DS128: liveins: $vgpr0
    ; GFX7-DS128: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
    ; GFX7-DS128: $m0 = S_MOV_B32 -1
    ; GFX7-DS128: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 4, addrspace 3)
    ; GFX7-DS128: $vgpr0 = COPY [[DS_READ_B32_]]
    ; GFX9-LABEL: name: load_local_v2s16
    ; GFX9: liveins: $vgpr0
    ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
@@ -592,12 +508,6 @@ body: |
    ; GFX7: $m0 = S_MOV_B32 -1
    ; GFX7: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 8, addrspace 3)
    ; GFX7: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]]
    ; GFX7-DS128-LABEL: name: load_local_v4s16
    ; GFX7-DS128: liveins: $vgpr0
    ; GFX7-DS128: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
    ; GFX7-DS128: $m0 = S_MOV_B32 -1
    ; GFX7-DS128: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 8, addrspace 3)
    ; GFX7-DS128: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]]
    ; GFX9-LABEL: name: load_local_v4s16
    ; GFX9: liveins: $vgpr0
    ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
@@ -659,12 +569,6 @@ body: |
    ; GFX7: $m0 = S_MOV_B32 -1
    ; GFX7: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 [[COPY]], 65535, 0, implicit $m0, implicit $exec :: (load 1, addrspace 3)
    ; GFX7: $vgpr0 = COPY [[DS_READ_U8_]]
    ; GFX7-DS128-LABEL: name: load_local_s32_from_1_gep_65535
    ; GFX7-DS128: liveins: $vgpr0
    ; GFX7-DS128: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
    ; GFX7-DS128: $m0 = S_MOV_B32 -1
    ; GFX7-DS128: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 [[COPY]], 65535, 0, implicit $m0, implicit $exec :: (load 1, addrspace 3)
    ; GFX7-DS128: $vgpr0 = COPY [[DS_READ_U8_]]
    ; GFX9-LABEL: name: load_local_s32_from_1_gep_65535
    ; GFX9: liveins: $vgpr0
    ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
@@ -750,14 +654,6 @@ body: |
    ; GFX7: $m0 = S_MOV_B32 -1
    ; GFX7: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 %2, 0, 0, implicit $m0, implicit $exec :: (load 1, addrspace 3)
    ; GFX7: $vgpr0 = COPY [[DS_READ_U8_]]
    ; GFX7-DS128-LABEL: name: load_local_s32_from_1_gep_65536
    ; GFX7-DS128: liveins: $vgpr0
    ; GFX7-DS128: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
    ; GFX7-DS128: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65536, implicit $exec
    ; GFX7-DS128: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
    ; GFX7-DS128: $m0 = S_MOV_B32 -1
    ; GFX7-DS128: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 %2, 0, 0, implicit $m0, implicit $exec :: (load 1, addrspace 3)
    ; GFX7-DS128: $vgpr0 = COPY [[DS_READ_U8_]]
    ; GFX9-LABEL: name: load_local_s32_from_1_gep_65536
    ; GFX9: liveins: $vgpr0
    ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
@@ -800,14 +696,6 @@ body: |
    ; GFX7: $m0 = S_MOV_B32 -1
    ; GFX7: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 %2, 0, 0, implicit $m0, implicit $exec :: (load 1, addrspace 3)
    ; GFX7: $vgpr0 = COPY [[DS_READ_U8_]]
    ; GFX7-DS128-LABEL: name: load_local_s32_from_1_gep_m1
    ; GFX7-DS128: liveins: $vgpr0
    ; GFX7-DS128: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
    ; GFX7-DS128: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967295, implicit $exec
    ; GFX7-DS128: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
    ; GFX7-DS128: $m0 = S_MOV_B32 -1
    ; GFX7-DS128: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 %2, 0, 0, implicit $m0, implicit $exec :: (load 1, addrspace 3)
    ; GFX7-DS128: $vgpr0 = COPY [[DS_READ_U8_]]
    ; GFX9-LABEL: name: load_local_s32_from_1_gep_m1
    ; GFX9: liveins: $vgpr0
    ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
@@ -822,3 +710,83 @@ body: |
    $vgpr0 = COPY %3

...

---

name: load_local_s64_align4_from_1_gep_1016
legalized:       true
regBankSelected: true
tracksRegLiveness: true

body: |
  bb.0:
    liveins:  $vgpr0_vgpr1

    ; GFX6-LABEL: name: load_local_s64_align4_from_1_gep_1016
    ; GFX6: liveins: $vgpr0_vgpr1
    ; GFX6: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
    ; GFX6: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1016
    ; GFX6: [[PTR_ADD:%[0-9]+]]:vgpr(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
    ; GFX6: $m0 = S_MOV_B32 -1
    ; GFX6: [[LOAD:%[0-9]+]]:vreg_64(s64) = G_LOAD [[PTR_ADD]](p3) :: (load 8, align 4, addrspace 3)
    ; GFX6: $vgpr0_vgpr1 = COPY [[LOAD]](s64)
    ; GFX7-LABEL: name: load_local_s64_align4_from_1_gep_1016
    ; GFX7: liveins: $vgpr0_vgpr1
    ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
    ; GFX7: $m0 = S_MOV_B32 -1
    ; GFX7: [[DS_READ2_B32_:%[0-9]+]]:vreg_64 = DS_READ2_B32 [[COPY]], 254, 255, 0, implicit $m0, implicit $exec :: (load 8, align 4, addrspace 3)
    ; GFX7: $vgpr0_vgpr1 = COPY [[DS_READ2_B32_]]
    ; GFX9-LABEL: name: load_local_s64_align4_from_1_gep_1016
    ; GFX9: liveins: $vgpr0_vgpr1
    ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
    ; GFX9: [[DS_READ2_B32_gfx9_:%[0-9]+]]:vreg_64 = DS_READ2_B32_gfx9 [[COPY]], 254, 255, 0, implicit $exec :: (load 8, align 4, addrspace 3)
    ; GFX9: $vgpr0_vgpr1 = COPY [[DS_READ2_B32_gfx9_]]
    %0:vgpr(p3) = COPY $vgpr0
    %1:vgpr(s32) = G_CONSTANT i32 1016
    %2:vgpr(p3) = G_PTR_ADD %0, %1
    %3:vgpr(s64) = G_LOAD %2 :: (load 8, align 4, addrspace 3)
    $vgpr0_vgpr1 = COPY %3

...

---

name: load_local_s64_align4_from_1_gep_1020
legalized:       true
regBankSelected: true
tracksRegLiveness: true

body: |
  bb.0:
    liveins:  $vgpr0_vgpr1

    ; GFX6-LABEL: name: load_local_s64_align4_from_1_gep_1020
    ; GFX6: liveins: $vgpr0_vgpr1
    ; GFX6: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
    ; GFX6: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1020
    ; GFX6: [[PTR_ADD:%[0-9]+]]:vgpr(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
    ; GFX6: $m0 = S_MOV_B32 -1
    ; GFX6: [[LOAD:%[0-9]+]]:vreg_64(s64) = G_LOAD [[PTR_ADD]](p3) :: (load 8, align 4, addrspace 3)
    ; GFX6: $vgpr0_vgpr1 = COPY [[LOAD]](s64)
    ; GFX7-LABEL: name: load_local_s64_align4_from_1_gep_1020
    ; GFX7: liveins: $vgpr0_vgpr1
    ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
    ; GFX7: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1020, implicit $exec
    ; GFX7: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
    ; GFX7: $m0 = S_MOV_B32 -1
    ; GFX7: [[DS_READ2_B32_:%[0-9]+]]:vreg_64 = DS_READ2_B32 %2, 0, 1, 0, implicit $m0, implicit $exec :: (load 8, align 4, addrspace 3)
    ; GFX7: $vgpr0_vgpr1 = COPY [[DS_READ2_B32_]]
    ; GFX9-LABEL: name: load_local_s64_align4_from_1_gep_1020
    ; GFX9: liveins: $vgpr0_vgpr1
    ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
    ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1020, implicit $exec
    ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
    ; GFX9: [[DS_READ2_B32_gfx9_:%[0-9]+]]:vreg_64 = DS_READ2_B32_gfx9 [[V_ADD_U32_e64_]], 0, 1, 0, implicit $exec :: (load 8, align 4, addrspace 3)
    ; GFX9: $vgpr0_vgpr1 = COPY [[DS_READ2_B32_gfx9_]]
    %0:vgpr(p3) = COPY $vgpr0
    %1:vgpr(s32) = G_CONSTANT i32 1020
    %2:vgpr(p3) = G_PTR_ADD %0, %1
    %3:vgpr(s64) = G_LOAD %2 :: (load 8, align 4, addrspace 3)
    $vgpr0_vgpr1 = COPY %3

...
Loading