Commit 872e899b authored by Matt Arsenault's avatar Matt Arsenault Committed by Matt Arsenault
Browse files

AMDGPU/GlobalISel: Legalize unpacked d16 image operations

On targets that don't have the normal packed f16 layout, handle these
during legalization. Directly modify the register types. We can infer
this was a d16 load based on the mem operand size during selection.

A16 operands should possibly be handled here as well, but don't worry
about that yet.
parent d21182d6
Loading
Loading
Loading
Loading
+69 −3
Original line number Diff line number Diff line
@@ -2771,10 +2771,72 @@ bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
  return true;
}

bool AMDGPULegalizerInfo::legalizeIntrinsic(
bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
    MachineInstr &MI, MachineIRBuilder &B,
    GISelChangeObserver &Observer,
    const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
  // We are only processing the operands of d16 image operations on subtargets
  // that use the unpacked register layout.
  if (!ST.hasUnpackedD16VMem())
    return true;

  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
    AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);

  if (BaseOpcode->Atomic) // No d16 atomics
    return true;

  MachineRegisterInfo *MRI = B.getMRI();
  const LLT S32 = LLT::scalar(32);
  const LLT S16 = LLT::scalar(16);

  if (BaseOpcode->Store) {
    Register VData = MI.getOperand(1).getReg();
    LLT Ty = MRI->getType(VData);
    if (!Ty.isVector() || Ty.getElementType() != S16)
      return true;

    B.setInstr(MI);

    Observer.changingInstr(MI);
    MI.getOperand(1).setReg(handleD16VData(B, *MRI, VData));
    Observer.changedInstr(MI);
    return true;
  }

  // Must be an image load.
  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI->getType(DstReg);
  if (!Ty.isVector() || Ty.getElementType() != S16)
    return true;

  B.setInsertPt(*MI.getParent(), ++MI.getIterator());

  LLT WidenedTy = Ty.changeElementType(S32);
  Register WideDstReg = MRI->createGenericVirtualRegister(WidenedTy);

  Observer.changingInstr(MI);
  MI.getOperand(0).setReg(WideDstReg);
  Observer.changedInstr(MI);

  // FIXME: Just vector trunc should be sufficent, but legalization currently
  // broken.
  auto Unmerge = B.buildUnmerge(S32, WideDstReg);

  int NumOps = Unmerge->getNumOperands() - 1;
  SmallVector<Register, 4> RemergeParts(NumOps);
  for (int I = 0; I != NumOps; ++I)
    RemergeParts[I] = B.buildTrunc(S16, Unmerge.getReg(I)).getReg(0);

  B.buildBuildVector(DstReg, RemergeParts);
  return true;
}

bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
                                            MachineIRBuilder &B,
                                            GISelChangeObserver &Observer) const {
  MachineRegisterInfo &MRI = *B.getMRI();

  // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
  auto IntrID = MI.getIntrinsicID();
  switch (IntrID) {
@@ -2935,9 +2997,13 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  default:
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}
+5 −0
Original line number Diff line number Diff line
@@ -126,6 +126,11 @@ public:
  bool legalizeBufferAtomic(MachineInstr &MI, MachineIRBuilder &B,
                            Intrinsic::ID IID) const;

  bool legalizeImageIntrinsic(
      MachineInstr &MI, MachineIRBuilder &B,
      GISelChangeObserver &Observer,
      const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const;

  bool legalizeAtomicIncDec(MachineInstr &MI,  MachineIRBuilder &B,
                            bool IsInc) const;

+201 −0
Original line number Diff line number Diff line
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -stop-after=legalizer -o - %s | FileCheck -check-prefix=UNPACKED %s
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -stop-after=legalizer -o - %s | FileCheck -check-prefix=PACKED %s

define amdgpu_ps half @image_load_f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
  ; UNPACKED-LABEL: name: image_load_f16
  ; UNPACKED: bb.1 (%ir-block.0):
  ; UNPACKED:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1
  ; UNPACKED:   [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
  ; UNPACKED:   [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
  ; UNPACKED:   [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
  ; UNPACKED:   [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
  ; UNPACKED:   [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
  ; UNPACKED:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
  ; UNPACKED:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
  ; UNPACKED:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
  ; UNPACKED:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
  ; UNPACKED:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
  ; UNPACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
  ; UNPACKED:   [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.2d), 1, [[COPY8]](s32), [[COPY9]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 2 from custom "TargetCustom8")
  ; UNPACKED:   [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16)
  ; UNPACKED:   $vgpr0 = COPY [[ANYEXT]](s32)
  ; UNPACKED:   SI_RETURN_TO_EPILOG implicit $vgpr0
  ; PACKED-LABEL: name: image_load_f16
  ; PACKED: bb.1 (%ir-block.0):
  ; PACKED:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1
  ; PACKED:   [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
  ; PACKED:   [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
  ; PACKED:   [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
  ; PACKED:   [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
  ; PACKED:   [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
  ; PACKED:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
  ; PACKED:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
  ; PACKED:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
  ; PACKED:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
  ; PACKED:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
  ; PACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
  ; PACKED:   [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.2d), 1, [[COPY8]](s32), [[COPY9]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 2 from custom "TargetCustom8")
  ; PACKED:   [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16)
  ; PACKED:   $vgpr0 = COPY [[ANYEXT]](s32)
  ; PACKED:   SI_RETURN_TO_EPILOG implicit $vgpr0
  %tex = call half @llvm.amdgcn.image.load.2d.f16.i32(i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
  ret half %tex
}

define amdgpu_ps <2 x half> @image_load_v2f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
  ; UNPACKED-LABEL: name: image_load_v2f16
  ; UNPACKED: bb.1 (%ir-block.0):
  ; UNPACKED:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1
  ; UNPACKED:   [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
  ; UNPACKED:   [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
  ; UNPACKED:   [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
  ; UNPACKED:   [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
  ; UNPACKED:   [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
  ; UNPACKED:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
  ; UNPACKED:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
  ; UNPACKED:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
  ; UNPACKED:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
  ; UNPACKED:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
  ; UNPACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
  ; UNPACKED:   [[INT:%[0-9]+]]:_(<2 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.2d), 3, [[COPY8]](s32), [[COPY9]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 4 from custom "TargetCustom8")
  ; UNPACKED:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[INT]](<2 x s32>)
  ; UNPACKED:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[UV]](s32)
  ; UNPACKED:   [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[UV1]](s32)
  ; UNPACKED:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
  ; UNPACKED:   $vgpr0 = COPY [[BUILD_VECTOR1]](<2 x s16>)
  ; UNPACKED:   SI_RETURN_TO_EPILOG implicit $vgpr0
  ; PACKED-LABEL: name: image_load_v2f16
  ; PACKED: bb.1 (%ir-block.0):
  ; PACKED:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1
  ; PACKED:   [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
  ; PACKED:   [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
  ; PACKED:   [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
  ; PACKED:   [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
  ; PACKED:   [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
  ; PACKED:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
  ; PACKED:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
  ; PACKED:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
  ; PACKED:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
  ; PACKED:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
  ; PACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
  ; PACKED:   [[INT:%[0-9]+]]:_(<2 x s16>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.2d), 3, [[COPY8]](s32), [[COPY9]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 4 from custom "TargetCustom8")
  ; PACKED:   $vgpr0 = COPY [[INT]](<2 x s16>)
  ; PACKED:   SI_RETURN_TO_EPILOG implicit $vgpr0
  %tex = call <2 x half> @llvm.amdgcn.image.load.2d.v2f16.i32(i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
  ret <2 x half> %tex
}

define amdgpu_ps <3 x half> @image_load_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
  ; UNPACKED-LABEL: name: image_load_v3f16
  ; UNPACKED: bb.1 (%ir-block.0):
  ; UNPACKED:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1
  ; UNPACKED:   [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
  ; UNPACKED:   [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
  ; UNPACKED:   [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
  ; UNPACKED:   [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
  ; UNPACKED:   [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
  ; UNPACKED:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
  ; UNPACKED:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
  ; UNPACKED:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
  ; UNPACKED:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
  ; UNPACKED:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
  ; UNPACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
  ; UNPACKED:   [[INT:%[0-9]+]]:_(<3 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.2d), 7, [[COPY8]](s32), [[COPY9]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 6 from custom "TargetCustom8", align 8)
  ; UNPACKED:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[INT]](<3 x s32>)
  ; UNPACKED:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[UV]](s32)
  ; UNPACKED:   [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[UV1]](s32)
  ; UNPACKED:   [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[UV2]](s32)
  ; UNPACKED:   [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
  ; UNPACKED:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
  ; UNPACKED:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
  ; UNPACKED:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
  ; UNPACKED:   [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<4 x s16>), 0
  ; UNPACKED:   [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
  ; UNPACKED:   [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF1]], [[EXTRACT]](<3 x s16>), 0
  ; UNPACKED:   [[EXTRACT1:%[0-9]+]]:_(<2 x s16>) = G_EXTRACT [[INSERT]](<4 x s16>), 0
  ; UNPACKED:   [[EXTRACT2:%[0-9]+]]:_(<2 x s16>) = G_EXTRACT [[INSERT]](<4 x s16>), 32
  ; UNPACKED:   $vgpr0 = COPY [[EXTRACT1]](<2 x s16>)
  ; UNPACKED:   $vgpr1 = COPY [[EXTRACT2]](<2 x s16>)
  ; UNPACKED:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
  ; PACKED-LABEL: name: image_load_v3f16
  ; PACKED: bb.1 (%ir-block.0):
  ; PACKED:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1
  ; PACKED:   [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
  ; PACKED:   [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
  ; PACKED:   [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
  ; PACKED:   [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
  ; PACKED:   [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
  ; PACKED:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
  ; PACKED:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
  ; PACKED:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
  ; PACKED:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
  ; PACKED:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
  ; PACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
  ; PACKED:   [[INT:%[0-9]+]]:_(<3 x s16>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.2d), 7, [[COPY8]](s32), [[COPY9]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 6 from custom "TargetCustom8", align 8)
  ; PACKED:   [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
  ; PACKED:   [[INSERT:%[0-9]+]]:_(<4 x s16>) = G_INSERT [[DEF]], [[INT]](<3 x s16>), 0
  ; PACKED:   [[EXTRACT:%[0-9]+]]:_(<2 x s16>) = G_EXTRACT [[INSERT]](<4 x s16>), 0
  ; PACKED:   [[EXTRACT1:%[0-9]+]]:_(<2 x s16>) = G_EXTRACT [[INSERT]](<4 x s16>), 32
  ; PACKED:   $vgpr0 = COPY [[EXTRACT]](<2 x s16>)
  ; PACKED:   $vgpr1 = COPY [[EXTRACT1]](<2 x s16>)
  ; PACKED:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
  %tex = call <3 x half> @llvm.amdgcn.image.load.2d.v3f16.i32(i32 7, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
  ret <3 x half> %tex
}

define amdgpu_ps <4 x half> @image_load_v4f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
  ; UNPACKED-LABEL: name: image_load_v4f16
  ; UNPACKED: bb.1 (%ir-block.0):
  ; UNPACKED:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1
  ; UNPACKED:   [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
  ; UNPACKED:   [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
  ; UNPACKED:   [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
  ; UNPACKED:   [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
  ; UNPACKED:   [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
  ; UNPACKED:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
  ; UNPACKED:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
  ; UNPACKED:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
  ; UNPACKED:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
  ; UNPACKED:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
  ; UNPACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
  ; UNPACKED:   [[INT:%[0-9]+]]:_(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.2d), 15, [[COPY8]](s32), [[COPY9]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 8 from custom "TargetCustom8")
  ; UNPACKED:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[INT]](<4 x s32>)
  ; UNPACKED:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[UV]](s32)
  ; UNPACKED:   [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[UV1]](s32)
  ; UNPACKED:   [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[UV2]](s32)
  ; UNPACKED:   [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[UV3]](s32)
  ; UNPACKED:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
  ; UNPACKED:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
  ; UNPACKED:   $vgpr0 = COPY [[BUILD_VECTOR1]](<2 x s16>)
  ; UNPACKED:   $vgpr1 = COPY [[BUILD_VECTOR2]](<2 x s16>)
  ; UNPACKED:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
  ; PACKED-LABEL: name: image_load_v4f16
  ; PACKED: bb.1 (%ir-block.0):
  ; PACKED:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1
  ; PACKED:   [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
  ; PACKED:   [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
  ; PACKED:   [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
  ; PACKED:   [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5
  ; PACKED:   [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6
  ; PACKED:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
  ; PACKED:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
  ; PACKED:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
  ; PACKED:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
  ; PACKED:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
  ; PACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
  ; PACKED:   [[INT:%[0-9]+]]:_(<4 x s16>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.2d), 15, [[COPY8]](s32), [[COPY9]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 8 from custom "TargetCustom8")
  ; PACKED:   [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[INT]](<4 x s16>)
  ; PACKED:   $vgpr0 = COPY [[UV]](<2 x s16>)
  ; PACKED:   $vgpr1 = COPY [[UV1]](<2 x s16>)
  ; PACKED:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
  %tex = call <4 x half> @llvm.amdgcn.image.load.2d.v4f16.i32(i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
  ret <4 x half> %tex
}

declare half @llvm.amdgcn.image.load.2d.f16.i32(i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0
declare <2 x half> @llvm.amdgcn.image.load.2d.v2f16.i32(i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0
declare <3 x half> @llvm.amdgcn.image.load.2d.v3f16.i32(i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0
declare <4 x half> @llvm.amdgcn.image.load.2d.v4f16.i32(i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0

attributes #0 = { nounwind readonly }
+205 −0

File added.

Preview size limit exceeded, changes collapsed.