Commit 05783e1c authored by Michael Liao's avatar Michael Liao
Browse files

[amdgpu] Revise the conversion from i64 to f32.

- Replace 'cmp+sel' with 'umin' if possible.

Reviewed By: foad

Differential Revision: https://reviews.llvm.org/D107507
parent 8c4208d5
Loading
Loading
Loading
Loading
+29 −20
Original line number | Diff line number | Diff line content
@@ -2457,10 +2457,6 @@ SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
  SDLoc SL(Op);
  SDValue Src = Op.getOperand(0);

  EVT SetCCVT =
      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
  SDValue ZeroI32 = DAG.getConstant(0, SL, MVT::i32);

  SDValue Lo, Hi;
  std::tie(Lo, Hi) = split64BitValue(Src, DAG);
  SDValue Sign;
@@ -2468,25 +2464,38 @@ SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
  if (Signed && Subtarget->isGCN()) {
    // We also need to consider the sign bit in Lo if Hi has just sign bits,
    // i.e. Hi is 0 or -1. However, that only needs to take the MSB into
    // account.
    SDValue HasSameSign =
        DAG.getSetCC(SL, SetCCVT, DAG.getNode(ISD::XOR, SL, MVT::i32, Lo, Hi),
                     ZeroI32, ISD::SETGE);
    SDValue MaxShAmt = DAG.getSelect(SL, MVT::i32, HasSameSign,
                                     DAG.getConstant(33, SL, MVT::i32),
                                     DAG.getConstant(32, SL, MVT::i32));
    // account. That is, the maximal shift is
    // - 32 if Lo and Hi have opposite signs;
    // - 33 if Lo and Hi have the same sign.
    //
    // Or, MaxShAmt = 33 + OppositeSign, where
    //
    // OppositeSign is defined as ((Lo ^ Hi) >> 31), which is
    // - -1 if Lo and Hi have opposite signs; and
    // -  0 otherwise.
    //
    // All in all, ShAmt is calculated as
    //
    //  umin(sffbh(Hi), 33 + (Lo^Hi)>>31) - 1.
    //
    // or
    //
    //  umin(sffbh(Hi) - 1, 32 + (Lo^Hi)>>31).
    //
    // to reduce the critical path.
    SDValue OppositeSign = DAG.getNode(
        ISD::SRA, SL, MVT::i32, DAG.getNode(ISD::XOR, SL, MVT::i32, Lo, Hi),
        DAG.getConstant(31, SL, MVT::i32));
    SDValue MaxShAmt =
        DAG.getNode(ISD::ADD, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
                    OppositeSign);
    // Count the leading sign bits.
    ShAmt = DAG.getNode(AMDGPUISD::FFBH_I32, SL, MVT::i32, Hi);
    ShAmt = DAG.getSelect(SL, MVT::i32,
                          DAG.getSetCC(SL, SetCCVT, ShAmt,
                                       DAG.getAllOnesConstant(SL, MVT::i32),
                                       ISD::SETNE),
                          ShAmt, MaxShAmt);
    // The shift amount for signed integers is [1, 33].
    // Different from unsigned conversion, the shift should be one bit less to
    // preserve the sign bit.
    ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, ShAmt,
                        DAG.getConstant(1, SL, MVT::i32));
    ShAmt = DAG.getNode(ISD::UMIN, SL, MVT::i32, ShAmt, MaxShAmt);
  } else {
    if (Signed) {
      // Without 'ffbh_i32', only leading zeros could be counted. Take the
@@ -2507,9 +2516,9 @@ SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
  // Split it again.
  std::tie(Lo, Hi) = split64BitValue(Norm, DAG);
  // Calculate the adjust bit for rounding.
  SDValue Adjust = DAG.getSelect(
      SL, MVT::i32, DAG.getSetCC(SL, SetCCVT, Lo, ZeroI32, ISD::SETNE),
      DAG.getConstant(1, SL, MVT::i32), ZeroI32);
  // (lo != 0) ? 1 : 0 => (lo >= 1) ? 1 : 0 => umin(1, lo)
  SDValue Adjust = DAG.getNode(ISD::UMIN, SL, MVT::i32,
                               DAG.getConstant(1, SL, MVT::i32), Lo);
  // Get the 32-bit normalized integer.
  Norm = DAG.getNode(ISD::OR, SL, MVT::i32, Hi, Adjust);
  // Convert the normalized 32-bit integer into f32.
+6 −12
Original line number | Diff line number | Diff line content
@@ -2065,7 +2065,6 @@ bool AMDGPULegalizerInfo::legalizeITOFP(

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);
  const LLT S1 = LLT::scalar(1);

  assert(MRI.getType(Src) == S64);

@@ -2089,29 +2088,24 @@ bool AMDGPULegalizerInfo::legalizeITOFP(

  assert(MRI.getType(Dst) == S32);

  auto Zero = B.buildConstant(S32, 0);
  auto One = B.buildConstant(S32, 1);
  auto AllOnes = B.buildConstant(S32, -1);

  MachineInstrBuilder ShAmt;
  if (Signed) {
    auto ThirtyThree = B.buildConstant(S32, 33);
    auto ThirtyOne = B.buildConstant(S32, 31);
    auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
    auto HasSameSign = B.buildICmp(CmpInst::ICMP_SGE, S1, X, Zero);
    auto MaxShAmt = B.buildSelect(S32, HasSameSign, ThirtyThree, ThirtyTwo);
    auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
    auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
    auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32},
                               /*HasSideEffects=*/false)
                  .addUse(Unmerge.getReg(1));
    auto NotAllSameBits = B.buildICmp(CmpInst::ICMP_NE, S1, LS, AllOnes);
    auto LS2 = B.buildSelect(S32, NotAllSameBits, LS, MaxShAmt);
    ShAmt = B.buildSub(S32, LS2, One);
    auto LS2 = B.buildSub(S32, LS, One);
    ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
  } else
    ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
  auto Norm = B.buildShl(S64, Src, ShAmt);
  auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
  auto NotAllZeros =
      B.buildICmp(CmpInst::ICMP_NE, S1, Unmerge2.getReg(0), Zero);
  auto Adjust = B.buildSelect(S32, NotAllZeros, One, Zero);
  auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
  auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
  auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
  auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
+14 −16
Original line number | Diff line number | Diff line content
@@ -1082,15 +1082,15 @@ define float @v_test_sitofp_i64_byte_to_f32(i64 %arg0) {
; SI-LABEL: v_test_sitofp_i64_byte_to_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_ffbh_i32_e32 v2, 0
; SI-NEXT:    v_cmp_ne_u32_e32 vcc, -1, v2
; SI-NEXT:    v_cndmask_b32_e32 v2, 33, v2, vcc
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_ashrrev_i32_e32 v2, 31, v0
; SI-NEXT:    v_ffbh_i32_e32 v3, 0
; SI-NEXT:    v_add_i32_e32 v2, vcc, 32, v2
; SI-NEXT:    v_subrev_i32_e32 v3, vcc, 1, v3
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    v_subrev_i32_e32 v2, vcc, 1, v2
; SI-NEXT:    v_min_u32_e32 v2, v3, v2
; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], v2
; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
; SI-NEXT:    v_min_u32_e32 v0, 1, v0
; SI-NEXT:    v_or_b32_e32 v0, v1, v0
; SI-NEXT:    v_cvt_f32_i32_e32 v0, v0
; SI-NEXT:    v_sub_i32_e32 v1, vcc, 32, v2
@@ -1100,15 +1100,15 @@ define float @v_test_sitofp_i64_byte_to_f32(i64 %arg0) {
; VI-LABEL: v_test_sitofp_i64_byte_to_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_ffbh_i32_e32 v2, 0
; VI-NEXT:    v_cmp_ne_u32_e32 vcc, -1, v2
; VI-NEXT:    v_cndmask_b32_e32 v2, 33, v2, vcc
; VI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; VI-NEXT:    v_ashrrev_i32_e32 v2, 31, v0
; VI-NEXT:    v_ffbh_i32_e32 v3, 0
; VI-NEXT:    v_add_u32_e32 v2, vcc, 32, v2
; VI-NEXT:    v_subrev_u32_e32 v3, vcc, 1, v3
; VI-NEXT:    v_mov_b32_e32 v1, 0
; VI-NEXT:    v_subrev_u32_e32 v2, vcc, 1, v2
; VI-NEXT:    v_min_u32_e32 v2, v3, v2
; VI-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
; VI-NEXT:    v_min_u32_e32 v0, 1, v0
; VI-NEXT:    v_or_b32_e32 v0, v1, v0
; VI-NEXT:    v_cvt_f32_i32_e32 v0, v0
; VI-NEXT:    v_sub_u32_e32 v1, vcc, 32, v2
@@ -1128,8 +1128,7 @@ define float @v_test_uitofp_i64_byte_to_f32(i64 %arg0) {
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    v_min_u32_e32 v2, 32, v2
; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], v2
; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
; SI-NEXT:    v_min_u32_e32 v0, 1, v0
; SI-NEXT:    v_or_b32_e32 v0, v1, v0
; SI-NEXT:    v_cvt_f32_u32_e32 v0, v0
; SI-NEXT:    v_sub_i32_e32 v1, vcc, 32, v2
@@ -1144,8 +1143,7 @@ define float @v_test_uitofp_i64_byte_to_f32(i64 %arg0) {
; VI-NEXT:    v_mov_b32_e32 v1, 0
; VI-NEXT:    v_min_u32_e32 v2, 32, v2
; VI-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
; VI-NEXT:    v_min_u32_e32 v0, 1, v0
; VI-NEXT:    v_or_b32_e32 v0, v1, v0
; VI-NEXT:    v_cvt_f32_u32_e32 v0, v0
; VI-NEXT:    v_sub_u32_e32 v1, vcc, 32, v2
+100 −136

File changed.

Preview size limit exceeded, changes collapsed.

+38 −56
Original line number | Diff line number | Diff line content
@@ -75,15 +75,13 @@ body: |
    ; GFX6: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
    ; GFX6: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
    ; GFX6: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
    ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
    ; GFX6: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
    ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
    ; GFX6: [[AMDGPU_FFBH_U32_:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[UV1]](s32)
    ; GFX6: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[AMDGPU_FFBH_U32_]], [[C]]
    ; GFX6: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[COPY]], [[UMIN]](s32)
    ; GFX6: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[SHL]](s64)
    ; GFX6: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV2]](s32), [[C1]]
    ; GFX6: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[C2]], [[C1]]
    ; GFX6: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV3]], [[SELECT]]
    ; GFX6: [[UMIN1:%[0-9]+]]:_(s32) = G_UMIN [[C1]], [[UV2]]
    ; GFX6: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV3]], [[UMIN1]]
    ; GFX6: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[OR]](s32)
    ; GFX6: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UMIN]]
    ; GFX6: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.ldexp), [[UITOFP]](s32), [[SUB]](s32)
@@ -92,15 +90,13 @@ body: |
    ; GFX8: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
    ; GFX8: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
    ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
    ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
    ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
    ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
    ; GFX8: [[AMDGPU_FFBH_U32_:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[UV1]](s32)
    ; GFX8: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[AMDGPU_FFBH_U32_]], [[C]]
    ; GFX8: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[COPY]], [[UMIN]](s32)
    ; GFX8: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[SHL]](s64)
    ; GFX8: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV2]](s32), [[C1]]
    ; GFX8: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[C2]], [[C1]]
    ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV3]], [[SELECT]]
    ; GFX8: [[UMIN1:%[0-9]+]]:_(s32) = G_UMIN [[C1]], [[UV2]]
    ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV3]], [[UMIN1]]
    ; GFX8: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[OR]](s32)
    ; GFX8: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UMIN]]
    ; GFX8: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.ldexp), [[UITOFP]](s32), [[SUB]](s32)
@@ -393,15 +389,13 @@ body: |
    ; GFX6: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY1]], [[C]]
    ; GFX6: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AND]](s64)
    ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
    ; GFX6: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
    ; GFX6: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
    ; GFX6: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
    ; GFX6: [[AMDGPU_FFBH_U32_:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[UV1]](s32)
    ; GFX6: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[AMDGPU_FFBH_U32_]], [[C1]]
    ; GFX6: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[AND]], [[UMIN]](s32)
    ; GFX6: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[SHL]](s64)
    ; GFX6: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV2]](s32), [[C2]]
    ; GFX6: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[C3]], [[C2]]
    ; GFX6: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV3]], [[SELECT]]
    ; GFX6: [[UMIN1:%[0-9]+]]:_(s32) = G_UMIN [[C2]], [[UV2]]
    ; GFX6: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV3]], [[UMIN1]]
    ; GFX6: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[OR]](s32)
    ; GFX6: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C1]], [[UMIN]]
    ; GFX6: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.ldexp), [[UITOFP]](s32), [[SUB]](s32)
@@ -413,15 +407,13 @@ body: |
    ; GFX8: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY1]], [[C]]
    ; GFX8: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AND]](s64)
    ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
    ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
    ; GFX8: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
    ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
    ; GFX8: [[AMDGPU_FFBH_U32_:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[UV1]](s32)
    ; GFX8: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[AMDGPU_FFBH_U32_]], [[C1]]
    ; GFX8: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[AND]], [[UMIN]](s32)
    ; GFX8: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[SHL]](s64)
    ; GFX8: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV2]](s32), [[C2]]
    ; GFX8: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[C3]], [[C2]]
    ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV3]], [[SELECT]]
    ; GFX8: [[UMIN1:%[0-9]+]]:_(s32) = G_UMIN [[C2]], [[UV2]]
    ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV3]], [[UMIN1]]
    ; GFX8: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[OR]](s32)
    ; GFX8: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C1]], [[UMIN]]
    ; GFX8: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.ldexp), [[UITOFP]](s32), [[SUB]](s32)
@@ -442,15 +434,13 @@ body: |
    ; GFX6: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
    ; GFX6: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
    ; GFX6: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
    ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
    ; GFX6: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
    ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
    ; GFX6: [[AMDGPU_FFBH_U32_:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[UV1]](s32)
    ; GFX6: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[AMDGPU_FFBH_U32_]], [[C]]
    ; GFX6: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[COPY]], [[UMIN]](s32)
    ; GFX6: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[SHL]](s64)
    ; GFX6: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV2]](s32), [[C1]]
    ; GFX6: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[C2]], [[C1]]
    ; GFX6: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV3]], [[SELECT]]
    ; GFX6: [[UMIN1:%[0-9]+]]:_(s32) = G_UMIN [[C1]], [[UV2]]
    ; GFX6: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV3]], [[UMIN1]]
    ; GFX6: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[OR]](s32)
    ; GFX6: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UMIN]]
    ; GFX6: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.ldexp), [[UITOFP]](s32), [[SUB]](s32)
@@ -461,15 +451,13 @@ body: |
    ; GFX8: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
    ; GFX8: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
    ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
    ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
    ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
    ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
    ; GFX8: [[AMDGPU_FFBH_U32_:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[UV1]](s32)
    ; GFX8: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[AMDGPU_FFBH_U32_]], [[C]]
    ; GFX8: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[COPY]], [[UMIN]](s32)
    ; GFX8: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[SHL]](s64)
    ; GFX8: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV2]](s32), [[C1]]
    ; GFX8: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[C2]], [[C1]]
    ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV3]], [[SELECT]]
    ; GFX8: [[UMIN1:%[0-9]+]]:_(s32) = G_UMIN [[C1]], [[UV2]]
    ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV3]], [[UMIN1]]
    ; GFX8: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[OR]](s32)
    ; GFX8: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UMIN]]
    ; GFX8: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.ldexp), [[UITOFP]](s32), [[SUB]](s32)
@@ -493,35 +481,32 @@ body: |
    ; GFX6: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY]](<2 x s64>)
    ; GFX6: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV]](s64)
    ; GFX6: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
    ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
    ; GFX6: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
    ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
    ; GFX6: [[AMDGPU_FFBH_U32_:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[UV3]](s32)
    ; GFX6: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[AMDGPU_FFBH_U32_]], [[C]]
    ; GFX6: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[UV]], [[UMIN]](s32)
    ; GFX6: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[SHL]](s64)
    ; GFX6: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV4]](s32), [[C1]]
    ; GFX6: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[C2]], [[C1]]
    ; GFX6: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV5]], [[SELECT]]
    ; GFX6: [[UMIN1:%[0-9]+]]:_(s32) = G_UMIN [[C1]], [[UV4]]
    ; GFX6: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV5]], [[UMIN1]]
    ; GFX6: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[OR]](s32)
    ; GFX6: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UMIN]]
    ; GFX6: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.ldexp), [[UITOFP]](s32), [[SUB]](s32)
    ; GFX6: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[INT]](s32)
    ; GFX6: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64)
    ; GFX6: [[AMDGPU_FFBH_U32_1:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[UV7]](s32)
    ; GFX6: [[UMIN1:%[0-9]+]]:_(s32) = G_UMIN [[AMDGPU_FFBH_U32_1]], [[C]]
    ; GFX6: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[UV1]], [[UMIN1]](s32)
    ; GFX6: [[UMIN2:%[0-9]+]]:_(s32) = G_UMIN [[AMDGPU_FFBH_U32_1]], [[C]]
    ; GFX6: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[UV1]], [[UMIN2]](s32)
    ; GFX6: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[SHL1]](s64)
    ; GFX6: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV8]](s32), [[C1]]
    ; GFX6: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s1), [[C2]], [[C1]]
    ; GFX6: [[OR1:%[0-9]+]]:_(s32) = G_OR [[UV9]], [[SELECT1]]
    ; GFX6: [[UMIN3:%[0-9]+]]:_(s32) = G_UMIN [[C1]], [[UV8]]
    ; GFX6: [[OR1:%[0-9]+]]:_(s32) = G_OR [[UV9]], [[UMIN3]]
    ; GFX6: [[UITOFP1:%[0-9]+]]:_(s32) = G_UITOFP [[OR1]](s32)
    ; GFX6: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UMIN1]]
    ; GFX6: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UMIN2]]
    ; GFX6: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.ldexp), [[UITOFP1]](s32), [[SUB1]](s32)
    ; GFX6: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[INT1]](s32)
    ; GFX6: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC]](s16)
    ; GFX6: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC1]](s16)
    ; GFX6: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
    ; GFX6: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C3]](s32)
    ; GFX6: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
    ; GFX6: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C2]](s32)
    ; GFX6: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]]
    ; GFX6: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32)
    ; GFX6: $vgpr0 = COPY [[BITCAST]](<2 x s16>)
@@ -530,35 +515,32 @@ body: |
    ; GFX8: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY]](<2 x s64>)
    ; GFX8: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV]](s64)
    ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
    ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
    ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
    ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
    ; GFX8: [[AMDGPU_FFBH_U32_:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[UV3]](s32)
    ; GFX8: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[AMDGPU_FFBH_U32_]], [[C]]
    ; GFX8: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[UV]], [[UMIN]](s32)
    ; GFX8: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[SHL]](s64)
    ; GFX8: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV4]](s32), [[C1]]
    ; GFX8: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[C2]], [[C1]]
    ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV5]], [[SELECT]]
    ; GFX8: [[UMIN1:%[0-9]+]]:_(s32) = G_UMIN [[C1]], [[UV4]]
    ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV5]], [[UMIN1]]
    ; GFX8: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[OR]](s32)
    ; GFX8: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UMIN]]
    ; GFX8: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.ldexp), [[UITOFP]](s32), [[SUB]](s32)
    ; GFX8: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[INT]](s32)
    ; GFX8: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64)
    ; GFX8: [[AMDGPU_FFBH_U32_1:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[UV7]](s32)
    ; GFX8: [[UMIN1:%[0-9]+]]:_(s32) = G_UMIN [[AMDGPU_FFBH_U32_1]], [[C]]
    ; GFX8: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[UV1]], [[UMIN1]](s32)
    ; GFX8: [[UMIN2:%[0-9]+]]:_(s32) = G_UMIN [[AMDGPU_FFBH_U32_1]], [[C]]
    ; GFX8: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[UV1]], [[UMIN2]](s32)
    ; GFX8: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[SHL1]](s64)
    ; GFX8: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV8]](s32), [[C1]]
    ; GFX8: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s1), [[C2]], [[C1]]
    ; GFX8: [[OR1:%[0-9]+]]:_(s32) = G_OR [[UV9]], [[SELECT1]]
    ; GFX8: [[UMIN3:%[0-9]+]]:_(s32) = G_UMIN [[C1]], [[UV8]]
    ; GFX8: [[OR1:%[0-9]+]]:_(s32) = G_OR [[UV9]], [[UMIN3]]
    ; GFX8: [[UITOFP1:%[0-9]+]]:_(s32) = G_UITOFP [[OR1]](s32)
    ; GFX8: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UMIN1]]
    ; GFX8: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UMIN2]]
    ; GFX8: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.ldexp), [[UITOFP1]](s32), [[SUB1]](s32)
    ; GFX8: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[INT1]](s32)
    ; GFX8: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC]](s16)
    ; GFX8: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC1]](s16)
    ; GFX8: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
    ; GFX8: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C3]](s32)
    ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
    ; GFX8: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C2]](s32)
    ; GFX8: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]]
    ; GFX8: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32)
    ; GFX8: $vgpr0 = COPY [[BITCAST]](<2 x s16>)
Loading