Commit eae44c8a authored by Amara Emerson

[GlobalISel] Implement merging of stores of truncates.

This is a port of a combine which matches a pattern where a wide type scalar
value is stored by several narrow stores. It folds them into a single store or
a BSWAP and a store if the target supports it.

Assuming little endian target:
 i8 *p = ...
 i32 val = ...
 p[0] = (val >> 0) & 0xFF;
 p[1] = (val >> 8) & 0xFF;
 p[2] = (val >> 16) & 0xFF;
 p[3] = (val >> 24) & 0xFF;
=>
 *((i32)p) = val;
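
Storing the bytes in the opposite order instead folds to a byte swap, as in
the doc comment in the patch:
 i8 *p = ...
 i32 val = ...
 p[0] = (val >> 24) & 0xFF;
 p[1] = (val >> 16) & 0xFF;
 p[2] = (val >> 8) & 0xFF;
 p[3] = (val >> 0) & 0xFF;
=>
 *((i32)p) = BSWAP(val);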

On CTMark AArch64 -Os this results in a good amount of savings:

Program            before        after       diff
             SPASS 412792       412788       -0.0%
                kc 432528       432512       -0.0%
            lencod 430112       430096       -0.0%
  consumer-typeset 419156       419128       -0.0%
            bullet 475840       475752       -0.0%
        tramp3d-v4 367760       367628       -0.0%
          clamscan 383388       383204       -0.0%
    pairlocalalign 249764       249476       -0.1%
    7zip-benchmark 570100       568860       -0.2%
           sqlite3 287628       286920       -0.2%
Geomean difference                           -0.1%

Differential Revision: https://reviews.llvm.org/D109419
parent e741fabc
+11 −0
@@ -74,6 +74,14 @@ struct ShiftOfShiftedLogic {

using BuildFnTy = std::function<void(MachineIRBuilder &)>;

struct MergeTruncStoresInfo {
  SmallVector<GStore *> FoundStores;
  GStore *LowestIdxStore = nullptr;
  Register WideSrcVal;
  bool NeedBSwap = false;
  bool NeedRotate = false;
};

using OperandBuildSteps =
    SmallVector<std::function<void(MachineInstrBuilder &)>, 4>;
struct InstructionBuildSteps {
@@ -523,6 +531,9 @@ public:
  /// bswap.
  bool matchLoadOrCombine(MachineInstr &MI, BuildFnTy &MatchInfo);

  bool matchTruncStoreMerge(MachineInstr &MI, MergeTruncStoresInfo &MatchInfo);
  void applyTruncStoreMerge(MachineInstr &MI, MergeTruncStoresInfo &MatchInfo);

  bool matchExtendThroughPhis(MachineInstr &MI, MachineInstr *&ExtMI);
  void applyExtendThroughPhis(MachineInstr &MI, MachineInstr *&ExtMI);

+10 −2
@@ -594,6 +594,14 @@ def load_or_combine : GICombineRule<
    [{ return Helper.matchLoadOrCombine(*${root}, ${info}); }]),
  (apply [{ Helper.applyBuildFn(*${root}, ${info}); }])>;


def truncstore_merge_matchdata : GIDefMatchData<"MergeTruncStoresInfo">;
def truncstore_merge : GICombineRule<
  (defs root:$root, truncstore_merge_matchdata:$info),
  (match (wip_match_opcode G_STORE):$root,
   [{ return Helper.matchTruncStoreMerge(*${root}, ${info}); }]),
  (apply [{ Helper.applyTruncStoreMerge(*${root}, ${info}); }])>;

def extend_through_phis_matchdata: GIDefMatchData<"MachineInstr*">;
def extend_through_phis : GICombineRule<
  (defs root:$root, extend_through_phis_matchdata:$matchinfo),
@@ -733,8 +741,8 @@ def all_combines : GICombineGroup<[trivial_combines, insert_vec_elt_combines,
    unmerge_zext_to_zext, merge_unmerge, trunc_ext_fold, trunc_shl,
    const_combines, xor_of_and_with_same_reg, ptr_add_with_zero,
    shift_immed_chain, shift_of_shifted_logic_chain, load_or_combine,
-    div_rem_to_divrem, funnel_shift_combines, form_bitfield_extract,
-    constant_fold]>;
+    truncstore_merge, div_rem_to_divrem, funnel_shift_combines,
+    form_bitfield_extract, constant_fold]>;

// A combine group used for prelegalizer combiners at -O0. The combines in
// this group have been selected based on experiments to balance code size and
+267 −0
@@ -28,6 +28,8 @@
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/MathExtras.h"
#include <tuple>

@@ -3266,6 +3268,271 @@ bool CombinerHelper::matchLoadOrCombine(
  return true;
}

/// Check if the store \p Store is a truncstore that can be merged. That is,
/// it's a store of a shifted value of \p SrcVal. If \p SrcVal is an empty
/// Register then it does not need to match and SrcVal is set to the source
/// value found.
/// On match, returns the index of the narrow piece of \p SrcVal being
/// stored, in units of the narrow memory type (so an lshr by 16 stored as
/// s8 is piece 2).
static Optional<int64_t> getTruncStoreByteOffset(GStore &Store, Register &SrcVal,
                                                 MachineRegisterInfo &MRI) {
  Register TruncVal;
  if (!mi_match(Store.getValueReg(), MRI, m_GTrunc(m_Reg(TruncVal))))
    return None;

  // The shift amount must be a constant multiple of the narrow type.
  // It is translated to the offset address in the wide source value "y".
  //
  // x = G_LSHR y, ShiftAmtC
  // s8 z = G_TRUNC x
  // store z, ...
  Register FoundSrcVal;
  int64_t ShiftAmt;
  if (!mi_match(TruncVal, MRI,
                m_any_of(m_GLShr(m_Reg(FoundSrcVal), m_ICst(ShiftAmt)),
                         m_GAShr(m_Reg(FoundSrcVal), m_ICst(ShiftAmt))))) {
    if (!SrcVal.isValid() || TruncVal == SrcVal) {
      if (!SrcVal.isValid())
        SrcVal = TruncVal;
      return 0; // If it's the lowest index store.
    }
    return None;
  }

  unsigned NarrowBits = Store.getMMO().getMemoryType().getScalarSizeInBits();
  if (ShiftAmt % NarrowBits != 0)
    return None;
  const unsigned Offset = ShiftAmt / NarrowBits;

  if (SrcVal.isValid() && FoundSrcVal != SrcVal)
    return None;

  if (!SrcVal.isValid())
    SrcVal = FoundSrcVal;
  else if (MRI.getType(SrcVal) != MRI.getType(FoundSrcVal))
    return None;
  return Offset;
}
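
// Editorial sketch, not part of the patch: the returned value is just the
// shift amount divided by the narrow store width. A self-contained C++17
// illustration of that arithmetic, with hypothetical names:
#include <cstdint>
#include <optional>

// Index of the narrow piece of the wide value covered by a store of
// (trunc (lshr Val, ShiftAmt)) with a NarrowBits-wide memory type.
std::optional<int64_t> pieceIndex(int64_t ShiftAmt, unsigned NarrowBits) {
  if (ShiftAmt % NarrowBits != 0)
    return std::nullopt;        // e.g. lshr by 4 into s8 stores: no merge.
  return ShiftAmt / NarrowBits; // e.g. lshr by 16 into s8 stores: piece 2.
}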

/// Match a pattern where a wide type scalar value is stored by several narrow
/// stores. Fold it into a single store or a BSWAP and a store if the target
/// supports it.
///
/// Assuming little endian target:
///  i8 *p = ...
///  i32 val = ...
///  p[0] = (val >> 0) & 0xFF;
///  p[1] = (val >> 8) & 0xFF;
///  p[2] = (val >> 16) & 0xFF;
///  p[3] = (val >> 24) & 0xFF;
/// =>
///  *((i32)p) = val;
///
///  i8 *p = ...
///  i32 val = ...
///  p[0] = (val >> 24) & 0xFF;
///  p[1] = (val >> 16) & 0xFF;
///  p[2] = (val >> 8) & 0xFF;
///  p[3] = (val >> 0) & 0xFF;
/// =>
///  *((i32)p) = BSWAP(val);
bool CombinerHelper::matchTruncStoreMerge(MachineInstr &MI,
                                          MergeTruncStoresInfo &MatchInfo) {
  auto &StoreMI = cast<GStore>(MI);
  LLT MemTy = StoreMI.getMMO().getMemoryType();

  // We only handle merging simple stores of 1, 2 or 4 bytes.
  if (!MemTy.isScalar())
    return false;
  switch (MemTy.getSizeInBits()) {
  case 8:
  case 16:
  case 32:
    break;
  default:
    return false;
  }
  if (!StoreMI.isSimple())
    return false;

  // We do a simple search for mergeable stores prior to this one.
  // Any potential alias hazard along the way terminates the search.
  SmallVector<GStore *> FoundStores;

  // We're looking for:
  // 1) a (store(trunc(...)))
  // 2) of an LSHR/ASHR of a single wide value, by the appropriate shift to get
  //    the partial value stored.
  // 3) where the offsets form either a little or big-endian sequence.

  auto &LastStore = StoreMI;

  // The single base pointer that all stores must use.
  Register BaseReg;
  int64_t LastOffset;
  if (!mi_match(LastStore.getPointerReg(), MRI,
                m_GPtrAdd(m_Reg(BaseReg), m_ICst(LastOffset)))) {
    BaseReg = LastStore.getPointerReg();
    LastOffset = 0;
  }

  GStore *LowestIdxStore = &LastStore;
  int64_t LowestIdxOffset = LastOffset;

  Register WideSrcVal;
  auto LowestShiftAmt = getTruncStoreByteOffset(LastStore, WideSrcVal, MRI);
  if (!LowestShiftAmt)
    return false; // Didn't match a trunc.
  assert(WideSrcVal.isValid());

  LLT WideStoreTy = MRI.getType(WideSrcVal);
  const unsigned NumStoresRequired =
      WideStoreTy.getSizeInBits() / MemTy.getSizeInBits();

  SmallVector<int64_t, 8> OffsetMap(NumStoresRequired, INT64_MAX);
  OffsetMap[*LowestShiftAmt] = LastOffset;
  FoundStores.emplace_back(&LastStore);

  // Search upwards in the block for more stores.
  // We use a search threshold of 10 instructions here because the combiner
  // works top-down within a block, and we don't want to search an unbounded
  // number of predecessor instructions trying to find matching stores.
  // If we moved this optimization into a separate pass then we could probably
  // use a more efficient search without having a hard-coded threshold.
  const int MaxInstsToCheck = 10;
  int NumInstsChecked = 0;
  for (auto II = ++LastStore.getReverseIterator();
       II != LastStore.getParent()->rend() && NumInstsChecked < MaxInstsToCheck;
       ++II) {
    NumInstsChecked++;
    GStore *NewStore;
    if ((NewStore = dyn_cast<GStore>(&*II))) {
      if (NewStore->getMMO().getMemoryType() != MemTy || !NewStore->isSimple())
        break;
    } else if (II->isLoadFoldBarrier() || II->mayLoad()) {
      break;
    } else {
      continue; // This is a safe instruction we can look past.
    }

    Register NewBaseReg;
    int64_t MemOffset;
    // Check we're storing to the same base + some offset.
    if (!mi_match(NewStore->getPointerReg(), MRI,
                  m_GPtrAdd(m_Reg(NewBaseReg), m_ICst(MemOffset)))) {
      NewBaseReg = NewStore->getPointerReg();
      MemOffset = 0;
    }
    if (BaseReg != NewBaseReg)
      break;

    auto ShiftByteOffset = getTruncStoreByteOffset(*NewStore, WideSrcVal, MRI);
    if (!ShiftByteOffset)
      break;
    if (MemOffset < LowestIdxOffset) {
      LowestIdxOffset = MemOffset;
      LowestIdxStore = NewStore;
    }

    // Map the offset within the combined value to the store's memory offset,
    // and bail out if that slot has already been claimed by another store.
    if (*ShiftByteOffset < 0 || *ShiftByteOffset >= NumStoresRequired ||
        OffsetMap[*ShiftByteOffset] != INT64_MAX)
      break;
    OffsetMap[*ShiftByteOffset] = MemOffset;

    FoundStores.emplace_back(NewStore);
    // Reset counter since we've found a matching inst.
    NumInstsChecked = 0;
    if (FoundStores.size() == NumStoresRequired)
      break;
  }

  if (FoundStores.size() != NumStoresRequired)
    return false;

  const auto &DL = LastStore.getMF()->getDataLayout();
  auto &C = LastStore.getMF()->getFunction().getContext();
  // Check that a store of the wide type is both allowed and fast on the
  // target.
  bool Fast = false;
  bool Allowed = getTargetLowering().allowsMemoryAccess(
      C, DL, WideStoreTy, LowestIdxStore->getMMO(), &Fast);
  if (!Allowed || !Fast)
    return false;

  // Check if the pieces of the value are going to the expected places in memory
  // to merge the stores.
  unsigned NarrowBits = MemTy.getScalarSizeInBits();
  auto checkOffsets = [&](bool MatchLittleEndian) {
    if (MatchLittleEndian) {
      for (unsigned i = 0; i != NumStoresRequired; ++i)
        if (OffsetMap[i] != i * (NarrowBits / 8) + LowestIdxOffset)
          return false;
    } else { // MatchBigEndian by reversing loop counter.
      for (unsigned i = 0, j = NumStoresRequired - 1; i != NumStoresRequired;
           ++i, --j)
        if (OffsetMap[j] != i * (NarrowBits / 8) + LowestIdxOffset)
          return false;
    }
    return true;
  };

  // Check if the offsets line up for the native data layout of this target.
  bool NeedBswap = false;
  bool NeedRotate = false;
  if (!checkOffsets(DL.isLittleEndian())) {
    // Special-case: check if byte offsets line up for the opposite endian.
    if (NarrowBits == 8 && checkOffsets(DL.isBigEndian()))
      NeedBswap = true;
    else if (NumStoresRequired == 2 && checkOffsets(DL.isBigEndian()))
      NeedRotate = true;
    else
      return false;
  }

  if (NeedBswap &&
      !isLegalOrBeforeLegalizer({TargetOpcode::G_BSWAP, {WideStoreTy}}))
    return false;
  if (NeedRotate &&
      !isLegalOrBeforeLegalizer({TargetOpcode::G_ROTR, {WideStoreTy}}))
    return false;

  MatchInfo.NeedBSwap = NeedBswap;
  MatchInfo.NeedRotate = NeedRotate;
  MatchInfo.LowestIdxStore = LowestIdxStore;
  MatchInfo.WideSrcVal = WideSrcVal;
  MatchInfo.FoundStores = std::move(FoundStores);
  return true;
}
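
// Editorial note, not part of the patch: concretely, for four s8 pieces of
// an s32 with the lowest store at byte offset 0, checkOffsets accepts
// OffsetMap == {0, 1, 2, 3} for the native little-endian layout; the exact
// reverse, {3, 2, 1, 0}, matches the opposite-endian check and selects the
// G_BSWAP path, while a reversed two-piece pattern selects the cheaper
// G_ROTR instead.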

void CombinerHelper::applyTruncStoreMerge(MachineInstr &MI,
                                          MergeTruncStoresInfo &MatchInfo) {
  Builder.setInstrAndDebugLoc(MI);
  Register WideSrcVal = MatchInfo.WideSrcVal;
  LLT WideStoreTy = MRI.getType(WideSrcVal);

  if (MatchInfo.NeedBSwap) {
    WideSrcVal = Builder.buildBSwap(WideStoreTy, WideSrcVal).getReg(0);
  } else if (MatchInfo.NeedRotate) {
    assert(WideStoreTy.getSizeInBits() % 2 == 0 &&
           "Unexpected type for rotate");
    auto RotAmt =
        Builder.buildConstant(WideStoreTy, WideStoreTy.getSizeInBits() / 2);
    WideSrcVal =
        Builder.buildRotateRight(WideStoreTy, WideSrcVal, RotAmt).getReg(0);
  }

  Builder.buildStore(WideSrcVal, MatchInfo.LowestIdxStore->getPointerReg(),
                     MatchInfo.LowestIdxStore->getMMO().getPointerInfo(),
                     MatchInfo.LowestIdxStore->getMMO().getAlign());

  // Erase the old stores.
  for (auto *ST : MatchInfo.FoundStores)
    ST->eraseFromParent();
}
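
// Editorial sketch, not part of the patch: why a G_ROTR by half the width
// suffices in the two-piece case. Rotating right by half the bit width
// swaps the two halves of the value (hypothetical helper):
#include <cstdint>
static uint32_t swapHalves(uint32_t V) { return (V >> 16) | (V << 16); }
// swapHalves(0xAAAABBBB) == 0xBBBBAAAA, matching the "ror w8, w0, #16" +
// "str w8, [x1]" sequences in the be_i32_to_i16 tests below.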

bool CombinerHelper::matchExtendThroughPhis(MachineInstr &MI,
                                            MachineInstr *&ExtMI) {
  assert(MI.getOpcode() == TargetOpcode::G_PHI);
+348 −0
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=aarch64-apple-ios -global-isel -global-isel-abort=1 | FileCheck %s

define dso_local void @trunc_i16_to_i8(i16 %x, i8* %p) {
; CHECK-LABEL: trunc_i16_to_i8:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    strh w0, [x1]
; CHECK-NEXT:    ret
  %t1 = trunc i16 %x to i8
  %sh = lshr i16 %x, 8
  %t2 = trunc i16 %sh to i8
  store i8 %t1, i8* %p, align 1
  %p1 = getelementptr inbounds i8, i8* %p, i64 1
  store i8 %t2, i8* %p1, align 1
  ret void
}

define dso_local void @trunc_i32_to_i8(i32 %x, i8* %p) {
; CHECK-LABEL: trunc_i32_to_i8:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    str w0, [x1]
; CHECK-NEXT:    ret
  %t1 = trunc i32 %x to i8
  %sh1 = lshr i32 %x, 8
  %t2 = trunc i32 %sh1 to i8
  %sh2 = lshr i32 %x, 16
  %t3 = trunc i32 %sh2 to i8
  %sh3 = lshr i32 %x, 24
  %t4 = trunc i32 %sh3 to i8
  store i8 %t1, i8* %p, align 1
  %p1 = getelementptr inbounds i8, i8* %p, i64 1
  store i8 %t2, i8* %p1, align 1
  %p2 = getelementptr inbounds i8, i8* %p, i64 2
  store i8 %t3, i8* %p2, align 1
  %p3 = getelementptr inbounds i8, i8* %p, i64 3
  store i8 %t4, i8* %p3, align 1
  ret void
}

define dso_local void @trunc_i32_to_i16(i32 %x, i16* %p) {
; CHECK-LABEL: trunc_i32_to_i16:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    str w0, [x1]
; CHECK-NEXT:    ret
  %t1 = trunc i32 %x to i16
  %sh = lshr i32 %x, 16
  %t2 = trunc i32 %sh to i16
  store i16 %t1, i16* %p, align 2
  %p1 = getelementptr inbounds i16, i16* %p, i64 1
  store i16 %t2, i16* %p1, align 2
  ret void
}

define dso_local void @be_i32_to_i16(i32 %x, i16* %p0) {
; CHECK-LABEL: be_i32_to_i16:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    ror w8, w0, #16
; CHECK-NEXT:    str w8, [x1]
; CHECK-NEXT:    ret
  %sh1 = lshr i32 %x, 16
  %t0 = trunc i32 %x to i16
  %t1 = trunc i32 %sh1 to i16
  %p1 = getelementptr inbounds i16, i16* %p0, i64 1
  store i16 %t0, i16* %p1, align 2
  store i16 %t1, i16* %p0, align 2
  ret void
}

define dso_local void @be_i32_to_i16_order(i32 %x, i16* %p0) {
; CHECK-LABEL: be_i32_to_i16_order:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    ror w8, w0, #16
; CHECK-NEXT:    str w8, [x1]
; CHECK-NEXT:    ret
  %sh1 = lshr i32 %x, 16
  %t0 = trunc i32 %x to i16
  %t1 = trunc i32 %sh1 to i16
  %p1 = getelementptr inbounds i16, i16* %p0, i64 1
  store i16 %t1, i16* %p0, align 2
  store i16 %t0, i16* %p1, align 2
  ret void
}

define dso_local void @trunc_i64_to_i8(i64 %x, i8* %p) {
; CHECK-LABEL: trunc_i64_to_i8:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    str x0, [x1]
; CHECK-NEXT:    ret
  %t1 = trunc i64 %x to i8
  %sh1 = lshr i64 %x, 8
  %t2 = trunc i64 %sh1 to i8
  %sh2 = lshr i64 %x, 16
  %t3 = trunc i64 %sh2 to i8
  %sh3 = lshr i64 %x, 24
  %t4 = trunc i64 %sh3 to i8
  %sh4 = lshr i64 %x, 32
  %t5 = trunc i64 %sh4 to i8
  %sh5 = lshr i64 %x, 40
  %t6 = trunc i64 %sh5 to i8
  %sh6 = lshr i64 %x, 48
  %t7 = trunc i64 %sh6 to i8
  %sh7 = lshr i64 %x, 56
  %t8 = trunc i64 %sh7 to i8
  store i8 %t1, i8* %p, align 1
  %p1 = getelementptr inbounds i8, i8* %p, i64 1
  store i8 %t2, i8* %p1, align 1
  %p2 = getelementptr inbounds i8, i8* %p, i64 2
  store i8 %t3, i8* %p2, align 1
  %p3 = getelementptr inbounds i8, i8* %p, i64 3
  store i8 %t4, i8* %p3, align 1
  %p4 = getelementptr inbounds i8, i8* %p, i64 4
  store i8 %t5, i8* %p4, align 1
  %p5 = getelementptr inbounds i8, i8* %p, i64 5
  store i8 %t6, i8* %p5, align 1
  %p6 = getelementptr inbounds i8, i8* %p, i64 6
  store i8 %t7, i8* %p6, align 1
  %p7 = getelementptr inbounds i8, i8* %p, i64 7
  store i8 %t8, i8* %p7, align 1
  ret void
}

define dso_local void @trunc_i64_to_i16(i64 %x, i16* %p) {
; CHECK-LABEL: trunc_i64_to_i16:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    str x0, [x1]
; CHECK-NEXT:    ret
  %t1 = trunc i64 %x to i16
  %sh1 = lshr i64 %x, 16
  %t2 = trunc i64 %sh1 to i16
  %sh2 = lshr i64 %x, 32
  %t3 = trunc i64 %sh2 to i16
  %sh3 = lshr i64 %x, 48
  %t4 = trunc i64 %sh3 to i16
  store i16 %t1, i16* %p, align 2
  %p1 = getelementptr inbounds i16, i16* %p, i64 1
  store i16 %t2, i16* %p1, align 2
  %p2 = getelementptr inbounds i16, i16* %p, i64 2
  store i16 %t3, i16* %p2, align 2
  %p3 = getelementptr inbounds i16, i16* %p, i64 3
  store i16 %t4, i16* %p3, align 2
  ret void
}

define dso_local void @trunc_i64_to_i32(i64 %x, i32* %p) {
; CHECK-LABEL: trunc_i64_to_i32:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    str x0, [x1]
; CHECK-NEXT:    ret
  %t1 = trunc i64 %x to i32
  %sh = lshr i64 %x, 32
  %t2 = trunc i64 %sh to i32
  store i32 %t1, i32* %p, align 4
  %p1 = getelementptr inbounds i32, i32* %p, i64 1
  store i32 %t2, i32* %p1, align 4
  ret void
}

define dso_local void @be_i64_to_i32(i64 %x, i32* %p0) {
; CHECK-LABEL: be_i64_to_i32:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    ror x8, x0, #32
; CHECK-NEXT:    str x8, [x1]
; CHECK-NEXT:    ret
  %sh1 = lshr i64 %x, 32
  %t0 = trunc i64 %x to i32
  %t1 = trunc i64 %sh1 to i32
  %p1 = getelementptr inbounds i32, i32* %p0, i64 1
  store i32 %t0, i32* %p1, align 4
  store i32 %t1, i32* %p0, align 4
  ret void
}

define dso_local void @be_i64_to_i32_order(i64 %x, i32* %p0) {
; CHECK-LABEL: be_i64_to_i32_order:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    ror x8, x0, #32
; CHECK-NEXT:    str x8, [x1]
; CHECK-NEXT:    ret
  %sh1 = lshr i64 %x, 32
  %t0 = trunc i64 %x to i32
  %t1 = trunc i64 %sh1 to i32
  %p1 = getelementptr inbounds i32, i32* %p0, i64 1
  store i32 %t1, i32* %p0, align 4
  store i32 %t0, i32* %p1, align 4
  ret void
}

; Negative tests.

define void @merge_hole(i32 %x, i8* %p) {
; CHECK-LABEL: merge_hole:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    lsr w8, w0, #16
; CHECK-NEXT:    strb w0, [x1]
; CHECK-NEXT:    strh w8, [x1, #2]
; CHECK-NEXT:    ret
  %pcast = bitcast i8* %p to i16*
  %p2 = getelementptr inbounds i16, i16* %pcast, i64 1
  %x3 = trunc i32 %x to i8
  store i8 %x3, i8* %p, align 1
  %sh = lshr i32 %x, 16
  %x01 = trunc i32 %sh to i16
  store i16 %x01, i16* %p2, align 1
  ret void
}

define void @merge_hole2(i32 %x, i8* %p) {
; CHECK-LABEL: merge_hole2:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    lsr w8, w0, #16
; CHECK-NEXT:    strh w8, [x1, #2]
; CHECK-NEXT:    strb w0, [x1]
; CHECK-NEXT:    ret
  %pcast = bitcast i8* %p to i16*
  %p2 = getelementptr inbounds i16, i16* %pcast, i64 1
  %sh = lshr i32 %x, 16
  %x01 = trunc i32 %sh to i16
  store i16 %x01, i16* %p2, align 1
  %x3 = trunc i32 %x to i8
  store i8 %x3, i8* %p, align 1
  ret void
}

define void @merge_hole3(i32 %x, i8* %p) {
; CHECK-LABEL: merge_hole3:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    lsr w8, w0, #16
; CHECK-NEXT:    strb w0, [x1, #1]
; CHECK-NEXT:    strh w8, [x1, #2]
; CHECK-NEXT:    ret
  %p1 = getelementptr inbounds i8, i8* %p, i64 1
  %pcast = bitcast i8* %p to i16*
  %p2 = getelementptr inbounds i16, i16* %pcast, i64 1
  %x3 = trunc i32 %x to i8
  store i8 %x3, i8* %p1, align 1
  %sh = lshr i32 %x, 16
  %x01 = trunc i32 %sh to i16
  store i16 %x01, i16* %p2, align 1
  ret void
}

define void @merge_hole4(i32 %x, i8* %p) {
; CHECK-LABEL: merge_hole4:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    lsr w8, w0, #16
; CHECK-NEXT:    strb w0, [x1, #2]
; CHECK-NEXT:    strh w8, [x1]
; CHECK-NEXT:    ret
  %pcast = bitcast i8* %p to i16*
  %p2 = getelementptr inbounds i8, i8* %p, i64 2
  %x3 = trunc i32 %x to i8
  store i8 %x3, i8* %p2, align 1
  %sh = lshr i32 %x, 16
  %x01 = trunc i32 %sh to i16
  store i16 %x01, i16* %pcast, align 1
  ret void
}

define dso_local i32 @load_between_stores(i32 %x, i16* %p, i32 *%ptr) {
; CHECK-LABEL: load_between_stores:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    strh w0, [x1]
; CHECK-NEXT:    ldr w8, [x2]
; CHECK-NEXT:    lsr w9, w0, #16
; CHECK-NEXT:    strh w9, [x1, #2]
; CHECK-NEXT:    mov w0, w8
; CHECK-NEXT:    ret
  %t1 = trunc i32 %x to i16
  %sh = lshr i32 %x, 16
  %t2 = trunc i32 %sh to i16
  store i16 %t1, i16* %p, align 2
  %ld = load i32, i32 *%ptr
  %p1 = getelementptr inbounds i16, i16* %p, i64 1
  store i16 %t2, i16* %p1, align 2
  ret i32 %ld
}

define dso_local void @invalid_shift(i16 %x, i8* %p) {
; CHECK-LABEL: invalid_shift:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    and w8, w0, #0xffff
; CHECK-NEXT:    lsr w8, w8, #4
; CHECK-NEXT:    strb w0, [x1]
; CHECK-NEXT:    strb w8, [x1, #1]
; CHECK-NEXT:    ret
  %t1 = trunc i16 %x to i8
  %sh = lshr i16 %x, 4
  %t2 = trunc i16 %sh to i8
  store i8 %t1, i8* %p, align 1
  %p1 = getelementptr inbounds i8, i8* %p, i64 1
  store i8 %t2, i8* %p1, align 1
  ret void
}

define dso_local void @missing_store(i32 %x, i8* %p) {
; CHECK-LABEL: missing_store:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    lsr w8, w0, #8
; CHECK-NEXT:    lsr w9, w0, #24
; CHECK-NEXT:    strb w0, [x1]
; CHECK-NEXT:    strb w8, [x1, #1]
; CHECK-NEXT:    strb w9, [x1, #3]
; CHECK-NEXT:    ret
  %t1 = trunc i32 %x to i8
  %sh1 = lshr i32 %x, 8
  %t2 = trunc i32 %sh1 to i8
  %sh3 = lshr i32 %x, 24
  %t4 = trunc i32 %sh3 to i8
  store i8 %t1, i8* %p, align 1
  %p1 = getelementptr inbounds i8, i8* %p, i64 1
  store i8 %t2, i8* %p1, align 1
  %p3 = getelementptr inbounds i8, i8* %p, i64 3
  store i8 %t4, i8* %p3, align 1
  ret void
}

define dso_local void @different_base_reg(i16 %x, i8* %p, i8 *%p2) {
; CHECK-LABEL: different_base_reg:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    and w8, w0, #0xffff
; CHECK-NEXT:    lsr w8, w8, #8
; CHECK-NEXT:    strb w0, [x1]
; CHECK-NEXT:    strb w8, [x2, #1]
; CHECK-NEXT:    ret
  %t1 = trunc i16 %x to i8
  %sh = lshr i16 %x, 8
  %t2 = trunc i16 %sh to i8
  store i8 %t1, i8* %p, align 1
  %p1 = getelementptr inbounds i8, i8* %p2, i64 1
  store i8 %t2, i8* %p1, align 1
  ret void
}

define dso_local void @second_store_is_volatile(i16 %x, i8* %p) {
; CHECK-LABEL: second_store_is_volatile:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    and w8, w0, #0xffff
; CHECK-NEXT:    lsr w8, w8, #8
; CHECK-NEXT:    strb w0, [x1]
; CHECK-NEXT:    strb w8, [x1, #1]
; CHECK-NEXT:    ret
  %t1 = trunc i16 %x to i8
  %sh = lshr i16 %x, 8
  %t2 = trunc i16 %sh to i8
  store volatile i8 %t1, i8* %p, align 1
  %p1 = getelementptr inbounds i8, i8* %p, i64 1
  store i8 %t2, i8* %p1, align 1
  ret void
}
+737 −0

File added.
