Commit eae44c8a authored by Amara Emerson

[GlobalISel] Implement merging of stores of truncates.

This is a port of a combine which matches a pattern where a wide type scalar
value is stored by several narrow stores. It folds them into a single store or
a BSWAP and a store if the target supports it.

Assuming little endian target:
 i8 *p = ...
 i32 val = ...
 p[0] = (val >> 0) & 0xFF;
 p[1] = (val >> 8) & 0xFF;
 p[2] = (val >> 16) & 0xFF;
 p[3] = (val >> 24) & 0xFF;
=>
 *((i32)p) = val;
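
Storing the bytes in the opposite order instead folds to a byte swap, as in
the doc comment in the patch:
 i8 *p = ...
 i32 val = ...
 p[0] = (val >> 24) & 0xFF;
 p[1] = (val >> 16) & 0xFF;
 p[2] = (val >> 8) & 0xFF;
 p[3] = (val >> 0) & 0xFF;
=>
 *((i32)p) = BSWAP(val);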

On CTMark AArch64 -Os this results in a good amount of savings:

Program            before        after       diff
             SPASS 412792       412788       -0.0%
                kc 432528       432512       -0.0%
            lencod 430112       430096       -0.0%
  consumer-typeset 419156       419128       -0.0%
            bullet 475840       475752       -0.0%
        tramp3d-v4 367760       367628       -0.0%
          clamscan 383388       383204       -0.0%
    pairlocalalign 249764       249476       -0.1%
    7zip-benchmark 570100       568860       -0.2%
           sqlite3 287628       286920       -0.2%
Geomean difference                           -0.1%

Differential Revision: https://reviews.llvm.org/D109419
parent e741fabc
+11 −0
@@ -74,6 +74,14 @@ struct ShiftOfShiftedLogic {

using BuildFnTy = std::function<void(MachineIRBuilder &)>;

struct MergeTruncStoresInfo {
  SmallVector<GStore *> FoundStores;
  GStore *LowestIdxStore = nullptr;
  Register WideSrcVal;
  bool NeedBSwap = false;
  bool NeedRotate = false;
};

using OperandBuildSteps =
    SmallVector<std::function<void(MachineInstrBuilder &)>, 4>;
struct InstructionBuildSteps {
@@ -523,6 +531,9 @@ public:
  /// bswap.
  bool matchLoadOrCombine(MachineInstr &MI, BuildFnTy &MatchInfo);

  bool matchTruncStoreMerge(MachineInstr &MI, MergeTruncStoresInfo &MatchInfo);
  void applyTruncStoreMerge(MachineInstr &MI, MergeTruncStoresInfo &MatchInfo);

  bool matchExtendThroughPhis(MachineInstr &MI, MachineInstr *&ExtMI);
  void applyExtendThroughPhis(MachineInstr &MI, MachineInstr *&ExtMI);

+10 −2
@@ -594,6 +594,14 @@ def load_or_combine : GICombineRule<
    [{ return Helper.matchLoadOrCombine(*${root}, ${info}); }]),
  (apply [{ Helper.applyBuildFn(*${root}, ${info}); }])>;


def truncstore_merge_matchdata : GIDefMatchData<"MergeTruncStoresInfo">;
def truncstore_merge : GICombineRule<
  (defs root:$root, truncstore_merge_matchdata:$info),
  (match (wip_match_opcode G_STORE):$root,
   [{ return Helper.matchTruncStoreMerge(*${root}, ${info}); }]),
  (apply [{ Helper.applyTruncStoreMerge(*${root}, ${info}); }])>;

def extend_through_phis_matchdata: GIDefMatchData<"MachineInstr*">;
def extend_through_phis : GICombineRule<
  (defs root:$root, extend_through_phis_matchdata:$matchinfo),
@@ -733,8 +741,8 @@ def all_combines : GICombineGroup<[trivial_combines, insert_vec_elt_combines,
    unmerge_zext_to_zext, merge_unmerge, trunc_ext_fold, trunc_shl,
    const_combines, xor_of_and_with_same_reg, ptr_add_with_zero,
    shift_immed_chain, shift_of_shifted_logic_chain, load_or_combine,
-    div_rem_to_divrem, funnel_shift_combines, form_bitfield_extract,
-    constant_fold]>;
+    truncstore_merge, div_rem_to_divrem, funnel_shift_combines,
+    form_bitfield_extract, constant_fold]>;

// A combine group used for prelegalizer combiners at -O0. The combines in
// this group have been selected based on experiments to balance code size and
+267 −0
@@ -28,6 +28,8 @@
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/MathExtras.h"
#include <tuple>

@@ -3266,6 +3268,271 @@ bool CombinerHelper::matchLoadOrCombine(
  return true;
}

/// Check if the store \p Store is a truncstore that can be merged. That is,
/// it's a store of a shifted value of \p SrcVal. If \p SrcVal is an empty
/// Register then it does not need to match and SrcVal is set to the source
/// value found.
/// On match, returns the index of the narrow piece of \p SrcVal being
/// stored, in units of the narrow memory type (so an lshr by 16 stored as
/// s8 is piece 2).
static Optional<int64_t> getTruncStoreByteOffset(GStore &Store, Register &SrcVal,
                                                 MachineRegisterInfo &MRI) {
  Register TruncVal;
  if (!mi_match(Store.getValueReg(), MRI, m_GTrunc(m_Reg(TruncVal))))
    return None;

  // The shift amount must be a constant multiple of the narrow type.
  // It is translated to the offset address in the wide source value "y".
  //
  // x = G_LSHR y, ShiftAmtC
  // s8 z = G_TRUNC x
  // store z, ...
  Register FoundSrcVal;
  int64_t ShiftAmt;
  if (!mi_match(TruncVal, MRI,
                m_any_of(m_GLShr(m_Reg(FoundSrcVal), m_ICst(ShiftAmt)),
                         m_GAShr(m_Reg(FoundSrcVal), m_ICst(ShiftAmt))))) {
    if (!SrcVal.isValid() || TruncVal == SrcVal) {
      if (!SrcVal.isValid())
        SrcVal = TruncVal;
      return 0; // If it's the lowest index store.
    }
    return None;
  }

  unsigned NarrowBits = Store.getMMO().getMemoryType().getScalarSizeInBits();
  if (ShiftAmt % NarrowBits != 0)
    return None;
  const unsigned Offset = ShiftAmt / NarrowBits;

  if (SrcVal.isValid() && FoundSrcVal != SrcVal)
    return None;

  if (!SrcVal.isValid())
    SrcVal = FoundSrcVal;
  else if (MRI.getType(SrcVal) != MRI.getType(FoundSrcVal))
    return None;
  return Offset;
}
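
// Editorial sketch, not part of the patch: the returned value is just the
// shift amount divided by the narrow store width. A self-contained C++17
// illustration of that arithmetic, with hypothetical names:
#include <cstdint>
#include <optional>

// Index of the narrow piece of the wide value covered by a store of
// (trunc (lshr Val, ShiftAmt)) with a NarrowBits-wide memory type.
std::optional<int64_t> pieceIndex(int64_t ShiftAmt, unsigned NarrowBits) {
  if (ShiftAmt % NarrowBits != 0)
    return std::nullopt;        // e.g. lshr by 4 into s8 stores: no merge.
  return ShiftAmt / NarrowBits; // e.g. lshr by 16 into s8 stores: piece 2.
}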

/// Match a pattern where a wide type scalar value is stored by several narrow
/// stores. Fold it into a single store or a BSWAP and a store if the target
/// supports it.
///
/// Assuming little endian target:
///  i8 *p = ...
///  i32 val = ...
///  p[0] = (val >> 0) & 0xFF;
///  p[1] = (val >> 8) & 0xFF;
///  p[2] = (val >> 16) & 0xFF;
///  p[3] = (val >> 24) & 0xFF;
/// =>
///  *((i32)p) = val;
///
///  i8 *p = ...
///  i32 val = ...
///  p[0] = (val >> 24) & 0xFF;
///  p[1] = (val >> 16) & 0xFF;
///  p[2] = (val >> 8) & 0xFF;
///  p[3] = (val >> 0) & 0xFF;
/// =>
///  *((i32)p) = BSWAP(val);
bool CombinerHelper::matchTruncStoreMerge(MachineInstr &MI,
                                          MergeTruncStoresInfo &MatchInfo) {
  auto &StoreMI = cast<GStore>(MI);
  LLT MemTy = StoreMI.getMMO().getMemoryType();

  // We only handle merging simple stores of 1, 2 or 4 bytes.
  if (!MemTy.isScalar())
    return false;
  switch (MemTy.getSizeInBits()) {
  case 8:
  case 16:
  case 32:
    break;
  default:
    return false;
  }
  if (!StoreMI.isSimple())
    return false;

  // We do a simple search for mergeable stores prior to this one.
  // Any potential alias hazard along the way terminates the search.
  SmallVector<GStore *> FoundStores;

  // We're looking for:
  // 1) a (store(trunc(...)))
  // 2) of an LSHR/ASHR of a single wide value, by the appropriate shift to get
  //    the partial value stored.
  // 3) where the offsets form either a little or big-endian sequence.

  auto &LastStore = StoreMI;

  // The single base pointer that all stores must use.
  Register BaseReg;
  int64_t LastOffset;
  if (!mi_match(LastStore.getPointerReg(), MRI,
                m_GPtrAdd(m_Reg(BaseReg), m_ICst(LastOffset)))) {
    BaseReg = LastStore.getPointerReg();
    LastOffset = 0;
  }

  GStore *LowestIdxStore = &LastStore;
  int64_t LowestIdxOffset = LastOffset;

  Register WideSrcVal;
  auto LowestShiftAmt = getTruncStoreByteOffset(LastStore, WideSrcVal, MRI);
  if (!LowestShiftAmt)
    return false; // Didn't match a trunc.
  assert(WideSrcVal.isValid());

  LLT WideStoreTy = MRI.getType(WideSrcVal);
  const unsigned NumStoresRequired =
      WideStoreTy.getSizeInBits() / MemTy.getSizeInBits();

  SmallVector<int64_t, 8> OffsetMap(NumStoresRequired, INT64_MAX);
  OffsetMap[*LowestShiftAmt] = LastOffset;
  FoundStores.emplace_back(&LastStore);

  // Search upwards in the block for more stores.
  // We use a search threshold of 10 instructions here because the combiner
  // works top-down within a block, and we don't want to search an unbounded
  // number of predecessor instructions trying to find matching stores.
  // If we moved this optimization into a separate pass then we could probably
  // use a more efficient search without having a hard-coded threshold.
  const int MaxInstsToCheck = 10;
  int NumInstsChecked = 0;
  for (auto II = ++LastStore.getReverseIterator();
       II != LastStore.getParent()->rend() && NumInstsChecked < MaxInstsToCheck;
       ++II) {
    NumInstsChecked++;
    GStore *NewStore;
    if ((NewStore = dyn_cast<GStore>(&*II))) {
      if (NewStore->getMMO().getMemoryType() != MemTy || !NewStore->isSimple())
        break;
    } else if (II->isLoadFoldBarrier() || II->mayLoad()) {
      break;
    } else {
      continue; // This is a safe instruction we can look past.
    }

    Register NewBaseReg;
    int64_t MemOffset;
    // Check we're storing to the same base + some offset.
    if (!mi_match(NewStore->getPointerReg(), MRI,
                  m_GPtrAdd(m_Reg(NewBaseReg), m_ICst(MemOffset)))) {
      NewBaseReg = NewStore->getPointerReg();
      MemOffset = 0;
    }
    if (BaseReg != NewBaseReg)
      break;

    auto ShiftByteOffset = getTruncStoreByteOffset(*NewStore, WideSrcVal, MRI);
    if (!ShiftByteOffset)
      break;
    if (MemOffset < LowestIdxOffset) {
      LowestIdxOffset = MemOffset;
      LowestIdxStore = NewStore;
    }

    // Map the offset within the combined value to the store's memory offset,
    // and bail out if that slot has already been claimed by another store.
    if (*ShiftByteOffset < 0 || *ShiftByteOffset >= NumStoresRequired ||
        OffsetMap[*ShiftByteOffset] != INT64_MAX)
      break;
    OffsetMap[*ShiftByteOffset] = MemOffset;

    FoundStores.emplace_back(NewStore);
    // Reset counter since we've found a matching inst.
    NumInstsChecked = 0;
    if (FoundStores.size() == NumStoresRequired)
      break;
  }

  if (FoundStores.size() != NumStoresRequired)
    return false;

  const auto &DL = LastStore.getMF()->getDataLayout();
  auto &C = LastStore.getMF()->getFunction().getContext();
  // Check that a store of the wide type is both allowed and fast on the
  // target.
  bool Fast = false;
  bool Allowed = getTargetLowering().allowsMemoryAccess(
      C, DL, WideStoreTy, LowestIdxStore->getMMO(), &Fast);
  if (!Allowed || !Fast)
    return false;

  // Check if the pieces of the value are going to the expected places in memory
  // to merge the stores.
  unsigned NarrowBits = MemTy.getScalarSizeInBits();
  auto checkOffsets = [&](bool MatchLittleEndian) {
    if (MatchLittleEndian) {
      for (unsigned i = 0; i != NumStoresRequired; ++i)
        if (OffsetMap[i] != i * (NarrowBits / 8) + LowestIdxOffset)
          return false;
    } else { // MatchBigEndian by reversing loop counter.
      for (unsigned i = 0, j = NumStoresRequired - 1; i != NumStoresRequired;
           ++i, --j)
        if (OffsetMap[j] != i * (NarrowBits / 8) + LowestIdxOffset)
          return false;
    }
    return true;
  };

  // Check if the offsets line up for the native data layout of this target.
  bool NeedBswap = false;
  bool NeedRotate = false;
  if (!checkOffsets(DL.isLittleEndian())) {
    // Special-case: check if byte offsets line up for the opposite endian.
    if (NarrowBits == 8 && checkOffsets(DL.isBigEndian()))
      NeedBswap = true;
    else if (NumStoresRequired == 2 && checkOffsets(DL.isBigEndian()))
      NeedRotate = true;
    else
      return false;
  }

  if (NeedBswap &&
      !isLegalOrBeforeLegalizer({TargetOpcode::G_BSWAP, {WideStoreTy}}))
    return false;
  if (NeedRotate &&
      !isLegalOrBeforeLegalizer({TargetOpcode::G_ROTR, {WideStoreTy}}))
    return false;

  MatchInfo.NeedBSwap = NeedBswap;
  MatchInfo.NeedRotate = NeedRotate;
  MatchInfo.LowestIdxStore = LowestIdxStore;
  MatchInfo.WideSrcVal = WideSrcVal;
  MatchInfo.FoundStores = std::move(FoundStores);
  return true;
}
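
// Editorial note, not part of the patch: concretely, for four s8 pieces of
// an s32 with the lowest store at byte offset 0, checkOffsets accepts
// OffsetMap == {0, 1, 2, 3} for the native little-endian layout; the exact
// reverse, {3, 2, 1, 0}, matches the opposite-endian check and selects the
// G_BSWAP path, while a reversed two-piece pattern selects the cheaper
// G_ROTR instead.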

void CombinerHelper::applyTruncStoreMerge(MachineInstr &MI,
                                          MergeTruncStoresInfo &MatchInfo) {
  Builder.setInstrAndDebugLoc(MI);
  Register WideSrcVal = MatchInfo.WideSrcVal;
  LLT WideStoreTy = MRI.getType(WideSrcVal);

  if (MatchInfo.NeedBSwap) {
    WideSrcVal = Builder.buildBSwap(WideStoreTy, WideSrcVal).getReg(0);
  } else if (MatchInfo.NeedRotate) {
    assert(WideStoreTy.getSizeInBits() % 2 == 0 &&
           "Unexpected type for rotate");
    auto RotAmt =
        Builder.buildConstant(WideStoreTy, WideStoreTy.getSizeInBits() / 2);
    WideSrcVal =
        Builder.buildRotateRight(WideStoreTy, WideSrcVal, RotAmt).getReg(0);
  }

  Builder.buildStore(WideSrcVal, MatchInfo.LowestIdxStore->getPointerReg(),
                     MatchInfo.LowestIdxStore->getMMO().getPointerInfo(),
                     MatchInfo.LowestIdxStore->getMMO().getAlign());

  // Erase the old stores.
  for (auto *ST : MatchInfo.FoundStores)
    ST->eraseFromParent();
}
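
// Editorial sketch, not part of the patch: why a G_ROTR by half the width
// suffices in the two-piece case. Rotating right by half the bit width
// swaps the two halves of the value (hypothetical helper):
#include <cstdint>
static uint32_t swapHalves(uint32_t V) { return (V >> 16) | (V << 16); }
// swapHalves(0xAAAABBBB) == 0xBBBBAAAA, matching the "ror w8, w0, #16" +
// "str w8, [x1]" sequences in the be_i32_to_i16 tests below.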

bool CombinerHelper::matchExtendThroughPhis(MachineInstr &MI,
                                            MachineInstr *&ExtMI) {
  assert(MI.getOpcode() == TargetOpcode::G_PHI);
+348 −0
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=aarch64-apple-ios -global-isel -global-isel-abort=1 | FileCheck %s

define dso_local void @trunc_i16_to_i8(i16 %x, i8* %p) {
; CHECK-LABEL: trunc_i16_to_i8:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    strh w0, [x1]
; CHECK-NEXT:    ret
  %t1 = trunc i16 %x to i8
  %sh = lshr i16 %x, 8
  %t2 = trunc i16 %sh to i8
  store i8 %t1, i8* %p, align 1
  %p1 = getelementptr inbounds i8, i8* %p, i64 1
  store i8 %t2, i8* %p1, align 1
  ret void
}

define dso_local void @trunc_i32_to_i8(i32 %x, i8* %p) {
; CHECK-LABEL: trunc_i32_to_i8:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    str w0, [x1]
; CHECK-NEXT:    ret
  %t1 = trunc i32 %x to i8
  %sh1 = lshr i32 %x, 8
  %t2 = trunc i32 %sh1 to i8
  %sh2 = lshr i32 %x, 16
  %t3 = trunc i32 %sh2 to i8
  %sh3 = lshr i32 %x, 24
  %t4 = trunc i32 %sh3 to i8
  store i8 %t1, i8* %p, align 1
  %p1 = getelementptr inbounds i8, i8* %p, i64 1
  store i8 %t2, i8* %p1, align 1
  %p2 = getelementptr inbounds i8, i8* %p, i64 2
  store i8 %t3, i8* %p2, align 1
  %p3 = getelementptr inbounds i8, i8* %p, i64 3
  store i8 %t4, i8* %p3, align 1
  ret void
}

define dso_local void @trunc_i32_to_i16(i32 %x, i16* %p) {
; CHECK-LABEL: trunc_i32_to_i16:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    str w0, [x1]
; CHECK-NEXT:    ret
  %t1 = trunc i32 %x to i16
  %sh = lshr i32 %x, 16
  %t2 = trunc i32 %sh to i16
  store i16 %t1, i16* %p, align 2
  %p1 = getelementptr inbounds i16, i16* %p, i64 1
  store i16 %t2, i16* %p1, align 2
  ret void
}

define dso_local void @be_i32_to_i16(i32 %x, i16* %p0) {
; CHECK-LABEL: be_i32_to_i16:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    ror w8, w0, #16
; CHECK-NEXT:    str w8, [x1]
; CHECK-NEXT:    ret
  %sh1 = lshr i32 %x, 16
  %t0 = trunc i32 %x to i16
  %t1 = trunc i32 %sh1 to i16
  %p1 = getelementptr inbounds i16, i16* %p0, i64 1
  store i16 %t0, i16* %p1, align 2
  store i16 %t1, i16* %p0, align 2
  ret void
}

define dso_local void @be_i32_to_i16_order(i32 %x, i16* %p0) {
; CHECK-LABEL: be_i32_to_i16_order:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    ror w8, w0, #16
; CHECK-NEXT:    str w8, [x1]
; CHECK-NEXT:    ret
  %sh1 = lshr i32 %x, 16
  %t0 = trunc i32 %x to i16
  %t1 = trunc i32 %sh1 to i16
  %p1 = getelementptr inbounds i16, i16* %p0, i64 1
  store i16 %t1, i16* %p0, align 2
  store i16 %t0, i16* %p1, align 2
  ret void
}

define dso_local void @trunc_i64_to_i8(i64 %x, i8* %p) {
; CHECK-LABEL: trunc_i64_to_i8:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    str x0, [x1]
; CHECK-NEXT:    ret
  %t1 = trunc i64 %x to i8
  %sh1 = lshr i64 %x, 8
  %t2 = trunc i64 %sh1 to i8
  %sh2 = lshr i64 %x, 16
  %t3 = trunc i64 %sh2 to i8
  %sh3 = lshr i64 %x, 24
  %t4 = trunc i64 %sh3 to i8
  %sh4 = lshr i64 %x, 32
  %t5 = trunc i64 %sh4 to i8
  %sh5 = lshr i64 %x, 40
  %t6 = trunc i64 %sh5 to i8
  %sh6 = lshr i64 %x, 48
  %t7 = trunc i64 %sh6 to i8
  %sh7 = lshr i64 %x, 56
  %t8 = trunc i64 %sh7 to i8
  store i8 %t1, i8* %p, align 1
  %p1 = getelementptr inbounds i8, i8* %p, i64 1
  store i8 %t2, i8* %p1, align 1
  %p2 = getelementptr inbounds i8, i8* %p, i64 2
  store i8 %t3, i8* %p2, align 1
  %p3 = getelementptr inbounds i8, i8* %p, i64 3
  store i8 %t4, i8* %p3, align 1
  %p4 = getelementptr inbounds i8, i8* %p, i64 4
  store i8 %t5, i8* %p4, align 1
  %p5 = getelementptr inbounds i8, i8* %p, i64 5
  store i8 %t6, i8* %p5, align 1
  %p6 = getelementptr inbounds i8, i8* %p, i64 6
  store i8 %t7, i8* %p6, align 1
  %p7 = getelementptr inbounds i8, i8* %p, i64 7
  store i8 %t8, i8* %p7, align 1
  ret void
}

define dso_local void @trunc_i64_to_i16(i64 %x, i16* %p) {
; CHECK-LABEL: trunc_i64_to_i16:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    str x0, [x1]
; CHECK-NEXT:    ret
  %t1 = trunc i64 %x to i16
  %sh1 = lshr i64 %x, 16
  %t2 = trunc i64 %sh1 to i16
  %sh2 = lshr i64 %x, 32
  %t3 = trunc i64 %sh2 to i16
  %sh3 = lshr i64 %x, 48
  %t4 = trunc i64 %sh3 to i16
  store i16 %t1, i16* %p, align 2
  %p1 = getelementptr inbounds i16, i16* %p, i64 1
  store i16 %t2, i16* %p1, align 2
  %p2 = getelementptr inbounds i16, i16* %p, i64 2
  store i16 %t3, i16* %p2, align 2
  %p3 = getelementptr inbounds i16, i16* %p, i64 3
  store i16 %t4, i16* %p3, align 2
  ret void
}

define dso_local void @trunc_i64_to_i32(i64 %x, i32* %p) {
; CHECK-LABEL: trunc_i64_to_i32:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    str x0, [x1]
; CHECK-NEXT:    ret
  %t1 = trunc i64 %x to i32
  %sh = lshr i64 %x, 32
  %t2 = trunc i64 %sh to i32
  store i32 %t1, i32* %p, align 4
  %p1 = getelementptr inbounds i32, i32* %p, i64 1
  store i32 %t2, i32* %p1, align 4
  ret void
}

define dso_local void @be_i64_to_i32(i64 %x, i32* %p0) {
; CHECK-LABEL: be_i64_to_i32:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    ror x8, x0, #32
; CHECK-NEXT:    str x8, [x1]
; CHECK-NEXT:    ret
  %sh1 = lshr i64 %x, 32
  %t0 = trunc i64 %x to i32
  %t1 = trunc i64 %sh1 to i32
  %p1 = getelementptr inbounds i32, i32* %p0, i64 1
  store i32 %t0, i32* %p1, align 4
  store i32 %t1, i32* %p0, align 4
  ret void
}

define dso_local void @be_i64_to_i32_order(i64 %x, i32* %p0) {
; CHECK-LABEL: be_i64_to_i32_order:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    ror x8, x0, #32
; CHECK-NEXT:    str x8, [x1]
; CHECK-NEXT:    ret
  %sh1 = lshr i64 %x, 32
  %t0 = trunc i64 %x to i32
  %t1 = trunc i64 %sh1 to i32
  %p1 = getelementptr inbounds i32, i32* %p0, i64 1
  store i32 %t1, i32* %p0, align 4
  store i32 %t0, i32* %p1, align 4
  ret void
}

; Negative tests.

define void @merge_hole(i32 %x, i8* %p) {
; CHECK-LABEL: merge_hole:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    lsr w8, w0, #16
; CHECK-NEXT:    strb w0, [x1]
; CHECK-NEXT:    strh w8, [x1, #2]
; CHECK-NEXT:    ret
  %pcast = bitcast i8* %p to i16*
  %p2 = getelementptr inbounds i16, i16* %pcast, i64 1
  %x3 = trunc i32 %x to i8
  store i8 %x3, i8* %p, align 1
  %sh = lshr i32 %x, 16
  %x01 = trunc i32 %sh to i16
  store i16 %x01, i16* %p2, align 1
  ret void
}

define void @merge_hole2(i32 %x, i8* %p) {
; CHECK-LABEL: merge_hole2:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    lsr w8, w0, #16
; CHECK-NEXT:    strh w8, [x1, #2]
; CHECK-NEXT:    strb w0, [x1]
; CHECK-NEXT:    ret
  %pcast = bitcast i8* %p to i16*
  %p2 = getelementptr inbounds i16, i16* %pcast, i64 1
  %sh = lshr i32 %x, 16
  %x01 = trunc i32 %sh to i16
  store i16 %x01, i16* %p2, align 1
  %x3 = trunc i32 %x to i8
  store i8 %x3, i8* %p, align 1
  ret void
}

define void @merge_hole3(i32 %x, i8* %p) {
; CHECK-LABEL: merge_hole3:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    lsr w8, w0, #16
; CHECK-NEXT:    strb w0, [x1, #1]
; CHECK-NEXT:    strh w8, [x1, #2]
; CHECK-NEXT:    ret
  %p1 = getelementptr inbounds i8, i8* %p, i64 1
  %pcast = bitcast i8* %p to i16*
  %p2 = getelementptr inbounds i16, i16* %pcast, i64 1
  %x3 = trunc i32 %x to i8
  store i8 %x3, i8* %p1, align 1
  %sh = lshr i32 %x, 16
  %x01 = trunc i32 %sh to i16
  store i16 %x01, i16* %p2, align 1
  ret void
}

define void @merge_hole4(i32 %x, i8* %p) {
; CHECK-LABEL: merge_hole4:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    lsr w8, w0, #16
; CHECK-NEXT:    strb w0, [x1, #2]
; CHECK-NEXT:    strh w8, [x1]
; CHECK-NEXT:    ret
  %pcast = bitcast i8* %p to i16*
  %p2 = getelementptr inbounds i8, i8* %p, i64 2
  %x3 = trunc i32 %x to i8
  store i8 %x3, i8* %p2, align 1
  %sh = lshr i32 %x, 16
  %x01 = trunc i32 %sh to i16
  store i16 %x01, i16* %pcast, align 1
  ret void
}

define dso_local i32 @load_between_stores(i32 %x, i16* %p, i32 *%ptr) {
; CHECK-LABEL: load_between_stores:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    strh w0, [x1]
; CHECK-NEXT:    ldr w8, [x2]
; CHECK-NEXT:    lsr w9, w0, #16
; CHECK-NEXT:    strh w9, [x1, #2]
; CHECK-NEXT:    mov w0, w8
; CHECK-NEXT:    ret
  %t1 = trunc i32 %x to i16
  %sh = lshr i32 %x, 16
  %t2 = trunc i32 %sh to i16
  store i16 %t1, i16* %p, align 2
  %ld = load i32, i32 *%ptr
  %p1 = getelementptr inbounds i16, i16* %p, i64 1
  store i16 %t2, i16* %p1, align 2
  ret i32 %ld
}

define dso_local void @invalid_shift(i16 %x, i8* %p) {
; CHECK-LABEL: invalid_shift:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    and w8, w0, #0xffff
; CHECK-NEXT:    lsr w8, w8, #4
; CHECK-NEXT:    strb w0, [x1]
; CHECK-NEXT:    strb w8, [x1, #1]
; CHECK-NEXT:    ret
  %t1 = trunc i16 %x to i8
  %sh = lshr i16 %x, 4
  %t2 = trunc i16 %sh to i8
  store i8 %t1, i8* %p, align 1
  %p1 = getelementptr inbounds i8, i8* %p, i64 1
  store i8 %t2, i8* %p1, align 1
  ret void
}

define dso_local void @missing_store(i32 %x, i8* %p) {
; CHECK-LABEL: missing_store:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    lsr w8, w0, #8
; CHECK-NEXT:    lsr w9, w0, #24
; CHECK-NEXT:    strb w0, [x1]
; CHECK-NEXT:    strb w8, [x1, #1]
; CHECK-NEXT:    strb w9, [x1, #3]
; CHECK-NEXT:    ret
  %t1 = trunc i32 %x to i8
  %sh1 = lshr i32 %x, 8
  %t2 = trunc i32 %sh1 to i8
  %sh3 = lshr i32 %x, 24
  %t4 = trunc i32 %sh3 to i8
  store i8 %t1, i8* %p, align 1
  %p1 = getelementptr inbounds i8, i8* %p, i64 1
  store i8 %t2, i8* %p1, align 1
  %p3 = getelementptr inbounds i8, i8* %p, i64 3
  store i8 %t4, i8* %p3, align 1
  ret void
}

define dso_local void @different_base_reg(i16 %x, i8* %p, i8 *%p2) {
; CHECK-LABEL: different_base_reg:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    and w8, w0, #0xffff
; CHECK-NEXT:    lsr w8, w8, #8
; CHECK-NEXT:    strb w0, [x1]
; CHECK-NEXT:    strb w8, [x2, #1]
; CHECK-NEXT:    ret
  %t1 = trunc i16 %x to i8
  %sh = lshr i16 %x, 8
  %t2 = trunc i16 %sh to i8
  store i8 %t1, i8* %p, align 1
  %p1 = getelementptr inbounds i8, i8* %p2, i64 1
  store i8 %t2, i8* %p1, align 1
  ret void
}

define dso_local void @second_store_is_volatile(i16 %x, i8* %p) {
; CHECK-LABEL: second_store_is_volatile:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    and w8, w0, #0xffff
; CHECK-NEXT:    lsr w8, w8, #8
; CHECK-NEXT:    strb w0, [x1]
; CHECK-NEXT:    strb w8, [x1, #1]
; CHECK-NEXT:    ret
  %t1 = trunc i16 %x to i8
  %sh = lshr i16 %x, 8
  %t2 = trunc i16 %sh to i8
  store volatile i8 %t1, i8* %p, align 1
  %p1 = getelementptr inbounds i8, i8* %p, i64 1
  store i8 %t2, i8* %p1, align 1
  ret void
}
+737 −0

File added.
