Commit 291101aa authored by Yolanda Chen's avatar Yolanda Chen Committed by Thomas Lively
Browse files

[WebAssembly] Optimize vector shift using a splat value from outside block

The vector shift operations in WebAssembly use an i32 shift amount, while
LLVM IR requires that a binary operator's operands have the same type. When the
shift amount operand is splatted in a different block, the splat source will
not be exported and the vector shift will be unrolled into scalar shifts. This
patch enables the vector shift to identify the splat source value in the other
block and generate the expected WebAssembly bytecode when lowering.

Reviewed By: tlively

Differential Revision: https://reviews.llvm.org/D158399
parent 52b93d2f
Loading
Loading
Loading
Loading
+25 −0
Original line number Diff line number Diff line
@@ -32,6 +32,7 @@
#include "llvm/IR/Function.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsWebAssembly.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
@@ -833,6 +834,30 @@ bool WebAssemblyTargetLowering::isOffsetFoldingLegal(
  return isa<Function>(GV) ? false : TargetLowering::isOffsetFoldingLegal(GA);
}

// Sink a splatted vector-shift amount into the block of the shift so that
// ISel can see the whole insertelement+shufflevector splat pattern and select
// a single SIMD shift (which takes a scalar i32 amount) instead of unrolling
// the shift to scalars.
//
// \param I    the candidate instruction (only vector shifts are considered).
// \param Ops  out-parameter: the uses that CodeGenPrepare should sink next to
//             \p I (the insertelement feeding the shuffle, then the shuffle
//             operand of the shift itself).
// \returns true if \p Ops was populated and sinking is profitable.
bool WebAssemblyTargetLowering::shouldSinkOperands(
    Instruction *I, SmallVectorImpl<Use *> &Ops) const {
  using namespace llvm::PatternMatch;

  if (!I->getType()->isVectorTy() || !I->isShift())
    return false;

  Value *V = I->getOperand(1);
  // We don't need to sink a constant splat; ISel already handles it.
  // NOTE: use isa<> rather than dyn_cast<> when only testing the type
  // (LLVM coding standards).
  if (isa<Constant>(V))
    return false;

  // Match a splat of a scalar into lane 0 broadcast by a zero-mask shuffle.
  if (match(V, m_Shuffle(m_InsertElt(m_Value(), m_Value(), m_ZeroInt()),
                         m_Value(), m_ZeroMask()))) {
    // Sink the insertelement (operand 0 of the shuffle).
    Ops.push_back(&cast<Instruction>(V)->getOperandUse(0));
    // Sink the shufflevector (operand 1 of the shift).
    Ops.push_back(&I->getOperandUse(1));
    return true;
  }

  return false;
}

EVT WebAssemblyTargetLowering::getSetCCResultType(const DataLayout &DL,
                                                  LLVMContext &C,
                                                  EVT VT) const {
+2 −0
Original line number Diff line number Diff line
@@ -76,6 +76,8 @@ private:
  bool isIntDivCheap(EVT VT, AttributeList Attr) const override;
  bool isVectorLoadExtDesirable(SDValue ExtVal) const override;
  bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
  bool shouldSinkOperands(Instruction *I,
                          SmallVectorImpl<Use *> &Ops) const override;
  EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
                         EVT VT) const override;
  bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I,
+104 −0
Original line number Diff line number Diff line
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc < %s -verify-machineinstrs -mattr=+simd128 | FileCheck %s

; Test that SIMD shifts can be lowered correctly even when shift
; values are exported from outside blocks.

target triple = "wasm32-unknown-unknown"

define void @shl_loop(ptr %a, i8 %shift, i32 %count) {
; CHECK-LABEL: shl_loop:
; CHECK:         .functype shl_loop (i32, i32, i32) -> ()
; CHECK-NEXT:  # %bb.0: # %entry
; CHECK-NEXT:  .LBB0_1: # %body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    loop # label0:
; CHECK-NEXT:    local.get 0
; CHECK-NEXT:    local.get 0
; CHECK-NEXT:    v128.load 0:p2align=0
; CHECK-NEXT:    local.get 1
; CHECK-NEXT:    i8x16.shl
; CHECK-NEXT:    v128.store 16
; CHECK-NEXT:    local.get 0
; CHECK-NEXT:    i32.const 16
; CHECK-NEXT:    i32.add
; CHECK-NEXT:    local.set 0
; CHECK-NEXT:    local.get 2
; CHECK-NEXT:    i32.const -1
; CHECK-NEXT:    i32.add
; CHECK-NEXT:    local.tee 2
; CHECK-NEXT:    i32.eqz
; CHECK-NEXT:    br_if 0 # 0: up to label0
; CHECK-NEXT:  # %bb.2: # %exit
; CHECK-NEXT:    end_loop
; CHECK-NEXT:    # fallthrough-return
; The splat of %shift is created in %entry but consumed by the shl in %body.
; shouldSinkOperands must sink the insertelement/shufflevector pair into
; %body so ISel selects a single i8x16.shl with a scalar shift amount
; (checked above) instead of unrolling to scalar shifts.
entry:
 %t1 = insertelement <16 x i8> undef, i8 %shift, i32 0
 %vshift = shufflevector <16 x i8> %t1, <16 x i8> undef, <16 x i32> zeroinitializer
 br label %body
body:
 %out = phi ptr [%a, %entry], [%b, %body]
 %i = phi i32 [0, %entry], [%next, %body]
 ; The load is align 1, hence the p2align=0 on v128.load above.
 %v = load <16 x i8>, ptr %out, align 1
 %r = shl <16 x i8> %v, %vshift
 %b = getelementptr inbounds i8, ptr %out, i32 16
 store <16 x i8> %r, ptr %b
 %next = add i32 %i, 1
 %i.cmp = icmp eq i32 %next, %count
 br i1 %i.cmp, label %body, label %exit
exit:
 ret void
}

; Test that SIMD shifts can be lowered correctly when shift value
; is a phi inside loop body.

define void @shl_phi_loop(ptr %a, i8 %shift, i32 %count) {
; CHECK-LABEL: shl_phi_loop:
; CHECK:         .functype shl_phi_loop (i32, i32, i32) -> ()
; CHECK-NEXT:  # %bb.0: # %entry
; CHECK-NEXT:  .LBB1_1: # %body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    loop # label1:
; CHECK-NEXT:    local.get 0
; CHECK-NEXT:    local.get 0
; CHECK-NEXT:    v128.load 0:p2align=0
; CHECK-NEXT:    local.get 1
; CHECK-NEXT:    i8x16.shl
; CHECK-NEXT:    v128.store 16
; CHECK-NEXT:    local.get 1
; CHECK-NEXT:    i32.const 1
; CHECK-NEXT:    i32.and
; CHECK-NEXT:    local.set 1
; CHECK-NEXT:    local.get 0
; CHECK-NEXT:    i32.const 16
; CHECK-NEXT:    i32.add
; CHECK-NEXT:    local.set 0
; CHECK-NEXT:    local.get 2
; CHECK-NEXT:    i32.const -1
; CHECK-NEXT:    i32.add
; CHECK-NEXT:    local.tee 2
; CHECK-NEXT:    i32.eqz
; CHECK-NEXT:    br_if 0 # 0: up to label1
; CHECK-NEXT:  # %bb.2: # %exit
; CHECK-NEXT:    end_loop
; CHECK-NEXT:    # fallthrough-return
; Unlike shl_loop, the splat source here is a phi (%t1) defined in the same
; block as the shift; the splat pattern must still be recognized so ISel
; emits a single i8x16.shl with the scalar phi value as the shift amount.
entry:
 br label %body
body:
 %out = phi ptr [%a, %entry], [%b, %body]
 %i = phi i32 [0, %entry], [%next, %body]
 %t1 = phi i8 [%shift, %entry], [%sand, %body]
 %t2 = insertelement <16 x i8> undef, i8 %t1, i32 0
 %vshift = shufflevector <16 x i8> %t2, <16 x i8> undef, <16 x i32> zeroinitializer
 ; The load is align 1, hence the p2align=0 on v128.load above.
 %v = load <16 x i8>, ptr %out, align 1
 %r = shl <16 x i8> %v, %vshift
 %b = getelementptr inbounds i8, ptr %out, i32 16
 store <16 x i8> %r, ptr %b
 %sand = and i8 %t1, 1
 %next = add i32 %i, 1
 %i.cmp = icmp eq i32 %next, %count
 br i1 %i.cmp, label %body, label %exit
exit:
 ret void
}