Commit 291101aa authored by Yolanda Chen's avatar Yolanda Chen Committed by Thomas Lively
Browse files

[WebAssembly] Optimize vector shift using a splat value from outside block

The vector shift operations in WebAssembly use an i32 shift amount, while
LLVM IR requires that a binary operator's operands have the same type. When the
shift amount operand is splatted in a different block, the splat source will
not be exported and the vector shift will be unrolled into scalar shifts. This
patch enables the vector shift to identify the splat source value in the other
block and generate the expected WebAssembly bytecode when lowering.

Reviewed By: tlively

Differential Revision: https://reviews.llvm.org/D158399
parent 52b93d2f
Loading
Loading
Loading
Loading
+25 −0
Original line number Diff line number Diff line
@@ -32,6 +32,7 @@
#include "llvm/IR/Function.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsWebAssembly.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
@@ -833,6 +834,30 @@ bool WebAssemblyTargetLowering::isOffsetFoldingLegal(
  return isa<Function>(GV) ? false : TargetLowering::isOffsetFoldingLegal(GA);
}

// Sink a splatted vector-shift amount into the block of the shift so that
// ISel can see the whole insertelement+shufflevector splat pattern and select
// a single SIMD shift (which takes a scalar i32 amount) instead of unrolling
// the shift to scalars.
//
// \param I    the candidate instruction (only vector shifts are considered).
// \param Ops  out-parameter: the uses that CodeGenPrepare should sink next to
//             \p I (the insertelement feeding the shuffle, then the shuffle
//             operand of the shift itself).
// \returns true if \p Ops was populated and sinking is profitable.
bool WebAssemblyTargetLowering::shouldSinkOperands(
    Instruction *I, SmallVectorImpl<Use *> &Ops) const {
  using namespace llvm::PatternMatch;

  if (!I->getType()->isVectorTy() || !I->isShift())
    return false;

  Value *V = I->getOperand(1);
  // We don't need to sink a constant splat; ISel already handles it.
  // NOTE: use isa<> rather than dyn_cast<> when only testing the type
  // (LLVM coding standards).
  if (isa<Constant>(V))
    return false;

  // Match a splat of a scalar into lane 0 broadcast by a zero-mask shuffle.
  if (match(V, m_Shuffle(m_InsertElt(m_Value(), m_Value(), m_ZeroInt()),
                         m_Value(), m_ZeroMask()))) {
    // Sink the insertelement (operand 0 of the shuffle).
    Ops.push_back(&cast<Instruction>(V)->getOperandUse(0));
    // Sink the shufflevector (operand 1 of the shift).
    Ops.push_back(&I->getOperandUse(1));
    return true;
  }

  return false;
}

EVT WebAssemblyTargetLowering::getSetCCResultType(const DataLayout &DL,
                                                  LLVMContext &C,
                                                  EVT VT) const {
+2 −0
Original line number Diff line number Diff line
@@ -76,6 +76,8 @@ private:
  bool isIntDivCheap(EVT VT, AttributeList Attr) const override;
  bool isVectorLoadExtDesirable(SDValue ExtVal) const override;
  bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
  bool shouldSinkOperands(Instruction *I,
                          SmallVectorImpl<Use *> &Ops) const override;
  EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
                         EVT VT) const override;
  bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I,
+104 −0
Original line number Diff line number Diff line
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc < %s -verify-machineinstrs -mattr=+simd128 | FileCheck %s

; Test that SIMD shifts can be lowered correctly even when shift
; values are exported from outside blocks.

target triple = "wasm32-unknown-unknown"

define void @shl_loop(ptr %a, i8 %shift, i32 %count) {
; CHECK-LABEL: shl_loop:
; CHECK:         .functype shl_loop (i32, i32, i32) -> ()
; CHECK-NEXT:  # %bb.0: # %entry
; CHECK-NEXT:  .LBB0_1: # %body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    loop # label0:
; CHECK-NEXT:    local.get 0
; CHECK-NEXT:    local.get 0
; CHECK-NEXT:    v128.load 0:p2align=0
; CHECK-NEXT:    local.get 1
; CHECK-NEXT:    i8x16.shl
; CHECK-NEXT:    v128.store 16
; CHECK-NEXT:    local.get 0
; CHECK-NEXT:    i32.const 16
; CHECK-NEXT:    i32.add
; CHECK-NEXT:    local.set 0
; CHECK-NEXT:    local.get 2
; CHECK-NEXT:    i32.const -1
; CHECK-NEXT:    i32.add
; CHECK-NEXT:    local.tee 2
; CHECK-NEXT:    i32.eqz
; CHECK-NEXT:    br_if 0 # 0: up to label0
; CHECK-NEXT:  # %bb.2: # %exit
; CHECK-NEXT:    end_loop
; CHECK-NEXT:    # fallthrough-return
; The splat of %shift is created in %entry but consumed by the shl in %body.
; shouldSinkOperands must sink the insertelement/shufflevector pair into
; %body so ISel selects a single i8x16.shl with a scalar shift amount
; (checked above) instead of unrolling to scalar shifts.
entry:
 %t1 = insertelement <16 x i8> undef, i8 %shift, i32 0
 %vshift = shufflevector <16 x i8> %t1, <16 x i8> undef, <16 x i32> zeroinitializer
 br label %body
body:
 %out = phi ptr [%a, %entry], [%b, %body]
 %i = phi i32 [0, %entry], [%next, %body]
 ; The load is align 1, hence the p2align=0 on v128.load above.
 %v = load <16 x i8>, ptr %out, align 1
 %r = shl <16 x i8> %v, %vshift
 %b = getelementptr inbounds i8, ptr %out, i32 16
 store <16 x i8> %r, ptr %b
 %next = add i32 %i, 1
 %i.cmp = icmp eq i32 %next, %count
 br i1 %i.cmp, label %body, label %exit
exit:
 ret void
}

; Test that SIMD shifts can be lowered correctly when shift value
; is a phi inside loop body.

define void @shl_phi_loop(ptr %a, i8 %shift, i32 %count) {
; CHECK-LABEL: shl_phi_loop:
; CHECK:         .functype shl_phi_loop (i32, i32, i32) -> ()
; CHECK-NEXT:  # %bb.0: # %entry
; CHECK-NEXT:  .LBB1_1: # %body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    loop # label1:
; CHECK-NEXT:    local.get 0
; CHECK-NEXT:    local.get 0
; CHECK-NEXT:    v128.load 0:p2align=0
; CHECK-NEXT:    local.get 1
; CHECK-NEXT:    i8x16.shl
; CHECK-NEXT:    v128.store 16
; CHECK-NEXT:    local.get 1
; CHECK-NEXT:    i32.const 1
; CHECK-NEXT:    i32.and
; CHECK-NEXT:    local.set 1
; CHECK-NEXT:    local.get 0
; CHECK-NEXT:    i32.const 16
; CHECK-NEXT:    i32.add
; CHECK-NEXT:    local.set 0
; CHECK-NEXT:    local.get 2
; CHECK-NEXT:    i32.const -1
; CHECK-NEXT:    i32.add
; CHECK-NEXT:    local.tee 2
; CHECK-NEXT:    i32.eqz
; CHECK-NEXT:    br_if 0 # 0: up to label1
; CHECK-NEXT:  # %bb.2: # %exit
; CHECK-NEXT:    end_loop
; CHECK-NEXT:    # fallthrough-return
; Unlike shl_loop, the splat source here is a phi (%t1) defined in the same
; block as the shift; the splat pattern must still be recognized so ISel
; emits a single i8x16.shl with the scalar phi value as the shift amount.
entry:
 br label %body
body:
 %out = phi ptr [%a, %entry], [%b, %body]
 %i = phi i32 [0, %entry], [%next, %body]
 %t1 = phi i8 [%shift, %entry], [%sand, %body]
 %t2 = insertelement <16 x i8> undef, i8 %t1, i32 0
 %vshift = shufflevector <16 x i8> %t2, <16 x i8> undef, <16 x i32> zeroinitializer
 ; The load is align 1, hence the p2align=0 on v128.load above.
 %v = load <16 x i8>, ptr %out, align 1
 %r = shl <16 x i8> %v, %vshift
 %b = getelementptr inbounds i8, ptr %out, i32 16
 store <16 x i8> %r, ptr %b
 %sand = and i8 %t1, 1
 %next = add i32 %i, 1
 %i.cmp = icmp eq i32 %next, %count
 br i1 %i.cmp, label %body, label %exit
exit:
 ret void
}