Commit 0462903e authored by Hans Wennborg's avatar Hans Wennborg
Browse files

Merging r321870, r321872, and r321994:

------------------------------------------------------------------------
r321870 | abataev | 2018-01-05 07:20:40 -0800 (Fri, 05 Jan 2018) | 1 line

[SLP] Update test checks, NFC.
------------------------------------------------------------------------

------------------------------------------------------------------------
r321872 | abataev | 2018-01-05 08:15:17 -0800 (Fri, 05 Jan 2018) | 1 line

[SLP] Update more test checks, NFC.
------------------------------------------------------------------------

------------------------------------------------------------------------
r321994 | abataev | 2018-01-08 06:43:06 -0800 (Mon, 08 Jan 2018) | 13 lines

[SLP] Fix PR35777: Incorrect handling of aggregate values.

Summary:
Fixes the bug with incorrect handling of InsertValue|InsertElement
instrucions in SLP vectorizer. Currently, we may use incorrect
ExtractElement instructions as the operands of the original
InsertValue|InsertElement instructions.

Reviewers: mkuper, hfinkel, RKSimon, spatel

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D41767
------------------------------------------------------------------------

llvm-svn: 322675
parent cf9e4fbd
Loading
Loading
Loading
Loading
+1 −6
Original line number Diff line number Diff line
@@ -95,14 +95,9 @@ private:
  bool tryToVectorizePair(Value *A, Value *B, slpvectorizer::BoUpSLP &R);

  /// \brief Try to vectorize a list of operands.
  /// \@param BuildVector A list of users to ignore for the purpose of
  ///                     scheduling and cost estimation when NeedExtraction
  ///                     is false.
  /// \returns true if a value was vectorized.
  bool tryToVectorizeList(ArrayRef<Value *> VL, slpvectorizer::BoUpSLP &R,
                          ArrayRef<Value *> BuildVector = None,
                          bool AllowReorder = false,
                          bool NeedExtraction = false);
                          bool AllowReorder = false);

  /// \brief Try to vectorize a chain that may start at the operands of \p I.
  bool tryToVectorize(Instruction *I, slpvectorizer::BoUpSLP &R);
+10 −50
Original line number Diff line number Diff line
@@ -4416,13 +4416,11 @@ bool SLPVectorizerPass::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
  if (!A || !B)
    return false;
  Value *VL[] = { A, B };
  return tryToVectorizeList(VL, R, None, true);
  return tryToVectorizeList(VL, R, true);
}

bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
                                           ArrayRef<Value *> BuildVector,
                                           bool AllowReorder,
                                           bool NeedExtraction) {
                                           bool AllowReorder) {
  if (VL.size() < 2)
    return false;

@@ -4516,12 +4514,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
                   << "\n");
      ArrayRef<Value *> Ops = VL.slice(I, OpsWidth);

      ArrayRef<Value *> EmptyArray;
      ArrayRef<Value *> BuildVectorSlice;
      if (!BuildVector.empty())
        BuildVectorSlice = BuildVector.slice(I, OpsWidth);

      R.buildTree(Ops, NeedExtraction ? EmptyArray : BuildVectorSlice);
      R.buildTree(Ops);
      // TODO: check if we can allow reordering for more cases.
      if (AllowReorder && R.shouldReorder()) {
        // Conceptually, there is nothing actually preventing us from trying to
@@ -4529,7 +4522,6 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
        // reductions. However, at this point, we only expect to get here when
        // there are exactly two operations.
        assert(Ops.size() == 2);
        assert(BuildVectorSlice.empty());
        Value *ReorderedOps[] = {Ops[1], Ops[0]};
        R.buildTree(ReorderedOps, None);
      }
@@ -4549,31 +4541,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
                                 << " and with tree size "
                                 << ore::NV("TreeSize", R.getTreeSize()));

        Value *VectorizedRoot = R.vectorizeTree();

        // Reconstruct the build vector by extracting the vectorized root. This
        // way we handle the case where some elements of the vector are
        // undefined.
        //  (return (inserelt <4 xi32> (insertelt undef (opd0) 0) (opd1) 2))
        if (!BuildVectorSlice.empty()) {
          // The insert point is the last build vector instruction. The
          // vectorized root will precede it. This guarantees that we get an
          // instruction. The vectorized tree could have been constant folded.
          Instruction *InsertAfter = cast<Instruction>(BuildVectorSlice.back());
          unsigned VecIdx = 0;
          for (auto &V : BuildVectorSlice) {
            IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
                                        ++BasicBlock::iterator(InsertAfter));
            Instruction *I = cast<Instruction>(V);
            assert(isa<InsertElementInst>(I) || isa<InsertValueInst>(I));
            Instruction *Extract =
                cast<Instruction>(Builder.CreateExtractElement(
                    VectorizedRoot, Builder.getInt32(VecIdx++)));
            I->setOperand(1, Extract);
            I->moveAfter(Extract);
            InsertAfter = I;
          }
        }
        R.vectorizeTree();
        // Move to the next bundle.
        I += VF - 1;
        NextInst = I + 1;
@@ -5494,11 +5462,9 @@ private:
///
/// Returns true if it matches
static bool findBuildVector(InsertElementInst *LastInsertElem,
                            SmallVectorImpl<Value *> &BuildVector,
                            SmallVectorImpl<Value *> &BuildVectorOpds) {
  Value *V = nullptr;
  do {
    BuildVector.push_back(LastInsertElem);
    BuildVectorOpds.push_back(LastInsertElem->getOperand(1));
    V = LastInsertElem->getOperand(0);
    if (isa<UndefValue>(V))
@@ -5507,7 +5473,6 @@ static bool findBuildVector(InsertElementInst *LastInsertElem,
    if (!LastInsertElem || !LastInsertElem->hasOneUse())
      return false;
  } while (true);
  std::reverse(BuildVector.begin(), BuildVector.end());
  std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end());
  return true;
}
@@ -5516,11 +5481,9 @@ static bool findBuildVector(InsertElementInst *LastInsertElem,
///
/// \return true if it matches.
static bool findBuildAggregate(InsertValueInst *IV,
                               SmallVectorImpl<Value *> &BuildVector,
                               SmallVectorImpl<Value *> &BuildVectorOpds) {
  Value *V;
  do {
    BuildVector.push_back(IV);
    BuildVectorOpds.push_back(IV->getInsertedValueOperand());
    V = IV->getAggregateOperand();
    if (isa<UndefValue>(V))
@@ -5529,7 +5492,6 @@ static bool findBuildAggregate(InsertValueInst *IV,
    if (!IV || !IV->hasOneUse())
      return false;
  } while (true);
  std::reverse(BuildVector.begin(), BuildVector.end());
  std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end());
  return true;
}
@@ -5705,27 +5667,25 @@ bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
  if (!R.canMapToVector(IVI->getType(), DL))
    return false;

  SmallVector<Value *, 16> BuildVector;
  SmallVector<Value *, 16> BuildVectorOpds;
  if (!findBuildAggregate(IVI, BuildVector, BuildVectorOpds))
  if (!findBuildAggregate(IVI, BuildVectorOpds))
    return false;

  DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
  // Aggregate value is unlikely to be processed in vector register, we need to
  // extract scalars into scalar registers, so NeedExtraction is set true.
  return tryToVectorizeList(BuildVectorOpds, R, BuildVector, false, true);
  return tryToVectorizeList(BuildVectorOpds, R);
}

bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
                                                   BasicBlock *BB, BoUpSLP &R) {
  SmallVector<Value *, 16> BuildVector;
  SmallVector<Value *, 16> BuildVectorOpds;
  if (!findBuildVector(IEI, BuildVector, BuildVectorOpds))
  if (!findBuildVector(IEI, BuildVectorOpds))
    return false;

  // Vectorize starting with the build vector operands ignoring the BuildVector
  // instructions for the purpose of scheduling and user extraction.
  return tryToVectorizeList(BuildVectorOpds, R, BuildVector);
  return tryToVectorizeList(BuildVectorOpds, R);
}

bool SLPVectorizerPass::vectorizeCmpInst(CmpInst *CI, BasicBlock *BB,
@@ -5803,8 +5763,8 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
      // is done when there are exactly two elements since tryToVectorizeList
      // asserts that there are only two values when AllowReorder is true.
      bool AllowReorder = NumElts == 2;
      if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R,
                                            None, AllowReorder)) {
      if (NumElts > 1 &&
          tryToVectorizeList(makeArrayRef(IncIt, NumElts), R, AllowReorder)) {
        // Success start over because instructions might have been changed.
        HaveVectorizedPhiNodes = true;
        Changed = true;
+48 −0
Original line number Diff line number Diff line
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -verify -slp-vectorizer -o - -S -mtriple=x86_64-apple-macosx10.13.0 | FileCheck %s

@global = local_unnamed_addr global [6 x double] zeroinitializer, align 16

define { i64, i64 } @patatino(double %arg) {
; CHECK-LABEL: @patatino(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, <2 x double>* bitcast ([6 x double]* @global to <2 x double>*), align 16
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 2) to <2 x double>*), align 16
; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> undef, double [[ARG:%.*]], i32 0
; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[ARG]], i32 1
; CHECK-NEXT:    [[TMP4:%.*]] = fmul <2 x double> [[TMP3]], [[TMP1]]
; CHECK-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP0]], [[TMP4]]
; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 4) to <2 x double>*), align 16
; CHECK-NEXT:    [[TMP7:%.*]] = fadd <2 x double> [[TMP6]], [[TMP5]]
; CHECK-NEXT:    [[TMP8:%.*]] = fptosi <2 x double> [[TMP7]] to <2 x i32>
; CHECK-NEXT:    [[TMP9:%.*]] = sext <2 x i32> [[TMP8]] to <2 x i64>
; CHECK-NEXT:    [[TMP10:%.*]] = trunc <2 x i64> [[TMP9]] to <2 x i32>
; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i32> [[TMP10]], i32 0
; CHECK-NEXT:    [[TMP12:%.*]] = sext i32 [[TMP11]] to i64
; CHECK-NEXT:    [[TMP16:%.*]] = insertvalue { i64, i64 } undef, i64 [[TMP12]], 0
; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i32> [[TMP10]], i32 1
; CHECK-NEXT:    [[TMP14:%.*]] = sext i32 [[TMP13]] to i64
; CHECK-NEXT:    [[TMP17:%.*]] = insertvalue { i64, i64 } [[TMP16]], i64 [[TMP14]], 1
; CHECK-NEXT:    ret { i64, i64 } [[TMP17]]
;
bb:
  %tmp = load double, double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 0), align 16
  %tmp1 = load double, double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 2), align 16
  %tmp2 = fmul double %tmp1, %arg
  %tmp3 = fadd double %tmp, %tmp2
  %tmp4 = load double, double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 4), align 16
  %tmp5 = fadd double %tmp4, %tmp3
  %tmp6 = fptosi double %tmp5 to i32
  %tmp7 = sext i32 %tmp6 to i64
  %tmp8 = load double, double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 1), align 8
  %tmp9 = load double, double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 3), align 8
  %tmp10 = fmul double %tmp9, %arg
  %tmp11 = fadd double %tmp8, %tmp10
  %tmp12 = load double, double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 5), align 8
  %tmp13 = fadd double %tmp12, %tmp11
  %tmp14 = fptosi double %tmp13 to i32
  %tmp15 = sext i32 %tmp14 to i64
  %tmp16 = insertvalue { i64, i64 } undef, i64 %tmp7, 0
  %tmp17 = insertvalue { i64, i64 } %tmp16, i64 %tmp15, 1
  ret { i64, i64 } %tmp17
}
+110 −110

File changed.

Preview size limit exceeded, changes collapsed.

+140 −22
Original line number Diff line number Diff line
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx | FileCheck %s

; CHECK-LABEL: julia_2xdouble
; CHECK: load <2 x double>
; CHECK: load <2 x double>
; CHECK: fmul <2 x double>
; CHECK: fadd <2 x double>
define void @julia_2xdouble([2 x double]* sret, [2 x double]*, [2 x double]*, [2 x double]*) {
; CHECK-LABEL: @julia_2xdouble(
; CHECK-NEXT:  top:
; CHECK-NEXT:    [[PX0:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[TMP2:%.*]], i64 0, i64 0
; CHECK-NEXT:    [[PY0:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[TMP3:%.*]], i64 0, i64 0
; CHECK-NEXT:    [[PX1:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[TMP2]], i64 0, i64 1
; CHECK-NEXT:    [[TMP4:%.*]] = bitcast double* [[PX0]] to <2 x double>*
; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 4
; CHECK-NEXT:    [[PY1:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[TMP3]], i64 0, i64 1
; CHECK-NEXT:    [[TMP6:%.*]] = bitcast double* [[PY0]] to <2 x double>*
; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[TMP6]], align 4
; CHECK-NEXT:    [[TMP8:%.*]] = fmul <2 x double> [[TMP5]], [[TMP7]]
; CHECK-NEXT:    [[PZ0:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[TMP1:%.*]], i64 0, i64 0
; CHECK-NEXT:    [[PZ1:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[TMP1]], i64 0, i64 1
; CHECK-NEXT:    [[TMP9:%.*]] = bitcast double* [[PZ0]] to <2 x double>*
; CHECK-NEXT:    [[TMP10:%.*]] = load <2 x double>, <2 x double>* [[TMP9]], align 4
; CHECK-NEXT:    [[TMP11:%.*]] = fadd <2 x double> [[TMP8]], [[TMP10]]
; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x double> [[TMP11]], i32 0
; CHECK-NEXT:    [[I0:%.*]] = insertvalue [2 x double] undef, double [[TMP12]], 0
; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x double> [[TMP11]], i32 1
; CHECK-NEXT:    [[I1:%.*]] = insertvalue [2 x double] [[I0]], double [[TMP13]], 1
; CHECK-NEXT:    store [2 x double] [[I1]], [2 x double]* [[TMP0:%.*]], align 4
; CHECK-NEXT:    ret void
;
top:
  %px0 = getelementptr inbounds [2 x double], [2 x double]* %2, i64 0, i64 0
  %x0 = load double, double* %px0, align 4
@@ -29,12 +48,40 @@ top:
  ret void
}

; CHECK-LABEL: julia_4xfloat
; CHECK: load <4 x float>
; CHECK: load <4 x float>
; CHECK: fmul <4 x float>
; CHECK: fadd <4 x float>
define void @julia_4xfloat([4 x float]* sret, [4 x float]*, [4 x float]*, [4 x float]*) {
; CHECK-LABEL: @julia_4xfloat(
; CHECK-NEXT:  top:
; CHECK-NEXT:    [[PX0:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP2:%.*]], i64 0, i64 0
; CHECK-NEXT:    [[PY0:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP3:%.*]], i64 0, i64 0
; CHECK-NEXT:    [[PX1:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP2]], i64 0, i64 1
; CHECK-NEXT:    [[PY1:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP3]], i64 0, i64 1
; CHECK-NEXT:    [[PX2:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP2]], i64 0, i64 2
; CHECK-NEXT:    [[PY2:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP3]], i64 0, i64 2
; CHECK-NEXT:    [[PX3:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP2]], i64 0, i64 3
; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float* [[PX0]] to <4 x float>*
; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[TMP4]], align 4
; CHECK-NEXT:    [[PY3:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP3]], i64 0, i64 3
; CHECK-NEXT:    [[TMP6:%.*]] = bitcast float* [[PY0]] to <4 x float>*
; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[TMP6]], align 4
; CHECK-NEXT:    [[TMP8:%.*]] = fmul <4 x float> [[TMP5]], [[TMP7]]
; CHECK-NEXT:    [[PZ0:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP1:%.*]], i64 0, i64 0
; CHECK-NEXT:    [[PZ1:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP1]], i64 0, i64 1
; CHECK-NEXT:    [[PZ2:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP1]], i64 0, i64 2
; CHECK-NEXT:    [[PZ3:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP1]], i64 0, i64 3
; CHECK-NEXT:    [[TMP9:%.*]] = bitcast float* [[PZ0]] to <4 x float>*
; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x float>, <4 x float>* [[TMP9]], align 4
; CHECK-NEXT:    [[TMP11:%.*]] = fadd <4 x float> [[TMP8]], [[TMP10]]
; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x float> [[TMP11]], i32 0
; CHECK-NEXT:    [[I0:%.*]] = insertvalue [4 x float] undef, float [[TMP12]], 0
; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x float> [[TMP11]], i32 1
; CHECK-NEXT:    [[I1:%.*]] = insertvalue [4 x float] [[I0]], float [[TMP13]], 1
; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x float> [[TMP11]], i32 2
; CHECK-NEXT:    [[I2:%.*]] = insertvalue [4 x float] [[I1]], float [[TMP14]], 2
; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <4 x float> [[TMP11]], i32 3
; CHECK-NEXT:    [[I3:%.*]] = insertvalue [4 x float] [[I2]], float [[TMP15]], 3
; CHECK-NEXT:    store [4 x float] [[I3]], [4 x float]* [[TMP0:%.*]], align 4
; CHECK-NEXT:    ret void
;
top:
  %px0 = getelementptr inbounds [4 x float], [4 x float]* %2, i64 0, i64 0
  %x0 = load float, float* %px0, align 4
@@ -76,9 +123,27 @@ top:
  ret void
}

; CHECK-LABEL: julia_load_array_of_float
; CHECK: fsub <4 x float>
define void @julia_load_array_of_float([4 x float]* %a, [4 x float]* %b, [4 x float]* %c) {
; CHECK-LABEL: @julia_load_array_of_float(
; CHECK-NEXT:  top:
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast [4 x float]* [[A:%.*]] to <4 x float>*
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
; CHECK-NEXT:    [[A_ARR:%.*]] = load [4 x float], [4 x float]* [[A]], align 4
; CHECK-NEXT:    [[TMP2:%.*]] = bitcast [4 x float]* [[B:%.*]] to <4 x float>*
; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[TMP2]], align 4
; CHECK-NEXT:    [[B_ARR:%.*]] = load [4 x float], [4 x float]* [[B]], align 4
; CHECK-NEXT:    [[TMP4:%.*]] = fsub <4 x float> [[TMP1]], [[TMP3]]
; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP4]], i32 0
; CHECK-NEXT:    [[C_ARR0:%.*]] = insertvalue [4 x float] undef, float [[TMP5]], 0
; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[TMP4]], i32 1
; CHECK-NEXT:    [[C_ARR1:%.*]] = insertvalue [4 x float] [[C_ARR0]], float [[TMP6]], 1
; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x float> [[TMP4]], i32 2
; CHECK-NEXT:    [[C_ARR2:%.*]] = insertvalue [4 x float] [[C_ARR1]], float [[TMP7]], 2
; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x float> [[TMP4]], i32 3
; CHECK-NEXT:    [[C_ARR3:%.*]] = insertvalue [4 x float] [[C_ARR2]], float [[TMP8]], 3
; CHECK-NEXT:    store [4 x float] [[C_ARR3]], [4 x float]* [[C:%.*]], align 4
; CHECK-NEXT:    ret void
;
top:
  %a_arr = load [4 x float], [4 x float]* %a, align 4
  %a0 = extractvalue [4 x float] %a_arr, 0
@@ -102,11 +167,27 @@ top:
  ret void
}

; CHECK-LABEL: julia_load_array_of_i32
; CHECK: load <4 x i32>
; CHECK: load <4 x i32>
; CHECK: sub <4 x i32>
define void @julia_load_array_of_i32([4 x i32]* %a, [4 x i32]* %b, [4 x i32]* %c) {
; CHECK-LABEL: @julia_load_array_of_i32(
; CHECK-NEXT:  top:
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast [4 x i32]* [[A:%.*]] to <4 x i32>*
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
; CHECK-NEXT:    [[A_ARR:%.*]] = load [4 x i32], [4 x i32]* [[A]], align 4
; CHECK-NEXT:    [[TMP2:%.*]] = bitcast [4 x i32]* [[B:%.*]] to <4 x i32>*
; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
; CHECK-NEXT:    [[B_ARR:%.*]] = load [4 x i32], [4 x i32]* [[B]], align 4
; CHECK-NEXT:    [[TMP4:%.*]] = sub <4 x i32> [[TMP1]], [[TMP3]]
; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0
; CHECK-NEXT:    [[C_ARR0:%.*]] = insertvalue [4 x i32] undef, i32 [[TMP5]], 0
; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP4]], i32 1
; CHECK-NEXT:    [[C_ARR1:%.*]] = insertvalue [4 x i32] [[C_ARR0]], i32 [[TMP6]], 1
; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2
; CHECK-NEXT:    [[C_ARR2:%.*]] = insertvalue [4 x i32] [[C_ARR1]], i32 [[TMP7]], 2
; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3
; CHECK-NEXT:    [[C_ARR3:%.*]] = insertvalue [4 x i32] [[C_ARR2]], i32 [[TMP8]], 3
; CHECK-NEXT:    store [4 x i32] [[C_ARR3]], [4 x i32]* [[C:%.*]], align 4
; CHECK-NEXT:    ret void
;
top:
  %a_arr = load [4 x i32], [4 x i32]* %a, align 4
  %a0 = extractvalue [4 x i32] %a_arr, 0
@@ -132,9 +213,30 @@ top:

; Almost identical to previous test, but for type that should NOT be vectorized.
;
; CHECK-LABEL: julia_load_array_of_i16
; CHECK-NOT: i2>
define void @julia_load_array_of_i16([4 x i16]* %a, [4 x i16]* %b, [4 x i16]* %c) {
; CHECK-LABEL: @julia_load_array_of_i16(
; CHECK-NEXT:  top:
; CHECK-NEXT:    [[A_ARR:%.*]] = load [4 x i16], [4 x i16]* [[A:%.*]], align 4
; CHECK-NEXT:    [[A0:%.*]] = extractvalue [4 x i16] [[A_ARR]], 0
; CHECK-NEXT:    [[A2:%.*]] = extractvalue [4 x i16] [[A_ARR]], 2
; CHECK-NEXT:    [[A1:%.*]] = extractvalue [4 x i16] [[A_ARR]], 1
; CHECK-NEXT:    [[B_ARR:%.*]] = load [4 x i16], [4 x i16]* [[B:%.*]], align 4
; CHECK-NEXT:    [[B0:%.*]] = extractvalue [4 x i16] [[B_ARR]], 0
; CHECK-NEXT:    [[B2:%.*]] = extractvalue [4 x i16] [[B_ARR]], 2
; CHECK-NEXT:    [[B1:%.*]] = extractvalue [4 x i16] [[B_ARR]], 1
; CHECK-NEXT:    [[A3:%.*]] = extractvalue [4 x i16] [[A_ARR]], 3
; CHECK-NEXT:    [[C1:%.*]] = sub i16 [[A1]], [[B1]]
; CHECK-NEXT:    [[B3:%.*]] = extractvalue [4 x i16] [[B_ARR]], 3
; CHECK-NEXT:    [[C0:%.*]] = sub i16 [[A0]], [[B0]]
; CHECK-NEXT:    [[C2:%.*]] = sub i16 [[A2]], [[B2]]
; CHECK-NEXT:    [[C_ARR0:%.*]] = insertvalue [4 x i16] undef, i16 [[C0]], 0
; CHECK-NEXT:    [[C_ARR1:%.*]] = insertvalue [4 x i16] [[C_ARR0]], i16 [[C1]], 1
; CHECK-NEXT:    [[C3:%.*]] = sub i16 [[A3]], [[B3]]
; CHECK-NEXT:    [[C_ARR2:%.*]] = insertvalue [4 x i16] [[C_ARR1]], i16 [[C2]], 2
; CHECK-NEXT:    [[C_ARR3:%.*]] = insertvalue [4 x i16] [[C_ARR2]], i16 [[C3]], 3
; CHECK-NEXT:    store [4 x i16] [[C_ARR3]], [4 x i16]* [[C:%.*]], align 4
; CHECK-NEXT:    ret void
;
top:
  %a_arr = load [4 x i16], [4 x i16]* %a, align 4
  %a0 = extractvalue [4 x i16] %a_arr, 0
@@ -160,11 +262,27 @@ top:

%pseudovec = type { float, float, float, float }

; CHECK-LABEL: julia_load_struct_of_float
; CHECK: load <4 x float>
; CHECK: load <4 x float>
; CHECK: fsub <4 x float>
define void @julia_load_struct_of_float(%pseudovec* %a, %pseudovec* %b, %pseudovec* %c) {
; CHECK-LABEL: @julia_load_struct_of_float(
; CHECK-NEXT:  top:
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast %pseudovec* [[A:%.*]] to <4 x float>*
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
; CHECK-NEXT:    [[A_STRUCT:%.*]] = load [[PSEUDOVEC:%.*]], %pseudovec* [[A]], align 4
; CHECK-NEXT:    [[TMP2:%.*]] = bitcast %pseudovec* [[B:%.*]] to <4 x float>*
; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[TMP2]], align 4
; CHECK-NEXT:    [[B_STRUCT:%.*]] = load [[PSEUDOVEC]], %pseudovec* [[B]], align 4
; CHECK-NEXT:    [[TMP4:%.*]] = fsub <4 x float> [[TMP1]], [[TMP3]]
; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP4]], i32 0
; CHECK-NEXT:    [[C_STRUCT0:%.*]] = insertvalue [[PSEUDOVEC]] undef, float [[TMP5]], 0
; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[TMP4]], i32 1
; CHECK-NEXT:    [[C_STRUCT1:%.*]] = insertvalue [[PSEUDOVEC]] %c_struct0, float [[TMP6]], 1
; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x float> [[TMP4]], i32 2
; CHECK-NEXT:    [[C_STRUCT2:%.*]] = insertvalue [[PSEUDOVEC]] %c_struct1, float [[TMP7]], 2
; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x float> [[TMP4]], i32 3
; CHECK-NEXT:    [[C_STRUCT3:%.*]] = insertvalue [[PSEUDOVEC]] %c_struct2, float [[TMP8]], 3
; CHECK-NEXT:    store [[PSEUDOVEC]] %c_struct3, %pseudovec* [[C:%.*]], align 4
; CHECK-NEXT:    ret void
;
top:
  %a_struct = load %pseudovec, %pseudovec* %a, align 4
  %a0 = extractvalue %pseudovec %a_struct, 0
Loading