Commit 6ed9cef2 authored by Ayal Zaks's avatar Ayal Zaks
Browse files

[LV] Scalar with predication must not be uniform

Fix PR40816: avoid considering scalar-with-predication instructions as also
uniform-after-vectorization.

Instructions identified as "scalar with predication" will be "vectorized" using
a replicating region. If such instructions are also optimized as "uniform after
vectorization", namely when only the first of VF lanes is used, such a
replicating region becomes erroneous - only the first instance of the region can
and should be formed. Fix such cases by not considering such instructions as
"uniform after vectorization".

Differential Revision: https://reviews.llvm.org/D70298
parent 96c8024e
Loading
Loading
Loading
Loading
+22 −17
Original line number Diff line number Diff line
@@ -4668,14 +4668,26 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
  SetVector<Instruction *> Worklist;
  BasicBlock *Latch = TheLoop->getLoopLatch();

  // Instructions that are scalar with predication must not be considered
  // uniform after vectorization, because that would create an erroneous
  // replicating region where only a single instance out of VF should be formed.
  // TODO: optimize such seldom cases if found important, see PR40816.
  auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
    if (isScalarWithPredication(I, VF)) {
      LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
                        << *I << "\n");
      return;
    }
    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
    Worklist.insert(I);
  };

  // Start with the conditional branch. If the branch condition is an
  // instruction contained in the loop that is only used by the branch, it is
  // uniform.
  auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
  if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) {
    Worklist.insert(Cmp);
    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n");
  }
  if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
    addToWorklistIfAllowed(Cmp);

  // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
  // are pointers that are treated like consecutive pointers during
@@ -4734,10 +4746,8 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
  // Add to the Worklist all consecutive and consecutive-like pointers that
  // aren't also identified as possibly non-uniform.
  for (auto *V : ConsecutiveLikePtrs)
    if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end()) {
      LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n");
      Worklist.insert(V);
    }
    if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end())
      addToWorklistIfAllowed(V);

  // Expand Worklist in topological order: whenever a new instruction
  // is added , its users should be already inside Worklist.  It ensures
@@ -4763,10 +4773,8 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
            return Worklist.count(J) ||
                   (OI == getLoadStorePointerOperand(J) &&
                    isUniformDecision(J, VF));
          })) {
        Worklist.insert(OI);
        LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n");
      }
          }))
        addToWorklistIfAllowed(OI);
    }
  }

@@ -4808,11 +4816,8 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
      continue;

    // The induction variable and its update instruction will remain uniform.
    Worklist.insert(Ind);
    Worklist.insert(IndUpdate);
    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Ind << "\n");
    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate
                      << "\n");
    addToWorklistIfAllowed(Ind);
    addToWorklistIfAllowed(IndUpdate);
  }

  Uniforms[VF].insert(Worklist.begin(), Worklist.end());
+83 −0
Original line number Diff line number Diff line
; REQUIRES: asserts
; RUN: opt < %s -loop-vectorize -instcombine -S -debug-only=loop-vectorize -disable-output -print-after=instcombine 2>&1 | FileCheck %s
; RUN: opt < %s -loop-vectorize -force-vector-width=2 -S | FileCheck %s -check-prefix=FORCE

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
@@ -65,3 +66,85 @@ for.end:
}

attributes #0 = { "target-cpu"="knl" }

; CHECK-LABEL: PR40816
;
; Check that scalar with predication instructions are not considered uniform
; after vectorization, because that results in replicating a region instead of
; having a single instance (out of VF). The predication stems from a tiny count
; of 3 leading to folding the tail by masking using icmp ule <i, i+1> <= <2, 2>.
;
; CHECK:     LV: Found trip count: 3
; CHECK:     LV: Found uniform instruction:   {{%.*}} = icmp eq i32 {{%.*}}, 0
; CHECK-NOT: LV: Found uniform instruction:   {{%.*}} = load i32, i32* {{%.*}}, align 1
; CHECK:     LV: Found not uniform being ScalarWithPredication:  {{%.*}} = load i32, i32* {{%.*}}, align 1
; CHECK:     LV: Found scalar instruction:   {{%.*}} = getelementptr inbounds [3 x i32], [3 x i32]* @a, i32 0, i32 {{%.*}}
;
; FORCE-LABEL: @PR40816(
; FORCE-NEXT:  entry:
; FORCE-NEXT:    br i1 false, label {{%.*}}, label [[VECTOR_PH:%.*]]
; FORCE:       vector.ph:
; FORCE-NEXT:    br label [[VECTOR_BODY:%.*]]
; FORCE:       vector.body:
; FORCE-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE4:%.*]] ]
; FORCE-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE4]] ]
; FORCE-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
; FORCE-NEXT:    [[TMP1:%.*]] = add i32 [[INDEX]], 1
; FORCE-NEXT:    [[TMP2:%.*]] = icmp ule <2 x i32> [[VEC_IND]], <i32 2, i32 2>
; FORCE-NEXT:    [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0
; FORCE-NEXT:    br i1 [[TMP3]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
; FORCE:       pred.store.if:
; FORCE-NEXT:    store i32 [[TMP0]], i32* @b, align 1
; FORCE-NEXT:    br label [[PRED_STORE_CONTINUE]]
; FORCE:       pred.store.continue:
; FORCE-NEXT:    [[TMP4:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1
; FORCE-NEXT:    br i1 [[TMP4]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]]
; FORCE:       pred.store.if1:
; FORCE-NEXT:    store i32 [[TMP1]], i32* @b, align 1
; FORCE-NEXT:    br label [[PRED_STORE_CONTINUE2]]
; FORCE:       pred.store.continue2:
; FORCE-NEXT:    [[TMP5:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0
; FORCE-NEXT:    br i1 [[TMP5]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
; FORCE:       pred.load.if:
; FORCE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [3 x i32], [3 x i32]* @a, i32 0, i32 [[TMP0]]
; FORCE-NEXT:    [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 1
; FORCE-NEXT:    [[TMP8:%.*]] = insertelement <2 x i32> undef, i32 [[TMP7]], i32 0
; FORCE-NEXT:    br label [[PRED_LOAD_CONTINUE]]
; FORCE:       pred.load.continue:
; FORCE-NEXT:    [[TMP9:%.*]] = phi <2 x i32> [ undef, [[PRED_STORE_CONTINUE2]] ], [ [[TMP8]], [[PRED_LOAD_IF]] ]
; FORCE-NEXT:    [[TMP10:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1
; FORCE-NEXT:    br i1 [[TMP10]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4]]
; FORCE:       pred.load.if3:
; FORCE-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [3 x i32], [3 x i32]* @a, i32 0, i32 [[TMP1]]
; FORCE-NEXT:    [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 1
; FORCE-NEXT:    [[TMP13:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP12]], i32 1
; FORCE-NEXT:    br label [[PRED_LOAD_CONTINUE4]]
; FORCE:       pred.load.continue4:
; FORCE-NEXT:    [[TMP14:%.*]] = phi <2 x i32> [ [[TMP9]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP13]], [[PRED_LOAD_IF3]] ]
; FORCE-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 2
; FORCE-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
; FORCE-NEXT:    [[TMP15:%.*]] = icmp eq i32 [[INDEX_NEXT]], 4
; FORCE-NEXT:    br i1 [[TMP15]], label {{%.*}}, label [[VECTOR_BODY]]
;
@a = internal constant [3 x i32] [i32 7, i32 7, i32 0], align 1
@b = external global i32, align 1

define void @PR40816() #1 {

entry:
  br label %for.body

for.body:                                         ; preds = %for.body, %entry
  %0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  store i32 %0, i32* @b, align 1
  %arrayidx1 = getelementptr inbounds [3 x i32], [3 x i32]* @a, i32 0, i32 %0
  %1 = load i32, i32* %arrayidx1, align 1
  %cmp2 = icmp eq i32 %1, 0
  %inc = add nuw nsw i32 %0, 1
  br i1 %cmp2, label %return, label %for.body

return:                                           ; preds = %for.body
  ret void
}

attributes #1 = { "target-cpu"="core2" }