Unverified Commit 80aa7e14 authored by Florian Hahn's avatar Florian Hahn
Browse files

[VPlan] Merge predicated-triangle regions, after sinking.

Sinking scalar operands into predicated-triangle regions may allow
merging regions. This patch adds a VPlan-to-VPlan transform that tries
to merge predicate-triangle regions after sinking.

Reviewed By: Ayal

Differential Revision: https://reviews.llvm.org/D100260
parent 69420760
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -9298,6 +9298,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
  }

  VPlanTransforms::sinkScalarOperands(*Plan);
  VPlanTransforms::mergeReplicateRegions(*Plan);

  std::string PlanName;
  raw_string_ostream RSO(PlanName);
+135 −0
Original line number Diff line number Diff line
@@ -148,3 +148,138 @@ bool VPlanTransforms::sinkScalarOperands(VPlan &Plan) {
  }
  return Changed;
}

/// If \p R is a region with a VPBranchOnMaskRecipe in the entry block, return
/// the mask.
VPValue *getPredicatedMask(VPRegionBlock *R) {
  auto *EntryBB = dyn_cast<VPBasicBlock>(R->getEntry());
  if (!EntryBB || EntryBB->size() != 1 ||
      !isa<VPBranchOnMaskRecipe>(EntryBB->begin()))
    return nullptr;

  return cast<VPBranchOnMaskRecipe>(&*EntryBB->begin())->getOperand(0);
}

/// If \p R is a triangle region, return the 'then' block of the triangle.
static VPBasicBlock *getPredicatedThenBlock(VPRegionBlock *R) {
  auto *EntryBB = cast<VPBasicBlock>(R->getEntry());
  if (EntryBB->getNumSuccessors() != 2)
    return nullptr;

  auto *Succ0 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[0]);
  auto *Succ1 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[1]);
  if (!Succ0 || !Succ1)
    return nullptr;

  if (Succ0->getNumSuccessors() + Succ1->getNumSuccessors() != 1)
    return nullptr;
  if (Succ0->getSingleSuccessor() == Succ1)
    return Succ0;
  if (Succ1->getSingleSuccessor() == Succ0)
    return Succ1;
  return nullptr;
}

bool VPlanTransforms::mergeReplicateRegions(VPlan &Plan) {
  SetVector<VPRegionBlock *> DeletedRegions;
  bool Changed = false;

  // Collect region blocks to process up-front, to avoid iterator invalidation
  // issues while merging regions.
  SmallVector<VPRegionBlock *, 8> CandidateRegions(
      VPBlockUtils::blocksOnly<VPRegionBlock>(depth_first(
          VPBlockRecursiveTraversalWrapper<VPBlockBase *>(Plan.getEntry()))));

  // Check if Base is a predicated triangle, followed by an empty block,
  // followed by another predicate triangle. If that's the case, move the
  // recipes from the first to the second triangle.
  for (VPRegionBlock *Region1 : CandidateRegions) {
    if (DeletedRegions.contains(Region1))
      continue;
    auto *MiddleBasicBlock =
        dyn_cast_or_null<VPBasicBlock>(Region1->getSingleSuccessor());
    if (!MiddleBasicBlock || !MiddleBasicBlock->empty())
      continue;

    auto *Region2 =
        dyn_cast_or_null<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
    if (!Region2)
      continue;

    VPValue *Mask1 = getPredicatedMask(Region1);
    VPValue *Mask2 = getPredicatedMask(Region2);
    if (!Mask1 || Mask1 != Mask2)
      continue;
    VPBasicBlock *Then1 = getPredicatedThenBlock(Region1);
    VPBasicBlock *Then2 = getPredicatedThenBlock(Region2);
    if (!Then1 || !Then2)
      continue;

    assert(Mask1 && Mask2 && "both region must have conditions");

    // Note: No fusion-preventing memory dependencies are expected in either
    // region. Such dependencies should be rejected during earlier dependence
    // checks, which guarantee accesses can be re-ordered for vectorization.
    //
    // If a recipe is used by a first-order recurrence phi, we cannot move it at
    // the moment: a recipe R feeding a first order recurrence phi must allow
    // for a *vector* shuffle to be inserted immediately after it, and therefore
    // if R is *scalarized and predicated* it must appear last in its basic
    // block. In addition, other recipes may need to "sink after" R, so best if
    // R not be moved at all.
    auto IsImmovableRecipe = [](VPRecipeBase &R) {
      assert(R.getNumDefinedValues() <= 1 &&
             "no multi-defs are expected in predicated blocks");
      for (VPUser *U : R.getVPValue()->users()) {
        auto *UI = dyn_cast<VPRecipeBase>(U);
        if (!UI)
          continue;
        auto *PhiR = dyn_cast<VPWidenPHIRecipe>(UI);
        if (PhiR && !PhiR->getRecurrenceDescriptor())
          return true;
      }
      return false;
    };
    if (any_of(*Then1, IsImmovableRecipe))
      continue;

    // Move recipes to the successor region.
    for (VPRecipeBase &ToMove : make_early_inc_range(reverse(*Then1)))
      ToMove.moveBefore(*Then2, Then2->getFirstNonPhi());

    auto *Merge1 = cast<VPBasicBlock>(Then1->getSingleSuccessor());
    auto *Merge2 = cast<VPBasicBlock>(Then2->getSingleSuccessor());

    // Move VPPredInstPHIRecipes from the merge block to the successor region's
    // merge block. Update all users inside the successor region to use the
    // original values.
    for (VPRecipeBase &Phi1ToMove : make_early_inc_range(reverse(*Merge1))) {
      VPValue *PredInst1 =
          cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0);
      for (VPUser *U : Phi1ToMove.getVPValue()->users()) {
        auto *UI = dyn_cast<VPRecipeBase>(U);
        if (!UI || UI->getParent() != Then2)
          continue;
        for (unsigned I = 0, E = U->getNumOperands(); I != E; ++I) {
          if (Phi1ToMove.getVPValue() != U->getOperand(I))
            continue;
          U->setOperand(I, PredInst1);
        }
      }

      Phi1ToMove.moveBefore(*Merge2, Merge2->begin());
    }

    // Finally, remove the first region.
    for (VPBlockBase *Pred : make_early_inc_range(Region1->getPredecessors())) {
      VPBlockUtils::disconnectBlocks(Pred, Region1);
      VPBlockUtils::connectBlocks(Pred, MiddleBasicBlock);
    }
    VPBlockUtils::disconnectBlocks(Region1, MiddleBasicBlock);
    DeletedRegions.insert(Region1);
  }

  for (VPRegionBlock *ToDelete : DeletedRegions)
    delete ToDelete;
  return Changed;
}
+2 −0
Original line number Diff line number Diff line
@@ -30,6 +30,8 @@ struct VPlanTransforms {
      SmallPtrSetImpl<Instruction *> &DeadInstructions, ScalarEvolution &SE);

  static bool sinkScalarOperands(VPlan &Plan);

  static bool mergeReplicateRegions(VPlan &Plan);
};

} // namespace llvm
+8 −18
Original line number Diff line number Diff line
@@ -89,38 +89,28 @@ attributes #0 = { "target-cpu"="knl" }
; FORCE:       vector.body:
; FORCE-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE4:%.*]] ]
; FORCE-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE4]] ]
; FORCE-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
; FORCE-NEXT:    [[TMP1:%.*]] = add i32 [[INDEX]], 1
; FORCE-NEXT:    [[TMP2:%.*]] = icmp ule <2 x i32> [[VEC_IND]], <i32 2, i32 2>
; FORCE-NEXT:    [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0
; FORCE-NEXT:    br i1 [[TMP3]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
; FORCE:       pred.store.if:
; FORCE-NEXT:    store i32 [[TMP0]], i32* @b, align 1
; FORCE-NEXT:    br label [[PRED_STORE_CONTINUE]]
; FORCE:       pred.store.continue:
; FORCE-NEXT:    [[TMP4:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1
; FORCE-NEXT:    br i1 [[TMP4]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]]
; FORCE:       pred.store.if1:
; FORCE-NEXT:    store i32 [[TMP1]], i32* @b, align 1
; FORCE-NEXT:    br label [[PRED_STORE_CONTINUE2]]
; FORCE:       pred.store.continue2:
; FORCE-NEXT:    [[TMP5:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0
; FORCE-NEXT:    br i1 [[TMP5]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
; FORCE-NEXT:    br i1 [[TMP3]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
; FORCE:       pred.load.if:
; FORCE-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
; FORCE-NEXT:    store i32 [[TMP0]], i32* @b, align 1
; FORCE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [3 x i32], [3 x i32]* @a, i32 0, i32 [[TMP0]]
; FORCE-NEXT:    [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 1
; FORCE-NEXT:    [[TMP8:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i32 0
; FORCE-NEXT:    br label [[PRED_LOAD_CONTINUE]]
; FORCE:       pred.load.continue:
; FORCE-NEXT:    [[TMP9:%.*]] = phi <2 x i32> [ poison, [[PRED_STORE_CONTINUE2]] ], [ [[TMP8]], [[PRED_LOAD_IF]] ]
; FORCE-NEXT:    [[TMP9:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP8]], [[PRED_LOAD_IF]] ]
; FORCE-NEXT:    [[TMP10:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1
; FORCE-NEXT:    br i1 [[TMP10]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4]]
; FORCE:       pred.load.if3:
; FORCE:       pred.load.if1:
; FORCE-NEXT:    [[TMP1:%.*]] = add i32 [[INDEX]], 1
; FORCE-NEXT:    store i32 [[TMP1]], i32* @b, align 1
; FORCE-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [3 x i32], [3 x i32]* @a, i32 0, i32 [[TMP1]]
; FORCE-NEXT:    [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 1
; FORCE-NEXT:    [[TMP13:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP12]], i32 1
; FORCE-NEXT:    br label [[PRED_LOAD_CONTINUE4]]
; FORCE:       pred.load.continue4:
; FORCE:       pred.load.continue2:
; FORCE-NEXT:    [[TMP14:%.*]] = phi <2 x i32> [ [[TMP9]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP13]], [[PRED_LOAD_IF3]] ]
; FORCE-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 2
; FORCE-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
+148 −244

File changed.

Preview size limit exceeded, changes collapsed.

Loading