Loading llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +80 −69 Original line number Diff line number Diff line Loading @@ -2687,8 +2687,10 @@ Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass) { Value *Count = getOrCreateTripCount(L); BasicBlock *BB = L->getLoopPreheader(); IRBuilder<> Builder(BB->getTerminator()); // Reuse existing vector loop preheader for TC checks. // Note that new preheader block is generated for vector loop. BasicBlock *const TCCheckBlock = LoopVectorPreHeader; IRBuilder<> Builder(TCCheckBlock->getTerminator()); // Generate code to check if the loop's trip count is less than VF * UF, or // equal to it in case a scalar epilogue is required; this implies that the Loading @@ -2705,48 +2707,55 @@ void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, P, Count, ConstantInt::get(Count->getType(), VF * UF), "min.iters.check"); BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); // Create new preheader for vector loop. LoopVectorPreHeader = TCCheckBlock->splitBasicBlock( TCCheckBlock->getTerminator(), "vector.ph"); // Update dominator tree immediately if the generated block is a // LoopBypassBlock because SCEV expansions to generate loop bypass // checks may query it before the current function is finished. DT->addNewBlock(NewBB, BB); DT->addNewBlock(LoopVectorPreHeader, TCCheckBlock); if (L->getParentLoop()) L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); ReplaceInstWithInst(BB->getTerminator(), BranchInst::Create(Bypass, NewBB, CheckMinIters)); LoopBypassBlocks.push_back(BB); L->getParentLoop()->addBasicBlockToLoop(LoopVectorPreHeader, *LI); ReplaceInstWithInst( TCCheckBlock->getTerminator(), BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); LoopBypassBlocks.push_back(TCCheckBlock); } void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { BasicBlock *BB = L->getLoopPreheader(); // Reuse existing vector loop preheader for SCEV checks. // Note that new preheader block is generated for vector loop. BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader; // Generate the code to check that the SCEV assumptions that we made. // We want the new basic block to start at the first instruction in a // sequence of instructions that form a check. SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(), "scev.check"); Value *SCEVCheck = Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator()); Value *SCEVCheck = Exp.expandCodeForPredicate( &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator()); if (auto *C = dyn_cast<ConstantInt>(SCEVCheck)) if (C->isZero()) return; assert(!BB->getParent()->hasOptSize() && assert(!SCEVCheckBlock->getParent()->hasOptSize() && "Cannot SCEV check stride or overflow when optimizing for size"); // Create a new block containing the stride check. BB->setName("vector.scevcheck"); auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); SCEVCheckBlock->setName("vector.scevcheck"); // Create new preheader for vector loop. LoopVectorPreHeader = SCEVCheckBlock->splitBasicBlock( SCEVCheckBlock->getTerminator(), "vector.ph"); // Update dominator tree immediately if the generated block is a // LoopBypassBlock because SCEV expansions to generate loop bypass // checks may query it before the current function is finished. DT->addNewBlock(NewBB, BB); DT->addNewBlock(LoopVectorPreHeader, SCEVCheckBlock); if (L->getParentLoop()) L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); ReplaceInstWithInst(BB->getTerminator(), BranchInst::Create(Bypass, NewBB, SCEVCheck)); LoopBypassBlocks.push_back(BB); L->getParentLoop()->addBasicBlockToLoop(LoopVectorPreHeader, *LI); ReplaceInstWithInst( SCEVCheckBlock->getTerminator(), BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck)); LoopBypassBlocks.push_back(SCEVCheckBlock); AddedSafetyChecks = true; } Loading @@ -2755,7 +2764,9 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { if (EnableVPlanNativePath) return; BasicBlock *BB = L->getLoopPreheader(); // Reuse existing vector loop preheader for runtime memory checks. // Note that new preheader block is generated for vector loop. BasicBlock *const MemCheckBlock = L->getLoopPreheader(); // Generate the code that checks in runtime if arrays overlap. We put the // checks into a separate block to make the more common case of few elements Loading @@ -2763,11 +2774,11 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { Instruction *FirstCheckInst; Instruction *MemRuntimeCheck; std::tie(FirstCheckInst, MemRuntimeCheck) = Legal->getLAI()->addRuntimeChecks(BB->getTerminator()); Legal->getLAI()->addRuntimeChecks(MemCheckBlock->getTerminator()); if (!MemRuntimeCheck) return; if (BB->getParent()->hasOptSize()) { if (MemCheckBlock->getParent()->hasOptSize()) { assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && "Cannot emit memory checks when optimizing for size, unless forced " "to vectorize."); Loading @@ -2781,18 +2792,21 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { }); } // Create a new block containing the memory check. BB->setName("vector.memcheck"); auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); MemCheckBlock->setName("vector.memcheck"); // Create new preheader for vector loop. LoopVectorPreHeader = MemCheckBlock->splitBasicBlock( MemCheckBlock->getTerminator(), "vector.ph"); // Update dominator tree immediately if the generated block is a // LoopBypassBlock because SCEV expansions to generate loop bypass // checks may query it before the current function is finished. DT->addNewBlock(NewBB, BB); DT->addNewBlock(LoopVectorPreHeader, MemCheckBlock); if (L->getParentLoop()) L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); ReplaceInstWithInst(BB->getTerminator(), BranchInst::Create(Bypass, NewBB, MemRuntimeCheck)); LoopBypassBlocks.push_back(BB); L->getParentLoop()->addBasicBlockToLoop(LoopVectorPreHeader, *LI); ReplaceInstWithInst( MemCheckBlock->getTerminator(), BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheck)); LoopBypassBlocks.push_back(MemCheckBlock); AddedSafetyChecks = true; // We currently don't use LoopVersioning for the actual loop cloning but we Loading Loading @@ -2923,12 +2937,7 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { ... */ BasicBlock *OldBasicBlock = OrigLoop->getHeader(); BasicBlock *VectorPH = OrigLoop->getLoopPreheader(); BasicBlock *ExitBlock = OrigLoop->getExitBlock(); MDNode *OrigLoopID = OrigLoop->getLoopID(); assert(VectorPH && "Invalid loop structure"); assert(ExitBlock && "Must have an exit block"); // Some loops have a single integer induction variable, while other loops // don't. One example is c++ iterators that often have multiple pointer Loading @@ -2945,12 +2954,18 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { Type *IdxTy = Legal->getWidestInductionType(); // Split the single block loop into the two loop structure described above. BasicBlock *VecBody = VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body"); BasicBlock *MiddleBlock = VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block"); BasicBlock *ScalarPH = MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph"); LoopScalarBody = OrigLoop->getHeader(); LoopVectorPreHeader = OrigLoop->getLoopPreheader(); LoopExitBlock = OrigLoop->getExitBlock(); assert(LoopVectorPreHeader && "Invalid loop structure"); assert(LoopExitBlock && "Must have an exit block"); LoopVectorBody = LoopVectorPreHeader->splitBasicBlock( LoopVectorPreHeader->getTerminator(), "vector.body"); LoopMiddleBlock = LoopVectorBody->splitBasicBlock( LoopVectorBody->getTerminator(), "middle.block"); LoopScalarPreHeader = LoopMiddleBlock->splitBasicBlock( LoopMiddleBlock->getTerminator(), "scalar.ph"); // Create and register the new vector loop. Loop *Lp = LI->AllocateLoop(); Loading @@ -2960,12 +2975,12 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { // before calling any utilities such as SCEV that require valid LoopInfo. if (ParentLoop) { ParentLoop->addChildLoop(Lp); ParentLoop->addBasicBlockToLoop(ScalarPH, *LI); ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI); ParentLoop->addBasicBlockToLoop(LoopScalarPreHeader, *LI); ParentLoop->addBasicBlockToLoop(LoopMiddleBlock, *LI); } else { LI->addTopLevelLoop(Lp); } Lp->addBasicBlockToLoop(VecBody, *LI); Lp->addBasicBlockToLoop(LoopVectorBody, *LI); // Find the loop boundaries. Value *Count = getOrCreateTripCount(Lp); Loading @@ -2977,16 +2992,16 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { // backedge-taken count is uint##_max: adding one to it will overflow leading // to an incorrect trip count of zero. In this (rare) case we will also jump // to the scalar loop. emitMinimumIterationCountCheck(Lp, ScalarPH); emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); // Generate the code to check any assumptions that we've made for SCEV // expressions. emitSCEVChecks(Lp, ScalarPH); emitSCEVChecks(Lp, LoopScalarPreHeader); // Generate the code that checks in runtime if arrays overlap. We put the // checks into a separate block to make the more common case of few elements // faster. emitMemRuntimeChecks(Lp, ScalarPH); emitMemRuntimeChecks(Lp, LoopScalarPreHeader); // Generate the induction variable. // The loop step is equal to the vectorization factor (num of SIMD elements) Loading Loading @@ -3014,8 +3029,9 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { InductionDescriptor II = InductionEntry.second; // Create phi nodes to merge from the backedge-taken check block. PHINode *BCResumeVal = PHINode::Create( OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator()); PHINode *BCResumeVal = PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", LoopScalarPreHeader->getTerminator()); // Copy original phi DL over to the new one. BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); Value *&EndValue = IVEndValues[OrigPhi]; Loading @@ -3028,21 +3044,21 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { Instruction::CastOps CastOp = CastInst::getCastOpcode(CountRoundDown, true, StepType, true); Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd"); const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); EndValue->setName("ind.end"); } // The new PHI merges the original incoming value, in case of a bypass, // or the value at the end of the vectorized loop. BCResumeVal->addIncoming(EndValue, MiddleBlock); BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); // Fix the scalar body counter (PHI node). // The old induction's phi node in the scalar body needs the truncated // value. for (BasicBlock *BB : LoopBypassBlocks) BCResumeVal->addIncoming(II.getStartValue(), BB); OrigPhi->setIncomingValueForBlock(ScalarPH, BCResumeVal); OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); } // We need the OrigLoop (scalar loop part) latch terminator to help Loading @@ -3060,9 +3076,9 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { // If tail is to be folded, we know we don't need to run the remainder. Value *CmpN = Builder.getTrue(); if (!Cost->foldTailByMasking()) { CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, CountRoundDown, "cmp.n", MiddleBlock->getTerminator()); CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, CountRoundDown, "cmp.n", LoopMiddleBlock->getTerminator()); // Here we use the same DebugLoc as the scalar loop latch branch instead // of the corresponding compare because they may have ended up with Loading @@ -3071,20 +3087,15 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc()); } BranchInst *BrInst = BranchInst::Create(ExitBlock, ScalarPH, CmpN); BranchInst *BrInst = BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN); BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc()); ReplaceInstWithInst(MiddleBlock->getTerminator(), BrInst); ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); // Get ready to start creating new instructions into the vectorized body. Builder.SetInsertPoint(&*VecBody->getFirstInsertionPt()); // Save the state. LoopVectorPreHeader = Lp->getLoopPreheader(); LoopScalarPreHeader = ScalarPH; LoopMiddleBlock = MiddleBlock; LoopExitBlock = ExitBlock; LoopVectorBody = VecBody; LoopScalarBody = OldBasicBlock; assert(LoopVectorPreHeader == Lp->getLoopPreheader() && "Inconsistent vector loop preheader"); Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); Optional<MDNode *> VectorizedLoopID = makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, Loading Loading
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +80 −69 Original line number Diff line number Diff line Loading @@ -2687,8 +2687,10 @@ Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass) { Value *Count = getOrCreateTripCount(L); BasicBlock *BB = L->getLoopPreheader(); IRBuilder<> Builder(BB->getTerminator()); // Reuse existing vector loop preheader for TC checks. // Note that new preheader block is generated for vector loop. BasicBlock *const TCCheckBlock = LoopVectorPreHeader; IRBuilder<> Builder(TCCheckBlock->getTerminator()); // Generate code to check if the loop's trip count is less than VF * UF, or // equal to it in case a scalar epilogue is required; this implies that the Loading @@ -2705,48 +2707,55 @@ void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, P, Count, ConstantInt::get(Count->getType(), VF * UF), "min.iters.check"); BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); // Create new preheader for vector loop. LoopVectorPreHeader = TCCheckBlock->splitBasicBlock( TCCheckBlock->getTerminator(), "vector.ph"); // Update dominator tree immediately if the generated block is a // LoopBypassBlock because SCEV expansions to generate loop bypass // checks may query it before the current function is finished. DT->addNewBlock(NewBB, BB); DT->addNewBlock(LoopVectorPreHeader, TCCheckBlock); if (L->getParentLoop()) L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); ReplaceInstWithInst(BB->getTerminator(), BranchInst::Create(Bypass, NewBB, CheckMinIters)); LoopBypassBlocks.push_back(BB); L->getParentLoop()->addBasicBlockToLoop(LoopVectorPreHeader, *LI); ReplaceInstWithInst( TCCheckBlock->getTerminator(), BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); LoopBypassBlocks.push_back(TCCheckBlock); } void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { BasicBlock *BB = L->getLoopPreheader(); // Reuse existing vector loop preheader for SCEV checks. // Note that new preheader block is generated for vector loop. BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader; // Generate the code to check that the SCEV assumptions that we made. // We want the new basic block to start at the first instruction in a // sequence of instructions that form a check. SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(), "scev.check"); Value *SCEVCheck = Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator()); Value *SCEVCheck = Exp.expandCodeForPredicate( &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator()); if (auto *C = dyn_cast<ConstantInt>(SCEVCheck)) if (C->isZero()) return; assert(!BB->getParent()->hasOptSize() && assert(!SCEVCheckBlock->getParent()->hasOptSize() && "Cannot SCEV check stride or overflow when optimizing for size"); // Create a new block containing the stride check. BB->setName("vector.scevcheck"); auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); SCEVCheckBlock->setName("vector.scevcheck"); // Create new preheader for vector loop. LoopVectorPreHeader = SCEVCheckBlock->splitBasicBlock( SCEVCheckBlock->getTerminator(), "vector.ph"); // Update dominator tree immediately if the generated block is a // LoopBypassBlock because SCEV expansions to generate loop bypass // checks may query it before the current function is finished. DT->addNewBlock(NewBB, BB); DT->addNewBlock(LoopVectorPreHeader, SCEVCheckBlock); if (L->getParentLoop()) L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); ReplaceInstWithInst(BB->getTerminator(), BranchInst::Create(Bypass, NewBB, SCEVCheck)); LoopBypassBlocks.push_back(BB); L->getParentLoop()->addBasicBlockToLoop(LoopVectorPreHeader, *LI); ReplaceInstWithInst( SCEVCheckBlock->getTerminator(), BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck)); LoopBypassBlocks.push_back(SCEVCheckBlock); AddedSafetyChecks = true; } Loading @@ -2755,7 +2764,9 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { if (EnableVPlanNativePath) return; BasicBlock *BB = L->getLoopPreheader(); // Reuse existing vector loop preheader for runtime memory checks. // Note that new preheader block is generated for vector loop. BasicBlock *const MemCheckBlock = L->getLoopPreheader(); // Generate the code that checks in runtime if arrays overlap. We put the // checks into a separate block to make the more common case of few elements Loading @@ -2763,11 +2774,11 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { Instruction *FirstCheckInst; Instruction *MemRuntimeCheck; std::tie(FirstCheckInst, MemRuntimeCheck) = Legal->getLAI()->addRuntimeChecks(BB->getTerminator()); Legal->getLAI()->addRuntimeChecks(MemCheckBlock->getTerminator()); if (!MemRuntimeCheck) return; if (BB->getParent()->hasOptSize()) { if (MemCheckBlock->getParent()->hasOptSize()) { assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && "Cannot emit memory checks when optimizing for size, unless forced " "to vectorize."); Loading @@ -2781,18 +2792,21 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { }); } // Create a new block containing the memory check. BB->setName("vector.memcheck"); auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); MemCheckBlock->setName("vector.memcheck"); // Create new preheader for vector loop. LoopVectorPreHeader = MemCheckBlock->splitBasicBlock( MemCheckBlock->getTerminator(), "vector.ph"); // Update dominator tree immediately if the generated block is a // LoopBypassBlock because SCEV expansions to generate loop bypass // checks may query it before the current function is finished. DT->addNewBlock(NewBB, BB); DT->addNewBlock(LoopVectorPreHeader, MemCheckBlock); if (L->getParentLoop()) L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); ReplaceInstWithInst(BB->getTerminator(), BranchInst::Create(Bypass, NewBB, MemRuntimeCheck)); LoopBypassBlocks.push_back(BB); L->getParentLoop()->addBasicBlockToLoop(LoopVectorPreHeader, *LI); ReplaceInstWithInst( MemCheckBlock->getTerminator(), BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheck)); LoopBypassBlocks.push_back(MemCheckBlock); AddedSafetyChecks = true; // We currently don't use LoopVersioning for the actual loop cloning but we Loading Loading @@ -2923,12 +2937,7 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { ... */ BasicBlock *OldBasicBlock = OrigLoop->getHeader(); BasicBlock *VectorPH = OrigLoop->getLoopPreheader(); BasicBlock *ExitBlock = OrigLoop->getExitBlock(); MDNode *OrigLoopID = OrigLoop->getLoopID(); assert(VectorPH && "Invalid loop structure"); assert(ExitBlock && "Must have an exit block"); // Some loops have a single integer induction variable, while other loops // don't. One example is c++ iterators that often have multiple pointer Loading @@ -2945,12 +2954,18 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { Type *IdxTy = Legal->getWidestInductionType(); // Split the single block loop into the two loop structure described above. BasicBlock *VecBody = VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body"); BasicBlock *MiddleBlock = VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block"); BasicBlock *ScalarPH = MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph"); LoopScalarBody = OrigLoop->getHeader(); LoopVectorPreHeader = OrigLoop->getLoopPreheader(); LoopExitBlock = OrigLoop->getExitBlock(); assert(LoopVectorPreHeader && "Invalid loop structure"); assert(LoopExitBlock && "Must have an exit block"); LoopVectorBody = LoopVectorPreHeader->splitBasicBlock( LoopVectorPreHeader->getTerminator(), "vector.body"); LoopMiddleBlock = LoopVectorBody->splitBasicBlock( LoopVectorBody->getTerminator(), "middle.block"); LoopScalarPreHeader = LoopMiddleBlock->splitBasicBlock( LoopMiddleBlock->getTerminator(), "scalar.ph"); // Create and register the new vector loop. Loop *Lp = LI->AllocateLoop(); Loading @@ -2960,12 +2975,12 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { // before calling any utilities such as SCEV that require valid LoopInfo. if (ParentLoop) { ParentLoop->addChildLoop(Lp); ParentLoop->addBasicBlockToLoop(ScalarPH, *LI); ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI); ParentLoop->addBasicBlockToLoop(LoopScalarPreHeader, *LI); ParentLoop->addBasicBlockToLoop(LoopMiddleBlock, *LI); } else { LI->addTopLevelLoop(Lp); } Lp->addBasicBlockToLoop(VecBody, *LI); Lp->addBasicBlockToLoop(LoopVectorBody, *LI); // Find the loop boundaries. Value *Count = getOrCreateTripCount(Lp); Loading @@ -2977,16 +2992,16 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { // backedge-taken count is uint##_max: adding one to it will overflow leading // to an incorrect trip count of zero. In this (rare) case we will also jump // to the scalar loop. emitMinimumIterationCountCheck(Lp, ScalarPH); emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); // Generate the code to check any assumptions that we've made for SCEV // expressions. emitSCEVChecks(Lp, ScalarPH); emitSCEVChecks(Lp, LoopScalarPreHeader); // Generate the code that checks in runtime if arrays overlap. We put the // checks into a separate block to make the more common case of few elements // faster. emitMemRuntimeChecks(Lp, ScalarPH); emitMemRuntimeChecks(Lp, LoopScalarPreHeader); // Generate the induction variable. // The loop step is equal to the vectorization factor (num of SIMD elements) Loading Loading @@ -3014,8 +3029,9 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { InductionDescriptor II = InductionEntry.second; // Create phi nodes to merge from the backedge-taken check block. PHINode *BCResumeVal = PHINode::Create( OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator()); PHINode *BCResumeVal = PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", LoopScalarPreHeader->getTerminator()); // Copy original phi DL over to the new one. BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); Value *&EndValue = IVEndValues[OrigPhi]; Loading @@ -3028,21 +3044,21 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { Instruction::CastOps CastOp = CastInst::getCastOpcode(CountRoundDown, true, StepType, true); Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd"); const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); EndValue->setName("ind.end"); } // The new PHI merges the original incoming value, in case of a bypass, // or the value at the end of the vectorized loop. BCResumeVal->addIncoming(EndValue, MiddleBlock); BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); // Fix the scalar body counter (PHI node). // The old induction's phi node in the scalar body needs the truncated // value. for (BasicBlock *BB : LoopBypassBlocks) BCResumeVal->addIncoming(II.getStartValue(), BB); OrigPhi->setIncomingValueForBlock(ScalarPH, BCResumeVal); OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); } // We need the OrigLoop (scalar loop part) latch terminator to help Loading @@ -3060,9 +3076,9 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { // If tail is to be folded, we know we don't need to run the remainder. Value *CmpN = Builder.getTrue(); if (!Cost->foldTailByMasking()) { CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, CountRoundDown, "cmp.n", MiddleBlock->getTerminator()); CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, CountRoundDown, "cmp.n", LoopMiddleBlock->getTerminator()); // Here we use the same DebugLoc as the scalar loop latch branch instead // of the corresponding compare because they may have ended up with Loading @@ -3071,20 +3087,15 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc()); } BranchInst *BrInst = BranchInst::Create(ExitBlock, ScalarPH, CmpN); BranchInst *BrInst = BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN); BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc()); ReplaceInstWithInst(MiddleBlock->getTerminator(), BrInst); ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); // Get ready to start creating new instructions into the vectorized body. Builder.SetInsertPoint(&*VecBody->getFirstInsertionPt()); // Save the state. LoopVectorPreHeader = Lp->getLoopPreheader(); LoopScalarPreHeader = ScalarPH; LoopMiddleBlock = MiddleBlock; LoopExitBlock = ExitBlock; LoopVectorBody = VecBody; LoopScalarBody = OldBasicBlock; assert(LoopVectorPreHeader == Lp->getLoopPreheader() && "Inconsistent vector loop preheader"); Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); Optional<MDNode *> VectorizedLoopID = makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, Loading