Loading clang/lib/CodeGen/CGKokkos.cpp +161 −15 Original line number Diff line number Diff line Loading @@ -267,11 +267,160 @@ bool CodeGenFunction::EmitKokkosParallelFor(const CallExpr *CE, return false; } // Check to see if we have an MDRange present if (BE->getStmtClass() == Expr::CXXTemporaryObjectExprClass) { const CXXTemporaryObjectExpr *CXXTO = dyn_cast<CXXTemporaryObjectExpr>(BE); std::string className = CXXTO->getBestDynamicClassType()->getNameAsString(); if (className == "MDRangePolicy") { return EmitKokkosParallelForMD(CE, PFName, BE, Lambda, ForallAttrs); } else { // What should we do here? } } // Create all jump destinations and basic blocks in the order they // appear in the IR. JumpDest Condition = getJumpDestInCurrentScope("kokkos.forall.cond"); llvm::BasicBlock *Detach = createBasicBlock("kokkos.forall.detach"); llvm::BasicBlock *PForBody = createBasicBlock("kokkos.forall.body"); JumpDest Reattach = getJumpDestInCurrentScope("kokkos.forall.reattach"); llvm::BasicBlock *Increment = createBasicBlock("kokkos.forall.inc"); JumpDest Cleanup = getJumpDestInCurrentScope("kokkos.forall.cond.cleanup"); JumpDest Sync = getJumpDestInCurrentScope("kokkos.forall.sync"); llvm::BasicBlock *End = createBasicBlock("kokkos.forall.end"); // Extract a conveince block and setup the lexical scope based on // the lambda's source range. llvm::BasicBlock *ConditionBlock = Condition.getBlock(); const SourceRange &R = CE->getSourceRange(); LexicalScope PForScope(*this, R); // Now we can start the dirty work of transforming the lambda into a // for loop. // The first step is to extract the argument to the lambda and transform it into // the loop induction variable. As part of this we assume the following are true // about the parallel_for: // 1. The iterator can be assigned a value of zero. // 2. We ignore the details of what is captured by the lambda. // // TODO: Do we need to "relax" these assumptions to support broader code coverage? // This is 'equivalent' to the Init statement in a traditional for loop (e.g. int i = 0). const ParmVarDecl *InductionVarDecl; InductionVarDecl = EmitKokkosParallelForInductionVar(Lambda).at(0); // Create the sync region. PushSyncRegion(); llvm::Instruction *SRStart = EmitSyncRegionStart(); CurSyncRegion->setSyncRegionStart(SRStart); // TODO: Need to check attributes for spawning strategy. LoopStack.setSpawnStrategy(LoopAttributes::DAC); EmitBlock(ConditionBlock); LoopStack.push(ConditionBlock, CGM.getContext(), ForallAttrs, SourceLocToDebugLoc(R.getBegin()), SourceLocToDebugLoc(R.getEnd())); // Store the blocks to use for break and continue. BreakContinueStack.push_back(BreakContinue(Reattach, Reattach)); // Create a scope for the condition variable cleanup. LexicalScope ConditionScope(*this, R); // Create the conditional. EmitKokkosParallelForCond(BE, InductionVarDecl, Detach, End, Sync); if (PForScope.requiresCleanups()) { EmitBlock(Cleanup.getBlock()); EmitBranchThroughCleanup(Sync); } // Handle the detach block... EmitBlock(Detach); auto OldAllocaInsertPt = AllocaInsertPt; llvm::Value *Undef = llvm::UndefValue::get(Int32Ty); AllocaInsertPt = new llvm::BitCastInst(Undef, Int32Ty, "", PForBody); llvm::Value *GInductionVar = GetAddrOfLocalVar(InductionVarDecl).getPointer(); llvm::Value *GInductionVal = Builder.CreateLoad(GetAddrOfLocalVar(InductionVarDecl)); QualType RefType = InductionVarDecl->getType(); // Create the detach terminator Builder.CreateDetach(PForBody, Increment, SRStart); EmitBlock(PForBody); incrementProfileCounter(CE); llvm::AllocaInst *TLInductionVar = Builder.CreateAlloca(getTypes().ConvertType(RefType), nullptr, InductionVarDecl->getName() + ".detach"); Builder.CreateAlignedStore(GInductionVal, TLInductionVar, getContext().getTypeAlignInChars(RefType)); { // Create a separate cleanup scope for the body, in case it is not // a compound statement. InKokkosConstruct = true; RunCleanupsScope BodyScope(*this); EmitStmt(Lambda->getBody()); InKokkosConstruct = false; } auto tmp = AllocaInsertPt; AllocaInsertPt = OldAllocaInsertPt; tmp->removeFromParent(); // Modify the body to use the ''detach''-local induction variable. // At this point in the codegen, the body block has been emitted // and we can safely replace the ''sequential`` induction variable // within the detach basic block. llvm::BasicBlock *CurrentBlock = Builder.GetInsertBlock(); for(llvm::Value::use_iterator UI = GInductionVar->use_begin(), UE = GInductionVar->use_end(); UI != UE; ) { llvm::Use &U = *UI++; llvm::Instruction *I = cast<llvm::Instruction>(U.getUser()); if (I->getParent() == CurrentBlock) U.set(TLInductionVar); } EmitBlock(Reattach.getBlock()); Builder.CreateReattach(Increment, SRStart); EmitBlock(Increment); llvm::Value *IncVal = Builder.CreateLoad(GetAddrOfLocalVar(InductionVarDecl)); llvm::Value *One = llvm::ConstantInt::get(ConvertType(InductionVarDecl->getType()), 1); IncVal = Builder.CreateAdd(IncVal, One); Builder.CreateStore(IncVal, GetAddrOfLocalVar(InductionVarDecl)); BreakContinueStack.pop_back(); ConditionScope.ForceCleanup(); EmitStopPoint(CE); EmitBranch(ConditionBlock); PForScope.ForceCleanup(); LoopStack.pop(); EmitBlock(Sync.getBlock()); Builder.CreateSync(End, SRStart); EmitBlock(End, true); return true; } bool CodeGenFunction::EmitKokkosParallelForMD(const CallExpr *CE, std::string PFName, const Expr *BE, const LambdaExpr *Lambda, ArrayRef<const Attr *> ForallAttrs) { // TODO: Need to add code to process any attributes (ForallAttrs). // Build the queue of dimensions (upper bounds) std::vector<const Expr *> DimQueue; std::vector<const Expr *> StartQueue; if (BE->getStmtClass() == Expr::CXXTemporaryObjectExprClass) { const CXXTemporaryObjectExpr *CXXTO = dyn_cast<CXXTemporaryObjectExpr>(BE); const InitListExpr *StartingBounds = dyn_cast<InitListExpr>(CXXTO->getArg(0)->IgnoreImplicit()); const InitListExpr *UpperBounds = dyn_cast<InitListExpr>(CXXTO->getArg(1)->IgnoreImplicit()); Loading @@ -285,9 +434,6 @@ bool CodeGenFunction::EmitKokkosParallelFor(const CallExpr *CE, const Expr *val = UpperBounds->getInit(i)->IgnoreImplicit(); DimQueue.push_back(val); } } else { DimQueue.push_back(BE); } // These are extra steps that we can probably optimize away BE = DimQueue.front(); Loading clang/lib/CodeGen/CodeGenFunction.h +4 −0 Original line number Diff line number Diff line Loading @@ -3526,6 +3526,10 @@ public: llvm::BasicBlock *ExitBlock, JumpDest &Sync); bool EmitKokkosParallelFor(const CallExpr *CE, ArrayRef<const Attr *> Attrs); bool EmitKokkosParallelForMD(const CallExpr *CE, std::string PFName, const Expr *BE, const LambdaExpr *Lambda, ArrayRef<const Attr *> ForallAttrs); bool EmitKokkosInnerLoop(const CallExpr *CE, const LambdaExpr *Lambda, llvm::BasicBlock *TopBlock, std::vector<const Expr*> DimQueue, Loading Loading
clang/lib/CodeGen/CGKokkos.cpp +161 −15 Original line number Diff line number Diff line Loading @@ -267,11 +267,160 @@ bool CodeGenFunction::EmitKokkosParallelFor(const CallExpr *CE, return false; } // Check to see if we have an MDRange present if (BE->getStmtClass() == Expr::CXXTemporaryObjectExprClass) { const CXXTemporaryObjectExpr *CXXTO = dyn_cast<CXXTemporaryObjectExpr>(BE); std::string className = CXXTO->getBestDynamicClassType()->getNameAsString(); if (className == "MDRangePolicy") { return EmitKokkosParallelForMD(CE, PFName, BE, Lambda, ForallAttrs); } else { // What should we do here? } } // Create all jump destinations and basic blocks in the order they // appear in the IR. JumpDest Condition = getJumpDestInCurrentScope("kokkos.forall.cond"); llvm::BasicBlock *Detach = createBasicBlock("kokkos.forall.detach"); llvm::BasicBlock *PForBody = createBasicBlock("kokkos.forall.body"); JumpDest Reattach = getJumpDestInCurrentScope("kokkos.forall.reattach"); llvm::BasicBlock *Increment = createBasicBlock("kokkos.forall.inc"); JumpDest Cleanup = getJumpDestInCurrentScope("kokkos.forall.cond.cleanup"); JumpDest Sync = getJumpDestInCurrentScope("kokkos.forall.sync"); llvm::BasicBlock *End = createBasicBlock("kokkos.forall.end"); // Extract a conveince block and setup the lexical scope based on // the lambda's source range. llvm::BasicBlock *ConditionBlock = Condition.getBlock(); const SourceRange &R = CE->getSourceRange(); LexicalScope PForScope(*this, R); // Now we can start the dirty work of transforming the lambda into a // for loop. // The first step is to extract the argument to the lambda and transform it into // the loop induction variable. As part of this we assume the following are true // about the parallel_for: // 1. The iterator can be assigned a value of zero. // 2. We ignore the details of what is captured by the lambda. // // TODO: Do we need to "relax" these assumptions to support broader code coverage? // This is 'equivalent' to the Init statement in a traditional for loop (e.g. int i = 0). const ParmVarDecl *InductionVarDecl; InductionVarDecl = EmitKokkosParallelForInductionVar(Lambda).at(0); // Create the sync region. PushSyncRegion(); llvm::Instruction *SRStart = EmitSyncRegionStart(); CurSyncRegion->setSyncRegionStart(SRStart); // TODO: Need to check attributes for spawning strategy. LoopStack.setSpawnStrategy(LoopAttributes::DAC); EmitBlock(ConditionBlock); LoopStack.push(ConditionBlock, CGM.getContext(), ForallAttrs, SourceLocToDebugLoc(R.getBegin()), SourceLocToDebugLoc(R.getEnd())); // Store the blocks to use for break and continue. BreakContinueStack.push_back(BreakContinue(Reattach, Reattach)); // Create a scope for the condition variable cleanup. LexicalScope ConditionScope(*this, R); // Create the conditional. EmitKokkosParallelForCond(BE, InductionVarDecl, Detach, End, Sync); if (PForScope.requiresCleanups()) { EmitBlock(Cleanup.getBlock()); EmitBranchThroughCleanup(Sync); } // Handle the detach block... EmitBlock(Detach); auto OldAllocaInsertPt = AllocaInsertPt; llvm::Value *Undef = llvm::UndefValue::get(Int32Ty); AllocaInsertPt = new llvm::BitCastInst(Undef, Int32Ty, "", PForBody); llvm::Value *GInductionVar = GetAddrOfLocalVar(InductionVarDecl).getPointer(); llvm::Value *GInductionVal = Builder.CreateLoad(GetAddrOfLocalVar(InductionVarDecl)); QualType RefType = InductionVarDecl->getType(); // Create the detach terminator Builder.CreateDetach(PForBody, Increment, SRStart); EmitBlock(PForBody); incrementProfileCounter(CE); llvm::AllocaInst *TLInductionVar = Builder.CreateAlloca(getTypes().ConvertType(RefType), nullptr, InductionVarDecl->getName() + ".detach"); Builder.CreateAlignedStore(GInductionVal, TLInductionVar, getContext().getTypeAlignInChars(RefType)); { // Create a separate cleanup scope for the body, in case it is not // a compound statement. InKokkosConstruct = true; RunCleanupsScope BodyScope(*this); EmitStmt(Lambda->getBody()); InKokkosConstruct = false; } auto tmp = AllocaInsertPt; AllocaInsertPt = OldAllocaInsertPt; tmp->removeFromParent(); // Modify the body to use the ''detach''-local induction variable. // At this point in the codegen, the body block has been emitted // and we can safely replace the ''sequential`` induction variable // within the detach basic block. llvm::BasicBlock *CurrentBlock = Builder.GetInsertBlock(); for(llvm::Value::use_iterator UI = GInductionVar->use_begin(), UE = GInductionVar->use_end(); UI != UE; ) { llvm::Use &U = *UI++; llvm::Instruction *I = cast<llvm::Instruction>(U.getUser()); if (I->getParent() == CurrentBlock) U.set(TLInductionVar); } EmitBlock(Reattach.getBlock()); Builder.CreateReattach(Increment, SRStart); EmitBlock(Increment); llvm::Value *IncVal = Builder.CreateLoad(GetAddrOfLocalVar(InductionVarDecl)); llvm::Value *One = llvm::ConstantInt::get(ConvertType(InductionVarDecl->getType()), 1); IncVal = Builder.CreateAdd(IncVal, One); Builder.CreateStore(IncVal, GetAddrOfLocalVar(InductionVarDecl)); BreakContinueStack.pop_back(); ConditionScope.ForceCleanup(); EmitStopPoint(CE); EmitBranch(ConditionBlock); PForScope.ForceCleanup(); LoopStack.pop(); EmitBlock(Sync.getBlock()); Builder.CreateSync(End, SRStart); EmitBlock(End, true); return true; } bool CodeGenFunction::EmitKokkosParallelForMD(const CallExpr *CE, std::string PFName, const Expr *BE, const LambdaExpr *Lambda, ArrayRef<const Attr *> ForallAttrs) { // TODO: Need to add code to process any attributes (ForallAttrs). // Build the queue of dimensions (upper bounds) std::vector<const Expr *> DimQueue; std::vector<const Expr *> StartQueue; if (BE->getStmtClass() == Expr::CXXTemporaryObjectExprClass) { const CXXTemporaryObjectExpr *CXXTO = dyn_cast<CXXTemporaryObjectExpr>(BE); const InitListExpr *StartingBounds = dyn_cast<InitListExpr>(CXXTO->getArg(0)->IgnoreImplicit()); const InitListExpr *UpperBounds = dyn_cast<InitListExpr>(CXXTO->getArg(1)->IgnoreImplicit()); Loading @@ -285,9 +434,6 @@ bool CodeGenFunction::EmitKokkosParallelFor(const CallExpr *CE, const Expr *val = UpperBounds->getInit(i)->IgnoreImplicit(); DimQueue.push_back(val); } } else { DimQueue.push_back(BE); } // These are extra steps that we can probably optimize away BE = DimQueue.front(); Loading
clang/lib/CodeGen/CodeGenFunction.h +4 −0 Original line number Diff line number Diff line Loading @@ -3526,6 +3526,10 @@ public: llvm::BasicBlock *ExitBlock, JumpDest &Sync); bool EmitKokkosParallelFor(const CallExpr *CE, ArrayRef<const Attr *> Attrs); bool EmitKokkosParallelForMD(const CallExpr *CE, std::string PFName, const Expr *BE, const LambdaExpr *Lambda, ArrayRef<const Attr *> ForallAttrs); bool EmitKokkosInnerLoop(const CallExpr *CE, const LambdaExpr *Lambda, llvm::BasicBlock *TopBlock, std::vector<const Expr*> DimQueue, Loading