Commit ee80889c authored by Patrick Flynn's avatar Patrick Flynn Committed by Patrick McCormick
Browse files

Added support for inner loops in parallel_for

parent 9fd4665e
Loading
Loading
Loading
Loading
+100 −38
Original line number Diff line number Diff line
@@ -195,17 +195,6 @@ std::vector<const ParmVarDecl*>
CodeGenFunction::EmitKokkosParallelForInductionVar(const LambdaExpr *Lambda) {
  const CXXMethodDecl *MD = Lambda->getCallOperator();
  assert(MD && "EmitKokkosParallelFor() -- bad method decl from labmda call.");
  /*const ParmVarDecl *InductionVarDecl = MD->getParamDecl(0);
  assert(InductionVarDecl && "EmitKokkosParallelFor() -- bad loop variable decl!");
  
  printf("PARAM COUNT: %d\n\n", MD->getNumParams());

  EmitVarDecl(*InductionVarDecl);
  Address Addr = GetAddrOfLocalVar(InductionVarDecl);
  llvm::Value *Zero = llvm::ConstantInt::get(ConvertType(InductionVarDecl->getType()), 0);
  Builder.CreateStore(Zero, Addr);

  return InductionVarDecl;*/
  
  std::vector<const ParmVarDecl*> params;
  
@@ -234,17 +223,6 @@ void CodeGenFunction::EmitKokkosParallelForCond(const Expr *BoundsExpr,
  if (BoundsExpr->getStmtClass() == Expr::BinaryOperatorClass) {
    RValue RV = EmitAnyExpr(BoundsExpr);
    LoopEnd = RV.getScalarVal();
  } else if (BoundsExpr->getStmtClass() == Expr::CXXTemporaryObjectExprClass) {
    const CXXTemporaryObjectExpr *CXXTO = dyn_cast<CXXTemporaryObjectExpr>(BoundsExpr);
    const InitListExpr *UpperBounds = dyn_cast<InitListExpr>(CXXTO->getArg(1)->IgnoreImplicit());
    
    // Create a multiply statement to compute the proper upper bound
    const Expr *lval = UpperBounds->getInit(0)->IgnoreImplicit();
    const Expr *rval = UpperBounds->getInit(1)->IgnoreImplicit();
    
    llvm::Value *lvalue = EmitScalarExpr(lval);
    llvm::Value *rvalue = EmitScalarExpr(rval);
    LoopEnd = Builder.CreateMul(lvalue, rvalue);
  } else { 
    LoopEnd = EmitScalarExpr(BoundsExpr);
  }
@@ -293,6 +271,30 @@ bool CodeGenFunction::EmitKokkosParallelFor(const CallExpr *CE,
    return false;
  }
  
  // Build the queue of dimensions (upper bounds)
  std::queue<const Expr *> DimQueue;
  
  if (BE->getStmtClass() == Expr::CXXTemporaryObjectExprClass) {
    const CXXTemporaryObjectExpr *CXXTO = dyn_cast<CXXTemporaryObjectExpr>(BE);
    const InitListExpr *UpperBounds = dyn_cast<InitListExpr>(CXXTO->getArg(1)->IgnoreImplicit());
    
    for (int i = 0; i<UpperBounds->getNumInits(); i++) {
      const Expr *val = UpperBounds->getInit(i)->IgnoreImplicit();
      DimQueue.push(val);
    }
  } else {
    DimQueue.push(BE);
  }
  
  // Get the induction variables
  std::vector<const ParmVarDecl*> params = EmitKokkosParallelForInductionVar(Lambda);
  
  // These are extra steps that we can probably optimize away
  BE = DimQueue.front();
  DimQueue.pop();
  
  const ParmVarDecl *InductionVarDecl = params.at(0);

  // Create all jump destinations and basic blocks in the order they
  // appear in the IR.
  JumpDest Condition = getJumpDestInCurrentScope("kokkos.forall.cond");
@@ -323,9 +325,6 @@ bool CodeGenFunction::EmitKokkosParallelFor(const CallExpr *CE,
  //
  // TODO: Do we need to "relax" these assumptions to support broader code coverage?
  // This is 'equivalent' to the Init statement in a traditional for loop (e.g. int i = 0). 
  /*const ParmVarDecl *InductionVarDecl; 
  InductionVarDecl = EmitKokkosParallelForInductionVar(Lambda);*/
  std::vector<const ParmVarDecl*> params = EmitKokkosParallelForInductionVar(Lambda);

   // Create the sync region.
  PushSyncRegion();
@@ -348,7 +347,6 @@ bool CodeGenFunction::EmitKokkosParallelFor(const CallExpr *CE,
  LexicalScope ConditionScope(*this, R);

  // Create the conditional.
  const ParmVarDecl *InductionVarDecl = params.at(0);
  EmitKokkosParallelForCond(BE, InductionVarDecl, Detach, End, Sync);

  if (PForScope.requiresCleanups()) {
@@ -380,12 +378,16 @@ bool CodeGenFunction::EmitKokkosParallelFor(const CallExpr *CE,
  Builder.CreateAlignedStore(GInductionVal, TLInductionVar,
                             getContext().getTypeAlignInChars(RefType));
  {
    if (DimQueue.size() == 0) {
      // Create a separate cleanup scope for the body, in case it is not
      // a compound statement.
      InKokkosConstruct = true;
      RunCleanupsScope BodyScope(*this);
      EmitStmt(Lambda->getBody());
      InKokkosConstruct = false;
    } else {
      EmitKokkosInnerLoop(CE, Lambda, nullptr, DimQueue, params);
    }
  }

  auto tmp = AllocaInsertPt;
@@ -409,12 +411,10 @@ bool CodeGenFunction::EmitKokkosParallelFor(const CallExpr *CE,
  Builder.CreateReattach(Increment, SRStart);

  EmitBlock(Increment);
  for (const ParmVarDecl* IVD : params) {
    llvm::Value *IncVal = Builder.CreateLoad(GetAddrOfLocalVar(IVD));
    llvm::Value *One = llvm::ConstantInt::get(ConvertType(IVD->getType()), 1);
  llvm::Value *IncVal = Builder.CreateLoad(GetAddrOfLocalVar(InductionVarDecl));
  llvm::Value *One = llvm::ConstantInt::get(ConvertType(InductionVarDecl->getType()), 1);
  IncVal = Builder.CreateAdd(IncVal, One);
    Builder.CreateStore(IncVal, GetAddrOfLocalVar(IVD));
  }
  Builder.CreateStore(IncVal, GetAddrOfLocalVar(InductionVarDecl));

  BreakContinueStack.pop_back();
  ConditionScope.ForceCleanup();
@@ -430,6 +430,68 @@ bool CodeGenFunction::EmitKokkosParallelFor(const CallExpr *CE,
  return true;
}

// This is in charge of building an inner loop
bool CodeGenFunction::EmitKokkosInnerLoop(const CallExpr *CE, const LambdaExpr *Lambda,
            llvm::BasicBlock *TopBlock,
            std::queue<const Expr*> DimQueue,
            std::vector<const ParmVarDecl*> params) {
  // Get arguments
  int pos = DimQueue.size();
  const Expr *BE = DimQueue.front();
  DimQueue.pop();
  
  const ParmVarDecl *InductionVarDecl = params.at(pos);
  
  llvm::BasicBlock *Zero = createBasicBlock("kokkos.forall.zero" + std::to_string(pos));
  JumpDest Condition = getJumpDestInCurrentScope("kokkos.forall.cond" + std::to_string(pos));
  llvm::BasicBlock *LoopBody = createBasicBlock("kokkos.forall.body" + std::to_string(pos));
  llvm::BasicBlock *Increment = createBasicBlock("kokkos.forall.inc" + std::to_string(pos));
  JumpDest EndDest = getJumpDestInCurrentScope("kokkos.forall.endlbl" + std::to_string(pos));
  llvm::BasicBlock *End = createBasicBlock("kokkos.forall.end" + std::to_string(pos));
  
  // Zero out the induction variable
  EmitBlock(Zero);
  llvm::Value *ZeroVal = llvm::ConstantInt::get(ConvertType(InductionVarDecl->getType()), 0);
  Builder.CreateStore(ZeroVal, GetAddrOfLocalVar(InductionVarDecl));
  
  // Create the conditional.
  llvm::BasicBlock *ConditionBlock = Condition.getBlock();
  EmitBlock(ConditionBlock);
  
  EmitKokkosParallelForCond(BE, InductionVarDecl, LoopBody, nullptr, EndDest);
  EmitBlock(LoopBody);
  
  {
    if (DimQueue.size() == 0) {
      // Create a separate cleanup scope for the body, in case it is not
      // a compound statement.
      InKokkosConstruct = true;
      RunCleanupsScope BodyScope(*this);
      EmitStmt(Lambda->getBody());
      InKokkosConstruct = false;
    } else {
      EmitKokkosInnerLoop(CE, Lambda, ConditionBlock, DimQueue, params);
    }
  }
  
  EmitBlock(Increment);
  llvm::Value *IncVal = Builder.CreateLoad(GetAddrOfLocalVar(InductionVarDecl));
  llvm::Value *One = llvm::ConstantInt::get(ConvertType(InductionVarDecl->getType()), 1);
  IncVal = Builder.CreateAdd(IncVal, One);
  Builder.CreateStore(IncVal, GetAddrOfLocalVar(InductionVarDecl));
  
  EmitBranch(ConditionBlock);
  
  if (TopBlock != nullptr) {
    EmitBranch(TopBlock);
  }
  
  EmitBlock(EndDest.getBlock());
  EmitBlock(End, true);
  
  return true;           
}

bool CodeGenFunction::EmitKokkosParallelReduce(const CallExpr *CE,
                    ArrayRef<const Attr *> Attrs) {
  DiagnosticsEngine &Diags = CGM.getDiags();
+6 −0
Original line number Diff line number Diff line
@@ -42,6 +42,8 @@
#include "llvm/Transforms/Utils/SanitizerStats.h"
#include "llvm/IR/ValueMap.h"

#include <queue>

namespace llvm {
class BasicBlock;
class LLVMContext;
@@ -3526,6 +3528,10 @@ public:
                                 llvm::BasicBlock *ExitBlock,
				 JumpDest &Sync);
  bool EmitKokkosParallelFor(const CallExpr *CE, ArrayRef<const Attr *> Attrs);
  /// Emits one inner (nested) loop level for a Kokkos parallel_for and
  /// recurses while upper bounds remain in \p DimQueue; the lambda body is
  /// emitted inside the last level. \p DimQueue and \p params are taken by
  /// value, so the caller's containers are not modified. \p TopBlock is the
  /// condition block of the enclosing inner loop, or nullptr when called
  /// from EmitKokkosParallelFor.
  bool EmitKokkosInnerLoop(const CallExpr *CE, const LambdaExpr *Lambda,
            llvm::BasicBlock *TopBlock,
            std::queue<const Expr*> DimQueue,
            std::vector<const ParmVarDecl*> params);
  bool EmitKokkosParallelReduce(const CallExpr *CE, ArrayRef<const Attr *> Attrs);
  bool InKokkosConstruct = false; // FIXME: Should/can we refactor this away?