Commit 81b7058b authored by Patrick Flynn's avatar Patrick Flynn Committed by Patrick McCormick
Browse files

Separated MDRange codegen from linear parallel_for codegen, and added better detection

parent f5414295
Loading
Loading
Loading
Loading
+161 −15
Original line number Diff line number Diff line
@@ -267,11 +267,160 @@ bool CodeGenFunction::EmitKokkosParallelFor(const CallExpr *CE,
    return false;
  }
  
  // Check to see if we have an MDRange present
  if (BE->getStmtClass() == Expr::CXXTemporaryObjectExprClass) {
    const CXXTemporaryObjectExpr *CXXTO = dyn_cast<CXXTemporaryObjectExpr>(BE);
    std::string className = CXXTO->getBestDynamicClassType()->getNameAsString();
    
    if (className == "MDRangePolicy") {
      return EmitKokkosParallelForMD(CE, PFName, BE, Lambda, ForallAttrs);
    } else {
      // What should we do here?
    }
  }

  // Create all jump destinations and basic blocks in the order they 
  // appear in the IR. 
  JumpDest Condition = getJumpDestInCurrentScope("kokkos.forall.cond");
  llvm::BasicBlock *Detach = createBasicBlock("kokkos.forall.detach");
  llvm::BasicBlock *PForBody = createBasicBlock("kokkos.forall.body");
  JumpDest Reattach = getJumpDestInCurrentScope("kokkos.forall.reattach");
  llvm::BasicBlock *Increment = createBasicBlock("kokkos.forall.inc");
  JumpDest Cleanup = getJumpDestInCurrentScope("kokkos.forall.cond.cleanup");
  JumpDest Sync = getJumpDestInCurrentScope("kokkos.forall.sync");
  llvm::BasicBlock *End = createBasicBlock("kokkos.forall.end");

  // Extract a convenience block and set up the lexical scope based on
  // the call expression's source range.
  llvm::BasicBlock *ConditionBlock = Condition.getBlock();
  
  const SourceRange &R = CE->getSourceRange();
  LexicalScope PForScope(*this, R);

  // Now we can start the dirty work of transforming the lambda into a 
  // for loop.  


  // The first step is to extract the argument to the lambda and transform it into 
  // the loop induction variable.  As part of this we assume the following are true
  // about the parallel_for:
  //    1. The iterator can be assigned a value of zero. 
  //    2. We ignore the details of what is captured by the lambda.
  // 
  // TODO: Do we need to "relax" these assumptions to support broader code coverage?
  // This is 'equivalent' to the Init statement in a traditional for loop (e.g. int i = 0). 
  const ParmVarDecl *InductionVarDecl; 
  InductionVarDecl = EmitKokkosParallelForInductionVar(Lambda).at(0);

   // Create the sync region. 
  PushSyncRegion();
  llvm::Instruction *SRStart = EmitSyncRegionStart();
  CurSyncRegion->setSyncRegionStart(SRStart);

  // TODO: Need to check attributes for spawning strategy. 
  LoopStack.setSpawnStrategy(LoopAttributes::DAC);
  
  EmitBlock(ConditionBlock);
  
  LoopStack.push(ConditionBlock, CGM.getContext(), ForallAttrs,
                 SourceLocToDebugLoc(R.getBegin()),
                 SourceLocToDebugLoc(R.getEnd()));

  // Store the blocks to use for break and continue. 
  BreakContinueStack.push_back(BreakContinue(Reattach, Reattach));

  // Create a scope for the condition variable cleanup. 
  LexicalScope ConditionScope(*this, R);

  // Create the conditional.
  EmitKokkosParallelForCond(BE, InductionVarDecl, Detach, End, Sync);

  if (PForScope.requiresCleanups()) {
    EmitBlock(Cleanup.getBlock());
    EmitBranchThroughCleanup(Sync);
  }

  // Handle the detach block...
  EmitBlock(Detach);

  auto OldAllocaInsertPt = AllocaInsertPt;
  llvm::Value *Undef = llvm::UndefValue::get(Int32Ty);
  AllocaInsertPt = new llvm::BitCastInst(Undef, Int32Ty, "", PForBody);

  llvm::Value *GInductionVar = GetAddrOfLocalVar(InductionVarDecl).getPointer();
  llvm::Value *GInductionVal = Builder.CreateLoad(GetAddrOfLocalVar(InductionVarDecl));

  QualType RefType = InductionVarDecl->getType();
  
  // Create the detach terminator 
  Builder.CreateDetach(PForBody, Increment, SRStart);

  EmitBlock(PForBody);
  incrementProfileCounter(CE);

  llvm::AllocaInst *TLInductionVar =
      Builder.CreateAlloca(getTypes().ConvertType(RefType), nullptr,
                           InductionVarDecl->getName() + ".detach");
  Builder.CreateAlignedStore(GInductionVal, TLInductionVar,
                             getContext().getTypeAlignInChars(RefType));
  {
    // Create a separate cleanup scope for the body, in case it is not
    // a compound statement.
    InKokkosConstruct = true;
    RunCleanupsScope BodyScope(*this);
    EmitStmt(Lambda->getBody());
    InKokkosConstruct = false;
  }

  auto tmp = AllocaInsertPt; 
  AllocaInsertPt = OldAllocaInsertPt; 
  tmp->removeFromParent(); 

  // Modify the body to use the "detach"-local induction variable.
  // At this point in the codegen, the body block has been emitted
  // and we can safely replace the "sequential" induction variable
  // within the detach basic block.
  llvm::BasicBlock *CurrentBlock = Builder.GetInsertBlock();
  for(llvm::Value::use_iterator UI = GInductionVar->use_begin(), UE = GInductionVar->use_end(); 
      UI != UE; ) {
    llvm::Use &U = *UI++;
    llvm::Instruction *I = cast<llvm::Instruction>(U.getUser());
    if (I->getParent() == CurrentBlock) 
      U.set(TLInductionVar);
  }

  EmitBlock(Reattach.getBlock());
  Builder.CreateReattach(Increment, SRStart);

  EmitBlock(Increment);
  llvm::Value *IncVal = Builder.CreateLoad(GetAddrOfLocalVar(InductionVarDecl));
  llvm::Value *One = llvm::ConstantInt::get(ConvertType(InductionVarDecl->getType()), 1);
  IncVal = Builder.CreateAdd(IncVal, One);
  Builder.CreateStore(IncVal, GetAddrOfLocalVar(InductionVarDecl));

  BreakContinueStack.pop_back();
  ConditionScope.ForceCleanup();
  EmitStopPoint(CE);

  EmitBranch(ConditionBlock);
  PForScope.ForceCleanup();
  LoopStack.pop();

  EmitBlock(Sync.getBlock());
  Builder.CreateSync(End, SRStart);
  EmitBlock(End, true);
  return true;
}

bool CodeGenFunction::EmitKokkosParallelForMD(const CallExpr *CE, std::string PFName, const Expr *BE, const LambdaExpr *Lambda,
            ArrayRef<const Attr *> ForallAttrs) {
    
  // TODO: Need to add code to process any attributes (ForallAttrs).
  
  // Build the queue of dimensions (upper bounds)
  std::vector<const Expr *> DimQueue;
  std::vector<const Expr *> StartQueue;
  
  if (BE->getStmtClass() == Expr::CXXTemporaryObjectExprClass) {
  const CXXTemporaryObjectExpr *CXXTO = dyn_cast<CXXTemporaryObjectExpr>(BE);
  const InitListExpr *StartingBounds = dyn_cast<InitListExpr>(CXXTO->getArg(0)->IgnoreImplicit());
  const InitListExpr *UpperBounds = dyn_cast<InitListExpr>(CXXTO->getArg(1)->IgnoreImplicit());
@@ -285,9 +434,6 @@ bool CodeGenFunction::EmitKokkosParallelFor(const CallExpr *CE,
    const Expr *val = UpperBounds->getInit(i)->IgnoreImplicit();
    DimQueue.push_back(val);
  }
  } else {
    DimQueue.push_back(BE);
  }
  
  // These are extra steps that we can probably optimize away
  BE = DimQueue.front();
+4 −0
Original line number Diff line number Diff line
@@ -3526,6 +3526,10 @@ public:
                                 llvm::BasicBlock *ExitBlock,
				 JumpDest &Sync);
  bool EmitKokkosParallelFor(const CallExpr *CE, ArrayRef<const Attr *> Attrs);
  bool EmitKokkosParallelForMD(const CallExpr *CE, std::string PFName,
            const Expr *BE,
            const LambdaExpr *Lambda,
            ArrayRef<const Attr *> ForallAttrs);
  bool EmitKokkosInnerLoop(const CallExpr *CE, const LambdaExpr *Lambda,
            llvm::BasicBlock *TopBlock,
            std::vector<const Expr*> DimQueue,