Commit f0486394 authored by George Stelle's avatar George Stelle
Browse files

Initial GPUABI commit

parent 75c8b9fb
Loading
Loading
Loading
Loading
+107 −0
Original line number Diff line number Diff line
//===- GPUABI.h - Interface to the Kitsune GPU back end ------*- C++ -*--===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file declares the Kitsune GPU ABI, which converts Tapir instructions to
// calls into the Kitsune runtime system for GPU code.
//
//===----------------------------------------------------------------------===//
#ifndef GPU_ABI_H_
#define GPU_ABI_H_

#include "llvm/Transforms/Tapir/LoweringUtils.h"
#include "llvm/Transforms/Tapir/TapirLoopInfo.h"

namespace llvm {

class DataLayout;
class TargetMachine;
class LLVMLoop; 

/// Tapir target for the Kitsune GPU back end.
///
/// All hooks required by TapirTarget are declared here and defined in
/// GPUABI.cpp, except addHelperAttributes(), which is an inline no-op.
class GPUABI : public TapirTarget {
  // Loop-outline processor returned by getLoopOutlineProcessor() when it is
  // non-null.  It starts out null.
  LLVMLoop *LOP = nullptr;

public:
  GPUABI(Module &M) : TapirTarget(M) {}
  ~GPUABI() = default;

  // --- Lowering of individual Tapir constructs ---------------------------
  Value *lowerGrainsizeCall(CallInst *GrainsizeCall) override final;
  void lowerSync(SyncInst &SI) override final;

  // --- Per-function hooks --------------------------------------------------
  void addHelperAttributes(Function &F) override final {}
  void preProcessFunction(Function &F,
                          TaskInfo &TI,
                          bool OutliningTapirLoops) override final;
  void postProcessFunction(Function &F,
                           bool OutliningTapirLoops) override final;
  void postProcessHelper(Function &F) override final;

  // --- Hooks around outlined tasks and root spawners ---------------------
  void preProcessOutlinedTask(Function &F,
                              Instruction *DetachPt,
                              Instruction *TaskFrameCreate,
                              bool IsSpawner) override final;
  void postProcessOutlinedTask(Function &F,
                               Instruction *DetachPt,
                               Instruction *TaskFrameCreate,
                               bool IsSpawner) override final;
  void preProcessRootSpawner(Function &F) override final;
  void postProcessRootSpawner(Function &F) override final;

  void processSubTaskCall(TaskOutlineInfo &TOI,
                          DominatorTree &DT) override final;

  // --- Tapir-loop outlining ------------------------------------------------
  LoopOutlineProcessor *
  getLoopOutlineProcessor(const TapirLoopInfo *TL) const override final;
};

/// Loop-outline processor that turns an outlined Tapir loop into a kernel
/// living in a separate LLVM module (LLVMM) and rewrites the host call site
/// into calls to the GPU runtime entry points declared below.
class LLVMLoop : public LoopOutlineProcessor {
  friend class GPUABI;

private:
  // Next kernel ID to hand out; shared by all LLVMLoop instances.
  static unsigned NextKernelID;
  // ID taken from NextKernelID when this processor is constructed.
  unsigned MyKernelID = 0;
  // Module that receives the kernel version of the outlined loop.
  Module LLVMM;
  // Initialized to null so that a read before assignment is not undefined
  // behavior.
  TargetMachine *LLVMTargetMachine = nullptr;
  // Host-module global holding the serialized kernel module; null until
  // processOutlinedLoopCall() creates it.
  GlobalVariable *LLVMGlobal = nullptr;

  // Declaration inserted into the kernel module.
  FunctionCallee GetThreadIdx = nullptr;
  // Declarations inserted into the host module.
  FunctionCallee GPUInit = nullptr;
  FunctionCallee GPULaunchKernel = nullptr;
  FunctionCallee GPUWaitKernel = nullptr;

  // Helper inputs in the order they were handed to the outlined helper by
  // setupLoopOutlineArgs(); element 0 is the loop limit.
  SmallVector<Value *, 5> OrderedInputs;

public:
  LLVMLoop(Module &M);

  void setupLoopOutlineArgs(
      Function &F, ValueSet &HelperArgs, SmallVectorImpl<Value *> &HelperInputs,
      ValueSet &InputSet, const SmallVectorImpl<Value *> &LCArgs,
      const SmallVectorImpl<Value *> &LCInputs,
      const ValueSet &TLInputsFixed)
    override final;
  unsigned getIVArgIndex(const Function &F, const ValueSet &Args) const
    override final;
  unsigned getLimitArgIndex(const Function &F, const ValueSet &Args) const
    override final;
  void postProcessOutline(TapirLoopInfo &TL, TaskOutlineInfo &Out,
                          ValueToValueMapTy &VMap) override final;
  void processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI,
                               DominatorTree &DT) override final;
};
}

#endif
/*
#include "llvm/Transforms/Tapir/LoopSpawningTI.h"
#include "llvm/Transforms/Tapir/LoweringUtils.h"
#include "llvm/ADT/DenseMap.h"

using namespace llvm; 

class GPU : public LoopOutlineProcessor {
public:
  GPU(Module &M) : LoopOutlineProcessor(M) {}
  void postProcessOutline(TapirLoopInfo &TL, TaskOutlineInfo &Out,
                          ValueToValueMapTy &VMap) override final;  
  GlobalVariable* LLVMKernel; 
};
*/
+1 −0
Original line number Diff line number Diff line
@@ -27,6 +27,7 @@ enum class TapirTargetID {
  OpenCilk, // Lower to OpenCilk ABI
  OpenCL,   // Lower to OpenCL ABI
  OpenMP,   // Lower to OpenMP
  GPU,   // Lower to OpenCL
  Qthreads, // Lower to Qthreads
  Realm,    // Lower to Realm
  Last_TapirTargetID
+1 −0
Original line number Diff line number Diff line
@@ -13,6 +13,7 @@ add_llvm_component_library(LLVMTapirOpts
  Outline.cpp
  QthreadsABI.cpp
  RealmABI.cpp
  GPUABI.cpp
  SerialABI.cpp
  SerializeSmallTasks.cpp
  Tapir.cpp
+384 −0
Original line number Diff line number Diff line
//===- GPUABI.cpp - Lower Tapir to the Kitsune GPU back end -------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements the Kitsune GPU ABI to convert Tapir instructions to
// calls into the Kitsune runtime system for GPU LLVM code.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Tapir/GPUABI.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Tapir/Outline.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/Vectorize.h"
#include "llvm/Bitcode/BitcodeWriter.h"
#include "llvm/Support/TargetRegistry.h"
#include <sstream>

using namespace llvm;

// Tag used by LLVM_DEBUG / -debug-only for this file.  It is distinct from the
// OpenCL ABI's tag so the two back ends can be traced independently.
#define DEBUG_TYPE "gpuabi"

/// Lowers a grainsize intrinsic call to a fixed constant.
///
/// Every use of \p GrainsizeCall is rewritten to the constant 8, typed like
/// the call's result.  The call instruction itself is not erased here.
///
/// \returns the constant that replaced the call's uses.
Value *GPUABI::lowerGrainsizeCall(CallInst *GrainsizeCall) {
  const uint64_t FixedGrainsize = 8;
  Value *Replacement =
      ConstantInt::get(GrainsizeCall->getType(), FixedGrainsize);

  // Redirect all users of the intrinsic to the constant.
  GrainsizeCall->replaceAllUsesWith(Replacement);
  return Replacement;
}

void GPUABI::lowerSync(SyncInst &SI) {
  // currently a no-op...
}

void GPUABI::preProcessOutlinedTask(llvm::Function&, llvm::Instruction*, llvm::Instruction*, bool){}
void GPUABI::postProcessOutlinedTask(llvm::Function&, llvm::Instruction*, llvm::Instruction*, bool){}
void GPUABI::preProcessRootSpawner(llvm::Function&){}
void GPUABI::postProcessRootSpawner(llvm::Function&){}

void GPUABI::preProcessFunction(Function &F, TaskInfo &TI,
                                 bool OutliningTapirLoops) {
}

void GPUABI::postProcessFunction(Function &F, bool OutliningTapirLoops) {
}

void GPUABI::postProcessHelper(Function &F) {
}

void GPUABI::processSubTaskCall(TaskOutlineInfo &TOI, DominatorTree &DT) {
}

/// Returns the loop-outline processor to use for the Tapir loop \p TL.
///
/// If LOP is set it is returned; otherwise a freshly allocated LLVMLoop bound
/// to this target's module is returned.
///
/// NOTE(review): in the code visible here LOP is only ever initialized to
/// nullptr, so every call allocates a new LLVMLoop, and nothing in this file
/// deletes it -- confirm who owns the returned processor.
LoopOutlineProcessor *
GPUABI::getLoopOutlineProcessor(const TapirLoopInfo * /*TL*/) const {
  return LOP ? LOP : new LLVMLoop(M);
}

// Static counter for assigning IDs to kernels.  Each LLVMLoop that is
// constructed takes the current value as its kernel ID and then increments
// the counter.
unsigned LLVMLoop::NextKernelID = 0;

/// Creates a loop-outline processor for host module \p M.
///
/// The constructor
///  - takes the next kernel ID,
///  - creates the kernel module LLVMM in the same LLVMContext as \p M and
///    gives it the "spir64-unknown-unknown" target triple,
///  - declares `i32 gtid()` in the kernel module, and
///  - declares the runtime entry points in the host module:
///      void  initRuntime()
///      i8*   launchKernel(i8*, i8**, i64)
///      void  waitKernel(i8*)
///
/// waitKernel takes the i8* handle returned by launchKernel, because
/// processOutlinedLoopCall() passes that handle when it emits the wait call.
///
/// Note that the base class receives a reference to LLVMM before LLVMM itself
/// has been constructed (bases are initialized before members).
/// NOTE(review): this presumably relies on the base only storing the
/// reference -- confirm against LoopOutlineProcessor's constructor.
LLVMLoop::LLVMLoop(Module &M)
    : LoopOutlineProcessor(M, LLVMM), MyKernelID(NextKernelID++),
      LLVMM("spirvModule", M.getContext()), LLVMTargetMachine(nullptr),
      LLVMGlobal(nullptr) {
  // Set the kernel module's target triple.
  Triple LLVMTriple("spir64-unknown-unknown");
  LLVMM.setTargetTriple(LLVMTriple.str());

  // Kernel side: declaration of the thread-index query.
  Type *LLVMInt32Ty = Type::getInt32Ty(LLVMM.getContext());
  GetThreadIdx = LLVMM.getOrInsertFunction("gtid", LLVMInt32Ty);

  // Host side: declarations of the runtime entry points.
  LLVMContext &HostCtx = M.getContext();
  Type *VoidTy = Type::getVoidTy(HostCtx);
  Type *VoidPtrTy = Type::getInt8PtrTy(HostCtx);
  Type *VoidPtrPtrTy = VoidPtrTy->getPointerTo();
  Type *Int64Ty = Type::getInt64Ty(HostCtx);
  GPUInit = M.getOrInsertFunction("initRuntime", VoidTy);
  GPULaunchKernel = M.getOrInsertFunction("launchKernel", VoidPtrTy, VoidPtrTy,
                                          VoidPtrPtrTy, Int64Ty);
  GPUWaitKernel = M.getOrInsertFunction("waitKernel", VoidTy, VoidPtrTy);
}

/// Chooses the argument order of the outlined helper.
///
/// Resulting order of HelperArgs / HelperInputs:
///   1. loop limit ("runSize"),        taken from LCArgs[1] / LCInputs[1]
///   2. loop start ("runStart"),       taken from LCArgs[0] / LCInputs[0]
///   3. grainsize  ("runStride"),      taken from LCArgs[2] / LCInputs[2],
///      only when LCInputs[2] is not a ConstantInt
///   4. every value in TLInputsFixed, in iteration order
///
/// The three loop-control inputs are also inserted into \p InputSet.  Finally
/// the contents of \p HelperInputs are appended to OrderedInputs so the call
/// site can later be rebuilt in the same order.
void LLVMLoop::setupLoopOutlineArgs(
    Function &F, ValueSet &HelperArgs, SmallVectorImpl<Value *> &HelperInputs,
    ValueSet &InputSet, const SmallVectorImpl<Value *> &LCArgs,
    const SmallVectorImpl<Value *> &LCInputs, const ValueSet &TLInputsFixed) {
  // Registers loop-control slot Idx as the next helper argument/input pair
  // and gives the argument a readable name.
  auto AddLoopControl = [&](unsigned Idx, const char *ArgName) {
    Argument *Arg = cast<Argument>(LCArgs[Idx]);
    Arg->setName(ArgName);
    HelperArgs.insert(Arg);

    Value *Input = LCInputs[Idx];
    HelperInputs.push_back(Input);
    InputSet.insert(Input);
  };

  // Extent of the index space, i.e. the number of threads to launch.
  AddLoopControl(1, "runSize");
  // Start of the index space.
  AddLoopControl(0, "runStart");
  // Grainsize, but only when it is not a compile-time constant.
  if (!isa<ConstantInt>(LCInputs[2]))
    AddLoopControl(2, "runStride");

  // Everything else the loop body needs follows the loop-control values.
  for (Value *Fixed : TLInputsFixed) {
    HelperArgs.insert(Fixed);
    HelperInputs.push_back(Fixed);
  }

  // Remember the final input order for processOutlinedLoopCall().
  OrderedInputs.append(HelperInputs.begin(), HelperInputs.end());
}

/// Position of the primary-induction-variable argument in the helper.
/// setupLoopOutlineArgs() places it second, so the index is always 1.
unsigned LLVMLoop::getIVArgIndex(const Function & /*F*/,
                                 const ValueSet & /*Args*/) const {
  const unsigned IVArgPosition = 1;
  return IVArgPosition;
}

/// Position of the loop-limit argument in the helper.
/// setupLoopOutlineArgs() places it first, so the index is always 0.
unsigned LLVMLoop::getLimitArgIndex(const Function & /*F*/,
                                    const ValueSet & /*Args*/) const {
  const unsigned LimitArgPosition = 0;
  return LimitArgPosition;
}

/// Turns the outlined loop helper into a per-thread kernel and clones it into
/// the kernel module.
///
/// In the helper (Out.Outline) this
///  1. erases the cloned sync region,
///  2. computes, in the cloned preheader, the iteration range of the calling
///     thread:  start = gtid() * grainsize,
///              end   = min(start + grainsize, End),
///  3. branches straight to the exit block when start >= end,
///  4. makes the primary induction variable start at `start` and makes the
///     loop condition compare against `end` instead of End.
///
/// It then creates "kitsune_llvm_kernel" in LLVMM, whose signature equals the
/// helper's except that every pointer parameter is moved to address space 1,
/// clones the helper body into it, gives it the SPIR_KERNEL calling
/// convention and marks its pointer parameters nocapture.
///
/// \param TL   Loop being outlined; supplies the task, loop, exit block,
///             primary induction variable, condition and constant grainsize.
/// \param Out  Outline info; Out.Outline is the helper to rewrite.  It is not
///             redirected to the new kernel function.
/// \param VMap Map from original values to their clones in the helper.
void LLVMLoop::postProcessOutline(TapirLoopInfo &TL, TaskOutlineInfo &Out,
                                   ValueToValueMapTy &VMap) {
  Task *T = TL.getTask();
  Loop *L = TL.getLoop();

  BasicBlock *Entry = cast<BasicBlock>(VMap[L->getLoopPreheader()]);
  BasicBlock *Header = cast<BasicBlock>(VMap[L->getHeader()]);
  BasicBlock *Exit = cast<BasicBlock>(VMap[TL.getExitBlock()]);
  PHINode *PrimaryIV = cast<PHINode>(VMap[TL.getPrimaryInduction().first]);
  Value *PrimaryIVInput = PrimaryIV->getIncomingValueForBlock(Entry);
  Instruction *ClonedSyncReg = cast<Instruction>(
      VMap[T->getDetach()->getSyncRegion()]);

  // We no longer need the cloned sync region.
  ClonedSyncReg->eraseFromParent();

  // Get the thread ID for this invocation of Helper.  `gtid` is declared
  // without parameters, so it must be called without arguments.
  IRBuilder<> B(Entry->getTerminator());
  Value *ThreadIdx = B.CreateCall(GetThreadIdx);
  Value *ThreadID = B.CreateIntCast(ThreadIdx, PrimaryIV->getType(), false);

  Function *Helper = Out.Outline;
  // TODO: read/write attributes?
  LLVM_DEBUG(dbgs() << "Function type after globalization of argument pointers << " << *Helper->getType() << "\n");
  LLVM_DEBUG(dbgs() << "LLVMM after globalization of argument pointers << " << *Helper->getParent() << "\n");

  // Fetch the End and grainsize values.  End is the first helper argument.
  // The grainsize is either the constant recorded on the Tapir loop or the
  // third helper argument.
  Argument *End;
  Value *Grainsize;
  {
    auto OutlineArgsIter = Helper->arg_begin();
    End = &*OutlineArgsIter;

    if (unsigned ConstGrainsize = TL.getGrainsize())
      Grainsize = ConstantInt::get(PrimaryIV->getType(), ConstGrainsize);
    else
      Grainsize = &*++(++OutlineArgsIter);
  }

  // Each thread handles the iterations [ThreadID, ThreadEnd), where ThreadEnd
  // is clamped to the loop limit.  Threads with an empty range skip the loop.
  ThreadID = B.CreateMul(ThreadID, Grainsize);
  Value *ThreadEndGrain = B.CreateAdd(ThreadID, Grainsize);
  Value *Cmp = B.CreateICmp(ICmpInst::ICMP_ULT, ThreadEndGrain, End);
  Value *ThreadEnd = B.CreateSelect(Cmp, ThreadEndGrain, End);
  Value *Cond = B.CreateICmpUGE(ThreadID, ThreadEnd);

  ReplaceInstWithInst(Entry->getTerminator(), BranchInst::Create(Exit, Header,
                                                                 Cond));
  // Use the thread ID as the start iteration number for the primary IV.
  PrimaryIVInput->replaceAllUsesWith(ThreadID);

  // Update the cloned loop condition to use the thread-end value.  The
  // operand to replace is the one that currently holds End; ThreadEnd was
  // created above and therefore cannot already be an operand, so comparing
  // against it would always select operand 1.
  ICmpInst *ClonedCond = cast<ICmpInst>(VMap[TL.getCondition()]);
  const unsigned TripCountIdx = (ClonedCond->getOperand(0) == End) ? 0 : 1;
  ClonedCond->setOperand(TripCountIdx, ThreadEnd);

  // Build the kernel's parameter list: same as the helper's, with every
  // pointer parameter re-addressed to address space 1.
  SmallVector<Type *, 8> ParamTys;
  for (Argument &Arg : Helper->args()) {
    Type *ParamTy = Arg.getType();
    if (auto *PtrTy = dyn_cast<PointerType>(ParamTy))
      ParamTy = PointerType::get(PtrTy->getPointerElementType(), 1);
    ParamTys.push_back(ParamTy);
  }

  FunctionType *HelperFnTy = Helper->getFunctionType();
  LLVM_DEBUG(dbgs() << "Helper is pointer to function " << *Helper->getType() << "\n");
  Function *NewHelper = Function::Create(
      FunctionType::get(HelperFnTy->getReturnType(), ParamTys, false),
      GlobalValue::ExternalLinkage,
      "kitsune_llvm_kernel",
      LLVMM);

  // Map each helper argument to the kernel argument at the same position and
  // clone the helper body into the kernel.
  ValueToValueMapTy NewVMap;
  auto NewArgIt = NewHelper->arg_begin();
  for (Argument &OldArg : Helper->args()) {
    NewVMap[&OldArg] = &*NewArgIt;
    ++NewArgIt;
  }
  SmallVector<ReturnInst *, 5> Returns;
  CloneFunctionInto(NewHelper, Helper, NewVMap, false, Returns);

  NewHelper->setCallingConv(CallingConv::SPIR_KERNEL);
  for (Argument &Arg : NewHelper->args())
    if (Arg.getType()->isPointerTy())
      Arg.addAttr(Attribute::NoCapture);
}

/// Replaces the host-side call to the outlined loop with a kernel launch.
///
/// Steps:
///  1. Split the block at the replacement call, erase the call and erase the
///     host copy of the outlined helper.
///  2. Run a verifier plus a fixed list of scalar/vectorizer passes over the
///     kernel module LLVMM.
///  3. Serialize LLVMM to bitcode and embed it in the host module as the
///     private constant global "gpu_kitsune_spirv_kernel".
///  4. Insert a call to initRuntime at the top of the enclosing function's
///     entry block.
///  5. At the former call site, spill every value in OrderedInputs to its own
///     stack slot, collect the slot addresses in an i8* array, compute the
///     number of threads as ceil(TripCount / Grainsize), and emit
///     `stream = launchKernel(bitcode, args, threads); waitKernel(stream)`.
///
/// The stack slots are deliberately created at the call site rather than in
/// the entry block, which keeps them local to whatever region contains the
/// call.
///
/// NOTE(review): waitKernel is passed the handle returned by launchKernel, so
/// its declaration must accept one i8* parameter.
///
/// \param TL  Tapir loop; only its constant grainsize is read here.
/// \param TOI Outline info; TOI.ReplCall and TOI.Outline are both erased.
/// \param DT  Unused.
void LLVMLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI,
                                      DominatorTree &DT) {
  LLVMContext &Ctx = M.getContext();
  Type *Int8Ty = Type::getInt8Ty(Ctx);
  Type *Int64Ty = Type::getInt64Ty(Ctx);
  Type *VoidPtrTy = Type::getInt8PtrTy(Ctx);

  LLVM_DEBUG(dbgs() << "Running processOutlinedLoopCall: " << M);
  Function *Parent = TOI.ReplCall->getFunction();
  Value *TripCount = OrderedInputs[0];

  // New code is emitted at the start of the block that begins where the
  // replacement call used to be.
  BasicBlock *RCBB = TOI.ReplCall->getParent();
  BasicBlock *NBB = RCBB->splitBasicBlock(TOI.ReplCall);
  TOI.ReplCall->eraseFromParent();
  IRBuilder<> B(&NBB->front());

  // The host copy of the helper is not called any more.
  TOI.Outline->eraseFromParent();

  // Verify and optimize the kernel module.
  {
    legacy::PassManager PM;
    PM.add(createVerifierPass());
    PM.add(createReassociatePass());
    PM.add(createGVNPass());
    PM.add(createCFGSimplificationPass());
    PM.add(createLoopVectorizePass());
    PM.add(createSLPVectorizerPass());
    PM.add(createConstantPropagationPass());
    PM.add(createDeadInstEliminationPass());
    PM.add(createDeadStoreEliminationPass());
    PM.add(createCFGSimplificationPass());
    PM.add(createDeadCodeEliminationPass());
    PM.run(LLVMM);
  }

  LLVM_DEBUG(dbgs() << "LLVM Module: " << LLVMM);

  // Serialize the kernel module.  The string table must be written after the
  // module, otherwise the bitcode is incomplete.  The writer is scoped so it
  // is finished before the buffer is read.
  SmallVector<char, 0> Bitcode;
  {
    BitcodeWriter Writer(Bitcode);
    Writer.writeModule(LLVMM);
    Writer.writeSymtab();
    Writer.writeStrtab();
  }

  // Bitcode contains NUL bytes, so the StringRef needs an explicit length;
  // constructing it from a bare char pointer would stop at the first NUL.
  Constant *KernelImage = ConstantDataArray::getRaw(
      StringRef(Bitcode.data(), Bitcode.size()), Bitcode.size(), Int8Ty);
  LLVMGlobal = new GlobalVariable(M, KernelImage->getType(), true,
                                  GlobalValue::PrivateLinkage, KernelImage,
                                  "gpu_kitsune_spirv_kernel");
  Value *LLVMPtr = B.CreateBitCast(LLVMGlobal, VoidPtrTy);

  // Initialize the runtime at the top of the enclosing function.
  BasicBlock &EBB = Parent->getEntryBlock();
  IRBuilder<> EB(&EBB.front());
  EB.CreateCall(GPUInit, {});

  // Spill every kernel input to its own stack slot and record the slot's
  // address, as i8*, in the argument array handed to the runtime.
  ArrayType *ArgArrayTy = ArrayType::get(VoidPtrTy, OrderedInputs.size());
  Value *ArgArray = B.CreateAlloca(ArgArrayTy);
  unsigned ArgIdx = 0;
  for (Value *V : OrderedInputs) {
    LLVM_DEBUG(dbgs() << "Input set value: " << *V << "\n");
    Value *Slot = B.CreateAlloca(V->getType());
    B.CreateStore(V, Slot);
    Value *VoidSlot = B.CreateBitCast(Slot, VoidPtrTy);
    Value *ArrayElt =
        B.CreateConstInBoundsGEP2_32(ArgArrayTy, ArgArray, 0, ArgIdx++);
    B.CreateStore(VoidSlot, ArrayElt);
  }

  Value *Grainsize = TL.getGrainsize() ?
    ConstantInt::get(TripCount->getType(), TL.getGrainsize()) :
    OrderedInputs[2];

  // Number of threads = ceil(TripCount / Grainsize).
  Value *RunSizeQ = B.CreateUDiv(TripCount, Grainsize);
  Value *RunRem = B.CreateURem(TripCount, Grainsize);
  Value *IsRem = B.CreateICmp(ICmpInst::ICMP_UGT, RunRem,
                              ConstantInt::get(RunRem->getType(), 0));
  Value *IsRemAdd = B.CreateZExt(IsRem, RunSizeQ->getType());
  Value *RunSize = B.CreateAdd(RunSizeQ, IsRemAdd);

  // Match launchKernel's declared signature (i8*, i8**, i64).
  Value *RunSize64 = B.CreateZExtOrTrunc(RunSize, Int64Ty);
  Value *ArgsPtr = B.CreateConstInBoundsGEP2_32(ArgArrayTy, ArgArray, 0, 0);
  Value *Stream =
      B.CreateCall(GPULaunchKernel, {LLVMPtr, ArgsPtr, RunSize64});
  B.CreateCall(GPUWaitKernel, Stream);

  LLVM_DEBUG(dbgs() << "Finished processOutlinedLoopCall: " << M);
}
+3 −0
Original line number Diff line number Diff line
@@ -22,6 +22,7 @@
#include "llvm/Transforms/Tapir/OpenCilkABI.h"
#include "llvm/Transforms/Tapir/OpenMPABI.h"
#include "llvm/Transforms/Tapir/OpenCLABI.h"
#include "llvm/Transforms/Tapir/GPUABI.h"
#include "llvm/Transforms/Tapir/Outline.h"
#include "llvm/Transforms/Tapir/QthreadsABI.h"
#include "llvm/Transforms/Tapir/RealmABI.h"
@@ -55,6 +56,8 @@ TapirTarget *llvm::getTapirTargetFromID(Module &M, TapirTargetID ID) {
    return new OpenMPABI(M);
  case TapirTargetID::OpenCL:
    return new OpenCLABI(M);
  case TapirTargetID::GPU:
    return new GPUABI(M);
  case TapirTargetID::Qthreads:
    return new QthreadsABI(M);
  case TapirTargetID::Realm:
Loading