Loading llvm/include/llvm/Transforms/Tapir/GPUABI.h 0 → 100644 +107 −0 Original line number Diff line number Diff line //===- GPUABI.h - Interface to the Kitsune GPU back end ------*- C++ -*--===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This file implements the Kitsune GPU ABI to convert Tapir instructions to // calls into the Kitsune runtime system for NVIDIA GPU code. // //===----------------------------------------------------------------------===// #ifndef GPU_ABI_H_ #define GPU_ABI_H_ #include "llvm/Transforms/Tapir/LoweringUtils.h" #include "llvm/Transforms/Tapir/TapirLoopInfo.h" namespace llvm { class DataLayout; class TargetMachine; class LLVMLoop; class GPUABI : public TapirTarget { LLVMLoop *LOP = nullptr; public: GPUABI(Module &M) : TapirTarget(M) {} ~GPUABI() {} Value *lowerGrainsizeCall(CallInst *GrainsizeCall) override final; void lowerSync(SyncInst &SI) override final; void addHelperAttributes(Function &F) override final {} void preProcessFunction(Function &F, TaskInfo &TI, bool OutliningTapirLoops) override final; void postProcessFunction(Function &F, bool OutliningTapirLoops) override final; void postProcessHelper(Function &F) override final; void preProcessOutlinedTask(Function &F, Instruction *DetachPt, Instruction *TaskFrameCreate, bool IsSpawner) override final; void postProcessOutlinedTask(Function &F, Instruction *DetachPt, Instruction *TaskFrameCreate, bool IsSpawner) override final; void preProcessRootSpawner(Function &F) override final; void postProcessRootSpawner(Function &F) override final; void processSubTaskCall(TaskOutlineInfo &TOI, DominatorTree &DT) override final; LoopOutlineProcessor *getLoopOutlineProcessor(const TapirLoopInfo *TL) const override final; }; class LLVMLoop : public LoopOutlineProcessor { friend class GPUABI; 
private: static unsigned NextKernelID; unsigned MyKernelID; Module LLVMM; TargetMachine *LLVMTargetMachine; GlobalVariable *LLVMGlobal; FunctionCallee GetThreadIdx = nullptr; FunctionCallee GPUInit = nullptr; FunctionCallee GPULaunchKernel = nullptr; FunctionCallee GPUWaitKernel = nullptr; SmallVector<Value *, 5> OrderedInputs; public: LLVMLoop(Module &M); void setupLoopOutlineArgs( Function &F, ValueSet &HelperArgs, SmallVectorImpl<Value *> &HelperInputs, ValueSet &InputSet, const SmallVectorImpl<Value *> &LCArgs, const SmallVectorImpl<Value *> &LCInputs, const ValueSet &TLInputsFixed) override final; unsigned getIVArgIndex(const Function &F, const ValueSet &Args) const override final; unsigned getLimitArgIndex(const Function &F, const ValueSet &Args) const override final; void postProcessOutline(TapirLoopInfo &TL, TaskOutlineInfo &Out, ValueToValueMapTy &VMap) override final; void processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, DominatorTree &DT) override final; }; } #endif /* #include "llvm/Transforms/Tapir/LoopSpawningTI.h" #include "llvm/Transforms/Tapir/LoweringUtils.h" #include "llvm/ADT/DenseMap.h" using namespace llvm; class GPU : public LoopOutlineProcessor { public: GPU(Module &M) : LoopOutlineProcessor(M) {} void postProcessOutline(TapirLoopInfo &TL, TaskOutlineInfo &Out, ValueToValueMapTy &VMap) override final; GlobalVariable* LLVMKernel; }; */ llvm/include/llvm/Transforms/Tapir/TapirTargetIDs.h +1 −0 Original line number Diff line number Diff line Loading @@ -27,6 +27,7 @@ enum class TapirTargetID { OpenCilk, // Lower to OpenCilk ABI OpenCL, // Lower to OpenCL ABI OpenMP, // Lower to OpenMP GPU, // Lower to OpenCL Qthreads, // Lower to Qthreads Realm, // Lower to Realm Last_TapirTargetID Loading llvm/lib/Transforms/Tapir/CMakeLists.txt +1 −0 Original line number Diff line number Diff line Loading @@ -13,6 +13,7 @@ add_llvm_component_library(LLVMTapirOpts Outline.cpp QthreadsABI.cpp RealmABI.cpp GPUABI.cpp SerialABI.cpp 
SerializeSmallTasks.cpp Tapir.cpp Loading llvm/lib/Transforms/Tapir/GPUABI.cpp 0 → 100644 +384 −0 Original line number Diff line number Diff line //===- GPUABI.cpp - Lower Tapir to the Kitsune GPU back end -------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This file implements the Kitsune GPU ABI to convert Tapir instructions to // calls into the Kitsune runtime system for GPU LLVM code. // //===----------------------------------------------------------------------===// #include "llvm/Transforms/Tapir/GPUABI.h" #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/Module.h" #include "llvm/IR/Verifier.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/Tapir/Outline.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/GVN.h" #include "llvm/Transforms/Vectorize.h" #include "llvm/Bitcode/BitcodeWriter.h" #include "llvm/Support/TargetRegistry.h" #include <sstream> using namespace llvm; #define DEBUG_TYPE "openclabi" Value *GPUABI::lowerGrainsizeCall(CallInst *GrainsizeCall) { Value *Grainsize = ConstantInt::get(GrainsizeCall->getType(), 8); // Replace uses of grainsize intrinsic call with this grainsize value. GrainsizeCall->replaceAllUsesWith(Grainsize); return Grainsize; } void GPUABI::lowerSync(SyncInst &SI) { // currently a no-op... 
} void GPUABI::preProcessOutlinedTask(llvm::Function&, llvm::Instruction*, llvm::Instruction*, bool){} void GPUABI::postProcessOutlinedTask(llvm::Function&, llvm::Instruction*, llvm::Instruction*, bool){} void GPUABI::preProcessRootSpawner(llvm::Function&){} void GPUABI::postProcessRootSpawner(llvm::Function&){} void GPUABI::preProcessFunction(Function &F, TaskInfo &TI, bool OutliningTapirLoops) { } void GPUABI::postProcessFunction(Function &F, bool OutliningTapirLoops) { } void GPUABI::postProcessHelper(Function &F) { } void GPUABI::processSubTaskCall(TaskOutlineInfo &TOI, DominatorTree &DT) { } LoopOutlineProcessor *GPUABI::getLoopOutlineProcessor( const TapirLoopInfo *TL) const { if(!LOP) return new LLVMLoop(M); return LOP; } // Static counter for assigning IDs to kernels. unsigned LLVMLoop::NextKernelID = 0; LLVMLoop::LLVMLoop(Module &M) : LoopOutlineProcessor(M, LLVMM), LLVMM("spirvModule", M.getContext()) { // Assign an ID to this kernel. MyKernelID = NextKernelID++; // Setup an LLVM triple. Triple LLVMTriple("spir64-unknown-unknown"); LLVMM.setTargetTriple(LLVMTriple.str()); // Insert runtime-function declarations in LLVM host modules. 
Type *LLVMInt32Ty = Type::getInt32Ty(LLVMM.getContext()); Type *LLVMInt64Ty = Type::getInt64Ty(LLVMM.getContext()); GetThreadIdx = LLVMM.getOrInsertFunction("gtid", LLVMInt32Ty); Function* getid = LLVMM.getFunction("gtid"); Type *VoidTy = Type::getVoidTy(M.getContext()); Type *VoidPtrTy = Type::getInt8PtrTy(M.getContext()); Type *VoidPtrPtrTy = VoidPtrTy->getPointerTo(); Type *Int8Ty = Type::getInt8Ty(M.getContext()); Type *Int32Ty = Type::getInt32Ty(M.getContext()); Type *Int64Ty = Type::getInt64Ty(M.getContext()); GPUInit = M.getOrInsertFunction("initRuntime", VoidTy); GPULaunchKernel = M.getOrInsertFunction("launchKernel", VoidPtrTy, VoidPtrTy, VoidPtrPtrTy, Int64Ty); GPUWaitKernel = M.getOrInsertFunction("waitKernel", VoidTy); } void LLVMLoop::setupLoopOutlineArgs( Function &F, ValueSet &HelperArgs, SmallVectorImpl<Value *> &HelperInputs, ValueSet &InputSet, const SmallVectorImpl<Value *> &LCArgs, const SmallVectorImpl<Value *> &LCInputs, const ValueSet &TLInputsFixed) { // Add the loop control inputs. // The first parameter defines the extent of the index space, i.e., the number // of threads to launch. { Argument *EndArg = cast<Argument>(LCArgs[1]); EndArg->setName("runSize"); HelperArgs.insert(EndArg); Value *InputVal = LCInputs[1]; HelperInputs.push_back(InputVal); // Add loop-control input to the input set. InputSet.insert(InputVal); } // The second parameter defines the start of the index space. { Argument *StartArg = cast<Argument>(LCArgs[0]); StartArg->setName("runStart"); HelperArgs.insert(StartArg); Value *InputVal = LCInputs[0]; HelperInputs.push_back(InputVal); // Add loop-control input to the input set. InputSet.insert(InputVal); } // The third parameter defines the grainsize, if it is not constant. 
if (!isa<ConstantInt>(LCInputs[2])) { Argument *GrainsizeArg = cast<Argument>(LCArgs[2]); GrainsizeArg->setName("runStride"); HelperArgs.insert(GrainsizeArg); Value *InputVal = LCInputs[2]; HelperInputs.push_back(InputVal); // Add loop-control input to the input set. InputSet.insert(InputVal); } // Add the loop control inputs. for (Value *V : TLInputsFixed) { HelperArgs.insert(V); HelperInputs.push_back(V); } for(Value *V : HelperInputs){ OrderedInputs.push_back(V); } } unsigned LLVMLoop::getIVArgIndex(const Function &F, const ValueSet &Args) const { // The argument for the primary induction variable is the second input. return 1; } unsigned LLVMLoop::getLimitArgIndex(const Function &F, const ValueSet &Args) const { // The argument for the loop limit is the first input. return 0; } void LLVMLoop::postProcessOutline(TapirLoopInfo &TL, TaskOutlineInfo &Out, ValueToValueMapTy &VMap) { LLVMContext &Ctx = M.getContext(); Type *Int8Ty = Type::getInt8Ty(Ctx); Type *Int32Ty = Type::getInt32Ty(Ctx); //Type *Int64Ty = Type::getInt64Ty(Ctx); //Type *VoidPtrTy = Type::getInt8PtrTy(Ctx); Task *T = TL.getTask(); Loop *L = TL.getLoop(); BasicBlock *Entry = cast<BasicBlock>(VMap[L->getLoopPreheader()]); BasicBlock *Header = cast<BasicBlock>(VMap[L->getHeader()]); BasicBlock *Exit = cast<BasicBlock>(VMap[TL.getExitBlock()]); PHINode *PrimaryIV = cast<PHINode>(VMap[TL.getPrimaryInduction().first]); Value *PrimaryIVInput = PrimaryIV->getIncomingValueForBlock(Entry); Instruction *ClonedSyncReg = cast<Instruction>( VMap[T->getDetach()->getSyncRegion()]); // We no longer need the cloned sync region. ClonedSyncReg->eraseFromParent(); // Set the helper function to have external linkage. // Get the thread ID for this invocation of Helper. 
IRBuilder<> B(Entry->getTerminator()); Value *ThreadIdx = B.CreateCall(GetThreadIdx, ConstantInt::get(Int32Ty, 0)); //Value *BlockIdx = B.CreateCall(GetBlockIdx, ConstantInt::get(Int32Ty, 0)); //Value *BlockDim = B.CreateCall(GetBlockDim, ConstantInt::get(Int32Ty, 0)); Value *ThreadID = B.CreateIntCast(ThreadIdx, PrimaryIV->getType(), false); Function *Helper = Out.Outline; //Helper->setName("kitsune_spirv_kernel"); // Fix argument pointer types to global, nocapture // TODO: read/write attributes? LLVM_DEBUG(dbgs() << "Function type after globalization of argument pointers << " << *Helper->getType() << "\n"); LLVM_DEBUG(dbgs() << "LLVMM after globalization of argument pointers << " << *Helper->getParent() << "\n"); // Verify that the Thread ID corresponds to a valid iteration. Because Tapir // loops use canonical induction variables, valid iterations range from 0 to // the loop limit with stride 1. The End argument encodes the loop limit. // Get end and grainsize arguments Argument *End; Value *Grainsize; { auto OutlineArgsIter = Helper->arg_begin(); // End argument is the first LC arg. End = &*OutlineArgsIter; // Get the grainsize value, which is either constant or the third LC arg. // ReplaceInstWithInst(gep, GetElementPtrInst::Create( if (unsigned ConstGrainsize = TL.getGrainsize()) Grainsize = ConstantInt::get(PrimaryIV->getType(), ConstGrainsize); else // Grainsize argument is the third LC arg. Grainsize = &*++(++OutlineArgsIter); } ThreadID = B.CreateMul(ThreadID, Grainsize); Value *ThreadEndGrain = B.CreateAdd(ThreadID, Grainsize); Value *Cmp = B.CreateICmp(ICmpInst::ICMP_ULT, ThreadEndGrain, End); Value *ThreadEnd = B.CreateSelect(Cmp, ThreadEndGrain, End); Value *Cond = B.CreateICmpUGE(ThreadID, ThreadEnd); ReplaceInstWithInst(Entry->getTerminator(), BranchInst::Create(Exit, Header, Cond)); // Use the thread ID as the start iteration number for the primary IV. 
PrimaryIVInput->replaceAllUsesWith(ThreadID); // Update cloned loop condition to use the thread-end value. unsigned TripCountIdx = 0; ICmpInst *ClonedCond = cast<ICmpInst>(VMap[TL.getCondition()]); if (ClonedCond->getOperand(0) != ThreadEnd) ++TripCountIdx; ClonedCond->setOperand(TripCountIdx, ThreadEnd); assert(ClonedCond->getOperand(TripCountIdx) == ThreadEnd && "End argument not used in condition"); // Update paramaters with necessary address space modifcations SmallVector<Type*, 8> paramTys; for(auto &arg : Helper->args()){ if (auto *apty = dyn_cast<PointerType>(arg.getType())){ paramTys.push_back(PointerType::get(apty->getPointerElementType(), 1)); } else { paramTys.push_back(arg.getType()); } } ArrayRef<Type*> newParams(paramTys); if(auto *fpty = dyn_cast<PointerType>(Helper->getType())){ if(auto *fty = dyn_cast<FunctionType>(fpty->getPointerElementType())){ LLVM_DEBUG(dbgs() << "Helper is pointer to function " << *Helper->getType() << "\n"); auto *NewHelper = Function::Create( FunctionType::get(fty->getReturnType(), newParams, false), GlobalValue::ExternalLinkage, "kitsune_llvm_kernel", LLVMM); ValueToValueMapTy NewVMap; auto argit = NewHelper->arg_begin(); for (auto &arg : Helper->args()) { NewVMap[&arg] = argit++; } SmallVector< ReturnInst *,5> retinsts; CloneFunctionInto(NewHelper, Helper, NewVMap, false, retinsts); //Helper->mutateType(PointerType::get(FunctionType::get(fty->getReturnType(), newParams, false), 0)); NewHelper->setCallingConv(CallingConv::SPIR_KERNEL); for(auto &arg : NewHelper->args()){ if (auto *apty = dyn_cast<PointerType>(arg.getType())){ arg.addAttr(Attribute::NoCapture); } } Helper = NewHelper; } } } void LLVMLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, DominatorTree &DT) { LLVMContext &Ctx = M.getContext(); Type *Int8Ty = Type::getInt8Ty(Ctx); Type *Int32Ty = Type::getInt32Ty(Ctx); //Type *Int64Ty = Type::getInt64Ty(Ctx); Type *VoidPtrTy = Type::getInt8PtrTy(Ctx); //Task *T = TL.getTask(); //Instruction 
*ReplCall = cast<CallBase>(TOI.ReplCall); LLVM_DEBUG(dbgs() << "Running processOutlinedLoopCall: " << M); Function *Parent = TOI.ReplCall->getFunction(); Value *TripCount = OrderedInputs[0]; BasicBlock* RCBB = TOI.ReplCall->getParent(); BasicBlock* NBB = RCBB->splitBasicBlock(TOI.ReplCall); TOI.ReplCall->eraseFromParent(); IRBuilder<> B(&NBB->front()); // Compile the kernel //LLVMM.getFunctionList().remove(TOI.Outline); TOI.Outline->eraseFromParent(); LLVMContext &LLVMCtx = LLVMM.getContext(); SmallVector<Metadata *, 3> AV; //AV.push_back(ValueAsMetadata::get(Helper)); //AV.push_back(MDString::get(LLVMCtx, "kernel")); //AV.push_back(ValueAsMetadata::get(ConstantInt::get(Type::getInt32Ty(LLVMCtx), // 1))); //Annotations->addOperand(MDNode::get(Ctx, AV)); legacy::PassManager *PassManager = new legacy::PassManager; PassManager->add(createVerifierPass()); // Add in our optimization passes //PassManager->add(createInstructionCombiningPass()); PassManager->add(createReassociatePass()); PassManager->add(createGVNPass()); PassManager->add(createCFGSimplificationPass()); PassManager->add(createLoopVectorizePass()); PassManager->add(createSLPVectorizerPass()); //PassManager->add(createBreakCriticalEdgesPass()); PassManager->add(createConstantPropagationPass()); PassManager->add(createDeadInstEliminationPass()); PassManager->add(createDeadStoreEliminationPass()); //PassManager->add(createInstructionCombiningPass()); PassManager->add(createCFGSimplificationPass()); PassManager->add(createDeadCodeEliminationPass()); PassManager->run(LLVMM); delete PassManager; LLVM_DEBUG(dbgs() << "LLVM Module: " << LLVMM); // generate llvm kernel code SmallVector<char, 1<<20> mbuf; BitcodeWriter bcw(mbuf); bcw.writeModule(LLVMM); Constant *LLVM = ConstantDataArray::getRaw(mbuf.data(), mbuf.size(), Int8Ty); LLVMGlobal = new GlobalVariable(M, LLVM->getType(), true, GlobalValue::PrivateLinkage, LLVM, "gpu_" + Twine("kitsune_spirv_kernel")); //Value* TripCount = isSRetInput(TOI.InputSet[0]) ? 
TOI.InputSet[1] : TOI.InputSet[0]; //Value *RunStart = ReplCall->getArgOperand(getIVArgIndex(*Parent, // TOI.InputSet)); //Value *TripCount = ReplCall->getArgOperand(getLimitArgIndex(*Parent, // TOI.InputSet)); Value *KernelID = ConstantInt::get(Int32Ty, MyKernelID); Value *LLVMPtr = B.CreateBitCast(LLVMGlobal, VoidPtrTy); Constant *kernelSize = ConstantInt::get(Int32Ty, LLVMGlobal->getInitializer()->getType()->getArrayNumElements()); BasicBlock &EBB = Parent->getEntryBlock(); IRBuilder<> EB(&EBB.front()); EB.CreateCall(GPUInit, {}); ArrayType* arrayType = ArrayType::get(VoidPtrTy, OrderedInputs.size()); Value* argArray = B.CreateAlloca(arrayType); for (Value *V : OrderedInputs) { //Value *ElementSize = nullptr; LLVM_DEBUG(dbgs() << "Input set value: " << *V << "\n"); Value *VPtr = B.CreateAlloca(V->getType()); B.CreateStore(V, VPtr); Value *VoidVPtr = B.CreateBitCast(VPtr, VoidPtrTy); } Value *Grainsize = TL.getGrainsize() ? ConstantInt::get(TripCount->getType(), TL.getGrainsize()) : OrderedInputs[2]; Value *RunSizeQ = B.CreateUDiv(TripCount, Grainsize); Value *RunRem = B.CreateURem(TripCount, Grainsize); Value *IsRem = B.CreateICmp(ICmpInst::ICMP_UGT, RunRem, ConstantInt::get(RunRem->getType(), 0)); Value *IsRemAdd = B.CreateZExt(IsRem, RunSizeQ->getType()); Value *RunSize = B.CreateAdd(RunSizeQ, IsRemAdd); Value* stream = B.CreateCall(GPULaunchKernel, { LLVMGlobal, argArray, RunSize }); B.CreateCall(GPUWaitKernel, stream); LLVM_DEBUG(dbgs() << "Finished processOutlinedLoopCall: " << M); } llvm/lib/Transforms/Tapir/LoweringUtils.cpp +3 −0 Original line number Diff line number Diff line Loading @@ -22,6 +22,7 @@ #include "llvm/Transforms/Tapir/OpenCilkABI.h" #include "llvm/Transforms/Tapir/OpenMPABI.h" #include "llvm/Transforms/Tapir/OpenCLABI.h" #include "llvm/Transforms/Tapir/GPUABI.h" #include "llvm/Transforms/Tapir/Outline.h" #include "llvm/Transforms/Tapir/QthreadsABI.h" #include "llvm/Transforms/Tapir/RealmABI.h" Loading Loading @@ -55,6 +56,8 @@ TapirTarget 
*llvm::getTapirTargetFromID(Module &M, TapirTargetID ID) { return new OpenMPABI(M); case TapirTargetID::OpenCL: return new OpenCLABI(M); case TapirTargetID::GPU: return new GPUABI(M); case TapirTargetID::Qthreads: return new QthreadsABI(M); case TapirTargetID::Realm: Loading Loading
llvm/include/llvm/Transforms/Tapir/GPUABI.h 0 → 100644 +107 −0 Original line number Diff line number Diff line //===- GPUABI.h - Interface to the Kitsune GPU back end ------*- C++ -*--===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This file implements the Kitsune GPU ABI to convert Tapir instructions to // calls into the Kitsune runtime system for NVIDIA GPU code. // //===----------------------------------------------------------------------===// #ifndef GPU_ABI_H_ #define GPU_ABI_H_ #include "llvm/Transforms/Tapir/LoweringUtils.h" #include "llvm/Transforms/Tapir/TapirLoopInfo.h" namespace llvm { class DataLayout; class TargetMachine; class LLVMLoop; class GPUABI : public TapirTarget { LLVMLoop *LOP = nullptr; public: GPUABI(Module &M) : TapirTarget(M) {} ~GPUABI() {} Value *lowerGrainsizeCall(CallInst *GrainsizeCall) override final; void lowerSync(SyncInst &SI) override final; void addHelperAttributes(Function &F) override final {} void preProcessFunction(Function &F, TaskInfo &TI, bool OutliningTapirLoops) override final; void postProcessFunction(Function &F, bool OutliningTapirLoops) override final; void postProcessHelper(Function &F) override final; void preProcessOutlinedTask(Function &F, Instruction *DetachPt, Instruction *TaskFrameCreate, bool IsSpawner) override final; void postProcessOutlinedTask(Function &F, Instruction *DetachPt, Instruction *TaskFrameCreate, bool IsSpawner) override final; void preProcessRootSpawner(Function &F) override final; void postProcessRootSpawner(Function &F) override final; void processSubTaskCall(TaskOutlineInfo &TOI, DominatorTree &DT) override final; LoopOutlineProcessor *getLoopOutlineProcessor(const TapirLoopInfo *TL) const override final; }; class LLVMLoop : public LoopOutlineProcessor { friend class GPUABI; private: 
static unsigned NextKernelID; unsigned MyKernelID; Module LLVMM; TargetMachine *LLVMTargetMachine; GlobalVariable *LLVMGlobal; FunctionCallee GetThreadIdx = nullptr; FunctionCallee GPUInit = nullptr; FunctionCallee GPULaunchKernel = nullptr; FunctionCallee GPUWaitKernel = nullptr; SmallVector<Value *, 5> OrderedInputs; public: LLVMLoop(Module &M); void setupLoopOutlineArgs( Function &F, ValueSet &HelperArgs, SmallVectorImpl<Value *> &HelperInputs, ValueSet &InputSet, const SmallVectorImpl<Value *> &LCArgs, const SmallVectorImpl<Value *> &LCInputs, const ValueSet &TLInputsFixed) override final; unsigned getIVArgIndex(const Function &F, const ValueSet &Args) const override final; unsigned getLimitArgIndex(const Function &F, const ValueSet &Args) const override final; void postProcessOutline(TapirLoopInfo &TL, TaskOutlineInfo &Out, ValueToValueMapTy &VMap) override final; void processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, DominatorTree &DT) override final; }; } #endif /* #include "llvm/Transforms/Tapir/LoopSpawningTI.h" #include "llvm/Transforms/Tapir/LoweringUtils.h" #include "llvm/ADT/DenseMap.h" using namespace llvm; class GPU : public LoopOutlineProcessor { public: GPU(Module &M) : LoopOutlineProcessor(M) {} void postProcessOutline(TapirLoopInfo &TL, TaskOutlineInfo &Out, ValueToValueMapTy &VMap) override final; GlobalVariable* LLVMKernel; }; */
llvm/include/llvm/Transforms/Tapir/TapirTargetIDs.h +1 −0 Original line number Diff line number Diff line Loading @@ -27,6 +27,7 @@ enum class TapirTargetID { OpenCilk, // Lower to OpenCilk ABI OpenCL, // Lower to OpenCL ABI OpenMP, // Lower to OpenMP GPU, // Lower to the Kitsune GPU back end Qthreads, // Lower to Qthreads Realm, // Lower to Realm Last_TapirTargetID Loading
llvm/lib/Transforms/Tapir/CMakeLists.txt +1 −0 Original line number Diff line number Diff line Loading @@ -13,6 +13,7 @@ add_llvm_component_library(LLVMTapirOpts Outline.cpp QthreadsABI.cpp RealmABI.cpp GPUABI.cpp SerialABI.cpp SerializeSmallTasks.cpp Tapir.cpp Loading
llvm/lib/Transforms/Tapir/GPUABI.cpp 0 → 100644 +384 −0 Original line number Diff line number Diff line //===- GPUABI.cpp - Lower Tapir to the Kitsune GPU back end -------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This file implements the Kitsune GPU ABI to convert Tapir instructions to // calls into the Kitsune runtime system for GPU LLVM code. // //===----------------------------------------------------------------------===// #include "llvm/Transforms/Tapir/GPUABI.h" #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/Module.h" #include "llvm/IR/Verifier.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/Tapir/Outline.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/GVN.h" #include "llvm/Transforms/Vectorize.h" #include "llvm/Bitcode/BitcodeWriter.h" #include "llvm/Support/TargetRegistry.h" #include <sstream> using namespace llvm; #define DEBUG_TYPE "openclabi" Value *GPUABI::lowerGrainsizeCall(CallInst *GrainsizeCall) { Value *Grainsize = ConstantInt::get(GrainsizeCall->getType(), 8); // Replace uses of grainsize intrinsic call with this grainsize value. GrainsizeCall->replaceAllUsesWith(Grainsize); return Grainsize; } void GPUABI::lowerSync(SyncInst &SI) { // currently a no-op... 
} void GPUABI::preProcessOutlinedTask(llvm::Function&, llvm::Instruction*, llvm::Instruction*, bool){} void GPUABI::postProcessOutlinedTask(llvm::Function&, llvm::Instruction*, llvm::Instruction*, bool){} void GPUABI::preProcessRootSpawner(llvm::Function&){} void GPUABI::postProcessRootSpawner(llvm::Function&){} void GPUABI::preProcessFunction(Function &F, TaskInfo &TI, bool OutliningTapirLoops) { } void GPUABI::postProcessFunction(Function &F, bool OutliningTapirLoops) { } void GPUABI::postProcessHelper(Function &F) { } void GPUABI::processSubTaskCall(TaskOutlineInfo &TOI, DominatorTree &DT) { } LoopOutlineProcessor *GPUABI::getLoopOutlineProcessor( const TapirLoopInfo *TL) const { if(!LOP) return new LLVMLoop(M); return LOP; } // Static counter for assigning IDs to kernels. unsigned LLVMLoop::NextKernelID = 0; LLVMLoop::LLVMLoop(Module &M) : LoopOutlineProcessor(M, LLVMM), LLVMM("spirvModule", M.getContext()) { // Assign an ID to this kernel. MyKernelID = NextKernelID++; // Setup an LLVM triple. Triple LLVMTriple("spir64-unknown-unknown"); LLVMM.setTargetTriple(LLVMTriple.str()); // Insert runtime-function declarations in LLVM host modules. 
Type *LLVMInt32Ty = Type::getInt32Ty(LLVMM.getContext()); Type *LLVMInt64Ty = Type::getInt64Ty(LLVMM.getContext()); GetThreadIdx = LLVMM.getOrInsertFunction("gtid", LLVMInt32Ty); Function* getid = LLVMM.getFunction("gtid"); Type *VoidTy = Type::getVoidTy(M.getContext()); Type *VoidPtrTy = Type::getInt8PtrTy(M.getContext()); Type *VoidPtrPtrTy = VoidPtrTy->getPointerTo(); Type *Int8Ty = Type::getInt8Ty(M.getContext()); Type *Int32Ty = Type::getInt32Ty(M.getContext()); Type *Int64Ty = Type::getInt64Ty(M.getContext()); GPUInit = M.getOrInsertFunction("initRuntime", VoidTy); GPULaunchKernel = M.getOrInsertFunction("launchKernel", VoidPtrTy, VoidPtrTy, VoidPtrPtrTy, Int64Ty); GPUWaitKernel = M.getOrInsertFunction("waitKernel", VoidTy); } void LLVMLoop::setupLoopOutlineArgs( Function &F, ValueSet &HelperArgs, SmallVectorImpl<Value *> &HelperInputs, ValueSet &InputSet, const SmallVectorImpl<Value *> &LCArgs, const SmallVectorImpl<Value *> &LCInputs, const ValueSet &TLInputsFixed) { // Add the loop control inputs. // The first parameter defines the extent of the index space, i.e., the number // of threads to launch. { Argument *EndArg = cast<Argument>(LCArgs[1]); EndArg->setName("runSize"); HelperArgs.insert(EndArg); Value *InputVal = LCInputs[1]; HelperInputs.push_back(InputVal); // Add loop-control input to the input set. InputSet.insert(InputVal); } // The second parameter defines the start of the index space. { Argument *StartArg = cast<Argument>(LCArgs[0]); StartArg->setName("runStart"); HelperArgs.insert(StartArg); Value *InputVal = LCInputs[0]; HelperInputs.push_back(InputVal); // Add loop-control input to the input set. InputSet.insert(InputVal); } // The third parameter defines the grainsize, if it is not constant. 
if (!isa<ConstantInt>(LCInputs[2])) { Argument *GrainsizeArg = cast<Argument>(LCArgs[2]); GrainsizeArg->setName("runStride"); HelperArgs.insert(GrainsizeArg); Value *InputVal = LCInputs[2]; HelperInputs.push_back(InputVal); // Add loop-control input to the input set. InputSet.insert(InputVal); } // Add the loop control inputs. for (Value *V : TLInputsFixed) { HelperArgs.insert(V); HelperInputs.push_back(V); } for(Value *V : HelperInputs){ OrderedInputs.push_back(V); } } unsigned LLVMLoop::getIVArgIndex(const Function &F, const ValueSet &Args) const { // The argument for the primary induction variable is the second input. return 1; } unsigned LLVMLoop::getLimitArgIndex(const Function &F, const ValueSet &Args) const { // The argument for the loop limit is the first input. return 0; } void LLVMLoop::postProcessOutline(TapirLoopInfo &TL, TaskOutlineInfo &Out, ValueToValueMapTy &VMap) { LLVMContext &Ctx = M.getContext(); Type *Int8Ty = Type::getInt8Ty(Ctx); Type *Int32Ty = Type::getInt32Ty(Ctx); //Type *Int64Ty = Type::getInt64Ty(Ctx); //Type *VoidPtrTy = Type::getInt8PtrTy(Ctx); Task *T = TL.getTask(); Loop *L = TL.getLoop(); BasicBlock *Entry = cast<BasicBlock>(VMap[L->getLoopPreheader()]); BasicBlock *Header = cast<BasicBlock>(VMap[L->getHeader()]); BasicBlock *Exit = cast<BasicBlock>(VMap[TL.getExitBlock()]); PHINode *PrimaryIV = cast<PHINode>(VMap[TL.getPrimaryInduction().first]); Value *PrimaryIVInput = PrimaryIV->getIncomingValueForBlock(Entry); Instruction *ClonedSyncReg = cast<Instruction>( VMap[T->getDetach()->getSyncRegion()]); // We no longer need the cloned sync region. ClonedSyncReg->eraseFromParent(); // Set the helper function to have external linkage. // Get the thread ID for this invocation of Helper. 
IRBuilder<> B(Entry->getTerminator()); Value *ThreadIdx = B.CreateCall(GetThreadIdx, ConstantInt::get(Int32Ty, 0)); //Value *BlockIdx = B.CreateCall(GetBlockIdx, ConstantInt::get(Int32Ty, 0)); //Value *BlockDim = B.CreateCall(GetBlockDim, ConstantInt::get(Int32Ty, 0)); Value *ThreadID = B.CreateIntCast(ThreadIdx, PrimaryIV->getType(), false); Function *Helper = Out.Outline; //Helper->setName("kitsune_spirv_kernel"); // Fix argument pointer types to global, nocapture // TODO: read/write attributes? LLVM_DEBUG(dbgs() << "Function type after globalization of argument pointers << " << *Helper->getType() << "\n"); LLVM_DEBUG(dbgs() << "LLVMM after globalization of argument pointers << " << *Helper->getParent() << "\n"); // Verify that the Thread ID corresponds to a valid iteration. Because Tapir // loops use canonical induction variables, valid iterations range from 0 to // the loop limit with stride 1. The End argument encodes the loop limit. // Get end and grainsize arguments Argument *End; Value *Grainsize; { auto OutlineArgsIter = Helper->arg_begin(); // End argument is the first LC arg. End = &*OutlineArgsIter; // Get the grainsize value, which is either constant or the third LC arg. // ReplaceInstWithInst(gep, GetElementPtrInst::Create( if (unsigned ConstGrainsize = TL.getGrainsize()) Grainsize = ConstantInt::get(PrimaryIV->getType(), ConstGrainsize); else // Grainsize argument is the third LC arg. Grainsize = &*++(++OutlineArgsIter); } ThreadID = B.CreateMul(ThreadID, Grainsize); Value *ThreadEndGrain = B.CreateAdd(ThreadID, Grainsize); Value *Cmp = B.CreateICmp(ICmpInst::ICMP_ULT, ThreadEndGrain, End); Value *ThreadEnd = B.CreateSelect(Cmp, ThreadEndGrain, End); Value *Cond = B.CreateICmpUGE(ThreadID, ThreadEnd); ReplaceInstWithInst(Entry->getTerminator(), BranchInst::Create(Exit, Header, Cond)); // Use the thread ID as the start iteration number for the primary IV. 
PrimaryIVInput->replaceAllUsesWith(ThreadID); // Update cloned loop condition to use the thread-end value. unsigned TripCountIdx = 0; ICmpInst *ClonedCond = cast<ICmpInst>(VMap[TL.getCondition()]); if (ClonedCond->getOperand(0) != ThreadEnd) ++TripCountIdx; ClonedCond->setOperand(TripCountIdx, ThreadEnd); assert(ClonedCond->getOperand(TripCountIdx) == ThreadEnd && "End argument not used in condition"); // Update paramaters with necessary address space modifcations SmallVector<Type*, 8> paramTys; for(auto &arg : Helper->args()){ if (auto *apty = dyn_cast<PointerType>(arg.getType())){ paramTys.push_back(PointerType::get(apty->getPointerElementType(), 1)); } else { paramTys.push_back(arg.getType()); } } ArrayRef<Type*> newParams(paramTys); if(auto *fpty = dyn_cast<PointerType>(Helper->getType())){ if(auto *fty = dyn_cast<FunctionType>(fpty->getPointerElementType())){ LLVM_DEBUG(dbgs() << "Helper is pointer to function " << *Helper->getType() << "\n"); auto *NewHelper = Function::Create( FunctionType::get(fty->getReturnType(), newParams, false), GlobalValue::ExternalLinkage, "kitsune_llvm_kernel", LLVMM); ValueToValueMapTy NewVMap; auto argit = NewHelper->arg_begin(); for (auto &arg : Helper->args()) { NewVMap[&arg] = argit++; } SmallVector< ReturnInst *,5> retinsts; CloneFunctionInto(NewHelper, Helper, NewVMap, false, retinsts); //Helper->mutateType(PointerType::get(FunctionType::get(fty->getReturnType(), newParams, false), 0)); NewHelper->setCallingConv(CallingConv::SPIR_KERNEL); for(auto &arg : NewHelper->args()){ if (auto *apty = dyn_cast<PointerType>(arg.getType())){ arg.addAttr(Attribute::NoCapture); } } Helper = NewHelper; } } } void LLVMLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, DominatorTree &DT) { LLVMContext &Ctx = M.getContext(); Type *Int8Ty = Type::getInt8Ty(Ctx); Type *Int32Ty = Type::getInt32Ty(Ctx); //Type *Int64Ty = Type::getInt64Ty(Ctx); Type *VoidPtrTy = Type::getInt8PtrTy(Ctx); //Task *T = TL.getTask(); //Instruction 
*ReplCall = cast<CallBase>(TOI.ReplCall); LLVM_DEBUG(dbgs() << "Running processOutlinedLoopCall: " << M); Function *Parent = TOI.ReplCall->getFunction(); Value *TripCount = OrderedInputs[0]; BasicBlock* RCBB = TOI.ReplCall->getParent(); BasicBlock* NBB = RCBB->splitBasicBlock(TOI.ReplCall); TOI.ReplCall->eraseFromParent(); IRBuilder<> B(&NBB->front()); // Compile the kernel //LLVMM.getFunctionList().remove(TOI.Outline); TOI.Outline->eraseFromParent(); LLVMContext &LLVMCtx = LLVMM.getContext(); SmallVector<Metadata *, 3> AV; //AV.push_back(ValueAsMetadata::get(Helper)); //AV.push_back(MDString::get(LLVMCtx, "kernel")); //AV.push_back(ValueAsMetadata::get(ConstantInt::get(Type::getInt32Ty(LLVMCtx), // 1))); //Annotations->addOperand(MDNode::get(Ctx, AV)); legacy::PassManager *PassManager = new legacy::PassManager; PassManager->add(createVerifierPass()); // Add in our optimization passes //PassManager->add(createInstructionCombiningPass()); PassManager->add(createReassociatePass()); PassManager->add(createGVNPass()); PassManager->add(createCFGSimplificationPass()); PassManager->add(createLoopVectorizePass()); PassManager->add(createSLPVectorizerPass()); //PassManager->add(createBreakCriticalEdgesPass()); PassManager->add(createConstantPropagationPass()); PassManager->add(createDeadInstEliminationPass()); PassManager->add(createDeadStoreEliminationPass()); //PassManager->add(createInstructionCombiningPass()); PassManager->add(createCFGSimplificationPass()); PassManager->add(createDeadCodeEliminationPass()); PassManager->run(LLVMM); delete PassManager; LLVM_DEBUG(dbgs() << "LLVM Module: " << LLVMM); // generate llvm kernel code SmallVector<char, 1<<20> mbuf; BitcodeWriter bcw(mbuf); bcw.writeModule(LLVMM); Constant *LLVM = ConstantDataArray::getRaw(mbuf.data(), mbuf.size(), Int8Ty); LLVMGlobal = new GlobalVariable(M, LLVM->getType(), true, GlobalValue::PrivateLinkage, LLVM, "gpu_" + Twine("kitsune_spirv_kernel")); //Value* TripCount = isSRetInput(TOI.InputSet[0]) ? 
TOI.InputSet[1] : TOI.InputSet[0]; //Value *RunStart = ReplCall->getArgOperand(getIVArgIndex(*Parent, // TOI.InputSet)); //Value *TripCount = ReplCall->getArgOperand(getLimitArgIndex(*Parent, // TOI.InputSet)); Value *KernelID = ConstantInt::get(Int32Ty, MyKernelID); Value *LLVMPtr = B.CreateBitCast(LLVMGlobal, VoidPtrTy); Constant *kernelSize = ConstantInt::get(Int32Ty, LLVMGlobal->getInitializer()->getType()->getArrayNumElements()); BasicBlock &EBB = Parent->getEntryBlock(); IRBuilder<> EB(&EBB.front()); EB.CreateCall(GPUInit, {}); ArrayType* arrayType = ArrayType::get(VoidPtrTy, OrderedInputs.size()); Value* argArray = B.CreateAlloca(arrayType); for (Value *V : OrderedInputs) { //Value *ElementSize = nullptr; LLVM_DEBUG(dbgs() << "Input set value: " << *V << "\n"); Value *VPtr = B.CreateAlloca(V->getType()); B.CreateStore(V, VPtr); Value *VoidVPtr = B.CreateBitCast(VPtr, VoidPtrTy); } Value *Grainsize = TL.getGrainsize() ? ConstantInt::get(TripCount->getType(), TL.getGrainsize()) : OrderedInputs[2]; Value *RunSizeQ = B.CreateUDiv(TripCount, Grainsize); Value *RunRem = B.CreateURem(TripCount, Grainsize); Value *IsRem = B.CreateICmp(ICmpInst::ICMP_UGT, RunRem, ConstantInt::get(RunRem->getType(), 0)); Value *IsRemAdd = B.CreateZExt(IsRem, RunSizeQ->getType()); Value *RunSize = B.CreateAdd(RunSizeQ, IsRemAdd); Value* stream = B.CreateCall(GPULaunchKernel, { LLVMGlobal, argArray, RunSize }); B.CreateCall(GPUWaitKernel, stream); LLVM_DEBUG(dbgs() << "Finished processOutlinedLoopCall: " << M); }
llvm/lib/Transforms/Tapir/LoweringUtils.cpp +3 −0 Original line number Diff line number Diff line Loading @@ -22,6 +22,7 @@ #include "llvm/Transforms/Tapir/OpenCilkABI.h" #include "llvm/Transforms/Tapir/OpenMPABI.h" #include "llvm/Transforms/Tapir/OpenCLABI.h" #include "llvm/Transforms/Tapir/GPUABI.h" #include "llvm/Transforms/Tapir/Outline.h" #include "llvm/Transforms/Tapir/QthreadsABI.h" #include "llvm/Transforms/Tapir/RealmABI.h" Loading Loading @@ -55,6 +56,8 @@ TapirTarget *llvm::getTapirTargetFromID(Module &M, TapirTargetID ID) { return new OpenMPABI(M); case TapirTargetID::OpenCL: return new OpenCLABI(M); case TapirTargetID::GPU: return new GPUABI(M); case TapirTargetID::Qthreads: return new QthreadsABI(M); case TapirTargetID::Realm: Loading