Initial GPUABI commit (f0486394) · Commits · llvm-doe / llvm-project

llvm/include/llvm/Transforms/Tapir/GPUABI.h

0 → 100644

+107 −0

Original line number	Diff line number	Diff line
		//===- GPUABI.h - Interface to the Kitsune GPU back end -------*- C++ -*-===//
		//
		// The LLVM Compiler Infrastructure
		//
		// This file is distributed under the University of Illinois Open Source
		// License. See LICENSE.TXT for details.
		//
		//===----------------------------------------------------------------------===//
		//
		// This file implements the Kitsune GPU ABI to convert Tapir instructions to
		// calls into the Kitsune runtime system for NVIDIA GPU code.
		//
		//===----------------------------------------------------------------------===//
		#ifndef GPU_ABI_H_
		#define GPU_ABI_H_

		#include "llvm/Transforms/Tapir/LoweringUtils.h"
		#include "llvm/Transforms/Tapir/TapirLoopInfo.h"

		namespace llvm {

		class DataLayout;
		class TargetMachine;
		class LLVMLoop;

		/// Tapir target for the Kitsune GPU back end.
		///
		/// The task-level hooks declared here are empty in GPUABI.cpp; loop lowering
		/// is delegated to the LLVMLoop outline processor handed out by
		/// getLoopOutlineProcessor().
		class GPUABI : public TapirTarget {
		  /// Loop-outline processor returned by getLoopOutlineProcessor() when set.
		  /// No code in this class assigns it.
		  LLVMLoop *LOP = nullptr;

		public:
		  GPUABI(Module &M) : TapirTarget(M) {}
		  ~GPUABI() {}

		  /// Replaces the grainsize intrinsic call with a constant and returns it.
		  Value *lowerGrainsizeCall(CallInst *GrainsizeCall) override final;
		  void lowerSync(SyncInst &SI) override final;

		  void addHelperAttributes(Function &F) override final {}
		  void preProcessFunction(Function &F, TaskInfo &TI,
		                          bool OutliningTapirLoops) override final;
		  void postProcessFunction(Function &F, bool OutliningTapirLoops)
		      override final;
		  void postProcessHelper(Function &F) override final;
		  void preProcessOutlinedTask(Function &F, Instruction *DetachPt,
		                              Instruction *TaskFrameCreate,
		                              bool IsSpawner) override final;
		  void postProcessOutlinedTask(Function &F, Instruction *DetachPt,
		                               Instruction *TaskFrameCreate,
		                               bool IsSpawner) override final;
		  void preProcessRootSpawner(Function &F) override final;
		  void postProcessRootSpawner(Function &F) override final;

		  void processSubTaskCall(TaskOutlineInfo &TOI, DominatorTree &DT)
		      override final;

		  /// Returns the processor used to outline the Tapir loop \p TL.
		  LoopOutlineProcessor *
		  getLoopOutlineProcessor(const TapirLoopInfo *TL) const override final;
		};

		/// Loop-outline processor that turns an outlined Tapir loop into a kernel in
		/// a separate LLVM module and emits the host-side runtime calls to launch it.
		class LLVMLoop : public LoopOutlineProcessor {
		  friend class GPUABI;

		private:
		  /// Counter used to hand out MyKernelID values.
		  static unsigned NextKernelID;
		  /// ID taken from NextKernelID when this processor is constructed.
		  unsigned MyKernelID = 0;
		  /// Module that receives the kernel function.
		  /// NOTE(review): the constructor passes a reference to this member to the
		  /// LoopOutlineProcessor base before the member itself is constructed.
		  Module LLVMM;
		  /// Not assigned by any code visible alongside this class.
		  TargetMachine *LLVMTargetMachine = nullptr;
		  /// Host-module global holding the serialized kernel module; created in
		  /// processOutlinedLoopCall().
		  GlobalVariable *LLVMGlobal = nullptr;

		  // Runtime entry points, filled in by the constructor.
		  FunctionCallee GetThreadIdx = nullptr;
		  FunctionCallee GPUInit = nullptr;
		  FunctionCallee GPULaunchKernel = nullptr;
		  FunctionCallee GPUWaitKernel = nullptr;

		  /// Helper inputs in argument order, recorded by setupLoopOutlineArgs().
		  SmallVector<Value *, 5> OrderedInputs;

		public:
		  LLVMLoop(Module &M);

		  void setupLoopOutlineArgs(
		      Function &F, ValueSet &HelperArgs, SmallVectorImpl<Value *> &HelperInputs,
		      ValueSet &InputSet, const SmallVectorImpl<Value *> &LCArgs,
		      const SmallVectorImpl<Value *> &LCInputs,
		      const ValueSet &TLInputsFixed)
		      override final;
		  unsigned getIVArgIndex(const Function &F, const ValueSet &Args) const
		      override final;
		  unsigned getLimitArgIndex(const Function &F, const ValueSet &Args) const
		      override final;
		  void postProcessOutline(TapirLoopInfo &TL, TaskOutlineInfo &Out,
		                          ValueToValueMapTy &VMap) override final;
		  void processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI,
		                               DominatorTree &DT) override final;
		};
		}

		#endif
		/*
		#include "llvm/Transforms/Tapir/LoopSpawningTI.h"
		#include "llvm/Transforms/Tapir/LoweringUtils.h"
		#include "llvm/ADT/DenseMap.h"

		using namespace llvm;

		class GPU : public LoopOutlineProcessor {
		public:
		GPU(Module &M) : LoopOutlineProcessor(M) {}
		void postProcessOutline(TapirLoopInfo &TL, TaskOutlineInfo &Out,
		ValueToValueMapTy &VMap) override final;
		GlobalVariable* LLVMKernel;
		};
		*/

llvm/include/llvm/Transforms/Tapir/TapirTargetIDs.h

+1 −0

Original line number	Diff line number	Diff line
		@@ -27,6 +27,7 @@ enum class TapirTargetID {
		OpenCilk, // Lower to OpenCilk ABI
		OpenCL, // Lower to OpenCL ABI
		OpenMP, // Lower to OpenMP
		GPU, // Lower to Kitsune GPU ABI
		Qthreads, // Lower to Qthreads
		Realm, // Lower to Realm
		Last_TapirTargetID

llvm/lib/Transforms/Tapir/CMakeLists.txt

+1 −0

Original line number	Diff line number	Diff line
		@@ -13,6 +13,7 @@ add_llvm_component_library(LLVMTapirOpts
		Outline.cpp
		QthreadsABI.cpp
		RealmABI.cpp
		GPUABI.cpp
		SerialABI.cpp
		SerializeSmallTasks.cpp
		Tapir.cpp

llvm/lib/Transforms/Tapir/GPUABI.cpp

0 → 100644

+384 −0

Original line number	Diff line number	Diff line
		//===- GPUABI.cpp - Lower Tapir to the Kitsune GPU back end -------------===//
		//
		// The LLVM Compiler Infrastructure
		//
		// This file is distributed under the University of Illinois Open Source
		// License. See LICENSE.TXT for details.
		//
		//===----------------------------------------------------------------------===//
		//
		// This file implements the Kitsune GPU ABI to convert Tapir instructions to
		// calls into the Kitsune runtime system for GPU LLVM code.
		//
		//===----------------------------------------------------------------------===//

		#include "llvm/Transforms/Tapir/GPUABI.h"
		#include "llvm/IR/DebugInfoMetadata.h"
		#include "llvm/IR/LegacyPassManager.h"
		#include "llvm/IR/Module.h"
		#include "llvm/IR/Verifier.h"
		#include "llvm/Target/TargetMachine.h"
		#include "llvm/Transforms/Tapir/Outline.h"
		#include "llvm/Transforms/Utils/BasicBlockUtils.h"
		#include "llvm/Transforms/Scalar.h"
		#include "llvm/Transforms/Scalar/GVN.h"
		#include "llvm/Transforms/Vectorize.h"
		#include "llvm/Bitcode/BitcodeWriter.h"
		#include "llvm/Support/TargetRegistry.h"
		#include <sstream>

		using namespace llvm;

		// Tag used by the LLVM_DEBUG output in this file.
		#define DEBUG_TYPE "gpuabi"

		/// Lower a grainsize intrinsic call to the fixed grainsize 8.
		///
		/// \param GrainsizeCall the grainsize call whose uses are rewritten.
		/// \returns the constant that now stands in for \p GrainsizeCall.
		Value *GPUABI::lowerGrainsizeCall(CallInst *GrainsizeCall) {
		  Value *Grainsize = ConstantInt::get(GrainsizeCall->getType(), 8);

		  // Replace uses of grainsize intrinsic call with this grainsize value.
		  GrainsizeCall->replaceAllUsesWith(Grainsize);
		  return Grainsize;
		}

		/// Lower a sync instruction; nothing is emitted at present.
		void GPUABI::lowerSync(SyncInst &SI) {
		  // Deliberately empty: \p SI is left untouched for now.
		}

		/// Hook run before an outlined task is processed; does nothing.
		void GPUABI::preProcessOutlinedTask(llvm::Function &, llvm::Instruction *,
		                                    llvm::Instruction *, bool) {}
		/// Hook run after an outlined task is processed; does nothing.
		void GPUABI::postProcessOutlinedTask(llvm::Function &, llvm::Instruction *,
		                                     llvm::Instruction *, bool) {}
		/// Hook run before a root spawner is processed; does nothing.
		void GPUABI::preProcessRootSpawner(llvm::Function &) {
		}
		/// Hook run after a root spawner is processed; does nothing.
		void GPUABI::postProcessRootSpawner(llvm::Function &) {
		}

		/// Hook run before a function is lowered; does nothing.
		void GPUABI::preProcessFunction(Function &F, TaskInfo &TI,
		                                bool OutliningTapirLoops) {
		  // Deliberately empty.
		}

		/// Hook run after a function is lowered; does nothing.
		void GPUABI::postProcessFunction(Function &F, bool OutliningTapirLoops) {
		  // Deliberately empty.
		}

		/// Hook run on each helper function after outlining; does nothing.
		void GPUABI::postProcessHelper(Function &F) {
		  // Deliberately empty.
		}

		/// Hook for rewriting the call to an outlined subtask; does nothing.
		void GPUABI::processSubTaskCall(TaskOutlineInfo &TOI, DominatorTree &DT) {
		  // Deliberately empty.
		}

		/// Return the loop-outline processor to use for \p TL: the stored LOP when
		/// one is set, otherwise a freshly allocated LLVMLoop for module M.
		LoopOutlineProcessor *
		GPUABI::getLoopOutlineProcessor(const TapirLoopInfo *TL) const {
		  // NOTE(review): nothing visible here assigns LOP, so each call appears to
		  // allocate a new LLVMLoop; confirm who owns and frees the returned object.
		  if (LOP)
		    return LOP;
		  return new LLVMLoop(M);
		}

		// Static counter for assigning IDs to kernels. Each LLVMLoop constructor
		// reads the current value into MyKernelID and then increments it.
		unsigned LLVMLoop::NextKernelID = 0;

		/// Build a loop-outline processor for host module \p M.
		///
		/// Creates the kernel module (named "spirvModule", triple
		/// spir64-unknown-unknown), assigns this processor a kernel ID, and declares
		/// the runtime entry points: `gtid` in the kernel module and `initRuntime`,
		/// `launchKernel` and `waitKernel` in \p M.
		LLVMLoop::LLVMLoop(Module &M)
		    : LoopOutlineProcessor(M, LLVMM), LLVMM("spirvModule", M.getContext()) {
		  // Assign an ID to this kernel.
		  MyKernelID = NextKernelID++;

		  // Setup an LLVM triple.
		  Triple LLVMTriple("spir64-unknown-unknown");
		  LLVMM.setTargetTriple(LLVMTriple.str());

		  // Kernel-side declaration: i32 gtid().
		  Type *LLVMInt32Ty = Type::getInt32Ty(LLVMM.getContext());
		  GetThreadIdx = LLVMM.getOrInsertFunction("gtid", LLVMInt32Ty);

		  // Host-side runtime declarations.
		  LLVMContext &Ctx = M.getContext();
		  Type *VoidTy = Type::getVoidTy(Ctx);
		  Type *VoidPtrTy = Type::getInt8PtrTy(Ctx);
		  Type *VoidPtrPtrTy = VoidPtrTy->getPointerTo();
		  Type *Int64Ty = Type::getInt64Ty(Ctx);
		  GPUInit = M.getOrInsertFunction("initRuntime", VoidTy);
		  // i8* launchKernel(i8* kernel, i8** args, i64 runSize)
		  GPULaunchKernel = M.getOrInsertFunction("launchKernel", VoidPtrTy, VoidPtrTy, VoidPtrPtrTy, Int64Ty);
		  // void waitKernel(i8* stream): processOutlinedLoopCall passes the handle
		  // returned by launchKernel, so the declaration must accept one pointer.
		  GPUWaitKernel = M.getOrInsertFunction("waitKernel", VoidTy, VoidPtrTy);
		}

		/// Choose the argument order of the outlined helper.
		///
		/// The helper receives, in order: the loop limit ("runSize"), the loop start
		/// ("runStart"), the grainsize ("runStride", only when it is not a constant),
		/// and then every value in \p TLInputsFixed. The resulting input list is also
		/// appended to OrderedInputs.
		void LLVMLoop::setupLoopOutlineArgs(
		    Function &F, ValueSet &HelperArgs, SmallVectorImpl<Value *> &HelperInputs,
		    ValueSet &InputSet, const SmallVectorImpl<Value *> &LCArgs,
		    const SmallVectorImpl<Value *> &LCInputs, const ValueSet &TLInputsFixed) {
		  // Registers loop-control slot Idx as the next helper argument/input pair.
		  auto AddLoopControl = [&](unsigned Idx, const char *Name) {
		    Argument *Arg = cast<Argument>(LCArgs[Idx]);
		    Arg->setName(Name);
		    HelperArgs.insert(Arg);

		    Value *Input = LCInputs[Idx];
		    HelperInputs.push_back(Input);
		    // The loop-control input also belongs to the input set.
		    InputSet.insert(Input);
		  };

		  // First parameter: extent of the index space, i.e. how many threads to
		  // launch.
		  AddLoopControl(1, "runSize");
		  // Second parameter: start of the index space.
		  AddLoopControl(0, "runStart");
		  // Third parameter: the grainsize, passed only when it is not constant.
		  if (!isa<ConstantInt>(LCInputs[2]))
		    AddLoopControl(2, "runStride");

		  // The remaining loop inputs follow the loop-control values.
		  for (Value *Fixed : TLInputsFixed) {
		    HelperArgs.insert(Fixed);
		    HelperInputs.push_back(Fixed);
		  }

		  // Remember the final input order for the host-side launch code.
		  OrderedInputs.append(HelperInputs.begin(), HelperInputs.end());
		}

		/// Index of the helper argument that carries the primary induction variable.
		unsigned LLVMLoop::getIVArgIndex(const Function &F,
		                                 const ValueSet &Args) const {
		  // setupLoopOutlineArgs places the start value ("runStart") second.
		  const unsigned IVArgIndex = 1;
		  return IVArgIndex;
		}

		/// Index of the helper argument that carries the loop limit.
		unsigned LLVMLoop::getLimitArgIndex(const Function &F,
		                                    const ValueSet &Args) const {
		  // setupLoopOutlineArgs places the limit ("runSize") first.
		  const unsigned LimitArgIndex = 0;
		  return LimitArgIndex;
		}

		/// Turn the outlined loop helper into a per-thread kernel.
		///
		/// Inside the helper (Out.Outline) the cloned loop is rewritten so that each
		/// invocation covers iterations [gtid()*Grainsize, min(that+Grainsize, End)).
		/// The helper is then cloned into the kernel module LLVMM as
		/// "kitsune_llvm_kernel", with pointer parameters moved to address space 1,
		/// the SPIR_KERNEL calling convention, and nocapture on pointer arguments.
		///
		/// \param TL   the Tapir loop that was outlined.
		/// \param Out  outline info; Out.Outline is the helper that is rewritten.
		/// \param VMap maps original loop values to their clones in the helper.
		void LLVMLoop::postProcessOutline(TapirLoopInfo &TL, TaskOutlineInfo &Out,
		                                  ValueToValueMapTy &VMap) {
		  Task *T = TL.getTask();
		  Loop *L = TL.getLoop();

		  BasicBlock *Entry = cast<BasicBlock>(VMap[L->getLoopPreheader()]);
		  BasicBlock *Header = cast<BasicBlock>(VMap[L->getHeader()]);
		  BasicBlock *Exit = cast<BasicBlock>(VMap[TL.getExitBlock()]);
		  PHINode *PrimaryIV = cast<PHINode>(VMap[TL.getPrimaryInduction().first]);
		  Value *PrimaryIVInput = PrimaryIV->getIncomingValueForBlock(Entry);
		  Instruction *ClonedSyncReg = cast<Instruction>(
		      VMap[T->getDetach()->getSyncRegion()]);

		  // We no longer need the cloned sync region.
		  ClonedSyncReg->eraseFromParent();

		  // Get the thread ID for this invocation of Helper. The constructor declares
		  // gtid as `i32 gtid()`, so the call must not pass any argument.
		  IRBuilder<> B(Entry->getTerminator());
		  Value *ThreadIdx = B.CreateCall(GetThreadIdx);
		  Value *ThreadID = B.CreateIntCast(ThreadIdx, PrimaryIV->getType(), false);

		  Function *Helper = Out.Outline;
		  // TODO: read/write attributes?
		  LLVM_DEBUG(dbgs() << "Function type after globalization of argument pointers << " << *Helper->getType() << "\n");
		  LLVM_DEBUG(dbgs() << "LLVMM after globalization of argument pointers << " << *Helper->getParent() << "\n");

		  // Verify that the Thread ID corresponds to a valid iteration. Because Tapir
		  // loops use canonical induction variables, valid iterations range from 0 to
		  // the loop limit with stride 1. The End argument encodes the loop limit.
		  Argument *End;
		  Value *Grainsize;
		  {
		    auto OutlineArgsIter = Helper->arg_begin();
		    // End argument is the first LC arg.
		    End = &*OutlineArgsIter;

		    // Get the grainsize value, which is either constant or the third LC arg.
		    if (unsigned ConstGrainsize = TL.getGrainsize())
		      Grainsize = ConstantInt::get(PrimaryIV->getType(), ConstGrainsize);
		    else
		      // Grainsize argument is the third LC arg.
		      Grainsize = &*++(++OutlineArgsIter);
		  }

		  // This thread handles [ThreadID, ThreadEnd), clamped to End.
		  ThreadID = B.CreateMul(ThreadID, Grainsize);
		  Value *ThreadEndGrain = B.CreateAdd(ThreadID, Grainsize);
		  Value *Cmp = B.CreateICmp(ICmpInst::ICMP_ULT, ThreadEndGrain, End);
		  Value *ThreadEnd = B.CreateSelect(Cmp, ThreadEndGrain, End);
		  Value *Cond = B.CreateICmpUGE(ThreadID, ThreadEnd);

		  // Skip the loop entirely when this thread has no iterations.
		  ReplaceInstWithInst(Entry->getTerminator(), BranchInst::Create(Exit, Header,
		                                                                 Cond));
		  // Use the thread ID as the start iteration number for the primary IV.
		  PrimaryIVInput->replaceAllUsesWith(ThreadID);

		  // Update cloned loop condition to use the thread-end value.
		  // NOTE(review): ThreadEnd was created just above, so operand 0 can never
		  // equal it; this always rewrites operand 1 and the assert is trivially
		  // true. Comparing against End may have been intended -- confirm.
		  unsigned TripCountIdx = 0;
		  ICmpInst *ClonedCond = cast<ICmpInst>(VMap[TL.getCondition()]);
		  if (ClonedCond->getOperand(0) != ThreadEnd)
		    ++TripCountIdx;
		  ClonedCond->setOperand(TripCountIdx, ThreadEnd);
		  assert(ClonedCond->getOperand(TripCountIdx) == ThreadEnd &&
		         "End argument not used in condition");

		  // Update parameters with necessary address space modifications: every
		  // pointer parameter moves to address space 1.
		  SmallVector<Type *, 8> paramTys;
		  for (auto &arg : Helper->args()) {
		    if (auto *apty = dyn_cast<PointerType>(arg.getType()))
		      paramTys.push_back(PointerType::get(apty->getPointerElementType(), 1));
		    else
		      paramTys.push_back(arg.getType());
		  }
		  ArrayRef<Type *> newParams(paramTys);
		  if (auto *fpty = dyn_cast<PointerType>(Helper->getType())) {
		    if (auto *fty = dyn_cast<FunctionType>(fpty->getPointerElementType())) {
		      LLVM_DEBUG(dbgs() << "Helper is pointer to function " << *Helper->getType() << "\n");
		      auto *NewHelper = Function::Create(
		          FunctionType::get(fty->getReturnType(), newParams, false),
		          GlobalValue::ExternalLinkage,
		          "kitsune_llvm_kernel",
		          LLVMM);

		      // Map each helper argument to the matching kernel argument and clone
		      // the body into the kernel module.
		      ValueToValueMapTy NewVMap;
		      auto argit = NewHelper->arg_begin();
		      for (auto &arg : Helper->args())
		        NewVMap[&arg] = argit++;
		      SmallVector<ReturnInst *, 5> retinsts;
		      CloneFunctionInto(NewHelper, Helper, NewVMap, false, retinsts);

		      NewHelper->setCallingConv(CallingConv::SPIR_KERNEL);
		      for (auto &arg : NewHelper->args())
		        if (isa<PointerType>(arg.getType()))
		          arg.addAttr(Attribute::NoCapture);
		      // Out.Outline still refers to the original helper in the host module.
		    }
		  }
		}

		/// Replace the call to the outlined loop helper with a kernel launch.
		///
		/// Steps:
		///   1. Split the block at TOI.ReplCall, erase the call, and erase the
		///      host-side helper TOI.Outline.
		///   2. Run a fixed pass pipeline over the kernel module LLVMM and embed its
		///      bitcode in the host module as a private constant global.
		///   3. Insert a call to initRuntime at the top of the parent function.
		///   4. Spill every value in OrderedInputs to a stack slot, collect the slot
		///      addresses in an i8* array, and call launchKernel / waitKernel.
		///
		/// \param TL  the Tapir loop being lowered (used for its grainsize).
		/// \param TOI outline info holding the replacement call and the helper.
		/// \param DT  dominator tree; not used or updated here.
		void LLVMLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI,
		                                       DominatorTree &DT) {
		  LLVMContext &Ctx = M.getContext();
		  Type *Int8Ty = Type::getInt8Ty(Ctx);
		  Type *Int64Ty = Type::getInt64Ty(Ctx);
		  Type *VoidPtrTy = Type::getInt8PtrTy(Ctx);

		  LLVM_DEBUG(dbgs() << "Running processOutlinedLoopCall: " << M);
		  Function *Parent = TOI.ReplCall->getFunction();
		  Value *TripCount = OrderedInputs[0];
		  BasicBlock *RCBB = TOI.ReplCall->getParent();
		  BasicBlock *NBB = RCBB->splitBasicBlock(TOI.ReplCall);
		  TOI.ReplCall->eraseFromParent();
		  IRBuilder<> B(&NBB->front());

		  // The host module no longer needs the outlined helper.
		  TOI.Outline->eraseFromParent();

		  // Verify and optimize the kernel module. The pass manager lives on the
		  // stack so it is destroyed on every exit path.
		  {
		    legacy::PassManager Passes;
		    Passes.add(createVerifierPass());
		    Passes.add(createReassociatePass());
		    Passes.add(createGVNPass());
		    Passes.add(createCFGSimplificationPass());
		    Passes.add(createLoopVectorizePass());
		    Passes.add(createSLPVectorizerPass());
		    Passes.add(createConstantPropagationPass());
		    Passes.add(createDeadInstEliminationPass());
		    Passes.add(createDeadStoreEliminationPass());
		    Passes.add(createCFGSimplificationPass());
		    Passes.add(createDeadCodeEliminationPass());
		    Passes.run(LLVMM);
		  }

		  LLVM_DEBUG(dbgs() << "LLVM Module: " << LLVMM);

		  // Serialize the kernel module to bitcode. The symbol table and string
		  // table must be written as well for the buffer to be a complete bitcode
		  // file.
		  SmallVector<char, 1<<20> mbuf;
		  {
		    BitcodeWriter bcw(mbuf);
		    bcw.writeModule(LLVMM);
		    bcw.writeSymtab();
		    bcw.writeStrtab();
		  }

		  // Embed the bitcode in the host module. The StringRef carries an explicit
		  // length: bitcode contains NUL bytes, so a bare char pointer would be
		  // truncated at the first one.
		  Constant *LLVM = ConstantDataArray::getRaw(
		      StringRef(mbuf.data(), mbuf.size()), mbuf.size(), Int8Ty);
		  LLVMGlobal = new GlobalVariable(M, LLVM->getType(), true,
		                                  GlobalValue::PrivateLinkage, LLVM,
		                                  "gpu_" + Twine("kitsune_spirv_kernel"));
		  Value *LLVMPtr = B.CreateBitCast(LLVMGlobal, VoidPtrTy);

		  // Initialize the runtime at the top of the enclosing function.
		  BasicBlock &EBB = Parent->getEntryBlock();
		  IRBuilder<> EB(&EBB.front());
		  EB.CreateCall(GPUInit, {});

		  // Build the kernel argument array: slot i holds an i8* to a stack copy of
		  // OrderedInputs[i].
		  ArrayType *ArgArrayTy = ArrayType::get(VoidPtrTy, OrderedInputs.size());
		  Value *ArgArray = B.CreateAlloca(ArgArrayTy);
		  unsigned ArgIdx = 0;
		  for (Value *V : OrderedInputs) {
		    LLVM_DEBUG(dbgs() << "Input set value: " << *V << "\n");
		    Value *VPtr = B.CreateAlloca(V->getType());
		    B.CreateStore(V, VPtr);
		    Value *VoidVPtr = B.CreateBitCast(VPtr, VoidPtrTy);
		    Value *Slot =
		        B.CreateConstInBoundsGEP2_32(ArgArrayTy, ArgArray, 0, ArgIdx++);
		    B.CreateStore(VoidVPtr, Slot);
		  }
		  // Decay the array to the i8** that launchKernel is declared to take.
		  Value *ArgsPtr = B.CreateConstInBoundsGEP2_32(ArgArrayTy, ArgArray, 0, 0);

		  Value *Grainsize = TL.getGrainsize() ?
		      ConstantInt::get(TripCount->getType(), TL.getGrainsize()) :
		      OrderedInputs[2];

		  // Number of threads = ceil(TripCount / Grainsize).
		  Value *RunSizeQ = B.CreateUDiv(TripCount, Grainsize);
		  Value *RunRem = B.CreateURem(TripCount, Grainsize);
		  Value *IsRem = B.CreateICmp(ICmpInst::ICMP_UGT, RunRem, ConstantInt::get(RunRem->getType(), 0));
		  Value *IsRemAdd = B.CreateZExt(IsRem, RunSizeQ->getType());
		  Value *RunSize = B.CreateAdd(RunSizeQ, IsRemAdd);
		  // launchKernel is declared with an i64 size parameter.
		  Value *RunSize64 = B.CreateZExtOrTrunc(RunSize, Int64Ty);

		  // Launch the kernel, then wait on the handle launchKernel returned.
		  Value *stream = B.CreateCall(GPULaunchKernel, { LLVMPtr, ArgsPtr, RunSize64 });
		  B.CreateCall(GPUWaitKernel, stream);

		  LLVM_DEBUG(dbgs() << "Finished processOutlinedLoopCall: " << M);
		}

llvm/lib/Transforms/Tapir/LoweringUtils.cpp

+3 −0

Original line number	Diff line number	Diff line
		@@ -22,6 +22,7 @@
		#include "llvm/Transforms/Tapir/OpenCilkABI.h"
		#include "llvm/Transforms/Tapir/OpenMPABI.h"
		#include "llvm/Transforms/Tapir/OpenCLABI.h"
		#include "llvm/Transforms/Tapir/GPUABI.h"
		#include "llvm/Transforms/Tapir/Outline.h"
		#include "llvm/Transforms/Tapir/QthreadsABI.h"
		#include "llvm/Transforms/Tapir/RealmABI.h"
		@@ -55,6 +56,8 @@ TapirTarget *llvm::getTapirTargetFromID(Module &M, TapirTargetID ID) {
		return new OpenMPABI(M);
		case TapirTargetID::OpenCL:
		return new OpenCLABI(M);
		case TapirTargetID::GPU:
		return new GPUABI(M);
		case TapirTargetID::Qthreads:
		return new QthreadsABI(M);
		case TapirTargetID::Realm: