Unverified Commit 9c0e6499 authored by Joseph Huber's avatar Joseph Huber Committed by GitHub
Browse files

[Offloading][NFC] Refactor handling of offloading entries (#72544)

Summary:
This patch is a simple refactoring of code out of the linker wrapper
into a common location. The main motivation behind this change is to
make it easier to change the handling in the future to accept a triple
to be used to emit entries that function on that target.
parent b1e039f3
Loading
Loading
Loading
Loading
+19 −19
Original line number Diff line number Diff line
@@ -10,9 +10,9 @@
// RUN: clang-linker-wrapper --print-wrapped-module --dry-run --host-triple=x86_64-unknown-linux-gnu \
// RUN:   --linker-path=/usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=OPENMP

//      OPENMP: @__start_omp_offloading_entries = external hidden constant %__tgt_offload_entry
// OPENMP-NEXT: @__stop_omp_offloading_entries = external hidden constant %__tgt_offload_entry
// OPENMP-NEXT: @__dummy.omp_offloading.entry = hidden constant [0 x %__tgt_offload_entry] zeroinitializer, section "omp_offloading_entries"
//      OPENMP: @__start_omp_offloading_entries = external hidden constant [0 x %struct.__tgt_offload_entry]
// OPENMP-NEXT: @__stop_omp_offloading_entries = external hidden constant [0 x %struct.__tgt_offload_entry]
// OPENMP-NEXT: @__dummy.omp_offloading_entries = hidden constant [0 x %struct.__tgt_offload_entry] zeroinitializer, section "omp_offloading_entries"
// OPENMP-NEXT: @.omp_offloading.device_image = internal unnamed_addr constant [[[SIZE:[0-9]+]] x i8] c"\10\FF\10\AD{{.*}}"
// OPENMP-NEXT: @.omp_offloading.device_images = internal unnamed_addr constant [1 x %__tgt_device_image] [%__tgt_device_image { ptr @.omp_offloading.device_image, ptr getelementptr inbounds ([[[SIZE]] x i8], ptr @.omp_offloading.device_image, i64 1, i64 0), ptr @__start_omp_offloading_entries, ptr @__stop_omp_offloading_entries }]
// OPENMP-NEXT: @.omp_offloading.descriptor = internal constant %__tgt_bin_desc { i32 1, ptr @.omp_offloading.device_images, ptr @__start_omp_offloading_entries, ptr @__stop_omp_offloading_entries }
@@ -39,10 +39,10 @@

//      CUDA: @.fatbin_image = internal constant [0 x i8] zeroinitializer, section ".nv_fatbin"
// CUDA-NEXT: @.fatbin_wrapper = internal constant %fatbin_wrapper { i32 1180844977, i32 1, ptr @.fatbin_image, ptr null }, section ".nvFatBinSegment", align 8
// CUDA-NEXT: @__dummy.cuda_offloading.entry = hidden constant [0 x %__tgt_offload_entry] zeroinitializer, section "cuda_offloading_entries"
// CUDA-NEXT: @.cuda.binary_handle = internal global ptr null
// CUDA-NEXT: @__start_cuda_offloading_entries = external hidden constant [0 x %__tgt_offload_entry]
// CUDA-NEXT: @__stop_cuda_offloading_entries = external hidden constant [0 x %__tgt_offload_entry]
// CUDA-NEXT: @__start_cuda_offloading_entries = external hidden constant [0 x %struct.__tgt_offload_entry]
// CUDA-NEXT: @__stop_cuda_offloading_entries = external hidden constant [0 x %struct.__tgt_offload_entry]
// CUDA-NEXT: @__dummy.cuda_offloading_entries = hidden constant [0 x %struct.__tgt_offload_entry] zeroinitializer, section "cuda_offloading_entries"
// CUDA-NEXT: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @.cuda.fatbin_reg, ptr null }]

//      CUDA: define internal void @.cuda.fatbin_reg() section ".text.startup" {
@@ -68,13 +68,13 @@

//      CUDA: while.entry:
// CUDA-NEXT:  %entry1 = phi ptr [ @__start_cuda_offloading_entries, %entry ], [ %7, %if.end ]
// CUDA-NEXT:  %1 = getelementptr inbounds %__tgt_offload_entry, ptr %entry1, i64 0, i32 0
// CUDA-NEXT:  %1 = getelementptr inbounds %struct.__tgt_offload_entry, ptr %entry1, i64 0, i32 0
// CUDA-NEXT:  %addr = load ptr, ptr %1, align 8
// CUDA-NEXT:  %2 = getelementptr inbounds %__tgt_offload_entry, ptr %entry1, i64 0, i32 1
// CUDA-NEXT:  %2 = getelementptr inbounds %struct.__tgt_offload_entry, ptr %entry1, i64 0, i32 1
// CUDA-NEXT:  %name = load ptr, ptr %2, align 8
// CUDA-NEXT:  %3 = getelementptr inbounds %__tgt_offload_entry, ptr %entry1, i64 0, i32 2
// CUDA-NEXT:  %3 = getelementptr inbounds %struct.__tgt_offload_entry, ptr %entry1, i64 0, i32 2
// CUDA-NEXT:  %size = load i64, ptr %3, align 4
// CUDA-NEXT:  %4 = getelementptr inbounds %__tgt_offload_entry, ptr %entry1, i64 0, i32 3
// CUDA-NEXT:  %4 = getelementptr inbounds %struct.__tgt_offload_entry, ptr %entry1, i64 0, i32 3
// CUDA-NEXT:  %flag = load i32, ptr %4, align 4
// CUDA-NEXT:  %5 = icmp eq i64 %size, 0
// CUDA-NEXT:  br i1 %5, label %if.then, label %if.else
@@ -105,7 +105,7 @@
// CUDA-NEXT:   br label %if.end

//      CUDA: if.end:
// CUDA-NEXT:   %7 = getelementptr inbounds %__tgt_offload_entry, ptr %entry1, i64 1
// CUDA-NEXT:   %7 = getelementptr inbounds %struct.__tgt_offload_entry, ptr %entry1, i64 1
// CUDA-NEXT:   %8 = icmp eq ptr %7, @__stop_cuda_offloading_entries
// CUDA-NEXT:   br i1 %8, label %while.end, label %while.entry

@@ -121,10 +121,10 @@

//      HIP: @.fatbin_image = internal constant [0 x i8] zeroinitializer, section ".hip_fatbin"
// HIP-NEXT: @.fatbin_wrapper = internal constant %fatbin_wrapper { i32 1212764230, i32 1, ptr @.fatbin_image, ptr null }, section ".hipFatBinSegment", align 8
// HIP-NEXT: @__dummy.hip_offloading.entry = hidden constant [0 x %__tgt_offload_entry] zeroinitializer, section "hip_offloading_entries"
// HIP-NEXT: @.hip.binary_handle = internal global ptr null
// HIP-NEXT: @__start_hip_offloading_entries = external hidden constant [0 x %__tgt_offload_entry]
// HIP-NEXT: @__stop_hip_offloading_entries = external hidden constant [0 x %__tgt_offload_entry]
// HIP-NEXT: @__start_hip_offloading_entries = external hidden constant [0 x %struct.__tgt_offload_entry]
// HIP-NEXT: @__stop_hip_offloading_entries = external hidden constant [0 x %struct.__tgt_offload_entry]
// HIP-NEXT: @__dummy.hip_offloading_entries = hidden constant [0 x %struct.__tgt_offload_entry] zeroinitializer, section "hip_offloading_entries"
// HIP-NEXT: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @.hip.fatbin_reg, ptr null }]

//      HIP: define internal void @.hip.fatbin_reg() section ".text.startup" {
@@ -149,13 +149,13 @@

//      HIP: while.entry:
// HIP-NEXT:   %entry1 = phi ptr [ @__start_hip_offloading_entries, %entry ], [ %7, %if.end ]
// HIP-NEXT:   %1 = getelementptr inbounds %__tgt_offload_entry, ptr %entry1, i64 0, i32 0
// HIP-NEXT:   %1 = getelementptr inbounds %struct.__tgt_offload_entry, ptr %entry1, i64 0, i32 0
// HIP-NEXT:   %addr = load ptr, ptr %1, align 8
// HIP-NEXT:   %2 = getelementptr inbounds %__tgt_offload_entry, ptr %entry1, i64 0, i32 1
// HIP-NEXT:   %2 = getelementptr inbounds %struct.__tgt_offload_entry, ptr %entry1, i64 0, i32 1
// HIP-NEXT:   %name = load ptr, ptr %2, align 8
// HIP-NEXT:   %3 = getelementptr inbounds %__tgt_offload_entry, ptr %entry1, i64 0, i32 2
// HIP-NEXT:   %3 = getelementptr inbounds %struct.__tgt_offload_entry, ptr %entry1, i64 0, i32 2
// HIP-NEXT:   %size = load i64, ptr %3, align 4
// HIP-NEXT:   %4 = getelementptr inbounds %__tgt_offload_entry, ptr %entry1, i64 0, i32 3
// HIP-NEXT:   %4 = getelementptr inbounds %struct.__tgt_offload_entry, ptr %entry1, i64 0, i32 3
// HIP-NEXT:   %flag = load i32, ptr %4, align 4
// HIP-NEXT:   %5 = icmp eq i64 %size, 0
// HIP-NEXT:   br i1 %5, label %if.then, label %if.else
@@ -186,7 +186,7 @@
// HIP-NEXT:   br label %if.end

//      HIP: if.end:
// HIP-NEXT:   %7 = getelementptr inbounds %__tgt_offload_entry, ptr %entry1, i64 1
// HIP-NEXT:   %7 = getelementptr inbounds %struct.__tgt_offload_entry, ptr %entry1, i64 1
// HIP-NEXT:   %8 = icmp eq ptr %7, @__stop_hip_offloading_entries
// HIP-NEXT:   br i1 %8, label %while.end, label %while.entry

+1 −0
Original line number Diff line number Diff line
@@ -15,6 +15,7 @@ set(LLVM_LINK_COMPONENTS
  TargetParser
  CodeGen
  LTO
  FrontendOffloading
  )

set(LLVM_TARGET_DEFINITIONS LinkerWrapperOpts.td)
+22 −94
Original line number Diff line number Diff line
@@ -8,6 +8,7 @@

#include "OffloadWrapper.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/Frontend/Offloading/Utility.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
@@ -39,36 +40,7 @@ enum OffloadEntryKindFlag : uint32_t {
};

IntegerType *getSizeTTy(Module &M) {
  LLVMContext &C = M.getContext();
  switch (M.getDataLayout().getPointerTypeSize(PointerType::getUnqual(C))) {
  case 4u:
    return Type::getInt32Ty(C);
  case 8u:
    return Type::getInt64Ty(C);
  }
  llvm_unreachable("unsupported pointer type size");
}

// struct __tgt_offload_entry {
//   void *addr;
//   char *name;
//   size_t size;
//   int32_t flags;
//   int32_t reserved;
// };
StructType *getEntryTy(Module &M) {
  LLVMContext &C = M.getContext();
  StructType *EntryTy = StructType::getTypeByName(C, "__tgt_offload_entry");
  if (!EntryTy)
    EntryTy =
        StructType::create("__tgt_offload_entry", PointerType::getUnqual(C),
                           PointerType::getUnqual(C), getSizeTTy(M),
                           Type::getInt32Ty(C), Type::getInt32Ty(C));
  return EntryTy;
}

PointerType *getEntryPtrTy(Module &M) {
  return PointerType::getUnqual(getEntryTy(M));
  return M.getDataLayout().getIntPtrType(M.getContext());
}

// struct __tgt_device_image {
@@ -81,9 +53,10 @@ StructType *getDeviceImageTy(Module &M) {
  LLVMContext &C = M.getContext();
  StructType *ImageTy = StructType::getTypeByName(C, "__tgt_device_image");
  if (!ImageTy)
    ImageTy = StructType::create(
        "__tgt_device_image", PointerType::getUnqual(C),
        PointerType::getUnqual(C), getEntryPtrTy(M), getEntryPtrTy(M));
    ImageTy =
        StructType::create("__tgt_device_image", PointerType::getUnqual(C),
                           PointerType::getUnqual(C), PointerType::getUnqual(C),
                           PointerType::getUnqual(C));
  return ImageTy;
}

@@ -101,9 +74,9 @@ StructType *getBinDescTy(Module &M) {
  LLVMContext &C = M.getContext();
  StructType *DescTy = StructType::getTypeByName(C, "__tgt_bin_desc");
  if (!DescTy)
    DescTy = StructType::create("__tgt_bin_desc", Type::getInt32Ty(C),
                                getDeviceImagePtrTy(M), getEntryPtrTy(M),
                                getEntryPtrTy(M));
    DescTy = StructType::create(
        "__tgt_bin_desc", Type::getInt32Ty(C), getDeviceImagePtrTy(M),
        PointerType::getUnqual(C), PointerType::getUnqual(C));
  return DescTy;
}

@@ -151,28 +124,8 @@ PointerType *getBinDescPtrTy(Module &M) {
/// Global variable that represents BinDesc is returned.
GlobalVariable *createBinDesc(Module &M, ArrayRef<ArrayRef<char>> Bufs) {
  LLVMContext &C = M.getContext();
  // Create external begin/end symbols for the offload entries table.
  auto *EntriesB = new GlobalVariable(
      M, getEntryTy(M), /*isConstant*/ true, GlobalValue::ExternalLinkage,
      /*Initializer*/ nullptr, "__start_omp_offloading_entries");
  EntriesB->setVisibility(GlobalValue::HiddenVisibility);
  auto *EntriesE = new GlobalVariable(
      M, getEntryTy(M), /*isConstant*/ true, GlobalValue::ExternalLinkage,
      /*Initializer*/ nullptr, "__stop_omp_offloading_entries");
  EntriesE->setVisibility(GlobalValue::HiddenVisibility);

  // We assume that external begin/end symbols that we have created above will
  // be defined by the linker. But linker will do that only if linker inputs
  // have section with "omp_offloading_entries" name which is not guaranteed.
  // So, we just create dummy zero sized object in the offload entries section
  // to force linker to define those symbols.
  auto *DummyInit =
      ConstantAggregateZero::get(ArrayType::get(getEntryTy(M), 0u));
  auto *DummyEntry = new GlobalVariable(
      M, DummyInit->getType(), true, GlobalVariable::ExternalLinkage, DummyInit,
      "__dummy.omp_offloading.entry");
  DummyEntry->setSection("omp_offloading_entries");
  DummyEntry->setVisibility(GlobalValue::HiddenVisibility);
  auto [EntriesB, EntriesE] =
      offloading::getOffloadEntryArray(M, "omp_offloading_entries");

  auto *Zero = ConstantInt::get(getSizeTTy(M), 0u);
  Constant *ZeroZero[] = {Zero, Zero};
@@ -328,18 +281,6 @@ GlobalVariable *createFatbinDesc(Module &M, ArrayRef<char> Image, bool IsHIP) {
  FatbinDesc->setSection(FatbinWrapperSection);
  FatbinDesc->setAlignment(Align(8));

  // We create a dummy entry to ensure the linker will define the begin / end
  // symbols. The CUDA runtime should ignore the null address if we attempt to
  // register it.
  auto *DummyInit =
      ConstantAggregateZero::get(ArrayType::get(getEntryTy(M), 0u));
  auto *DummyEntry = new GlobalVariable(
      M, DummyInit->getType(), true, GlobalVariable::ExternalLinkage, DummyInit,
      IsHIP ? "__dummy.hip_offloading.entry" : "__dummy.cuda_offloading.entry");
  DummyEntry->setVisibility(GlobalValue::HiddenVisibility);
  DummyEntry->setSection(IsHIP ? "hip_offloading_entries"
                               : "cuda_offloading_entries");

  return FatbinDesc;
}

@@ -368,6 +309,9 @@ GlobalVariable *createFatbinDesc(Module &M, ArrayRef<char> Image, bool IsHIP) {
/// }
Function *createRegisterGlobalsFunction(Module &M, bool IsHIP) {
  LLVMContext &C = M.getContext();
  auto [EntriesB, EntriesE] = offloading::getOffloadEntryArray(
      M, IsHIP ? "hip_offloading_entries" : "cuda_offloading_entries");

  // Get the __cudaRegisterFunction function declaration.
  PointerType *Int8PtrTy = PointerType::get(C, 0);
  PointerType *Int8PtrPtrTy = PointerType::get(C, 0);
@@ -389,22 +333,6 @@ Function *createRegisterGlobalsFunction(Module &M, bool IsHIP) {
  FunctionCallee RegVar = M.getOrInsertFunction(
      IsHIP ? "__hipRegisterVar" : "__cudaRegisterVar", RegVarTy);

  // Create the references to the start / stop symbols defined by the linker.
  auto *EntriesB =
      new GlobalVariable(M, ArrayType::get(getEntryTy(M), 0),
                         /*isConstant*/ true, GlobalValue::ExternalLinkage,
                         /*Initializer*/ nullptr,
                         IsHIP ? "__start_hip_offloading_entries"
                               : "__start_cuda_offloading_entries");
  EntriesB->setVisibility(GlobalValue::HiddenVisibility);
  auto *EntriesE =
      new GlobalVariable(M, ArrayType::get(getEntryTy(M), 0),
                         /*isConstant*/ true, GlobalValue::ExternalLinkage,
                         /*Initializer*/ nullptr,
                         IsHIP ? "__stop_hip_offloading_entries"
                               : "__stop_cuda_offloading_entries");
  EntriesE->setVisibility(GlobalValue::HiddenVisibility);

  auto *RegGlobalsTy = FunctionType::get(Type::getVoidTy(C), Int8PtrPtrTy,
                                         /*isVarArg*/ false);
  auto *RegGlobalsFn =
@@ -427,24 +355,24 @@ Function *createRegisterGlobalsFunction(Module &M, bool IsHIP) {
  auto *EntryCmp = Builder.CreateICmpNE(EntriesB, EntriesE);
  Builder.CreateCondBr(EntryCmp, EntryBB, ExitBB);
  Builder.SetInsertPoint(EntryBB);
  auto *Entry = Builder.CreatePHI(getEntryPtrTy(M), 2, "entry");
  auto *Entry = Builder.CreatePHI(PointerType::getUnqual(C), 2, "entry");
  auto *AddrPtr =
      Builder.CreateInBoundsGEP(getEntryTy(M), Entry,
      Builder.CreateInBoundsGEP(offloading::getEntryTy(M), Entry,
                                {ConstantInt::get(getSizeTTy(M), 0),
                                 ConstantInt::get(Type::getInt32Ty(C), 0)});
  auto *Addr = Builder.CreateLoad(Int8PtrTy, AddrPtr, "addr");
  auto *NamePtr =
      Builder.CreateInBoundsGEP(getEntryTy(M), Entry,
      Builder.CreateInBoundsGEP(offloading::getEntryTy(M), Entry,
                                {ConstantInt::get(getSizeTTy(M), 0),
                                 ConstantInt::get(Type::getInt32Ty(C), 1)});
  auto *Name = Builder.CreateLoad(Int8PtrTy, NamePtr, "name");
  auto *SizePtr =
      Builder.CreateInBoundsGEP(getEntryTy(M), Entry,
      Builder.CreateInBoundsGEP(offloading::getEntryTy(M), Entry,
                                {ConstantInt::get(getSizeTTy(M), 0),
                                 ConstantInt::get(Type::getInt32Ty(C), 2)});
  auto *Size = Builder.CreateLoad(getSizeTTy(M), SizePtr, "size");
  auto *FlagsPtr =
      Builder.CreateInBoundsGEP(getEntryTy(M), Entry,
      Builder.CreateInBoundsGEP(offloading::getEntryTy(M), Entry,
                                {ConstantInt::get(getSizeTTy(M), 0),
                                 ConstantInt::get(Type::getInt32Ty(C), 3)});
  auto *Flags = Builder.CreateLoad(Type::getInt32Ty(C), FlagsPtr, "flag");
@@ -491,16 +419,16 @@ Function *createRegisterGlobalsFunction(Module &M, bool IsHIP) {

  Builder.SetInsertPoint(IfEndBB);
  auto *NewEntry = Builder.CreateInBoundsGEP(
      getEntryTy(M), Entry, ConstantInt::get(getSizeTTy(M), 1));
      offloading::getEntryTy(M), Entry, ConstantInt::get(getSizeTTy(M), 1));
  auto *Cmp = Builder.CreateICmpEQ(
      NewEntry,
      ConstantExpr::getInBoundsGetElementPtr(
          ArrayType::get(getEntryTy(M), 0), EntriesE,
          ArrayType::get(offloading::getEntryTy(M), 0), EntriesE,
          ArrayRef<Constant *>({ConstantInt::get(getSizeTTy(M), 0),
                                ConstantInt::get(getSizeTTy(M), 0)})));
  Entry->addIncoming(
      ConstantExpr::getInBoundsGetElementPtr(
          ArrayType::get(getEntryTy(M), 0), EntriesB,
          ArrayType::get(offloading::getEntryTy(M), 0), EntriesB,
          ArrayRef<Constant *>({ConstantInt::get(getSizeTTy(M), 0),
                                ConstantInt::get(getSizeTTy(M), 0)})),
      &RegGlobalsFn->getEntryBlock());
+10 −1
Original line number Diff line number Diff line
@@ -12,6 +12,10 @@
namespace llvm {
namespace offloading {

/// Returns the type of the offloading entry we use to store kernels and
/// globals that will be registered with the offloading runtime.
StructType *getEntryTy(Module &M);

/// Create an offloading section struct used to register this global at
/// runtime.
///
@@ -33,5 +37,10 @@ namespace offloading {
void emitOffloadingEntry(Module &M, Constant *Addr, StringRef Name,
                         uint64_t Size, int32_t Flags, StringRef SectionName);

/// Creates a pair of globals used to iterate the array of offloading entries by
/// accessing the section variables provided by the linker.
std::pair<GlobalVariable *, GlobalVariable *>
getOffloadEntryArray(Module &M, StringRef SectionName);

} // namespace offloading
} // namespace llvm
+30 −1
Original line number Diff line number Diff line
@@ -16,7 +16,7 @@ using namespace llvm;
using namespace llvm::offloading;

// Shared with the linker wrapper's registration code, which uses this type to
// walk the offloading entry array.
static StructType *getEntryTy(Module &M) {
StructType *offloading::getEntryTy(Module &M) {
  LLVMContext &C = M.getContext();
  StructType *EntryTy =
      StructType::getTypeByName(C, "struct.__tgt_offload_entry");
@@ -65,3 +65,32 @@ void offloading::emitOffloadingEntry(Module &M, Constant *Addr, StringRef Name,
  Entry->setSection(SectionName);
  Entry->setAlignment(Align(1));
}

/// Declares the `__start_<SectionName>` and `__stop_<SectionName>` symbols in
/// \p M and returns them as a (begin, end) pair.
///
/// Both symbols are external, hidden, constant, zero-length arrays of the
/// offloading entry type and carry no initializer. A hidden, zero-length
/// `__dummy.<SectionName>` global is additionally emitted into \p SectionName.
std::pair<GlobalVariable *, GlobalVariable *>
offloading::getOffloadEntryArray(Module &M, StringRef SectionName) {
  // Every global created here shares the same `[0 x entry]` array type.
  ArrayType *EntryArrayTy = ArrayType::get(getEntryTy(M), 0);

  // Declares one hidden, external boundary symbol named Prefix + SectionName.
  auto DeclareBoundary = [&](StringRef Prefix) {
    auto *Boundary = new GlobalVariable(
        M, EntryArrayTy, /*isConstant=*/true, GlobalValue::ExternalLinkage,
        /*Initializer=*/nullptr, Prefix + SectionName);
    Boundary->setVisibility(GlobalValue::HiddenVisibility);
    return Boundary;
  };

  GlobalVariable *Begin = DeclareBoundary("__start_");
  GlobalVariable *End = DeclareBoundary("__stop_");

  // The boundary symbols declared above are expected to be defined by the
  // linker, which is assumed to do so only when some linker input actually
  // contains a section named SectionName. That is not guaranteed, so emit a
  // zero-sized placeholder object into that section to force the symbols to
  // be defined.
  Constant *EmptyEntries = ConstantAggregateZero::get(EntryArrayTy);
  auto *Placeholder = new GlobalVariable(
      M, EntryArrayTy, /*isConstant=*/true, GlobalVariable::ExternalLinkage,
      EmptyEntries, "__dummy." + SectionName);
  Placeholder->setVisibility(GlobalValue::HiddenVisibility);
  Placeholder->setSection(SectionName);

  return {Begin, End};
}