Commit 921bd299 authored by Johannes Doerfert's avatar Johannes Doerfert
Browse files

[OpenMP] Remove alignment for global <-> local reduction functions

The alignment likely did not help much, but it increases the memory
requirement. Note that half of the affected accesses are performed
by a single thread in each block. The reads are by consecutive threads
in a single block.
parent abe71b77
Loading
Loading
Loading
Loading
+5 −27
Original line number Diff line number Diff line
@@ -85,18 +85,6 @@ public:
  ~ExecutionRuntimeModesRAII() { ExecMode = SavedExecMode; }
};

/// GPU Configuration:  This information can be derived from CUDA registers;
/// however, providing compile-time constants helps generate more efficient
/// code.  For all practical purposes this is fine because the configuration
/// is the same for all known NVPTX architectures.
enum MachineConfiguration : unsigned {
  /// See "llvm/Frontend/OpenMP/OMPGridValues.h" for various related
  /// target-specific Grid Values like GV_Warp_Size, GV_Slot_Size.

  /// Global memory alignment for performance.  Used as a lower bound on the
  /// alignment (a CharUnits quantity) of globalized variables: the effective
  /// alignment is the maximum of this value and the declaration's own
  /// alignment.
  GlobalMemoryAlignment = 128,
};

static const ValueDecl *getPrivateItem(const Expr *RefExpr) {
  RefExpr = RefExpr->IgnoreParens();
  if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(RefExpr)) {
@@ -119,31 +107,23 @@ static const ValueDecl *getPrivateItem(const Expr *RefExpr) {
  return cast<ValueDecl>(ME->getMemberDecl()->getCanonicalDecl());
}


static RecordDecl *buildRecordForGlobalizedVars(
    ASTContext &C, ArrayRef<const ValueDecl *> EscapedDecls,
    ArrayRef<const ValueDecl *> EscapedDeclsForTeams,
    llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
        &MappedDeclsFields, int BufSize) {
        &MappedDeclsFields,
    int BufSize) {
  using VarsDataTy = std::pair<CharUnits /*Align*/, const ValueDecl *>;
  if (EscapedDecls.empty() && EscapedDeclsForTeams.empty())
    return nullptr;
  SmallVector<VarsDataTy, 4> GlobalizedVars;
  for (const ValueDecl *D : EscapedDecls)
    GlobalizedVars.emplace_back(
        CharUnits::fromQuantity(std::max(
            C.getDeclAlign(D).getQuantity(),
            static_cast<CharUnits::QuantityType>(GlobalMemoryAlignment))),
        D);
    GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
  for (const ValueDecl *D : EscapedDeclsForTeams)
    GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
  llvm::stable_sort(GlobalizedVars, [](VarsDataTy L, VarsDataTy R) {
    return L.first > R.first;
  });

  // Build struct _globalized_locals_ty {
  //         /*  globalized vars  */[WarSize] align (max(decl_align,
  //         GlobalMemoryAlignment))
  //         /*  globalized vars  */[WarSize] align (decl_align)
  //         /*  globalized vars  */ for EscapedDeclsForTeams
  //       };
  RecordDecl *GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty");
@@ -182,9 +162,7 @@ static RecordDecl *buildRecordForGlobalizedVars(
          /*BW=*/nullptr, /*Mutable=*/false,
          /*InitStyle=*/ICIS_NoInit);
      Field->setAccess(AS_public);
      llvm::APInt Align(32, std::max(C.getDeclAlign(VD).getQuantity(),
                                     static_cast<CharUnits::QuantityType>(
                                         GlobalMemoryAlignment)));
      llvm::APInt Align(32, Pair.first.getQuantity());
      Field->addAttr(AlignedAttr::CreateImplicit(
          C, /*IsAlignmentExpr=*/true,
          IntegerLiteral::Create(C, Align,
+30 −30
Original line number Diff line number Diff line
@@ -253,7 +253,7 @@ int bar(int n){
// CHECK1-NEXT:    [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], ptr [[TMP4]], i32 0, i32 0
// CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1024 x double], ptr [[E]], i32 0, i32 [[TMP5]]
// CHECK1-NEXT:    [[TMP9:%.*]] = load double, ptr [[TMP7]], align 8
// CHECK1-NEXT:    store double [[TMP9]], ptr [[TMP8]], align 128
// CHECK1-NEXT:    store double [[TMP9]], ptr [[TMP8]], align 8
// CHECK1-NEXT:    ret void
//
//
@@ -294,7 +294,7 @@ int bar(int n){
// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
// CHECK1-NEXT:    [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], ptr [[TMP4]], i32 0, i32 0
// CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1024 x double], ptr [[E]], i32 0, i32 [[TMP5]]
// CHECK1-NEXT:    [[TMP9:%.*]] = load double, ptr [[TMP8]], align 128
// CHECK1-NEXT:    [[TMP9:%.*]] = load double, ptr [[TMP8]], align 8
// CHECK1-NEXT:    store double [[TMP9]], ptr [[TMP7]], align 8
// CHECK1-NEXT:    ret void
//
@@ -583,13 +583,13 @@ int bar(int n){
// CHECK1-NEXT:    [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], ptr [[TMP4]], i32 0, i32 0
// CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1024 x i8], ptr [[C]], i32 0, i32 [[TMP5]]
// CHECK1-NEXT:    [[TMP9:%.*]] = load i8, ptr [[TMP7]], align 1
// CHECK1-NEXT:    store i8 [[TMP9]], ptr [[TMP8]], align 128
// CHECK1-NEXT:    store i8 [[TMP9]], ptr [[TMP8]], align 4
// CHECK1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i64 0, i64 1
// CHECK1-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 8
// CHECK1-NEXT:    [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], ptr [[TMP4]], i32 0, i32 1
// CHECK1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [1024 x float], ptr [[D]], i32 0, i32 [[TMP5]]
// CHECK1-NEXT:    [[TMP13:%.*]] = load float, ptr [[TMP11]], align 4
// CHECK1-NEXT:    store float [[TMP13]], ptr [[TMP12]], align 128
// CHECK1-NEXT:    store float [[TMP13]], ptr [[TMP12]], align 4
// CHECK1-NEXT:    ret void
//
//
@@ -634,13 +634,13 @@ int bar(int n){
// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
// CHECK1-NEXT:    [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], ptr [[TMP4]], i32 0, i32 0
// CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1024 x i8], ptr [[C]], i32 0, i32 [[TMP5]]
// CHECK1-NEXT:    [[TMP9:%.*]] = load i8, ptr [[TMP8]], align 128
// CHECK1-NEXT:    [[TMP9:%.*]] = load i8, ptr [[TMP8]], align 4
// CHECK1-NEXT:    store i8 [[TMP9]], ptr [[TMP7]], align 1
// CHECK1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i64 0, i64 1
// CHECK1-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 8
// CHECK1-NEXT:    [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], ptr [[TMP4]], i32 0, i32 1
// CHECK1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [1024 x float], ptr [[D]], i32 0, i32 [[TMP5]]
// CHECK1-NEXT:    [[TMP13:%.*]] = load float, ptr [[TMP12]], align 128
// CHECK1-NEXT:    [[TMP13:%.*]] = load float, ptr [[TMP12]], align 4
// CHECK1-NEXT:    store float [[TMP13]], ptr [[TMP11]], align 4
// CHECK1-NEXT:    ret void
//
@@ -1156,13 +1156,13 @@ int bar(int n){
// CHECK1-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], ptr [[TMP4]], i32 0, i32 0
// CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1024 x i32], ptr [[A]], i32 0, i32 [[TMP5]]
// CHECK1-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP7]], align 4
// CHECK1-NEXT:    store i32 [[TMP9]], ptr [[TMP8]], align 128
// CHECK1-NEXT:    store i32 [[TMP9]], ptr [[TMP8]], align 4
// CHECK1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i64 0, i64 1
// CHECK1-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 8
// CHECK1-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP4]], i32 0, i32 1
// CHECK1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [1024 x i16], ptr [[B]], i32 0, i32 [[TMP5]]
// CHECK1-NEXT:    [[TMP13:%.*]] = load i16, ptr [[TMP11]], align 2
// CHECK1-NEXT:    store i16 [[TMP13]], ptr [[TMP12]], align 128
// CHECK1-NEXT:    store i16 [[TMP13]], ptr [[TMP12]], align 4
// CHECK1-NEXT:    ret void
//
//
@@ -1207,13 +1207,13 @@ int bar(int n){
// CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
// CHECK1-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], ptr [[TMP4]], i32 0, i32 0
// CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1024 x i32], ptr [[A]], i32 0, i32 [[TMP5]]
// CHECK1-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 128
// CHECK1-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4
// CHECK1-NEXT:    store i32 [[TMP9]], ptr [[TMP7]], align 4
// CHECK1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i64 0, i64 1
// CHECK1-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 8
// CHECK1-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP4]], i32 0, i32 1
// CHECK1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [1024 x i16], ptr [[B]], i32 0, i32 [[TMP5]]
// CHECK1-NEXT:    [[TMP13:%.*]] = load i16, ptr [[TMP12]], align 128
// CHECK1-NEXT:    [[TMP13:%.*]] = load i16, ptr [[TMP12]], align 4
// CHECK1-NEXT:    store i16 [[TMP13]], ptr [[TMP11]], align 2
// CHECK1-NEXT:    ret void
//
@@ -1446,7 +1446,7 @@ int bar(int n){
// CHECK2-NEXT:    [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], ptr [[TMP4]], i32 0, i32 0
// CHECK2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1024 x double], ptr [[E]], i32 0, i32 [[TMP5]]
// CHECK2-NEXT:    [[TMP9:%.*]] = load double, ptr [[TMP7]], align 8
// CHECK2-NEXT:    store double [[TMP9]], ptr [[TMP8]], align 128
// CHECK2-NEXT:    store double [[TMP9]], ptr [[TMP8]], align 8
// CHECK2-NEXT:    ret void
//
//
@@ -1487,7 +1487,7 @@ int bar(int n){
// CHECK2-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 4
// CHECK2-NEXT:    [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], ptr [[TMP4]], i32 0, i32 0
// CHECK2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1024 x double], ptr [[E]], i32 0, i32 [[TMP5]]
// CHECK2-NEXT:    [[TMP9:%.*]] = load double, ptr [[TMP8]], align 128
// CHECK2-NEXT:    [[TMP9:%.*]] = load double, ptr [[TMP8]], align 8
// CHECK2-NEXT:    store double [[TMP9]], ptr [[TMP7]], align 8
// CHECK2-NEXT:    ret void
//
@@ -1776,13 +1776,13 @@ int bar(int n){
// CHECK2-NEXT:    [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], ptr [[TMP4]], i32 0, i32 0
// CHECK2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1024 x i8], ptr [[C]], i32 0, i32 [[TMP5]]
// CHECK2-NEXT:    [[TMP9:%.*]] = load i8, ptr [[TMP7]], align 1
// CHECK2-NEXT:    store i8 [[TMP9]], ptr [[TMP8]], align 128
// CHECK2-NEXT:    store i8 [[TMP9]], ptr [[TMP8]], align 4
// CHECK2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i32 0, i32 1
// CHECK2-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 4
// CHECK2-NEXT:    [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], ptr [[TMP4]], i32 0, i32 1
// CHECK2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [1024 x float], ptr [[D]], i32 0, i32 [[TMP5]]
// CHECK2-NEXT:    [[TMP13:%.*]] = load float, ptr [[TMP11]], align 4
// CHECK2-NEXT:    store float [[TMP13]], ptr [[TMP12]], align 128
// CHECK2-NEXT:    store float [[TMP13]], ptr [[TMP12]], align 4
// CHECK2-NEXT:    ret void
//
//
@@ -1827,13 +1827,13 @@ int bar(int n){
// CHECK2-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 4
// CHECK2-NEXT:    [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], ptr [[TMP4]], i32 0, i32 0
// CHECK2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1024 x i8], ptr [[C]], i32 0, i32 [[TMP5]]
// CHECK2-NEXT:    [[TMP9:%.*]] = load i8, ptr [[TMP8]], align 128
// CHECK2-NEXT:    [[TMP9:%.*]] = load i8, ptr [[TMP8]], align 4
// CHECK2-NEXT:    store i8 [[TMP9]], ptr [[TMP7]], align 1
// CHECK2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i32 0, i32 1
// CHECK2-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 4
// CHECK2-NEXT:    [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], ptr [[TMP4]], i32 0, i32 1
// CHECK2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [1024 x float], ptr [[D]], i32 0, i32 [[TMP5]]
// CHECK2-NEXT:    [[TMP13:%.*]] = load float, ptr [[TMP12]], align 128
// CHECK2-NEXT:    [[TMP13:%.*]] = load float, ptr [[TMP12]], align 4
// CHECK2-NEXT:    store float [[TMP13]], ptr [[TMP11]], align 4
// CHECK2-NEXT:    ret void
//
@@ -2349,13 +2349,13 @@ int bar(int n){
// CHECK2-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], ptr [[TMP4]], i32 0, i32 0
// CHECK2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1024 x i32], ptr [[A]], i32 0, i32 [[TMP5]]
// CHECK2-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP7]], align 4
// CHECK2-NEXT:    store i32 [[TMP9]], ptr [[TMP8]], align 128
// CHECK2-NEXT:    store i32 [[TMP9]], ptr [[TMP8]], align 4
// CHECK2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i32 0, i32 1
// CHECK2-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 4
// CHECK2-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP4]], i32 0, i32 1
// CHECK2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [1024 x i16], ptr [[B]], i32 0, i32 [[TMP5]]
// CHECK2-NEXT:    [[TMP13:%.*]] = load i16, ptr [[TMP11]], align 2
// CHECK2-NEXT:    store i16 [[TMP13]], ptr [[TMP12]], align 128
// CHECK2-NEXT:    store i16 [[TMP13]], ptr [[TMP12]], align 4
// CHECK2-NEXT:    ret void
//
//
@@ -2400,13 +2400,13 @@ int bar(int n){
// CHECK2-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 4
// CHECK2-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], ptr [[TMP4]], i32 0, i32 0
// CHECK2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1024 x i32], ptr [[A]], i32 0, i32 [[TMP5]]
// CHECK2-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 128
// CHECK2-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4
// CHECK2-NEXT:    store i32 [[TMP9]], ptr [[TMP7]], align 4
// CHECK2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i32 0, i32 1
// CHECK2-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 4
// CHECK2-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP4]], i32 0, i32 1
// CHECK2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [1024 x i16], ptr [[B]], i32 0, i32 [[TMP5]]
// CHECK2-NEXT:    [[TMP13:%.*]] = load i16, ptr [[TMP12]], align 128
// CHECK2-NEXT:    [[TMP13:%.*]] = load i16, ptr [[TMP12]], align 4
// CHECK2-NEXT:    store i16 [[TMP13]], ptr [[TMP11]], align 2
// CHECK2-NEXT:    ret void
//
@@ -2639,7 +2639,7 @@ int bar(int n){
// CHECK3-NEXT:    [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], ptr [[TMP4]], i32 0, i32 0
// CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2048 x double], ptr [[E]], i32 0, i32 [[TMP5]]
// CHECK3-NEXT:    [[TMP9:%.*]] = load double, ptr [[TMP7]], align 8
// CHECK3-NEXT:    store double [[TMP9]], ptr [[TMP8]], align 128
// CHECK3-NEXT:    store double [[TMP9]], ptr [[TMP8]], align 8
// CHECK3-NEXT:    ret void
//
//
@@ -2680,7 +2680,7 @@ int bar(int n){
// CHECK3-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 4
// CHECK3-NEXT:    [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], ptr [[TMP4]], i32 0, i32 0
// CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2048 x double], ptr [[E]], i32 0, i32 [[TMP5]]
// CHECK3-NEXT:    [[TMP9:%.*]] = load double, ptr [[TMP8]], align 128
// CHECK3-NEXT:    [[TMP9:%.*]] = load double, ptr [[TMP8]], align 8
// CHECK3-NEXT:    store double [[TMP9]], ptr [[TMP7]], align 8
// CHECK3-NEXT:    ret void
//
@@ -2969,13 +2969,13 @@ int bar(int n){
// CHECK3-NEXT:    [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], ptr [[TMP4]], i32 0, i32 0
// CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2048 x i8], ptr [[C]], i32 0, i32 [[TMP5]]
// CHECK3-NEXT:    [[TMP9:%.*]] = load i8, ptr [[TMP7]], align 1
// CHECK3-NEXT:    store i8 [[TMP9]], ptr [[TMP8]], align 128
// CHECK3-NEXT:    store i8 [[TMP9]], ptr [[TMP8]], align 4
// CHECK3-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i32 0, i32 1
// CHECK3-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 4
// CHECK3-NEXT:    [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], ptr [[TMP4]], i32 0, i32 1
// CHECK3-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2048 x float], ptr [[D]], i32 0, i32 [[TMP5]]
// CHECK3-NEXT:    [[TMP13:%.*]] = load float, ptr [[TMP11]], align 4
// CHECK3-NEXT:    store float [[TMP13]], ptr [[TMP12]], align 128
// CHECK3-NEXT:    store float [[TMP13]], ptr [[TMP12]], align 4
// CHECK3-NEXT:    ret void
//
//
@@ -3020,13 +3020,13 @@ int bar(int n){
// CHECK3-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 4
// CHECK3-NEXT:    [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], ptr [[TMP4]], i32 0, i32 0
// CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2048 x i8], ptr [[C]], i32 0, i32 [[TMP5]]
// CHECK3-NEXT:    [[TMP9:%.*]] = load i8, ptr [[TMP8]], align 128
// CHECK3-NEXT:    [[TMP9:%.*]] = load i8, ptr [[TMP8]], align 4
// CHECK3-NEXT:    store i8 [[TMP9]], ptr [[TMP7]], align 1
// CHECK3-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i32 0, i32 1
// CHECK3-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 4
// CHECK3-NEXT:    [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], ptr [[TMP4]], i32 0, i32 1
// CHECK3-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2048 x float], ptr [[D]], i32 0, i32 [[TMP5]]
// CHECK3-NEXT:    [[TMP13:%.*]] = load float, ptr [[TMP12]], align 128
// CHECK3-NEXT:    [[TMP13:%.*]] = load float, ptr [[TMP12]], align 4
// CHECK3-NEXT:    store float [[TMP13]], ptr [[TMP11]], align 4
// CHECK3-NEXT:    ret void
//
@@ -3542,13 +3542,13 @@ int bar(int n){
// CHECK3-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], ptr [[TMP4]], i32 0, i32 0
// CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2048 x i32], ptr [[A]], i32 0, i32 [[TMP5]]
// CHECK3-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP7]], align 4
// CHECK3-NEXT:    store i32 [[TMP9]], ptr [[TMP8]], align 128
// CHECK3-NEXT:    store i32 [[TMP9]], ptr [[TMP8]], align 4
// CHECK3-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i32 0, i32 1
// CHECK3-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 4
// CHECK3-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP4]], i32 0, i32 1
// CHECK3-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2048 x i16], ptr [[B]], i32 0, i32 [[TMP5]]
// CHECK3-NEXT:    [[TMP13:%.*]] = load i16, ptr [[TMP11]], align 2
// CHECK3-NEXT:    store i16 [[TMP13]], ptr [[TMP12]], align 128
// CHECK3-NEXT:    store i16 [[TMP13]], ptr [[TMP12]], align 4
// CHECK3-NEXT:    ret void
//
//
@@ -3593,13 +3593,13 @@ int bar(int n){
// CHECK3-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 4
// CHECK3-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], ptr [[TMP4]], i32 0, i32 0
// CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2048 x i32], ptr [[A]], i32 0, i32 [[TMP5]]
// CHECK3-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 128
// CHECK3-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4
// CHECK3-NEXT:    store i32 [[TMP9]], ptr [[TMP7]], align 4
// CHECK3-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i32 0, i32 1
// CHECK3-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 4
// CHECK3-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP4]], i32 0, i32 1
// CHECK3-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2048 x i16], ptr [[B]], i32 0, i32 [[TMP5]]
// CHECK3-NEXT:    [[TMP13:%.*]] = load i16, ptr [[TMP12]], align 128
// CHECK3-NEXT:    [[TMP13:%.*]] = load i16, ptr [[TMP12]], align 4
// CHECK3-NEXT:    store i16 [[TMP13]], ptr [[TMP11]], align 2
// CHECK3-NEXT:    ret void
//
+10 −10
Original line number Diff line number Diff line
@@ -1815,7 +1815,7 @@ int foo() {
// IR-GPU-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
// IR-GPU-NEXT:    [[SUM:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], ptr [[TMP4]], i32 0, i32 0
// IR-GPU-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1024 x [10 x [10 x i32]]], ptr [[SUM]], i32 0, i32 [[TMP5]]
// IR-GPU-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 128 [[TMP8]], ptr align 4 [[TMP7]], i64 400, i1 false)
// IR-GPU-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP8]], ptr align 4 [[TMP7]], i64 400, i1 false)
// IR-GPU-NEXT:    ret void
//
//
@@ -1863,7 +1863,7 @@ int foo() {
// IR-GPU-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
// IR-GPU-NEXT:    [[SUM:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], ptr [[TMP4]], i32 0, i32 0
// IR-GPU-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1024 x [10 x [10 x i32]]], ptr [[SUM]], i32 0, i32 [[TMP5]]
// IR-GPU-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP7]], ptr align 128 [[TMP8]], i64 400, i1 false)
// IR-GPU-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP7]], ptr align 4 [[TMP8]], i64 400, i1 false)
// IR-GPU-NEXT:    ret void
//
//
+16 −16

File changed.

Contains only whitespace changes.