Unverified Commit dc79e2a9 authored by jeanPerier's avatar jeanPerier Committed by GitHub
Browse files

[flang] avoid introducing iteration dependencies in WHERE and FORALL temporaries (#195053)

This patch improves how the temporaries created for simple FORALL or WHERE constructs, such as the one below, are addressed, so that the addressing does not introduce dependencies between loop iterations.

```
subroutine foo(p1, p2, mask)
  real, pointer :: p1(:), p2(:)
  logical :: mask(:)
  where (mask) p1 = p2
end subroutine
```

Instead of using a stack-like temporary that uses a counter to push and fetch elements, the loop IVs are directly used to address the temporaries. This makes it easier to later vectorize or parallelize those loops.

This is only done when:
- This is not a FORALL with array expressions
- The dynamic type is the same at every iteration
- The WHERE and FORALL do not create loops of depth more than 15.
- If there are FORALLs, their strides are constants 1 or -1.

Note that only the addressing is impacted: the stack-like approach already allocated a temporary big enough for all the iterations regardless of the masking, so the temporary size will remain the same.

Assisted by: Claude
parent 94ca4909
Loading
Loading
Loading
Loading
+47 −7
Original line number Diff line number Diff line
@@ -19,6 +19,7 @@
#ifndef FORTRAN_OPTIMIZER_BUILDER_TEMPORARYSTORAGE_H
#define FORTRAN_OPTIMIZER_BUILDER_TEMPORARYSTORAGE_H

#include "flang/Common/idioms.h"
#include "flang/Optimizer/HLFIR/HLFIROps.h"

namespace fir {
@@ -98,6 +99,34 @@ private:
  mlir::Value temp;
};

/// Multidimensional temporary indexed directly by the enclosing loop induction
/// variables (innermost loop is the first dimension). The indices passed to
/// pushValue/fetch are interpreted in the array's domain, which is described
/// by a fir.shape_shift built from the loop extents and lower bounds. This
/// avoids the loop-carried counter used by HomogeneousScalarStack, keeping
/// loop iterations independent. Limited to Fortran::common::maxRank dimensions.
class ArrayTemp {
public:
  /// Allocate the temporary storage (on the heap when \p allocateOnHeap is
  /// set) and declare it with a shape made of \p lowerBounds and \p extents.
  /// \p extents and \p lowerBounds must have the same size, which must also
  /// be the rank of \p declaredType. \p lengths are kept as the type
  /// parameters used when addressing elements.
  ArrayTemp(mlir::Location loc, fir::FirOpBuilder &builder,
            fir::SequenceType declaredType, llvm::ArrayRef<mlir::Value> extents,
            llvm::ArrayRef<mlir::Value> lowerBounds,
            llvm::ArrayRef<mlir::Value> lengths, bool allocateOnHeap,
            llvm::StringRef name);

  /// Assign the scalar \p value to the element of the temporary designated
  /// by \p indices.
  void pushValue(mlir::Location loc, fir::FirOpBuilder &builder,
                 mlir::Value value, mlir::ValueRange indices);
  /// No-op: elements are addressed through explicit indices, so there is no
  /// fetch position to reset.
  void resetFetchPosition(mlir::Location loc, fir::FirOpBuilder &builder) {}
  /// Load and return the element of the temporary designated by \p indices.
  mlir::Value fetch(mlir::Location loc, fir::FirOpBuilder &builder,
                    mlir::ValueRange indices);
  /// Free the storage if it was allocated on the heap; otherwise do nothing.
  void destroy(mlir::Location loc, fir::FirOpBuilder &builder);
  /// Always true for this kind of temporary.
  bool canBeFetchedAfterPush() const { return true; }

private:
  /// Whether the storage was heap allocated and must be freed by destroy().
  const bool allocateOnHeap;
  /// Base value of the hlfir.declare created for the temporary.
  mlir::Value temp;
  /// Length parameters forwarded to the element designators.
  llvm::SmallVector<mlir::Value> typeParams;
};

/// Structure to hold the value of a single entity.
class SimpleCopy {
public:
@@ -255,15 +284,25 @@ public:
  TemporaryStorage(T &&impl) : impl{std::forward<T>(impl)} {}

  void pushValue(mlir::Location loc, fir::FirOpBuilder &builder,
                 mlir::Value value) {
    std::visit([&](auto &temp) { temp.pushValue(loc, builder, value); }, impl);
                 mlir::Value value, mlir::ValueRange indices = {}) {
    // Only ArrayTemp uses the loop indices; other temps don't take them.
    std::visit(Fortran::common::visitors{
                   [&](ArrayTemp &temp) {
                     temp.pushValue(loc, builder, value, indices);
                   },
                   [&](auto &temp) { temp.pushValue(loc, builder, value); }},
               impl);
  }
  /// Forward the fetch-position reset to whichever storage kind is held in
  /// the variant.
  void resetFetchPosition(mlir::Location loc, fir::FirOpBuilder &builder) {
    auto resetStorage = [&](auto &storage) {
      storage.resetFetchPosition(loc, builder);
    };
    std::visit(resetStorage, impl);
  }
  mlir::Value fetch(mlir::Location loc, fir::FirOpBuilder &builder) {
    return std::visit([&](auto &temp) { return temp.fetch(loc, builder); },
  mlir::Value fetch(mlir::Location loc, fir::FirOpBuilder &builder,
                    mlir::ValueRange indices = {}) {
    return std::visit(
        Fortran::common::visitors{
            [&](ArrayTemp &temp) { return temp.fetch(loc, builder, indices); },
            [&](auto &temp) { return temp.fetch(loc, builder); }},
        impl);
  }
  void destroy(mlir::Location loc, fir::FirOpBuilder &builder) {
@@ -282,8 +321,9 @@ public:
  }

private:
  std::variant<HomogeneousScalarStack, SimpleCopy, SSARegister, AnyValueStack,
               AnyVariableStack, AnyVectorSubscriptStack, AnyAddressStack>
  std::variant<HomogeneousScalarStack, ArrayTemp, SimpleCopy, SSARegister,
               AnyValueStack, AnyVariableStack, AnyVectorSubscriptStack,
               AnyAddressStack>
      impl;
};
} // namespace fir::factory
+82 −0
Original line number Diff line number Diff line
@@ -134,6 +134,88 @@ hlfir::Entity fir::factory::HomogeneousScalarStack::moveStackAsArrayExpr(
  return hlfir::Entity{hlfirExpr};
}

//===----------------------------------------------------------------------===//
// fir::factory::ArrayTemp implementation.
//===----------------------------------------------------------------------===//

/// Build an ArrayTemp: allocate storage of type \p declaredType sized by
/// \p extents (heap or non-heap temporary depending on \p allocateOnHeap),
/// then declare it with a shape carrying \p lowerBounds so that it can be
/// indexed with the values later handed to pushValue/fetch. \p lengths are
/// recorded as the type parameters of the temporary.
fir::factory::ArrayTemp::ArrayTemp(mlir::Location loc,
                                   fir::FirOpBuilder &builder,
                                   fir::SequenceType declaredType,
                                   llvm::ArrayRef<mlir::Value> extents,
                                   llvm::ArrayRef<mlir::Value> lowerBounds,
                                   llvm::ArrayRef<mlir::Value> lengths,
                                   bool allocateOnHeap, llvm::StringRef name)
    : allocateOnHeap{allocateOnHeap},
      typeParams{lengths.begin(), lengths.end()} {
  assert(extents.size() == lowerBounds.size() &&
         "extents and lowerBounds must have the same size");
  assert(extents.size() == declaredType.getDimension() &&
         "declared type rank must match the number of extents");

  // Pick the allocation flavor requested by the caller.
  mlir::Value storage =
      allocateOnHeap
          ? builder.createHeapTemporary(loc, declaredType, name, extents,
                                        lengths)
          : builder.createTemporary(loc, declaredType, name, extents, lengths);

  // The shape carries both lower bounds and extents, so the declared entity
  // accepts the indices given to pushValue/fetch without any rebasing.
  mlir::Value shapeWithBounds = builder.genShape(loc, lowerBounds, extents);
  auto declareOp = hlfir::DeclareOp::create(builder, loc, storage, name,
                                            shapeWithBounds, lengths);
  temp = declareOp.getBase();
}

/// Emit an hlfir.designate addressing the element of \p temp selected by
/// \p indices, and return it. Each index is first converted to the builder's
/// index type. The indices are expressed in the array domain of \p temp, i.e.
/// relative to the lower bounds it was declared with. \p typeParams are
/// forwarded to the designate operation.
static mlir::Value genArrayTempElementAddr(mlir::Location loc,
                                           fir::FirOpBuilder &builder,
                                           mlir::Value temp,
                                           mlir::ValueRange indices,
                                           mlir::ValueRange typeParams) {
  mlir::Type indexType = builder.getIndexType();
  llvm::SmallVector<mlir::Value> convertedIndices;
  convertedIndices.reserve(indices.size());
  for (std::size_t dim = 0, rank = indices.size(); dim < rank; ++dim)
    convertedIndices.push_back(
        builder.createConvert(loc, indexType, indices[dim]));

  // The result is a reference to one element of the temporary.
  mlir::Type elementRefType =
      fir::ReferenceType::get(hlfir::Entity{temp}.getFortranElementType());
  return hlfir::DesignateOp::create(builder, loc, elementRefType, temp,
                                    convertedIndices, typeParams);
}

/// Store the scalar \p value into the element of the temporary designated by
/// \p indices. Only scalars of intrinsic type are handled; any other type
/// hits a TODO.
void fir::factory::ArrayTemp::pushValue(mlir::Location loc,
                                        fir::FirOpBuilder &builder,
                                        mlir::Value value,
                                        mlir::ValueRange indices) {
  hlfir::Entity scalar{value};
  assert(scalar.isScalar() && "cannot use ArrayTemp with array");
  // Same restriction as HomogeneousScalarStack: non-intrinsic (derived)
  // types are not handled by this code path.
  const bool isIntrinsic = scalar.hasIntrinsicType();
  if (!isIntrinsic)
    TODO(loc, "creating ArrayTemp for derived types");
  mlir::Value elementAddr =
      genArrayTempElementAddr(loc, builder, temp, indices, typeParams);
  hlfir::AssignOp::create(builder, loc, value, elementAddr);
}

/// Return the value of the element of the temporary designated by
/// \p indices: the element is addressed with an hlfir.designate and then
/// read through hlfir::loadTrivialScalar.
mlir::Value fir::factory::ArrayTemp::fetch(mlir::Location loc,
                                           fir::FirOpBuilder &builder,
                                           mlir::ValueRange indices) {
  hlfir::Entity element{
      genArrayTempElementAddr(loc, builder, temp, indices, typeParams)};
  return hlfir::loadTrivialScalar(loc, builder, element);
}

/// Release the temporary: emit a fir.freemem on the declared storage when it
/// was heap allocated. Nothing is emitted otherwise.
void fir::factory::ArrayTemp::destroy(mlir::Location loc,
                                      fir::FirOpBuilder &builder) {
  if (!allocateOnHeap)
    return;
  // The raw allocation is the memref operand of the hlfir.declare that
  // produced `temp`.
  auto declareOp = temp.getDefiningOp<hlfir::DeclareOp>();
  assert(declareOp && "temp must have been declared");
  fir::FreeMemOp::create(builder, loc, declareOp.getMemref());
}

//===----------------------------------------------------------------------===//
// fir::factory::SimpleCopy implementation.
//===----------------------------------------------------------------------===//
+86 −9
Original line number Diff line number Diff line
@@ -18,6 +18,7 @@
//===----------------------------------------------------------------------===//

#include "ScheduleOrderedAssignments.h"
#include "flang/Common/Fortran-consts.h"
#include "flang/Optimizer/Builder/FIRBuilder.h"
#include "flang/Optimizer/Builder/HLFIRTools.h"
#include "flang/Optimizer/Builder/TemporaryStorage.h"
@@ -257,6 +258,11 @@ private:
  bool currentLoopNestIterationNumberCanBeComputed(
      llvm::SmallVectorImpl<fir::DoLoopOp> &loopNest);

  /// Return the induction variables of the enclosing fir.do_loop nest at the
  /// current insertion point, innermost first (same order as
  /// currentLoopNestIterationNumberCanBeComputed).
  llvm::SmallVector<mlir::Value> getLoopIndices();

  template <typename T>
  fir::factory::TemporaryStorage *insertSavedEntity(mlir::Region &region,
                                                    T &&temp) {
@@ -669,7 +675,8 @@ OrderedAssignmentRewriter::getIfSaved(mlir::Region &region) {
  // If the region was saved in a previous run, fetch the saved value.
  if (auto temp = savedEntities.find(&region); temp != savedEntities.end()) {
    doBeforeLoopNest([&]() { temp->second.resetFetchPosition(loc, builder); });
    return ValueAndCleanUp{temp->second.fetch(loc, builder), std::nullopt};
    return ValueAndCleanUp{temp->second.fetch(loc, builder, getLoopIndices()),
                           std::nullopt};
  }
  return std::nullopt;
}
@@ -1109,6 +1116,61 @@ computeLoopNestIterationNumber(mlir::Location loc, fir::FirOpBuilder &builder,
  return loopExtent;
}

/// Look through any chain of fir.convert operations defining \p value and
/// return the integer constant found underneath, if there is one. Returns
/// std::nullopt when the underlying value is not a known integer constant.
static std::optional<int64_t> unwrapConstantInt(mlir::Value value) {
  mlir::Value stripped = value;
  for (;;) {
    auto convertOp = stripped.getDefiningOp<fir::ConvertOp>();
    if (!convertOp)
      break;
    stripped = convertOp.getValue();
  }
  return fir::getIntIfConstant(stripped);
}

/// Compute the extents and lower bounds of \p loopNest, in the same order as
/// \p loopNest (innermost first). The lower bound of each dimension is the
/// smallest induction variable value, so that the loop induction variable
/// can directly index the temp via fir.shape_shift. This only works when
/// every loop has a unit step: for step +1 the smallest iv is the loop's
/// lower bound; for step -1 it is the loop's upper bound.
///
/// Returns true and appends one entry per loop to \p extents and
/// \p lowerBounds on success. Returns false when any loop has a non-unit or
/// non-constant step, signalling that the caller should fall back to a
/// counter-based temp. In that case no operation is emitted and \p extents
/// and \p lowerBounds are left unmodified.
static bool computeLoopNestExtentsAndLowerBounds(
    mlir::Location loc, fir::FirOpBuilder &builder,
    llvm::ArrayRef<fir::DoLoopOp> loopNest,
    llvm::SmallVectorImpl<mlir::Value> &extents,
    llvm::SmallVectorImpl<mlir::Value> &lowerBounds) {
  // First pass: validate every step before generating any IR, so that a
  // rejected loop nest does not leave dead extent computations behind.
  llvm::SmallVector<int64_t> steps;
  steps.reserve(loopNest.size());
  for (fir::DoLoopOp doLoop : loopNest) {
    std::optional<int64_t> step = unwrapConstantInt(doLoop.getStep());
    // Compare against +1/-1 explicitly rather than taking an absolute value.
    if (!step || (*step != 1 && *step != -1))
      return false;
    steps.push_back(*step);
  }

  // Second pass: all steps are +1 or -1, emit the extents and pick the
  // bound holding the smallest induction variable value.
  extents.reserve(loopNest.size());
  lowerBounds.reserve(loopNest.size());
  for (std::size_t i = 0, e = loopNest.size(); i < e; ++i) {
    fir::DoLoopOp doLoop = loopNest[i];
    mlir::Value extent = builder.genExtentFromTriplet(
        loc, doLoop.getLowerBound(), doLoop.getUpperBound(), doLoop.getStep(),
        builder.getIndexType());
    extents.push_back(extent);
    lowerBounds.push_back(steps[i] == 1 ? doLoop.getLowerBound()
                                        : doLoop.getUpperBound());
  }
  return true;
}

/// Collect the induction variables of the fir.do_loop operations found when
/// walking from the innermost construct on the construct stack up to (and
/// including) the outermost one. The result is ordered innermost first and
/// is empty when the construct stack is empty.
llvm::SmallVector<mlir::Value> OrderedAssignmentRewriter::getLoopIndices() {
  llvm::SmallVector<mlir::Value> inductionVars;
  if (constructStack.empty())
    return inductionVars;
  mlir::Operation *outermost = constructStack[0];
  // Walk up the parent chain, stopping once the outermost construct has been
  // visited or when there is no parent left.
  for (mlir::Operation *op = constructStack.back(); op;
       op = op->getParentOp()) {
    if (auto loop = mlir::dyn_cast<fir::DoLoopOp>(op))
      inductionVars.push_back(loop.getInductionVar());
    if (op == outermost)
      break;
  }
  return inductionVars;
}

/// Return a name for temporary storage that indicates in which context
/// the temporary storage was created.
static llvm::StringRef
@@ -1160,11 +1222,27 @@ void OrderedAssignmentRewriter::generateSaveEntity(
    bool loopShapeCanBePreComputed =
        currentLoopNestIterationNumberCanBeComputed(loopNest);
    doBeforeLoopNest([&] {
      /// For simple scalars inside loops whose total iteration number can be
      /// pre-computed, create a rank-1 array outside of the loops. It will be
      /// assigned/fetched inside the loops like a normal Fortran array given
      /// the iteration count.
      if (loopShapeCanBePreComputed && fir::isa_trivial(entityType)) {
      // For simple scalars in a precomputable loop nest, prefer the
      // multidimensional ArrayTemp (indexed by loop induction variables) so
      // there is no loop-carried counter. Fall back to the 1D counter-based
      // HomogeneousScalarStack when the nest is deeper than the maximum
      // fir.array rank or when any loop has a non-unit/non-constant step
      // (in which case the loop induction variable cannot index the temp
      // directly).
      llvm::SmallVector<mlir::Value> tempExtents;
      llvm::SmallVector<mlir::Value> tempLowerBounds;
      if (loopShapeCanBePreComputed && fir::isa_trivial(entityType) &&
          loopNest.size() <= static_cast<size_t>(Fortran::common::maxRank) &&
          computeLoopNestExtentsAndLowerBounds(loc, builder, loopNest,
                                               tempExtents, tempLowerBounds)) {
        auto sequenceType = mlir::cast<fir::SequenceType>(
            builder.getVarLenSeqTy(entityType, /*rank=*/loopNest.size()));
        temp = insertSavedEntity(
            region,
            fir::factory::ArrayTemp{loc, builder, sequenceType, tempExtents,
                                    tempLowerBounds,
                                    /*lengths=*/{}, allocateOnHeap, tempName});
      } else if (loopShapeCanBePreComputed && fir::isa_trivial(entityType)) {
        mlir::Value loopExtent =
            computeLoopNestIterationNumber(loc, builder, loopNest);
        auto sequenceType =
@@ -1174,7 +1252,6 @@ void OrderedAssignmentRewriter::generateSaveEntity(
                                     loc, builder, sequenceType, loopExtent,
                                     /*lenParams=*/{}, allocateOnHeap,
                                     /*stackThroughLoops=*/true, tempName});

      } else {
        // If the number of iteration is not known, or if the values at each
        // iterations are values that may have different shape, type parameters
@@ -1185,8 +1262,8 @@ void OrderedAssignmentRewriter::generateSaveEntity(
      }
    });
    // Inside the loop nest (and any fir.if if there are active masks), copy
    // the value to the temp and do clean-ups for the value if any.
    temp->pushValue(loc, builder, entity);
    // the value to the temp and do clean-ups of the value if any.
    temp->pushValue(loc, builder, entity, getLoopIndices());
  }

  // Delay the clean-up if the entity will be used in the same run (i.e., the
+45 −0
Original line number Diff line number Diff line
! Test that the lower-hlfir-ordered-assignments pass falls back to the
! 1D HomogeneousScalarStack temporary (counter-based) when the FORALL loop
! nest is deeper than Fortran::common::maxRank (15), because fir.array can
! only hold up to maxRank dimensions.
!
! Up to and including maxRank, the new ArrayTemp is used and there is no counter; here we
! verify the opposite: the counter (a fir.alloca index, fir.load/addi/store
! pattern) is restored when the loop nest has 16 levels.
!
! The test uses a rank-8 array of derived type with a rank-8 array component
! to spread 16 indexable dimensions across the FORALL header.
!
! RUN: bbc -emit-hlfir -o - %s | fir-opt --lower-hlfir-ordered-assignments | FileCheck %s

module many_forall_mod
  ! Derived type whose only component is a rank-8 real array.
  type :: t
    real :: c(2,2,2,2,2,2,2,2)
  end type
contains
  ! Assign every a(...)%c(...) element from the element at the mirrored
  ! position: each index k ranges over 1:2 and is read at 3-k.
  subroutine more_than_15_forall(a)
    type(t), intent(inout) :: a(2,2,2,2,2,2,2,2)
    ! 16 index variables in a single FORALL header: i1..i8 subscript the
    ! array `a`, j1..j8 subscript its component `c`.
    forall (i1=1:2, i2=1:2, i3=1:2, i4=1:2, i5=1:2, i6=1:2, i7=1:2, i8=1:2, &
            j1=1:2, j2=1:2, j3=1:2, j4=1:2, j5=1:2, j6=1:2, j7=1:2, j8=1:2)
      a(i1,i2,i3,i4,i5,i6,i7,i8)%c(j1,j2,j3,j4,j5,j6,j7,j8) = &
        a(3-i1,3-i2,3-i3,3-i4,3-i5,3-i6,3-i7,3-i8)%c(3-j1,3-j2,3-j3,3-j4,3-j5,3-j6,3-j7,3-j8)
    end forall
  end subroutine
end module
! With 16 nested loops, the temporary must be the 1D counter-based form
! (HomogeneousScalarStack) instead of a 16D ArrayTemp, since fir.array is
! limited to Fortran::common::maxRank dimensions.
!
! CHECK-LABEL: func.func @_QMmany_forall_modPmore_than_15_forall(
! There must be a counter in memory (fir.alloca index).
! CHECK:         %[[CTR:.*]] = fir.alloca index
! The temporary is a 1D fir.array<?xf32>.
! CHECK:         %[[ALLOC:.*]] = fir.allocmem !fir.array<?xf32>, %{{.*}} {bindc_name = ".tmp.forall", uniq_name = ""}
! Plain fir.shape (no shift), since the temp is indexed by the counter.
! CHECK:         %[[SHAPE:.*]] = fir.shape %{{.*}} : (index) -> !fir.shape<1>
! CHECK:         hlfir.declare %[[ALLOC]](%[[SHAPE]]) {uniq_name = ".tmp.forall"} : (!fir.heap<!fir.array<?xf32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xf32>>, !fir.heap<!fir.array<?xf32>>)
! Inside the loop nest the counter is incremented and the temp is indexed
! through the counter (not directly through the loop induction variables).
! CHECK:         fir.load %[[CTR]] : !fir.ref<index>
! CHECK:         arith.addi %{{.*}}, %{{.*}} : index
! CHECK:         fir.store %{{.*}} to %[[CTR]] : !fir.ref<index>
+207 −0

File added.

Preview size limit exceeded, changes collapsed.

Loading