Commit 6c3fee47 authored by Simon Tatham's avatar Simon Tatham
Browse files

[ARM,MVE] Add intrinsics for gather/scatter load/stores.

This patch adds two new families of intrinsics, both of which are
memory accesses taking a vector of locations to load from / store to.

The vldrq_gather_base / vstrq_scatter_base intrinsics take a vector of
base addresses, and an immediate offset to be added consistently to
each one. vldrq_gather_offset / vstrq_scatter_offset take a scalar
base address, and a vector of offsets to add to it. The
'shifted_offset' variants also multiply each offset by the size of the
memory element type, so that the offset vector effectively holds array indices.

At the IR level, these operations are represented by a single set of
four IR intrinsics: {gather,scatter} × {base,offset}. The other
details (signed/unsigned, shift, and memory element size as opposed to
vector element size) are all specified by IR intrinsic polymorphism
and immediate operands, because that made the selection job easier
than making a huge family of similarly named intrinsics.

I considered using the standard IR representations such as
llvm.masked.gather, but they're not a good fit. In order to use
llvm.masked.gather to represent a gather_offset load with element size
smaller than a pointer, you'd have to expand the <8 x i16> vector of
offsets into an <8 x i16*> vector of pointers, which would be split up
during legalization, so you'd spend most of your time undoing the mess
it had made. Also, ISel support for llvm.masked.gather would be easy
enough in a trivial way (you can expand it into a gather-base load
with a zero immediate offset), but instruction-selecting lots of
fiddly idioms back into all the _other_ MVE load instructions would be
much more work. So I think dedicated IR intrinsics are the more
sensible approach, at least for the moment.

On the clang tablegen side, I've added two new features to the
Tablegen source accepted by MveEmitter: a 'CopyKind' type node for
defining a type that varies with the parameter type (it lets you ask
for an unsigned integer type of the same width as the parameter), and
an 'unsignedflag' value node for passing an immediate IR operand which
is 0 for a signed integer type or 1 for an unsigned one. That lets me
write each kind of intrinsic just once and get all its subtypes and
immediate arguments generated automatically.

Also I've tweaked the handling of pointer-typed values in the code
generation part of MveEmitter: they're generated as Address rather
than Value (i.e. including an alignment) so that they can be given to
the ordinary IR load and store operations, but I'd omitted the code to
convert them back to Value when they're going to be used as an
argument to an IR intrinsic.

On the MC side, I've enhanced MVEVectorVTInfo so that it can tell you
not only the full assembly-language suffix for a given vector type
(like 's32' or 'u16') but also the numeric-only one used by store
instructions (just '32' or '16').

Reviewers: dmgreen

Subscribers: kristof.beyls, hiraditya, cfe-commits, llvm-commits

Tags: #clang, #llvm

Differential Revision: https://reviews.llvm.org/D69791
parent f0c6890f
Loading
Loading
Loading
Loading
+149 −13
Original line number Diff line number Diff line
@@ -72,22 +72,158 @@ def vcvt#half#q_m_f16: Intrinsic<

} // loop over half = "b", "t"

let params = T.All32, pnt = PNT_None in
def vldrwq_gather_base_wb: Intrinsic<
    Vector, (args Ptr<VecOf<Unsigned<Scalar>>>:$addr, imm_mem7bit<4>:$offset),
    (seq (IRInt<"vldr_gather_base_wb", [Vector, VecOf<Unsigned<Scalar>>]>
multiclass gather_base<list<Type> types, int size> {
  let params = types, pnt = PNT_None in {
    def _gather_base: Intrinsic<
      Vector, (args UVector:$addr, imm_mem7bit<size>:$offset),
      (IRInt<"vldr_gather_base", [Vector, UVector]> $addr, $offset)>;

    def _gather_base_z: Intrinsic<
      Vector, (args UVector:$addr, imm_mem7bit<size>:$offset, Predicate:$pred),
      (IRInt<"vldr_gather_base_predicated", [Vector, UVector, Predicate]>
          $addr, $offset, $pred)>;

    def _gather_base_wb: Intrinsic<
      Vector, (args Ptr<UVector>:$addr, imm_mem7bit<size>:$offset),
      (seq (IRInt<"vldr_gather_base_wb", [Vector, UVector]>
               (load $addr), $offset):$pair,
           (store (xval $pair, 1), $addr),
           (xval $pair, 0))>;

let params = T.All64, pnt = PNT_None in
def vldrdq_gather_base_wb_z: Intrinsic<
    Vector, (args Ptr<VecOf<Unsigned<Scalar>>>:$addr, imm_mem7bit<8>:$offset,
    def _gather_base_wb_z: Intrinsic<
      Vector, (args Ptr<UVector>:$addr, imm_mem7bit<size>:$offset,
                    Predicate:$pred),
    (seq (IRInt<"vldr_gather_base_wb_predicated", [Vector, VecOf<Unsigned<Scalar>>, Predicate]>
      (seq (IRInt<"vldr_gather_base_wb_predicated",
                  [Vector, UVector, Predicate]>
               (load $addr), $offset, $pred):$pair,
           (store (xval $pair, 1), $addr),
           (xval $pair, 0))>;
  }
}

// Instantiate the gather-base loads: 32-bit element types use a 4-byte
// offset granule, 64-bit element types an 8-byte granule (the 'size'
// parameter of imm_mem7bit, checked by Sema as "multiple of 4/8").
defm vldrwq: gather_base<T.All32, 4>;
defm vldrdq: gather_base<T.All64, 8>;

// Scatter stores taking a vector of base addresses ($addr) plus a constant
// immediate byte offset added to every lane, storing the lanes of $data to
// the resulting locations. 'size' parameterizes the offset granule: the
// immediate is constrained by imm_mem7bit<size> to a multiple of 'size'
// in the range [0, 127*size].
multiclass scatter_base<list<Type> types, int size> {
  let params = types in {
    // Plain scatter-store.
    def _scatter_base: Intrinsic<
      Void, (args UVector:$addr, imm_mem7bit<size>:$offset, Vector:$data),
      (IRInt<"vstr_scatter_base", [UVector, Vector]> $addr, $offset, $data)>;

    // Predicated form (_p): an extra Predicate argument, forwarded to the
    // "..._predicated" IR intrinsic.
    def _scatter_base_p: Intrinsic<
      Void, (args UVector:$addr, imm_mem7bit<size>:$offset, Vector:$data,
                  Predicate:$pred),
      (IRInt<"vstr_scatter_base_predicated", [UVector, Vector, Predicate]>
          $addr, $offset, $data, $pred)>;

    // Write-back form (_wb): $addr is now a *pointer* to the base-address
    // vector. The vector is loaded from it, the IR intrinsic's return value
    // (the updated addresses, bound to $wbaddr) is stored back through it.
    def _scatter_base_wb: Intrinsic<
      Void, (args Ptr<UVector>:$addr, imm_mem7bit<size>:$offset, Vector:$data),
      (seq (IRInt<"vstr_scatter_base_wb", [UVector, Vector]>
                 (load $addr), $offset, $data):$wbaddr,
           (store $wbaddr, $addr))>;

    // Predicated write-back form.
    def _scatter_base_wb_p: Intrinsic<
      Void, (args Ptr<UVector>:$addr, imm_mem7bit<size>:$offset,
                    Vector:$data, Predicate:$pred),
      (seq (IRInt<"vstr_scatter_base_wb_predicated",
                  [UVector, Vector, Predicate]>
               (load $addr), $offset, $data, $pred):$wbaddr,
           (store $wbaddr, $addr))>;
  }
}

// Instantiate the scatter-base stores: 4-byte offset granule for 32-bit
// element types, 8-byte granule for 64-bit element types.
defm vstrwq: scatter_base<T.All32, 4>;
defm vstrdq: scatter_base<T.All64, 8>;

// Gather loads taking a scalar base pointer and a vector of unsigned
// offsets that are added to it unscaled. 'memtype' is the type of each
// element in memory, which may be narrower than the vector element type;
// CPtr<CopyKind<memtype, Scalar>> is a const pointer to a scalar of
// memtype's width whose kind (signed/unsigned/float) matches Scalar.
// The trailing IR operands are the memory element size (memtype.size),
// a shift amount of 0, and (unsignedflag Scalar), which expands to 1 for
// an unsigned Scalar type and 0 otherwise.
multiclass gather_offset_unshifted<list<Type> types, PrimitiveType memtype> {
  let params = types in {
    def _gather_offset: Intrinsic<
      Vector, (args CPtr<CopyKind<memtype, Scalar>>:$base, UVector:$offsets),
      (IRInt<"vldr_gather_offset",
             [Vector, CPtr<CopyKind<memtype, Scalar>>, UVector]>
          $base, $offsets, memtype.size, 0, (unsignedflag Scalar))>;
    // Predicated variant (_z suffix): extra Predicate operand appended.
    def _gather_offset_z: Intrinsic<
      Vector, (args CPtr<CopyKind<memtype, Scalar>>:$base, UVector:$offsets,
                    Predicate:$pred),
      (IRInt<"vldr_gather_offset_predicated",
             [Vector, CPtr<CopyKind<memtype, Scalar>>, UVector, Predicate]>
          $base, $offsets, memtype.size, 0, (unsignedflag Scalar), $pred)>;
  }
}

// Shifted-offset gather loads: identical to gather_offset_unshifted except
// that the non-zero 'shift' value is passed as the shift-amount IR operand,
// so each offset is scaled by 2^shift before being added to the base.
// Uses the same underlying "vldr_gather_offset" IR intrinsics; only the
// immediate shift operand differs.
multiclass gather_offset_shifted<list<Type> types, PrimitiveType memtype,
                                 int shift> {
  let params = types in {
    def _gather_shifted_offset: Intrinsic<
      Vector, (args CPtr<CopyKind<memtype, Scalar>>:$base, UVector:$offsets),
      (IRInt<"vldr_gather_offset",
             [Vector, CPtr<CopyKind<memtype, Scalar>>, UVector]>
          $base, $offsets, memtype.size, shift, (unsignedflag Scalar))>;
    // Predicated variant (_z suffix).
    def _gather_shifted_offset_z: Intrinsic<
      Vector, (args CPtr<CopyKind<memtype, Scalar>>:$base, UVector:$offsets,
                    Predicate:$pred),
      (IRInt<"vldr_gather_offset_predicated",
             [Vector, CPtr<CopyKind<memtype, Scalar>>, UVector, Predicate]>
          $base, $offsets, memtype.size, shift, (unsignedflag Scalar), $pred)>;
  }
}

// Convenience multiclass emitting both the unshifted and the shifted
// gather-offset families under the same name prefix (defm "").
multiclass gather_offset_both<list<Type> types, PrimitiveType memtype,
                              int shift> {
  defm "": gather_offset_unshifted<types, memtype>;
  defm "": gather_offset_shifted<types, memtype, shift>;
}

// Instantiate the gather-offset loads per memory element width. Byte loads
// get only the unshifted form; wider loads get both forms, with
// shift = log2(element size in bytes): 1 for u16, 2 for u32, 3 for u64.
// Widening loads are expressed by listing vector types wider than memtype
// (e.g. vldrbq over T.Int16/T.Int32).
defm vldrbq: gather_offset_unshifted<!listconcat(T.All8, T.Int16, T.Int32), u8>;
defm vldrhq: gather_offset_both<!listconcat(T.All16, T.Int32), u16, 1>;
defm vldrwq: gather_offset_both<T.All32, u32, 2>;
defm vldrdq: gather_offset_both<T.Int64, u64, 3>;

// Scatter stores taking a scalar base pointer, a vector of unsigned
// offsets (added to the base unscaled) and a data vector.
// Ptr<CopyKind<memtype, Scalar>> is a (non-const, since we store) pointer
// to a scalar of memtype's width whose kind matches Scalar. The trailing
// IR operands are the memory element size (memtype.size) and a shift
// amount of 0. Note: unlike the gather loads, no unsignedflag operand —
// stores need no sign/zero-extension distinction.
multiclass scatter_offset_unshifted<list<Type> types, PrimitiveType memtype> {
  let params = types in {
    def _scatter_offset: Intrinsic<
      Void, (args Ptr<CopyKind<memtype, Scalar>>:$base, UVector:$offsets,
                  Vector:$data),
      (IRInt<"vstr_scatter_offset",
             [Ptr<CopyKind<memtype, Scalar>>, UVector, Vector]>
          $base, $offsets, $data, memtype.size, 0)>;
    // Predicated variant (_p suffix): extra Predicate operand appended.
    def _scatter_offset_p: Intrinsic<
      Void, (args Ptr<CopyKind<memtype, Scalar>>:$base, UVector:$offsets,
                  Vector:$data, Predicate:$pred),
      (IRInt<"vstr_scatter_offset_predicated",
             [Ptr<CopyKind<memtype, Scalar>>, UVector, Vector, Predicate]>
          $base, $offsets, $data, memtype.size, 0, $pred)>;
  }
}

// Shifted-offset scatter stores: identical to scatter_offset_unshifted
// except that the non-zero 'shift' value is passed as the shift-amount IR
// operand, so each offset is scaled by 2^shift before being added to the
// base. Same underlying "vstr_scatter_offset" IR intrinsics.
multiclass scatter_offset_shifted<list<Type> types, PrimitiveType memtype,
                                  int shift> {
  let params = types in {
    def _scatter_shifted_offset: Intrinsic<
      Void, (args Ptr<CopyKind<memtype, Scalar>>:$base, UVector:$offsets,
                  Vector:$data),
      (IRInt<"vstr_scatter_offset",
             [Ptr<CopyKind<memtype, Scalar>>, UVector, Vector]>
          $base, $offsets, $data, memtype.size, shift)>;
    // Predicated variant (_p suffix).
    def _scatter_shifted_offset_p: Intrinsic<
      Void, (args Ptr<CopyKind<memtype, Scalar>>:$base, UVector:$offsets,
                  Vector:$data, Predicate:$pred),
      (IRInt<"vstr_scatter_offset_predicated",
             [Ptr<CopyKind<memtype, Scalar>>, UVector, Vector, Predicate]>
          $base, $offsets, $data, memtype.size, shift, $pred)>;
  }
}

// Convenience multiclass emitting both the unshifted and the shifted
// scatter-offset families under the same name prefix (defm "").
multiclass scatter_offset_both<list<Type> types, PrimitiveType memtype,
                               int shift> {
  defm "": scatter_offset_unshifted<types, memtype>;
  defm "": scatter_offset_shifted<types, memtype, shift>;
}

// Instantiate the scatter-offset stores per memory element width,
// mirroring the gather-offset instantiations above: byte stores are
// unshifted only; wider stores get both forms with shift = log2(element
// size in bytes). Narrowing stores are expressed by listing vector types
// wider than memtype (e.g. vstrbq over T.Int16/T.Int32).
defm vstrbq: scatter_offset_unshifted<!listconcat(T.All8,T.Int16,T.Int32), u8>;
defm vstrhq: scatter_offset_both<!listconcat(T.All16, T.Int32), u16, 1>;
defm vstrwq: scatter_offset_both<T.All32, u32, 2>;
defm vstrdq: scatter_offset_both<T.Int64, u64, 3>;

let params = [Void], pnt = PNT_None in
def urshrl: Intrinsic<u64, (args u64:$value, imm_1to32:$shift),
+20 −5
Original line number Diff line number Diff line
@@ -82,6 +82,11 @@ class IRInt<string name_, list<Type> params_ = [], bit appendKind_ = 0> {
// the return value of the seq construction as a whole.
def seq;

// Another magic operation is 'unsignedflag', which you give a scalar
// _type_ as an argument, and it expands into 1 for an unsigned type
// and 0 for a signed (or floating) one.
def unsignedflag;

// If you put CustomCodegen<"foo"> in an intrinsic's codegen field, it
// indicates that the IR generation for that intrinsic is done by handwritten
// C++ and not autogenerated at all. The effect in the MVE builtin codegen
@@ -109,7 +114,7 @@ def CTO_Vec: ComplexTypeOp;
def CTO_Pred: ComplexTypeOp;
class CTO_Tuple<int n_>: ComplexTypeOp { int n = n_; }
class CTO_Pointer<bit const_>: ComplexTypeOp { bit const = const_; }
class CTO_Sign<bit signed_>: ComplexTypeOp { bit signed = signed_; }
def CTO_CopyKind: ComplexTypeOp;

// -----------------------------------------------------------------------------
// Instances of Type intended to be used directly in the specification of an
@@ -167,10 +172,20 @@ class MultiVector<int n>: ComplexType<(CTO_Tuple<n> Vector)>;
class Ptr<Type t>: ComplexType<(CTO_Pointer<0> t)>;
class CPtr<Type t>: ComplexType<(CTO_Pointer<1> t)>;

// Unsigned<t> expects t to be a scalar, and expands to the unsigned integer
// scalar of the same size. So it returns u16 if you give it s16 or f16 (or
// u16 itself).
class Unsigned<Type t>: ComplexType<(CTO_Sign<0> t)>;
// CopyKind<s,k> expects s and k to be scalar types. It returns a scalar type
// whose kind (signed, unsigned or float) matches that of k, and whose size
// matches that of s.
class CopyKind<Type s, Type k>: ComplexType<(CTO_CopyKind s, k)>;

// Unsigned<t> expects t to be a scalar type, and expands to the unsigned
// integer scalar of the same size. So it returns u16 if you give it s16 or
// f16 (or u16 itself).
class Unsigned<Type t>: ComplexType<(CTO_CopyKind t, u32)>;

// UScalar and UVector expand to the unsigned-integer versions of
// Scalar and Vector.
def UScalar: Unsigned<Scalar>;
def UVector: VecOf<UScalar>;

// -----------------------------------------------------------------------------
// Internal definitions for specifying immediate arguments for an intrinsic.
+2146 −0

File added.

Preview size limit exceeded, changes collapsed.

+56 −0
Original line number Diff line number Diff line
// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -verify -fsyntax-only %s

#include <arm_mve.h>

// Sema test for the immediate-offset argument of the gather-base load and
// scatter-base store intrinsics: for 64-bit element types the offset must
// be a multiple of 8 in [0, 1016] (i.e. 127*8); for 32-bit element types a
// multiple of 4 in [0, 508] (i.e. 127*4). The "expected-error" comments
// are directives consumed by clang's -verify mode and must remain on the
// same line as the offending call.
void test_load_offsets(uint32x4_t addr32, uint64x2_t addr64)
{
  // Offsets that should be a multiple of 8 times 0,1,...,127
  vldrdq_gather_base_s64(addr64, 0);
  vldrdq_gather_base_s64(addr64, 8);
  vldrdq_gather_base_s64(addr64, 2*8);
  vldrdq_gather_base_s64(addr64, 125*8);
  vldrdq_gather_base_s64(addr64, 126*8);
  vldrdq_gather_base_s64(addr64, 127*8);
  vldrdq_gather_base_s64(addr64, -8); // expected-error {{argument value -8 is outside the valid range [0, 1016]}}
  vldrdq_gather_base_s64(addr64, 128*8); // expected-error {{argument value 1024 is outside the valid range [0, 1016]}}
  vldrdq_gather_base_s64(addr64, 4); // expected-error {{argument should be a multiple of 8}}
  vldrdq_gather_base_s64(addr64, 1); // expected-error {{argument should be a multiple of 8}}

  // Offsets that should be a multiple of 4 times 0,1,...,127
  vldrwq_gather_base_s32(addr32, 0);
  vldrwq_gather_base_s32(addr32, 4);
  vldrwq_gather_base_s32(addr32, 2*4);
  vldrwq_gather_base_s32(addr32, 125*4);
  vldrwq_gather_base_s32(addr32, 126*4);
  vldrwq_gather_base_s32(addr32, 127*4);
  vldrwq_gather_base_s32(addr32, -4); // expected-error {{argument value -4 is outside the valid range [0, 508]}}
  vldrwq_gather_base_s32(addr32, 128*4); // expected-error {{argument value 512 is outside the valid range [0, 508]}}
  vldrwq_gather_base_s32(addr32, 2); // expected-error {{argument should be a multiple of 4}}
  vldrwq_gather_base_s32(addr32, 1); // expected-error {{argument should be a multiple of 4}}

  // Show that the polymorphic store intrinsics get the right set of
  // error checks after overload resolution. These ones expand to the
  // 8-byte granular versions...
  vstrdq_scatter_base(addr64, 0, addr64);
  vstrdq_scatter_base(addr64, 8, addr64);
  vstrdq_scatter_base(addr64, 2*8, addr64);
  vstrdq_scatter_base(addr64, 125*8, addr64);
  vstrdq_scatter_base(addr64, 126*8, addr64);
  vstrdq_scatter_base(addr64, 127*8, addr64);
  vstrdq_scatter_base(addr64, -8, addr64); // expected-error {{argument value -8 is outside the valid range [0, 1016]}}
  vstrdq_scatter_base(addr64, 128*8, addr64); // expected-error {{argument value 1024 is outside the valid range [0, 1016]}}
  vstrdq_scatter_base(addr64, 4, addr64); // expected-error {{argument should be a multiple of 8}}
  vstrdq_scatter_base(addr64, 1, addr64); // expected-error {{argument should be a multiple of 8}}

  // ... and these ones expand to the 4-byte granular versions.
  vstrwq_scatter_base(addr32, 0, addr32);
  vstrwq_scatter_base(addr32, 4, addr32);
  vstrwq_scatter_base(addr32, 2*4, addr32);
  vstrwq_scatter_base(addr32, 125*4, addr32);
  vstrwq_scatter_base(addr32, 126*4, addr32);
  vstrwq_scatter_base(addr32, 127*4, addr32);
  vstrwq_scatter_base(addr32, -4, addr32); // expected-error {{argument value -4 is outside the valid range [0, 508]}}
  vstrwq_scatter_base(addr32, 128*4, addr32); // expected-error {{argument value 512 is outside the valid range [0, 508]}}
  vstrwq_scatter_base(addr32, 2, addr32); // expected-error {{argument should be a multiple of 4}}
  vstrwq_scatter_base(addr32, 1, addr32); // expected-error {{argument should be a multiple of 4}}
}
+32 −8
Original line number Diff line number Diff line
@@ -204,6 +204,9 @@ public:
      Name = "const " + Name;
    return Name + " *";
  }
  std::string llvmName() const override {
    return "llvm::PointerType::getUnqual(" + Pointee->llvmName() + ")";
  }

  static bool classof(const Type *T) {
    return T->typeKind() == TypeKind::Pointer;
@@ -512,6 +515,11 @@ public:
  void setVarname(const StringRef s) { VarName = s; }
  bool varnameUsed() const { return VarNameUsed; }

  // Emit code to generate this result as a Value *.
  virtual std::string asValue() {
    return varname();
  }

  // Code generation happens in multiple passes. This method tracks whether a
  // Result has yet been visited in a given pass, without the need for a
  // tedious loop in between passes that goes through and resets a 'visited'
@@ -547,6 +555,12 @@ public:
  std::string typeName() const override {
    return AddressType ? "Address" : Result::typeName();
  }
  // Emit code to generate this result as a Value *.
  std::string asValue() override {
    if (AddressType)
      return "(" + varname() + ".getPointer())";
    return Result::asValue();
  }
};

// Result subclass for an integer literal appearing in Tablegen. This may need
@@ -665,7 +679,7 @@ public:
    OS << "), llvm::SmallVector<Value *, " << Args.size() << "> {";
    const char *Sep = "";
    for (auto Arg : Args) {
      OS << Sep << Arg->varname();
      OS << Sep << Arg->asValue();
      Sep = ", ";
    }
    OS << "})";
@@ -974,17 +988,15 @@ const Type *MveEmitter::getType(DagInit *D, const Type *Param) {
    return getPointerType(Pointee, Op->getValueAsBit("const"));
  }

  if (Op->isSubClassOf("CTO_Sign")) {
    const ScalarType *ST = cast<ScalarType>(getType(D->getArg(0), Param));
    ScalarTypeKind NewKind = Op->getValueAsBit("signed")
                                 ? ScalarTypeKind::SignedInt
                                 : ScalarTypeKind::UnsignedInt;
  if (Op->getName() == "CTO_CopyKind") {
    const ScalarType *STSize = cast<ScalarType>(getType(D->getArg(0), Param));
    const ScalarType *STKind = cast<ScalarType>(getType(D->getArg(1), Param));
    for (const auto &kv : ScalarTypes) {
      const ScalarType *RT = kv.second.get();
      if (RT->kind() == NewKind && RT->sizeInBits() == ST->sizeInBits())
      if (RT->kind() == STKind->kind() && RT->sizeInBits() == STSize->sizeInBits())
        return RT;
    }
    PrintFatalError("Cannot change sign of this type");
    PrintFatalError("Cannot find a type to satisfy CopyKind");
  }

  PrintFatalError("Bad operator in type dag expression");
@@ -1025,6 +1037,18 @@ Result::Ptr MveEmitter::getCodeForDag(DagInit *D, const Result::Scope &Scope,
      }
    }
    PrintFatalError("Unsupported type cast");
  } else if (Op->getName() == "unsignedflag") {
    if (D->getNumArgs() != 1)
      PrintFatalError("unsignedflag should have exactly one argument");
    Record *TypeRec = cast<DefInit>(D->getArg(0))->getDef();
    if (!TypeRec->isSubClassOf("Type"))
      PrintFatalError("unsignedflag's argument should be a type");
    if (const auto *ST = dyn_cast<ScalarType>(getType(TypeRec, Param))) {
      return std::make_shared<IntLiteralResult>(
        getScalarType("u32"), ST->kind() == ScalarTypeKind::UnsignedInt);
    } else {
      PrintFatalError("unsignedflag's argument should be a scalar type");
    }
  } else {
    std::vector<Result::Ptr> Args;
    for (unsigned i = 0, e = D->getNumArgs(); i < e; ++i)
Loading