Commit 6d614a82 authored by Konstantin Pyzhov's avatar Konstantin Pyzhov
Browse files

Summary:

This CL adds Clang declarations of built-in functions for the AMDGPU MFMA intrinsics and instructions.
OpenCL tests for the new built-ins are included.

Differential Revision: https://reviews.llvm.org/D72723
parent de2dfc8b
Loading
Loading
Loading
Loading
+25 −0
Original line number Diff line number Diff line
@@ -212,5 +212,30 @@ BUILTIN(__builtin_r600_read_tidig_z, "Ui", "nc")
BUILTIN(__builtin_r600_recipsqrt_ieee, "dd", "nc")
BUILTIN(__builtin_r600_recipsqrt_ieeef, "ff", "nc")

//===----------------------------------------------------------------------===//
// MFMA builtins.
//===----------------------------------------------------------------------===//
// Every entry below is gated on the "mai-insts" target feature and carries the
// attribute string "nc" (nothrow, const in the Builtins.def encoding).
//
// Operand order, following the llvm.amdgcn.mfma.* intrinsics:
//   result = builtin(srcA, srcB, srcC, cbsz, abid, blgp)
// srcC has the same vector type as the result.
//
// Type-string legend (first type is the return type, the rest are parameters):
//   V<N><t>  vector of N elements of type t
//   f, h, s, i  float, half, short, int
//   Ii       int argument that must be an integer constant expression
//            (used for cbsz, abid and blgp)

// f32 inputs: srcA and srcB are scalar floats.
TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x1f32, "V32fffV32fIiIiIi", "nc", "mai-insts")
TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_16x16x1f32, "V16fffV16fIiIiIi", "nc", "mai-insts")
TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_4x4x1f32, "V4fffV4fIiIiIi", "nc", "mai-insts")
TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x2f32, "V16fffV16fIiIiIi", "nc", "mai-insts")
TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_16x16x4f32, "V4fffV4fIiIiIi", "nc", "mai-insts")
// f16 inputs: srcA and srcB are vectors of 4 halfs; the result stays float.
TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x4f16, "V32fV4hV4hV32fIiIiIi", "nc", "mai-insts")
TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_16x16x4f16, "V16fV4hV4hV16fIiIiIi", "nc", "mai-insts")
TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_4x4x4f16, "V4fV4hV4hV4fIiIiIi", "nc", "mai-insts")
TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x8f16, "V16fV4hV4hV16fIiIiIi", "nc", "mai-insts")
TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_16x16x16f16, "V4fV4hV4hV4fIiIiIi", "nc", "mai-insts")
// i8 inputs: srcA and srcB are each a single int (presumably four packed i8
// values -- TODO confirm against the ISA documentation); the result is int.
TARGET_BUILTIN(__builtin_amdgcn_mfma_i32_32x32x4i8, "V32iiiV32iIiIiIi", "nc", "mai-insts")
TARGET_BUILTIN(__builtin_amdgcn_mfma_i32_16x16x4i8, "V16iiiV16iIiIiIi", "nc", "mai-insts")
TARGET_BUILTIN(__builtin_amdgcn_mfma_i32_4x4x4i8, "V4iiiV4iIiIiIi", "nc", "mai-insts")
TARGET_BUILTIN(__builtin_amdgcn_mfma_i32_32x32x8i8, "V16iiiV16iIiIiIi", "nc", "mai-insts")
TARGET_BUILTIN(__builtin_amdgcn_mfma_i32_16x16x16i8, "V4iiiV4iIiIiIi", "nc", "mai-insts")
// bf16 inputs: srcA and srcB are passed as vectors of 2 shorts; the result is
// float.
TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x2bf16, "V32fV2sV2sV32fIiIiIi", "nc", "mai-insts")
TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_16x16x2bf16, "V16fV2sV2sV16fIiIiIi", "nc", "mai-insts")
TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_4x4x2bf16, "V4fV2sV2sV4fIiIiIi", "nc", "mai-insts")
TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x4bf16, "V16fV2sV2sV16fIiIiIi", "nc", "mai-insts")
TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_16x16x8bf16, "V4fV2sV2sV4fIiIiIi", "nc", "mai-insts")

#undef BUILTIN
#undef TARGET_BUILTIN
+1 −0
Original line number Diff line number Diff line
@@ -163,6 +163,7 @@ bool AMDGPUTargetInfo::initFeatureMap(
      Features["dot4-insts"] = true;
      Features["dot5-insts"] = true;
      Features["dot6-insts"] = true;
      Features["mai-insts"] = true;
      LLVM_FALLTHROUGH;
    case GK_GFX906:
      Features["dl-insts"] = true;
+0 −0

Empty file added.

+0 −0

Empty file added.

+119 −99
Original line number Diff line number Diff line
@@ -1725,102 +1725,122 @@ def int_amdgcn_buffer_atomic_fadd : AMDGPUBufferAtomicNoRtn;
def int_amdgcn_global_atomic_fadd    : AMDGPUGlobalAtomicNoRtn;

// llvm.amdgcn.mfma.f32.* vdst, srcA, srcB, srcC, cbsz, abid, blgp
//
// vdst is the result; srcC has the same vector type as vdst. cbsz, abid and
// blgp are operands 3-5 and are marked ImmArg, so they must be immediates.
// GCCBuiltin<"..."> names the clang builtin tied to each intrinsic.
//
// f32 inputs: srcA and srcB are scalar floats.
def int_amdgcn_mfma_f32_32x32x1f32 : Intrinsic<[llvm_v32f32_ty],
def int_amdgcn_mfma_f32_32x32x1f32 : GCCBuiltin<"__builtin_amdgcn_mfma_f32_32x32x1f32">,
  Intrinsic<[llvm_v32f32_ty],
            [llvm_float_ty, llvm_float_ty, llvm_v32f32_ty,
            llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
            [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;

def int_amdgcn_mfma_f32_16x16x1f32 : Intrinsic<[llvm_v16f32_ty],
def int_amdgcn_mfma_f32_16x16x1f32 : GCCBuiltin<"__builtin_amdgcn_mfma_f32_16x16x1f32">,
  Intrinsic<[llvm_v16f32_ty],
            [llvm_float_ty, llvm_float_ty, llvm_v16f32_ty,
            llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
            [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;

def int_amdgcn_mfma_f32_4x4x1f32 : Intrinsic<[llvm_v4f32_ty],
def int_amdgcn_mfma_f32_4x4x1f32 : GCCBuiltin<"__builtin_amdgcn_mfma_f32_4x4x1f32">,
  Intrinsic<[llvm_v4f32_ty],
            [llvm_float_ty, llvm_float_ty, llvm_v4f32_ty,
            llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
            [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;

def int_amdgcn_mfma_f32_32x32x2f32 : Intrinsic<[llvm_v16f32_ty],
def int_amdgcn_mfma_f32_32x32x2f32 : GCCBuiltin<"__builtin_amdgcn_mfma_f32_32x32x2f32">,
  Intrinsic<[llvm_v16f32_ty],
            [llvm_float_ty, llvm_float_ty, llvm_v16f32_ty,
            llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
            [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;

def int_amdgcn_mfma_f32_16x16x4f32 : Intrinsic<[llvm_v4f32_ty],
def int_amdgcn_mfma_f32_16x16x4f32 : GCCBuiltin<"__builtin_amdgcn_mfma_f32_16x16x4f32">,
  Intrinsic<[llvm_v4f32_ty],
            [llvm_float_ty, llvm_float_ty, llvm_v4f32_ty,
            llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
            [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;

// f16 inputs: the first two operands are <4 x half>; the third operand and
// the result share one f32 vector type. The trailing three i32 operands
// (indices 3-5) are marked ImmArg and must be immediates.
def int_amdgcn_mfma_f32_32x32x4f16 : Intrinsic<[llvm_v32f32_ty],
def int_amdgcn_mfma_f32_32x32x4f16 : GCCBuiltin<"__builtin_amdgcn_mfma_f32_32x32x4f16">,
  Intrinsic<[llvm_v32f32_ty],
            [llvm_v4f16_ty, llvm_v4f16_ty, llvm_v32f32_ty,
            llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
            [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;

def int_amdgcn_mfma_f32_16x16x4f16 : Intrinsic<[llvm_v16f32_ty],
def int_amdgcn_mfma_f32_16x16x4f16 : GCCBuiltin<"__builtin_amdgcn_mfma_f32_16x16x4f16">,
  Intrinsic<[llvm_v16f32_ty],
            [llvm_v4f16_ty, llvm_v4f16_ty, llvm_v16f32_ty,
            llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
            [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;

def int_amdgcn_mfma_f32_4x4x4f16 : Intrinsic<[llvm_v4f32_ty],
def int_amdgcn_mfma_f32_4x4x4f16 : GCCBuiltin<"__builtin_amdgcn_mfma_f32_4x4x4f16">,
  Intrinsic<[llvm_v4f32_ty],
            [llvm_v4f16_ty, llvm_v4f16_ty, llvm_v4f32_ty,
            llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
            [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;

def int_amdgcn_mfma_f32_32x32x8f16 : Intrinsic<[llvm_v16f32_ty],
def int_amdgcn_mfma_f32_32x32x8f16 : GCCBuiltin<"__builtin_amdgcn_mfma_f32_32x32x8f16">,
  Intrinsic<[llvm_v16f32_ty],
            [llvm_v4f16_ty, llvm_v4f16_ty, llvm_v16f32_ty,
            llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
            [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;

def int_amdgcn_mfma_f32_16x16x16f16 : Intrinsic<[llvm_v4f32_ty],
def int_amdgcn_mfma_f32_16x16x16f16 : GCCBuiltin<"__builtin_amdgcn_mfma_f32_16x16x16f16">,
  Intrinsic<[llvm_v4f32_ty],
            [llvm_v4f16_ty, llvm_v4f16_ty, llvm_v4f32_ty,
            llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
            [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;

// i8 inputs: the first two operands are each a single i32 (presumably four
// packed i8 values -- TODO confirm against the ISA documentation); the third
// operand and the result share one i32 vector type. The trailing three i32
// operands (indices 3-5) are marked ImmArg and must be immediates.
def int_amdgcn_mfma_i32_32x32x4i8 : Intrinsic<[llvm_v32i32_ty],
def int_amdgcn_mfma_i32_32x32x4i8 : GCCBuiltin<"__builtin_amdgcn_mfma_i32_32x32x4i8">,
  Intrinsic<[llvm_v32i32_ty],
            [llvm_i32_ty, llvm_i32_ty, llvm_v32i32_ty,
            llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
            [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;

def int_amdgcn_mfma_i32_16x16x4i8 : Intrinsic<[llvm_v16i32_ty],
def int_amdgcn_mfma_i32_16x16x4i8 : GCCBuiltin<"__builtin_amdgcn_mfma_i32_16x16x4i8">,
  Intrinsic<[llvm_v16i32_ty],
            [llvm_i32_ty, llvm_i32_ty, llvm_v16i32_ty,
            llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
            [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;

def int_amdgcn_mfma_i32_4x4x4i8 : Intrinsic<[llvm_v4i32_ty],
def int_amdgcn_mfma_i32_4x4x4i8 : GCCBuiltin<"__builtin_amdgcn_mfma_i32_4x4x4i8">,
  Intrinsic<[llvm_v4i32_ty],
            [llvm_i32_ty, llvm_i32_ty, llvm_v4i32_ty,
            llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
            [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;

def int_amdgcn_mfma_i32_32x32x8i8 : Intrinsic<[llvm_v16i32_ty],
def int_amdgcn_mfma_i32_32x32x8i8 : GCCBuiltin<"__builtin_amdgcn_mfma_i32_32x32x8i8">,
  Intrinsic<[llvm_v16i32_ty],
            [llvm_i32_ty, llvm_i32_ty, llvm_v16i32_ty,
            llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
            [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;

def int_amdgcn_mfma_i32_16x16x16i8 : Intrinsic<[llvm_v4i32_ty],
def int_amdgcn_mfma_i32_16x16x16i8 : GCCBuiltin<"__builtin_amdgcn_mfma_i32_16x16x16i8">,
  Intrinsic<[llvm_v4i32_ty],
            [llvm_i32_ty, llvm_i32_ty, llvm_v4i32_ty,
            llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
            [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;

// bf16 inputs: the first two operands are passed as <2 x i16>; the third
// operand and the result share one f32 vector type. The trailing three i32
// operands (indices 3-5) are marked ImmArg and must be immediates.
def int_amdgcn_mfma_f32_32x32x2bf16 : Intrinsic<[llvm_v32f32_ty],
def int_amdgcn_mfma_f32_32x32x2bf16 : GCCBuiltin<"__builtin_amdgcn_mfma_f32_32x32x2bf16">,
  Intrinsic<[llvm_v32f32_ty],
            [llvm_v2i16_ty, llvm_v2i16_ty, llvm_v32f32_ty,
            llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
            [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;

def int_amdgcn_mfma_f32_16x16x2bf16 : Intrinsic<[llvm_v16f32_ty],
def int_amdgcn_mfma_f32_16x16x2bf16 : GCCBuiltin<"__builtin_amdgcn_mfma_f32_16x16x2bf16">,
  Intrinsic<[llvm_v16f32_ty],
            [llvm_v2i16_ty, llvm_v2i16_ty, llvm_v16f32_ty,
            llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
            [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;

def int_amdgcn_mfma_f32_4x4x2bf16 : Intrinsic<[llvm_v4f32_ty],
def int_amdgcn_mfma_f32_4x4x2bf16 : GCCBuiltin<"__builtin_amdgcn_mfma_f32_4x4x2bf16">,
  Intrinsic<[llvm_v4f32_ty],
            [llvm_v2i16_ty, llvm_v2i16_ty, llvm_v4f32_ty,
            llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
            [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;

def int_amdgcn_mfma_f32_32x32x4bf16 : Intrinsic<[llvm_v16f32_ty],
def int_amdgcn_mfma_f32_32x32x4bf16 : GCCBuiltin<"__builtin_amdgcn_mfma_f32_32x32x4bf16">,
  Intrinsic<[llvm_v16f32_ty],
            [llvm_v2i16_ty, llvm_v2i16_ty, llvm_v16f32_ty,
            llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
            [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;

def int_amdgcn_mfma_f32_16x16x8bf16 : Intrinsic<[llvm_v4f32_ty],
def int_amdgcn_mfma_f32_16x16x8bf16 : GCCBuiltin<"__builtin_amdgcn_mfma_f32_16x16x8bf16">,
  Intrinsic<[llvm_v4f32_ty],
            [llvm_v2i16_ty, llvm_v2i16_ty, llvm_v4f32_ty,
            llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
            [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;