Commit d9972f84 authored by Artem Belevich

[NVPTX] Added llvm.nvvm.mma.m8n8k4.* intrinsics

Differential Revision: https://reviews.llvm.org/D69324
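
For illustration, a minimal Python sketch (mine, not part of the commit; the helper name and the example variant are assumptions) of the naming scheme the TableGen changes below implement: m8n8k4 variants live under the new llvm.nvvm.mma.* prefix, drop the ".mma" infix, and never take a ".satfinite" suffix.

# Sketch of the intrinsic naming scheme (hypothetical helper).
def intrinsic_name(geom, alayout, blayout, signature, satf=False):
    if geom == "m8n8k4":
        # New namespace: no ".mma" infix, no .satfinite variants.
        return "llvm.nvvm.mma.m8n8k4.%s.%s%s" % (alayout, blayout, signature)
    name = "llvm.nvvm.wmma.%s.mma.%s.%s%s" % (geom, alayout, blayout, signature)
    return name + (".satfinite" if satf else "")

# intrinsic_name("m8n8k4", "row", "col", ".f32.f16")
#   -> "llvm.nvvm.mma.m8n8k4.row.col.f32.f16"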
parent 7b9d6a22
+43 −14
@@ -53,6 +53,10 @@ class WMMA_REGS<string Geom, string Frag, string PtxEltType> {
  string gft = Geom#":"#Frag#":"#ptx_elt_type;
  string ft = frag#":"#ptx_elt_type;
  list<LLVMType> regs = !cond(
+    // mma.sync.m8n8k4 uses smaller a/b fragments than wmma fp ops
+    !eq(gft,"m8n8k4:a:f16") : RepLLVMType<2, llvm_v2f16_ty>.ret,
+    !eq(gft,"m8n8k4:b:f16") : RepLLVMType<2, llvm_v2f16_ty>.ret,
+
    // fp16 -> fp16/fp32 @  m16n16k16/m8n32k16/m32n8k16
    // All currently supported geometries use the same fragment format,
    // so we only need to consider {fragment, type}.
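
A quick sketch (assumptions mine) of what the new entries above amount to: each f16 a/b register is a <2 x half> value, and an m8n8k4 fragment carries two of them rather than the eight used by the wmma geometries.

# Fragment-to-register mapping for f16 a/b operands (sketch).
def f16_ab_reg_types(geom):
    n = 2 if geom == "m8n8k4" else 8  # m8n8k4 uses smaller fragments
    return ["<2 x half>"] * n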
@@ -137,13 +141,19 @@ class MMA_SIGNATURE<WMMA_REGS A, WMMA_REGS B, WMMA_REGS C, WMMA_REGS D> {
class WMMA_NAME_MMA<string ALayout, string BLayout, int Satfinite,
                    WMMA_REGS A, WMMA_REGS B, WMMA_REGS C, WMMA_REGS D> {
  string signature = MMA_SIGNATURE<A, B, C, D>.ret;
  string llvm = "llvm.nvvm.wmma."
  string llvm = !if(
    !eq(A.geom, "m8n8k4"),
        "llvm.nvvm.mma.m8n8k4"
           # "." # ALayout
           # "." # BLayout
           # signature,
        "llvm.nvvm.wmma."
           # A.geom
           # ".mma"
           # "." # ALayout
           # "." # BLayout
           # signature
-                # !if(Satfinite, ".satfinite", "");
+           # !if(Satfinite, ".satfinite", ""));

  string record = !subst(".", "_",
                  !subst("llvm.", "int_", llvm));
@@ -160,7 +170,7 @@ class MMA_OPS<list<string> Geom, list<string> TypeA, list<string> TypeB,
     !foldl([]<list<WMMA_REGS>>, TypeA, t2, type_a, !listconcat(t2,
     !foldl([]<list<WMMA_REGS>>, !if(!size(TypeB), TypeB, [type_a]), t3, type_b, !listconcat(t3,
     !foldl([]<list<WMMA_REGS>>, TypeC, t4, type_c, !listconcat(t4,
-     !foldl([]<list<WMMA_REGS>>, !if(!size(TypeC), TypeC, [type_c]), t5, type_d, !listconcat(t5,
+     !foldl([]<list<WMMA_REGS>>, !if(!size(TypeD), TypeD, [type_c]), t5, type_d, !listconcat(t5,
            [[WMMA_REGS<geom, "a", type_a>,
              WMMA_REGS<geom, "b", type_b>,
              WMMA_REGS<geom, "c", type_c>,
@@ -185,19 +195,23 @@ class MMA_LDST_OPS<list<string> Geom, list<string> Frags, list<string> Types> {
// drives generation of corresponding intrinsics and instructions.
class NVVM_MMA_OPS<int _ = 0> {
  list<list<WMMA_REGS>> fp_mma_ops = MMA_OPS<
            ["m8n8k4"],
            ["f16"], [], ["f16", "f32"], ["f16", "f32"]>.ret;
  list<list<WMMA_REGS>> fp_wmma_ops = MMA_OPS<
            ["m16n16k16", "m32n8k16", "m8n32k16"],
            ["f16"], [], ["f16", "f32"], ["f16", "f32"]>.ret;
-  list<list<WMMA_REGS>> int_mma_ops = MMA_OPS<
+  list<list<WMMA_REGS>> int_wmma_ops = MMA_OPS<
            ["m16n16k16", "m32n8k16", "m8n32k16"],
            ["s8", "u8"], [], ["s32"], []>.ret;
-  list<list<WMMA_REGS>> subint_mma_ops = MMA_OPS<
+  list<list<WMMA_REGS>> subint_wmma_ops = MMA_OPS<
            ["m8n8k32"],
            ["s4", "u4"], [], ["s32"], []>.ret;
-  list<list<WMMA_REGS>> bit_mma_ops = MMA_OPS<
+  list<list<WMMA_REGS>> bit_wmma_ops = MMA_OPS<
            ["m8n8k128"],
            ["b1"], [], ["s32"], []>.ret;
-  list<list<WMMA_REGS>> all_mma_ops = !listconcat(fp_mma_ops, int_mma_ops,
-                                                  subint_mma_ops, bit_mma_ops);
+  list<list<WMMA_REGS>> all_mma_ops = !listconcat(
+            fp_mma_ops, fp_wmma_ops, int_wmma_ops,
+            subint_wmma_ops, bit_wmma_ops);

  list<WMMA_REGS> ldst_ab_ops = MMA_LDST_OPS<
            ["m16n16k16", "m32n8k16", "m8n32k16"],
@@ -245,10 +259,25 @@ class NVVM_MMA_SUPPORTED<list<WMMA_REGS> frags, string layout_a, string layout_b
               # ":" # frags[0].frag
               ;
  string t = frags[0].ptx_elt_type;

+  // gcd is a shortcut used to identify instructions that depend on
+  // geom+frag_c+frag_d.  Not all instances of this class have all fragments
+  // specified. If there are not enough fragments, the tail evaluates to '?'.
+  string gcd = frags[0].geom
+               # ":"
+               # !if(!eq(!size(frags), 4),
+                     frags[2].ptx_elt_type # frags[3].ptx_elt_type,
+                     "?");
  list<int> ret = !cond(
    // Sub-int MMA only supports fixed A/B layout.
    // b1 does not support .satf.
    !eq(mma#":"#satf, "b1:row:col:0") : [1],
+    // mma.m8n8k4 has no .satf modifier.
+    !and(!eq(frags[0].geom, "m8n8k4"),
+         !ne(satf, 0)): [],
+
+    // mma.m8n8k4 has no C=f32 D=f16 variant.
+    !eq(gcd, "m8n8k4:f32f16"): [],
    !eq(mma, "s4:row:col") : [1],
    !eq(mma, "u4:row:col") : [1],
    !eq(mma, "s4:row:col") : [1],
+28 −14
@@ -7400,7 +7400,9 @@ class WMMA_REGINFO<WMMA_REGS r>

    // u4/s4/b1 -> s32 @ m8n8k32 (u4/s4), m8n8k128(b1)
    !or(!eq(geom,"m8n8k128"),
        !eq(geom,"m8n8k32")) : [hasSM75, hasPTX63]);
        !eq(geom,"m8n8k32")) : [hasSM75, hasPTX63],

    !eq(geom, "m8n8k4") : [hasSM70, hasPTX64]);

  // template DAGs for instruction inputs/output.
  dag Outs = !dag(outs, ptx_regs, reg_names);
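
Summarizing the predicate lists above (a sketch of mine, not source):

# Minimum SM/PTX requirements per geometry class (sketch).
MIN_REQS = {
    "m8n8k32":  ("sm_75", "ptx63"),  # u4/s4 sub-integer ops
    "m8n8k128": ("sm_75", "ptx63"),  # b1 ops
    "m8n8k4":   ("sm_70", "ptx64"),  # the new fp16 mma.sync ops
}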
@@ -7546,13 +7548,25 @@ class WMMA_MMA<WMMA_REGINFO FragA, WMMA_REGINFO FragB,
  let OutOperandList = FragD.Outs;
  let InOperandList  = !con(Args, (ins MmaCode:$ptx));
+  string TypeList = !cond(
+    !eq(FragD.geom, "m8n8k4") : "." # FragD.ptx_elt_type
+                                # ".f16.f16."
+                                # FragC.ptx_elt_type,
+    !eq(FragD.ptx_elt_type, "s32") : ".s32"
+                                     # "." # FragA.ptx_elt_type
+                                     # "." # FragB.ptx_elt_type
+                                     # ".s32",
+    1: "." # FragD.ptx_elt_type # "." # FragC.ptx_elt_type,
+  );
  let AsmString = "wmma.mma"
  let AsmString = !if(!eq(FragA.geom, "m8n8k4"),
     "mma.sync.aligned.m8n8k4"
        # "." # ALayout
        # "." # BLayout
        # TypeList # "\n\t\t"
        # FragD.regstring # ",\n\t\t"
        # FragA.regstring # ",\n\t\t"
        # FragB.regstring # ",\n\t\t"
        # FragC.regstring # ";",
     "wmma.mma"
        # !if(!eq(FragA.ptx_elt_type, "b1"), ".xor.popc", "")
        # ".sync"
        # "${ptx:aligned}"
@@ -7564,7 +7578,7 @@ class WMMA_MMA<WMMA_REGINFO FragA, WMMA_REGINFO FragB,
        # FragD.regstring # ",\n\t\t"
        # FragA.regstring # ",\n\t\t"
        # FragB.regstring # ",\n\t\t"
                  # FragC.regstring # ";";
        # FragC.regstring # ";");
}
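
A small sketch (mine; the helper is hypothetical) of the instruction text the new m8n8k4 branch of AsmString assembles, with TypeList expanding to D.f16.f16.C; the register operand lists that follow are elided:

# PTX text assembled by the m8n8k4 branch (operands elided).
def m8n8k4_asm(alayout, blayout, dtype, ctype):
    type_list = ".%s.f16.f16.%s" % (dtype, ctype)  # TypeList above
    return "mma.sync.aligned.m8n8k4.%s.%s%s" % (alayout, blayout, type_list)

# m8n8k4_asm("row", "col", "f32", "f16")
#   -> "mma.sync.aligned.m8n8k4.row.col.f32.f16.f16.f16"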

defset list<WMMA_INSTR> MMAs  = {
+101 −59
@@ -4,39 +4,47 @@
# Check all variants of instructions supported by PTX60 on SM70
# RUN: python %s --ptx=60 --gpu-arch=70 > %t-ptx60-sm_70.ll
# RUN: FileCheck %t-ptx60-sm_70.ll < %t-ptx60-sm_70.ll \
-# RUN:           --check-prefixes=INTRINSICS,PTX60,SM70
+# RUN:           --check-prefixes=INTRINSICS,M16N16
# RUN: FileCheck %t-ptx60-sm_70.ll < %t-ptx60-sm_70.ll \
-# RUN:           --check-prefixes=INTRINSICS,PTX60U,SM70U
+# RUN:           --check-prefixes=INTRINSICS,NOEXTGEOM,NOINT,NOSUBINT,NOMMA
# RUN: llc < %t-ptx60-sm_70.ll -march=nvptx64 -mcpu=sm_70 -mattr=+ptx60 \
# RUN:           | FileCheck %t-ptx60-sm_70.ll

# Check all variants of instructions supported by PTX61 on SM70
# RUN: python %s --ptx=61 --gpu-arch=70 > %t-ptx61-sm_70.ll
# RUN: FileCheck %t-ptx61-sm_70.ll < %t-ptx61-sm_70.ll \
-# RUN:           --check-prefixes=INTRINSICS,PTX60,PTX61,SM70
+# RUN:           --check-prefixes=INTRINSICS,M16N16,EXTGEOM
# RUN: FileCheck %t-ptx61-sm_70.ll < %t-ptx61-sm_70.ll \
-# RUN:           --check-prefixes=INTRINSICS,PTX61U,SM70U
+# RUN:           --check-prefixes=INTRINSICS,NOINT,NOSUBINT,NOMMA
# RUN: llc < %t-ptx61-sm_70.ll -march=nvptx64 -mcpu=sm_70 -mattr=+ptx61 \
# RUN:           | FileCheck %t-ptx61-sm_70.ll

# Check all variants of instructions supported by PTX63 on SM72
# RUN: python %s --ptx=63 --gpu-arch=72 > %t-ptx63-sm_72.ll
# RUN: FileCheck %t-ptx63-sm_72.ll < %t-ptx63-sm_72.ll \
-# RUN:           --check-prefixes=INTRINSICS,PTX60,PTX61,PTX63,SM70,SM72
+# RUN:           --check-prefixes=INTRINSICS,M16N16,EXTGEOM,INT
# RUN: FileCheck %t-ptx63-sm_72.ll < %t-ptx63-sm_72.ll \
-# RUN:           --check-prefixes=INTRINSICS,PTX63U,SM72U
+# RUN:           --check-prefixes=INTRINSICS,NOSUBINT,NOMMA
# RUN: llc < %t-ptx63-sm_72.ll -march=nvptx64 -mcpu=sm_72 -mattr=+ptx63 \
# RUN:           | FileCheck %t-ptx63-sm_72.ll

# Check all variants of instructions supported by PTX63 on SM75
# RUN: python %s --ptx=63 --gpu-arch=75 > %t-ptx63-sm_75.ll
# RUN: FileCheck %t-ptx63-sm_75.ll < %t-ptx63-sm_75.ll \
-# RUN:           --check-prefixes=INTRINSICS,PTX60,PTX61,PTX63,SM70,SM72,SM75
+# RUN:           --check-prefixes=INTRINSICS,M16N16,EXTGEOM,INT,SUBINT
# RUN: FileCheck %t-ptx63-sm_75.ll < %t-ptx63-sm_75.ll \
-# RUN:           --check-prefixes=INTRINSICS,PTX63U,SM75U
+# RUN:           --check-prefixes=INTRINSICS,NOMMA
# RUN: llc < %t-ptx63-sm_75.ll -march=nvptx64 -mcpu=sm_75 -mattr=+ptx63 \
# RUN:           | FileCheck %t-ptx63-sm_75.ll

+# Check all variants of instructions supported by PTX64 on SM70+
+# RUN: python %s --ptx=64 --gpu-arch=70 > %t-ptx64-sm_70.ll
+# RUN: FileCheck %t-ptx64-sm_70.ll < %t-ptx64-sm_70.ll \
+# RUN:           --check-prefixes=INTRINSICS,M16N16,EXTGEOM,MMA
+# RUN: FileCheck %t-ptx64-sm_70.ll < %t-ptx64-sm_70.ll \
+# RUN:           --check-prefixes=INTRINSICS,NOINT,NOSUBINT
+# RUN: llc < %t-ptx64-sm_70.ll -march=nvptx64 -mcpu=sm_70 -mattr=+ptx64 \
+# RUN:           | FileCheck %t-ptx64-sm_70.ll
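
The RUN lines above pair a positive and a negative FileCheck prefix set for each PTX/SM combination; a summary sketch (mine, not part of the test):

# (ptx, sm) -> (prefixes that must match, prefixes that must match nothing).
# INTRINSICS is common to every FileCheck invocation.
CHECK_MATRIX = {
    (60, 70): (["M16N16"], ["NOEXTGEOM", "NOINT", "NOSUBINT", "NOMMA"]),
    (61, 70): (["M16N16", "EXTGEOM"], ["NOINT", "NOSUBINT", "NOMMA"]),
    (63, 72): (["M16N16", "EXTGEOM", "INT"], ["NOSUBINT", "NOMMA"]),
    (63, 75): (["M16N16", "EXTGEOM", "INT", "SUBINT"], ["NOMMA"]),
    (64, 70): (["M16N16", "EXTGEOM", "MMA"], ["NOINT", "NOSUBINT"]),
}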

from __future__ import print_function

@@ -70,10 +78,11 @@ class MMAFrag:
  def __init__(self, geom, frag, ptx_elt_type):
    self.geom = geom
    self.frag = frag
+    self.is_mma = True if geom == "m8n8k4" else False;
    self.mma_type = MMAType(ptx_elt_type);
    self.nregs = {
        "a:f16" : 8,
        "b:f16" : 8,
        "a:f16" : 2 if self.is_mma else 8,
        "b:f16" : 2 if self.is_mma else 8,
        "c:f16" : 4,
        "d:f16" : 4,
        "c:f32" : 8,
@@ -145,7 +154,9 @@ def make_ldst_ops(geoms, frags, types):
          in product(geoms, frags, types)]

def get_mma_ops():
  return (make_mma_ops(["m16n16k16", "m32n8k16", "m8n32k16"],
  return (make_mma_ops(["m8n8k4"],
                       ["f16"], [], ["f16", "f32"], ["f16", "f32"]) +
          make_mma_ops(["m16n16k16", "m32n8k16", "m8n32k16"],
                       ["f16"], [], ["f16", "f32"], ["f16", "f32"]) +
          make_mma_ops(["m16n16k16", "m32n8k16", "m8n32k16"],
                       ["s8", "u8"], [], ["s32"], []) +
@@ -165,6 +176,8 @@ def get_ldst_ops(kind):

def is_geom_supported(geom):
  # geometries for FP and ints.
  if geom == "m8n8k4":
    return ptx_version >= 64
  if geom in ["m8n32k16", "m32n8k16"]:
    return ptx_version >= 61
  # geometries for sub-ints.
@@ -186,6 +199,13 @@ def is_mma_variant_supported(op, layout_a, layout_b, satf):
  if not (is_type_supported(op.a.mma_type.ptx_type)
          and is_geom_supported(op.a.geom)):
    return False
  if op.a.geom == "m8n8k4":
    if satf:
      return False
    if op.c.mma_type.ptx_type == "f32":
      # If C is f32, D must be, too.
      return op.d.mma_type.ptx_type == "f32"

  # sub-integer ops require row/col layout, and no satf.
  if op.a.mma_type.ptx_type in ["s4", "u4", "b1"]:
    if op.a.mma_type.ptx_type == "b1" and satf:
@@ -232,8 +252,6 @@ def get_pspace(space):
def check_pattern(frag):
   return "{{%s}}" % ", *".join([frag.mma_type.ptx_reg_pattern] * frag.nregs)

-known_geoms = ["m16n16k16", "m8n32k16", "m32n8k16"]

def gen_wmma_load_tests():
  load_template = """
declare ${ret_ty} @${intrinsic}(i8 ${as}* %src ${extra_args});
@@ -389,6 +407,8 @@ def mma_ptx_signature(op):
  if op.a.mma_type.ptx_type in ["s8", "u8", "s4", "u4", "b1"]:
    # int and sub-int instructions encode all four types as D.A.B.C
    return ".".join(x.mma_type.ptx_type for x in (op.d, op.a, op.b, op.c))
  if op.a.geom == "m8n8k4":
    return "%s.f16.f16.%s" % (op.d.mma_type.ptx_type, op.c.mma_type.ptx_type)
  else:
    # the rest are FP instructions, which use D.C
    return "%s.%s" % (op.d.mma_type.ptx_type, op.c.mma_type.ptx_type)
@@ -411,8 +431,10 @@ define ${ret_ty} @test_${function}(
  ret ${ret_ty} %r;
}
"""
  intrinsic_template = "llvm.nvvm.wmma.${geom}.mma.${alayout}.${blayout}.${intrinsic_signature}${satf}"
  instruction_template = "wmma.mma${mma_variant}.sync${aligned}.${alayout}.${blayout}.${geom}.${ptx_signature}${satf}"
  wmma_intrinsic_template = "llvm.nvvm.wmma.${geom}.mma.${alayout}.${blayout}.${intrinsic_signature}${satf}"
  wmma_instruction_template = "wmma.mma${mma_variant}.sync${aligned}.${alayout}.${blayout}.${geom}.${ptx_signature}${satf}"
  mma_intrinsic_template = "llvm.nvvm.mma.${geom}.${alayout}.${blayout}.${intrinsic_signature}"
  mma_instruction_template = "mma.sync${aligned}.${geom}.${alayout}.${blayout}.${ptx_signature}"

  generated_items=[]

@@ -436,6 +458,13 @@ define ${ret_ty} @test_${function}(
        "mma_variant" : ".xor.popc" if op.a.mma_type.ptx_type == "b1" else "",
    }

    if op.a.geom == "m8n8k4":
      intrinsic_template = mma_intrinsic_template
      instruction_template = mma_instruction_template
    else:
      intrinsic_template = wmma_intrinsic_template
      instruction_template = wmma_instruction_template

    test_params = params
    test_params["intrinsic"] = Template(intrinsic_template).substitute(params)
    test_params["function"] = test_params["intrinsic"].replace(".", "_")
@@ -458,55 +487,68 @@ define ${ret_ty} @test_${function}(
# Generate a set of checks to verify that we generated a sensible set of
# tests for the given combination of PTX and SM variants.
#
-# PTX<N>: verifies that we did generate tests for correct classes of intrinsics.
-# PTX<N>U: verifies that we did not generate intrinsics unsupported by
-#          the PTX version.
-# SM<N>: verifies that we did generate correct classes of instructions for the SM.
-# SM<N>U: verifies that we did not generate instructions unsupported by the SM.
-#
-# Note that SM/PTX constraints overlap, but DAG checks do not allow overlapping
-# matches. We implicitly rely on the fact that we generate multiple variants of
-# most of the instructions and usually have enough input data to find more than
-# one match of the same kind, if necessary. When it's not possible (e.g. there's
-# only one m8n8k128.mma.row.col.b1), we may need to match the PTX instruction
-# instead.
def gen_check_unsupported_ops(items):
  print("; Complete list of intrinsics supported by PTX%d on sm_%d"
        % (ptx_version, gpu_arch))
  print("; INTRINSICS: {{^; INTRINSICS_LIST_BEGIN}}")
  print("""
-; PTX60-DAG: m16n16k16.load.{{[ab].*}}.f16.p
-; PTX60-DAG: m16n16k16.{{load|store}}.{{[cd].*\.(f16|f32)}}.p
-; PTX60U-NOT: m32n8k16
-; PTX60U-NOT: m8n32k16
-; PTX60U-NOT: .{{s32|s[48]|u[48]|b1}}
-
-; All features of PTX60, plus m32n8k16/m8n32k16 geometries.
-; PTX61-DAG: m32n8k16.load.{{[ab].*}}.f16.p
-; PTX61-DAG: m32n8k16.{{load|store}}.{{[cd].*\.(f16|f32)}}.p
-; PTX61-DAG: m8n32k16.load.{{[ab].*}}.f16.p
-; PTX61-DAG: m8n32k16.{{load|store}}.{{[cd].*\.(f16|f32)}}.p
-; PTX61U-NOT: .{{s32|s[48]|u[48]|b1}}
-
-; SM70U-NOT: .{{s32|s[48]|u[48]|b1}}
-
-; PTX63 supports all features of PTX60+PTX61, plus support for integers.
-; Alas, we can't just use PTX<N> checks for that, as available instructions
-; depend on the SM: integers need sm72+ and subinteger ops need sm75, so we
-; transition to SM<N> checks.
-; SM72-DAG: m16n16k16.load.{{[ab].*}}.s8.p
-; SM72-DAG: m8n32k16.load.{{[ab].*}}.s8.p
-; SM72-DAG: m32n8k16.load.{{[ab].*}}.s8.p
-; SM72-DAG: m16n16k16.load.{{[ab].*}}.u8.p
-; SM72-DAG: m8n32k16.load.{{[ab].*}}.u8.p
-; SM72-DAG: m32n8k16.load.{{[ab].*}}.u8.p
-; SM72-DAG: m32n8k16.{{load|store}}.{{[cd].*\.s32}}.p
-; SM72U-NOT: .{{s4|u4|b1}}
-
-; SM75-DAG: m8n8k128.load.{{[ab].*}}.b1.p
-; SM75-DAG: m8n8k32.load.{{[ab].*}}.s4.p
-; SM75-DAG: m8n8k32.load.{{[ab].*}}.u4.p
-; SM75-DAG: m8n8k128.{{load|store}}.{{[cd].*\.s32}}.p
-; SM75-DAG: m8n8k32.{{load|store}}.{{[cd].*\.s32}}.p
-
+; NOEXTGEOM-NOT: {{m8n32|m32n8}}
+; NOINT-NOT: .{{s32|s8}}
+; NOSUBINT-NOT: {{s4|u4|b1}}
+; NOMMA-NOT: .m8n8k4.
+
+; M16N16-DAG: m16n16k16.load.{{[ab].*}}.f16.p
+; M16N16-DAG: m16n16k16.{{load|store}}.{{[cd].*\.(f16|f32)}}.p
+; M16N16-DAG: m16n16k16.mma.{{.*}}.f16.f32
+; M16N16-DAG: m16n16k16.mma.{{.*}}.f32.f16
+; M16N16-DAG: m16n16k16.mma.{{.*}}.f16.f16
+; M16N16-DAG: m16n16k16.mma.{{.*}}.f32.f32
+
+; PTX61 adds support for m32n8k16/m8n32k16 geometries.
+; EXTGEOM-DAG: m32n8k16.load.{{[ab].*}}.f16.p
+; EXTGEOM-DAG: m32n8k16.{{load|store}}.{{[cd].*\.(f16|f32)}}.p
+; EXTGEOM-DAG: m32n8k16.mma.{{.*}}.f16.f32
+; EXTGEOM-DAG: m32n8k16.mma.{{.*}}.f32.f16
+; EXTGEOM-DAG: m32n8k16.mma.{{.*}}.f16.f16
+; EXTGEOM-DAG: m32n8k16.mma.{{.*}}.f32.f32
+
+; EXTGEOM-DAG: m8n32k16.load.{{[ab].*}}.f16.p
+; EXTGEOM-DAG: m8n32k16.{{load|store}}.{{[cd].*\.(f16|f32)}}.p
+; EXTGEOM-DAG: m8n32k16.mma.{{.*}}.f16.f32
+; EXTGEOM-DAG: m8n32k16.mma.{{.*}}.f32.f16
+; EXTGEOM-DAG: m8n32k16.mma.{{.*}}.f16.f16
+; EXTGEOM-DAG: m8n32k16.mma.{{.*}}.f32.f32
+
+; INT-DAG: m16n16k16.load.{{[ab].*}}.s8.p
+; INT-DAG: m8n32k16.load.{{[ab].*}}.s8.p
+; INT-DAG: m32n8k16.load.{{[ab].*}}.s8.p
+; INT-DAG: m16n16k16.load.{{[ab].*}}.u8.p
+; INT-DAG: m8n32k16.load.{{[ab].*}}.u8.p
+; INT-DAG: m32n8k16.load.{{[ab].*}}.u8.p
+; INT-DAG: m32n8k16.{{load|store}}.{{[cd].*\.s32}}.p
+; INT-DAG: m16n16k16.mma.{{.*}}.u8
+; INT-DAG: m16n16k16.mma.{{.*}}.s8
+; INT-DAG: m8n32k16.mma.{{.*}}.u8
+; INT-DAG: m8n32k16.mma.{{.*}}.s8
+; INT-DAG: m32n8k16.mma.{{.*}}.u8
+; INT-DAG: m32n8k16.mma.{{.*}}.s8
+
+; SUBINT-DAG: m8n8k128.load.{{[ab].*}}.b1.p
+; SUBINT-DAG: m8n8k32.load.{{[ab].*}}.s4.p
+; SUBINT-DAG: m8n8k32.load.{{[ab].*}}.u4.p
+; SUBINT-DAG: m8n8k128.{{load|store}}.{{[cd].*\.s32}}.p
+; SUBINT-DAG: m8n8k32.{{load|store}}.{{[cd].*\.s32}}.p
+; SUBINT-DAG: m8n8k32.mma.{{.*}}.u4
+; SUBINT-DAG: m8n8k32.mma.{{.*}}.s4
+; SUBINT-DAG: m8n8k128.mma.{{.*}}.b1
+
+; MMA-DAG: mma.m8n8k4.{{.*}}.f16.f32
+; MMA-DAG: mma.m8n8k4.{{.*}}.f32.f16
+; MMA-DAG: mma.m8n8k4.{{.*}}.f16.f16
+; MMA-DAG: mma.m8n8k4.{{.*}}.f32.f32
+;

""")

  print("; INTRINSICS_LIST_BEGIN")