Commit d9972f84 authored by Artem Belevich

[NVPTX] Added llvm.nvvm.mma.m8n8k4.* intrinsics

Differential Revision: https://reviews.llvm.org/D69324
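
For illustration, a minimal Python sketch (mine, not part of the commit; the helper name and the example variant are assumptions) of the naming scheme the TableGen changes below implement: m8n8k4 variants live under the new llvm.nvvm.mma.* prefix, drop the ".mma" infix, and never take a ".satfinite" suffix.

# Sketch of the intrinsic naming scheme (hypothetical helper).
def intrinsic_name(geom, alayout, blayout, signature, satf=False):
    if geom == "m8n8k4":
        # New namespace: no ".mma" infix, no .satfinite variants.
        return "llvm.nvvm.mma.m8n8k4.%s.%s%s" % (alayout, blayout, signature)
    name = "llvm.nvvm.wmma.%s.mma.%s.%s%s" % (geom, alayout, blayout, signature)
    return name + (".satfinite" if satf else "")

# intrinsic_name("m8n8k4", "row", "col", ".f32.f16")
#   -> "llvm.nvvm.mma.m8n8k4.row.col.f32.f16"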
parent 7b9d6a22
+43 −14
@@ -53,6 +53,10 @@ class WMMA_REGS<string Geom, string Frag, string PtxEltType> {
  string gft = Geom#":"#Frag#":"#ptx_elt_type;
  string ft = frag#":"#ptx_elt_type;
  list<LLVMType> regs = !cond(
+    // mma.sync.m8n8k4 uses smaller a/b fragments than wmma fp ops
+    !eq(gft,"m8n8k4:a:f16") : RepLLVMType<2, llvm_v2f16_ty>.ret,
+    !eq(gft,"m8n8k4:b:f16") : RepLLVMType<2, llvm_v2f16_ty>.ret,
+
    // fp16 -> fp16/fp32 @  m16n16k16/m8n32k16/m32n8k16
    // All currently supported geometries use the same fragment format,
    // so we only need to consider {fragment, type}.
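
A quick sketch (assumptions mine) of what the new entries above amount to: each f16 a/b register is a <2 x half> value, and an m8n8k4 fragment carries two of them rather than the eight used by the wmma geometries.

# Fragment-to-register mapping for f16 a/b operands (sketch).
def f16_ab_reg_types(geom):
    n = 2 if geom == "m8n8k4" else 8  # m8n8k4 uses smaller fragments
    return ["<2 x half>"] * n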
@@ -137,13 +141,19 @@ class MMA_SIGNATURE<WMMA_REGS A, WMMA_REGS B, WMMA_REGS C, WMMA_REGS D> {
class WMMA_NAME_MMA<string ALayout, string BLayout, int Satfinite,
                    WMMA_REGS A, WMMA_REGS B, WMMA_REGS C, WMMA_REGS D> {
  string signature = MMA_SIGNATURE<A, B, C, D>.ret;
  string llvm = "llvm.nvvm.wmma."
  string llvm = !if(
    !eq(A.geom, "m8n8k4"),
        "llvm.nvvm.mma.m8n8k4"
           # "." # ALayout
           # "." # BLayout
           # signature,
        "llvm.nvvm.wmma."
           # A.geom
           # ".mma"
           # "." # ALayout
           # "." # BLayout
           # signature
-                # !if(Satfinite, ".satfinite", "");
+           # !if(Satfinite, ".satfinite", ""));

  string record = !subst(".", "_",
                  !subst("llvm.", "int_", llvm));
@@ -160,7 +170,7 @@ class MMA_OPS<list<string> Geom, list<string> TypeA, list<string> TypeB,
     !foldl([]<list<WMMA_REGS>>, TypeA, t2, type_a, !listconcat(t2,
     !foldl([]<list<WMMA_REGS>>, !if(!size(TypeB), TypeB, [type_a]), t3, type_b, !listconcat(t3,
     !foldl([]<list<WMMA_REGS>>, TypeC, t4, type_c, !listconcat(t4,
-     !foldl([]<list<WMMA_REGS>>, !if(!size(TypeC), TypeC, [type_c]), t5, type_d, !listconcat(t5,
+     !foldl([]<list<WMMA_REGS>>, !if(!size(TypeD), TypeD, [type_c]), t5, type_d, !listconcat(t5,
            [[WMMA_REGS<geom, "a", type_a>,
              WMMA_REGS<geom, "b", type_b>,
              WMMA_REGS<geom, "c", type_c>,
@@ -185,19 +195,23 @@ class MMA_LDST_OPS<list<string> Geom, list<string> Frags, list<string> Types> {
// drives generation of corresponding intrinsics and instructions.
class NVVM_MMA_OPS<int _ = 0> {
  list<list<WMMA_REGS>> fp_mma_ops = MMA_OPS<
            ["m8n8k4"],
            ["f16"], [], ["f16", "f32"], ["f16", "f32"]>.ret;
  list<list<WMMA_REGS>> fp_wmma_ops = MMA_OPS<
            ["m16n16k16", "m32n8k16", "m8n32k16"],
            ["f16"], [], ["f16", "f32"], ["f16", "f32"]>.ret;
-  list<list<WMMA_REGS>> int_mma_ops = MMA_OPS<
+  list<list<WMMA_REGS>> int_wmma_ops = MMA_OPS<
            ["m16n16k16", "m32n8k16", "m8n32k16"],
            ["s8", "u8"], [], ["s32"], []>.ret;
-  list<list<WMMA_REGS>> subint_mma_ops = MMA_OPS<
+  list<list<WMMA_REGS>> subint_wmma_ops = MMA_OPS<
            ["m8n8k32"],
            ["s4", "u4"], [], ["s32"], []>.ret;
-  list<list<WMMA_REGS>> bit_mma_ops = MMA_OPS<
+  list<list<WMMA_REGS>> bit_wmma_ops = MMA_OPS<
            ["m8n8k128"],
            ["b1"], [], ["s32"], []>.ret;
-  list<list<WMMA_REGS>> all_mma_ops = !listconcat(fp_mma_ops, int_mma_ops,
-                                                  subint_mma_ops, bit_mma_ops);
+  list<list<WMMA_REGS>> all_mma_ops = !listconcat(
+            fp_mma_ops, fp_wmma_ops, int_wmma_ops,
+            subint_wmma_ops, bit_wmma_ops);

  list<WMMA_REGS> ldst_ab_ops = MMA_LDST_OPS<
            ["m16n16k16", "m32n8k16", "m8n32k16"],
@@ -245,10 +259,25 @@ class NVVM_MMA_SUPPORTED<list<WMMA_REGS> frags, string layout_a, string layout_b
               # ":" # frags[0].frag
               ;
  string t = frags[0].ptx_elt_type;

+  // gcd is a shortcut used to identify instructions that depend on
+  // geom+frag_c+frag_d.  Not all instances of this class have all fragments
+  // specified. If there are not enough fragments, the tail evaluates to '?'.
+  string gcd = frags[0].geom
+               # ":"
+               # !if(!eq(!size(frags), 4),
+                     frags[2].ptx_elt_type # frags[3].ptx_elt_type,
+                     "?");
  list<int> ret = !cond(
    // Sub-int MMA only supports fixed A/B layout.
    // b1 does not support .satf.
    !eq(mma#":"#satf, "b1:row:col:0") : [1],
+    // mma.m8n8k4 has no .satf modifier.
+    !and(!eq(frags[0].geom, "m8n8k4"),
+         !ne(satf, 0)): [],
+
+    // mma.m8n8k4 has no C=f32 D=f16 variant.
+    !eq(gcd, "m8n8k4:f32f16"): [],
    !eq(mma, "s4:row:col") : [1],
    !eq(mma, "u4:row:col") : [1],
    !eq(mma, "s4:row:col") : [1],
+28 −14
@@ -7400,7 +7400,9 @@ class WMMA_REGINFO<WMMA_REGS r>

    // u4/s4/b1 -> s32 @ m8n8k32 (u4/s4), m8n8k128(b1)
    !or(!eq(geom,"m8n8k128"),
        !eq(geom,"m8n8k32")) : [hasSM75, hasPTX63]);
        !eq(geom,"m8n8k32")) : [hasSM75, hasPTX63],

    !eq(geom, "m8n8k4") : [hasSM70, hasPTX64]);

  // template DAGs for instruction inputs/output.
  dag Outs = !dag(outs, ptx_regs, reg_names);
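
Summarizing the predicate lists above (a sketch of mine, not source):

# Minimum SM/PTX requirements per geometry class (sketch).
MIN_REQS = {
    "m8n8k32":  ("sm_75", "ptx63"),  # u4/s4 sub-integer ops
    "m8n8k128": ("sm_75", "ptx63"),  # b1 ops
    "m8n8k4":   ("sm_70", "ptx64"),  # the new fp16 mma.sync ops
}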
@@ -7546,13 +7548,25 @@ class WMMA_MMA<WMMA_REGINFO FragA, WMMA_REGINFO FragB,
  let OutOperandList = FragD.Outs;
  let InOperandList  = !con(Args, (ins MmaCode:$ptx));
+  string TypeList = !cond(
+    !eq(FragD.geom, "m8n8k4") : "." # FragD.ptx_elt_type
+                                # ".f16.f16."
+                                # FragC.ptx_elt_type,
+    !eq(FragD.ptx_elt_type, "s32") : ".s32"
+                                     # "." # FragA.ptx_elt_type
+                                     # "." # FragB.ptx_elt_type
+                                     # ".s32",
+    1: "." # FragD.ptx_elt_type # "." # FragC.ptx_elt_type,
+  );
  let AsmString = "wmma.mma"
  let AsmString = !if(!eq(FragA.geom, "m8n8k4"),
     "mma.sync.aligned.m8n8k4"
        # "." # ALayout
        # "." # BLayout
        # TypeList # "\n\t\t"
        # FragD.regstring # ",\n\t\t"
        # FragA.regstring # ",\n\t\t"
        # FragB.regstring # ",\n\t\t"
        # FragC.regstring # ";",
     "wmma.mma"
        # !if(!eq(FragA.ptx_elt_type, "b1"), ".xor.popc", "")
        # ".sync"
        # "${ptx:aligned}"
@@ -7564,7 +7578,7 @@ class WMMA_MMA<WMMA_REGINFO FragA, WMMA_REGINFO FragB,
        # FragD.regstring # ",\n\t\t"
        # FragA.regstring # ",\n\t\t"
        # FragB.regstring # ",\n\t\t"
                  # FragC.regstring # ";";
        # FragC.regstring # ";");
}
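
A small sketch (mine; the helper is hypothetical) of the instruction text the new m8n8k4 branch of AsmString assembles, with TypeList expanding to D.f16.f16.C; the register operand lists that follow are elided:

# PTX text assembled by the m8n8k4 branch (operands elided).
def m8n8k4_asm(alayout, blayout, dtype, ctype):
    type_list = ".%s.f16.f16.%s" % (dtype, ctype)  # TypeList above
    return "mma.sync.aligned.m8n8k4.%s.%s%s" % (alayout, blayout, type_list)

# m8n8k4_asm("row", "col", "f32", "f16")
#   -> "mma.sync.aligned.m8n8k4.row.col.f32.f16.f16.f16"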

defset list<WMMA_INSTR> MMAs  = {
+101 −59
@@ -4,39 +4,47 @@
# Check all variants of instructions supported by PTX60 on SM70
# RUN: python %s --ptx=60 --gpu-arch=70 > %t-ptx60-sm_70.ll
# RUN: FileCheck %t-ptx60-sm_70.ll < %t-ptx60-sm_70.ll \
-# RUN:           --check-prefixes=INTRINSICS,PTX60,SM70
+# RUN:           --check-prefixes=INTRINSICS,M16N16
# RUN: FileCheck %t-ptx60-sm_70.ll < %t-ptx60-sm_70.ll \
-# RUN:           --check-prefixes=INTRINSICS,PTX60U,SM70U
+# RUN:           --check-prefixes=INTRINSICS,NOEXTGEOM,NOINT,NOSUBINT,NOMMA
# RUN: llc < %t-ptx60-sm_70.ll -march=nvptx64 -mcpu=sm_70 -mattr=+ptx60 \
# RUN:           | FileCheck %t-ptx60-sm_70.ll

# Check all variants of instructions supported by PTX61 on SM70
# RUN: python %s --ptx=61 --gpu-arch=70 > %t-ptx61-sm_70.ll
# RUN: FileCheck %t-ptx61-sm_70.ll < %t-ptx61-sm_70.ll \
-# RUN:           --check-prefixes=INTRINSICS,PTX60,PTX61,SM70
+# RUN:           --check-prefixes=INTRINSICS,M16N16,EXTGEOM
# RUN: FileCheck %t-ptx61-sm_70.ll < %t-ptx61-sm_70.ll \
-# RUN:           --check-prefixes=INTRINSICS,PTX61U,SM70U
+# RUN:           --check-prefixes=INTRINSICS,NOINT,NOSUBINT,NOMMA
# RUN: llc < %t-ptx61-sm_70.ll -march=nvptx64 -mcpu=sm_70 -mattr=+ptx61 \
# RUN:           | FileCheck %t-ptx61-sm_70.ll

# Check all variants of instructions supported by PTX63 on SM72
# RUN: python %s --ptx=63 --gpu-arch=72 > %t-ptx63-sm_72.ll
# RUN: FileCheck %t-ptx63-sm_72.ll < %t-ptx63-sm_72.ll \
-# RUN:           --check-prefixes=INTRINSICS,PTX60,PTX61,PTX63,SM70,SM72
+# RUN:           --check-prefixes=INTRINSICS,M16N16,EXTGEOM,INT
# RUN: FileCheck %t-ptx63-sm_72.ll < %t-ptx63-sm_72.ll \
-# RUN:           --check-prefixes=INTRINSICS,PTX63U,SM72U
+# RUN:           --check-prefixes=INTRINSICS,NOSUBINT,NOMMA
# RUN: llc < %t-ptx63-sm_72.ll -march=nvptx64 -mcpu=sm_72 -mattr=+ptx63 \
# RUN:           | FileCheck %t-ptx63-sm_72.ll

# Check all variants of instructions supported by PTX63 on SM75
# RUN: python %s --ptx=63 --gpu-arch=75 > %t-ptx63-sm_75.ll
# RUN: FileCheck %t-ptx63-sm_75.ll < %t-ptx63-sm_75.ll \
-# RUN:           --check-prefixes=INTRINSICS,PTX60,PTX61,PTX63,SM70,SM72,SM75
+# RUN:           --check-prefixes=INTRINSICS,M16N16,EXTGEOM,INT,SUBINT
# RUN: FileCheck %t-ptx63-sm_75.ll < %t-ptx63-sm_75.ll \
-# RUN:           --check-prefixes=INTRINSICS,PTX63U,SM75U
+# RUN:           --check-prefixes=INTRINSICS,NOMMA
# RUN: llc < %t-ptx63-sm_75.ll -march=nvptx64 -mcpu=sm_75 -mattr=+ptx63 \
# RUN:           | FileCheck %t-ptx63-sm_75.ll

+# Check all variants of instructions supported by PTX64 on SM70+
+# RUN: python %s --ptx=64 --gpu-arch=70 > %t-ptx64-sm_70.ll
+# RUN: FileCheck %t-ptx64-sm_70.ll < %t-ptx64-sm_70.ll \
+# RUN:           --check-prefixes=INTRINSICS,M16N16,EXTGEOM,MMA
+# RUN: FileCheck %t-ptx64-sm_70.ll < %t-ptx64-sm_70.ll \
+# RUN:           --check-prefixes=INTRINSICS,NOINT,NOSUBINT
+# RUN: llc < %t-ptx64-sm_70.ll -march=nvptx64 -mcpu=sm_70 -mattr=+ptx64 \
+# RUN:           | FileCheck %t-ptx64-sm_70.ll
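
The RUN lines above pair a positive and a negative FileCheck prefix set for each PTX/SM combination; a summary sketch (mine, not part of the test):

# (ptx, sm) -> (prefixes that must match, prefixes that must match nothing).
# INTRINSICS is common to every FileCheck invocation.
CHECK_MATRIX = {
    (60, 70): (["M16N16"], ["NOEXTGEOM", "NOINT", "NOSUBINT", "NOMMA"]),
    (61, 70): (["M16N16", "EXTGEOM"], ["NOINT", "NOSUBINT", "NOMMA"]),
    (63, 72): (["M16N16", "EXTGEOM", "INT"], ["NOSUBINT", "NOMMA"]),
    (63, 75): (["M16N16", "EXTGEOM", "INT", "SUBINT"], ["NOMMA"]),
    (64, 70): (["M16N16", "EXTGEOM", "MMA"], ["NOINT", "NOSUBINT"]),
}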

from __future__ import print_function

@@ -70,10 +78,11 @@ class MMAFrag:
  def __init__(self, geom, frag, ptx_elt_type):
    self.geom = geom
    self.frag = frag
+    self.is_mma = True if geom == "m8n8k4" else False;
    self.mma_type = MMAType(ptx_elt_type);
    self.nregs = {
        "a:f16" : 8,
        "b:f16" : 8,
        "a:f16" : 2 if self.is_mma else 8,
        "b:f16" : 2 if self.is_mma else 8,
        "c:f16" : 4,
        "d:f16" : 4,
        "c:f32" : 8,
@@ -145,7 +154,9 @@ def make_ldst_ops(geoms, frags, types):
          in product(geoms, frags, types)]

def get_mma_ops():
  return (make_mma_ops(["m16n16k16", "m32n8k16", "m8n32k16"],
  return (make_mma_ops(["m8n8k4"],
                       ["f16"], [], ["f16", "f32"], ["f16", "f32"]) +
          make_mma_ops(["m16n16k16", "m32n8k16", "m8n32k16"],
                       ["f16"], [], ["f16", "f32"], ["f16", "f32"]) +
          make_mma_ops(["m16n16k16", "m32n8k16", "m8n32k16"],
                       ["s8", "u8"], [], ["s32"], []) +
@@ -165,6 +176,8 @@ def get_ldst_ops(kind):

def is_geom_supported(geom):
  # geometries for FP and ints.
  if geom == "m8n8k4":
    return ptx_version >= 64
  if geom in ["m8n32k16", "m32n8k16"]:
    return ptx_version >= 61
  # geometries for sub-ints.
@@ -186,6 +199,13 @@ def is_mma_variant_supported(op, layout_a, layout_b, satf):
  if not (is_type_supported(op.a.mma_type.ptx_type)
          and is_geom_supported(op.a.geom)):
    return False
  if op.a.geom == "m8n8k4":
    if satf:
      return False
    if op.c.mma_type.ptx_type == "f32":
      # If C is f32, D must be, too.
      return op.d.mma_type.ptx_type == "f32"

  # sub-integer ops require row/col layout, and no satf.
  if op.a.mma_type.ptx_type in ["s4", "u4", "b1"]:
    if op.a.mma_type.ptx_type == "b1" and satf:
@@ -232,8 +252,6 @@ def get_pspace(space):
def check_pattern(frag):
   return "{{%s}}" % ", *".join([frag.mma_type.ptx_reg_pattern] * frag.nregs)

-known_geoms = ["m16n16k16", "m8n32k16", "m32n8k16"]

def gen_wmma_load_tests():
  load_template = """
declare ${ret_ty} @${intrinsic}(i8 ${as}* %src ${extra_args});
@@ -389,6 +407,8 @@ def mma_ptx_signature(op):
  if op.a.mma_type.ptx_type in ["s8", "u8", "s4", "u4", "b1"]:
    # int and sub-int instructions encode all four types as D.A.B.C
    return ".".join(x.mma_type.ptx_type for x in (op.d, op.a, op.b, op.c))
  if op.a.geom == "m8n8k4":
    return "%s.f16.f16.%s" % (op.d.mma_type.ptx_type, op.c.mma_type.ptx_type)
  else:
    # the rest are FP instructions, which use D.C
    return "%s.%s" % (op.d.mma_type.ptx_type, op.c.mma_type.ptx_type)
@@ -411,8 +431,10 @@ define ${ret_ty} @test_${function}(
  ret ${ret_ty} %r;
}
"""
  intrinsic_template = "llvm.nvvm.wmma.${geom}.mma.${alayout}.${blayout}.${intrinsic_signature}${satf}"
  instruction_template = "wmma.mma${mma_variant}.sync${aligned}.${alayout}.${blayout}.${geom}.${ptx_signature}${satf}"
  wmma_intrinsic_template = "llvm.nvvm.wmma.${geom}.mma.${alayout}.${blayout}.${intrinsic_signature}${satf}"
  wmma_instruction_template = "wmma.mma${mma_variant}.sync${aligned}.${alayout}.${blayout}.${geom}.${ptx_signature}${satf}"
  mma_intrinsic_template = "llvm.nvvm.mma.${geom}.${alayout}.${blayout}.${intrinsic_signature}"
  mma_instruction_template = "mma.sync${aligned}.${geom}.${alayout}.${blayout}.${ptx_signature}"

  generated_items=[]

@@ -436,6 +458,13 @@ define ${ret_ty} @test_${function}(
        "mma_variant" : ".xor.popc" if op.a.mma_type.ptx_type == "b1" else "",
    }

    if op.a.geom == "m8n8k4":
      intrinsic_template = mma_intrinsic_template
      instruction_template = mma_instruction_template
    else:
      intrinsic_template = wmma_intrinsic_template
      instruction_template = wmma_instruction_template

    test_params = params
    test_params["intrinsic"] = Template(intrinsic_template).substitute(params)
    test_params["function"] = test_params["intrinsic"].replace(".", "_")
@@ -458,55 +487,68 @@ define ${ret_ty} @test_${function}(
# Generate a set of checks to verify that we generated a sensible set of
# tests for the given combination of PTX and SM variants.
#
-# PTX<N>: verifies that we did generate tests for correct classes of intrinsics.
-# PTX<N>U: verifies that we did not generate intrinsics unsupported by
-#          the PTX version.
-# SM<N>: verifies that we did generate correct classes of instructions for the SM.
-# SM<N>U: verifies that we did not generate instructions unsupported by the SM.
-#
-# Note that SM/PTX constraints overlap, but DAG checks do not allow overlapping
-# matches. We implicitly rely on the fact that we generate multiple variants of
-# most of the instructions and usually have enough input data to find more than
-# one match of the same kind, if necessary. When it's not possible (e.g. there's
-# only one m8n8k128.mma.row.col.b1), we may need to match the PTX instruction
-# instead.
def gen_check_unsupported_ops(items):
  print("; Complete list of intrinsics supported by PTX%d on sm_%d"
        % (ptx_version, gpu_arch))
  print("; INTRINSICS: {{^; INTRINSICS_LIST_BEGIN}}")
  print("""
-; PTX60-DAG: m16n16k16.load.{{[ab].*}}.f16.p
-; PTX60-DAG: m16n16k16.{{load|store}}.{{[cd].*\.(f16|f32)}}.p
-; PTX60U-NOT: m32n8k16
-; PTX60U-NOT: m8n32k16
-; PTX60U-NOT: .{{s32|s[48]|u[48]|b1}}
-
-; All features of PTX60, plus m32n8k16/m8n32k16 geometries.
-; PTX61-DAG: m32n8k16.load.{{[ab].*}}.f16.p
-; PTX61-DAG: m32n8k16.{{load|store}}.{{[cd].*\.(f16|f32)}}.p
-; PTX61-DAG: m8n32k16.load.{{[ab].*}}.f16.p
-; PTX61-DAG: m8n32k16.{{load|store}}.{{[cd].*\.(f16|f32)}}.p
-; PTX61U-NOT: .{{s32|s[48]|u[48]|b1}}
-
-; SM70U-NOT: .{{s32|s[48]|u[48]|b1}}
-
-; PTX63 supports all features of PTX60+PTX61, plus support for integers.
-; Alas, we can't just use PTX<N> checks for that, as available instructions
-; depend on the SM: integers need sm72+ and subinteger ops need sm75, so we
-; transition to SM<N> checks.
-; SM72-DAG: m16n16k16.load.{{[ab].*}}.s8.p
-; SM72-DAG: m8n32k16.load.{{[ab].*}}.s8.p
-; SM72-DAG: m32n8k16.load.{{[ab].*}}.s8.p
-; SM72-DAG: m16n16k16.load.{{[ab].*}}.u8.p
-; SM72-DAG: m8n32k16.load.{{[ab].*}}.u8.p
-; SM72-DAG: m32n8k16.load.{{[ab].*}}.u8.p
-; SM72-DAG: m32n8k16.{{load|store}}.{{[cd].*\.s32}}.p
-; SM72U-NOT: .{{s4|u4|b1}}
-
-; SM75-DAG: m8n8k128.load.{{[ab].*}}.b1.p
-; SM75-DAG: m8n8k32.load.{{[ab].*}}.s4.p
-; SM75-DAG: m8n8k32.load.{{[ab].*}}.u4.p
-; SM75-DAG: m8n8k128.{{load|store}}.{{[cd].*\.s32}}.p
-; SM75-DAG: m8n8k32.{{load|store}}.{{[cd].*\.s32}}.p
-
+; NOEXTGEOM-NOT: {{m8n32|m32n8}}
+; NOINT-NOT: .{{s32|s8}}
+; NOSUBINT-NOT: {{s4|u4|b1}}
+; NOMMA-NOT: .m8n8k4.
+
+; M16N16-DAG: m16n16k16.load.{{[ab].*}}.f16.p
+; M16N16-DAG: m16n16k16.{{load|store}}.{{[cd].*\.(f16|f32)}}.p
+; M16N16-DAG: m16n16k16.mma.{{.*}}.f16.f32
+; M16N16-DAG: m16n16k16.mma.{{.*}}.f32.f16
+; M16N16-DAG: m16n16k16.mma.{{.*}}.f16.f16
+; M16N16-DAG: m16n16k16.mma.{{.*}}.f32.f32
+
+; PTX61 adds support for m32n8k16/m8n32k16 geometries.
+; EXTGEOM-DAG: m32n8k16.load.{{[ab].*}}.f16.p
+; EXTGEOM-DAG: m32n8k16.{{load|store}}.{{[cd].*\.(f16|f32)}}.p
+; EXTGEOM-DAG: m32n8k16.mma.{{.*}}.f16.f32
+; EXTGEOM-DAG: m32n8k16.mma.{{.*}}.f32.f16
+; EXTGEOM-DAG: m32n8k16.mma.{{.*}}.f16.f16
+; EXTGEOM-DAG: m32n8k16.mma.{{.*}}.f32.f32
+
+; EXTGEOM-DAG: m8n32k16.load.{{[ab].*}}.f16.p
+; EXTGEOM-DAG: m8n32k16.{{load|store}}.{{[cd].*\.(f16|f32)}}.p
+; EXTGEOM-DAG: m8n32k16.mma.{{.*}}.f16.f32
+; EXTGEOM-DAG: m8n32k16.mma.{{.*}}.f32.f16
+; EXTGEOM-DAG: m8n32k16.mma.{{.*}}.f16.f16
+; EXTGEOM-DAG: m8n32k16.mma.{{.*}}.f32.f32
+
+; INT-DAG: m16n16k16.load.{{[ab].*}}.s8.p
+; INT-DAG: m8n32k16.load.{{[ab].*}}.s8.p
+; INT-DAG: m32n8k16.load.{{[ab].*}}.s8.p
+; INT-DAG: m16n16k16.load.{{[ab].*}}.u8.p
+; INT-DAG: m8n32k16.load.{{[ab].*}}.u8.p
+; INT-DAG: m32n8k16.load.{{[ab].*}}.u8.p
+; INT-DAG: m32n8k16.{{load|store}}.{{[cd].*\.s32}}.p
+; INT-DAG: m16n16k16.mma.{{.*}}.u8
+; INT-DAG: m16n16k16.mma.{{.*}}.s8
+; INT-DAG: m8n32k16.mma.{{.*}}.u8
+; INT-DAG: m8n32k16.mma.{{.*}}.s8
+; INT-DAG: m32n8k16.mma.{{.*}}.u8
+; INT-DAG: m32n8k16.mma.{{.*}}.s8
+
+; SUBINT-DAG: m8n8k128.load.{{[ab].*}}.b1.p
+; SUBINT-DAG: m8n8k32.load.{{[ab].*}}.s4.p
+; SUBINT-DAG: m8n8k32.load.{{[ab].*}}.u4.p
+; SUBINT-DAG: m8n8k128.{{load|store}}.{{[cd].*\.s32}}.p
+; SUBINT-DAG: m8n8k32.{{load|store}}.{{[cd].*\.s32}}.p
+; SUBINT-DAG: m8n8k32.mma.{{.*}}.u4
+; SUBINT-DAG: m8n8k32.mma.{{.*}}.s4
+; SUBINT-DAG: m8n8k128.mma.{{.*}}.b1
+
+; MMA-DAG: mma.m8n8k4.{{.*}}.f16.f32
+; MMA-DAG: mma.m8n8k4.{{.*}}.f32.f16
+; MMA-DAG: mma.m8n8k4.{{.*}}.f16.f16
+; MMA-DAG: mma.m8n8k4.{{.*}}.f32.f32
+;

""")

  print("; INTRINSICS_LIST_BEGIN")