llvm/include/llvm/IR/IntrinsicsNVVM.td  (+43 −14)

@@ -53,6 +53,10 @@ class WMMA_REGS<string Geom, string Frag, string PtxEltType> {
   string gft = Geom#":"#Frag#":"#ptx_elt_type;
   string ft = frag#":"#ptx_elt_type;
   list<LLVMType> regs = !cond(
+    // mma.sync.m8n8k4 uses smaller a/b fragments than wmma fp ops
+    !eq(gft,"m8n8k4:a:f16") : RepLLVMType<2, llvm_v2f16_ty>.ret,
+    !eq(gft,"m8n8k4:b:f16") : RepLLVMType<2, llvm_v2f16_ty>.ret,
+
     // fp16 -> fp16/fp32 @  m16n16k16/m8n32k16/m32n8k16
     // All currently supported geometries use the same fragment format,
     // so we only need to consider {fragment, type}.

@@ -137,13 +141,19 @@ class MMA_SIGNATURE<WMMA_REGS A, WMMA_REGS B, WMMA_REGS C, WMMA_REGS D> {
 class WMMA_NAME_MMA<string ALayout, string BLayout, int Satfinite,
                     WMMA_REGS A, WMMA_REGS B,
                     WMMA_REGS C, WMMA_REGS D> {
   string signature = MMA_SIGNATURE<A, B, C, D>.ret;
-  string llvm = "llvm.nvvm.wmma."
+  string llvm = !if(
+      !eq(A.geom, "m8n8k4"),
+      "llvm.nvvm.mma.m8n8k4"
+          # "." # ALayout
+          # "." # BLayout
+          # signature,
+      "llvm.nvvm.wmma."
                 # A.geom
                 # ".mma"
                 # "." # ALayout
                 # "." # BLayout
                 # signature
-                # !if(Satfinite, ".satfinite", "");
+                # !if(Satfinite, ".satfinite", ""));

   string record = !subst(".", "_",
                   !subst("llvm.", "int_", llvm));

@@ -160,7 +170,7 @@ class MMA_OPS<list<string> Geom, list<string> TypeA, list<string> TypeB,
    !foldl([]<list<WMMA_REGS>>, TypeA, t2, type_a, !listconcat(t2,
    !foldl([]<list<WMMA_REGS>>, !if(!size(TypeB), TypeB, [type_a]), t3, type_b, !listconcat(t3,
    !foldl([]<list<WMMA_REGS>>, TypeC, t4, type_c, !listconcat(t4,
-   !foldl([]<list<WMMA_REGS>>, !if(!size(TypeC), TypeC, [type_c]), t5, type_d, !listconcat(t5,
+   !foldl([]<list<WMMA_REGS>>, !if(!size(TypeD), TypeD, [type_c]), t5, type_d, !listconcat(t5,
    [[WMMA_REGS<geom, "a", type_a>,
      WMMA_REGS<geom, "b", type_b>,
      WMMA_REGS<geom, "c", type_c>,

@@ -185,19 +195,23 @@ class MMA_LDST_OPS<list<string> Geom, list<string> Frags, list<string> Types> {
 // drives generation of corresponding intrinsics and instructions.
 class NVVM_MMA_OPS<int _ = 0> {
   list<list<WMMA_REGS>> fp_mma_ops = MMA_OPS<
+            ["m8n8k4"],
+            ["f16"], [], ["f16", "f32"], ["f16", "f32"]>.ret;
+  list<list<WMMA_REGS>> fp_wmma_ops = MMA_OPS<
             ["m16n16k16", "m32n8k16", "m8n32k16"],
             ["f16"], [], ["f16", "f32"], ["f16", "f32"]>.ret;
-  list<list<WMMA_REGS>> int_mma_ops = MMA_OPS<
+  list<list<WMMA_REGS>> int_wmma_ops = MMA_OPS<
             ["m16n16k16", "m32n8k16", "m8n32k16"],
             ["s8", "u8"], [], ["s32"], []>.ret;
-  list<list<WMMA_REGS>> subint_mma_ops = MMA_OPS<
+  list<list<WMMA_REGS>> subint_wmma_ops = MMA_OPS<
             ["m8n8k32"],
             ["s4", "u4"], [], ["s32"], []>.ret;
-  list<list<WMMA_REGS>> bit_mma_ops = MMA_OPS<
+  list<list<WMMA_REGS>> bit_wmma_ops = MMA_OPS<
             ["m8n8k128"],
             ["b1"], [], ["s32"], []>.ret;
-  list<list<WMMA_REGS>> all_mma_ops = !listconcat(fp_mma_ops, int_mma_ops,
-                                                  subint_mma_ops, bit_mma_ops);
+  list<list<WMMA_REGS>> all_mma_ops = !listconcat(
+            fp_mma_ops, fp_wmma_ops, int_wmma_ops, subint_wmma_ops, bit_wmma_ops);

   list<WMMA_REGS> ldst_ab_ops = MMA_LDST_OPS<
     ["m16n16k16", "m32n8k16", "m8n32k16"],

@@ -245,10 +259,25 @@ class NVVM_MMA_SUPPORTED<list<WMMA_REGS> frags, string layout_a, string layout_b
                    # ":" # frags[0].frag
                    ;
   string t = frags[0].ptx_elt_type;

+  // gcd is a shortcut used to identify instructions that depend on
+  // geom+frag_c+frag_d. Not all instances of this class have all fragments
+  // specified. If there are not enough fragments, the tail evaluates to '?'.
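To make the renaming concrete, here is a small Python sketch (illustrative only, not part of the patch) of the naming split that the WMMA_NAME_MMA change implements: m8n8k4 variants move to a separate llvm.nvvm.mma.* namespace and never take a .satfinite suffix, while all other geometries keep the llvm.nvvm.wmma.* scheme. The helper and its arguments stand in for the TableGen template parameters.

def intrinsic_name(geom, alayout, blayout, signature, satf):
    # mma.sync (m8n8k4) gets its own llvm.nvvm.mma.* namespace, no .satfinite.
    if geom == "m8n8k4":
        return "llvm.nvvm.mma.%s.%s.%s.%s" % (geom, alayout, blayout, signature)
    return "llvm.nvvm.wmma.%s.mma.%s.%s.%s%s" % (
        geom, alayout, blayout, signature, ".satfinite" if satf else "")

print(intrinsic_name("m8n8k4", "row", "col", "f32.f32", 0))
# llvm.nvvm.mma.m8n8k4.row.col.f32.f32
print(intrinsic_name("m16n16k16", "row", "col", "f32.f32", 1))
# llvm.nvvm.wmma.m16n16k16.mma.row.col.f32.f32.satfinite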
+  string gcd = frags[0].geom
+               # ":"
+               # !if(!eq(!size(frags), 4),
+                     frags[2].ptx_elt_type # frags[3].ptx_elt_type,
+                     "?");
   list<int> ret = !cond(
     // Sub-int MMA only supports fixed A/B layout.
     // b1 does not support .satf.
     !eq(mma#":"#satf, "b1:row:col:0") : [1],
+    // mma.m8n8k4 has no .satf modifier.
+    !and(!eq(frags[0].geom, "m8n8k4"),
+         !ne(satf, 0)): [],
+
+    // mma.m8n8k4 has no C=f32 D=f16 variant.
+    !eq(gcd, "m8n8k4:f32f16"): [],
     !eq(mma, "s4:row:col") : [1],
     !eq(mma, "u4:row:col") : [1],

llvm/lib/Target/NVPTX/NVPTXIntrinsics.td  (+28 −14)

@@ -7400,7 +7400,9 @@ class WMMA_REGINFO<WMMA_REGS r>
         // u4/s4/b1 -> s32 @ m8n8k32 (u4/s4), m8n8k128(b1)
         !or(!eq(geom,"m8n8k128"),
-            !eq(geom,"m8n8k32")) : [hasSM75, hasPTX63]);
+            !eq(geom,"m8n8k32")) : [hasSM75, hasPTX63],
+
+        !eq(geom, "m8n8k4") : [hasSM70, hasPTX64]);

   // template DAGs for instruction inputs/output.
   dag Outs = !dag(outs, ptx_regs, reg_names);

@@ -7546,13 +7548,25 @@ class WMMA_MMA<WMMA_REGINFO FragA, WMMA_REGINFO FragB,
   let OutOperandList = FragD.Outs;
   let InOperandList  = !con(Args, (ins MmaCode:$ptx));
+  string TypeList = !cond(
+    !eq(FragD.geom, "m8n8k4") : "." # FragD.ptx_elt_type
+                                # ".f16.f16."
+                                # FragC.ptx_elt_type,
+    !eq(FragD.ptx_elt_type, "s32") : ".s32"
+                                     # "." # FragA.ptx_elt_type
+                                     # "." # FragB.ptx_elt_type
+                                     # ".s32",
+    1: "." # FragD.ptx_elt_type # "." # FragC.ptx_elt_type,
+  );
-  let AsmString = "wmma.mma"
+  let AsmString = !if(!eq(FragA.geom, "m8n8k4"),
+                  "mma.sync.aligned.m8n8k4"
+                  # "." # ALayout
+                  # "." # BLayout
+                  # TypeList # "\n\t\t"
+                  # FragD.regstring # ",\n\t\t"
+                  # FragA.regstring # ",\n\t\t"
+                  # FragB.regstring # ",\n\t\t"
+                  # FragC.regstring # ";",
+                  "wmma.mma"
                   # !if(!eq(FragA.ptx_elt_type, "b1"), ".xor.popc", "")
                   # ".sync"
                   # "${ptx:aligned}"

@@ -7564,7 +7578,7 @@ class WMMA_MMA<WMMA_REGINFO FragA, WMMA_REGINFO FragB,
                   # FragD.regstring # ",\n\t\t"
                   # FragA.regstring # ",\n\t\t"
                   # FragB.regstring # ",\n\t\t"
-                  # FragC.regstring # ";";
+                  # FragC.regstring # ";");
 }

 defset list<WMMA_INSTR> MMAs = {
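The effect of the TypeList/AsmString change is easiest to see as the string it assembles. Below is a rough Python sketch of the new m8n8k4 branch, with placeholder register strings where the real instruction prints the FragX.regstring expansions:

def mma_m8n8k4_asm(alayout, blayout, d_type, c_type, d, a, b, c):
    # TypeList for m8n8k4 is always .<d>.f16.f16.<c>: A and B fragments are f16.
    type_list = ".%s.f16.f16.%s" % (d_type, c_type)
    return ("mma.sync.aligned.m8n8k4.%s.%s%s\n\t\t%s,\n\t\t%s,\n\t\t%s,\n\t\t%s;"
            % (alayout, blayout, type_list, d, a, b, c))

print(mma_m8n8k4_asm("row", "col", "f32", "f32",
                     "{d0..d7}", "{a0,a1}", "{b0,b1}", "{c0..c7}"))
# mma.sync.aligned.m8n8k4.row.col.f32.f16.f16.f32
#         {d0..d7},
#         {a0,a1},
#         {b0,b1},
#         {c0..c7};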
llvm/test/CodeGen/NVPTX/wmma.py  (+101 −59)

@@ -4,39 +4,47 @@
 # Check all variants of instructions supported by PTX60 on SM70
 # RUN: python %s --ptx=60 --gpu-arch=70 > %t-ptx60-sm_70.ll
 # RUN: FileCheck %t-ptx60-sm_70.ll < %t-ptx60-sm_70.ll \
-# RUN:           --check-prefixes=INTRINSICS,PTX60,SM70
+# RUN:           --check-prefixes=INTRINSICS,M16N16
 # RUN: FileCheck %t-ptx60-sm_70.ll < %t-ptx60-sm_70.ll \
-# RUN:           --check-prefixes=INTRINSICS,PTX60U,SM70U
+# RUN:           --check-prefixes=INTRINSICS,NOEXTGEOM,NOINT,NOSUBINT,NOMMA
 # RUN: llc < %t-ptx60-sm_70.ll -march=nvptx64 -mcpu=sm_70 -mattr=+ptx60 \
 # RUN:           | FileCheck %t-ptx60-sm_70.ll

 # Check all variants of instructions supported by PTX61 on SM70
 # RUN: python %s --ptx=61 --gpu-arch=70 > %t-ptx61-sm_70.ll
 # RUN: FileCheck %t-ptx61-sm_70.ll < %t-ptx61-sm_70.ll \
-# RUN:           --check-prefixes=INTRINSICS,PTX60,PTX61,SM70
+# RUN:           --check-prefixes=INTRINSICS,M16N16,EXTGEOM
 # RUN: FileCheck %t-ptx61-sm_70.ll < %t-ptx61-sm_70.ll \
-# RUN:           --check-prefixes=INTRINSICS,PTX61U,SM70U
+# RUN:           --check-prefixes=INTRINSICS,NOINT,NOSUBINT,NOMMA
 # RUN: llc < %t-ptx61-sm_70.ll -march=nvptx64 -mcpu=sm_70 -mattr=+ptx61 \
 # RUN:           | FileCheck %t-ptx61-sm_70.ll

 # Check all variants of instructions supported by PTX63 on SM72
 # RUN: python %s --ptx=63 --gpu-arch=72 > %t-ptx63-sm_72.ll
 # RUN: FileCheck %t-ptx63-sm_72.ll < %t-ptx63-sm_72.ll \
-# RUN:           --check-prefixes=INTRINSICS,PTX60,PTX61,PTX63,SM70,SM72
+# RUN:           --check-prefixes=INTRINSICS,M16N16,EXTGEOM,INT
 # RUN: FileCheck %t-ptx63-sm_72.ll < %t-ptx63-sm_72.ll \
-# RUN:           --check-prefixes=INTRINSICS,PTX63U,SM72U
+# RUN:           --check-prefixes=INTRINSICS,NOSUBINT,NOMMA
 # RUN: llc < %t-ptx63-sm_72.ll -march=nvptx64 -mcpu=sm_72 -mattr=+ptx63 \
 # RUN:           | FileCheck %t-ptx63-sm_72.ll

 # Check all variants of instructions supported by PTX63 on SM75
 # RUN: python %s --ptx=63 --gpu-arch=75 > %t-ptx63-sm_75.ll
 # RUN: FileCheck %t-ptx63-sm_75.ll < %t-ptx63-sm_75.ll \
-# RUN:           --check-prefixes=INTRINSICS,PTX60,PTX61,PTX63,SM70,SM72,SM75
+# RUN:           --check-prefixes=INTRINSICS,M16N16,EXTGEOM,INT,SUBINT
 # RUN: FileCheck %t-ptx63-sm_75.ll < %t-ptx63-sm_75.ll \
-# RUN:           --check-prefixes=INTRINSICS,PTX63U,SM75U
+# RUN:           --check-prefixes=INTRINSICS,NOMMA
 # RUN: llc < %t-ptx63-sm_75.ll -march=nvptx64 -mcpu=sm_75 -mattr=+ptx63 \
 # RUN:           | FileCheck %t-ptx63-sm_75.ll

+# Check all variants of instructions supported by PTX64 on SM70+
+# RUN: python %s --ptx=64 --gpu-arch=70 > %t-ptx64-sm_70.ll
+# RUN: FileCheck %t-ptx64-sm_70.ll < %t-ptx64-sm_70.ll \
+# RUN:           --check-prefixes=INTRINSICS,M16N16,EXTGEOM,MMA
+# RUN: FileCheck %t-ptx64-sm_70.ll < %t-ptx64-sm_70.ll \
+# RUN:           --check-prefixes=INTRINSICS,NOINT,NOSUBINT
+# RUN: llc < %t-ptx64-sm_70.ll -march=nvptx64 -mcpu=sm_70 -mattr=+ptx64 \
+# RUN:           | FileCheck %t-ptx64-sm_70.ll

 from __future__ import print_function

@@ -70,10 +78,11 @@ class MMAFrag:
   def __init__(self, geom, frag, ptx_elt_type):
     self.geom = geom
     self.frag = frag
+    self.is_mma = True if geom == "m8n8k4" else False;
     self.mma_type = MMAType(ptx_elt_type);
     self.nregs = {
-        "a:f16" : 8,
-        "b:f16" : 8,
+        "a:f16" : 2 if self.is_mma else 8,
+        "b:f16" : 2 if self.is_mma else 8,
         "c:f16" : 4,
         "d:f16" : 4,
         "c:f32" : 8,

@@ -145,7 +154,9 @@ def make_ldst_ops(geoms, frags, types):
           in product(geoms, frags, types)]

 def get_mma_ops():
-  return (make_mma_ops(["m16n16k16", "m32n8k16", "m8n32k16"],
+  return (make_mma_ops(["m8n8k4"],
+                       ["f16"], [], ["f16", "f32"], ["f16", "f32"]) +
+          make_mma_ops(["m16n16k16", "m32n8k16", "m8n32k16"],
                        ["f16"], [], ["f16", "f32"], ["f16", "f32"]) +
           make_mma_ops(["m16n16k16", "m32n8k16", "m8n32k16"],
                        ["s8", "u8"], [], ["s32"], []) +

@@ -165,6 +176,8 @@ def get_ldst_ops(kind):
 def is_geom_supported(geom):
   # geometries for FP and ints.
+  if geom == "m8n8k4":
+    return ptx_version >= 64
   if geom in ["m8n32k16", "m32n8k16"]:
     return ptx_version >= 61
   # geometries for sub-ints.
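The nregs change agrees with the RepLLVMType<2, llvm_v2f16_ty> change in IntrinsicsNVVM.td: m8n8k4 A/B fragments are smaller than their wmma counterparts. A standalone sketch of the rule (reimplemented here for illustration rather than imported from wmma.py):

def f16_ab_nregs(geom):
    # mma.m8n8k4 uses two <2 x half> registers per a/b fragment; wmma uses eight.
    is_mma = geom == "m8n8k4"
    return 2 if is_mma else 8

for geom in ["m8n8k4", "m16n16k16", "m32n8k16"]:
    print(geom, "->", f16_ab_nregs(geom), "x <2 x half> per a/b fragment")
# m8n8k4 -> 2 x <2 x half> per a/b fragment
# m16n16k16 -> 8 x <2 x half> per a/b fragment
# m32n8k16 -> 8 x <2 x half> per a/b fragment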
@@ -186,6 +199,13 @@ def is_mma_variant_supported(op, layout_a, layout_b, satf):
   if not (is_type_supported(op.a.mma_type.ptx_type)
           and is_geom_supported(op.a.geom)):
     return False
+  if op.a.geom == "m8n8k4":
+    if satf:
+      return False
+    if op.c.mma_type.ptx_type == "f32":
+      # If C is f32, D must be, too.
+      return op.d.mma_type.ptx_type == "f32"
+
   # sub-integer require row/col layout, and no satf.
   if op.a.mma_type.ptx_type in ["s4", "u4", "b1"]:
     if op.a.mma_type.ptx_type == "b1" and satf:

@@ -232,8 +252,6 @@ def get_pspace(space):
 def check_pattern(frag):
   return "{{%s}}" % ", *".join([frag.mma_type.ptx_reg_pattern] * frag.nregs)

-known_geoms = ["m16n16k16", "m8n32k16", "m32n8k16"]
-
 def gen_wmma_load_tests():
   load_template = """
 declare ${ret_ty} @${intrinsic}(i8 ${as}* %src ${extra_args});

@@ -389,6 +407,8 @@ def mma_ptx_signature(op):
   if op.a.mma_type.ptx_type in ["s8", "u8", "s4", "u4", "b1"]:
     # int and sub-int instructions encode all four types as D.A.B.C
     return ".".join(x.mma_type.ptx_type for x in (op.d, op.a, op.b, op.c))
+  if op.a.geom == "m8n8k4":
+    return "%s.f16.f16.%s" % (op.d.mma_type.ptx_type, op.c.mma_type.ptx_type)
   else:
     # the rest are FP instructions use D.C
     return "%s.%s" % (op.d.mma_type.ptx_type, op.c.mma_type.ptx_type)

@@ -411,8 +431,10 @@ define ${ret_ty} @test_${function}(
   ret ${ret_ty} %r;
 }
 """
-  intrinsic_template = "llvm.nvvm.wmma.${geom}.mma.${alayout}.${blayout}.${intrinsic_signature}${satf}"
-  instruction_template = "wmma.mma${mma_variant}.sync${aligned}.${alayout}.${blayout}.${geom}.${ptx_signature}${satf}"
+  wmma_intrinsic_template = "llvm.nvvm.wmma.${geom}.mma.${alayout}.${blayout}.${intrinsic_signature}${satf}"
+  wmma_instruction_template = "wmma.mma${mma_variant}.sync${aligned}.${alayout}.${blayout}.${geom}.${ptx_signature}${satf}"
+  mma_intrinsic_template = "llvm.nvvm.mma.${geom}.${alayout}.${blayout}.${intrinsic_signature}"
+  mma_instruction_template = "mma.sync${aligned}.${geom}.${alayout}.${blayout}.${ptx_signature}"

   generated_items=[]

@@ -436,6 +458,13 @@ define ${ret_ty} @test_${function}(
       "mma_variant" : ".xor.popc" if op.a.mma_type.ptx_type == "b1" else "",
     }

+    if op.a.geom == "m8n8k4":
+      intrinsic_template = mma_intrinsic_template
+      instruction_template = mma_instruction_template
+    else:
+      intrinsic_template = wmma_intrinsic_template
+      instruction_template = wmma_instruction_template
+
     test_params = params
     test_params["intrinsic"] = Template(intrinsic_template).substitute(params)
     test_params["function"] = test_params["intrinsic"].replace(".", "_")
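The new m8n8k4 branch of is_mma_variant_supported cuts the variant space down as follows: .satf is rejected and C=f32 forces D=f32, but both layouts remain free. A standalone re-implementation (names are illustrative) that counts the surviving combinations:

from itertools import product

def m8n8k4_variant_ok(layout_a, layout_b, c, d, satf):
    if satf:
        return False          # mma.m8n8k4 has no .satf modifier.
    if c == "f32":
        return d == "f32"     # no C=f32, D=f16 variant.
    return True

variants = [
    "mma.sync.aligned.m8n8k4.%s.%s.%s.f16.f16.%s" % (la, lb, d, c)
    for la, lb, c, d, satf in product(
        ["row", "col"], ["row", "col"], ["f16", "f32"], ["f16", "f32"], [0, 1])
    if m8n8k4_variant_ok(la, lb, c, d, satf)
]
print(len(variants))  # 12 = 4 layout combinations x 3 allowed (C, D) type pairs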
@@ -458,55 +487,68 @@ define ${ret_ty} @test_${function}(
 # Generate set of checks to verify that we did generate a sensible set of
 # tests for the given combination of PTX and SM variants.
-#
-# PTX<N>: verifies that we did generate tests for correct classes of intrinsics.
-# PTX<N>U: verifies that we did not generate intrinsics unsupported by
-#          the PTX version.
-# SM<N>: verifies that we did generate correct classes of instructions for the SM.
-# SM<N>U: verifies that we did not generate instructions unsupported by the SM.
-#
-# Note that SM/PTX constraints overlap, but DAG checks do not allow overlapping
-# matches. We implicitly rely that we generate multiple variants of most of the
-# instructions and usually have enough input data to find more than one match of
-# the same kind, if necessary. When it's not possible (e.g. there's only one
-# m8n8k128.mma.row.col.b1), we may need to match the PTX instruction instead.
 def gen_check_unsupported_ops(items):
   print("; Complete list of intrinsics supported by PTX%d on sm_%d"
         % (ptx_version, gpu_arch))
   print("; INTRINSICS: {{^; INTRINSICS_LIST_BEGIN}}")
   print("""
-; PTX60-DAG: m16n16k16.load.{{[ab].*}}.f16.p
-; PTX60-DAG: m16n16k16.{{load|store}}.{{[cd].*\.(f16|f32)}}.p
-; PTX60U-NOT: m32n8k16
-; PTX60U-NOT: m8n32k16
-; PTX60U-NOT: .{{s32|s[48]|u[48]|b1}}
-; All features of PTX60, plus m32n8k16/m8n32k16 geometries.
-; PTX61-DAG: m32n8k16.load.{{[ab].*}}.f16.p
-; PTX61-DAG: m32n8k16.{{load|store}}.{{[cd].*\.(f16|f32)}}.p
-; PTX61-DAG: m8n32k16.load.{{[ab].*}}.f16.p
-; PTX61-DAG: m8n32k16.{{load|store}}.{{[cd].*\.(f16|f32)}}.p
-; PTX61U-NOT: .{{s32|s[48]|u[48]|b1}}
-; SM70U-NOT: .{{s32|s[48]|u[48]|b1}}
-; PTX63 supports all features of PTX60+PTX61, plus support for integers.
-; Alas we can't just use PTX<N> checks for that, as available instructions
-; depend on the SM: integers need sm72+ and subinteger ops need sm75, so we
-; transition to SM<N> checks.
-; SM72-DAG: m16n16k16.load.{{[ab].*}}.s8.p
-; SM72-DAG: m8n32k16.load.{{[ab].*}}.s8.p
-; SM72-DAG: m32n8k16.load.{{[ab].*}}.s8.p
-; SM72-DAG: m16n16k16.load.{{[ab].*}}.u8.p
-; SM72-DAG: m8n32k16.load.{{[ab].*}}.u8.p
-; SM72-DAG: m32n8k16.load.{{[ab].*}}.u8.p
-; SM72-DAG: m32n8k16.{{load|store}}.{{[cd].*\.s32}}.p
-; SM72U-NOT: .{{s4|u4|b1}}
-; SM75-DAG: m8n8k128.load.{{[ab].*}}.b1.p
-; SM75-DAG: m8n8k32.load.{{[ab].*}}.s4.p
-; SM75-DAG: m8n8k32.load.{{[ab].*}}.u4.p
-; SM75-DAG: m8n8k128.{{load|store}}.{{[cd].*\.s32}}.p
-; SM75-DAG: m8n8k32.{{load|store}}.{{[cd].*\.s32}}.p
+; NOEXTGEOM-NOT: {{m8n32|m32n8}}
+; NOINT-NOT: .{{s32|s8}}
+; NOSUBINT-NOT: {{s4|u4|b1}}
+; NOMMA-NOT: .m8n8k4.
+
+; M16N16-DAG: m16n16k16.load.{{[ab].*}}.f16.p
+; M16N16-DAG: m16n16k16.{{load|store}}.{{[cd].*\.(f16|f32)}}.p
+; M16N16-DAG: m16n16k16.mma.{{.*}}.f16.f32
+; M16N16-DAG: m16n16k16.mma.{{.*}}.f32.f16
+; M16N16-DAG: m16n16k16.mma.{{.*}}.f16.f16
+; M16N16-DAG: m16n16k16.mma.{{.*}}.f32.f32
+
+; PTX61 adds support for m32n8k16/m8n32k16 geometries.
+; EXTGEOM-DAG: m32n8k16.load.{{[ab].*}}.f16.p
+; EXTGEOM-DAG: m32n8k16.{{load|store}}.{{[cd].*\.(f16|f32)}}.p
+; EXTGEOM-DAG: m32n8k16.mma.{{.*}}.f16.f32
+; EXTGEOM-DAG: m32n8k16.mma.{{.*}}.f32.f16
+; EXTGEOM-DAG: m32n8k16.mma.{{.*}}.f16.f16
+; EXTGEOM-DAG: m32n8k16.mma.{{.*}}.f32.f32
+; EXTGEOM-DAG: m8n32k16.load.{{[ab].*}}.f16.p
+; EXTGEOM-DAG: m8n32k16.{{load|store}}.{{[cd].*\.(f16|f32)}}.p
+; EXTGEOM-DAG: m8n32k16.mma.{{.*}}.f16.f32
+; EXTGEOM-DAG: m8n32k16.mma.{{.*}}.f32.f16
+; EXTGEOM-DAG: m8n32k16.mma.{{.*}}.f16.f16
+; EXTGEOM-DAG: m8n32k16.mma.{{.*}}.f32.f32
+
+; INT-DAG: m16n16k16.load.{{[ab].*}}.s8.p
+; INT-DAG: m8n32k16.load.{{[ab].*}}.s8.p
+; INT-DAG: m32n8k16.load.{{[ab].*}}.s8.p
+; INT-DAG: m16n16k16.load.{{[ab].*}}.u8.p
+; INT-DAG: m8n32k16.load.{{[ab].*}}.u8.p
+; INT-DAG: m32n8k16.load.{{[ab].*}}.u8.p
+; INT-DAG: m32n8k16.{{load|store}}.{{[cd].*\.s32}}.p
+; INT-DAG: m16n16k16.mma.{{.*}}.u8
+; INT-DAG: m16n16k16.mma.{{.*}}.s8
+; INT-DAG: m8n32k16.mma.{{.*}}.u8
+; INT-DAG: m8n32k16.mma.{{.*}}.s8
+; INT-DAG: m32n8k16.mma.{{.*}}.u8
+; INT-DAG: m32n8k16.mma.{{.*}}.s8
+
+; SUBINT-DAG: m8n8k128.load.{{[ab].*}}.b1.p
+; SUBINT-DAG: m8n8k32.load.{{[ab].*}}.s4.p
+; SUBINT-DAG: m8n8k32.load.{{[ab].*}}.u4.p
+; SUBINT-DAG: m8n8k128.{{load|store}}.{{[cd].*\.s32}}.p
+; SUBINT-DAG: m8n8k32.{{load|store}}.{{[cd].*\.s32}}.p
+; SUBINT-DAG: m8n8k32.mma.{{.*}}.u4
+; SUBINT-DAG: m8n8k32.mma.{{.*}}.s4
+; SUBINT-DAG: m8n8k128.mma.{{.*}}.b1
+
+; MMA-DAG: mma.m8n8k4.{{.*}}.f16.f32
+; MMA-DAG: mma.m8n8k4.{{.*}}.f32.f16
+; MMA-DAG: mma.m8n8k4.{{.*}}.f16.f16
+; MMA-DAG: mma.m8n8k4.{{.*}}.f32.f32
 ;
 """)
 print("; INTRINSICS_LIST_BEGIN")
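Restated as plain data (the dictionary below is a summary for readability, not part of the test), the RUN lines at the top of the file and these feature-based prefixes form the following support matrix; each configuration is checked once with its positive prefixes and once with its negative (-NOT) prefixes:

CONFIGS = {
    ("ptx60", "sm_70"): (["M16N16"], ["NOEXTGEOM", "NOINT", "NOSUBINT", "NOMMA"]),
    ("ptx61", "sm_70"): (["M16N16", "EXTGEOM"], ["NOINT", "NOSUBINT", "NOMMA"]),
    ("ptx63", "sm_72"): (["M16N16", "EXTGEOM", "INT"], ["NOSUBINT", "NOMMA"]),
    ("ptx63", "sm_75"): (["M16N16", "EXTGEOM", "INT", "SUBINT"], ["NOMMA"]),
    ("ptx64", "sm_70"): (["M16N16", "EXTGEOM", "MMA"], ["NOINT", "NOSUBINT"]),
}

for (ptx, sm), (has, lacks) in CONFIGS.items():
    # One FileCheck run asserts the "has" classes appear, another that the
    # "lacks" classes do not.
    print("%s/%s: +%s -%s" % (ptx, sm, ",".join(has), ",".join(lacks)))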