[NVPTX] Add intrinsics for narrow-fp to bf16 conversions (#191376) (83b4c5cd) · Commits · llvm-doe / llvm-project

llvm/docs/NVPTXUsage.rst

+18 −5

Original line number	Diff line number	Diff line
		@@ -1134,6 +1134,14 @@ The following table describes the rounding modes used across these intrinsics:
		\| \| the input. \|
		+-----------------------+---------------------------------------------------+

		.. _scale-factor:

		Some conversions involve a scale factor, which is provided as a packed 16-bit
		integer containing two scaling factors of type ``ue8m0``, one for each input.
		For down-conversion, inputs are divided by ``scale_factor`` and then the
		conversion is performed. For up-conversion, inputs are converted to the
		destination type and then multiplied by ``scale_factor``.

		``fp8`` Conversion Intrinsics
		^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

		@@ -1148,6 +1156,7 @@ Syntax:
		declare i16 @llvm.nvvm.bf16x2.to{.e4m3x2, .e5m2x2}.rn{.relu}.satfinite(<2 x bfloat> %a)
		declare i16 @llvm.nvvm.bf16x2.to.ue8m0x2{.rz, .rp}{.satfinite}(<2 x bfloat> %a)
		declare <2 x half> @llvm.nvvm{.e4m3x2, .e5m2x2}.to.f16x2.rn{.relu}(i16 %a)
		declare <2 x bfloat> @llvm.nvvm{.e4m3x2, .e5m2x2}.to.bf16x2.rn{.relu}{.satfinite}.scale.n2.ue8m0(i16 %a, i16 %scale_factor)
		declare <2 x bfloat> @llvm.nvvm.ue8m0x2.to.bf16x2(i16 %a)
		declare <4 x i8> @llvm.nvvm.f32x4.to{.e4m3x4, .e5m2x4}.rs{.relu}.satfinite(<4 x float> %a, i32 %rnd_bits)

		@@ -1172,6 +1181,8 @@ specified destination format. The ``satfinite`` modifier is assumed to be
		present for conversions involving ``e4m3`` and ``e5m2`` types as the
		destination.

		For scale factor, see :ref:`scale-factor <scale-factor>`.

		For more information, see `PTX ISA <https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cvt>`__.

		``s2f6`` Conversion Intrinsics
		@@ -1204,11 +1215,7 @@ result is sign-preserved ``MAX_NORM`` of the destination format. Also, if the
		input is ``NaN``, then the result is the positive ``MAX_NORM`` of the
		destination format.

		The operand ``%scale_factor`` stores two packed scaling factors of type
		``ue8m0``, one for each input. For down conversion, inputs are divided by
		``scale_factor`` and then the conversion is performed. For up-conversion,
		inputs are converted to destination type and then multiplied by
		``scale_factor``.
		For scale factor, see :ref:`scale-factor <scale-factor>`.

		For more information, see `PTX ISA <https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cvt>`__.

		@@ -1224,6 +1231,7 @@ Syntax:
		declare i16 @llvm.nvvm.f16x2.to{.e2m3x2, .e3m2x2}.rn{.relu}.satfinite(<2 x half> %a)
		declare i16 @llvm.nvvm.bf16x2.to{.e2m3x2, .e3m2x2}.rn{.relu}.satfinite(<2 x bfloat> %a)
		declare <2 x half> @llvm.nvvm{.e2m3x2, .e3m2x2}.to.f16x2.rn{.relu}(i16 %a)
		declare <2 x bfloat> @llvm.nvvm{.e2m3x2, .e3m2x2}.to.bf16x2.rn{.relu}{.satfinite}.scale.n2.ue8m0(i16 %a, i16 %scale_factor)
		declare <4 x i8> @llvm.nvvm.f32x4.to{.e2m3x4, .e3m2x4}.rs{.relu}.satfinite(<4 x float> %a, i32 %rnd_bits)

		Overview:
		@@ -1245,6 +1253,8 @@ result is sign-preserved ``MAX_NORM`` of the destination format. Also, if the
		input is ``NaN``, then the result is the positive ``MAX_NORM`` of the
		destination format.

		For scale factor, see :ref:`scale-factor <scale-factor>`.

		For more information, see `PTX ISA <https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cvt>`__.

		``fp4`` Conversion Intrinsics
		@@ -1259,6 +1269,7 @@ Syntax:
		declare i16 @llvm.nvvm.f16x2.to.e2m1x2.rn{.relu}.satfinite(<2 x half> %a)
		declare i16 @llvm.nvvm.bf16x2.to.e2m1x2.rn{.relu}.satfinite(<2 x bfloat> %a)
		declare <2 x half> @llvm.nvvm.e2m1x2.to.f16x2.rn{.relu}(i16 %a)
		declare <2 x bfloat> @llvm.nvvm.e2m1x2.to.bf16x2.rn{.relu}{.satfinite}.scale.n2.ue8m0(i16 %a, i16 %scale_factor)
		declare i16 @llvm.nvvm.f32x4.to.e2m1x4.rs{.relu}.satfinite(<4 x float> %a, i32 %rnd_bits)

		Overview:
		@@ -1281,6 +1292,8 @@ result is sign-preserved ``MAX_NORM`` of the destination format. Also, if the
		input is ``NaN``, then the result is the positive ``MAX_NORM`` of the
		destination format.

		For scale factor, see :ref:`scale-factor <scale-factor>`.

		For more information, see `PTX ISA <https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cvt>`__.

		Arithmetic Intrinsics

llvm/include/llvm/IR/IntrinsicsNVVM.td

+15 −0

Original line number	Diff line number	Diff line
		@@ -1795,6 +1795,11 @@ let TargetPrefix = "nvvm" in {

		def int_nvvm_bf16x2_to_ # type # _rn # relu # _satfinite
		: PureIntrinsic<[llvm_i16_ty], [llvm_v2bf16_ty]>;

		// Scaled conversions from the narrow-fp `type` to bf16x2. The loop emits
		// two intrinsics per (type, relu) combination: one with no suffix and one
		// with "_satfinite". Each takes two i16 operands and returns <2 x bfloat>.
		// NOTE(review): `type` and `relu` are bound by enclosing foreach loops
		// that are outside this hunk -- confirm their value lists there.
		foreach satfinite = ["", "_satfinite"] in {
		def int_nvvm_ # type # _to_bf16x2_rn # relu # satfinite # _scale_n2_ue8m0
		: PureIntrinsic<[llvm_v2bf16_ty], [llvm_i16_ty, llvm_i16_ty]>;
		}
		}
		}

		@@ -1820,6 +1825,11 @@ let TargetPrefix = "nvvm" in {

		def int_nvvm_bf16x2_to_e2m1x2_rn # relu # _satfinite
		: PureIntrinsic<[llvm_i16_ty], [llvm_v2bf16_ty]>;

		// Scaled conversions from e2m1x2 to bf16x2. The loop emits two intrinsics
		// per `relu` value: one with no suffix and one with "_satfinite". Each
		// takes two i16 operands and returns <2 x bfloat>.
		// NOTE(review): `relu` is bound by an enclosing foreach loop that is
		// outside this hunk -- confirm its value list there.
		foreach satfinite = ["", "_satfinite"] in {
		def int_nvvm_e2m1x2_to_bf16x2_rn # relu # satfinite # _scale_n2_ue8m0
		: PureIntrinsic<[llvm_v2bf16_ty], [llvm_i16_ty, llvm_i16_ty]>;
		}
		}

		// RS rounding mode (Stochastic Rounding) conversions for f4x4 type
		@@ -1843,6 +1853,11 @@ let TargetPrefix = "nvvm" in {

		def int_nvvm_bf16x2_to_ # type # _rn # relu # _satfinite
		: PureIntrinsic<[llvm_i16_ty], [llvm_v2bf16_ty]>;

		// Scaled conversions from the narrow-fp `type` to bf16x2. The loop emits
		// two intrinsics per (type, relu) combination: one with no suffix and one
		// with "_satfinite". Each takes two i16 operands and returns <2 x bfloat>.
		// NOTE(review): `type` and `relu` are bound by enclosing foreach loops
		// that are outside this hunk -- confirm their value lists there.
		foreach satfinite = ["", "_satfinite"] in {
		def int_nvvm_ # type # _to_bf16x2_rn # relu # satfinite # _scale_n2_ue8m0
		: PureIntrinsic<[llvm_v2bf16_ty], [llvm_i16_ty, llvm_i16_ty]>;
		}
		}
		}

llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp

+5 −0

Original line number	Diff line number	Diff line
		@@ -118,6 +118,11 @@ void NVPTXInstPrinter::printCvtMode(const MCInst *MI, int OpNum,
		if (Imm & NVPTX::PTXCvtMode::SAT_FLAG)
		O << ".sat";
		return;
		} else if (Modifier == "satfinite") {
		// SATFINITE flag
		if (Imm & NVPTX::PTXCvtMode::SATFINITE_FLAG)
		O << ".satfinite";
		return;
		} else if (Modifier == "relu") {
		// RELU flag
		if (Imm & NVPTX::PTXCvtMode::RELU_FLAG)

llvm/lib/Target/NVPTX/NVPTX.h

+2 −1

Original line number	Diff line number	Diff line
		@@ -236,7 +236,8 @@ enum CvtMode {
		BASE_MASK = 0x0F,
		FTZ_FLAG = 0x10,
		SAT_FLAG = 0x20,
		RELU_FLAG = 0x40
		RELU_FLAG = 0x40,
		SATFINITE_FLAG = 0x80
		};
		}

llvm/lib/Target/NVPTX/NVPTX.td

+1 −1

Original line number	Diff line number	Diff line
		@@ -106,7 +106,7 @@ foreach sm = [20, 21, 30, 32, 35, 37, 50, 52, 53, 60,

		foreach version = [32, 40, 41, 42, 43, 50, 60, 61, 62, 63, 64, 65, 70, 71, 72,
		73, 74, 75, 76, 77, 78, 80, 81, 82, 83, 84, 85, 86, 87, 88,
		90, 91] in
		90, 91, 92] in
		def PTX#version : FeaturePTX<version>;

		def Is64Bit : Predicate<"Subtarget->getTargetTriple().getArch() == Triple::nvptx64">;