Merge branch 'fp32_residual_conn' into 'main' (c84f0752) · Commits · candle / Megatron-LM

megatron/arguments.py

+9 −0

Original line number	Diff line number	Diff line
		@@ -183,6 +183,9 @@ def parse_args(extra_args_provider=None, defaults={},
		# Mixed precision checks.
		if args.fp16_lm_cross_entropy:
		assert args.fp16, 'lm cross entropy in fp16 only support in fp16 mode.'
		if args.fp32_residual_connection:
		assert args.fp16, \
		'residual connection in fp32 only supported when using fp16.'
		# Activation checkpointing.
		if args.distribute_checkpointed_activations:
		assert args.checkpoint_activations, \
		@@ -197,6 +200,10 @@ def parse_args(extra_args_provider=None, defaults={},
		if args.scaled_masked_softmax_fusion:
		fused_kernels.load_scaled_masked_softmax_fusion_kernel()

		# Load mixed precision fused layer norm.
		if args.fp32_residual_connection:
		fused_kernels.load_fused_mix_prec_layer_norm_kernel()

		_print_args(args)
		return args

		@@ -435,6 +442,8 @@ def _add_mixed_precision_args(parser):

		group.add_argument('--fp16', action='store_true',
		help='Run model in fp16 mode.')
		group.add_argument('--fp32-residual-connection', action='store_true',
		help='Move residual connections to fp32.')
		group.add_argument('--apply-query-key-layer-scaling', action='store_true',
		help='Scale Q * K^T by 1 / layer-number. If this flag '
		'is set, then it will automatically set '

megatron/fused_kernels/init.py

+26 −0

Original line number	Diff line number	Diff line
		@@ -98,3 +98,29 @@ def load_scaled_masked_softmax_fusion_kernel():
		'--expt-relaxed-constexpr',
		'--expt-extended-lambda',
		'--use_fast_math'] + cc_flag)


		def load_fused_mix_prec_layer_norm_kernel():

		# Check, if CUDA11 is installed for compute capability 8.0
		cc_flag = []
		_, bare_metal_major, _ = get_cuda_bare_metal_version(cpp_extension.CUDA_HOME)
		if int(bare_metal_major) >= 11:
		cc_flag.append('-gencode')
		cc_flag.append('arch=compute_80,code=sm_80')

		srcpath = pathlib.Path(__file__).parent.absolute()
		buildpath = srcpath / 'build'

		create_build_dir(buildpath)

		fused_mix_prec_layer_norm_cuda = cpp_extension.load(
		name='fused_mix_prec_layer_norm_cuda',
		sources=[srcpath / 'layer_norm_cuda.cpp',
		srcpath / 'layer_norm_cuda_kernel.cu'],
		build_directory=buildpath,
		extra_cflags=['-O3'],
		extra_cuda_cflags=['-O3',
		'-gencode', 'arch=compute_70,code=sm_70',
		'-maxrregcount=50',
		'--use_fast_math'] + cc_flag)

megatron/fused_kernels/compat.h

0 → 100644

+31 −0

Original line number	Diff line number	Diff line
		/* coding=utf-8
		* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
		*
		* Licensed under the Apache License, Version 2.0 (the "License");
		* you may not use this file except in compliance with the License.
		* You may obtain a copy of the License at
		*
		* http://www.apache.org/licenses/LICENSE-2.0
		*
		* Unless required by applicable law or agreed to in writing, software
		* distributed under the License is distributed on an "AS IS" BASIS,
		* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
		* See the License for the specific language governing permissions and
		* limitations under the License.
		*/

		/*This code is copied fron NVIDIA apex:
		* https://github.com/NVIDIA/apex
		* with minor changes. */



		#ifndef TORCH_CHECK
		#define TORCH_CHECK AT_CHECK
		#endif

		#ifdef VERSION_GE_1_3
		#define DATA_PTR data_ptr
		#else
		#define DATA_PTR data
		#endif

megatron/fused_kernels/layer_norm_cuda.cpp

0 → 100644

+260 −0

Original line number	Diff line number	Diff line
		/* coding=utf-8
		* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
		*
		* Licensed under the Apache License, Version 2.0 (the "License");
		* you may not use this file except in compliance with the License.
		* You may obtain a copy of the License at
		*
		* http://www.apache.org/licenses/LICENSE-2.0
		*
		* Unless required by applicable law or agreed to in writing, software
		* distributed under the License is distributed on an "AS IS" BASIS,
		* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
		* See the License for the specific language governing permissions and
		* limitations under the License.
		*/

		/*This code is copied fron NVIDIA apex:
		* https://github.com/NVIDIA/apex
		* with minor changes. */

		#include <torch/extension.h>
		#include <vector>
		#include <cassert>
		#include "compat.h"

		namespace {
		void compute_n1_n2(
		at::Tensor input,
		#ifdef VERSION_GE_1_1
		at::IntArrayRef normalized_shape,
		#else
		at::IntList normalized_shape,
		#endif
		int& n1,
		int& n2)
		{
		int idiff = input.ndimension() - normalized_shape.size();
		n2 = 1;
		for (int i = 0; i < (int)normalized_shape.size(); ++i) {
		assert( input.sizes()[i+idiff] == normalized_shape[i] );
		n2 *= normalized_shape[i];
		}
		n1 = 1;
		for (int i = 0; i < idiff; ++i) {
		n1 *= input.sizes()[i];
		}
		}

		void check_args(
		#ifdef VERSION_GE_1_1
		at::IntArrayRef normalized_shape,
		#else
		at::IntList normalized_shape,
		#endif
		at::Tensor gamma,
		at::Tensor beta
		)
		{
		TORCH_CHECK(!gamma.defined() \|\| gamma.sizes().equals(normalized_shape));
		TORCH_CHECK(!beta.defined() \|\| beta.sizes().equals(normalized_shape));
		}

		void check_args(
		at::Tensor input,
		#ifdef VERSION_GE_1_1
		at::IntArrayRef normalized_shape,
		#else
		at::IntList normalized_shape,
		#endif
		int& n1,
		int& n2
		)
		{
		int64_t normalized_ndim = normalized_shape.size();

		if (normalized_ndim < 1) {
		std::stringstream ss;
		ss << "Expected normalized_shape to be at least 1-dimensional, i.e., "
		<< "containing at least one element, but got normalized_shape="
		<< normalized_shape;
		throw std::runtime_error(ss.str());
		}

		auto input_shape = input.sizes();
		auto input_ndim = input.dim();

		if (input_ndim < normalized_ndim \|\|
		!input_shape.slice(input_ndim - normalized_ndim).equals(normalized_shape)) {
		std::stringstream ss;
		ss << "Given normalized_shape=" << normalized_shape
		<< ", expected input with shape [*";
		for (auto size : normalized_shape) {
		ss << ", " << size;
		}
		ss << "], but got input of size" << input_shape;
		throw std::runtime_error(ss.str());
		}

		compute_n1_n2(input,normalized_shape,n1,n2);
		}


		void check_args(
		at::Tensor input,
		#ifdef VERSION_GE_1_1
		at::IntArrayRef normalized_shape,
		#else
		at::IntList normalized_shape,
		#endif
		at::Tensor gamma,
		at::Tensor beta,
		int& n1,
		int& n2
		)
		{
		check_args(input,normalized_shape,n1,n2);
		check_args(normalized_shape,gamma,beta);
		}
		}

		void cuda_layer_norm(
		at::Tensor* output,
		at::Tensor* mean,
		at::Tensor* invvar,
		at::Tensor* input,
		int n1,
		int n2,
		#ifdef VERSION_GE_1_1
		at::IntArrayRef normalized_shape,
		#else
		at::IntList normalized_shape,
		#endif
		at::Tensor* gamma,
		at::Tensor* beta,
		double epsilon);

		#define CHECK_CUDA(x) TORCH_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor")
		#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
		#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)

		std::vector<at::Tensor> layer_norm(
		at::Tensor input,
		#ifdef VERSION_GE_1_1
		at::IntArrayRef normalized_shape,
		#else
		at::IntList normalized_shape,
		#endif
		double epsilon) {
		CHECK_INPUT(input);
		int n1,n2;
		check_args(input,normalized_shape,n1,n2);
		at::Tensor output = at::empty_like(input);
		at::Tensor mean = at::empty({n1}, input.options().dtype(input.scalar_type()==at::ScalarType::Half ? at::ScalarType::Float : input.scalar_type()));
		at::Tensor invvar = at::empty_like(mean);
		cuda_layer_norm(&output,&mean,&invvar,&input,n1,n2,
		normalized_shape,NULL,NULL,epsilon);
		return {output, mean, invvar};
		}
		std::vector<at::Tensor> layer_norm_affine(
		at::Tensor input,
		#ifdef VERSION_GE_1_1
		at::IntArrayRef normalized_shape,
		#else
		at::IntList normalized_shape,
		#endif
		at::Tensor gamma,
		at::Tensor beta,
		double epsilon) {
		CHECK_INPUT(input);
		CHECK_INPUT(gamma);
		CHECK_INPUT(beta);
		int n1,n2;
		check_args(input,normalized_shape,gamma,beta,n1,n2);
		at::Tensor output = at::empty_like(input, input.options().dtype(at::ScalarType::Half));
		at::Tensor mean = at::empty({n1}, input.options().dtype(input.scalar_type()==at::ScalarType::Half ? at::ScalarType::Float : input.scalar_type()));
		at::Tensor invvar = at::empty_like(mean);
		cuda_layer_norm(&output,&mean,&invvar,&input,n1,n2,
		normalized_shape,&gamma,&beta,epsilon);
		return {output, mean, invvar};
		}

		void cuda_layer_norm_gradient(
		at::Tensor* dout,
		at::Tensor* mean,
		at::Tensor* invvar,
		at::Tensor* input,
		int n1,
		int n2,
		#ifdef VERSION_GE_1_1
		at::IntArrayRef normalized_shape,
		#else
		at::IntList normalized_shape,
		#endif
		at::Tensor* gamma,
		at::Tensor* beta,
		double epsilon,
		at::Tensor* grad_input,
		at::Tensor* grad_gamma,
		at::Tensor* grad_beta
		);

		at::Tensor layer_norm_gradient(
		at::Tensor dout,
		at::Tensor mean,
		at::Tensor invvar,
		at::Tensor input,
		#ifdef VERSION_GE_1_1
		at::IntArrayRef normalized_shape,
		#else
		at::IntList normalized_shape,
		#endif
		double epsilon) {
		CHECK_INPUT(dout);
		CHECK_INPUT(mean);
		CHECK_INPUT(invvar);
		CHECK_INPUT(input);
		int n1,n2;
		check_args(input,normalized_shape,n1,n2);
		at::Tensor grad_input = at::empty_like(input);
		cuda_layer_norm_gradient(&dout,&mean,&invvar,&input,n1,n2,
		normalized_shape,NULL,NULL,epsilon,
		&grad_input,NULL,NULL);
		return grad_input;
		}
		std::vector<at::Tensor> layer_norm_gradient_affine(
		at::Tensor dout,
		at::Tensor mean,
		at::Tensor invvar,
		at::Tensor input,
		#ifdef VERSION_GE_1_1
		at::IntArrayRef normalized_shape,
		#else
		at::IntList normalized_shape,
		#endif
		at::Tensor gamma,
		at::Tensor beta,
		double epsilon) {
		CHECK_INPUT(dout);
		CHECK_INPUT(mean);
		CHECK_INPUT(invvar);
		CHECK_INPUT(input);
		CHECK_INPUT(gamma);
		CHECK_INPUT(beta);
		int n1,n2;
		check_args(input,normalized_shape,gamma,beta,n1,n2);
		at::Tensor grad_input = at::empty_like(input);
		at::Tensor grad_gamma = at::empty_like(gamma);
		at::Tensor grad_beta = at::empty_like(beta);
		cuda_layer_norm_gradient(&dout,&mean,&invvar,&input,n1,n2,
		normalized_shape,&gamma,&beta,epsilon,
		&grad_input,&grad_gamma,&grad_beta);
		return {grad_input, grad_gamma, grad_beta};
		}

		PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
		m.def("forward_affine", &layer_norm_affine, "LayerNorm forward (CUDA)");
		m.def("forward", &layer_norm, "LayerNorm forward (CUDA)");
		m.def("backward_affine", &layer_norm_gradient_affine, "LayerNorm backward (CUDA)");
		m.def("backward", &layer_norm_gradient, "LayerNorm backward (CUDA)");
		}

megatron/fused_kernels/layer_norm_cuda_kernel.cu

0 → 100644

+829 −0

File added.

Preview size limit exceeded, changes collapsed.