moved module to model and removed fp16 (b84d7a90) · Commits · candle / Megatron-LM

megatron/fp16/init.py

deleted100644 → 0

+0 −30

Original line number	Diff line number	Diff line
		# coding=utf-8
		# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
		#
		# Licensed under the Apache License, Version 2.0 (the "License");
		# you may not use this file except in compliance with the License.
		# You may obtain a copy of the License at
		#
		# http://www.apache.org/licenses/LICENSE-2.0
		#
		# Unless required by applicable law or agreed to in writing, software
		# distributed under the License is distributed on an "AS IS" BASIS,
		# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
		# See the License for the specific language governing permissions and
		# limitations under the License.
		from .fp16util import (
		BN_convert_float,
		network_to_half,
		prep_param_lists,
		model_grads_to_master_grads,
		master_params_to_model_params,
		tofp16,
		to_python_float,
		clip_grad_norm,
		convert_module,
		convert_network,
		FP16Model,
		)

		from .fp16 import *
		from .loss_scaler import *

megatron/fp16/fp16.py

deleted100755 → 0

+0 −659

File deleted.

Preview size limit exceeded, changes collapsed.

megatron/fp16/fp16util.py

deleted100644 → 0

+0 −216

Original line number	Diff line number	Diff line
		# coding=utf-8
		# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
		#
		# Licensed under the Apache License, Version 2.0 (the "License");
		# you may not use this file except in compliance with the License.
		# You may obtain a copy of the License at
		#
		# http://www.apache.org/licenses/LICENSE-2.0
		#
		# Unless required by applicable law or agreed to in writing, software
		# distributed under the License is distributed on an "AS IS" BASIS,
		# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
		# See the License for the specific language governing permissions and
		# limitations under the License.

		import torch
		import torch.nn as nn
		from torch.autograd import Variable
		from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors

		from apex.multi_tensor_apply import multi_tensor_applier
		import amp_C

		from megatron import mpu


		class tofp16(nn.Module):
		    """
		    Thin module wrapper around ``Tensor.half()``.

		    Calling the module on a tensor returns ``input.half()``, which lets the
		    cast be placed inside a module container such as ``nn.Sequential``.
		    """

		    def __init__(self):
		        super().__init__()

		    def forward(self, input):
		        # The only work done here is the half-precision cast.
		        half_input = input.half()
		        return half_input


		def BN_convert_float(module):
		    """
		    Helper for network_to_half(); kept for legacy callers.

		    Walks ``module`` and all of its descendants and calls ``.float()`` on
		    every affine batchnorm module found. Returns the same ``module`` object.
		    """
		    batchnorm_base = torch.nn.modules.batchnorm._BatchNorm
		    is_affine_batchnorm = isinstance(module, batchnorm_base) and module.affine is True
		    if is_affine_batchnorm:
		        module.float()
		    # Recurse so nested batchnorm layers are converted as well.
		    for submodule in module.children():
		        BN_convert_float(submodule)
		    return module


		def network_to_half(network):
		    """
		    Build a half-precision version of ``network`` that keeps batchnorm safe.

		    The network is cast with ``.half()``, affine batchnorm modules are cast
		    back with BN_convert_float(), and the result is placed behind a
		    ``tofp16`` input cast inside an ``nn.Sequential``.

		    Kept for legacy callers; FP16Model is the recommended alternative.
		    """
		    input_cast = tofp16()
		    half_network = network.half()
		    bn_safe_network = BN_convert_float(half_network)
		    return nn.Sequential(input_cast, bn_safe_network)


		def convert_module(module, dtype):
		    """
		    Cast the floating-point parameters, parameter gradients and buffers that
		    belong directly to ``module`` (no recursion into children) to ``dtype``.

		    Non-floating-point tensors are left untouched.
		    """
		    for param in module.parameters(recurse=False):
		        if param is None:
		            continue
		        if param.data.dtype.is_floating_point:
		            param.data = param.data.to(dtype=dtype)
		        # An existing gradient is converted too, independently of the data check.
		        has_float_grad = param._grad is not None and param._grad.data.dtype.is_floating_point
		        if has_float_grad:
		            param._grad.data = param._grad.data.to(dtype=dtype)

		    for buf in module.buffers(recurse=False):
		        if buf is None:
		            continue
		        if buf.data.dtype.is_floating_point:
		            buf.data = buf.data.to(dtype=dtype)


		def convert_network(network, dtype):
		    """
		    Cast the parameters and buffers of every module in ``network`` to
		    ``dtype``, skipping affine batchnorm modules. Returns ``network``.
		    """
		    batchnorm_base = torch.nn.modules.batchnorm._BatchNorm
		    for submodule in network.modules():
		        is_affine_batchnorm = isinstance(submodule, batchnorm_base) and submodule.affine is True
		        if not is_affine_batchnorm:
		            convert_module(submodule, dtype)
		    return network


		class FP16Model(nn.Module):
		    """
		    Wrap a network so it runs in half precision in a batchnorm-safe way.

		    The wrapped network is converted with convert_network() at construction
		    time, and every positional input is cast with ``.half()`` on each call.
		    """

		    def __init__(self, network):
		        super().__init__()
		        self.network = convert_network(network, dtype=torch.half)

		    def forward(self, *inputs):
		        half_inputs = tuple(tensor.half() for tensor in inputs)
		        return self.network(*half_inputs)


		def backwards_debug_hook(grad):
		    """Debugging hook: unconditionally raise RuntimeError when invoked."""
		    message = "master_params recieved a gradient in the backward pass!"
		    raise RuntimeError(message)


		def prep_param_lists(model, flat_master=False):
		    """
		    Creates a list of FP32 master parameters for a given model, as in
		    `Training Neural Networks with Mixed Precision: Real Examples`_.

		    Args:
		        model (torch.nn.Module): Existing Pytorch model
		        flat_master (bool, optional, default=False): Flatten the master parameters into a single tensor, as a performance optimization.
		    Returns:
		        A tuple (``model_params``, ``master_params``). ``model_params`` is a list of the model's parameters (those with ``requires_grad=True``) for later use with :func:`model_grads_to_master_grads` and :func:`master_params_to_model_params`. ``master_params`` is a list of FP32 master parameters. If ``flat_master=True``, ``master_params`` will be a list with one element.
		    Raises:
		        Exception: whatever the flattening step raises is re-raised after a hint is printed.

		    Example::

		        model_params, master_params = prep_param_lists(model)

		    .. warning::
		        Currently, if ``flat_master=True``, all the model's parameters must be the same type. If the model has parameters of different types, use ``flat_master=False``, or use :class:`FP16_Optimizer`.

		    .. _`Training Neural Networks with Mixed Precision: Real Examples`:
		        http://on-demand.gputechconf.com/gtc/2018/video/S81012/
		    """
		    model_params = [param for param in model.parameters() if param.requires_grad]

		    if not flat_master:
		        # One independent FP32 copy per trainable parameter.
		        master_params = [param.clone().float().detach() for param in model_params]
		        for param in master_params:
		            param.requires_grad = True
		        return model_params, master_params

		    # Give the user some more useful error messages
		    try:
		        # flatten_dense_tensors returns a contiguous flat array.
		        # http://pytorch.org/docs/master/_modules/torch/_utils.html
		        master_params = _flatten_dense_tensors([param.data for param in model_params]).float()
		    except Exception:
		        # Only ordinary errors get the hint; KeyboardInterrupt / SystemExit
		        # propagate without a misleading message.
		        print("Error in prep_param_lists: model may contain a mixture of parameters "
		              "of different types. Use flat_master=False, or use FP16_Optimizer.")
		        raise
		    master_params = torch.nn.Parameter(master_params)
		    master_params.requires_grad = True
		    # master_params.register_hook(backwards_debug_hook)
		    if master_params.grad is None:
		        # Allocate a gradient buffer of matching size; its contents are not
		        # initialized here.
		        master_params.grad = master_params.new(*master_params.size())
		    return model_params, [master_params]


		def model_grads_to_master_grads(model_params, master_params, flat_master=False):
		    """
		    Copy model gradients to master gradients.

		    Args:
		        model_params: List of model parameters created by :func:`prep_param_lists`.
		        master_params: List of FP32 master parameters created by :func:`prep_param_lists`. If ``master_params`` was created with ``flat_master=True``, ``flat_master=True`` should also be supplied to :func:`model_grads_to_master_grads`.
		    """
		    if flat_master:
		        flat_master_grad = master_params[0].grad.data
		        # The flattening may incur one more deep copy than is necessary.
		        flat_model_grads = _flatten_dense_tensors([p.grad.data for p in model_params])
		        flat_master_grad.copy_(flat_model_grads)
		        return

		    # Make each master's grad mirror its model counterpart: None when the
		    # model has no grad, otherwise an allocated buffer of the master's size.
		    for model, master in zip(model_params, master_params):
		        if model.grad is None:
		            master.grad = None
		        elif master.grad is None:
		            master.grad = Variable(master.data.new(*master.data.size()))

		    source_grads = [p.grad for p in model_params if p.grad is not None]
		    target_grads = [p.grad for p in master_params if p.grad is not None]
		    _overflow_buf = torch.cuda.IntTensor([0])
		    # Scale factor 1.0: the multi-tensor scale call is used here with the
		    # model grads as input list and the master grads as output list.
		    multi_tensor_applier(amp_C.multi_tensor_scale,
		                         _overflow_buf,
		                         [source_grads, target_grads],
		                         1.0)


		def master_params_to_model_params(model_params, master_params, flat_master=False):
		    """
		    Copy master parameters to model parameters.

		    Args:
		        model_params: List of model parameters created by :func:`prep_param_lists`.
		        master_params: List of FP32 master parameters created by :func:`prep_param_lists`. If ``master_params`` was created with ``flat_master=True``, ``flat_master=True`` should also be supplied to :func:`master_params_to_model_params`.
		    """
		    if flat_master:
		        # Split the single flat master tensor back into per-parameter views.
		        sources = _unflatten_dense_tensors(master_params[0].data, model_params)
		    else:
		        sources = (master.data for master in master_params)

		    for model, source in zip(model_params, sources):
		        model.data.copy_(source)

		# Backward compatibility fixes


		def to_python_float(t):
		    """Return ``t.item()`` when ``t`` has an ``item`` attribute, else ``t[0]``."""
		    return t.item() if hasattr(t, 'item') else t[0]


		# Major and minor components of the installed torch version string.
		TORCH_MAJOR, TORCH_MINOR = (int(part) for part in torch.__version__.split('.')[:2])

		# clip_grad_norm is left unset here; the alternatives that were once
		# selected (including mpu.clip_grad_norm) remain below, commented out.
		clip_grad_norm = None  # mpu.clip_grad_norm
		# elif TORCH_MAJOR == 0 and TORCH_MINOR <= 4:
		#     clip_grad_norm = torch.nn.utils.clip_grad_norm
		# else:
		#     clip_grad_norm = torch.nn.utils.clip_grad_norm_

megatron/fp16/loss_scaler.py

deleted100755 → 0

+0 −272

Original line number	Diff line number	Diff line
		# coding=utf-8
		# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
		#
		# Licensed under the Apache License, Version 2.0 (the "License");
		# you may not use this file except in compliance with the License.
		# You may obtain a copy of the License at
		#
		# http://www.apache.org/licenses/LICENSE-2.0
		#
		# Unless required by applicable law or agreed to in writing, software
		# distributed under the License is distributed on an "AS IS" BASIS,
		# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
		# See the License for the specific language governing permissions and
		# limitations under the License.

		import torch

		from apex.multi_tensor_apply import multi_tensor_applier
		import amp_C

		from megatron import mpu

		# item() is a recent addition, so this helps with backward compatibility.


		def to_python_float(t):
		    """Return ``t.item()`` when ``t`` has an ``item`` attribute, else ``t[0]``."""
		    return t.item() if hasattr(t, 'item') else t[0]


		class LossScaler:
		    """
		    Class that manages a static loss scale. This class is intended to interact with
		    :class:`FP16_Optimizer`, and should not be directly manipulated by the user.

		    Use of :class:`LossScaler` is enabled via the ``static_loss_scale`` argument to
		    :class:`FP16_Optimizer`'s constructor.

		    Args:
		        scale (float, optional, default=1): The loss scale.
		    """

		    def __init__(self, scale=1):
		        self.cur_scale = scale

		    # `params` is a list / generator of torch.Variable
		    def has_overflow(self, params):
		        """Always return False; no overflow check is performed for a static scale."""
		        return False

		    # `x` is a torch.Tensor
		    @staticmethod
		    def _has_inf_or_nan(x):
		        """Always return False.

		        Declared as a staticmethod because it takes no ``self``; this keeps
		        ``LossScaler._has_inf_or_nan(x)`` working and also makes the call
		        valid on an instance.
		        """
		        return False

		    def update_scale(self, overflow):
		        """No-op: the static scale never changes."""
		        pass

		    @property
		    def loss_scale(self):
		        """The current loss scale (``self.cur_scale``)."""
		        return self.cur_scale

		    def scale_gradient(self, module, grad_in, grad_out):
		        """Apply ``amp_C.multi_tensor_scale`` to ``grad_in`` with the loss scale.

		        ``grad_in`` is passed as both tensor lists of the multi-tensor call,
		        and is returned afterwards. ``module`` and ``grad_out`` are unused.
		        NOTE(review): the signature looks like a module backward hook and the
		        call presumably scales ``grad_in`` in place -- confirm against apex.
		        """
		        _overflow_buf = torch.cuda.IntTensor([0])
		        multi_tensor_applier(amp_C.multi_tensor_scale,
		                             _overflow_buf,
		                             [grad_in, grad_in],
		                             self.loss_scale)
		        return grad_in

		    def backward(self, output_tensor, retain_graph=False, output_tensor_grad=None):
		        """Run ``torch.autograd.backward`` on ``output_tensor``.

		        Args:
		            output_tensor: tensor to differentiate.
		            retain_graph: forwarded to ``torch.autograd.backward``.
		            output_tensor_grad: forwarded as ``grad_tensors``; when None the
		                output tensor is multiplied by the loss scale first.
		        """
		        # If output_tensor_grad is None, this is the last stage, and
		        # output_tensor is actually the loss and needs to be scaled.
		        # Otherwise, output_tensor does not need to be scaled again since
		        # output_tensor_grad is already scaled.
		        if output_tensor_grad is None:
		            scaled_output_tensor = output_tensor * self.loss_scale
		        else:
		            scaled_output_tensor = output_tensor
		        torch.autograd.backward(scaled_output_tensor, grad_tensors=output_tensor_grad,
		                                retain_graph=retain_graph)


		class DynamicLossScaler:
		    """
		    Class that manages dynamic loss scaling. It is recommended to use :class:`DynamicLossScaler`
		    indirectly, by supplying ``dynamic_loss_scale=True`` to the constructor of
		    :class:`FP16_Optimizer`. However, it's important to understand how :class:`DynamicLossScaler`
		    operates, because the default options can be changed using the
		    ``dynamic_loss_args`` argument to :class:`FP16_Optimizer`'s constructor.

		    Loss scaling is designed to combat the problem of underflowing gradients encountered at long
		    times when training fp16 networks. Dynamic loss scaling begins by attempting a very high loss
		    scale. Ironically, this may result in OVERflowing gradients. If overflowing gradients are
		    encountered, :class:`DynamicLossScaler` informs :class:`FP16_Optimizer` that an overflow has
		    occurred.
		    :class:`FP16_Optimizer` then skips the update step for this particular iteration/minibatch,
		    and :class:`DynamicLossScaler` adjusts the loss scale to a lower value.
		    If a certain number of iterations occur without overflowing gradients detected,
		    :class:`DynamicLossScaler` increases the loss scale once more.
		    In this way :class:`DynamicLossScaler` attempts to "ride the edge" of
		    always using the highest loss scale possible without incurring overflow.

		    Args:
		        init_scale (float, optional, default=2**32): Initial loss scale attempted by :class:`DynamicLossScaler.`
		        scale_factor (float, optional, default=2.0): Factor used when adjusting the loss scale. If an overflow is encountered, the loss scale is readjusted to loss scale/``scale_factor``. If ``scale_window`` consecutive iterations take place without an overflow, the loss scale is readjusted to loss_scale*``scale_factor``.
		        scale_window (int, optional, default=1000): Number of consecutive iterations without an overflow to wait before increasing the loss scale.
		        min_scale (optional, default=1): Lower bound applied when the scale is reduced.
		        delayed_shift (int, optional, default=1): Initial value of the hysteresis counter; with a value other than 1, an overflow decrements the counter instead of reducing the scale until the counter reaches 1.
		        consecutive_hysteresis (bool, optional, default=False): If True, the hysteresis counter is reset on every iteration without overflow; otherwise only when the scale is increased.
		    """

		    def __init__(self,
		                 init_scale=2**32,
		                 scale_factor=2.,
		                 scale_window=1000,
		                 min_scale=1,
		                 delayed_shift=1,
		                 consecutive_hysteresis=False):
		        self.cur_scale = init_scale
		        self.cur_iter = 0
		        self.last_overflow_iter = -1
		        self.scale_factor = scale_factor
		        self.scale_window = scale_window
		        self.min_scale = min_scale
		        self.delayed_shift = delayed_shift
		        self.cur_hysteresis = delayed_shift
		        self.consecutive_hysteresis = consecutive_hysteresis

		    # `params` is a list / generator of torch.Variable
		    def has_overflow_serial(self, params):
		        """Return True if any parameter's existing gradient contains inf or NaN."""
		        for p in params:
		            if p.grad is not None and DynamicLossScaler._has_inf_or_nan(p.grad.data):
		                return True

		        return False

		    def has_overflow(self, params):
		        """Check local gradients for overflow and combine the result across the
		        model parallel group with a MAX all-reduce. Returns a bool."""
		        overflow = self.has_overflow_serial(params)
		        # Since each model parallel GPU carries only part of the model,
		        # make sure overflow flag is synced across all the model parallel GPUs
		        overflow_gpu = torch.cuda.ByteTensor([overflow])
		        torch.distributed.all_reduce(overflow_gpu,
		                                     op=torch.distributed.ReduceOp.MAX,
		                                     group=mpu.get_model_parallel_group())
		        overflow = overflow_gpu[0].item()
		        return bool(overflow)

		    # `x` is a torch.Tensor
		    @staticmethod
		    def _has_inf_or_nan(x):
		        """Return True if the float sum of ``x`` is +inf, -inf or NaN.

		        Declared as a staticmethod because it takes no ``self``; the existing
		        ``DynamicLossScaler._has_inf_or_nan(x)`` call style keeps working and
		        the call is now also valid on an instance.
		        """
		        try:
		            # if x is half, the .float() incurs an additional deep copy, but it's necessary if
		            # Pytorch's .sum() creates a one-element tensor of the same type as x
		            # (which is true for some recent version of pytorch).
		            cpu_sum = float(x.float().sum())
		            # More efficient version that can be used if .sum() returns a Python scalar
		            # cpu_sum = float(x.sum())
		        except RuntimeError as instance:
		            # We want to check if inst is actually an overflow exception.
		            # RuntimeError could come from a different error.
		            # If so, we still want the exception to propagate.
		            if "value cannot be converted" not in instance.args[0]:
		                raise
		            return True
		        else:
		            # cpu_sum != cpu_sum is the NaN test.
		            if cpu_sum == float('inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum:
		                return True
		            return False

		    # `overflow` is boolean indicating whether the gradient overflowed
		    def update_scale(self, overflow):
		        """Adjust the loss scale after one iteration and advance ``cur_iter``.

		        On overflow the scale is divided by ``scale_factor`` (not below
		        ``min_scale``) unless hysteresis is still being consumed. Without
		        overflow the scale is multiplied by ``scale_factor`` whenever the
		        number of iterations since the last overflow is a multiple of
		        ``scale_window``.
		        """
		        # Fill in attributes that may be absent on this instance.
		        # NOTE(review): presumably for objects restored from older versions
		        # of this class -- confirm. The fallback for consecutive_hysteresis
		        # (True) differs from the constructor default (False).
		        if not hasattr(self, 'min_scale'):
		            self.min_scale = 1
		        if not hasattr(self, 'delayed_shift'):
		            self.delayed_shift = 1
		        if not hasattr(self, 'cur_hysteresis'):
		            self.cur_hysteresis = 1
		        if not hasattr(self, 'consecutive_hysteresis'):
		            self.consecutive_hysteresis = True
		        if overflow:
		            # self.cur_scale /= self.scale_factor
		            if self.delayed_shift == 1 or self.cur_hysteresis == 1:
		                self.cur_scale = max(self.cur_scale / self.scale_factor, self.min_scale)
		            else:
		                self.cur_hysteresis -= 1
		            self.last_overflow_iter = self.cur_iter
		        else:
		            if self.consecutive_hysteresis:
		                self.cur_hysteresis = self.delayed_shift
		            if (self.cur_iter - self.last_overflow_iter) % self.scale_window == 0:
		                if not self.consecutive_hysteresis:
		                    self.cur_hysteresis = self.delayed_shift
		                self.cur_scale *= self.scale_factor
		        self.cur_iter += 1

		    @property
		    def loss_scale(self):
		        """The current loss scale (``self.cur_scale``)."""
		        return self.cur_scale

		    def scale_gradient(self, module, grad_in, grad_out):
		        """Apply ``amp_C.multi_tensor_scale`` to ``grad_in`` with the loss scale.

		        ``grad_in`` is passed as both tensor lists of the multi-tensor call,
		        and is returned afterwards. ``module`` and ``grad_out`` are unused.
		        NOTE(review): the signature looks like a module backward hook and the
		        call presumably scales ``grad_in`` in place -- confirm against apex.
		        """
		        _overflow_buf = torch.cuda.IntTensor([0])
		        multi_tensor_applier(amp_C.multi_tensor_scale,
		                             _overflow_buf,
		                             [grad_in, grad_in],
		                             self.loss_scale)
		        return grad_in

		    def backward(self, output_tensor, retain_graph=False, output_tensor_grad=None):
		        """Run ``torch.autograd.backward`` on ``output_tensor``.

		        Args:
		            output_tensor: tensor to differentiate.
		            retain_graph: forwarded to ``torch.autograd.backward``.
		            output_tensor_grad: forwarded as ``grad_tensors``; when None the
		                output tensor is multiplied by the loss scale first.
		        """
		        # If output_tensor_grad is None, this is the last stage, and
		        # output_tensor is actually the loss and needs to be scaled.
		        # Otherwise, output_tensor does not need to be scaled again since
		        # output_tensor_grad is already scaled.
		        if output_tensor_grad is None:
		            scaled_output_tensor = output_tensor * self.loss_scale
		        else:
		            scaled_output_tensor = output_tensor
		        torch.autograd.backward(scaled_output_tensor, grad_tensors=output_tensor_grad,
		                                retain_graph=retain_graph)


		##############################################################
		# Example usage below here -- assuming it's in a separate file
		##############################################################
		"""
		TO-DO separate out into an example.
		if __name__ == "__main__":
		import torch
		from torch.autograd import Variable
		from dynamic_loss_scaler import DynamicLossScaler

		# N is batch size; D_in is input dimension;
		# H is hidden dimension; D_out is output dimension.
		N, D_in, H, D_out = 64, 1000, 100, 10

		# Create random Tensors to hold inputs and outputs, and wrap them in Variables.
		x = Variable(torch.randn(N, D_in), requires_grad=False)
		y = Variable(torch.randn(N, D_out), requires_grad=False)

		w1 = Variable(torch.randn(D_in, H), requires_grad=True)
		w2 = Variable(torch.randn(H, D_out), requires_grad=True)
		parameters = [w1, w2]

		learning_rate = 1e-6
		optimizer = torch.optim.SGD(parameters, lr=learning_rate)
		loss_scaler = DynamicLossScaler()

		for t in range(500):
		y_pred = x.mm(w1).clamp(min=0).mm(w2)
		loss = (y_pred - y).pow(2).sum() * loss_scaler.loss_scale
		print('Iter {} loss scale: {}'.format(t, loss_scaler.loss_scale))
		print('Iter {} scaled loss: {}'.format(t, loss.data[0]))
		print('Iter {} unscaled loss: {}'.format(t, loss.data[0] / loss_scaler.loss_scale))

		# Run backprop
		optimizer.zero_grad()
		loss.backward()

		# Check for overflow
		has_overflow = loss_scaler.has_overflow(parameters)

		# If no overflow, unscale grad and update as usual
		if not has_overflow:
		for param in parameters:
		param.grad.data.mul_(1. / loss_scaler.loss_scale)
		optimizer.step()
		# Otherwise, don't do anything -- ie, skip iteration
		else:
		print('OVERFLOW!')

		# Update loss scale for next iteration
		loss_scaler.update_scale(has_overflow)

		"""

megatron/model/init.py

100755 → 100644

+10 −3

Original line number	Diff line number	Diff line
		@@ -30,9 +30,16 @@ def import_layernorm(fp32_residual_connection):


		from .distributed import *
		from .bert_model import BertModel, BertModelFirstStage, BertModelIntermediateStage, BertModelLastStage
		from .realm_model import ICTBertModel
		from .gpt2_model import GPT2Model, GPT2ModelFirstStage, GPT2ModelIntermediateStage, GPT2ModelLastStage
		from .bert_model import (BertModel,
		BertModelFirstStage,
		BertModelIntermediateStage,
		BertModelLastStage)
		from .gpt2_model import (GPT2Model,
		GPT2ModelFirstStage,
		GPT2ModelIntermediateStage,
		GPT2ModelLastStage)
		from .language_model import get_language_model
		from .module import FP16Module
		from .realm_model import ICTBertModel