Commit 75bd9b54 authored by Jared Casper

Merge branch 'megatron_sampler' into 'main'

Simplified sampler (will be needed later for batch size increase) and removed deprecated data stuff

See merge request ADLR/megatron-lm!177
parents ea81d62f fac6718a
+14 −6
@@ -72,6 +72,9 @@ def parse_args(extra_args_provider=None, defaults={},
        print('using {} for parameters ...'.format(args.params_dtype),
              flush=True)

    # Consumed samples.
    args.consumed_train_samples = 0
    args.consumed_valid_samples = 0

    # Set input defaults.
    for key in defaults:
@@ -133,14 +136,16 @@ def parse_args(extra_args_provider=None, defaults={},
def _print_args(args):
    """Print arguments."""
    if args.rank == 0:
        print('-------------------- arguments --------------------', flush=True)
        print('------------------------ arguments ------------------------',
              flush=True)
        str_list = []
        for arg in vars(args):
            dots = '.' * (32 - len(arg))
            dots = '.' * (48 - len(arg))
            str_list.append('  {} {} {}'.format(arg, dots, getattr(args, arg)))
        for arg in sorted(str_list, key=lambda x: x.lower()):
            print(arg, flush=True)
        print('---------------- end of arguments ----------------', flush=True)
        print('-------------------- end of arguments ---------------------',
              flush=True)


def _check_arg_is_not_none(args, arg):
@@ -275,7 +280,7 @@ def _add_learning_rate_args(parser):
                       'and initial warmup, the learning rate at each '
                       'iteration would be different.')
    group.add_argument('--lr-decay-style', type=str, default='linear',
                       choices=['constant', 'linear', 'cosine', 'exponential'],
                       choices=['constant', 'linear', 'cosine'],
                       help='Learning rate decay function.')
    group.add_argument('--lr-decay-iters', type=int, default=None,
                       help='number of iterations to decay learning rate over,'
@@ -397,8 +402,11 @@ def _add_validation_args(parser):
def _add_data_args(parser):
    group = parser.add_argument_group(title='data and dataloader')

    group.add_argument('--data-path', type=str, default=None,
                       help='Path to combined dataset to split.')
    group.add_argument('--data-path', nargs='*', default=None,
                       help='Path to the training dataset. Accepted format:'
                       '1) a single data path, 2) multiple datasets in the'
                       'form: dataset1-weight dataset1-path dataset2-weight '
                       'dataset2-path ...')
    group.add_argument('--split', type=str, default='969, 30, 1',
                       help='Comma-separated list of proportions for training,'
                       ' validation, and test split. For example the split '
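
The new --data-path flag accepts either a single prefix or an interleaved weight/path list. A minimal, hypothetical argparse sketch (the dataset prefixes below are invented) of how nargs='*' leaves those tokens as a flat list of strings for later parsing:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--data-path', nargs='*', default=None)

# Single dataset: the value is a one-element list.
single = parser.parse_args(['--data-path', 'my-corpus_text_sentence'])
print(single.data_path)  # ['my-corpus_text_sentence']

# Weighted blend: weight/path pairs stay as a flat list of strings,
# to be split into weights and prefixes further downstream.
blend = parser.parse_args(['--data-path',
                           '0.3', 'wiki_text_sentence',
                           '0.7', 'books_text_sentence'])
print(blend.data_path)  # ['0.3', 'wiki_text_sentence', '0.7', 'books_text_sentence']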
+4 −1
@@ -214,11 +214,14 @@ def load_checkpoint(model, optimizer, lr_scheduler, load_arg='load'):
                                 checkpoint_name))
                sys.exit()


    # Check arguments.
    assert args.consumed_train_samples == 0
    assert args.consumed_valid_samples == 0
    if 'args' in state_dict:
        checkpoint_args = state_dict['args']
        check_checkpoint_args(checkpoint_args)
        args.consumed_train_samples = getattr(checkpoint_args,
                                              'consumed_train_samples', 0)
        args.consumed_valid_samples = getattr(checkpoint_args,
                                              'consumed_valid_samples', 0)
    else:
        print_rank_0('could not find arguments in the checkpoint ...')
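
The getattr calls above guard against older checkpoints that predate the consumed-samples counters. A tiny self-contained sketch of that fallback behaviour (the checkpoint contents here are invented, not the actual Megatron checkpoint format):

import argparse

# A newer checkpoint carries the counters, an older one does not.
new_ckpt_args = argparse.Namespace(consumed_train_samples=12800,
                                   consumed_valid_samples=640)
old_ckpt_args = argparse.Namespace()

# getattr with a default handles both cases uniformly.
print(getattr(new_ckpt_args, 'consumed_train_samples', 0))  # 12800
print(getattr(old_ckpt_args, 'consumed_train_samples', 0))  # 0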

+75 −0
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Blendable dataset."""

import time

import numpy as np
import torch

from megatron import print_rank_0
from megatron import mpu


class BlendableDataset(torch.utils.data.Dataset):


    def __init__(self, datasets, weights):

        self.datasets = datasets
        num_datasets = len(datasets)
        assert num_datasets == len(weights)

        self.size = 0
        for dataset in self.datasets:
            self.size += len(dataset)

        # Normalize weights.
        weights = np.array(weights, dtype=np.float64)
        sum_weights = np.sum(weights)
        assert sum_weights > 0.0
        weights /= sum_weights

        # Build indices.
        start_time = time.time()
        assert num_datasets < 255
        self.dataset_index = np.zeros(self.size, dtype=np.uint8)
        self.dataset_sample_index = np.zeros(self.size, dtype=np.int64)

        if torch.distributed.get_rank() == 0:
            from megatron.data.dataset_utils import compile_helper
            compile_helper()
        # Simple barrier
        tmp = torch.cuda.LongTensor([1])
        torch.distributed.all_reduce(tmp, group=mpu.get_data_parallel_group())

        from megatron.data import helpers
        helpers.build_blending_indices(self.dataset_index,
                                       self.dataset_sample_index,
                                       weights, num_datasets, self.size,
                                       torch.distributed.get_rank() == 0)
        print_rank_0('> elapsed time for building blendable dataset indices: '
                     '{:.2f} (sec)'.format(time.time() - start_time))


    def __len__(self):
        return self.size


    def __getitem__(self, idx):
        dataset_idx = self.dataset_index[idx]
        sample_idx = self.dataset_sample_index[idx]
        return self.datasets[dataset_idx][sample_idx]
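
The index construction is delegated to the compiled helpers.build_blending_indices, whose C++ source is not part of this diff. Below is one plausible pure-Python reading of the greedy blending it performs (at each position, pick the dataset that is currently most under-represented relative to its weight); this is an illustration under that assumption, not the actual helper:

import numpy as np

def build_blending_indices_py(weights, size):
    """Greedy blend sketch (assumed behaviour, for illustration only)."""
    weights = np.asarray(weights, dtype=np.float64)
    dataset_index = np.zeros(size, dtype=np.uint8)
    dataset_sample_index = np.zeros(size, dtype=np.int64)
    current = np.zeros(len(weights), dtype=np.int64)
    for i in range(size):
        # How far each dataset lags behind its target share so far.
        errors = weights * i - current
        d = int(np.argmax(errors))
        dataset_index[i] = d                  # which dataset to draw from
        dataset_sample_index[i] = current[d]  # index within that dataset
        current[d] += 1
    return dataset_index, dataset_sample_index

idx, sample_idx = build_blending_indices_py([0.3, 0.7], size=10)
print(idx.tolist())  # [0, 1, 1, 1, 0, 1, 1, 0, 1, 1] -> 3 from dataset 0, 7 from dataset 1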
+95 −0
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Dataloaders."""


import torch

from megatron import get_args
from megatron import mpu


def build_pretraining_data_loader(dataset, consumed_samples):
    """Buld dataloader given an input dataset."""

    if dataset is None:
        return None
    args = get_args()

    world_size = mpu.get_data_parallel_world_size()
    global_batch_size = args.batch_size * world_size

    # Megatron sampler
    batch_sampler = MegatronPretrainingSampler(
        total_samples=len(dataset),
        consumed_samples=consumed_samples,
        global_batch_size=global_batch_size,
        rank=mpu.get_data_parallel_rank(),
        world_size=world_size)

    # Torch dataloader.
    return torch.utils.data.DataLoader(dataset,
                                       batch_sampler=batch_sampler,
                                       num_workers=args.num_workers,
                                       pin_memory=True)


class MegatronPretrainingSampler:


    def __init__(self, total_samples, consumed_samples,
                 global_batch_size, rank, world_size):
        # Keep a copy of input params for later use.
        self.total_samples = total_samples
        self.consumed_samples = consumed_samples
        self.global_batch_size = global_batch_size
        self.rank = rank

        # Sanity checks.
        assert self.total_samples > 0, \
            'no sample to consume: {}'.format(self.total_samples)
        assert self.consumed_samples < self.total_samples, \
            'no samples left to consume: {}, {}'.format(self.consumed_samples,
                                                        self.total_samples)
        assert self.global_batch_size > 0, \
            'Unexpected global batch size: {}'.format(self.global_batch_size)
        assert world_size > 0,\
            'non zero world size is expected: {}'.format(world_size)
        assert self.rank < world_size,\
            'rank should be smaller than world size: {}, {}'.format(
                self.rank, world_size)

        # Batch size per rank.
        assert self.global_batch_size % world_size == 0,\
            'global batch size must be divisible by world size: {}, {}'.format(
                self.global_batch_size, world_size)
        self.batch_size_per_rank = self.global_batch_size // world_size


    def __len__(self):
        return self.total_samples


    def __iter__(self):
        batch = []
        # Last batch if not complete will be dropped.
        for idx in range(self.consumed_samples, self.total_samples):
            batch.append(idx)
            if len(batch) == self.global_batch_size:
                start_idx = self.rank * self.batch_size_per_rank
                end_idx = start_idx + self.batch_size_per_rank
                yield batch[start_idx:end_idx]
                batch = []
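
A small usage sketch of the sampler (numbers invented; assumes the MegatronPretrainingSampler class above is in scope): two data-parallel ranks share a global batch of 4 and resume after 4 samples have already been consumed:

# 12 samples total, global batch size 4, two data-parallel ranks,
# resuming after 4 consumed samples from a previous run.
common = dict(total_samples=12, consumed_samples=4,
              global_batch_size=4, world_size=2)
rank0 = MegatronPretrainingSampler(rank=0, **common)
rank1 = MegatronPretrainingSampler(rank=1, **common)

print(list(rank0))  # [[4, 5], [8, 9]]   -> first half of each global batch
print(list(rank1))  # [[6, 7], [10, 11]] -> second half of each global batch

Because __iter__ starts at consumed_samples, restarting with the checkpointed counter picks up exactly where the previous run stopped; any incomplete final global batch is dropped.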
+74 −0
@@ -18,11 +18,13 @@
#   https://github.com/google-research/albert/blob/master/create_pretraining_data.py
# with some modifications.

import math
import time
import collections

import numpy as np
from megatron import get_args, print_rank_0
from megatron.data.blendable_dataset import BlendableDataset
from megatron.data.indexed_dataset import make_dataset as make_indexed_dataset

DSET_TYPE_STD = 'standard_bert'
@@ -31,6 +33,38 @@ DSET_TYPE_ICT = 'ict'
DSET_TYPES = [DSET_TYPE_ICT, DSET_TYPE_STD]


def get_datasets_weights_and_num_samples(data_prefix,
                                         train_valid_test_num_samples):

    # The data prefix should be in the format of:
    #   weight-1, data-prefix-1, weight-2, data-prefix-2, ..
    assert len(data_prefix) % 2 == 0
    num_datasets = len(data_prefix) // 2
    weights = [0]*num_datasets
    prefixes = [0]*num_datasets
    for i in range(num_datasets):
        weights[i] = float(data_prefix[2*i])
        prefixes[i] = (data_prefix[2*i+1]).strip()
    # Normalize weights
    weight_sum = 0.0
    for weight in weights:
        weight_sum += weight
    assert weight_sum > 0.0
    weights = [weight / weight_sum for weight in weights]

    # Add 0.5% (the 1.005 factor) so in case the blending dataset does
    # not uniformly distribute the number of samples, we still have
    # samples left to feed to the network.
    datasets_train_valid_test_num_samples = []
    for weight in weights:
        datasets_train_valid_test_num_samples.append(
            [int(math.ceil(val * weight * 1.005))
             for val in train_valid_test_num_samples])


    return prefixes, weights, datasets_train_valid_test_num_samples


def compile_helper():
    """Compile helper function ar runtime. Make sure this
    is invoked on a single process."""
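
A worked example of the parsing and 0.5% oversampling above (the prefixes are invented; the module path is an assumption, or the function defined in the hunk above can be called directly):

from megatron.data.dataset_utils import get_datasets_weights_and_num_samples

data_prefix = ['0.3', 'wiki_text_sentence', '0.7', 'books_text_sentence']
prefixes, weights, nums = get_datasets_weights_and_num_samples(
    data_prefix, train_valid_test_num_samples=[1000, 100, 10])

print(prefixes)  # ['wiki_text_sentence', 'books_text_sentence']
print(weights)   # [0.3, 0.7]
print(nums)      # [[302, 31, 4], [704, 71, 8]]  i.e. ceil(n * weight * 1.005)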
@@ -360,6 +394,46 @@ def build_train_valid_test_datasets(data_prefix, data_impl, splits_string,
                                    short_seq_prob, seed, skip_warmup,
                                    dataset_type='standard_bert'):

    if len(data_prefix) == 1:
        return _build_train_valid_test_datasets(data_prefix[0],
                                                data_impl, splits_string,
                                                train_valid_test_num_samples,
                                                max_seq_length, masked_lm_prob,
                                                short_seq_prob, seed,
                                                skip_warmup,
                                                dataset_type=dataset_type)
    # Blending dataset.
    # Parse the values.
    output = get_datasets_weights_and_num_samples(data_prefix,
                                                  train_valid_test_num_samples)
    prefixes, weights, datasets_train_valid_test_num_samples = output

    # Build individual datasets.
    train_datasets = []
    valid_datasets = []
    test_datasets = []
    for i in range(len(prefixes)):
        train_ds, valid_ds, test_ds = _build_train_valid_test_datasets(
            prefixes[i], data_impl, splits_string,
            datasets_train_valid_test_num_samples[i],
            max_seq_length, masked_lm_prob, short_seq_prob,
            seed, skip_warmup, dataset_type=dataset_type)
        if train_ds:
            train_datasets.append(train_ds)
        if valid_ds:
            valid_datasets.append(valid_ds)
        if test_ds:
            test_datasets.append(test_ds)

    # Blend.
    blending_train_dataset = BlendableDataset(train_datasets, weights)
    blending_valid_dataset = BlendableDataset(valid_datasets, weights)
    blending_test_dataset = BlendableDataset(test_datasets, weights)

    return (blending_train_dataset, blending_valid_dataset,
            blending_test_dataset)


def _build_train_valid_test_datasets(data_prefix, data_impl, splits_string,
                                     train_valid_test_num_samples,
                                     max_seq_length, masked_lm_prob,
                                     short_seq_prob, seed, skip_warmup,
                                     dataset_type='standard_bert'):
    
    if dataset_type not in DSET_TYPES:
        raise ValueError("Invalid dataset_type: ", dataset_type)
