simplified sampler (4311b695) · Commits · candle / Megatron-LM

megatron/data/sampler.py

0 → 100644

+64 −0

Original line number	Diff line number	Diff line
		# coding=utf-8
		# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
		#
		# Licensed under the Apache License, Version 2.0 (the "License");
		# you may not use this file except in compliance with the License.
		# You may obtain a copy of the License at
		#
		# http://www.apache.org/licenses/LICENSE-2.0
		#
		# Unless required by applicable law or agreed to in writing, software
		# distributed under the License is distributed on an "AS IS" BASIS,
		# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
		# See the License for the specific language governing permissions and
		# limitations under the License.

		"""Megatorn Sampler."""


		class MegatronPretrainingSampler:
		    """Sequential batch sampler that gives each rank its share of a global batch.

		    Sample indices run from ``consumed_samples`` up to ``total_samples``.
		    They are grouped into consecutive global batches of
		    ``global_batch_size`` indices, and every global batch is cut into
		    ``world_size`` equal contiguous pieces; iterating yields the piece
		    that belongs to ``rank``. A trailing incomplete global batch is dropped.
		    """

		    def __init__(self, total_samples, consumed_samples,
		                 global_batch_size, rank, world_size):
		        """Validate and store the sampling parameters.

		        Args:
		            total_samples: number of samples available; must be positive.
		            consumed_samples: index of the first sample to hand out; must
		                satisfy ``0 <= consumed_samples < total_samples``.
		            global_batch_size: samples per batch summed over all ranks;
		                must be positive and divisible by ``world_size``.
		            rank: index of this worker; must satisfy
		                ``0 <= rank < world_size``.
		            world_size: number of workers sharing a global batch.

		        Raises:
		            AssertionError: if any of the constraints above is violated.
		        """
		        # Keep a copy of input params for later use.
		        self.total_samples = total_samples
		        self.consumed_samples = consumed_samples
		        self.global_batch_size = global_batch_size
		        self.rank = rank

		        # Sanity checks.
		        assert self.total_samples > 0, \
		            'no sample to consume: {}'.format(self.total_samples)
		        # A negative start would make iteration emit negative indices.
		        assert self.consumed_samples >= 0, \
		            'consumed samples should be non-negative: {}'.format(
		                self.consumed_samples)
		        assert self.consumed_samples < self.total_samples, \
		            'no samples left to consume: {}, {}'.format(self.consumed_samples,
		                                                        self.total_samples)
		        assert self.global_batch_size > 0, \
		            'Unexpected global batch size: {}'.format(self.global_batch_size)
		        assert world_size > 0,\
		            'non zero world size is expected: {}'.format(world_size)
		        # A negative rank would otherwise slice an empty piece out of
		        # every batch without any error.
		        assert self.rank >= 0,\
		            'rank should be non-negative: {}'.format(self.rank)
		        assert self.rank < world_size,\
		            'rank should be smaller than world size: {}, {}'.format(
		                self.rank, world_size)

		        # Batch size per rank.
		        assert self.global_batch_size % world_size == 0,\
		            'global batch size must be divisible by world size: {}, {}'.format(
		                self.global_batch_size, world_size)
		        self.batch_size_per_rank = self.global_batch_size // world_size

		    def __len__(self):
		        """Return ``total_samples`` (a sample count, not a batch count)."""
		        return self.total_samples

		    def __iter__(self):
		        """Yield, per complete global batch, the list of indices for this rank."""
		        rank_offset = self.rank * self.batch_size_per_rank
		        # Last batch if not complete will be dropped: only batch starts
		        # that leave room for a full global batch are visited.
		        last_full_start = self.total_samples - self.global_batch_size
		        for batch_start in range(self.consumed_samples, last_full_start + 1,
		                                 self.global_batch_size):
		            first = batch_start + rank_offset
		            yield list(range(first, first + self.batch_size_per_rank))

megatron/data/samplers.py

deleted100644 → 0

+0 −148

Original line number	Diff line number	Diff line
		# coding=utf-8
		# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
		#
		# Licensed under the Apache License, Version 2.0 (the "License");
		# you may not use this file except in compliance with the License.
		# You may obtain a copy of the License at
		#
		# http://www.apache.org/licenses/LICENSE-2.0
		#
		# Unless required by applicable law or agreed to in writing, software
		# distributed under the License is distributed on an "AS IS" BASIS,
		# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
		# See the License for the specific language governing permissions and
		# limitations under the License.

		"""Batch samplers that work with either random or sequential data samplers."""

		import torch
		from torch.utils import data


		class RandomSampler(data.sampler.Sampler):
		    """Random index sampler whose shuffle can be seeded per epoch.

		    Modelled on pytorch's RandomSampler and DistributedSampler: once
		    ``set_epoch`` has stored a non-negative epoch, that value seeds the
		    generator used by ``__iter__``. Without replacement a permutation of
		    the dataset indices is produced; with replacement ``num_samples``
		    indices are drawn.

		    Arguments:
		        data_source (Dataset): dataset to sample from
		        num_samples (int): number of samples to draw, default=len(dataset)
		        replacement (bool): samples are drawn with replacement if ``True``,
		            default=False
		    """

		    def __init__(self, data_source, replacement=False, num_samples=None):
		        self.data_source = data_source
		        self.replacement = replacement
		        self._num_samples = num_samples
		        # Negative epoch means "do not seed the generator".
		        self.epoch = -1

		        if replacement is False and self._num_samples is not None:
		            raise ValueError("With replacement=False, num_samples should not "
		                             "be specified, since a random permute will be "
		                             "performed.")

		        count = self.num_samples
		        if not (isinstance(count, int) and count > 0):
		            raise ValueError("num_samples should be a positive integer "
		                             "value, but got num_samples={}".format(count))
		        if not isinstance(self.replacement, bool):
		            raise ValueError("replacement should be a boolean value, but got "
		                             "replacement={}".format(self.replacement))

		    @property
		    def num_samples(self):
		        """Explicit sample count if one was given, else the current dataset length."""
		        # dataset size might change at runtime
		        explicit = self._num_samples
		        return len(self.data_source) if explicit is None else explicit

		    def __iter__(self):
		        size = len(self.data_source)
		        rng = torch.Generator()
		        if self.epoch >= 0:
		            rng.manual_seed(self.epoch)
		        if not self.replacement:
		            indices = torch.randperm(size, generator=rng)
		        else:
		            indices = torch.randint(high=size, size=(self.num_samples,),
		                                    dtype=torch.int64, generator=rng)
		        return iter(indices.tolist())

		    def __len__(self):
		        return self.num_samples

		    def set_epoch(self, epoch):
		        """Store the epoch that seeds the next ``__iter__`` call."""
		        self.epoch = epoch


		class DistributedBatchSampler(data.sampler.BatchSampler):
		"""Similar to normal implementation of distributed sampler, except
		implementation is at the batch sampler level, instead of just the
		sampler level. This allows wrapping of arbitrary data samplers
		(sequential, random, WeightedRandomSampler, etc.) with this batch
		sampler.

		The `interleave` argument specifies how to distribute a batch. A value
		of True combined with the above random sampler is equivalent to pytorch's
		torch.utils.data.distributed.DistributedSampler.

		For the following batch [0,1,2,3,4,5,6,7] and data parallelism of 2
		specifying True will result in the following samples for each gpu:
		GPU0: [0,2,4,6] GPU1: [1,3,5,7]
		specifying False will result in the following samples:
		GPU0: [0,1,2,3] GPU1: [4,5,6,7]

		NOTE(review): indentation is lost in this view of the file, so the
		nesting of statements inside the methods below has to be inferred;
		confirm against the original source before relying on the comments
		that describe control flow."""

		def __init__(self, sampler, batch_size, drop_last, rank=-1,
		world_size=2, wrap_last=False, interleave=False):
		"""Store the distribution settings and initialise the wrap-around state.

		Args:
		sampler, batch_size, drop_last: passed on to BatchSampler.__init__.
		rank: index of this worker, used by _batch to pick its slice; the
		default of -1 trips the assertion below.
		world_size: number of workers each batch is divided among.
		wrap_last: when true, __iter__ adjusts the wrap_around counters
		around a trailing partial batch.
		interleave: chooses strided (True) or contiguous (False) slicing
		in _batch.
		"""
		super(DistributedBatchSampler, self).__init__(sampler, batch_size,
		drop_last)
		# A rank has to be supplied explicitly: the assert fires for -1, so the
		# get_rank() fallback after it is not reached while assertions are enabled.
		if rank == -1:
		assert False, 'should not be here'
		rank = torch.distributed.get_rank()
		self.rank = rank
		self.world_size = world_size
		# A wrap_around counter is also stored on the wrapped sampler object itself.
		self.sampler.wrap_around = 0
		self.wrap_around = 0
		self.wrap_last = wrap_last
		self.start_iter = 0
		self.interleave = interleave

		def __iter__(self):
		"""Yield, batch by batch, the slice of indices that _batch selects for this rank."""
		batch = []
		# i counts the full batches assembled so far.
		i = 0
		for idx in self.data_iterator(self.sampler, wrap_around=False):
		batch.append(idx)
		if len(batch) == self.batch_size:
		tbatch = self._batch(batch)
		# A full batch is yielded only when its counter has reached start_iter.
		# NOTE(review): presumably start_iter is reset right after the first
		# yielded batch and i/batch are updated for every full batch - confirm
		# the nesting against the original file.
		if i >= self.start_iter:
		yield tbatch
		self.start_iter = 0
		i += 1
		batch = []
		# Indices left over that did not fill a whole batch; they are emitted
		# only when drop_last is false.
		batch_len = len(batch)
		if batch_len > 0 and not self.drop_last:
		if self.wrap_last:
		self.sampler.wrap_around -= (self.batch_size)
		self.wrap_around += (len(batch))
		self.wrap_around %= self.batch_size
		yield self._batch(batch)
		if self.wrap_last:
		self.sampler.wrap_around += self.batch_size

		def data_iterator(self, _iter, wrap_around=False):
		"""iterates through data and handles wrap around"""
		for i, idx in enumerate(_iter):
		# Skip the first (wrap_around % batch_size) items of the underlying iterator.
		if i < self.wrap_around % self.batch_size:
		continue
		# The counter only advances when the caller asks for it; __iter__ in
		# this class passes wrap_around=False.
		if wrap_around:
		self.wrap_around += 1
		self.wrap_around %= self.batch_size
		yield idx

		def _batch(self, batch):
		"""extracts samples only pertaining to this worker's batch"""
		if self.interleave:
		# Strided pick: every world_size-th element starting at index rank.
		return batch[self.rank:self.batch_size:self.world_size]
		# Contiguous pick: the rank-th of world_size chunks of the batch.
		start = self.rank * self.batch_size // self.world_size
		end = (self.rank + 1) * self.batch_size // self.world_size
		return batch[start:end]

megatron/deprecated_data_utils/__init__.py

deleted100644 → 0

+0 −141

Original line number	Diff line number	Diff line
		# coding=utf-8
		# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
		#
		# Licensed under the Apache License, Version 2.0 (the "License");
		# you may not use this file except in compliance with the License.
		# You may obtain a copy of the License at
		#
		# http://www.apache.org/licenses/LICENSE-2.0
		#
		# Unless required by applicable law or agreed to in writing, software
		# distributed under the License is distributed on an "AS IS" BASIS,
		# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
		# See the License for the specific language governing permissions and
		# limitations under the License.
		"""utils for creating datasets"""
		import os
		import math

		import torch

		from .samplers import DistributedBatchSampler
		from .datasets import json_dataset, csv_dataset, split_ds, ConcatDataset, SplitDataset, bert_sentencepair_dataset, GPT2Dataset
		from .lazy_loader import exists_lazy, make_lazy, lazy_array_loader
		from .tokenization import Tokenization, CommandToken, Tokenizer, CharacterLevelTokenizer, BertWordPieceTokenizer, GPT2BPETokenizer, make_tokenizer
		from . import corpora

		# Presumably integer tags for the train / validation / test splits; no use
		# of them is visible in this view of the file - confirm before changing.
		TRAIN_DATA = 0
		VAL_DATA = 1
		TEST_DATA = 2


		def should_split(split):
		    """
		    Tell whether the split proportions spread data over more than one part.

		    The answer is False exactly when the largest proportion divided by
		    the sum of all proportions equals 1.
		    Examples:
		    >>> should_split([10,0,0])
		    False
		    >>> should_split([1,.1,.2])
		    True
		    """
		    largest = max(split)
		    total = sum(split)
		    dominant_share = largest / total
		    return dominant_share != 1.


		def get_ext(path):
		    """Return the extension of `path` (leading dot included, '' if there is none)."""
		    _stem, extension = os.path.splitext(path)
		    return extension


		def get_dataset(path, **kwargs):
		    """Build a dataset object for `path`, forwarding `kwargs` to its constructor.

		    A name registered in `corpora.NAMED_CORPORA` takes precedence; otherwise
		    the file extension selects the json or the csv/tsv dataset loader.

		    Raises:
		        NotImplementedError: if the extension matches none of the loaders.
		    """
		    if supported_corpus(path):
		        corpus_factory = corpora.NAMED_CORPORA[path]
		        return corpus_factory(**kwargs)

		    ext = get_ext(path)
		    # substring test, so any extension containing '.json' is accepted
		    if '.json' in ext:
		        return json_dataset(path, **kwargs)
		    if ext in ['.csv', '.tsv']:
		        return csv_dataset(path, **kwargs)
		    raise NotImplementedError('data file type %s is not supported' % (ext))


		def supported_corpus(corpus_name):
		    """Return True when `corpus_name` is a key of `corpora.NAMED_CORPORA`."""
		    registry = corpora.NAMED_CORPORA
		    return corpus_name in registry


		def make_dataset(path, seq_length, text_key, label_key, lazy=False, process_fn=None, split=[1.],
		delim=',', loose=False, binarize_sent=False, drop_unlabeled=False, tokenizer=None,
		tokenizer_type='CharacterLevelTokenizer', tokenizer_model_path=None, vocab_size=None,
		model_type='bpe', pad_token=0, character_converage=1.0, non_binary_cols=None,
		parallel_group=None, **kwargs):
		"""function to create datasets+tokenizers for common options

		`path` may be a single string or a collection of paths; several datasets
		are combined with ConcatDataset. A tokenizer is created with
		make_tokenizer only when none is passed in. The optional 'ds_type' and
		'presplit_sentences' entries of `kwargs` control how the result is wrapped.

		Returns a `(ds, tokenizer)` pair. When should_split(split) is true and
		'ds_type' selects a bert/gpt2 wrapper, `ds` is a list with one entry per
		split (None entries are preserved).

		NOTE(review): `split=[1.]` is a mutable default; it is only read in the
		code visible here, but split_ds is defined elsewhere - confirm it does
		not mutate its argument.
		NOTE(review): indentation is lost in this view of the file, so statement
		nesting below has to be inferred from the logic.
		"""
		# NOTE(review): eval() runs arbitrary code; this is only acceptable if a
		# string process_fn can never come from untrusted input.
		if isinstance(process_fn, str):
		process_fn = eval(process_fn)
		if non_binary_cols is not None:
		# multilabel dataset support (only for csvs)
		label_key = non_binary_cols

		# Loads the dataset for a single path; closes over the arguments above.
		def get_dataset_from_path(path_):
		if lazy:
		# get lazily loaded dataset
		named_corpora = False
		if supported_corpus(path_):
		named_corpora = True
		# keep the corpus name: path_ is replaced by the corpus' PATH attribute
		name = path_
		path_ = corpora.NAMED_CORPORA[path_].PATH
		if torch.distributed.get_rank() == 0 and not exists_lazy(path_, data_type='data'):
		# create cached version of dataset for lazy loading if it doesn't exist
		text = get_dataset(name if named_corpora else path_, text_key=text_key, label_key=label_key, binarize_sent=binarize_sent,
		delim=delim, drop_unlabeled=drop_unlabeled, loose_json=loose)
		make_lazy(path_, text.X, data_type='data')
		# This should be a barrier but nccl barrier assumes
		# device_index=rank which is not the case for model
		# parallel case
		# The all_reduce of a one-element tensor stands in for that barrier; the
		# assert checks the reduced count equals the group's world size.
		counts = torch.cuda.LongTensor([1])
		torch.distributed.all_reduce(counts, group=parallel_group)
		assert counts[0].item() == torch.distributed.get_world_size(
		group=parallel_group)

		text = lazy_array_loader(path_, data_type='data', map_fn=process_fn)
		else:
		# get dataset
		text = get_dataset(path_, text_key=text_key, label_key=label_key, binarize_sent=binarize_sent,
		delim=delim, drop_unlabeled=drop_unlabeled, loose_json=loose, preprocess_fn=process_fn)
		return text
		# get one or multiple datasets and concatenate
		if isinstance(path, str):
		path = [path]
		datasets = [get_dataset_from_path(p) for p in path]
		if len(datasets) == 1:
		ds = datasets[0]
		else:
		ds = ConcatDataset(datasets)
		# make tokenizer for dataset
		if tokenizer is None:
		tokenizer = make_tokenizer(tokenizer_type, ds, tokenizer_model_path, vocab_size, model_type,
		pad_token, character_converage, **kwargs)

		# ds_type stays '' (no wrapper applied) unless the caller supplied one in kwargs
		ds_type = ''
		if 'ds_type' in kwargs:
		ds_type = kwargs['ds_type']
		ds.SetTokenizer(tokenizer)
		# Split dataset into train/val/test (and wrap bert dataset)
		if should_split(split):
		ds = split_ds(ds, split)
		# wrap every non-None split individually
		if 'bert' in ds_type.lower():
		presplit_sentences = kwargs['presplit_sentences'] if 'presplit_sentences' in kwargs else False
		dstype = bert_sentencepair_dataset
		ds = [dstype(d, max_seq_len=seq_length, presplit_sentences=presplit_sentences)
		if d is not None else None for d in ds]
		elif ds_type.lower() == 'gpt2':
		ds = [GPT2Dataset(d, max_seq_len=seq_length) if d is not None else None for d in ds]
		else:
		# no split: wrap the single dataset with the same wrapper choice
		if 'bert' in ds_type.lower():
		presplit_sentences = kwargs['presplit_sentences'] if 'presplit_sentences' in kwargs else False
		dstype = bert_sentencepair_dataset
		ds = dstype(ds, max_seq_len=seq_length, presplit_sentences=presplit_sentences)
		elif ds_type.lower() == 'gpt2':
		ds = GPT2Dataset(ds, max_seq_len=seq_length)
		return ds, tokenizer

megatron/deprecated_data_utils/configure_data.py

deleted100644 → 0

+0 −252

Original line number	Diff line number	Diff line
		# coding=utf-8
		# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
		#
		# Licensed under the Apache License, Version 2.0 (the "License");
		# you may not use this file except in compliance with the License.
		# You may obtain a copy of the License at
		#
		# http://www.apache.org/licenses/LICENSE-2.0
		#
		# Unless required by applicable law or agreed to in writing, software
		# distributed under the License is distributed on an "AS IS" BASIS,
		# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
		# See the License for the specific language governing permissions and
		# limitations under the License.

		"""parses arguments and preps data loader"""

		import copy
		import torch

		from megatron import data_utils
		from megatron import mpu


		class DataConfig:
		    """Holds default option values and applies them to an args object."""

		    def __init__(self, defaults=None):
		        """Store the defaults mapping.

		        Args:
		            defaults: mapping of option name to default value. When it is
		                omitted, each instance gets its own empty dict, so
		                set_defaults on one instance cannot leak into another.
		                A mapping that is passed in is stored by reference.
		        """
		        super(DataConfig, self).__init__()
		        self.defaults = {} if defaults is None else defaults

		    def apply(self, args):
		        """Fill missing attributes on `args` from the defaults, then build the loaders.

		        Returns whatever make_loaders(args) returns.
		        """
		        if torch.distributed.get_rank() == 0:
		            print('configuring data')
		        self.apply_defaults(args)
		        return make_loaders(args)

		    def set_defaults(self, **kwargs):
		        """Add or overwrite default values, one per keyword argument."""
		        for k, v in kwargs.items():
		            self.defaults[k] = v

		    def apply_defaults(self, args):
		        """Set each default on `args` unless `args` already has that attribute.

		        Dashes in a default's name are turned into underscores first.
		        """
		        for k, v in self.defaults.items():
		            k = k.replace('-', '_')
		            if not hasattr(args, k):
		                setattr(args, k, v)


		def make_data_loader(dataset, batch_size, args):
		    """Wrap `dataset` in a torch DataLoader driven by a batch sampler.

		    With args.shuffle set, indices are drawn with replacement
		    (batch_size * args.train_iters of them); otherwise they are visited
		    sequentially. When the data-parallel group has more than one member,
		    a DistributedBatchSampler is used and incomplete batches are dropped.
		    """
		    if args.shuffle:
		        sampler = data_utils.samplers.RandomSampler(
		            dataset,
		            replacement=True,
		            num_samples=batch_size * args.train_iters)
		    else:
		        sampler = torch.utils.data.SequentialSampler(dataset)

		    world_size = torch.distributed.get_world_size(
		        group=mpu.get_data_parallel_group())
		    rank = torch.distributed.get_rank(group=mpu.get_data_parallel_group())

		    # drop_last simply mirrors whether we run data-parallel
		    is_distributed = world_size > 1
		    if is_distributed:
		        batch_sampler = data_utils.samplers.DistributedBatchSampler(
		            sampler, batch_size, is_distributed, rank, world_size)
		    else:
		        batch_sampler = torch.utils.data.BatchSampler(
		            sampler, batch_size, is_distributed)

		    return torch.utils.data.DataLoader(dataset,
		                                       batch_sampler=batch_sampler,
		                                       num_workers=args.num_workers,
		                                       pin_memory=True)


		def make_tfrecord_loaders(args):
		    """Load train/val/test dataset from shuffled TFRecords

		    Returns ((train, valid, test), tokenizer); valid and test are None
		    when args.valid_data / args.test_data are not given.
		    """

		    import data_utils.tf_dl

		    data_set_args = dict(
		        batch_size=args.batch_size,
		        max_seq_len=args.seq_length,
		        max_preds_per_seq=args.max_preds_per_seq,
		        train=True,
		        num_workers=max(args.num_workers, 1),
		        seed=args.seed + args.rank + 1,
		        threaded_dl=args.num_workers > 0,
		    )
		    loader_cls = data_utils.tf_dl.TFRecordDataLoader
		    train = loader_cls(args.train_data, **data_set_args)

		    # The evaluation loaders reuse the settings, with eval overrides applied.
		    data_set_args.update(train=False)
		    if args.eval_seq_length is not None:
		        data_set_args['max_seq_len'] = args.eval_seq_length
		    if args.eval_max_preds_per_seq is not None:
		        data_set_args['max_preds_per_seq'] = args.eval_max_preds_per_seq

		    valid = (loader_cls(args.valid_data, **data_set_args)
		             if args.valid_data is not None else None)
		    test = (loader_cls(args.test_data, **data_set_args)
		            if args.test_data is not None else None)

		    tokenizer = data_utils.make_tokenizer(args.tokenizer_type,
		                                          train,
		                                          args.tokenizer_path,
		                                          args.vocab_size,
		                                          args.tokenizer_model_type,
		                                          cache_dir=args.cache_dir)
		    loaders = (train, valid, test)
		    return loaders, tokenizer


		def make_loaders(args):
		    """makes training/val/test

		    Delegates to make_tfrecord_loaders when args.data_loader is
		    'tfrecords'. Otherwise datasets are built with data_utils.make_dataset
		    and wrapped by make_data_loader. Side effects on `args`: train_data is
		    overwritten by data_path when the latter is set, and do_train /
		    do_valid / do_test record which loaders were created.

		    Returns:
		        ((train, valid, test), tokenizer). A loader is None when its
		        dataset was not requested; tokenizer is None when no dataset at
		        all was built.
		    """

		    if args.data_loader == 'tfrecords':
		        return make_tfrecord_loaders(args)
		    world_size = torch.distributed.get_world_size(
		        group=mpu.get_data_parallel_group())
		    # Batch sizes given per rank are scaled up to the whole group here.
		    batch_size = args.batch_size * world_size
		    eval_batch_size = batch_size
		    if args.eval_batch_size is not None:
		        eval_batch_size = args.eval_batch_size * world_size
		    seq_length = args.seq_length
		    if seq_length < 0:
		        seq_length = seq_length * world_size
		    eval_seq_length = args.eval_seq_length
		    if eval_seq_length is not None and eval_seq_length < 0:
		        eval_seq_length = eval_seq_length * world_size
		    split = get_split(args)
		    if args.data_path is not None:
		        args.train_data = args.data_path
		    data_set_args = {
		        'path': args.train_data,
		        'seq_length': seq_length,
		        'lazy': args.data_loader == 'lazy',
		        'delim': args.delim,
		        'text_key': args.text_key,
		        'label_key': 'label',
		        'non_binary_cols': None,
		        'ds_type': args.data_set_type,
		        'split': split,
		        'loose': args.loose_json,
		        'tokenizer_type': args.tokenizer_type,
		        'tokenizer_model_path': args.tokenizer_path,
		        'vocab_size': args.vocab_size,
		        'model_type': args.tokenizer_model_type,
		        'cache_dir': args.cache_dir,
		        'max_preds_per_seq': args.max_preds_per_seq,
		        'presplit_sentences': args.presplit_sentences,
		        'parallel_group': mpu.get_data_parallel_group()}

		    eval_set_args = copy.copy(data_set_args)
		    eval_set_args['split'] = [1.]
		    # if optional eval args were set then replace their
		    # equivalent values in the arg dict
		    if eval_seq_length:
		        eval_set_args['seq_length'] = eval_seq_length
		    if args.eval_max_preds_per_seq:
		        eval_set_args['max_preds_per_seq'] = args.eval_max_preds_per_seq
		    if args.eval_text_key is not None:
		        eval_set_args['text_key'] = args.eval_text_key

		    # make datasets splits and tokenizer
		    train = None
		    valid = None
		    test = None
		    # Bound up front so the final return cannot raise UnboundLocalError
		    # when neither train, valid nor test data is configured.
		    tokenizer = None

		    if args.train_data is not None:
		        train, tokenizer = data_utils.make_dataset(**data_set_args)
		        if data_utils.should_split(split):
		            train, valid, test = train
		        # evaluation datasets reuse the tokenizer built for training
		        eval_set_args['tokenizer'] = tokenizer

		    # make training and val dataset if necessary
		    if valid is None and args.valid_data is not None:
		        eval_set_args['path'] = args.valid_data
		        valid, tokenizer = data_utils.make_dataset(**eval_set_args)
		        eval_set_args['tokenizer'] = tokenizer
		    if test is None and args.test_data is not None:
		        eval_set_args['path'] = args.test_data
		        test, tokenizer = data_utils.make_dataset(**eval_set_args)

		    # wrap datasets with data loader
		    if train is not None and args.batch_size > 0:
		        train = make_data_loader(train, batch_size, args)
		        args.do_train = True
		    else:
		        args.do_train = False
		    # an eval batch size of 0 falls back to the training batch size
		    eval_batch_size = eval_batch_size if eval_batch_size != 0 else batch_size
		    if valid is not None:
		        valid = make_data_loader(valid, eval_batch_size, args)
		        args.do_valid = True
		    else:
		        args.do_valid = False
		    if test is not None:
		        test = make_data_loader(test, eval_batch_size, args)
		        args.do_test = True
		    else:
		        args.do_test = False

		    return (train, valid, test), tokenizer


		def get_split(args):
		    """
		    Turn args.split into three normalized split fractions.

		    args.split is a string of numbers separated by ',' or '/', or a
		    single number. If the numbers sum to less than 1 the remainder is
		    appended as an extra entry; the list is then padded with zeros or
		    truncated to three entries. The second and third entries are zeroed
		    when args.valid_data / args.test_data are given, and the result is
		    divided by its sum.
		    """
		    spec = args.split
		    if ',' in spec:
		        fields = spec.split(',')
		    elif '/' in spec:
		        fields = spec.split('/')
		    else:
		        fields = [spec]
		    splits = [float(field) for field in fields]

		    total = sum(splits)
		    if total < 1.:
		        splits.append(1 - total)
		    # pad up to three entries, then cut anything beyond the third
		    splits.extend([0.] * (3 - len(splits)))
		    splits = splits[:3]

		    if args.valid_data is not None:
		        splits[1] = 0.
		    if args.test_data is not None:
		        splits[2] = 0.

		    norm = sum(splits)
		    return [part / norm for part in splits]


		def configure_data():
		    """add cmdline flags for configuring datasets

		    Returns a DataConfig pre-loaded with the default option values below.
		    """
		    # These are options that are used by data_utils, but are either
		    # deprecated or not meant to be exposed to the command line user.
		    # These options are intended to be set in code by specific scripts.
		    hidden_defaults = dict(
		        world_size=1,
		        rank=-1,
		        persist_state=0,
		        lazy=False,
		        transpose=False,
		        data_set_type='supervised',
		        seq_length=256,
		        eval_seq_length=256,
		        samples_per_shard=100,
		    )
		    return DataConfig(defaults=hidden_defaults)

megatron/deprecated_data_utils/corpora.py

deleted100755 → 0

+0 −61

File deleted.

Preview size limit exceeded, changes collapsed.