Commit 1c1a55da authored Apr 09, 2020 by Mohammad

Addressed Jared's comments

parent 898fcb94

megatron/data/gpt2_dataset.py

+1 −1

Original line number	Diff line number	Diff line
		@@ -253,7 +253,7 @@ def _build_sample_idx(sizes, doc_idx, seq_length,
		num_epochs, tokens_per_epoch):
		"""Sample index mapping is a 2D array with sizes
		[number-of-samples + 1, 2] where [..., 0] contains
		- the index into `doc_idx` and [..., 0] is the
		+ the index into `doc_idx` and [..., 1] is the
		starting offset in that document."""

		# Total number of samples. For -1 see comments in `_num_epochs`.

megatron/data/helpers.cpp

+4 −2

Original line number	Diff line number	Diff line
		@@ -38,8 +38,10 @@ py::array build_sample_idx(const py::array_t<int32_t>& sizes_,
		const int32_t seq_length,
		const int32_t num_epochs,
		const int64_t tokens_per_epoch) {
		- /* Sample index mapping is a 2D array with sizes [number-of-samples + 1, 2]
		- where [..., 0] contains the index into `doc_idx` and [..., 0] is the
		+ /* Sample index (sample_idx) is used for gpt2 like dataset for which
		+ the documents are flattened and the samples are built based on this
		+ 1-D flatten array. It is a 2D array with sizes [number-of-samples + 1, 2]
		+ where [..., 0] contains the index into `doc_idx` and [..., 1] is the
		starting offset in that document.*/

		// Consistency checks.