Loading megatron/data/gpt2_dataset.py +1 −1 Original line number Diff line number Diff line Loading @@ -253,7 +253,7 @@ def _build_sample_idx(sizes, doc_idx, seq_length, num_epochs, tokens_per_epoch): """Sample index mapping is a 2D array with sizes [number-of-samples + 1, 2] where [..., 0] contains the index into `doc_idx` and [..., 0] is the the index into `doc_idx` and [..., 1] is the starting offset in that document.""" # Total number of samples. For -1 see comments in `_num_epochs`. Loading megatron/data/helpers.cpp +4 −2 Original line number Diff line number Diff line Loading @@ -38,8 +38,10 @@ py::array build_sample_idx(const py::array_t<int32_t>& sizes_, const int32_t seq_length, const int32_t num_epochs, const int64_t tokens_per_epoch) { /* Sample index mapping is a 2D array with sizes [number-of-samples + 1, 2] where [..., 0] contains the index into `doc_idx` and [..., 0] is the /* Sample index (sample_idx) is used for gpt2 like dataset for which the documents are flattened and the samples are built based on this 1-D flatten array. It is a 2D array with sizes [number-of-samples + 1, 2] where [..., 0] contains the index into `doc_idx` and [..., 1] is the starting offset in that document.*/ // Consistency checks. Loading Loading
megatron/data/gpt2_dataset.py +1 −1
@@ -253,7 +253,7 @@ def _build_sample_idx(sizes, doc_idx, seq_length,
                       num_epochs, tokens_per_epoch):
     """Sample index mapping is a 2D array with sizes
     [number-of-samples + 1, 2] where [..., 0] contains
-    the index into `doc_idx` and [..., 0] is the
+    the index into `doc_idx` and [..., 1] is the
     starting offset in that document."""
 
     # Total number of samples. For -1 see comments in `_num_epochs`.
megatron/data/helpers.cpp +4 −2
@@ -38,8 +38,10 @@ py::array build_sample_idx(const py::array_t<int32_t>& sizes_,
                            const int32_t seq_length,
                            const int32_t num_epochs,
                            const int64_t tokens_per_epoch) {
-    /* Sample index mapping is a 2D array with sizes [number-of-samples + 1, 2]
-       where [..., 0] contains the index into `doc_idx` and [..., 0] is the
+    /* Sample index (sample_idx) is used for gpt2 like dataset for which
+       the documents are flattened and the samples are built based on this
+       1-D flatten array. It is a 2D array with sizes [number-of-samples + 1, 2]
+       where [..., 0] contains the index into `doc_idx` and [..., 1] is the
        starting offset in that document.*/
 
     // Consistency checks.