megatron/data/albert_dataset.py  +27 −10

@@ -12,6 +12,7 @@
 from .dataset_utils import build_training_sample
 from . import helpers
 from megatron.data import FullBertTokenizer, indexed_dataset
+from megatron.utils import print_rank_0


 class AlbertDataset(Dataset):

@@ -31,11 +32,19 @@ class AlbertDataset(Dataset):
         # Build the samples mapping.
         if not num_epochs:
             if not max_num_samples:
-                raise ValueError("Need to specify either max_num_samples or num_epochs")
-            num_epochs = int(max_num_samples / len(indexed_dataset)) + 1
+                raise ValueError("Need to specify either max_num_samples "
+                                 "or num_epochs")
+            num_epochs = np.iinfo(np.int32).max - 1
         if not max_num_samples:
-            max_num_samples = len(indexed_dataset) * num_epochs
-        print(f"Building the sample map for {num_epochs} epochs or {max_num_samples} samples.")
+            max_num_samples = np.iinfo(np.int64).max - 1
+
+        # Make sure the types match the helpers input types.
+        assert indexed_dataset.doc_idx.dtype == np.int64
+        assert indexed_dataset.sizes.dtype == np.int32
+
+        # Build samples mapping.
+        verbose = torch.distributed.get_rank() == 0
+        start_time = time.time()
         self.samples_mapping = helpers.build_mapping(
             indexed_dataset.doc_idx,
             indexed_dataset.sizes,

@@ -43,7 +52,14 @@ class AlbertDataset(Dataset):
             max_num_samples,
             self.max_seq_length - 3,  # account for added tokens
             short_seq_prob,
-            self.seed)
+            self.seed,
+            verbose)
+
+        # Make sure all the ranks have built the mapping.
+        torch.distributed.barrier()
+        print_rank_0('> elapsed time to build samples mapping (seconds): '
+                     '{:.2f}'.format(time.time() - start_time))
+        exit()

         # Vocab stuff.
         self.vocab_id_list = list(tokenizer.inv_vocab.keys())

@@ -59,11 +75,12 @@ class AlbertDataset(Dataset):
                    num_epochs, max_num_samples, masked_lm_prob,
                    max_seq_length, short_seq_prob, seed, skip_warmup=False):
         tokenizer = FullBertTokenizer(vocab, do_lower_case=True)
-        print("> Reading dataset index")
-        idx_ds = indexed_dataset.make_dataset(data_prefix, data_impl, skip_warmup)
-        print("> Finished creating indexed dataset")
-        return cls(idx_ds, tokenizer, num_epochs, max_num_samples, masked_lm_prob,
-                   max_seq_length, short_seq_prob, seed)
+        print_rank_0("> Reading dataset index ...")
+        idx_ds = indexed_dataset.make_dataset(data_prefix, data_impl,
+                                              skip_warmup)
+        print_rank_0("> Finished creating indexed dataset")
+        return cls(idx_ds, tokenizer, num_epochs, max_num_samples,
+                   masked_lm_prob, max_seq_length, short_seq_prob, seed)

     def num_tokens(self):
         return self.tokenizer.vocab_size()
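Note: the new default logic is worth spelling out. Whichever of num_epochs / max_num_samples is left unset is replaced by a near-maximal sentinel, so the C++ sample-map builder simply stops when the other bound is hit. A minimal sketch of that fallback (resolve_epochs_and_samples is a hypothetical name for illustration, not part of the PR):

    import numpy as np

    def resolve_epochs_and_samples(num_epochs, max_num_samples):
        # Hypothetical helper mirroring the fallback added above: an
        # unset bound becomes effectively infinite, so the mapping
        # build terminates on whichever bound was actually given.
        if not num_epochs:
            if not max_num_samples:
                raise ValueError("Need to specify either max_num_samples "
                                 "or num_epochs")
            num_epochs = np.iinfo(np.int32).max - 1    # ~2.1e9 epochs
        if not max_num_samples:
            max_num_samples = np.iinfo(np.int64).max - 1
        return num_epochs, max_num_samples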
megatron/data/helpers.cpp  +157 −97

 /* Helper methods for fast index mapping builds */

 #include <algorithm>
 #include <iostream>

@@ -6,46 +7,61 @@
 #include <stdexcept>
 #include <pybind11/pybind11.h>
 #include <pybind11/numpy.h>
+#include <random>

 namespace py = pybind11;
 using namespace std;

-inline uint32_t get_sample_len(const int short_seq_ratio,
-                               const uint32_t max_length) {
+inline int32_t get_target_sample_len(const int32_t short_seq_ratio,
+                                     const int32_t max_length,
+                                     std::mt19937& rand32_gen) {
     /* Training sample length. */
-    const auto random_number = rand();
+    const auto random_number = rand32_gen();
     if ((random_number % short_seq_ratio) == 0) {
         return 2 + random_number % (max_length - 1);
     }
     return max_length;
 }

 template<typename DocIdx>
-py::array build_mapping_impl(const py::array_t<uint32_t>& docs_,
-                             const py::array_t<uint16_t>& sizes_,
-                             const int num_epochs,
+py::array build_mapping_impl(const py::array_t<int64_t>& docs_,
+                             const py::array_t<int32_t>& sizes_,
+                             const int32_t num_epochs,
                              const uint64_t max_num_samples,
-                             const int max_seq_length,
+                             const int32_t max_seq_length,
                              const double short_seq_prob,
-                             const int seed) {
+                             const int32_t seed,
+                             const bool verbose) {
     /* Build a mapping of (start-index, end-index, sequence-length)
        where start and end index are the indices of the sentences in
        the sample and sequence-length is the target sequence length.
     */

-    cout << "> building dataset mapping for " << docs_.shape(0) - 1
-         << " documents with " << sizes_.shape(0) << " sentences ..."
-         << std::flush << endl;
+    if (verbose) {
+        cout << " > using " << docs_.shape(0) - 1 << " documents with "
+             << sizes_.shape(0) << " sentences ..." << endl << std::flush;
+    }

     // Consistency checks.
     assert(num_epochs > 0);
     assert(max_seq_length > 1);
     assert(short_seq_prob > 0.0);
     assert(short_seq_prob <= 1.0);
     assert(seed > 0);

-    // For efficiency, convert probability to ratio.
-    const auto short_seq_ratio = static_cast<int>(round(1.0 / short_seq_prob));
+    // For efficiency, convert probability to ratio. Note: rand() generates int.
+    const auto short_seq_ratio = static_cast<int32_t>(round(1.0 / short_seq_prob));

     // Remove bound checks.
     auto docs = docs_.unchecked<1>();
     auto sizes = sizes_.unchecked<1>();

     // Check for consistency.
     if (docs[docs.shape(0) - 1] != sizes.shape(0)) {
         cout << "document values is not consistent with length of sizes: "
              << docs[docs.shape(0) - 1] << " != " << sizes.shape(0) << endl;
-        throw(-1);
+        throw std::length_error("docs and sizes");
     }

     // Mapping and its length (1D).
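Note: get_target_sample_len keeps the full max_length most of the time and, with probability roughly short_seq_prob (i.e. 1/short_seq_ratio), draws a short target in [2, max_length]. A Python rendering of the same draw (a sketch only, not bit-compatible with the std::mt19937 stream consumed by the C++ code):

    import random

    def get_target_sample_len(short_seq_ratio, max_length, rng):
        # One 32-bit draw decides both whether the sample is "short"
        # and, if so, its length in [2, max_length].
        r = rng.getrandbits(32)
        if r % short_seq_ratio == 0:
            return 2 + r % (max_length - 1)
        return max_length

    rng = random.Random(1234)
    lengths = [get_target_sample_len(10, 509, rng) for _ in range(10000)]
    # roughly 10% of draws land below 509, matching short_seq_prob = 0.1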
@@ -55,36 +71,39 @@ py::array build_mapping_impl(const py::array_t<uint32_t>& docs_,
     // Perform two iterations, in the first iteration get the size
     // and allocate memory and in the second iteration populate the map.
     bool second = false;
-    for (int iteration=0; iteration < 2; ++iteration) {
+    for (int32_t iteration=0; iteration<2; ++iteration) {

         // Set the seed so both iterations produce the same results.
-        srand(seed);
+        std::mt19937 rand32_gen(seed);

         // Set the flag on second iteration.
-        second = iteration == 1;
+        second = (iteration == 1);

         // Counters:
-        uint32_t empty_docs = 0;
-        uint32_t one_sent_docs = 0;
+        uint64_t empty_docs = 0;
+        uint64_t one_sent_docs = 0;

         // Current map index.
         uint64_t map_index = 0;

         // For each epoch:
-        for (int epoch=0; epoch < num_epochs; ++epoch) {
-            if (map_index >= max_num_samples && !second) {
-                cout << " > reached " << max_num_samples << " samples after "
-                     << epoch << " epochs ..." << std::flush << endl;
+        for (int32_t epoch=0; epoch<num_epochs; ++epoch) {
+            if (map_index >= max_num_samples) {
+                if (verbose && (!second)) {
+                    cout << " > reached " << max_num_samples
+                         << " samples after " << epoch << " epochs ..."
+                         << endl << std::flush;
+                }
                 break;
             }
             // For each document:
-            for (int doc=0; doc < (docs.shape(0) - 1); ++doc) {
+            for (int32_t doc=0; doc<(docs.shape(0) - 1); ++doc) {

                 // Document sentences are in [sent_index_first, sent_index_last)
                 const auto sent_index_first = docs[doc];
                 const auto sent_index_last = docs[doc + 1];

-                // At the beginning of the document previous index is the start index.
+                // At the beginning of the document previous index is the
+                // start index.
                 auto prev_start_index = sent_index_first;

                 // Remaining documents.

@@ -93,13 +112,10 @@ py::array build_mapping_impl(const py::array_t<uint32_t>& docs_,
                 // Some bookkeeping
                 if ((epoch == 0) && (!second)) {
                     if (num_remain_sent == 0) {
-                        cout << "***WARNING*** document " << doc << " is empty" << endl;
-                        empty_docs += 1;
+                        ++empty_docs;
                     }
                     if (num_remain_sent == 1) {
-                        // cout << "***WARNING*** document " << doc <<
-                        //     " has one sentence" << endl;
-                        one_sent_docs += 1;
+                        ++one_sent_docs;
                     }
                 }

@@ -107,69 +123,85 @@ py::array build_mapping_impl(const py::array_t<uint32_t>& docs_,
                 if (num_remain_sent > 1) {

                     // Set values.
-                    auto size = uint32_t{0};
-                    auto num_sent = uint32_t{0};
-                    auto seq_len = get_sample_len(short_seq_ratio, max_seq_length);
+                    auto seq_len = int32_t{0};
+                    auto num_sent = int32_t{0};
+                    auto target_seq_len = get_target_sample_len(short_seq_ratio,
+                                                                max_seq_length,
+                                                                rand32_gen);

                     // Loop through sentences.
                     for (auto sent_index=sent_index_first;
                          sent_index < sent_index_last; ++sent_index) {

                         // Add the size and number of sentences.
-                        size += sizes[sent_index];
-                        num_sent += 1;
-                        num_remain_sent -= 1;
+                        seq_len += sizes[sent_index];
+                        ++num_sent;
+                        --num_remain_sent;

                         // If we have reached the target length,
                         // and if not only one sentence is left in the document,
                         // and if we have at least two sentences,
                         // or if we have reached the end of the document:
-                        if (((size >= seq_len) && (num_remain_sent > 1) &&
+                        if (((seq_len >= target_seq_len) && (num_remain_sent > 1) &&
                              (num_sent > 1) ) || (num_remain_sent == 0)) {

+                            // Check for overflow.
+                            if ((3 * map_index + 2) >
+                                std::numeric_limits<int64_t>::max()) {
+                                cout << "number of samples exceeded maximum "
+                                     << "allowed by type int64: "
+                                     << std::numeric_limits<int64_t>::max()
+                                     << endl;
+                                throw std::overflow_error("Number of samples");
+                            }

                             // Populate the map.
                             if (second) {
                                 const auto map_index_0 = 3 * map_index;
-                                maps[map_index_0] = prev_start_index;
-                                maps[map_index_0 + 1] = sent_index + 1;
-                                maps[map_index_0 + 2] = seq_len;
+                                maps[map_index_0] = static_cast<DocIdx>(prev_start_index);
+                                maps[map_index_0 + 1] = static_cast<DocIdx>(sent_index + 1);
+                                maps[map_index_0 + 2] = static_cast<DocIdx>(target_seq_len);
                             }

                             // Update indices / counters.
-                            // check for overflow
-                            if (map_index == std::numeric_limits<DocIdx>::max()) {
-                                cout << "number of samples exceeded maximum allowed by type: "
-                                     << std::numeric_limits<DocIdx>::max() << endl;
-                                throw std::overflow_error("Number of samples");
-                            }
-                            map_index += 1;
+                            ++map_index;
                             prev_start_index = sent_index + 1;
-                            seq_len = get_sample_len(short_seq_ratio, max_seq_length);
-                            size = 0;
+                            target_seq_len = get_target_sample_len(short_seq_ratio,
+                                                                   max_seq_length,
+                                                                   rand32_gen);
+                            seq_len = 0;
                             num_sent = 0;
                         }
                     } // for (auto sent_index=sent_index_first; ...
                 } // if (num_remain_sent > 1) {
             } // for (int doc=0; doc < num_docs; ++doc) {
         } // for (int epoch=0; epoch < num_epochs; ++epoch) {

         if (!second) {
-            cout << "   number of samples: " << map_index << endl;
-            cout << "   number of empty documents: " << empty_docs << endl;
-            cout << "   number of documents with one sentence: "
-                 << one_sent_docs << endl;
+            if (verbose) {
+                cout << " > number of empty documents: " << empty_docs
+                     << endl << std::flush;
+                cout << " > number of documents with one sentence: "
+                     << one_sent_docs << endl << std::flush;
+                cout << " > will create mapping for " << map_index
+                     << " samples" << endl << std::flush;
+            }
             assert(maps == NULL);
             assert(num_samples < 0);
             maps = new DocIdx[3*map_index];
-            num_samples = map_index;
+            num_samples = static_cast<int64_t>(map_index);
         }

     } // for (int iteration=0; iteration < 2; ++iteration) {

     // Shuffle.
+    // We need a 64 bit random number generator as we might have more
+    // than 2 billion samples.
+    std::mt19937_64 rand64_gen(seed + 1);
     for (auto i=(num_samples - 1); i > 0; --i) {
-        const auto j = rand() % (i + 1);
+        const auto j = static_cast<int64_t>(rand64_gen() % (i + 1));
         const auto i0 = 3 * i;
         const auto j0 = 3 * j;
         // Swap values.

@@ -178,38 +210,66 @@ py::array build_mapping_impl(const py::array_t<uint32_t>& docs_,
         swap(maps[i0 + 2], maps[j0 + 2]);
     }

+    if (verbose) {
+        cout << "> done building the mapping." << endl;
+    }

     // Method to deallocate memory.
     py::capsule free_when_done(maps, [](void *mem_) {
             DocIdx *mem = reinterpret_cast<DocIdx*>(mem_);
             cout << "freeing memory for the dataset mapping" << endl;
             delete[] mem;
         });

     // Return the numpy array.
+    const auto byte_size = sizeof(DocIdx);
     return py::array(std::vector<int64_t>{num_samples, 3}, // shape
-                     {3*4, 4}, // C-style contiguous strides
+                     {3*byte_size, byte_size}, // C-style contiguous strides
                      maps, // the data pointer
                      free_when_done); // numpy array references
 }

-py::array build_mapping(const py::array& docs_,
-                        const py::array& sizes_,
+py::array build_mapping(const py::array_t<int64_t>& docs_,
+                        const py::array_t<int>& sizes_,
                         const int num_epochs,
                         const uint64_t max_num_samples,
                         const int max_seq_length,
                         const double short_seq_prob,
-                        const int seed) {
+                        const int seed,
+                        const bool verbose) {

+    if (verbose) {
+        cout << "> building sample map using: " << endl << std::flush;
+        cout << "    number of epochs: " << num_epochs << endl << std::flush;
+        cout << "    maximum number of samples: " << max_num_samples
+             << endl << std::flush;
+        cout << "    maximum sequence length: " << max_seq_length
+             << endl << std::flush;
+        cout << "    short sequence probability: " << short_seq_prob
+             << endl << std::flush;
+        cout << "    seed: " << seed << endl << std::flush;
+    }
     if (sizes_.size() > std::numeric_limits<uint32_t>::max()) {
+        if (verbose) {
+            cout << " > using uint64 for data mapping..."
+                 << endl << std::flush;
+        }
         return build_mapping_impl<uint64_t>(docs_, sizes_, num_epochs,
                                             max_num_samples, max_seq_length,
-                                            short_seq_prob, seed);
+                                            short_seq_prob, seed, verbose);
     } else {
+        if (verbose) {
+            cout << " > using uint32 for data mapping..."
+                 << endl << std::flush;
+        }
         return build_mapping_impl<uint32_t>(docs_, sizes_, num_epochs,
                                             max_num_samples, max_seq_length,
-                                            short_seq_prob, seed);
+                                            short_seq_prob, seed, verbose);
     }
 }

 PYBIND11_MODULE(helpers, m) {
     m.def("build_mapping", &build_mapping);
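Note: the closing shuffle is a plain seeded Fisher-Yates pass over the rows of the (num_samples, 3) map; the 64-bit generator matters because num_samples can exceed 2^32. In Python terms it is roughly the following (a sketch only; random.Random will not reproduce the std::mt19937_64 stream, and shuffle_samples_mapping is a hypothetical name):

    import random
    import numpy as np

    def shuffle_samples_mapping(maps, seed):
        # maps: (num_samples, 3) rows of
        # [start_sentence, end_sentence, target_seq_len].
        rng = random.Random(seed + 1)
        for i in range(maps.shape[0] - 1, 0, -1):
            j = rng.randint(0, i)        # j in [0, i], as in rand64_gen() % (i + 1)
            maps[[i, j]] = maps[[j, i]]  # swap whole rows
        return maps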
megatron/data/indexed_dataset.py  +10 −9

@@ -18,6 +18,7 @@
 from itertools import accumulate

 import numpy as np
 import torch
+from megatron.utils import print_rank_0


 def __best_fitting_dtype(vocab_size=None):
     if vocab_size is not None and vocab_size < 65500:

@@ -317,7 +318,7 @@ class IndexedDatasetBuilder(object):
 def _warmup_mmap_file(path):
     with open(path, 'rb') as stream:
-        while stream.read(1 * 1024 * 1024):
+        while stream.read(100 * 1024 * 1024):
             pass

@@ -390,17 +391,17 @@ class MMapIndexedDataset(torch.utils.data.Dataset):
                 offset = stream.tell()

             if not skip_warmup:
-                print("> Warming up index mmap file...")
+                print_rank_0("> Warming up index mmap file...")
                 _warmup_mmap_file(path)

             self._bin_buffer_mmap = np.memmap(path, mode='r', order='C')
             self._bin_buffer = memoryview(self._bin_buffer_mmap)
-            print("> Reading sizes...")
+            print_rank_0("> Reading sizes...")
             self._sizes = np.frombuffer(self._bin_buffer, dtype=np.int32,
                                         count=self._len, offset=offset)
-            print("> Reading pointers...")
+            print_rank_0("> Reading pointers...")
             self._pointers = np.frombuffer(self._bin_buffer, dtype=np.int64,
                                            count=self._len,
                                            offset=offset + self._sizes.nbytes)
-            print("> Reading document index...")
+            print_rank_0("> Reading document index...")
             self._doc_idx = np.frombuffer(self._bin_buffer, dtype=np.int64,
                                           count=self._doc_count,
                                           offset=offset + self._sizes.nbytes
                                           + self._pointers.nbytes)

         def __del__(self):

@@ -446,13 +447,13 @@ class MMapIndexedDataset(torch.utils.data.Dataset):
         self._index = self.Index(index_file_path(self._path), skip_warmup)

         if not skip_warmup:
-            print("> Warming up data mmap file...")
+            print_rank_0("> Warming up data mmap file...")
             _warmup_mmap_file(data_file_path(self._path))
-        print("> Creating numpy buffer of mmap...")
+        print_rank_0("> Creating numpy buffer of mmap...")
         self._bin_buffer_mmap = np.memmap(data_file_path(self._path),
                                           mode='r', order='C')
-        print("> Creating memory view of numpy buffer...")
+        print_rank_0("> Creating memory view of numpy buffer...")
         self._bin_buffer = memoryview(self._bin_buffer_mmap)
-        print("> Done")
+        print_rank_0("> Done")

     def __del__(self):
         self._bin_buffer_mmap._mmap.close()
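Note: progress messages now go through print_rank_0 so multi-process runs log once instead of once per rank, and the warmup read grows from 1 MiB to 100 MiB per call, which cuts the number of read syscalls when paging large .bin files into the OS cache. The PR does not show megatron.utils.print_rank_0 itself; a minimal sketch of what it is assumed to do:

    import torch

    def print_rank_0(message):
        # Assumed behavior: only rank 0 prints, so logs are not
        # duplicated across distributed workers; fall back to a
        # plain print when torch.distributed is not initialized.
        if torch.distributed.is_initialized():
            if torch.distributed.get_rank() == 0:
                print(message, flush=True)
        else:
            print(message, flush=True)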