megatron/data/dataset.py  (+53 −4)

@@ -8,7 +8,7 @@
 import torch
 from torch.utils.data import Dataset
 from dataset_utils import build_training_sample
-from data.mapping import build_training_samples_mapping
+#from data.mapping import build_training_samples_mapping

 class AlbertDataSet(Dataset):

@@ -57,7 +57,7 @@ class AlbertDataSet(Dataset):
                                      self.mask_id, self.pad_id,
                                      self.masked_lm_prob, rng)

+'''
 def get_target_seq_length(max_num_tokens, short_seq_prob, np_rng):
     """With probability `short_seq_prob` generate a smaller sequence length."""
     if np_rng.random() < short_seq_prob:

@@ -169,7 +169,7 @@ def build_training_samples_mapping(indexed_dataset, num_epochs, max_seq_length,
     print('****************************************************************\n')
     return samples_np
+'''

 # WILL BE REPLACED WITH JARED'S
 class JaredDataset(object):

@@ -207,7 +207,7 @@ if __name__ == '__main__':
             sentences.extend(sent)
         yield sentences

-    input_file = '/raid/mshoeybi/data/albert/sample/samples_1000.json'
+    input_file = '/raid/mshoeybi/data/albert/sample/samples_11.json'
     vocab_file = '/raid/mshoeybi/data/albert/bert_vocab/vocab.txt'
     tokenizer = FullTokenizer(vocab_file, do_lower_case=True)

@@ -236,6 +236,55 @@ if __name__ == '__main__':
     for i in range(1, len(doc_idx)):
         doc_idx[i] += doc_idx[i-1]

+    #max_size = np.iinfo(np.int32).max // 32
+    import time
+    docs_np = np.array(doc_idx, dtype=np.uint32)
+    sizes_np = np.array(sizes, dtype=np.uint16)
+    start_time = time.time()
+    max_seq_length = 512
+    max_size = docs_np.shape[0]
+    lens = np.full(max_size, max_seq_length-3, dtype=np.uint16)
+    lens_rand = np.random.randint(low=2, high=(max_seq_length-2),
+                                  size=max_size//10, dtype=np.uint16)
+    lens_view = lens[:max_size//10]
+    np.copyto(lens_view, lens_rand)
+    np.random.shuffle(lens)
+    print('num docs', max_size)
+    print('lens time', time.time() - start_time)
+
+    import helpers
+    start_time = time.time()
+    maps = helpers.build_mapping(docs_np, sizes_np, 10, 100, 509, 0.1, 1234)
+    print('maps time', time.time() - start_time)
+    print(maps)
+    exit()
+
+    start_time = time.time()
+    max_size = 10  #np.iinfo(np.int32).max // 32
+    docs = np.arange(10, dtype=np.uint32)
+    print(docs)
+    a = example.doit(docs, max_size)
+    print(type(a))
+    print(a.shape)
+    print(a)
+    print(time.time() - start_time)
+    exit()
+
+    #start_time = time.time()
+    count = doit(maps, docs_np, sizes_np, lens, docs_np.shape[0]-1, 10)
+    print(count)
+    maps = maps[:count]
+    np.random.shuffle(maps)
+    print(time.time() - start_time)
+    exit()

     indexed_dataset = JaredDataset(doc_idx, sizes, sentences_list)
     dataset = AlbertDataSet(indexed_dataset=indexed_dataset,
                             tokenizer=tokenizer,
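The new driver code above flattens the corpus into two arrays: sizes_np holds the token count of every sentence, and docs_np holds cumulative sentence offsets, so document d spans sizes_np[docs_np[d]:docs_np[d+1]] and docs_np[-1] must equal len(sizes_np). To make that layout concrete, here is a minimal sketch; the three-document toy corpus and its token counts are invented, and it assumes the helpers module (the new helpers.cpp below) has already been compiled into an importable pybind11 extension.

    import numpy as np
    import helpers  # pybind11 extension built from helpers.cpp below

    # Three documents with 2, 3, and 1 sentences. docs_np holds cumulative
    # sentence offsets: document d spans sizes_np[docs_np[d]:docs_np[d+1]].
    docs_np = np.array([0, 2, 5, 6], dtype=np.uint32)
    # Token count of each of the six sentences.
    sizes_np = np.array([120, 80, 200, 150, 90, 60], dtype=np.uint16)

    # Same arguments as the __main__ block above: 10 epochs, at most 100
    # samples, target length 509 tokens, 10% short sequences, seed 1234.
    maps = helpers.build_mapping(docs_np, sizes_np, 10, 100, 509, 0.1, 1234)
    # Each row is [first sentence, one-past-last sentence, target length].
    print(maps.shape, maps.dtype)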
megatron/data/helpers.cpp  (new file, +202 −0)

#include <algorithm>
#include <iostream>
#include <limits>
#include <math.h>
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>

namespace py = pybind11;
using namespace std;

inline uint32_t get_sample_len(const int short_seq_ratio,
                               const uint32_t max_length) {
    /* Training sample length. */
    const auto random_number = rand();
    if ((random_number % short_seq_ratio) == 0) {
        return 2 + random_number % (max_length - 1);
    }
    return max_length;
}
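For efficiency, build_mapping (below) converts short_seq_prob into the integer short_seq_ratio = round(1/short_seq_prob), and get_sample_len returns a shortened target length whenever rand() % short_seq_ratio == 0, i.e. roughly a short_seq_prob fraction of the time; otherwise it returns max_length. A pure-Python restatement of the same rule (get_sample_len_py is an illustrative name, not part of the module):

    import random

    def get_sample_len_py(short_seq_ratio, max_length, rng=random):
        """Mirror of get_sample_len in helpers.cpp."""
        r = rng.randrange(2**31)  # stands in for C's rand()
        if r % short_seq_ratio == 0:
            # Short sample: length in [2, max_length].
            return 2 + r % (max_length - 1)
        return max_length

    # short_seq_prob = 0.1 -> ratio 10, so ~10% of targets are short.
    lengths = [get_sample_len_py(10, 509) for _ in range(100000)]
    print(sum(l < 509 for l in lengths) / len(lengths))  # ~0.1

One side effect worth noting: because a single draw decides both the branch and the short length, short lengths can only be 2 plus a multiple of gcd(short_seq_ratio, max_length - 1); with ratio 10 and max_length 509 they are always even.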
py::array_t<uint32_t> build_mapping(const py::array_t<uint32_t>& docs_,
                                    const py::array_t<uint16_t>& sizes_,
                                    const int num_epochs,
                                    const int max_num_samples,
                                    const int max_seq_length,
                                    const double short_seq_prob,
                                    const int seed) {

    cout << "> building dataset mapping for " << docs_.shape(0) - 1
         << " documents with " << sizes_.shape(0) << " sentences ..."
         << endl;

    // For efficiency, convert the probability to a ratio.
    const int short_seq_ratio = int(round(1.0 / short_seq_prob));

    // Remove bound checks.
    auto docs = docs_.unchecked<1>();
    auto sizes = sizes_.unchecked<1>();

    // Check for consistency.
    if (docs[docs.shape(0) - 1] != sizes.shape(0)) {
        cout << "last document value is not consistent with the length of sizes: "
             << docs[docs.shape(0) - 1] << " != " << sizes.shape(0) << endl;
        throw(-1);
    }

    // Mapping and its length (1D).
    int num_samples = -1;
    uint32_t* maps = NULL;

    // Perform two iterations: in the first, compute the size and allocate
    // memory; in the second, populate the map.
    bool second = false;
    for (int iteration = 0; iteration < 2; ++iteration) {

        // Set the seed so both iterations produce the same results.
        srand(seed);
        // Set the flag on the second iteration.
        if (iteration == 1) {
            second = true;
        }

        // Counters:
        uint32_t empty_docs = 0;
        uint32_t one_sent_docs = 0;

        // Current map index.
        uint64_t map_index = 0;

        // For each epoch:
        for (int epoch = 0; epoch < num_epochs; ++epoch) {
            if (map_index >= max_num_samples) {
                cout << " > reached " << max_num_samples << " samples after "
                     << epoch << " epochs ..." << endl;
                break;
            }
            // For each document:
            for (int doc = 0; doc < (docs.shape(0) - 1); ++doc) {

                // Document sentences are in [sent_index_first, sent_index_last).
                const uint32_t sent_index_first = docs[doc];
                const uint32_t sent_index_last = docs[doc + 1];

                // At the beginning of the document, the previous index is
                // the start index.
                uint32_t prev_start_index = sent_index_first;

                // Remaining sentences in the document.
                uint32_t num_remain_sent = sent_index_last - sent_index_first;

                // Some bookkeeping.
                if ((epoch == 0) && (!second)) {
                    if (num_remain_sent == 0) {
                        cout << "***WARNING*** document " << doc
                             << " is empty" << endl;
                        empty_docs += 1;
                    }
                    if (num_remain_sent == 1) {
                        cout << "***WARNING*** document " << doc
                             << " has one sentence" << endl;
                        one_sent_docs += 1;
                    }
                }

                // If we have more than one sentence.
                if (num_remain_sent > 1) {

                    // Set values.
                    uint32_t size = 0;
                    uint32_t num_sent = 0;
                    uint32_t seq_len = get_sample_len(short_seq_ratio,
                                                      max_seq_length);

                    // Loop through the sentences.
                    for (uint32_t sent_index = sent_index_first;
                         sent_index < sent_index_last; ++sent_index) {

                        // Add the size and number of sentences.
                        size += sizes[sent_index];
                        num_sent += 1;
                        num_remain_sent -= 1;

                        // If we have reached the target length,
                        // more than one sentence is left in the document,
                        // and the sample has at least two sentences;
                        // or if we have reached the end of the document.
                        if (((size >= seq_len) && (num_remain_sent > 1) &&
                             (num_sent > 1)) || (num_remain_sent == 0)) {

                            // Populate the map.
                            if (second) {
                                const uint64_t map_index_0 = 3 * map_index;
                                maps[map_index_0] = prev_start_index;
                                maps[map_index_0 + 1] = sent_index + 1;
                                maps[map_index_0 + 2] = seq_len;
                            }

                            // Update indices / counters.
                            map_index += 1;
                            prev_start_index = sent_index + 1;
                            seq_len = get_sample_len(short_seq_ratio,
                                                     max_seq_length);
                            size = 0;
                            num_sent = 0;
                        }
                    }
                } // if (num_remain_sent > 1)
            } // for (int doc=0; ...)
        } // for (int epoch=0; ...)

        // For now, only support mappings up to MAX_INT.
        if (map_index > std::numeric_limits<int>::max()) {
            cout << "number of samples (" << map_index
                 << ") exceeded MAX_INT" << endl;
            throw(-1);
        } else if (!second) {
            cout << "   number of samples:                     "
                 << map_index << endl;
            cout << "   number of empty documents:             "
                 << empty_docs << endl;
            cout << "   number of documents with one sentence: "
                 << one_sent_docs << endl;
            maps = new uint32_t[3 * map_index];
            num_samples = int(map_index);
        }
    } // for (int iteration=0; iteration < 2; ++iteration)

    // Shuffle (Fisher-Yates).
    for (int i = (num_samples - 1); i > 0; --i) {
        const int j = rand() % (i + 1);
        const uint64_t i0 = 3 * i;
        const uint64_t j0 = 3 * j;
        // Swap values.
        swap(maps[i0], maps[j0]);
        swap(maps[i0 + 1], maps[j0 + 1]);
        swap(maps[i0 + 2], maps[j0 + 2]);
    }
    cout << " > done building the mapping." << endl;

    // Deallocation callback for the returned array.
    py::capsule free_when_done(maps, [](void *mem_) {
        uint32_t *mem = reinterpret_cast<uint32_t *>(mem_);
        cout << "freeing memory for the dataset mapping" << endl;
        delete[] mem;
    });

    // Return the numpy array.
    return py::array_t<uint32_t>({num_samples, 3},  // shape
                                 {3 * 4, 4},        // C-style contiguous strides
                                 maps,              // data pointer
                                 free_when_done);   // owner of the data
}

PYBIND11_MODULE(helpers, m) {
    m.def("build_mapping", &build_mapping);
}
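Taken together, build_mapping makes two identical passes (re-seeding srand between them) so it can size the output exactly before filling it, greedily packs consecutive sentences of each document until the sampled target length is reached, and finally shuffles the resulting triples. A simplified single-epoch restatement in Python for readability (build_mapping_py is illustrative only; it omits the epoch loop, the max_num_samples cap, the two-pass allocation, and the shuffle):

    import numpy as np

    def build_mapping_py(docs, sizes, seq_len_fn):
        """Single-epoch sketch of the greedy packing loop in helpers.cpp."""
        maps = []
        for doc in range(len(docs) - 1):
            first, last = int(docs[doc]), int(docs[doc + 1])
            if last - first < 2:  # skip empty and one-sentence documents
                continue
            prev_start = first
            size = num_sent = 0
            seq_len = seq_len_fn()
            for sent in range(first, last):
                size += int(sizes[sent])
                num_sent += 1
                remain = last - (sent + 1)
                # Close a sample once the target is reached (with at least two
                # sentences in it and at least two left over), or at doc end.
                if (size >= seq_len and remain > 1 and num_sent > 1) or remain == 0:
                    maps.append((prev_start, sent + 1, seq_len))
                    prev_start = sent + 1
                    seq_len = seq_len_fn()
                    size = num_sent = 0
        return np.array(maps, dtype=np.uint32)

    docs = np.array([0, 2, 5, 6], dtype=np.uint32)
    sizes = np.array([120, 80, 200, 150, 90, 60], dtype=np.uint16)
    print(build_mapping_py(docs, sizes, lambda: 509))
    # [[  0   2 509]
    #  [  2   5 509]]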