megatron/data/dataset.py  (+53 −4)

@@ -8,7 +8,7 @@
 import torch
 from torch.utils.data import Dataset
 from dataset_utils import build_training_sample
-from data.mapping import build_training_samples_mapping
+#from data.mapping import build_training_samples_mapping

 class AlbertDataSet(Dataset):

@@ -57,7 +57,7 @@ class AlbertDataSet(Dataset):
                                      self.mask_id, self.pad_id,
                                      self.masked_lm_prob, rng)

+'''
 def get_target_seq_length(max_num_tokens, short_seq_prob, np_rng):
     """With probability `short_seq_prob` generate a smaller sequence length."""
     if np_rng.random() < short_seq_prob:

@@ -169,7 +169,7 @@ def build_training_samples_mapping(indexed_dataset, num_epochs, max_seq_length,
     print('****************************************************************\n')
     return samples_np
+'''

 # WILL BE REPLACED WITH JARED'S
 class JaredDataset(object):

@@ -207,7 +207,7 @@ if __name__ == '__main__':
             sentences.extend(sent)
         yield sentences

-    input_file = '/raid/mshoeybi/data/albert/sample/samples_1000.json'
+    input_file = '/raid/mshoeybi/data/albert/sample/samples_11.json'
     vocab_file = '/raid/mshoeybi/data/albert/bert_vocab/vocab.txt'
     tokenizer = FullTokenizer(vocab_file, do_lower_case=True)

@@ -236,6 +236,55 @@ if __name__ == '__main__':
     for i in range(1, len(doc_idx)):
         doc_idx[i] += doc_idx[i-1]

+    #max_size = np.iinfo(np.int32).max // 32
+    import time
+    docs_np = np.array(doc_idx, dtype=np.uint32)
+    sizes_np = np.array(sizes, dtype=np.uint16)
+    start_time = time.time()
+    max_seq_length = 512
+    max_size = docs_np.shape[0]
+    lens = np.full(max_size, max_seq_length-3, dtype=np.uint16)
+    lens_rand = np.random.randint(low=2, high=(max_seq_length-2),
+                                  size=max_size//10, dtype=np.uint16)
+    lens_view = lens[:max_size//10]
+    np.copyto(lens_view, lens_rand)
+    np.random.shuffle(lens)
+    print('num docs', max_size)
+    print('lens time', time.time() - start_time)
+
+    import helpers
+    start_time = time.time()
+    maps = helpers.build_mapping(docs_np, sizes_np, 10, 100, 509, 0.1, 1234)
+    print('maps time', time.time() - start_time)
+    print(maps)
+    exit()
+
+    start_time = time.time()
+    max_size = 10  #np.iinfo(np.int32).max // 32
+    docs = np.arange(10, dtype=np.uint32)
+    print(docs)
+    a = example.doit(docs, max_size)
+    print(type(a))
+    print(a.shape)
+    print(a)
+    print(time.time() - start_time)
+    exit()
+
+    #start_time = time.time()
+    count = doit(maps, docs_np, sizes_np, lens, docs_np.shape[0]-1, 10)
+    print(count)
+    maps = maps[:count]
+    np.random.shuffle(maps)
+    print(time.time() - start_time)
+    exit()

     indexed_dataset = JaredDataset(doc_idx, sizes, sentences_list)
     dataset = AlbertDataSet(indexed_dataset=indexed_dataset,
                             tokenizer=tokenizer,
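The new driver code above flattens the corpus into two arrays: sizes_np holds the token count of every sentence, and docs_np holds cumulative sentence offsets, so document d spans sizes_np[docs_np[d]:docs_np[d+1]] and docs_np[-1] must equal len(sizes_np). To make that layout concrete, here is a minimal sketch; the three-document toy corpus and its token counts are invented, and it assumes the helpers module (the new helpers.cpp below) has already been compiled into an importable pybind11 extension.

    import numpy as np
    import helpers  # pybind11 extension built from helpers.cpp below

    # Three documents with 2, 3, and 1 sentences. docs_np holds cumulative
    # sentence offsets: document d spans sizes_np[docs_np[d]:docs_np[d+1]].
    docs_np = np.array([0, 2, 5, 6], dtype=np.uint32)
    # Token count of each of the six sentences.
    sizes_np = np.array([120, 80, 200, 150, 90, 60], dtype=np.uint16)

    # Same arguments as the __main__ block above: 10 epochs, at most 100
    # samples, target length 509 tokens, 10% short sequences, seed 1234.
    maps = helpers.build_mapping(docs_np, sizes_np, 10, 100, 509, 0.1, 1234)
    # Each row is [first sentence, one-past-last sentence, target length].
    print(maps.shape, maps.dtype)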
megatron/data/helpers.cpp  (new file, +202 −0)

#include <algorithm>
#include <iostream>
#include <limits>
#include <math.h>
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>

namespace py = pybind11;
using namespace std;

inline uint32_t get_sample_len(const int short_seq_ratio,
                               const uint32_t max_length) {
    /* Training sample length. */
    const auto random_number = rand();
    if ((random_number % short_seq_ratio) == 0) {
        return 2 + random_number % (max_length - 1);
    }
    return max_length;
}
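For efficiency, build_mapping (below) converts short_seq_prob into the integer short_seq_ratio = round(1/short_seq_prob), and get_sample_len returns a shortened target length whenever rand() % short_seq_ratio == 0, i.e. roughly a short_seq_prob fraction of the time; otherwise it returns max_length. A pure-Python restatement of the same rule (get_sample_len_py is an illustrative name, not part of the module):

    import random

    def get_sample_len_py(short_seq_ratio, max_length, rng=random):
        """Mirror of get_sample_len in helpers.cpp."""
        r = rng.randrange(2**31)  # stands in for C's rand()
        if r % short_seq_ratio == 0:
            # Short sample: length in [2, max_length].
            return 2 + r % (max_length - 1)
        return max_length

    # short_seq_prob = 0.1 -> ratio 10, so ~10% of targets are short.
    lengths = [get_sample_len_py(10, 509) for _ in range(100000)]
    print(sum(l < 509 for l in lengths) / len(lengths))  # ~0.1

One side effect worth noting: because a single draw decides both the branch and the short length, short lengths can only be 2 plus a multiple of gcd(short_seq_ratio, max_length - 1); with ratio 10 and max_length 509 they are always even.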
py::array_t<uint32_t> build_mapping(const py::array_t<uint32_t>& docs_,
                                    const py::array_t<uint16_t>& sizes_,
                                    const int num_epochs,
                                    const int max_num_samples,
                                    const int max_seq_length,
                                    const double short_seq_prob,
                                    const int seed) {

    cout << "> building dataset mapping for " << docs_.shape(0) - 1
         << " documents with " << sizes_.shape(0) << " sentences ..."
         << endl;

    // For efficiency, convert the probability to a ratio.
    const int short_seq_ratio = int(round(1.0 / short_seq_prob));

    // Remove bound checks.
    auto docs = docs_.unchecked<1>();
    auto sizes = sizes_.unchecked<1>();

    // Check for consistency.
    if (docs[docs.shape(0) - 1] != sizes.shape(0)) {
        cout << "last document value is not consistent with the length of sizes: "
             << docs[docs.shape(0) - 1] << " != " << sizes.shape(0) << endl;
        throw(-1);
    }

    // Mapping and its length (1D).
    int num_samples = -1;
    uint32_t* maps = NULL;

    // Perform two iterations: in the first, compute the size and allocate
    // memory; in the second, populate the map.
    bool second = false;
    for (int iteration = 0; iteration < 2; ++iteration) {

        // Set the seed so both iterations produce the same results.
        srand(seed);
        // Set the flag on the second iteration.
        if (iteration == 1) {
            second = true;
        }

        // Counters:
        uint32_t empty_docs = 0;
        uint32_t one_sent_docs = 0;

        // Current map index.
        uint64_t map_index = 0;

        // For each epoch:
        for (int epoch = 0; epoch < num_epochs; ++epoch) {
            if (map_index >= max_num_samples) {
                cout << " > reached " << max_num_samples << " samples after "
                     << epoch << " epochs ..." << endl;
                break;
            }
            // For each document:
            for (int doc = 0; doc < (docs.shape(0) - 1); ++doc) {

                // Document sentences are in [sent_index_first, sent_index_last).
                const uint32_t sent_index_first = docs[doc];
                const uint32_t sent_index_last = docs[doc + 1];

                // At the beginning of the document, the previous index is
                // the start index.
                uint32_t prev_start_index = sent_index_first;

                // Remaining sentences in the document.
                uint32_t num_remain_sent = sent_index_last - sent_index_first;

                // Some bookkeeping.
                if ((epoch == 0) && (!second)) {
                    if (num_remain_sent == 0) {
                        cout << "***WARNING*** document " << doc
                             << " is empty" << endl;
                        empty_docs += 1;
                    }
                    if (num_remain_sent == 1) {
                        cout << "***WARNING*** document " << doc
                             << " has one sentence" << endl;
                        one_sent_docs += 1;
                    }
                }

                // If we have more than one sentence.
                if (num_remain_sent > 1) {

                    // Set values.
                    uint32_t size = 0;
                    uint32_t num_sent = 0;
                    uint32_t seq_len = get_sample_len(short_seq_ratio,
                                                      max_seq_length);

                    // Loop through the sentences.
                    for (uint32_t sent_index = sent_index_first;
                         sent_index < sent_index_last; ++sent_index) {

                        // Add the size and number of sentences.
                        size += sizes[sent_index];
                        num_sent += 1;
                        num_remain_sent -= 1;

                        // If we have reached the target length,
                        // more than one sentence is left in the document,
                        // and the sample has at least two sentences;
                        // or if we have reached the end of the document.
                        if (((size >= seq_len) && (num_remain_sent > 1) &&
                             (num_sent > 1)) || (num_remain_sent == 0)) {

                            // Populate the map.
                            if (second) {
                                const uint64_t map_index_0 = 3 * map_index;
                                maps[map_index_0] = prev_start_index;
                                maps[map_index_0 + 1] = sent_index + 1;
                                maps[map_index_0 + 2] = seq_len;
                            }

                            // Update indices / counters.
                            map_index += 1;
                            prev_start_index = sent_index + 1;
                            seq_len = get_sample_len(short_seq_ratio,
                                                     max_seq_length);
                            size = 0;
                            num_sent = 0;
                        }
                    }
                } // if (num_remain_sent > 1)
            } // for (int doc=0; ...)
        } // for (int epoch=0; ...)

        // For now, only support mappings up to MAX_INT.
        if (map_index > std::numeric_limits<int>::max()) {
            cout << "number of samples (" << map_index
                 << ") exceeded MAX_INT" << endl;
            throw(-1);
        } else if (!second) {
            cout << "   number of samples:                     "
                 << map_index << endl;
            cout << "   number of empty documents:             "
                 << empty_docs << endl;
            cout << "   number of documents with one sentence: "
                 << one_sent_docs << endl;
            maps = new uint32_t[3 * map_index];
            num_samples = int(map_index);
        }
    } // for (int iteration=0; iteration < 2; ++iteration)

    // Shuffle (Fisher-Yates).
    for (int i = (num_samples - 1); i > 0; --i) {
        const int j = rand() % (i + 1);
        const uint64_t i0 = 3 * i;
        const uint64_t j0 = 3 * j;
        // Swap values.
        swap(maps[i0], maps[j0]);
        swap(maps[i0 + 1], maps[j0 + 1]);
        swap(maps[i0 + 2], maps[j0 + 2]);
    }
    cout << " > done building the mapping." << endl;

    // Deallocation callback for the returned array.
    py::capsule free_when_done(maps, [](void *mem_) {
        uint32_t *mem = reinterpret_cast<uint32_t *>(mem_);
        cout << "freeing memory for the dataset mapping" << endl;
        delete[] mem;
    });

    // Return the numpy array.
    return py::array_t<uint32_t>({num_samples, 3},  // shape
                                 {3 * 4, 4},        // C-style contiguous strides
                                 maps,              // data pointer
                                 free_when_done);   // owner of the data
}

PYBIND11_MODULE(helpers, m) {
    m.def("build_mapping", &build_mapping);
}
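Taken together, build_mapping makes two identical passes (re-seeding srand between them) so it can size the output exactly before filling it, greedily packs consecutive sentences of each document until the sampled target length is reached, and finally shuffles the resulting triples. A simplified single-epoch restatement in Python for readability (build_mapping_py is illustrative only; it omits the epoch loop, the max_num_samples cap, the two-pass allocation, and the shuffle):

    import numpy as np

    def build_mapping_py(docs, sizes, seq_len_fn):
        """Single-epoch sketch of the greedy packing loop in helpers.cpp."""
        maps = []
        for doc in range(len(docs) - 1):
            first, last = int(docs[doc]), int(docs[doc + 1])
            if last - first < 2:  # skip empty and one-sentence documents
                continue
            prev_start = first
            size = num_sent = 0
            seq_len = seq_len_fn()
            for sent in range(first, last):
                size += int(sizes[sent])
                num_sent += 1
                remain = last - (sent + 1)
                # Close a sample once the target is reached (with at least two
                # sentences in it and at least two left over), or at doc end.
                if (size >= seq_len and remain > 1 and num_sent > 1) or remain == 0:
                    maps.append((prev_start, sent + 1, seq_len))
                    prev_start = sent + 1
                    seq_len = seq_len_fn()
                    size = num_sent = 0
        return np.array(maps, dtype=np.uint32)

    docs = np.array([0, 2, 5, 6], dtype=np.uint32)
    sizes = np.array([120, 80, 200, 150, 90, 60], dtype=np.uint16)
    print(build_mapping_py(docs, sizes, lambda: 509))
    # [[  0   2 509]
    #  [  2   5 509]]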