Commit 46a536cc authored Jun 10, 2020 by Mohammad Shoeybi

Merge branch 'mmap_data_samples' into 'master'

mmap sample map

See merge request ADLR/megatron-lm!88

parents cfe35fd3 dd76c914

megatron/data/bert_dataset.py

+1 −1

Original line number	Diff line number	Diff line
		@@ -279,7 +279,7 @@ def get_samples_mapping_(indexed_dataset,
		print_rank_0(' > loading indexed mapping from {}'.format(
		indexmap_filename))
		start_time = time.time()
		-samples_mapping = np.load(indexmap_filename, allow_pickle=True)
		+samples_mapping = np.load(indexmap_filename, allow_pickle=True, mmap_mode='r')
		print_rank_0(' loaded indexed file in {:3.3f} seconds'.format(
		time.time() - start_time))
		print_rank_0(' total number of samples: {}'.format(

megatron/data/gpt2_dataset.py

+3 −3

Original line number	Diff line number	Diff line
		@@ -212,13 +212,13 @@ def _build_index_mappings(name, data_prefix, documents, sizes,
		start_time = time.time()
		print_rank_0(' > loading doc-idx mapping from {}'.format(
		doc_idx_filename))
		-doc_idx = np.load(doc_idx_filename, allow_pickle=True)
		+doc_idx = np.load(doc_idx_filename, allow_pickle=True, mmap_mode='r')
		print_rank_0(' > loading sample-idx mapping from {}'.format(
		sample_idx_filename))
		-sample_idx = np.load(sample_idx_filename, allow_pickle=True)
		+sample_idx = np.load(sample_idx_filename, allow_pickle=True, mmap_mode='r')
		print_rank_0(' > loading shuffle-idx mapping from {}'.format(
		shuffle_idx_filename))
		-shuffle_idx = np.load(shuffle_idx_filename, allow_pickle=True)
		+shuffle_idx = np.load(shuffle_idx_filename, allow_pickle=True, mmap_mode='r')
		print_rank_0(' loaded indexed file in {:3.3f} seconds'.format(
		time.time() - start_time))
		print_rank_0(' total number of samples: {}'.format(