Loading megatron/data/bert_dataset.py +3 −0 Original line number Diff line number Diff line Loading @@ -246,6 +246,9 @@ def get_samples_mapping_(indexed_dataset, start_time = time.time() print_rank_0(' > building samples index mapping for {} ...'.format( name)) # First compile and then import. from megatron.data.dataset_utils import compile_helper compile_helper() from megatron.data import helpers samples_mapping = helpers.build_mapping( indexed_dataset.doc_idx, Loading megatron/data/dataset_utils.py +9 −0 Original line number Diff line number Diff line Loading @@ -18,6 +18,15 @@ import collections import numpy as np def compile_helper(): """Compile helper function at runtime. Make sure this is invoked on a single process.""" import os import subprocess path = os.path.abspath(os.path.dirname(__file__)) subprocess.run(['make', '-C', path]) def build_training_sample(sample, target_seq_length, max_seq_length, vocab_id_list, vocab_id_to_token_dict, Loading megatron/data/gpt2_dataset.py +3 −0 Original line number Diff line number Diff line Loading @@ -178,6 +178,9 @@ def _build_index_mappings(name, data_prefix, documents, sizes, # sample-idx. start_time = time.time() # Use C++ implementation for speed. # First compile and then import. from megatron.data.dataset_utils import compile_helper compile_helper() from megatron.data import helpers assert doc_idx.dtype == np.int32 assert sizes.dtype == np.int32 Loading Loading
megatron/data/bert_dataset.py +3 −0 Original line number Diff line number Diff line Loading @@ -246,6 +246,9 @@ def get_samples_mapping_(indexed_dataset, start_time = time.time() print_rank_0(' > building samples index mapping for {} ...'.format( name)) # First compile and then import. from megatron.data.dataset_utils import compile_helper compile_helper() from megatron.data import helpers samples_mapping = helpers.build_mapping( indexed_dataset.doc_idx, Loading
megatron/data/dataset_utils.py +9 −0 Original line number Diff line number Diff line Loading @@ -18,6 +18,15 @@ import collections import numpy as np def compile_helper(): """Compile helper function at runtime. Make sure this is invoked on a single process.""" import os import subprocess path = os.path.abspath(os.path.dirname(__file__)) subprocess.run(['make', '-C', path]) def build_training_sample(sample, target_seq_length, max_seq_length, vocab_id_list, vocab_id_to_token_dict, Loading
megatron/data/gpt2_dataset.py +3 −0 Original line number Diff line number Diff line Loading @@ -178,6 +178,9 @@ def _build_index_mappings(name, data_prefix, documents, sizes, # sample-idx. start_time = time.time() # Use C++ implementation for speed. # First compile and then import. from megatron.data.dataset_utils import compile_helper compile_helper() from megatron.data import helpers assert doc_idx.dtype == np.int32 assert sizes.dtype == np.int32 Loading