Commit f1f9fa0a authored by Jared Casper
Browse files

Merge branch 'staging_compile_helpers' into 'staging'

runtime compilation of helpers

See merge request ADLR/megatron-lm!61
parents 8134313d 9073c4bd
Loading
Loading
Loading
Loading
+3 −0
Original line number Diff line number Diff line
@@ -246,6 +246,9 @@ def get_samples_mapping_(indexed_dataset,
        start_time = time.time()
        print_rank_0(' > building sapmles index mapping for {} ...'.format(
            name))
        # First compile and then import.
        from megatron.data.dataset_utils import compile_helper
        compile_helper()
        from megatron.data import helpers
        samples_mapping = helpers.build_mapping(
            indexed_dataset.doc_idx,
+13 −0
Original line number Diff line number Diff line
@@ -18,6 +18,19 @@ import collections
import numpy as np


def compile_helper():
    """Compile the C++ dataset helpers at runtime.

    Runs ``make -C <dir>``, where ``<dir>`` is the directory containing
    this file. Make sure this is invoked on a single process.

    Raises:
        SystemExit: with status 1 if ``make`` cannot be launched (for
            example, it is not installed) or returns a non-zero code.
    """
    import os
    import subprocess
    import sys

    path = os.path.abspath(os.path.dirname(__file__))
    try:
        ret = subprocess.run(['make', '-C', path])
    except OSError as err:
        # ``make`` is missing or not executable: report it through the same
        # failure path instead of surfacing a raw traceback.
        print("Unable to run make: {}".format(err))
        failed = True
    else:
        failed = ret.returncode != 0

    if failed:
        print("Making C++ dataset helpers module failed, exiting.")
        sys.exit(1)


def build_training_sample(sample,
                          target_seq_length, max_seq_length,
                          vocab_id_list, vocab_id_to_token_dict,
+3 −0
Original line number Diff line number Diff line
@@ -178,6 +178,9 @@ def _build_index_mappings(name, data_prefix, documents, sizes,
            # sample-idx.
            start_time = time.time()
            # Use C++ implementation for speed.
            # First compile and then import.
            from megatron.data.dataset_utils import compile_helper
            compile_helper()
            from megatron.data import helpers
            assert doc_idx.dtype == np.int32
            assert sizes.dtype == np.int32