Commit 59dea876 authored by Wang, Xiao's avatar Wang, Xiao
Browse files

add comments for is_pipeline_first_stage and is_pipeline_last_stage. Add comments for TRAIN_START_TIME

add comments for is_pipeline_first_stage and is_pipeline_last_stage. Add comments for TRAIN_START_TIME
parent 393249bc
Loading
Loading
Loading
Loading
+6 −2
Original line number Diff line number Diff line
@@ -96,9 +96,12 @@ def pretrain(train_valid_test_dataset_provider,
    initialize_megatron(extra_args_provider=extra_args_provider,
                        args_defaults=args_defaults)


    # Adjust the startup time so it reflects the largest value.
    # This will be closer to what scheduler will see (outside of
    # image ... launches).  NOTE(review): this upstream comment appears truncated/garbled — confirm original wording.
    # XW: TRAIN_START_TIME was measured in the beginning of the code. Here it is made a global variable
    # XW: NOTE(review): the all_reduce here folds the start *timestamps*; the earliest (MIN) start time
    # corresponds to the largest elapsed startup duration, so MIN — not MAX — appears correct. The op=
    # argument is outside this hunk; verify against the full source before changing it.
    global _TRAIN_START_TIME
    start_time_tensor = torch.cuda.DoubleTensor([_TRAIN_START_TIME])
    torch.distributed.all_reduce(start_time_tensor,
@@ -217,8 +220,9 @@ def get_model(model_provider_func, model_type=ModelType.encoder_or_decoder, wrap
            this_model.model_type = model_type
            model.append(this_model)
    else:
        pre_process = mpu.is_pipeline_first_stage()
        post_process = mpu.is_pipeline_last_stage()
        pre_process = mpu.is_pipeline_first_stage()   # True if this rank holds the first pipeline-stage sub-model
        post_process = mpu.is_pipeline_last_stage()   # True if this rank holds the last pipeline-stage sub-model

        add_encoder = True
        add_decoder = True
        if model_type == ModelType.encoder_and_decoder: