Loading megatron/training.py +6 −2 Original line number Diff line number Diff line Loading @@ -96,9 +96,12 @@ def pretrain(train_valid_test_dataset_provider, initialize_megatron(extra_args_provider=extra_args_provider, args_defaults=args_defaults) # Adjust the startup time so it reflects the largest value. # This will be closer to what scheduler will see (outside of # image ... launches. # XW: _TRAIN_START_TIME was measured at the beginning of the code. Here it is declared as a global variable. # XW: The reduce operation should be MAX instead of MIN here global _TRAIN_START_TIME start_time_tensor = torch.cuda.DoubleTensor([_TRAIN_START_TIME]) torch.distributed.all_reduce(start_time_tensor, Loading Loading @@ -217,8 +220,9 @@ def get_model(model_provider_func, model_type=ModelType.encoder_or_decoder, wrap this_model.model_type = model_type model.append(this_model) else: pre_process = mpu.is_pipeline_first_stage() post_process = mpu.is_pipeline_last_stage() pre_process = mpu.is_pipeline_first_stage() # check if this GPU gets the first pipeline sub-model post_process = mpu.is_pipeline_last_stage() # check if this GPU gets the last pipeline sub-model add_encoder = True add_decoder = True if model_type == ModelType.encoder_and_decoder: Loading Loading
megatron/training.py +6 −2 Original line number Diff line number Diff line Loading @@ -96,9 +96,12 @@ def pretrain(train_valid_test_dataset_provider, initialize_megatron(extra_args_provider=extra_args_provider, args_defaults=args_defaults) # Adjust the startup time so it reflects the largest value. # This will be closer to what scheduler will see (outside of # image ... launches. # XW: _TRAIN_START_TIME was measured at the beginning of the code. Here it is declared as a global variable. # XW: The reduce operation should be MAX instead of MIN here global _TRAIN_START_TIME start_time_tensor = torch.cuda.DoubleTensor([_TRAIN_START_TIME]) torch.distributed.all_reduce(start_time_tensor, Loading Loading @@ -217,8 +220,9 @@ def get_model(model_provider_func, model_type=ModelType.encoder_or_decoder, wrap this_model.model_type = model_type model.append(this_model) else: pre_process = mpu.is_pipeline_first_stage() post_process = mpu.is_pipeline_last_stage() pre_process = mpu.is_pipeline_first_stage() # check if this GPU gets the first pipeline sub-model post_process = mpu.is_pipeline_last_stage() # check if this GPU gets the last pipeline sub-model add_encoder = True add_decoder = True if model_type == ModelType.encoder_and_decoder: Loading