megatron/arguments.py  +13 −11

```diff
@@ -245,17 +245,6 @@ def parse_args(extra_args_provider=None, defaults={},
     if args.fp32_residual_connection:
         assert args.fp16 or args.bf16, \
             'residual connection in fp32 only supported when using fp16 or bf16.'
-    # Activation checkpointing.
-    if args.distribute_checkpointed_activations:
-        assert args.tensor_model_parallel_size > 1, 'can distribute ' \
-            'checkpointed activations only across tensor model ' \
-            'parallel groups'
-        assert args.activations_checkpoint_method is not None, \
-            'for distribute-checkpointed-activations to work you '\
-            'need to use a activation-checkpoint method '
-        assert args.num_layers_per_virtual_pipeline_stage is None, \
-            'currently distrobuted checkpoint activations only supported for ' \
-            'nointerleaved pipeline parallelism'
 
     TORCH_MAJOR = int(torch.__version__.split('.')[0])
     TORCH_MINOR = int(torch.__version__.split('.')[1])
@@ -267,6 +256,19 @@ def parse_args(extra_args_provider=None, defaults={},
             'pytorch v1.11 (nvidia pytorch container paired with v1.11). '
             'Defaulting to no_persist_layer_norm=True')
 
+    # Activation checkpointing.
+    if args.distribute_checkpointed_activations:
+        assert args.tensor_model_parallel_size > 1, 'can distribute ' \
+            'checkpointed activations only across tensor model ' \
+            'parallel groups'
+        assert args.activations_checkpoint_method is not None, \
+            'for distributed checkpoint activations to work you '\
+            'need to use a activation-checkpoint method '
+        assert TORCH_MAJOR >= 1 and TORCH_MINOR >= 10, \
+            'distributed checkpoint activations are supported for pytorch ' \
+            'v1.10 and above (Nvidia Pytorch container >= 21.07). Current ' \
+            'pytorch version is v%s.%s.' % (TORCH_MAJOR, TORCH_MINOR)
+
     _print_args(args)
     return args
```
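The relocation is not cosmetic: the rewritten third assert references `TORCH_MAJOR`/`TORCH_MINOR`, so the block has to run after the version parsing, and the old virtual-pipeline restriction is traded for a PyTorch version floor. A minimal sketch of the new gate in isolation (the `args` namespace here is a hypothetical stand-in for the object `parse_args()` returns):

```python
import torch
from types import SimpleNamespace

# Hypothetical stand-in for Megatron's parsed arguments.
args = SimpleNamespace(distribute_checkpointed_activations=True,
                       tensor_model_parallel_size=2,
                       activations_checkpoint_method='uniform')

# Version parsing must precede the asserts, since the last one uses it.
TORCH_MAJOR = int(torch.__version__.split('.')[0])
TORCH_MINOR = int(torch.__version__.split('.')[1])

if args.distribute_checkpointed_activations:
    assert args.tensor_model_parallel_size > 1
    assert args.activations_checkpoint_method is not None
    # The PR's condition as written. Note that it would reject a
    # hypothetical v2.0 (minor 0 < 10); a tuple comparison such as
    # (TORCH_MAJOR, TORCH_MINOR) >= (1, 10) would not.
    assert TORCH_MAJOR >= 1 and TORCH_MINOR >= 10
```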
megatron/model/transformer.py  +2 −19

```diff
@@ -626,23 +626,6 @@ class ParallelTransformer(MegatronModule):
                 return x_
             return custom_forward
 
-        def distribute_checkpointed_activations_helper(layer_number):
-            """Distribute checkpointed activations across the tensor model
-               Parallel ranks if the `distribute-checkpointed-activations
-               is on and either of the following conditions is met:
-                 - it is not the first layer in the in the pipeline stage.
-                   The first layer is used in the pipeline parallelism
-                   and changing its shape throws error in the backward pass.
-                 - we are at the first pipline stage so the input tensor is
-                   not used in pipeline parallelism. Note that no pipeline
-                   parallelism is a special case of this.
-            """
-            not_first_layer_in_pipeline_stage = (layer_number > 0)
-            is_first_pipeline_stage = (
-                mpu.get_pipeline_model_parallel_rank() == 0)
-            return self.distribute_checkpointed_activations and \
-                (not_first_layer_in_pipeline_stage or is_first_pipeline_stage)
-
         if self.activations_checkpoint_method == 'uniform':
             # Uniformly divide the total number of Transformer layers and checkpoint
             # the input activation of each divided chunk.
@@ -651,7 +634,7 @@ class ParallelTransformer(MegatronModule):
             while l < self.num_layers:
                 hidden_states = mpu.checkpoint(
                     custom(l, l + self.activations_checkpoint_num_layers),
-                    distribute_checkpointed_activations_helper(l),
+                    self.distribute_checkpointed_activations,
                     hidden_states, attention_mask, encoder_output, enc_dec_attn_mask)
                 l += self.activations_checkpoint_num_layers
         elif self.activations_checkpoint_method == 'block':
@@ -662,7 +645,7 @@ class ParallelTransformer(MegatronModule):
                 if l < self.activations_checkpoint_num_layers:
                     hidden_states = mpu.checkpoint(
                         custom(l, l + 1),
-                        distribute_checkpointed_activations_helper(l),
+                        self.distribute_checkpointed_activations,
                         hidden_states, attention_mask, encoder_output, enc_dec_attn_mask)
                 else:
                     hidden_states = custom(l, l + 1)(
```
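Both checkpointing paths ('uniform' and 'block') now pass `self.distribute_checkpointed_activations` straight through instead of gating it per layer; the diff alone does not show where, or whether, the first-layer concern is handled after the change. For reference, here is the deleted gate restated as a self-contained function (the `self`/`mpu` state is lifted into parameters and the name is illustrative; the logic itself is verbatim from the removed lines):

```python
def should_distribute(layer_number: int,
                      distribute_checkpointed_activations: bool,
                      pipeline_parallel_rank: int) -> bool:
    """Removed per-layer gate: only distribute a layer's checkpointed
    input when doing so cannot disturb a tensor that pipeline
    parallelism still relies on."""
    # Any layer past the first is safe: its input is produced locally,
    # not received over the pipeline, so resharding it cannot break
    # the backward pass.
    not_first_layer_in_pipeline_stage = layer_number > 0
    # On the first pipeline stage the input comes from the embedding,
    # not from p2p communication, so even layer 0 is safe; no pipeline
    # parallelism is a special case of this.
    is_first_pipeline_stage = pipeline_parallel_rank == 0
    return distribute_checkpointed_activations and \
        (not_first_layer_in_pipeline_stage or is_first_pipeline_stage)
```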
megatron/mpu/random.py  +0 −1

```diff
@@ -24,7 +24,6 @@
 from torch import _C
 from torch.cuda import _lazy_call, device as device_ctx_manager
 from torch.utils.checkpoint import detach_variable
-from megatron import get_args
 from megatron.memory import allocate_mem_buff
 
 from .initialize import get_data_parallel_rank
```
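Read together with the transformer.py call sites above, the dropped import suggests `mpu.checkpoint` now receives the distribute flag as an explicit second argument rather than fetching it from the global `get_args()`. A hedged sketch of that calling convention follows; the signature is inferred from the call sites and the stand-in body is not this PR's implementation:

```python
from torch.utils.checkpoint import checkpoint as torch_checkpoint

def checkpoint(function, distribute_saved_activations, *args):
    """Assumed post-change shape of mpu.checkpoint: callers such as
    ParallelTransformer pass the distribute flag explicitly, so this
    module no longer needs megatron.get_args."""
    # Stand-in body: the real implementation dispatches to Megatron's
    # own autograd wrapper, which acts on the flag; plain torch
    # checkpointing is used here only to keep the sketch runnable.
    return torch_checkpoint(function, *args)
```

Threading the flag through as a parameter instead of a global lookup also makes the checkpoint path usable without Megatron's argument machinery initialized, which is a common motivation for this kind of cleanup.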