```diff
megatron/model/transformer.py  (+1 −3)

@@ -53,8 +53,7 @@ class ParallelMLP(MegatronModule):
     MLP will take the input with h hidden state, project it to 4*h
     hidden dimension, perform nonlinear transformation, and project the
-    state back into h hidden dimension. At the end, dropout is also
-    applied.
+    state back into h hidden dimension.
     """

     def __init__(self, init_method, output_layer_init_method):

@@ -84,7 +83,6 @@ class ParallelMLP(MegatronModule):
             init_method=output_layer_init_method,
             skip_bias_add=True)

-
    def forward(self, hidden_states):

        # [s, b, 4hp]
```
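The docstring edit reflects that `ParallelMLP` itself no longer claims to apply dropout; with `skip_bias_add=True` on the second projection, the layer returns its output and bias separately so the caller can fuse bias, dropout, and the residual add in one elementwise pass. Below is a minimal, self-contained sketch of that flow using plain `nn.Linear` in place of the tensor-parallel layers; the class and attribute names mirror the Megatron ones for readability, but this is an illustration, not the actual Megatron-LM code.

```python
import torch
import torch.nn.functional as F

class MLPSketch(torch.nn.Module):
    """Sketch of the h -> 4h -> h MLP; dropout is deferred to the caller."""

    def __init__(self, hidden_size):
        super().__init__()
        self.dense_h_to_4h = torch.nn.Linear(hidden_size, 4 * hidden_size)
        # Bias is kept as a separate parameter to mimic skip_bias_add=True.
        self.dense_4h_to_h = torch.nn.Linear(4 * hidden_size, hidden_size,
                                             bias=False)
        self.bias = torch.nn.Parameter(torch.zeros(hidden_size))

    def forward(self, hidden_states):
        # Project to 4h and apply the nonlinearity.
        intermediate = F.gelu(self.dense_h_to_4h(hidden_states))
        # Project back to h; return output and bias separately so the
        # caller can fuse bias + dropout + residual add downstream.
        output = self.dense_4h_to_h(intermediate)
        return output, self.bias
```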
```diff
megatron/mpu/layers.py  (+4 −4)

@@ -316,8 +316,8 @@ class RowParallelLinear(torch.nn.Module):
     keep_master_weight_for_test: This was added for testing and should be
                                  set to False. It returns the master weights
                                  used for initialization.
-    skip_bias_add: This was added to enable performance optimations where bias
-                   can be fused with other elementwise operations. we skip
+    skip_bias_add: This was added to enable performance optimization where bias
+                   can be fused with other elementwise operations. We skip
                    adding bias but instead return it.
     """
```
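To make the corrected docstring concrete: with `skip_bias_add=True`, the linear layer computes `x @ W^T` without the bias and hands the bias tensor back to the caller, who can then add it together with some other elementwise operation in a single pass. The sketch below illustrates the pattern with a standalone function; `linear_skip_bias_add` is an illustrative name standing in for `RowParallelLinear`, not part of the Megatron-LM API.

```python
import torch
import torch.nn.functional as F

def linear_skip_bias_add(x, weight, bias):
    # Compute the matmul without the bias; return the bias separately so
    # the caller can fuse its addition with a later elementwise op.
    output = F.linear(x, weight)
    return output, bias

x = torch.randn(4, 16)
w = torch.randn(8, 16)
b = torch.randn(8)

out, bias = linear_skip_bias_add(x, w, b)
# Fused downstream: bias add and residual add in one elementwise pass,
# rather than a separate bias-add kernel inside the linear layer.
residual = torch.randn(4, 8)
y = out + bias + residual
```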