```diff
megatron/model/transformer.py  (+1 −3)

@@ -53,8 +53,7 @@ class ParallelMLP(MegatronModule):
     MLP will take the input with h hidden state, project it to 4*h
     hidden dimension, perform nonlinear transformation, and project the
-    state back into h hidden dimension. At the end, dropout is also
-    applied.
+    state back into h hidden dimension.
     """

     def __init__(self, init_method, output_layer_init_method):

@@ -84,7 +83,6 @@ class ParallelMLP(MegatronModule):
             init_method=output_layer_init_method,
             skip_bias_add=True)

-
    def forward(self, hidden_states):

        # [s, b, 4hp]
```
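The docstring edit reflects that `ParallelMLP` itself no longer claims to apply dropout; with `skip_bias_add=True` on the second projection, the layer returns its output and bias separately so the caller can fuse bias, dropout, and the residual add in one elementwise pass. Below is a minimal, self-contained sketch of that flow using plain `nn.Linear` in place of the tensor-parallel layers; the class and attribute names mirror the Megatron ones for readability, but this is an illustration, not the actual Megatron-LM code.

```python
import torch
import torch.nn.functional as F

class MLPSketch(torch.nn.Module):
    """Sketch of the h -> 4h -> h MLP; dropout is deferred to the caller."""

    def __init__(self, hidden_size):
        super().__init__()
        self.dense_h_to_4h = torch.nn.Linear(hidden_size, 4 * hidden_size)
        # Bias is kept as a separate parameter to mimic skip_bias_add=True.
        self.dense_4h_to_h = torch.nn.Linear(4 * hidden_size, hidden_size,
                                             bias=False)
        self.bias = torch.nn.Parameter(torch.zeros(hidden_size))

    def forward(self, hidden_states):
        # Project to 4h and apply the nonlinearity.
        intermediate = F.gelu(self.dense_h_to_4h(hidden_states))
        # Project back to h; return output and bias separately so the
        # caller can fuse bias + dropout + residual add downstream.
        output = self.dense_4h_to_h(intermediate)
        return output, self.bias
```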
```diff
megatron/mpu/layers.py  (+4 −4)

@@ -316,8 +316,8 @@ class RowParallelLinear(torch.nn.Module):
     keep_master_weight_for_test: This was added for testing and should be
                                  set to False. It returns the master weights
                                  used for initialization.
-    skip_bias_add: This was added to enable performance optimations where bias
-                   can be fused with other elementwise operations. we skip
+    skip_bias_add: This was added to enable performance optimization where bias
+                   can be fused with other elementwise operations. We skip
                    adding bias but instead return it.
     """
```
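To make the corrected docstring concrete: with `skip_bias_add=True`, the linear layer computes `x @ W^T` without the bias and hands the bias tensor back to the caller, who can then add it together with some other elementwise operation in a single pass. The sketch below illustrates the pattern with a standalone function; `linear_skip_bias_add` is an illustrative name standing in for `RowParallelLinear`, not part of the Megatron-LM API.

```python
import torch
import torch.nn.functional as F

def linear_skip_bias_add(x, weight, bias):
    # Compute the matmul without the bias; return the bias separately so
    # the caller can fuse its addition with a later elementwise op.
    output = F.linear(x, weight)
    return output, bias

x = torch.randn(4, 16)
w = torch.randn(8, 16)
b = torch.randn(8)

out, bias = linear_skip_bias_add(x, w, b)
# Fused downstream: bias add and residual add in one elementwise pass,
# rather than a separate bias-add kernel inside the linear layer.
residual = torch.randn(4, 8)
y = out + bias + residual
```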