megatron/model/transformer.py (+1 −1)

@@ -218,7 +218,7 @@ class ParallelSelfAttention(MegatronModule):
     if checkpoint_version == 0:
         # [s, b, (3 * np * hn)] --> [s, b, (np * 3 * hn)]
         mixed_x_layer = self._transpose_last_dim(mixed_x_layer, 3, True)
-    elif checkpoint_version == 1:
+    elif checkpoint_version == 1.0:
         # [s, b, (np * hn * 3)] --> [s, b, (np * 3 * hn)]
         mixed_x_layer = self._transpose_last_dim(mixed_x_layer, 3, False)
megatron/model/transformer.py (+1 −1)

@@ -218,7 +218,7 @@ class ParallelSelfAttention(MegatronModule):
     if checkpoint_version == 0:
         # [s, b, (3 * np * hn)] --> [s, b, (np * 3 * hn)]
         mixed_x_layer = self._transpose_last_dim(mixed_x_layer, 3, True)
-    elif checkpoint_version == 1:
+    elif checkpoint_version == 1.0:
         # [s, b, (np * hn * 3)] --> [s, b, (np * 3 * hn)]
         mixed_x_layer = self._transpose_last_dim(mixed_x_layer, 3, False)