Loading megatron/model/bert_model.py +1 −1 Original line number Diff line number Diff line Loading @@ -138,11 +138,11 @@ class BertModel(MegatronModule): self.parallel_output = parallel_output self.pre_process = pre_process self.post_process = post_process init_method = init_method_normal(args.init_method_std) scaled_init_method = scaled_init_method_normal(args.init_method_std, args.num_layers) #XW: get_language_model initializes embeddings and Parallel Transformer self.language_model, self._language_model_key = get_language_model( num_tokentypes=num_tokentypes, add_pooler=self.add_binary_head, Loading Loading
megatron/model/bert_model.py +1 −1 Original line number Diff line number Diff line Loading @@ -138,11 +138,11 @@ class BertModel(MegatronModule): self.parallel_output = parallel_output self.pre_process = pre_process self.post_process = post_process init_method = init_method_normal(args.init_method_std) scaled_init_method = scaled_init_method_normal(args.init_method_std, args.num_layers) #XW: get_language_model initializes embeddings and Parallel Transformer self.language_model, self._language_model_key = get_language_model( num_tokentypes=num_tokentypes, add_pooler=self.add_binary_head, Loading