megatron/model/classification.py (+1 −2)

```diff
@@ -52,8 +52,7 @@ class Classification(MegatronModule):
 
     def forward(self, input_ids, attention_mask, tokentype_ids):
 
-        extended_attention_mask = bert_extended_attention_mask(
-            attention_mask, next(self.language_model.parameters()).dtype)
+        extended_attention_mask = bert_extended_attention_mask(attention_mask)
         position_ids = bert_position_ids(input_ids)
 
         _, pooled_output = self.language_model(input_ids,
```
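For reference, here is a minimal sketch of a dtype-free `bert_extended_attention_mask`; the exact body may differ from the repo's, but it illustrates why the second argument can be dropped: the mask is built and kept as a binary tensor, so the caller no longer needs to thread the language model's parameter dtype through.

```python
import torch

def bert_extended_attention_mask(attention_mask):
    # Sketch only; the repo's implementation may differ in detail.
    # attention_mask: [b, s], 1 for real tokens, 0 for padding.
    # [b, 1, s] * [b, s, 1] -> [b, s, s]: pair (i, j) is attendable
    # only when both positions are real tokens.
    attention_mask_b1s = attention_mask.unsqueeze(1)
    attention_mask_bs1 = attention_mask.unsqueeze(2)
    attention_mask_bss = attention_mask_b1s * attention_mask_bs1
    # Add the head dimension: [b, 1, s, s].
    extended_attention_mask = attention_mask_bss.unsqueeze(1)
    # Keep the mask binary (True = masked out) rather than casting it
    # to the model's parameter dtype, which is what made the old
    # second argument necessary.
    return extended_attention_mask < 0.5
```

Because nothing here depends on a float dtype, the same mask works unchanged whether the language model runs in FP16 or FP32.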
megatron/model/multiple_choice.py (+1 −2)

```diff
@@ -64,8 +64,7 @@ class MultipleChoice(MegatronModule):
         attention_mask = attention_mask.view(-1, attention_mask.size(-1))
         tokentype_ids = tokentype_ids.view(-1, tokentype_ids.size(-1))
 
-        extended_attention_mask = bert_extended_attention_mask(
-            attention_mask, next(self.language_model.parameters()).dtype)
+        extended_attention_mask = bert_extended_attention_mask(attention_mask)
         position_ids = bert_position_ids(input_ids)
 
         _, pooled_output = self.language_model(input_ids,
```
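The surrounding `view(-1, size(-1))` calls in the context lines are the usual multiple-choice trick of folding the choice dimension into the batch dimension before the encoder sees the inputs. A quick standalone illustration (the shapes and vocabulary size here are hypothetical, chosen only for the demo):

```python
import torch

# Hypothetical batch: 2 questions, each with 4 answer choices,
# sequence length 128, BERT-sized vocabulary.
batch, num_choices, seq_len = 2, 4, 128
input_ids = torch.randint(0, 30522, (batch, num_choices, seq_len))
attention_mask = torch.ones(batch, num_choices, seq_len, dtype=torch.long)

# Fold choices into the batch dimension so the encoder sees ordinary
# [b * c, s] inputs, as MultipleChoice.forward does above.
attention_mask = attention_mask.view(-1, attention_mask.size(-1))
print(attention_mask.shape)  # torch.Size([8, 128])
```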
tasks/finetune_utils.py (+3 −3)

```diff
@@ -161,7 +161,7 @@ def _train(model, optimizer, lr_scheduler, forward_step,
     start_iteration = 0
 
             # Train for one step.
-            losses_dict, _ = train_step(forward_step, batch, model,
+            losses_dict, skipped_iter = train_step(forward_step, batch, model,
                                         optimizer, lr_scheduler)
             iteration += 1
 
@@ -169,7 +169,7 @@ def _train(model, optimizer, lr_scheduler, forward_step,
             report_memory_flag = training_log(losses_dict, losses_dict_sum,
                                               optimizer.param_groups[0]['lr'],
                                               iteration, optimizer.loss_scale,
-                                              report_memory_flag)
+                                              report_memory_flag, skipped_iter)
 
             # Autoresume
             if args.adlr_autoresume and \
```
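This change threads a skipped-iteration indicator from `train_step` into `training_log`, matching the two-value return unpacked above. As a loudly hypothetical sketch of that contract (the helper name below is invented, not the repo's): a step can be skipped when mixed-precision training detects non-finite gradients, and passing the flag along lets the logger report skips instead of hiding them.

```python
import torch

def grads_are_finite(model):
    # Hypothetical helper, not from the repo: a stand-in for the
    # overflow check that dynamic loss scaling performs in FP16 runs.
    return all(torch.isfinite(p.grad).all()
               for p in model.parameters() if p.grad is not None)

def train_step(forward_step, batch, model, optimizer, lr_scheduler):
    # Sketch of the new contract: return (losses_dict, skipped_iter).
    optimizer.zero_grad()
    loss = forward_step(batch, model)
    loss.backward()
    skipped_iter = 0 if grads_are_finite(model) else 1
    if not skipped_iter:
        # Only advance the weights and schedule on a clean step.
        optimizer.step()
        lr_scheduler.step()
    return {'lm loss': loss.detach()}, skipped_iter
```

`training_log` can then accumulate `skipped_iter` across iterations and print the running count next to the loss scale, which makes it easy to spot runs where the scale keeps collapsing.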