Loading megatron/utils.py +1 −1 Original line number Diff line number Diff line Loading @@ -84,7 +84,7 @@ def average_losses_across_data_parallel_group(losses): [loss.clone().detach().view(1) for loss in losses]) torch.distributed.all_reduce(averaged_losses, group=mpu.get_data_parallel_group()) averaged_losses = averaged_losses / averaged_losses = averaged_losses / \ torch.distributed.get_world_size(group=mpu.get_data_parallel_group()) return averaged_losses Loading Loading
megatron/utils.py +1 −1 Original line number Diff line number Diff line Loading @@ -84,7 +84,7 @@ def average_losses_across_data_parallel_group(losses): [loss.clone().detach().view(1) for loss in losses]) torch.distributed.all_reduce(averaged_losses, group=mpu.get_data_parallel_group()) averaged_losses = averaged_losses / averaged_losses = averaged_losses / \ torch.distributed.get_world_size(group=mpu.get_data_parallel_group()) return averaged_losses Loading