Loading megatron/arguments.py +3 −0 Original line number Diff line number Diff line Loading @@ -328,6 +328,9 @@ def _add_logging_args(parser): action='store_true', help='If set, write validation perplexity to ' 'tensorboard.') group.add_argument('--log-memory-to-tensorboard', action='store_true', help='Enable memory logging to tensorboard.') return parser Loading megatron/training.py +17 −0 Original line number Diff line number Diff line Loading @@ -531,6 +531,23 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration, if args.log_timers_to_tensorboard: timers.write(timers_to_log, writer, iteration, normalizer=total_iterations) if args.log_memory_to_tensorboard: mem_stats = torch.cuda.memory_stats() writer.add_scalar( "mem-reserved-bytes", mem_stats["reserved_bytes.all.current"], iteration, ) writer.add_scalar( "mem-allocated-bytes", mem_stats["allocated_bytes.all.current"], iteration, ) writer.add_scalar( "mem-allocated-count", mem_stats["allocation.all.current"], iteration, ) if iteration % args.log_interval == 0: elapsed_time = timers('interval-time').elapsed() Loading Loading
megatron/arguments.py (+3 −0)

@@ -328,6 +328,9 @@ def _add_logging_args(parser):
                        action='store_true',
                        help='If set, write validation perplexity to '
                        'tensorboard.')
+    group.add_argument('--log-memory-to-tensorboard',
+                       action='store_true',
+                       help='Enable memory logging to tensorboard.')

     return parser
megatron/training.py (+17 −0)

@@ -531,6 +531,23 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration,
         if args.log_timers_to_tensorboard:
             timers.write(timers_to_log, writer, iteration,
                          normalizer=total_iterations)
+        if args.log_memory_to_tensorboard:
+            mem_stats = torch.cuda.memory_stats()
+            writer.add_scalar(
+                "mem-reserved-bytes",
+                mem_stats["reserved_bytes.all.current"],
+                iteration,
+            )
+            writer.add_scalar(
+                "mem-allocated-bytes",
+                mem_stats["allocated_bytes.all.current"],
+                iteration,
+            )
+            writer.add_scalar(
+                "mem-allocated-count",
+                mem_stats["allocation.all.current"],
+                iteration,
+            )

     if iteration % args.log_interval == 0:
         elapsed_time = timers('interval-time').elapsed()