Commit 24b7c3c0 authored by Lawrence McAfee
Browse files

Add memory stats (reserved bytes, allocated bytes, allocation count) to TensorBoard logging

parent 90e0a0dd
Loading
Loading
Loading
Loading
+4 −0
Original line number Diff line number Diff line
@@ -328,6 +328,10 @@ def _add_logging_args(parser):
                       action='store_true',
                       help='If set, write validation perplexity to '
                       'tensorboard.')
    group.add_argument('--no-log-memory-to-tensorboard',
                       action='store_false',
                       help='Disable memory logging to tensorboard.',
                       dest='log_memory_to_tensorboard')

    return parser

+17 −0
Original line number Diff line number Diff line
@@ -531,6 +531,23 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration,
        if args.log_timers_to_tensorboard:
            timers.write(timers_to_log, writer, iteration,
                         normalizer=total_iterations)
        if args.log_memory_to_tensorboard:
            mem_stats = torch.cuda.memory_stats()
            writer.add_scalar(
                "mem-reserved-bytes",
                mem_stats["reserved_bytes.all.current"],
                iteration,
            )
            writer.add_scalar(
                "mem-allocated-bytes",
                mem_stats["allocated_bytes.all.current"],
                iteration,
            )
            writer.add_scalar(
                "mem-allocated-count",
                mem_stats["allocation.all.current"],
                iteration,
            )

    if iteration % args.log_interval == 0:
        elapsed_time = timers('interval-time').elapsed()