Commit 3bd2e973 authored by Lawrence McAfee's avatar Lawrence McAfee
Browse files

added flag/logic for emptying unused memory

parent 78a69eaf
Loading
Loading
Loading
Loading
+9 −0
Original line number Diff line number Diff line
@@ -601,6 +601,15 @@ def _add_distributed_args(parser):
    group.add_argument('--use-cpu-initialization', action='store_true',
                       default=None, help='If set, affine parallel weights '
                       'initialization uses CPU' )
    # Integer level rather than an on/off switch: the training step compares
    # this value against 1 and 2 to decide at which points to empty the CUDA
    # cache.
    group.add_argument('--empty-unused-memory-each-iter', default=0, type=int,
                       choices=[0, 1, 2],
                       help='Call torch.cuda.empty_cache() each iteration '
                       '(training and eval), to reduce fragmentation. '
                       '0=off, 1=moderate, 2=aggressive.')
    return parser


+9 −0
Original line number Diff line number Diff line
@@ -362,6 +362,11 @@ def train_step(forward_step_func, data_iterator,
        forward_step_func, data_iterator, model,
        optimizer, timers, forward_only=False)

    # Empty unused memory. Only the highest level (2) empties the cache at
    # this point, before the all-reduce step that follows; lower levels skip
    # it.
    if args.empty_unused_memory_each_iter >= 2:
        torch.cuda.empty_cache()

    # All-reduce if needed.
    if args.DDP_impl == 'local':
        timers('backward-params-all-reduce').start()
@@ -408,6 +413,10 @@ def train_step(forward_step_func, data_iterator,
    else:
        skipped_iter = 1

    # Empty unused memory. Runs for every non-zero level of
    # --empty-unused-memory-each-iter (1 or 2); level 0 leaves the cache
    # untouched.
    if args.empty_unused_memory_each_iter >= 1:
        torch.cuda.empty_cache()

    if mpu.is_pipeline_last_stage(ignore_virtual=True):
        # Average loss across microbatches.
        loss_reduced = {}