Commit 68797d90 authored by Jared Casper

Merge branch 'lmcafee/empty-cache' into 'main'

Bug fix for previous MR (flag to call empty_cache() each iteration, to reduce fragmentation): renames args.empty_unused_memory_each_iter to args.empty_unused_memory_level.

See merge request ADLR/megatron-lm!309
parents 8e8d7603 db08a636
@@ -363,7 +363,7 @@ def train_step(forward_step_func, data_iterator,
         optimizer, timers, forward_only=False)
 
     # Empty unused memory
-    if args.empty_unused_memory_each_iter >= 1:
+    if args.empty_unused_memory_level >= 1:
         torch.cuda.empty_cache()
 
     # All-reduce if needed.
@@ -413,7 +413,7 @@ def train_step(forward_step_func, data_iterator,
         skipped_iter = 1
 
     # Empty unused memory
-    if args.empty_unused_memory_each_iter >= 2:
+    if args.empty_unused_memory_level >= 2:
         torch.cuda.empty_cache()
 
     if mpu.is_pipeline_last_stage(ignore_virtual=True):
@@ -725,7 +725,7 @@ def evaluate(forward_step_func, data_iterator, model, verbose=False):
                 timers=None, forward_only=True)
 
             # Empty unused memory
-            if args.empty_unused_memory_each_iter >= 1:
+            if args.empty_unused_memory_level >= 1:
                 torch.cuda.empty_cache()
 
             if mpu.is_pipeline_last_stage(ignore_virtual=True):
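
For context, a minimal sketch of how the renamed option might be registered and consumed. Only the attribute name args.empty_unused_memory_level and the two thresholds (>= 1, >= 2) are confirmed by the diff above; the argparse wiring and the helper names add_memory_args and maybe_empty_cache are illustrative assumptions, not code from this MR.

import argparse

import torch


def add_memory_args(parser):
    # Assumed registration of the flag; argparse maps the option string
    # to the destination args.empty_unused_memory_level seen in the diff.
    parser.add_argument('--empty-unused-memory-level', type=int, default=0,
                        choices=[0, 1, 2],
                        help='Call torch.cuda.empty_cache() each iteration '
                             'to reduce fragmentation: 0=off, 1=after each '
                             'forward/backward pass, 2=also after the '
                             'optimizer step.')
    return parser


def maybe_empty_cache(args, level):
    # Release cached allocator blocks back to the CUDA driver when the
    # configured level is at least `level`; this trades allocation speed
    # for lower memory fragmentation.
    if torch.cuda.is_available() and args.empty_unused_memory_level >= level:
        torch.cuda.empty_cache()


args = add_memory_args(argparse.ArgumentParser()).parse_args(
    ['--empty-unused-memory-level', '1'])
maybe_empty_cache(args, level=1)  # runs: configured level 1 >= required 1
maybe_empty_cache(args, level=2)  # no-op: configured level 1 < required 2

Gating on an integer level rather than a simple on/off switch lets level 1 empty the cache only after each forward/backward pass (in both training and eval), while level 2 additionally empties it after the optimizer step, matching the two comparisons in the diff.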