Loading megatron/training.py +3 −3 Original line number Diff line number Diff line Loading @@ -363,7 +363,7 @@ def train_step(forward_step_func, data_iterator, optimizer, timers, forward_only=False) # Empty unused memory if args.empty_unused_memory_each_iter >= 1: if args.empty_unused_memory_level >= 1: torch.cuda.empty_cache() # All-reduce if needed. Loading Loading @@ -413,7 +413,7 @@ def train_step(forward_step_func, data_iterator, skipped_iter = 1 # Empty unused memory if args.empty_unused_memory_each_iter >= 2: if args.empty_unused_memory_level >= 2: torch.cuda.empty_cache() if mpu.is_pipeline_last_stage(ignore_virtual=True): Loading Loading @@ -725,7 +725,7 @@ def evaluate(forward_step_func, data_iterator, model, verbose=False): timers=None, forward_only=True) # Empty unused memory if args.empty_unused_memory_each_iter >= 1: if args.empty_unused_memory_level >= 1: torch.cuda.empty_cache() if mpu.is_pipeline_last_stage(ignore_virtual=True): Loading Loading
megatron/training.py +3 −3 Original line number Diff line number Diff line Loading @@ -363,7 +363,7 @@ def train_step(forward_step_func, data_iterator, optimizer, timers, forward_only=False) # Empty unused memory if args.empty_unused_memory_each_iter >= 1: if args.empty_unused_memory_level >= 1: torch.cuda.empty_cache() # All-reduce if needed. Loading Loading @@ -413,7 +413,7 @@ def train_step(forward_step_func, data_iterator, skipped_iter = 1 # Empty unused memory if args.empty_unused_memory_each_iter >= 2: if args.empty_unused_memory_level >= 2: torch.cuda.empty_cache() if mpu.is_pipeline_last_stage(ignore_virtual=True): Loading Loading @@ -725,7 +725,7 @@ def evaluate(forward_step_func, data_iterator, model, verbose=False): timers=None, forward_only=True) # Empty unused memory if args.empty_unused_memory_each_iter >= 1: if args.empty_unused_memory_level >= 1: torch.cuda.empty_cache() if mpu.is_pipeline_last_stage(ignore_virtual=True): Loading