megatron/arguments.py  +9 −0

@@ -601,6 +601,15 @@ def _add_distributed_args(parser):
     group.add_argument('--use-cpu-initialization', action='store_true',
                        default=None, help='If set, affine parallel weights '
                        'initialization uses CPU' )
+    # group.add_argument('--empty-unused-memory-each-iter', action='store_true',
+    #                    help='Call torch.cuda.empty_cache() each iteration '
+    #                    '(training and eval), to reduce fragmentation',
+    #                    default=False)
+    group.add_argument('--empty-unused-memory-each-iter',
+                       default=0, type=int, choices=[0, 1, 2],
+                       help='Call torch.cuda.empty_cache() each iteration '
+                            '(training and eval), to reduce fragmentation. '
+                            '0=off, 1=moderate, 2=aggressive.')
     return parser
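For reference, a minimal stand-alone sketch of how the new tri-state flag parses, assuming nothing beyond argparse itself. The parser below is illustrative and is not part of the patch; in Megatron the option is registered on the distributed-args group inside _add_distributed_args, as shown in the diff above.

import argparse

# Illustrative stand-alone parser, not the Megatron one.
parser = argparse.ArgumentParser()
parser.add_argument('--empty-unused-memory-each-iter',
                    default=0, type=int, choices=[0, 1, 2],
                    help='Call torch.cuda.empty_cache() each iteration '
                         '(training and eval), to reduce fragmentation. '
                         '0=off, 1=moderate, 2=aggressive.')

args = parser.parse_args(['--empty-unused-memory-each-iter', '2'])
assert args.empty_unused_memory_each_iter == 2  # dashes become underscores
# Any value outside choices=[0, 1, 2] (e.g. 3) makes parse_args() exit with an error.

Replacing the earlier boolean store_true form (left commented out above) with an integer keeps the default behavior unchanged (0, off) while letting the two cache-clearing sites in training.py be enabled separately.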
megatron/training.py  +8 −0

@@ -362,6 +362,10 @@ def train_step(forward_step_func, data_iterator,
         forward_step_func, data_iterator, model,
         optimizer, timers, forward_only=False)
 
+    # Empty unused memory
+    if args.empty_unused_memory_each_iter >= 2:
+        torch.cuda.empty_cache()
+
     # All-reduce if needed.
     if args.DDP_impl == 'local':
         timers('backward-params-all-reduce').start()

@@ -408,6 +412,10 @@ def train_step(forward_step_func, data_iterator,
     else:
         skipped_iter = 1
 
+    # Empty unused memory
+    if args.empty_unused_memory_each_iter >= 1:
+        torch.cuda.empty_cache()
+
     if mpu.is_pipeline_last_stage(ignore_virtual=True):
         # Average loss across microbatches.
         loss_reduced = {}
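Putting the two hunks together: level 2 empties the CUDA cache right after the forward/backward pass (before the gradient all-reduce and optimizer step), and level 1 and above empties it once per iteration after the optimizer step. The sketch below shows that control flow in simplified form; train_step_sketch, run_forward_backward and run_optimizer_step are hypothetical stand-ins, not the real Megatron function or its callees.

import torch

def train_step_sketch(args, run_forward_backward, run_optimizer_step):
    # Hypothetical stand-ins: run_forward_backward() performs the forward and
    # backward passes over all micro-batches; run_optimizer_step() returns True
    # when the update succeeds (e.g. no FP16 overflow).
    losses_reduced = run_forward_backward()

    # Level 2 (aggressive): release cached allocator blocks right after backward,
    # before the gradient all-reduce and the optimizer step.
    if args.empty_unused_memory_each_iter >= 2:
        torch.cuda.empty_cache()

    skipped_iter = 0 if run_optimizer_step() else 1

    # Level 1 and above (moderate): release cached blocks once per iteration,
    # after the optimizer step and before losses are averaged.
    if args.empty_unused_memory_each_iter >= 1:
        torch.cuda.empty_cache()

    return losses_reduced, skipped_iter

torch.cuda.empty_cache() returns cached, unused blocks to the driver, which can reduce fragmentation across iterations at the cost of re-allocating through the CUDA allocator later, which is why the default stays at 0.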