more cleanup (8fc5e323) · Commits · candle / Megatron-LM

megatron/schedules.py

+9 −60

Original line number	Diff line number	Diff line
		@@ -28,10 +28,6 @@ from megatron.model import DistributedDataParallel as LocalDDP
		from megatron.model import Float16Module
		from megatron.model import ModelType

		# >>>
		from lutil import pax, tp, KEY_RANK
		# <<<

		def get_forward_backward_func():
		args = get_args()
		if mpu.get_pipeline_model_parallel_world_size() > 1:
		@@ -46,36 +42,6 @@ def get_forward_backward_func():
		forward_backward_func = forward_backward_no_pipelining
		return forward_backward_func

		# >>>
		# def free_output_tensor(output_tensors, deallocate_pipeline_outputs):
		# '''Pseudo-free (i.e., set to scalar) the output tensor's '.data' field.

		# This method should be called right after the output tensor has been
		# sent to the next pipeline stage. At this point, the output tensor is
		# only useful for its '.grad_fn' field, and not its '.data'.
		# '''
		# # >>>
		# # raise Exception("hi.")
		# # <<<
		# if not deallocate_pipeline_outputs or output_tensors is None:
		# return
		# if isinstance(output_tensors, torch.Tensor):
		# output_tensors = [output_tensors]
		# for output_tensor in output_tensors:
		# # >>>
		# # if output_tensor.nelement() < 10:
		# # # raise Exception("interesting.")
		# # continue
		# # <<<
		# # >>>
		# # output_tensor.data = torch.cuda.FloatTensor([0])
		# output_tensor.data = torch.empty(
		# (1,),
		# device = torch.cuda.current_device(),
		# dtype = output_tensor.dtype,
		# )
		# # <<<
		# <<<
		def deallocate_output_tensor(out):
		'''Pseudo-deallocate (i.e., set to scalar) the output tensor's '.data' field.

		@@ -118,8 +84,6 @@ def custom_backward(output, grad_output):
		)

		# Call C++ engine [ see torch/csrc/autograd/python_engine.cpp ]
		# >>>
		try:
		Variable._execution_engine.run_backward(
		tensors = (output,),
		grad_tensors = (grad_output,),
		@@ -129,10 +93,6 @@ def custom_backward(output, grad_output):
		allow_unreachable=True,
		accumulate_grad=True,
		)
		except Exception as e:
		print(">>>> rank = %d. <<<<" % torch.distributed.get_rank())
		raise e
		# <<<


		def forward_step(forward_step_func, data_iterator, model, input_tensor, losses_reduced):
		@@ -163,14 +123,6 @@ def forward_step(forward_step_func, data_iterator, model, input_tensor, losses_r
		losses_reduced.append(loss_reduced)
		timers('forward-compute').stop()

		# >>>
		# if torch.distributed.get_rank() == 4:
		# pax(4, {
		# "output_tensor" : tp(output_tensor),
		# "input_tensor[-1]" : tp(input_tensor[-1]),
		# })
		# <<<

		# If this is a T5 model (or another model with both an encoder and a
		# decoder) and we are in the decoder stack, then send
		# encoder_hidden_state downstream as well.
		@@ -425,9 +377,6 @@ def forward_backward_pipelining_with_interleaving(forward_step_func, data_iterat
		tensor_shape=tensor_shape,
		timers=timers)
		input_tensors[next_forward_model_chunk_id].append(input_tensor)
		# >>>
		pax({"output_tensor": output_tensor})
		# <<<
		deallocate_output_tensor(output_tensor)

		# Run 1F1B in steady state.