megatron/model/__init__.py (+1 −1)

```diff
@@ -15,7 +15,7 @@
 from .fused_layer_norm import MixedFusedLayerNorm as LayerNorm

-from .distributed import *
+from .distributed import DistributedDataParallel
 from .bert_model import BertModel
 from .gpt_model import GPTModel
 from .language_model import get_language_model
```
megatron/schedules.py (+6 −3)

```diff
@@ -22,7 +22,9 @@
 from megatron import get_num_microbatches
 from megatron import get_timers
 from megatron import mpu
 from megatron import p2p_communication
+from megatron.utils import unwrap_model
+from megatron.model import DistributedDataParallel as LocalDDP
+from megatron.model import Float16Module


 def get_forward_backward_func():
     args = get_args()
@@ -46,8 +48,9 @@
     timers = get_timers()
     timers('forward-compute').start()
-    # TODO
-    model.module.module.set_input_tensor(input_tensor)
+    unwrapped_model = unwrap_model(
+        model, (torchDDP, LocalDDP, Float16Module))
+    unwrapped_model.set_input_tensor(input_tensor)
     output_tensor, loss_func = forward_step_func(data_iterator, model)
     if mpu.is_pipeline_last_stage():
         output_tensor = loss_func(output_tensor)
```
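In both hunks the fix is the same: the hard-coded `model.module.module.set_input_tensor(...)` assumed exactly two levels of wrapping, which breaks as soon as the model is wrapped differently (for example fp16 plus DDP versus DDP alone). `unwrap_model` is imported from `megatron.utils`, but its body is not part of this diff; a minimal sketch of the assumed implementation, relying only on each wrapper class exposing the wrapped model as `.module`:

```python
def unwrap_model(model, module_instances):
    """Minimal sketch of megatron.utils.unwrap_model (assumed, not shown
    in this diff).

    torchDDP, Megatron's local DistributedDataParallel, and Float16Module
    all keep the model they wrap on a `.module` attribute, so we peel
    that attribute off until the object is no longer an instance of any
    of the given wrapper classes.
    """
    while isinstance(model, module_instances):
        model = model.module
    return model
```

Note that only `set_input_tensor` goes through the unwrapped model; the forward pass still calls the wrapped `model`, so DDP's gradient hooks and Float16Module's input casting stay in effect.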
megatron/text_generation_utils.py (+9 −2)

```diff
@@ -26,9 +26,14 @@
 import torch.nn.functional as F

 from megatron import get_args
 from megatron import get_tokenizer
 from megatron import mpu
-from megatron.utils import get_ltor_masks_and_position_ids
+from megatron.utils import get_ltor_masks_and_position_ids, unwrap_model
 from megatron.p2p_communication import recv_forward, send_forward

+# These are needed to unwrap the model, would be nice to put these in megatron.utils if possible?
+from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP
+from megatron.model import DistributedDataParallel as LocalDDP
+from megatron.model import Float16Module


 def get_batch(context_tokens):
     """Generate batch from context tokens."""
     args = get_args()
@@ -403,7 +408,9 @@
     input_tensor = recv_forward()

     # Forward pass through the model.
-    model.set_input_tensor(input_tensor)
+    unwrapped_model = unwrap_model(
+        model, (torchDDP, LocalDDP, Float16Module))
+    unwrapped_model.set_input_tensor(input_tensor)
     output_tensor = model(tokens, position_ids, attention_mask,
                           tokentype_ids=tokentype_ids,
                           layer_past=layer_past,
```
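`set_input_tensor` itself is defined on the underlying model rather than in this diff. A hypothetical, self-contained module sketching the contract these call sites rely on (the class and its layers are illustrative assumptions, not Megatron code): the first pipeline stage embeds the tokens, while later stages ignore the token arguments and start from the activation stashed by `set_input_tensor`.

```python
import torch


class PipelineStageModule(torch.nn.Module):
    """Hypothetical illustration of the set_input_tensor contract."""

    def __init__(self, vocab_size=1000, hidden=64, pre_process=True):
        super().__init__()
        self.pre_process = pre_process           # True only on the first pipeline stage
        self.input_tensor = None
        self.embedding = torch.nn.Embedding(vocab_size, hidden)
        self.body = torch.nn.Linear(hidden, hidden)  # stand-in for the transformer layers

    def set_input_tensor(self, input_tensor):
        # Receives the activation sent by the previous stage
        # (None on the first stage, where recv_forward() yields nothing).
        self.input_tensor = input_tensor

    def forward(self, tokens):
        if self.pre_process:
            hidden = self.embedding(tokens)      # first stage embeds the tokens
        else:
            hidden = self.input_tensor           # later stages ignore `tokens`
        return self.body(hidden)
```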
tasks/finetune_utils.py (+3 −0)

```diff
@@ -231,6 +231,9 @@
     args = get_args()
     timers = get_timers()

+    assert args.rampup_batch_size is None, \
+        'batch size scaling is not supported for finetuning'
+
     # Train and validation data loaders.
     timers('train/valid/test dataset/dataloder').start()
     if args.epochs > 0:
```
tasks/zeroshot_gpt/evaluate.py (+13 −24)

```diff
@@ -24,20 +24,24 @@
 from megatron import print_rank_0, is_last_rank
 from megatron import get_tokenizer
 from megatron import mpu
 from megatron.checkpointing import load_checkpoint
-from megatron.model import GPTModel, GPTModelFirstStage, GPTModelLastStage, GPTModelIntermediateStage
+from megatron.model import GPTModel
 from megatron.training import get_model
-from megatron.utils import get_ltor_masks_and_position_ids
+from megatron.utils import get_ltor_masks_and_position_ids, unwrap_model
 from megatron.p2p_communication import recv_forward, send_forward
 from tasks.finetune_utils import build_data_loader

 from .datasets import build_dataset

+# These are needed to unwrap the model, would be nice to put these in megatron.utils if possible?
+from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP
+from megatron.model import DistributedDataParallel as LocalDDP
+from megatron.model import Float16Module


 def get_model_provider(eval_metric):
     """Based on evaluation metric set the parallel-output flag and
     return the model provider."""

-    def model_provider():
+    def model_provider(pre_process=True, post_process=True):
         """Build the model."""

         if eval_metric == 'loss':
@@ -49,17 +53,8 @@
                 'is not supported.'.format(eval_metric))

         print_rank_0('building GPT model ...')
-        if mpu.get_pipeline_model_parallel_world_size() > 1:
-            # Determine model based on position of stage in pipeline.
-            if mpu.is_pipeline_first_stage():
-                model = GPTModelFirstStage(num_tokentypes=0)
-            elif mpu.is_pipeline_last_stage():
-                model = GPTModelLastStage(
-                    parallel_output=parallel_output, num_tokentypes=0)
-            else:
-                model = GPTModelIntermediateStage(num_tokentypes=0)
-        else:
-            model = GPTModel(num_tokentypes=0, parallel_output=parallel_output)
+        model = GPTModel(num_tokentypes=0, parallel_output=parallel_output,
+                         pre_process=pre_process, post_process=post_process)

         return model
@@ -98,19 +93,13 @@
     args = get_args()
     args.micro_batch_size = len(labels)

-    # Forward model.
     input_tensor = recv_forward()

     # Forward pass through the model.
-    if mpu.is_pipeline_first_stage():
-        assert input_tensor is None
-        if mpu.is_pipeline_last_stage():
-            output = model(tokens, position_ids, attention_mask)
-        else:
-            output = model(tokens, position_ids, attention_mask)
-    else:
-        assert input_tensor is not None
-        output = model(input_tensor, attention_mask)
+    unwrapped_model = unwrap_model(
+        model, (torchDDP, LocalDDP, Float16Module))
+    unwrapped_model.set_input_tensor(input_tensor)
+    output = model(tokens, position_ids, attention_mask)

     send_forward(output)
```
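With this change the three stage-specific classes (`GPTModelFirstStage`, `GPTModelIntermediateStage`, `GPTModelLastStage`) collapse into a single `GPTModel` parameterized by `pre_process` and `post_process`. The diff does not show who supplies those flags to `model_provider`; presumably `megatron.training.get_model` derives them from the pipeline rank, roughly as in this assumed sketch:

```python
from megatron import mpu


def build_stage_model(model_provider):
    # Assumed wiring inside megatron.training.get_model (not shown in this
    # diff): each pipeline rank asks the provider for a model that only
    # builds the pieces this stage owns.
    pre_process = mpu.is_pipeline_first_stage()    # stage owns the embedding
    post_process = mpu.is_pipeline_last_stage()    # stage owns the logits/loss head
    return model_provider(pre_process=pre_process,
                          post_process=post_process)
```

`forward_step` then loses its per-stage branching entirely: every stage makes the same `model(tokens, position_ids, attention_mask)` call, and a non-first stage simply ignores the token arguments in favor of the tensor set by `set_input_tensor`.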