Merge branch 'main_retriver_merge' into 'main' (68233932) · Commits · candle / Megatron-LM

examples/pretrain_ict.sh

0 → 100755

+44 −0

Original line number	Diff line number	Diff line
		#! /bin/bash

		# Runs the "217M" parameter biencoder model for ICT retriever.
		#
		# Every <...> placeholder below must be replaced with a real value before
		# the script is run.

		# NOTE(review): RANK and WORLD_SIZE are not referenced anywhere else in
		# this script.
		RANK=0
		WORLD_SIZE=1

		PRETRAINED_BERT_PATH=<Specify path of pretrained BERT model>
		TEXT_DATA_PATH=<Specify path and file prefix of the text data>
		TITLE_DATA_PATH=<Specify path and file prefix of the titles>
		CHECKPOINT_PATH=<Specify path>


		# Checkpoints are both loaded from and saved to CHECKPOINT_PATH.
		# Single-path arguments are quoted so paths containing spaces survive
		# word splitting; --data-path is left unquoted so that any value relying
		# on word splitting keeps behaving as before.
		python pretrain_ict.py \
		--num-layers 12 \
		--hidden-size 768 \
		--num-attention-heads 12 \
		--tensor-model-parallel-size 1 \
		--micro-batch-size 32 \
		--seq-length 256 \
		--max-position-embeddings 512 \
		--train-iters 100000 \
		--vocab-file bert-vocab.txt \
		--tokenizer-type BertWordPieceLowerCase \
		--DDP-impl torch \
		--bert-load "${PRETRAINED_BERT_PATH}" \
		--log-interval 100 \
		--eval-interval 1000 \
		--eval-iters 10 \
		--retriever-report-topk-accuracies 1 5 10 20 100 \
		--retriever-score-scaling \
		--load "${CHECKPOINT_PATH}" \
		--save "${CHECKPOINT_PATH}" \
		--data-path ${TEXT_DATA_PATH} \
		--titles-data-path "${TITLE_DATA_PATH}" \
		--lr 0.0001 \
		--lr-decay-style linear \
		--weight-decay 1e-2 \
		--clip-grad 1.0 \
		--lr-warmup-fraction 0.01 \
		--save-interval 4000 \
		--exit-interval 8000 \
		--query-in-block-prob 0.1 \
		--fp16

megatron/arguments.py

+16 −6

Original line number	Diff line number	Diff line
		@@ -39,7 +39,7 @@ def parse_args(extra_args_provider=None, defaults={},
		parser = _add_validation_args(parser)
		parser = _add_data_args(parser)
		parser = _add_autoresume_args(parser)
		parser = _add_realm_args(parser)
		parser = _add_biencoder_args(parser)
		parser = _add_vit_args(parser)
		parser = _add_logging_args(parser)

		@@ -672,13 +672,19 @@ def _add_autoresume_args(parser):
		return parser


		def _add_realm_args(parser):
		group = parser.add_argument_group(title='realm')
		def _add_biencoder_args(parser):
		group = parser.add_argument_group(title='biencoder')

		# network size
		group.add_argument('--ict-head-size', type=int, default=None,
		help='Size of block embeddings to be used in ICT and '
		'REALM (paper default: 128)')
		group.add_argument('--biencoder-projection-dim', type=int, default=0,
		help='Size of projection head used in biencoder (paper'
		' default: 128)')
		group.add_argument('--biencoder-shared-query-context-model', action='store_true',
		help='Whether to share the parameters of the query '
		'and context models or not')

		# checkpointing
		group.add_argument('--ict-load', type=str, default=None,
		@@ -697,8 +703,12 @@ def _add_realm_args(parser):
		help='Whether to use one sentence documents in ICT')

		# training
		group.add_argument('--report-topk-accuracies', nargs='+', default=[],
		help="Which top-k accuracies to report (e.g. '1 5 20')")
		group.add_argument('--retriever-report-topk-accuracies', nargs='+', type=int,
		default=[], help="Which top-k accuracies to report "
		"(e.g. '1 5 20')")
		group.add_argument('--retriever-score-scaling', action='store_true',
		help='Whether to scale retriever scores by inverse '
		'square root of hidden size')

		# faiss index
		group.add_argument('--faiss-use-gpu', action='store_true',

megatron/checkpointing.py

+31 −22

Original line number	Diff line number	Diff line
		@@ -206,6 +206,33 @@ def _transpose_first_dim(t, num_splits, num_splits_first, model):

		return t

		def fix_query_key_value_ordering(model, checkpoint_version):
		    """Fix up query/key/value matrix ordering for old checkpoints.

		    Nothing happens unless ``checkpoint_version`` is smaller than 2.0.
		    Otherwise every parameter returned by ``model.named_parameters()``
		    whose name ends with a fused ``query_key_value`` (3 splits) or
		    ``key_value`` (2 splits) weight/bias suffix is passed through
		    ``_transpose_first_dim`` and the result is copied back in place.

		    :param model: object exposing ``named_parameters()``; it is also
		        forwarded to ``_transpose_first_dim``.
		    :param checkpoint_version: version number of the loaded checkpoint.
		        Version 0 transposes with ``num_splits_first=True``, version 1.0
		        with ``num_splits_first=False``.
		    :return: None. Matching parameters are modified in place.

		    If a matching parameter is found while the version is below 2.0 but is
		    neither 0 nor 1.0, a message is printed and the process exits with a
		    non-zero status so that launchers can see the failure.
		    """
		    if not checkpoint_version < 2.0:
		        return

		    # (parameter-name suffixes, number of fused projections to split)
		    fused_layouts = (
		        (('.query_key_value.weight', '.query_key_value.bias'), 3),
		        (('.key_value.weight', '.key_value.bias'), 2),
		    )

		    for name, param in model.named_parameters():
		        for suffixes, num_splits in fused_layouts:
		            if not name.endswith(suffixes):
		                continue
		            if checkpoint_version == 0:
		                fixed_param = _transpose_first_dim(
		                    param.data, num_splits, True, model)
		            elif checkpoint_version == 1.0:
		                fixed_param = _transpose_first_dim(
		                    param.data, num_splits, False, model)
		            else:
		                print_rank_0(f"Invalid checkpoint version {checkpoint_version}.")
		                # Exit with a failure status; a bare sys.exit() reports
		                # success to the calling shell.
		                sys.exit(1)
		            param.data.copy_(fixed_param)

		    print_rank_0(" succesfully fixed query-key-values ordering for"
		                 " checkpoint version {}".format(checkpoint_version))

		def load_checkpoint(model, optimizer, lr_scheduler, load_arg='load', strict=True):
		"""Load a model checkpoint and return the iteration.
		strict (bool): whether to strictly enforce that the keys in
		@@ -308,28 +335,10 @@ def load_checkpoint(model, optimizer, lr_scheduler, load_arg='load', strict=True
		mpu.set_virtual_pipeline_model_parallel_rank(i)
		model[i].load_state_dict(state_dict['model%d' % i], strict=strict)

		# Fix up query/key/value matrix ordering
		if get_checkpoint_version() < 2.0:
		# Fix up query/key/value matrix ordering if needed
		checkpoint_version = get_checkpoint_version()
		for name, param in model.named_parameters():
		if name.endswith(('.query_key_value.weight', '.query_key_value.bias')):
		if checkpoint_version == 0:
		fixed_param = _transpose_first_dim(param.data, 3, True, model)
		elif checkpoint_version == 1.0:
		fixed_param = _transpose_first_dim(param.data, 3, False, model)
		else:
		print_rank_0(f"Invalid checkpoint version {checkpoint_version}.")
		sys.exit()
		param.data.copy_(fixed_param)
		if name.endswith(('.key_value.weight', '.key_value.bias')):
		if checkpoint_version == 0:
		fixed_param = _transpose_first_dim(param.data, 2, True, model)
		elif checkpoint_version == 1.0:
		fixed_param = _transpose_first_dim(param.data, 2, False, model)
		else:
		print_rank_0(f"Invalid checkpoint version {checkpoint_version}.")
		sys.exit()
		param.data.copy_(fixed_param)
		print_rank_0(f' checkpoint version {checkpoint_version}')
		fix_query_key_value_ordering(model, checkpoint_version)

		# Optimizer.
		if not release and not args.finetune and not args.no_load_optim:

megatron/data/biencoder_dataset_utils.py

0 → 100644

+171 −0

Original line number	Diff line number	Diff line
		import os
		import time

		import numpy as np
		import torch

		from megatron import mpu, print_rank_0
		from megatron.data.dataset_utils import create_masked_lm_predictions, pad_and_convert_to_numpy
		from megatron import get_args, get_tokenizer, print_rank_0, mpu


		def get_ict_batch(data_iterator):
		    """Fetch the next ICT batch and distribute it with ``mpu.broadcast_data``.

		    :param data_iterator: iterator yielding batch dictionaries, or ``None``
		        (in which case ``None`` is handed to the broadcast as the data).
		    :return: tuple ``(query_tokens, query_mask, context_tokens,
		        context_mask, block_indices)``. Token and block fields are cast
		        with ``.long()``; the masks are the result of comparing the
		        broadcast values with ``< 0.5``.
		    """
		    # Only pull a batch when an iterator was provided.
		    batch = next(data_iterator) if data_iterator is not None else None

		    # All fields are broadcast with the same int64 datatype.
		    field_names = ['query_tokens', 'query_mask',
		                   'context_tokens', 'context_mask', 'block_data']
		    received = mpu.broadcast_data(field_names, batch, torch.int64)

		    # Masks: True wherever the broadcast value is below 0.5.
		    query_mask = received['query_mask'] < 0.5
		    context_mask = received['context_mask'] < 0.5

		    query_tokens = received['query_tokens'].long()
		    context_tokens = received['context_tokens'].long()
		    block_indices = received['block_data'].long()

		    return (query_tokens, query_mask,
		            context_tokens, context_mask, block_indices)


		def join_str_list(str_list):
		    """Join a list of strings, handling spaces appropriately.

		    A piece starting with ``"##"`` is glued to the preceding text with the
		    two-character marker removed; every other piece is preceded by a single
		    space. As a consequence the result starts with a space whenever the
		    first piece has no ``"##"`` prefix.

		    :param str_list: iterable of strings.
		    :return: the joined string (``""`` for an empty input).
		    """
		    # Collect the pieces and join once instead of growing a string with +=.
		    pieces = [s[2:] if s.startswith("##") else " " + s for s in str_list]
		    return "".join(pieces)


		class BlockSampleData(object):
		    """Plain record describing one fixed-size block of data as used in REALM.

		    :param start_idx: for first sentence of the block
		    :param end_idx: for last sentence of the block (may be partially
		        truncated in sample construction)
		    :param doc_idx: the index of the document from which the block comes
		        in the original indexed dataset
		    :param block_idx: a unique integer identifier given to every block.
		    """
		    def __init__(self, start_idx, end_idx, doc_idx, block_idx):
		        self.start_idx, self.end_idx = start_idx, end_idx
		        self.doc_idx, self.block_idx = doc_idx, block_idx

		    def as_array(self):
		        """Return the four fields, in constructor order, as an int64 array."""
		        fields = [self.start_idx, self.end_idx, self.doc_idx, self.block_idx]
		        packed = np.array(fields)
		        return packed.astype(np.int64)

		    def as_tuple(self):
		        """Return the four fields, in constructor order, as a tuple."""
		        return (self.start_idx, self.end_idx, self.doc_idx, self.block_idx)


		class BlockSamplesMapping(object):
		    """Indexable wrapper around a 2-D mapping array with four columns.

		    Indexing returns a ``BlockSampleData`` built from the selected row;
		    ``len()`` is the number of rows.
		    """
		    def __init__(self, mapping_array):
		        # Each row has to unpack into the four BlockSampleData fields.
		        num_columns = mapping_array.shape[1]
		        assert num_columns == 4
		        self.mapping_array = mapping_array

		    def __len__(self):
		        num_rows = self.mapping_array.shape[0]
		        return num_rows

		    def __getitem__(self, idx):
		        """Get the data associated with an indexed sample."""
		        row = self.mapping_array[idx]
		        return BlockSampleData(*row)


		def get_block_samples_mapping(block_dataset, title_dataset, data_prefix, num_epochs,
		max_num_samples, max_seq_length, seed, name, use_one_sent_docs=False):
		"""Get samples mapping for a dataset over fixed size blocks. This function also requires
		a dataset of the titles for the source documents since their lengths must be taken into account.

		The mapping is cached on disk as a ``.npy`` file whose name is derived from
		``data_prefix``, ``name``, the epoch / sample limits, ``max_seq_length``,
		``seed`` and ``use_one_sent_docs``. If that file does not exist it is built
		with ``helpers.build_blocks_mapping`` and saved; afterwards the file is
		loaded with ``np.load(..., mmap_mode='r')``.

		:param block_dataset: dataset exposing ``doc_idx`` (asserted int64) and
		``sizes`` (asserted int32).
		:param title_dataset: dataset whose ``sizes`` are passed to the builder.
		:param data_prefix: prefix of the index-map filename.
		:param num_epochs: epoch limit; when falsy a large sentinel is substituted.
		:param max_num_samples: sample limit; when falsy a large sentinel is substituted.
		:param max_seq_length: the builder receives ``max_seq_length - 3``.
		:param seed: forwarded to the builder and embedded in the filename.
		:param name: embedded in the filename and in log messages.
		:param use_one_sent_docs: forwarded to the builder; adds ``_1sentok`` to the filename.
		:raises ValueError: when neither ``num_epochs`` nor ``max_num_samples`` is given.
		:return: samples_mapping (BlockSamplesMapping)
		"""

		# At least one of the two limits must be provided; a missing limit is
		# replaced by a "max value - 1" sentinel.
		if not num_epochs:
		if not max_num_samples:
		raise ValueError("Need to specify either max_num_samples "
		"or num_epochs")
		num_epochs = np.iinfo(np.int32).max - 1
		if not max_num_samples:
		max_num_samples = np.iinfo(np.int64).max - 1

		# Filename of the index mapping
		# (limits still equal to their sentinel are left out of the name)
		indexmap_filename = data_prefix
		indexmap_filename += '_{}_indexmap'.format(name)
		if num_epochs != (np.iinfo(np.int32).max - 1):
		indexmap_filename += '_{}ep'.format(num_epochs)
		if max_num_samples != (np.iinfo(np.int64).max - 1):
		indexmap_filename += '_{}mns'.format(max_num_samples)
		indexmap_filename += '_{}msl'.format(max_seq_length)
		indexmap_filename += '_{}s'.format(seed)
		if use_one_sent_docs:
		indexmap_filename += '_1sentok'
		indexmap_filename += '.npy'

		# Build the indexed mapping if not exist.
		if mpu.get_data_parallel_rank() == 0 and \
		not os.path.isfile(indexmap_filename):
		print(' > WARNING: could not find index map file {}, building '
		'the indices on rank 0 ...'.format(indexmap_filename))

		# Make sure the types match the helpers input types.
		assert block_dataset.doc_idx.dtype == np.int64
		assert block_dataset.sizes.dtype == np.int32

		# Build samples mapping
		verbose = torch.distributed.get_rank() == 0
		start_time = time.time()
		print_rank_0(' > building samples index mapping for {} ...'.format(
		name))

		# Imported here rather than at module level.
		from megatron.data import helpers
		mapping_array = helpers.build_blocks_mapping(
		block_dataset.doc_idx,
		block_dataset.sizes,
		title_dataset.sizes,
		num_epochs,
		max_num_samples,
		max_seq_length - 3, # account for added tokens
		seed,
		verbose,
		use_one_sent_docs)


		print_rank_0(' > done building samples index mapping')
		np.save(indexmap_filename, mapping_array, allow_pickle=True)
		print_rank_0(' > saved the index mapping in {}'.format(
		indexmap_filename))
		# Make sure all the ranks have built the mapping
		# NOTE(review): '{:4f}' sets a minimum field width of 4, not 4 decimal
		# places -- presumably '{:.4f}' was intended; confirm.
		print_rank_0(' > elapsed time to build and save samples mapping '
		'(seconds): {:4f}'.format(
		time.time() - start_time))

		# This should be a barrier but nccl barrier assumes
		# device_index=rank which is not the case for model
		# parallel case
		# Each rank contributes 1; the reduced value must equal the size of the
		# data parallel group.
		counts = torch.cuda.LongTensor([1])
		torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group())
		assert counts[0].item() == torch.distributed.get_world_size(
		group=mpu.get_data_parallel_group())

		# Load indexed dataset.
		print_rank_0(' > loading indexed mapping from {}'.format(
		indexmap_filename))
		start_time = time.time()

		# Memory-mapped read-only, so the array is not read fully into memory here.
		mapping_array = np.load(indexmap_filename, allow_pickle=True, mmap_mode='r')
		samples_mapping = BlockSamplesMapping(mapping_array)

		print_rank_0(' loaded indexed file in {:3.3f} seconds'.format(
		time.time() - start_time))
		print_rank_0(' total number of samples: {}'.format(
		mapping_array.shape[0]))

		return samples_mapping

megatron/data/ict_dataset.py

+20 −4

Original line number	Diff line number	Diff line
		@@ -9,6 +9,16 @@ from megatron import get_args
		from megatron.data.dataset_utils import get_indexed_dataset_
		from megatron.data.realm_dataset_utils import get_block_samples_mapping

		def make_attention_mask(source_block, target_block):
		    """Build a 2-D attention mask from two 1-D arrays.

		    Entry ``[i, j]`` is 1 when both ``source_block[i] >= 1`` and
		    ``target_block[j] >= 1``, and 0 otherwise.

		    :param source_block: 1-D array
		    :param target_block: 1-D array
		    :return: int64 array of shape (source_length, target_length)
		    """
		    # Column vector of valid source positions, row vector of valid targets.
		    source_valid = source_block[:, None] >= 1
		    target_valid = target_block[None, :] >= 1

		    # Broadcasting the two boolean vectors yields the full 2-D grid.
		    both_valid = target_valid * source_valid
		    return both_valid.astype(np.int64)

		def get_ict_dataset(use_titles=True, query_in_block_prob=1):
		"""Get a dataset which uses block samples mappings to get ICT/block indexing data (via get_block())
		@@ -39,7 +49,7 @@ class ICTDataset(Dataset):
		"""Dataset containing sentences and their blocks for an inverse cloze task."""
		def __init__(self, name, block_dataset, title_dataset, data_prefix,
		num_epochs, max_num_samples, max_seq_length, query_in_block_prob,
		seed, use_titles=True, use_one_sent_docs=False):
		seed, use_titles=True, use_one_sent_docs=False, binary_head=False):
		self.name = name
		self.seed = seed
		self.max_seq_length = max_seq_length
		@@ -93,14 +103,20 @@ class ICTDataset(Dataset):
		block = list(itertools.chain(*block))[:self.max_seq_length - title_pad_offset]

		query_tokens, query_pad_mask = self.concat_and_pad_tokens(query)
		block_tokens, block_pad_mask = self.concat_and_pad_tokens(block, title)
		context_tokens, context_pad_mask = self.concat_and_pad_tokens(block, title)

		query_mask = make_attention_mask(query_tokens, query_tokens)
		context_mask = make_attention_mask(context_tokens, context_tokens)

		block_data = sample_data.as_array()

		sample = {
		'query_tokens': query_tokens,
		'query_mask': query_mask,
		'query_pad_mask': query_pad_mask,
		'block_tokens': block_tokens,
		'block_pad_mask': block_pad_mask,
		'context_tokens': context_tokens,
		'context_mask': context_mask,
		'context_pad_mask': context_pad_mask,
		'block_data': block_data,
		}