Commit 4ac91436 authored by Jared Casper

Merge branch 'indexer-merge' into 'master'

REALM indexer and index data structures

See merge request ADLR/megatron-lm!96
parents 07ebf714 33a666d4
+25 −0
@@ -319,6 +319,31 @@ python pretrain_ict.py \
    
</pre>

### Building an Index of Block Embeddings
After training an ICT model, you can embed an entire dataset of blocks by creating a `BlockData` structure. Once saved, this data can be loaded and wrapped with a `FaissMIPSIndex` to do fast maximum inner product (similarity) search, which is key to the learned information retrieval pipeline. The initial index can be built with the following script, which is meant to be run in an interactive session; it can leverage multiple GPUs on multiple nodes to index large datasets much more quickly.

<pre>
python tools/create_doc_index.py \
    --num-layers 12 \
    --hidden-size 768 \
    --ict-head-size 128 \
    --num-attention-heads 12 \
    --batch-size 128 \
    --checkpoint-activations \
    --seq-length 256 \
    --max-position-embeddings 256 \
    --ict-load /path/to/pretrained_ict \
    --data-path /path/to/indexed_dataset \
    --titles-data-path /path/to/titles_indexed_dataset \
    --block-data-path embedded_blocks.pkl \
    --indexer-log-interval 1000 \
    --indexer-batch-size 128 \
    --vocab-file /path/to/vocab.txt \
    --num-workers 2 \
    --fp16
</pre>
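
Once `embedded_blocks.pkl` is written, the saved `BlockData` can be loaded and wrapped with a `FaissMIPSIndex` for retrieval. As a rough illustration, a query flow might look like the sketch below; the module path and method names (`load_from_file`, `search_mips_index`) are assumptions about the index code added in this MR, not a documented API.

<pre>
# Hedged sketch: the module path and method names below are assumptions
# about the index code added in this MR, not a documented API.
import torch
from megatron.data.realm_index import BlockData, FaissMIPSIndex

# Load the block embeddings written by tools/create_doc_index.py.
block_data = BlockData.load_from_file('embedded_blocks.pkl')

# Wrap them in a FAISS max-inner-product-search index; use_gpu mirrors
# the --faiss-use-gpu flag.
mips_index = FaissMIPSIndex(embed_size=128, block_data=block_data,
                            use_gpu=True)

# Stand-in for real ICT query-tower embeddings: [batch, --ict-head-size].
query_embeds = torch.randn(4, 128)

# Retrieve the top-5 closest blocks for each query.
distances, block_indices = mips_index.search_mips_index(query_embeds, top_k=5)
</pre>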

<a id="evaluation-and-tasks"></a>
# Evaluation and Tasks

+12 −1
@@ -411,12 +411,23 @@ def _add_realm_args(parser):
                       help='Path to titles dataset used for ICT')
    group.add_argument('--query-in-block-prob', type=float, default=0.1,
                       help='Probability of keeping query in block for ICT dataset')
-    group.add_argument('--ict-one-sent', action='store_true',
+    group.add_argument('--use-one-sent-docs', action='store_true',
                       help='Whether to use one sentence documents in ICT')

    # training
    group.add_argument('--report-topk-accuracies', nargs='+', default=[],
                       help="Which top-k accuracies to report (e.g. '1 5 20')")

    # faiss index
    group.add_argument('--faiss-use-gpu', action='store_true',
                       help='Whether to create the FaissMIPSIndex on GPU')
    group.add_argument('--block-data-path', type=str, default=None,
                       help='Where to save/load BlockData to/from')

    # indexer
    group.add_argument('--indexer-batch-size', type=int, default=128,
                       help='How large of batches to use when doing indexing jobs')
    group.add_argument('--indexer-log-interval', type=int, default=1000,
                       help='After how many batches should the indexer report progress')
    return parser
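
To illustrate how the new indexer flags fit together, here is a hedged sketch of an indexing loop. Only the argument names come from this diff; the function itself and the `BlockData` methods (`add_block_data`, `save_shard`) are assumptions about the indexer code.

<pre>
# Hedged sketch: only the argument names come from this diff; the
# model/dataloader construction and BlockData methods are assumptions.
import torch
from megatron import get_args

def run_indexing_job(block_model, dataloader, block_data):
    args = get_args()
    block_model.eval()
    with torch.no_grad():
        # Each batch holds --indexer-batch-size blocks.
        for batch_num, batch in enumerate(dataloader, start=1):
            block_embeds = block_model(batch['block_tokens'],
                                       batch['block_pad_mask'])
            block_data.add_block_data(batch['block_indices'], block_embeds)
            if batch_num % args.indexer_log_interval == 0:
                print('indexed {} batches'.format(batch_num), flush=True)
    # --block-data-path gives the save/load location; --faiss-use-gpu
    # later controls where the FaissMIPSIndex is placed.
    block_data.save_shard(args.block_data_path)
</pre>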
+42 −2
@@ -21,9 +21,9 @@ import sys
import numpy as np

import torch
-from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP
+from torch.nn.parallel import DistributedDataParallel as torchDDP

-from megatron import mpu
+from megatron import mpu, get_args
from megatron import get_args
from megatron import print_rank_0

@@ -244,3 +244,43 @@ def load_checkpoint(model, optimizer, lr_scheduler, load_arg='load'):
        print('  successfully loaded {}'.format(checkpoint_name))

    return iteration


def load_ict_checkpoint(model, only_query_model=False, only_block_model=False, from_realm_chkpt=False):
    """selectively load ICT models for indexing/retrieving from ICT or REALM checkpoints"""

    args = get_args()

    if isinstance(model, torchDDP):
        model = model.module

    load_path = args.load if from_realm_chkpt else args.ict_load

    tracker_filename = get_checkpoint_tracker_filename(load_path)
    with open(tracker_filename, 'r') as f:
        iteration = int(f.read().strip())

    # assert iteration > 0
    checkpoint_name = get_checkpoint_name(load_path, iteration, False)
    if mpu.get_data_parallel_rank() == 0:
        print('global rank {} is loading checkpoint {}'.format(
            torch.distributed.get_rank(), checkpoint_name))

    state_dict = torch.load(checkpoint_name, map_location='cpu')
    ict_state_dict = state_dict['model']
    if from_realm_chkpt and mpu.get_data_parallel_rank() == 0:
        print(" loading ICT state dict from REALM", flush=True)
        ict_state_dict = ict_state_dict['retriever']['ict_model']

    if only_query_model:
        ict_state_dict.pop('context_model')
    if only_block_model:
        ict_state_dict.pop('question_model')

    model.load_state_dict(ict_state_dict)
    torch.distributed.barrier()

    if mpu.get_data_parallel_rank() == 0:
        print(' successfully loaded {}'.format(checkpoint_name))

    return model
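
For context, the indexer only needs the block (context) tower of the ICT model, while a REALM retriever only needs the query tower. Below is a hedged usage sketch, assuming the helper lands in `megatron.checkpointing` and that the caller builds a model containing just the matching tower; `build_ict_model()` is a hypothetical stand-in for however the calling script constructs it.

<pre>
# Hedged sketch: build_ict_model() is a hypothetical stand-in for
# however the calling script constructs the (possibly one-tower) model.
from megatron.checkpointing import load_ict_checkpoint

# Indexing job: load only the block tower, reading from --ict-load.
block_model = load_ict_checkpoint(build_ict_model(only_block_model=True),
                                  only_block_model=True)

# Retrieval inside REALM: load only the query tower, reading the ICT
# weights nested inside a REALM checkpoint at --load.
query_model = load_ict_checkpoint(build_ict_model(only_query_model=True),
                                  only_query_model=True,
                                  from_realm_chkpt=True)
</pre>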
+2 −2
@@ -417,7 +417,6 @@ def build_train_valid_test_datasets(data_prefix, data_impl, splits_string,
                num_epochs=None,
                max_num_samples=train_valid_test_num_samples[index],
                max_seq_length=max_seq_length,
-                short_seq_prob=short_seq_prob,
                seed=seed
            )

@@ -427,13 +426,14 @@ def build_train_valid_test_datasets(data_prefix, data_impl, splits_string,
                    block_dataset=indexed_dataset,
                    title_dataset=title_dataset,
                    query_in_block_prob=args.query_in_block_prob,
-                    use_one_sent_docs=args.ict_one_sent,
+                    use_one_sent_docs=args.use_one_sent_docs,
                    **kwargs
                )
            else:
                dataset = BertDataset(
                    indexed_dataset=indexed_dataset,
                    masked_lm_prob=masked_lm_prob,
+                    short_seq_prob=short_seq_prob,
                    **kwargs
                )

+35 −7
@@ -5,21 +5,47 @@ import numpy as np
from torch.utils.data import Dataset

from megatron import get_tokenizer
+from megatron import get_args
from megatron.data.dataset_utils import get_indexed_dataset_
+from megatron.data.realm_dataset_utils import get_block_samples_mapping


def get_ict_dataset(use_titles=True, query_in_block_prob=1):
    """Get a dataset which uses block samples mappings to get ICT/block indexing data (via get_block())
    rather than for training, since it is only built with a single epoch sample mapping.
    """
    args = get_args()
    block_dataset = get_indexed_dataset_(args.data_path, 'mmap', True)
    titles_dataset = get_indexed_dataset_(args.titles_data_path, 'mmap', True)

    kwargs = dict(
        name='full',
        block_dataset=block_dataset,
        title_dataset=titles_dataset,
        data_prefix=args.data_path,
        num_epochs=1,
        max_num_samples=None,
        max_seq_length=args.seq_length,
        seed=1,
        query_in_block_prob=query_in_block_prob,
        use_titles=use_titles,
        use_one_sent_docs=args.use_one_sent_docs
    )
    dataset = ICTDataset(**kwargs)
    return dataset


class ICTDataset(Dataset):
    """Dataset containing sentences and their blocks for an inverse cloze task."""
    def __init__(self, name, block_dataset, title_dataset, data_prefix,
-                 num_epochs, max_num_samples, max_seq_length,
-                 query_in_block_prob, short_seq_prob, seed, use_titles=True, use_one_sent_docs=False):
+                 num_epochs, max_num_samples, max_seq_length, query_in_block_prob,
+                 seed, use_titles=True, use_one_sent_docs=False):
        self.name = name
        self.seed = seed
        self.max_seq_length = max_seq_length
        self.query_in_block_prob = query_in_block_prob
        self.block_dataset = block_dataset
        self.title_dataset = title_dataset
-        self.short_seq_prob = short_seq_prob
        self.rng = random.Random(self.seed)
        self.use_titles = use_titles
        self.use_one_sent_docs = use_one_sent_docs
@@ -36,11 +62,13 @@ class ICTDataset(Dataset):
        self.pad_id = self.tokenizer.pad

    def __len__(self):
-        return self.samples_mapping.shape[0]
+        return len(self.samples_mapping)

    def __getitem__(self, idx):
        """Get an ICT example of a pseudo-query and the block of text from which it was extracted"""
-        start_idx, end_idx, doc_idx, block_idx = self.samples_mapping[idx]
+        sample_data = self.samples_mapping[idx]
+        start_idx, end_idx, doc_idx, block_idx = sample_data.as_tuple()

        if self.use_titles:
            title = self.title_dataset[int(doc_idx)]
            title_pad_offset = 3 + len(title)
@@ -48,7 +76,7 @@ class ICTDataset(Dataset):
            title = None
            title_pad_offset = 2
        block = [self.block_dataset[i] for i in range(start_idx, end_idx)]
-        assert len(block) > 1 or self.use_one_sent_docs
+        assert len(block) > 1 or self.use_one_sent_docs or self.query_in_block_prob == 1

        # randint() is inclusive for Python rng
        rand_sent_idx = self.rng.randint(0, len(block) - 1)
@@ -66,7 +94,7 @@ class ICTDataset(Dataset):

        query_tokens, query_pad_mask = self.concat_and_pad_tokens(query)
        block_tokens, block_pad_mask = self.concat_and_pad_tokens(block, title)
-        block_data = np.array([start_idx, end_idx, doc_idx, block_idx]).astype(np.int64)
+        block_data = sample_data.as_array()

        sample = {
            'query_tokens': query_tokens,
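
As the `get_ict_dataset` docstring notes, this dataset is built with a single-epoch samples mapping for indexing jobs, which read blocks directly via `get_block()` rather than sampling query/block training pairs. A hedged sketch of that access pattern follows; the `get_block()` signature is inferred from the docstring, not shown in this diff.

<pre>
# Hedged sketch: get_block()'s signature is an assumption; the dataset
# construction and samples-mapping layout come from this diff.
dataset = get_ict_dataset(use_titles=True)

# Each mapping entry locates one block of sentences in the corpus.
start_idx, end_idx, doc_idx, block_idx = dataset.samples_mapping[0].as_tuple()

# Fetch the tokenized (title + block) pair for embedding, without
# extracting a pseudo-query the way __getitem__ does.
block_tokens, block_pad_mask = dataset.get_block(start_idx, end_idx, doc_idx)
</pre>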