examples/create_embeddings.sh  (deleted, 100644 → 0, +0 −32) — the removed script:

#!/bin/bash

# Compute embeddings for each entry of a given dataset (e.g. Wikipedia)

RANK=0
WORLD_SIZE=1

# Wikipedia data can be downloaded from the following link:
# https://github.com/facebookresearch/DPR/blob/master/data/download_data.py
EVIDENCE_DATA_DIR=<Specify path of Wikipedia dataset>
EMBEDDING_PATH=<Specify path to store embeddings>
CHECKPOINT_PATH=<Specify path of pretrained ICT model>

python tools/create_doc_index.py \
    --num-layers 12 \
    --hidden-size 768 \
    --num-attention-heads 12 \
    --tensor-model-parallel-size 1 \
    --micro-batch-size 128 \
    --checkpoint-activations \
    --seq-length 512 \
    --retriever-seq-length 256 \
    --max-position-embeddings 512 \
    --load ${CHECKPOINT_PATH} \
    --evidence-data-path ${EVIDENCE_DATA_DIR} \
    --embedding-path ${EMBEDDING_PATH} \
    --indexer-log-interval 1000 \
    --indexer-batch-size 128 \
    --vocab-file bert-vocab.txt \
    --num-workers 2 \
    --fp16

examples/evaluate_ict_zeroshot_nq.sh → examples/evaluate_retriever_nq.sh  (renamed, +8 −5)

#!/bin/bash

 # Evaluate natural question test data given Wikipedia embeddings and pretrained
-# ICT model
+# ICT model or a finetuned model for Natural Question task

 # Datasets can be downloaded from the following link:
 # https://github.com/facebookresearch/DPR/blob/master/data/download_data.py

 EVIDENCE_DATA_DIR=<Specify path of Wikipedia dataset>
 EMBEDDING_PATH=<Specify path of the embeddings>
-CHECKPOINT_PATH=<Specify path of pretrained ICT model>
+CHECKPOINT_PATH=<Specify path of pretrained ICT model or finetuned model>

-QA_FILE=<Path of the natural question test dataset>
+QA_FILE=<Path of the natural question dev or test dataset>

 python tasks/main.py \
-    --task ICT-ZEROSHOT-NQ \
+    --task RETRIEVER-EVAL \
     --tokenizer-type BertWordPieceLowerCase \
     --num-layers 12 \
     --hidden-size 768 \

@@ -32,5 +32,8 @@
     --num-workers 2 \
     --faiss-use-gpu \
     --retriever-report-topk-accuracies 1 5 20 100 \
-    --fp16
+    --fp16 \
+    --indexer-log-interval 1000 \
+    --indexer-batch-size 128

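For reference, --retriever-report-topk-accuracies reports, for each listed k, the fraction of questions whose answer-bearing passage appears among the top-k retrieved results. A minimal sketch of that metric (illustrative only, not the repository's implementation; topk_accuracies and hit_ranks are made-up names):

def topk_accuracies(hit_ranks, ks=(1, 5, 20, 100)):
    """hit_ranks[i] is the 1-based rank of the first retrieved passage that
    contains the answer for question i, or None if no retrieved passage hits."""
    total = len(hit_ranks)
    return {k: sum(r is not None and r <= k for r in hit_ranks) / total
            for k in ks}

# Example: three questions, answers found at ranks 1, 7, and never:
# topk_accuracies([1, 7, None]) -> {1: 0.33, 5: 0.33, 20: 0.67, 100: 0.67}
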
examples/finetune_retriever_distributed.sh  (new file, mode 100755, +56 −0)

#!/bin/bash

# Finetune a BERT or pretrained ICT model using Google natural question data
# Datasets can be downloaded from the following link:
# https://github.com/facebookresearch/DPR/blob/master/data/download_data.py

WORLD_SIZE=8

DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
                  --nnodes 1 \
                  --node_rank 0 \
                  --master_addr localhost \
                  --master_port 6000"

CHECKPOINT_PATH=<Specify path for the finetuned retriever model>

# Load either of the below
BERT_LOAD_PATH=<Path of BERT pretrained model>
PRETRAINED_CHECKPOINT=<Path of Pretrained ICT model>

python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \
    --task RET-FINETUNE-NQ \
    --train-with-neg \
    --train-hard-neg 1 \
    --pretrained-checkpoint ${PRETRAINED_CHECKPOINT} \
    --num-layers 12 \
    --hidden-size 768 \
    --num-attention-heads 12 \
    --tensor-model-parallel-size 1 \
    --tokenizer-type BertWordPieceLowerCase \
    --train-data nq-train.json \
    --valid-data nq-dev.json \
    --save ${CHECKPOINT_PATH} \
    --load ${CHECKPOINT_PATH} \
    --vocab-file bert-vocab.txt \
    --bert-load ${BERT_LOAD_PATH} \
    --save-interval 5000 \
    --log-interval 10 \
    --eval-interval 25000 \
    --eval-iters 100 \
    --indexer-log-interval 1000 \
    --faiss-use-gpu \
    --DDP-impl torch \
    --fp16 \
    --retriever-report-topk-accuracies 1 5 10 20 100 \
    --seq-length 512 \
    --retriever-seq-length 256 \
    --max-position-embeddings 512 \
    --retriever-score-scaling \
    --epochs 80 \
    --micro-batch-size 8 \
    --eval-micro-batch-size 16 \
    --indexer-batch-size 128 \
    --lr 2e-5 \
    --lr-warmup-fraction 0.01 \
    --weight-decay 1e-1

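This task finetunes the retriever with in-batch negatives (--train-with-neg) plus one mined hard negative per question (--train-hard-neg 1). As a rough illustration of the standard DPR-style in-batch-negative objective for a biencoder — a generic sketch over plain [batch, dim] embedding tensors, not Megatron's exact code; the scale argument is a free parameter that presumably corresponds to what --retriever-score-scaling toggles:

import torch
import torch.nn.functional as F

def biencoder_nll_loss(query_emb, context_emb, scale=1.0):
    """query_emb: [B, D]; context_emb: [B, D] where row i is the positive
    passage for query i. Every other row in the batch acts as a negative;
    mined hard negatives would simply be appended as extra context rows
    (extra columns in the score matrix, never used as labels)."""
    scores = torch.matmul(query_emb, context_emb.t()) * scale    # [B, B] similarities
    labels = torch.arange(scores.size(0), device=scores.device)  # positives on the diagonal
    return F.cross_entropy(scores, labels)
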
megatron/indexer.py  (+7 −5)

 import sys
 import time
 import torch
 import torch.distributed as dist

-from megatron import get_args
+from megatron import get_args, print_rank_0
 from megatron import mpu
 from megatron.checkpointing import load_biencoder_checkpoint
 from megatron.data.orqa_wiki_dataset import get_open_retrieval_wiki_dataset
 from megatron.data.orqa_wiki_dataset import get_open_retrieval_batch
 from megatron.data.biencoder_dataset_utils import get_one_epoch_dataloader
 from megatron.data.realm_index import detach, OpenRetreivalDataStore
-from megatron.model.biencoder_model import biencoder_model_provider
+from megatron.model.biencoder_model import get_model_provider
 from megatron.training import get_model

@@ -29,7 +30,6 @@ class IndexBuilder(object):
         # need to know whether we're using a REALM checkpoint (args.load)
         # or ICT checkpoint
         assert not (args.load and args.ict_load)
-        #self.using_realm_chkpt = args.ict_load is None

         self.log_interval = args.indexer_log_interval
         self.batch_size = args.indexer_batch_size

@@ -47,8 +47,8 @@ class IndexBuilder(object):
         if self.biencoder_shared_query_context_model:
             only_context_model = False

-        model = get_model(lambda: biencoder_model_provider(only_context_model \
-            = only_context_model, biencoder_shared_query_context_model = \
+        model = get_model(get_model_provider(only_context_model=\
+            only_context_model, biencoder_shared_query_context_model=\
             self.biencoder_shared_query_context_model))

         self.model = load_biencoder_checkpoint(model,

@@ -85,6 +85,7 @@ class IndexBuilder(object):
         """
         assert len(self.model) == 1
         unwrapped_model = self.model[0]
         while not hasattr(unwrapped_model, 'embed_text'):
             unwrapped_model = unwrapped_model.module

@@ -103,6 +104,7 @@ class IndexBuilder(object):
             context_logits = unwrapped_model.embed_text(
                 unwrapped_model.context_model, context_tokens, context_mask,
                 context_types)
             context_logits = detach(context_logits)
             row_id = detach(row_id)

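The while loop above peels off wrapper modules (e.g. torch DistributedDataParallel or Megatron's FP16 wrapper, both of which expose the wrapped model as .module) until it reaches the object that actually defines embed_text. The same idiom as a standalone helper, for illustration only (not part of this PR):

def unwrap_model(model, required_attr='embed_text'):
    """Strip nested wrappers, each exposing the real model as .module,
    until an object with the requested attribute is found."""
    while not hasattr(model, required_attr):
        model = model.module
    return model
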
megatron/model/biencoder_model.py  (+44 −9)

@@ -15,11 +15,30 @@
 from megatron.model.utils import init_method_normal
 from megatron.model.utils import scaled_init_method_normal
 from .module import MegatronModule

+def get_model_provider(only_query_model=False, only_context_model=False,
+        biencoder_shared_query_context_model=False):
+
+    def model_provider(pre_process=True, post_process=True):
+        """Build the model."""
+
+        print_rank_0('building Biencoder model ...')
+        model = biencoder_model_provider(only_query_model=only_query_model,
+                only_context_model=only_context_model,
+                biencoder_shared_query_context_model=\
+                biencoder_shared_query_context_model,
+                pre_process=pre_process, post_process=post_process)
+
+        return model
+
+    return model_provider
+

 def biencoder_model_provider(only_query_model=False,
                              only_context_model=False,
-                             biencoder_shared_query_context_model=False):
+                             biencoder_shared_query_context_model=False,
+                             pre_process=True,
+                             post_process=True):
     """Build the model."""
     args = get_args()

     assert mpu.get_tensor_model_parallel_world_size() == 1 and \
         mpu.get_pipeline_model_parallel_world_size() == 1, \

@@ -35,7 +54,9 @@
         only_query_model=only_query_model,
         only_context_model=only_context_model,
         biencoder_shared_query_context_model=\
-        biencoder_shared_query_context_model)
+        biencoder_shared_query_context_model,
+        pre_process=pre_process, post_process=post_process)

     return model

@@ -48,13 +69,17 @@ class BiEncoderModel(MegatronModule):
                  parallel_output=True,
                  only_query_model=False,
                  only_context_model=False,
-                 biencoder_shared_query_context_model=False):
+                 biencoder_shared_query_context_model=False,
+                 pre_process=True,
+                 post_process=True):
         super(BiEncoderModel, self).__init__()
         args = get_args()

         bert_kwargs = dict(
             num_tokentypes=num_tokentypes,
-            parallel_output=parallel_output)
+            parallel_output=parallel_output,
+            pre_process=pre_process,
+            post_process=post_process)

         self.biencoder_shared_query_context_model = \
             biencoder_shared_query_context_model

@@ -78,6 +103,13 @@ class BiEncoderModel(MegatronModule):
             self.context_model = PretrainedBertModel(**bert_kwargs)
             self._context_key = 'context_model'

+    def set_input_tensor(self, input_tensor):
+        """See megatron.model.transformer.set_input_tensor()"""
+        # this is just a placeholder and will be needed when model
+        # parallelism will be used
+        # self.language_model.set_input_tensor(input_tensor)
+        return
+
     def forward(self, query_tokens, query_attention_mask, query_types,
                 context_tokens, context_attention_mask, context_types):
         """Run a forward pass for each of the models and

@@ -217,7 +249,7 @@ class PretrainedBertModel(MegatronModule):
     learned information retrieval."""

     def __init__(self, num_tokentypes=2,
-                 parallel_output=True):
+                 parallel_output=True, pre_process=True, post_process=True):
         super(PretrainedBertModel, self).__init__()
         args = get_args()

@@ -225,6 +257,8 @@
         self.pad_id = tokenizer.pad
         self.biencoder_projection_dim = args.biencoder_projection_dim
         self.parallel_output = parallel_output
+        self.pre_process = pre_process
+        self.post_process = post_process
         init_method = init_method_normal(args.init_method_std)
         scaled_init_method = scaled_init_method_normal(
             args.init_method_std, args.num_layers)

@@ -234,7 +268,9 @@
             add_pooler=False,
             encoder_attn_mask_type=AttnMaskType.padding,
             init_method=init_method,
-            scaled_init_method=scaled_init_method)
+            scaled_init_method=scaled_init_method,
+            pre_process=self.pre_process,
+            post_process=self.post_process)

         if args.biencoder_projection_dim > 0:
             self.projection_enc = get_linear_layer(args.hidden_size,

@@ -247,7 +283,6 @@
         #extended_attention_mask = bert_extended_attention_mask(attention_mask)
         position_ids = bert_position_ids(input_ids)
         lm_output = self.language_model(input_ids,
             position_ids,
             extended_attention_mask,

@@ -285,7 +320,7 @@
     def load_state_dict(self, state_dict, strict=True):
         """Customized load."""
-        print_rank_0("loading BERT weights")
+        print_rank_0("loading pretrained weights")

         self.language_model.load_state_dict(
             state_dict[self._language_model_key], strict=strict)

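The main refactor in this file is the new get_model_provider factory: megatron.training.get_model expects a provider callable that takes only (pre_process, post_process), so the biencoder-specific options are bound in a closure instead of being passed through a lambda at the call site. A stripped-down sketch of that pattern — make_provider is an illustrative name, the PR's function is get_model_provider with explicit keyword arguments:

from megatron.model.biencoder_model import biencoder_model_provider

def make_provider(**biencoder_options):
    # get_model(...) calls the returned provider with pipeline-stage flags only;
    # everything retriever-specific is captured here.
    def model_provider(pre_process=True, post_process=True):
        return biencoder_model_provider(pre_process=pre_process,
                                        post_process=post_process,
                                        **biencoder_options)
    return model_provider

# e.g. get_model(make_provider(only_context_model=True))
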