Commit e919dd8e authored by Mostofa Patwary

cleared the commented-out debug code

parent bfc20ecf
+2 −2
@@ -606,8 +606,8 @@ def _add_biencoder_args(parser):
    # faiss index
    group.add_argument('--faiss-use-gpu', action='store_true',
                       help='Whether create the FaissMIPSIndex on GPU')
    #group.add_argument('--block-data-path', type=str, default=None,
    #                   help='Where to save/load BlockData to/from')
    group.add_argument('--block-data-path', type=str, default=None,
                       help='Where to save/load BlockData to/from')

    # indexer
    group.add_argument('--indexer-batch-size', type=int, default=128,
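For orientation, here is a minimal, self-contained argparse sketch (not the Megatron parser itself) of how the flags touched in this hunk are declared; the group title and the indexer help string are assumptions, since the real ones sit outside the hunk.

import argparse

parser = argparse.ArgumentParser()
group = parser.add_argument_group(title='biencoder')  # assumed group title

group.add_argument('--faiss-use-gpu', action='store_true',
                   help='Whether to create the FaissMIPSIndex on GPU')
group.add_argument('--block-data-path', type=str, default=None,
                   help='Where to save/load BlockData to/from')
group.add_argument('--indexer-batch-size', type=int, default=128,
                   help='Batch size used by the indexer')  # assumed help text

args = parser.parse_args(['--block-data-path', '/tmp/block_data.pkl'])
print(args.faiss_use_gpu, args.block_data_path, args.indexer_batch_size)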
+1 −7
@@ -59,12 +59,6 @@ class AnnealingLR(object):
        """Learning rate decay functions from:
              https://openreview.net/pdf?id=BJYwwY9ll pg. 4"""

        #print_rank_0(self.warmup_steps)
        #print_rank_0(self.num_steps)
        #print_rank_0(self.warmup_steps)
        #print_rank_0(self.max_lr)
        #print_rank_0(self.max_lr * float(self.num_steps) / float(self.warmup_steps))

        # Use linear warmup for the initial part.
        if self.warmup_steps > 0 and self.num_steps <= self.warmup_steps:
            return self.max_lr * float(self.num_steps) / \
@@ -103,7 +97,7 @@ class AnnealingLR(object):
        new_lr = self.get_lr()
        for group in self.optimizer.param_groups:
            group['lr'] = new_lr
        #print_rank_0(new_lr)


    def state_dict(self):
        state_dict = {
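The warmup branch retained above scales the learning rate linearly from 0 to max_lr over warmup_steps. A standalone sketch of just that arithmetic (illustrative only, not the actual AnnealingLR class):

def warmup_lr(max_lr, num_steps, warmup_steps):
    # Linear warmup for the initial part, as in the hunk above.
    if warmup_steps > 0 and num_steps <= warmup_steps:
        return max_lr * float(num_steps) / float(warmup_steps)
    return max_lr  # the real scheduler applies its decay function here

for step in (1, 500, 1000, 2000):
    print(step, warmup_lr(1.5e-4, step, 1000))
# 1 -> 1.5e-07, 500 -> 7.5e-05, 1000 -> 1.5e-04, after which decay takes over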
+7 −16
@@ -160,11 +160,6 @@ class BiEncoderModel(MegatronModule):
            iteration = int(f.read().strip())
            assert iteration > 0

        #for param in self.query_model.language_model.parameters():
        #    print(param.data)
            #break
            #sys.exit()

        checkpoint_name = get_checkpoint_name(args.bert_load, iteration, False)
        if mpu.get_data_parallel_rank() == 0:
            print('global rank {} is loading BERT checkpoint {}'.format(
@@ -193,10 +188,6 @@ class BiEncoderModel(MegatronModule):
                if self.query_model is not None and self.projection_dim > 0:
                    self.context_model.projection_enc.load_state_dict\
                        (query_proj_state_dict)
        #for param in self.query_model.language_model.parameters():
        #    print(param.data)
        #    #sys.exit()



class PretrainedBertModel(MegatronModule):
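For reference, a hedged sketch of the checkpoint-lookup pattern in the hunk above: read the latest iteration from a tracker file, then build a checkpoint path from it. The tracker filename and directory layout below are assumptions; the real values come from args.bert_load and get_checkpoint_name.

import os

load_dir = '/tmp/bert_load'  # stands in for args.bert_load
os.makedirs(os.path.join(load_dir, 'iter_0001000'), exist_ok=True)
with open(os.path.join(load_dir, 'latest_checkpointed_iteration.txt'), 'w') as f:
    f.write('1000')  # pretend a checkpoint was saved at iteration 1000

# Read the latest checkpointed iteration, as in the hunk above.
with open(os.path.join(load_dir, 'latest_checkpointed_iteration.txt')) as f:
    iteration = int(f.read().strip())
assert iteration > 0

# Assumed layout; get_checkpoint_name may add model-parallel rank directories.
checkpoint_name = os.path.join(load_dir, 'iter_{:07d}'.format(iteration),
                               'model_optim_rng.pt')
print('loading BERT checkpoint {}'.format(checkpoint_name))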
+2 −23
@@ -48,7 +48,7 @@ from megatron.model import get_params_for_weight_decay_optimization
from megatron.model.realm_model import ICTBertModel
from megatron.utils import check_adlr_autoresume_termination
from megatron.data.data_loaders import build_pretraining_data_loader
from megatron.utils import report_memory, params_grad_norm, params_global_norm, print_model, print_grads
from megatron.utils import report_memory


def print_datetime(string):
@@ -648,7 +648,6 @@ def train_step(forward_step_func, data_iterator,
    if args.fp16:
        optimizer.update_master_grads()
    timers('backward-master-grad').stop()
    grad_norm_local = None

    # Clipping gradients helps prevent the exploding gradient.
    timers('backward-clip-grad').start()
@@ -663,30 +662,14 @@ def train_step(forward_step_func, data_iterator,
            mpu.clip_grad_norm(parameters, args.clip_grad,
                               parameter_names=parameter_names)
        else:
            grad_norm_local = optimizer.clip_master_grads(args.clip_grad)
            optimizer.clip_master_grads(args.clip_grad)
    timers('backward-clip-grad').stop()

    #print_rank_0("print-grad_norm_local {}".format(grad_norm_local))
    
    #print_rank_0("after backward")
    #print_grads(model)
    #print_model(model)
    #print_rank_0(params_global_norm(model))
    #print_rank_0(params_grad_norm(model))

    # Update parameters.
    timers('optimizer').start()
    optimizer.step()
    timers('optimizer').stop()

    #print_rank_0("after optimizer")
    #print_model(model)
    #print_rank_0(params_global_norm(model))
    #print_rank_0(params_grad_norm(model))
    #sys.exit()
    
    #print_rank_0("print-optimizer.overflow {}".format(optimizer.overflow))

    # Update learning rate.
    skipped_iter = 0
    if not (args.fp16 and optimizer.overflow):
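The clipping step above now discards the returned norm. A plain-PyTorch sketch of the same idea (the real code routes through mpu.clip_grad_norm or the FP16 optimizer's clip_master_grads):

import torch

model = torch.nn.Linear(8, 8)
model(torch.randn(4, 8)).sum().backward()

# clip_grad_norm_ returns the pre-clipping total norm, analogous to the
# grad_norm_local value this commit stops keeping.
total_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
print(float(total_norm))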
@@ -861,10 +844,6 @@ def train(forward_step_func, model, optimizer, lr_scheduler,
    # Iterations.
    iteration = args.iteration

    #print_rank_0("Check betas before iterations")
    #for group in optimizer.optimizer.param_groups:
    #    print_rank_0("betas {} lr {} weight_decay {} eps {}".format(group['betas'], group['lr'], group['weight_decay'], group['eps']))

    timers('interval time').start()
    print_datetime('before the start of training step')
    report_memory_flag = True
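One of the removed debug prints inspected the Adam hyperparameters per parameter group. A standalone sketch of the same inspection on a toy optimizer:

import torch

model = torch.nn.Linear(4, 4)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, betas=(0.9, 0.999),
                             weight_decay=0.01, eps=1e-8)
# Same fields the removed print_rank_0 call reported.
for group in optimizer.param_groups:
    print('betas {} lr {} weight_decay {} eps {}'.format(
        group['betas'], group['lr'], group['weight_decay'], group['eps']))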
+1 −37
@@ -150,40 +150,4 @@ def get_ltor_masks_and_position_ids(data,

    return attention_mask, loss_mask, position_ids

def params_grad_norm(model):
    print_rank_0("params_grad_norm")
    norm2 = torch.cuda.FloatTensor([0.0])
    for param in model.parameters():
        if param.grad is None:
            continue
        norm = torch.norm(param.grad.data.float(), 2)
        norm2 += norm * norm
    torch.distributed.all_reduce(norm2)
    norm = norm2 ** 0.5
    return norm.item()


def params_global_norm(model):
    print_rank_0("params_global_norm")
    norm2 = torch.cuda.FloatTensor([0.0])
    for param in model.parameters():
        norm = torch.norm(param.data.float(), 2)
        norm2 += norm * norm
    torch.distributed.all_reduce(norm2)
    norm = norm2 ** 0.5
    return norm.item()

def print_model(model):
    print_rank_0("print-model")
    for name, param in model.named_parameters():
        if param.requires_grad:
            #print("{} {}".format(name, param.data), flush=True)
            print_rank_0("{} {}".format(name, param.data))
            return

def print_grads(model):
    print_rank_0("print-grads")
    for name, param in model.named_parameters():
        if param.grad is None:
            continue
        print_rank_0("{} {}".format(name, param.grad)) 