megatron/arguments.py  +2 −2

@@ -606,8 +606,8 @@ def _add_biencoder_args(parser):
     # faiss index
     group.add_argument('--faiss-use-gpu', action='store_true',
                        help='Whether create the FaissMIPSIndex on GPU')
-    #group.add_argument('--block-data-path', type=str, default=None,
-    #                   help='Where to save/load BlockData to/from')
+    group.add_argument('--block-data-path', type=str, default=None,
+                       help='Where to save/load BlockData to/from')

     # indexer
     group.add_argument('--indexer-batch-size', type=int, default=128,
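The only change here is re-enabling the previously commented-out --block-data-path flag. As a minimal standalone sketch of how such a grouped argparse option behaves once restored (the group title below is illustrative, not the actual Megatron parser):

import argparse

# Hypothetical stand-in for the Megatron argument parser: a group exposing
# --block-data-path exactly as the restored lines define it.
parser = argparse.ArgumentParser()
group = parser.add_argument_group(title='biencoder')  # group title is assumed
group.add_argument('--block-data-path', type=str, default=None,
                   help='Where to save/load BlockData to/from')

args = parser.parse_args(['--block-data-path', '/tmp/block_data'])
print(args.block_data_path)  # dashes become underscores on the namespace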
megatron/learning_rates.py  +1 −7

@@ -59,12 +59,6 @@ class AnnealingLR(object):
         """Learning rate decay functions from:
               https://openreview.net/pdf?id=BJYwwY9ll pg. 4"""
-        #print_rank_0(self.warmup_steps)
-        #print_rank_0(self.num_steps)
-        #print_rank_0(self.warmup_steps)
-        #print_rank_0(self.max_lr)
-        #print_rank_0(self.max_lr * float(self.num_steps) / float(self.warmup_steps))

         # Use linear warmup for the initial part.
         if self.warmup_steps > 0 and self.num_steps <= self.warmup_steps:
             return self.max_lr * float(self.num_steps) / \

@@ -103,7 +97,7 @@ class AnnealingLR(object):
         new_lr = self.get_lr()
         for group in self.optimizer.param_groups:
             group['lr'] = new_lr
-        #print_rank_0(new_lr)

     def state_dict(self):
         state_dict = {
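The deleted print_rank_0 calls were tracing the warmup computation that remains in get_lr. For reference, a standalone sketch of that linear-warmup rule (illustrative function, not the AnnealingLR API; the post-warmup decay is outside this hunk):

def warmup_lr(max_lr, num_steps, warmup_steps):
    """Ramp the learning rate linearly from 0 to max_lr over warmup_steps."""
    if warmup_steps > 0 and num_steps <= warmup_steps:
        return max_lr * float(num_steps) / float(warmup_steps)
    # Beyond the warmup window the real scheduler switches to its decay
    # function, which this sketch does not reproduce.
    return max_lr

# Halfway through a 100-step warmup with max_lr=1e-4, the lr is 5e-5.
assert abs(warmup_lr(1e-4, 50, 100) - 5e-5) < 1e-12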
megatron/model/biencoder_model.py  +7 −16

@@ -160,11 +160,6 @@ class BiEncoderModel(MegatronModule):
             iteration = int(f.read().strip())
             assert iteration > 0
-        #for param in self.query_model.language_model.parameters():
-        #    print(param.data)
-        #break
-        #sys.exit()

         checkpoint_name = get_checkpoint_name(args.bert_load, iteration, False)
         if mpu.get_data_parallel_rank() == 0:
             print('global rank {} is loading BERT checkpoint {}'.format(

@@ -193,10 +188,6 @@ class BiEncoderModel(MegatronModule):
         if self.query_model is not None and self.projection_dim > 0:
             self.context_model.projection_enc.load_state_dict\
                 (query_proj_state_dict)
-        #for param in self.query_model.language_model.parameters():
-        #    print(param.data)
-        #
-        #sys.exit()

 class PretrainedBertModel(MegatronModule):
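For context, this code path reads the last saved iteration from a tracker file and then resolves and loads the matching BERT checkpoint. A rough standalone sketch of the same pattern (the tracker filename and directory layout below are assumptions for illustration, not the exact get_checkpoint_name behaviour):

import os
import torch

def load_bert_checkpoint(load_dir):
    # Assumed tracker file holding the latest iteration as plain text.
    tracker = os.path.join(load_dir, 'latest_checkpointed_iteration.txt')
    with open(tracker, 'r') as f:
        iteration = int(f.read().strip())
    assert iteration > 0

    # Assumed layout: one directory per iteration with a single state-dict file.
    checkpoint_name = os.path.join(load_dir, 'iter_{:07d}'.format(iteration),
                                   'model_optim_rng.pt')
    state_dict = torch.load(checkpoint_name, map_location='cpu')
    return iteration, state_dict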
megatron/training.py  +2 −23

@@ -48,7 +48,7 @@
 from megatron.model import get_params_for_weight_decay_optimization
 from megatron.model.realm_model import ICTBertModel
 from megatron.utils import check_adlr_autoresume_termination
 from megatron.data.data_loaders import build_pretraining_data_loader
-from megatron.utils import report_memory, params_grad_norm, params_global_norm, print_model, print_grads
+from megatron.utils import report_memory

 def print_datetime(string):

@@ -648,7 +648,6 @@ def train_step(forward_step_func, data_iterator,
     if args.fp16:
         optimizer.update_master_grads()
     timers('backward-master-grad').stop()
-    grad_norm_local = None

     # Clipping gradients helps prevent the exploding gradient.
     timers('backward-clip-grad').start()

@@ -663,30 +662,14 @@ def train_step(forward_step_func, data_iterator,
             mpu.clip_grad_norm(parameters, args.clip_grad,
                                parameter_names=parameter_names)
         else:
-            grad_norm_local = optimizer.clip_master_grads(args.clip_grad)
+            optimizer.clip_master_grads(args.clip_grad)
     timers('backward-clip-grad').stop()
-    #print_rank_0("print-grad_norm_local {}".format(grad_norm_local))
-    #print_rank_0("after backward")
-    #print_grads(model)
-    #print_model(model)
-    #print_rank_0(params_global_norm(model))
-    #print_rank_0(params_grad_norm(model))

     # Update parameters.
     timers('optimizer').start()
     optimizer.step()
     timers('optimizer').stop()
-    #print_rank_0("after optimizer")
-    #print_model(model)
-    #print_rank_0(params_global_norm(model))
-    #print_rank_0(params_grad_norm(model))
-    #sys.exit()
-    #print_rank_0("print-optimizer.overflow {}".format(optimizer.overflow))

     # Update learning rate.
     skipped_iter = 0
     if not (args.fp16 and optimizer.overflow):

@@ -861,10 +844,6 @@ def train(forward_step_func, model, optimizer, lr_scheduler,
     # Iterations.
     iteration = args.iteration
-    #print_rank_0("Check betas before iterations")
-    #for group in optimizer.optimizer.param_groups:
-    #    print_rank_0("betas {} lr {} weight_decay {} eps {}".format(group['betas'], group['lr'], group['weight_decay'], group['eps']))

     timers('interval time').start()
     print_datetime('before the start of training step')
     report_memory_flag = True
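With grad_norm_local removed, the fp16 branch now calls clip_master_grads purely for its side effect and discards the returned norm. As a generic illustration of clip-by-global-norm and the value that was being dropped (plain PyTorch here, not the apex/Megatron optimizer wrappers):

import torch

model = torch.nn.Linear(8, 8)
loss = model(torch.randn(4, 8)).sum()
loss.backward()

# clip_grad_norm_ rescales the gradients in place so that their global L2 norm
# is at most clip_grad, and returns the pre-clipping norm -- the analogue of
# the value grad_norm_local used to capture.
clip_grad = 1.0
grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad)
print(float(grad_norm))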
megatron/utils.py  +1 −37

@@ -150,40 +150,4 @@ def get_ltor_masks_and_position_ids(data,
     return attention_mask, loss_mask, position_ids

-
-def params_grad_norm(model):
-    print_rank_0("params_grad_norm")
-    norm2 = torch.cuda.FloatTensor([0.0])
-    for param in model.parameters():
-        if param.grad is None:
-            continue
-        norm = torch.norm(param.grad.data.float(), 2)
-        norm2 += norm * norm
-    torch.distributed.all_reduce(norm2)
-    norm = norm2 ** 0.5
-    return norm.item()
-
-
-def params_global_norm(model):
-    print_rank_0("params_global_norm")
-    norm2 = torch.cuda.FloatTensor([0.0])
-    for param in model.parameters():
-        norm = torch.norm(param.data.float(), 2)
-        norm2 += norm * norm
-    torch.distributed.all_reduce(norm2)
-    norm = norm2 ** 0.5
-    return norm.item()
-
-
-def print_model(model):
-    print_rank_0("print-model")
-    for name, param in model.named_parameters():
-        if param.requires_grad:
-            #print("{} {}".format(name, param.data), flush=True)
-            print_rank_0("{} {}".format(name, param.data))
-    return
-
-
-def print_grads(model):
-    print_rank_0("print-grads")
-    for name, param in model.named_parameters():
-        if param.grad is None:
-            continue
-        print_rank_0("{} {}".format(name, param.grad))
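The four removed helpers were debug-only: each accumulates a squared L2 norm over the model's parameters (or gradients), all-reduces the sum across ranks, and takes the square root. Should something like this be needed again, a compact standalone equivalent might look like the following (assumes torch.distributed is already initialized and the model lives on the current CUDA device; uses torch.zeros rather than the legacy torch.cuda.FloatTensor constructor):

import torch
import torch.distributed as dist

def global_grad_norm(model):
    """L2 norm of all gradients, summed over every rank in the default group."""
    norm_sq = torch.zeros(1, device=torch.cuda.current_device())
    for param in model.parameters():
        if param.grad is None:
            continue
        norm_sq += param.grad.data.float().norm(2) ** 2
    dist.all_reduce(norm_sq)  # sum the squared norms contributed by each rank
    return norm_sq.sqrt().item()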