megatron/arguments.py  +51 −22

@@ -160,7 +160,8 @@ def parse_args(extra_args_provider=None, defaults={},
         'expected sample-based learnig rate warmup'
     if args.lr_warmup_fraction is not None:
         assert args.lr_warmup_samples == 0, \
-            'can only specify one of lr-warmup-fraction and lr-warmup-samples'
+            'can only specify one of lr-warmup-fraction ' \
+            'and lr-warmup-samples'

     # Check required arguments.
     required_args = ['num_layers', 'hidden_size', 'num_attention_heads',

@@ -242,13 +243,15 @@ def _add_network_size_args(parser):
     group.add_argument('--hidden-size', type=int, default=None,
                        help='Tansformer hidden size.')
     group.add_argument('--ffn-hidden-size', type=int, default=None,
-                       help='Transformer Feed-Forward Network hidden size. This is set to 4*hidden-size if not '
-                       'provided')
+                       help='Transformer Feed-Forward Network hidden size. '
+                       'This is set to 4*hidden-size if not provided')
     group.add_argument('--num-attention-heads', type=int, default=None,
                        help='Number of transformer attention heads.')
     group.add_argument('--kv-channels', type=int, default=None,
-                       help='Projection weights dimension in multi-head attention. '
-                       'This is set to args.hidden_size // args.num_attention_heads if not provided.')
+                       help='Projection weights dimension in multi-head '
+                       'attention. This is set to '
+                       ' args.hidden_size // args.num_attention_heads '
+                       'if not provided.')
     group.add_argument('--max-position-embeddings', type=int, default=None,
                        help='Maximum number of position embeddings to use. '
                        'This is the size of position embedding.')

@@ -266,7 +269,8 @@ def _add_network_size_args(parser):
                        'should not be used unless for backward compatibility'
                        'reasons.')
     group.add_argument('--onnx-safe', type=bool, required=False,
-                       help='Use workarounds for known problems with Torch ONNX exporter')
+                       help='Use workarounds for known problems with '
+                       'Torch ONNX exporter')
     group.add_argument('--bert-no-binary-head', action='store_false',
                        help='Disable BERT binary head.',
                        dest='bert_binary_head')

@@ -279,6 +283,24 @@ def _add_logging_args(parser):
     group.add_argument('--log-params-norm', action='store_true',
                        help='If set, calculate and log parameters norm.')
+    group.add_argument('--tensorboard-log-interval', type=int, default=1,
+                       help='Report to tensorboard interval.')
+    group.add_argument('--log-timers-to-tensorboard', action='store_true',
+                       help='If set, write timers to tensorboard.')
+    group.add_argument('--log-batch-size-to-tensorboard', action='store_true',
+                       help='If set, write batch-size to tensorboard.')
+    group.add_argument('--no-log-learnig-rate-to-tensorboard',
+                       action='store_false',
+                       help='Disable learning rate logging to tensorboard.',
+                       dest='log_learning_rate_to_tensorboard')
+    group.add_argument('--no-log-loss-scale-to-tensorboard',
+                       action='store_false',
+                       help='Disable loss-scale logging to tensorboard.',
+                       dest='log_loss_scale_to_tensorboard')
+    group.add_argument('--log-validation-ppl-to-tensorboard',
+                       action='store_true',
+                       help='If set, write validation perplexity to '
+                       'tensorboard.')

     return parser

@@ -295,11 +317,11 @@ def _add_regularization_args(parser):
     group.add_argument('--clip-grad', type=float, default=1.0,
                        help='Gradient clipping based on global L2 norm.')
     group.add_argument('--adam-beta1', type=float, default=0.9,
-                       help='First coefficient for computing running averages of'
-                       'gradient and its square')
+                       help='First coefficient for computing running averages '
+                       'of gradient and its square')
     group.add_argument('--adam-beta2', type=float, default=0.999,
-                       help='Second coefficient for computing running averages of'
-                       'gradient and its square')
+                       help='Second coefficient for computing running averages '
+                       'of gradient and its square')
     group.add_argument('--adam-eps', type=float, default=1e-08,
                        help='Term added to the denominator to improve'
                        'numerical stability')

@@ -525,12 +547,14 @@ def _add_distributed_args(parser):
     group.add_argument('--local_rank', type=int, default=None,
                        help='local rank passed from distributed launcher.')
     group.add_argument('--lazy-mpu-init', type=bool, required=False,
-                       help='If set to True, initialize_megatron() skips DDP initialization'
-                       ' and returns function to complete it instead.'
-                       'Also turns on --use-cpu-initialization flag.'
-                       'This is for external DDP manager.' )
-    group.add_argument('--use-cpu-initialization', action='store_true', default=None,
-                       help='If set, affine parallel weights initialization uses CPU' )
+                       help='If set to True, initialize_megatron() '
+                       'skips DDP initialization and returns function to '
+                       'complete it instead.Also turns on '
+                       '--use-cpu-initialization flag. This is for '
+                       'external DDP manager.' )
+    group.add_argument('--use-cpu-initialization', action='store_true',
+                       default=None, help='If set, affine parallel weights '
+                       'initialization uses CPU' )

     return parser

@@ -616,19 +640,22 @@ def _add_realm_args(parser):
     # network size
     group.add_argument('--ict-head-size', type=int, default=None,
-                       help='Size of block embeddings to be used in ICT and REALM (paper default: 128)')
+                       help='Size of block embeddings to be used in ICT and '
+                       'REALM (paper default: 128)')

     # checkpointing
     group.add_argument('--ict-load', type=str, default=None,
                        help='Directory containing an ICTBertModel checkpoint')
     group.add_argument('--bert-load', type=str, default=None,
-                       help='Directory containing an BertModel checkpoint (needed to start ICT and REALM)')
+                       help='Directory containing an BertModel checkpoint '
+                       '(needed to start ICT and REALM)')

     # data
     group.add_argument('--titles-data-path', type=str, default=None,
                        help='Path to titles dataset used for ICT')
     group.add_argument('--query-in-block-prob', type=float, default=0.1,
-                       help='Probability of keeping query in block for ICT dataset')
+                       help='Probability of keeping query in block for '
+                       'ICT dataset')
     group.add_argument('--use-one-sent-docs', action='store_true',
                        help='Whether to use one sentence documents in ICT')

@@ -644,9 +671,11 @@ def _add_realm_args(parser):
     # indexer
     group.add_argument('--indexer-batch-size', type=int, default=128,
-                       help='How large of batches to use when doing indexing jobs')
+                       help='How large of batches to use when doing indexing '
+                       'jobs')
     group.add_argument('--indexer-log-interval', type=int, default=1000,
-                       help='After how many batches should the indexer report progress')
+                       help='After how many batches should the indexer '
+                       'report progress')

     return parser

megatron/training.py  +27 −19

@@ -712,10 +712,13 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration,
         total_loss_dict[skipped_iters_key]

     # Tensorboard values.
-    if writer and is_last_rank():
-        writer.add_scalar('learning-rate', learning_rate, iteration)
-        writer.add_scalar('learning-rate vs samples', learning_rate,
-                          args.consumed_train_samples)
-        writer.add_scalar('batch-size', batch_size, iteration)
-        writer.add_scalar('batch-size vs samples', batch_size,
-                          args.consumed_train_samples)
+    if writer and (iteration % args.tensorboard_log_interval == 0 ) and \
+       is_last_rank():
+        if args.log_learning_rate_to_tensorboard:
+            writer.add_scalar('learning-rate', learning_rate, iteration)
+            writer.add_scalar('learning-rate vs samples', learning_rate,
+                              args.consumed_train_samples)
+        if args.log_batch_size_to_tensorboard:
+            writer.add_scalar('batch-size', batch_size, iteration)
+            writer.add_scalar('batch-size vs samples', batch_size,
+                              args.consumed_train_samples)

@@ -723,6 +726,7 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration,
             writer.add_scalar(key , loss_dict[key], iteration)
             writer.add_scalar(key + ' vs samples', loss_dict[key],
                               args.consumed_train_samples)
+        if args.log_loss_scale_to_tensorboard:
             writer.add_scalar('loss-scale', loss_scale, iteration)
             writer.add_scalar('loss-scale vs samples', loss_scale,
                               args.consumed_train_samples)

@@ -734,6 +738,7 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration,
             writer.add_scalar('params-norm', params_norm, iteration)
             writer.add_scalar('params-norm vs samples', params_norm,
                               args.consumed_train_samples)
+        if args.log_timers_to_tensorboard:
             timers.write(timers_to_log, writer, iteration,
                          normalizer=total_iterations)

@@ -741,6 +746,7 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration,
         elapsed_time = timers('interval time').elapsed()
         elapsed_time_per_iteration = elapsed_time / total_iterations
         if writer and torch.distributed.get_rank() == 0:
+            if args.log_timers_to_tensorboard:
                 writer.add_scalar('iteration-time',
                                   elapsed_time_per_iteration, iteration)
         log_string = ' iteration {:8d}/{:8d} |'.format(

@@ -955,15 +961,17 @@ def evaluate_and_print_results(prefix, forward_step_func,
         ppl = math.exp(min(20, total_loss_dict[key].item()))
         string += '{} PPL: {:.6E} | '.format(key, ppl)
         if writer and is_last_rank():
-            writer.add_scalar('{} value-validation'.format(key),
+            writer.add_scalar('{} validation'.format(key),
                               total_loss_dict[key].item(),
                               iteration)
-            writer.add_scalar('{} ppl-validation'.format(key), ppl,
-                              iteration)
-            writer.add_scalar('{} value-validation vs samples'.format(key),
+            writer.add_scalar('{} validation vs samples'.format(key),
                               total_loss_dict[key].item(),
                               args.consumed_train_samples)
-            writer.add_scalar('{} ppl-validation vs samples'.format(key), ppl,
-                              args.consumed_train_samples)
+            if args.log_validation_ppl_to_tensorboard:
+                writer.add_scalar('{} validation ppl'.format(key), ppl,
+                                  iteration)
+                writer.add_scalar('{} validation ppl vs samples'.format(key),
+                                  ppl, args.consumed_train_samples)

     length = len(string) + 1
     print_rank_last('-' * length)
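Taken together, the training.py changes follow one pattern: every TensorBoard write is now gated by a per-quantity on/off flag and, in training_log(), by a global --tensorboard-log-interval, with each quantity logged on two x-axes (iterations and consumed samples). Below is a minimal, self-contained sketch of that pattern; the log_step helper, the Namespace values, and the /tmp/tb path are hypothetical scaffolding for illustration, while the flag names and scalar tags mirror the diff.

    # Sketch of the interval- and flag-gated logging pattern from training_log().
    # `log_step` and the Namespace below are hypothetical; they are not Megatron code.
    from argparse import Namespace
    from torch.utils.tensorboard import SummaryWriter

    def log_step(writer, args, iteration, learning_rate, batch_size):
        # Skip entirely unless a writer exists and this iteration
        # falls on the logging interval.
        if writer is None or iteration % args.tensorboard_log_interval != 0:
            return
        if args.log_learning_rate_to_tensorboard:
            # Each quantity is logged twice: against iterations and
            # against consumed training samples ("vs samples" tags).
            writer.add_scalar('learning-rate', learning_rate, iteration)
            writer.add_scalar('learning-rate vs samples', learning_rate,
                              args.consumed_train_samples)
        if args.log_batch_size_to_tensorboard:
            writer.add_scalar('batch-size', batch_size, iteration)
            writer.add_scalar('batch-size vs samples', batch_size,
                              args.consumed_train_samples)

    args = Namespace(tensorboard_log_interval=10,
                     log_learning_rate_to_tensorboard=True,
                     log_batch_size_to_tensorboard=False,
                     consumed_train_samples=81920)
    writer = SummaryWriter('/tmp/tb')
    log_step(writer, args, iteration=100, learning_rate=1.5e-4, batch_size=512)
    writer.close()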
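On the arguments side, note that the new learning-rate and loss-scale options in _add_logging_args are default-on: the user-facing flag is negative (--no-...), while argparse's store_false action with an explicit dest stores a positive attribute that defaults to True. A small standalone illustration of that pattern, using one of the flags from the diff:

    # Default-on flag pattern used by the new --no-log-*-to-tensorboard options.
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--no-log-loss-scale-to-tensorboard',
                        action='store_false',
                        dest='log_loss_scale_to_tensorboard',
                        help='Disable loss-scale logging to tensorboard.')

    # store_false defaults the dest to True, so logging stays on
    # unless the user explicitly opts out.
    print(parser.parse_args([]).log_loss_scale_to_tensorboard)   # True
    print(parser.parse_args(['--no-log-loss-scale-to-tensorboard'])
          .log_loss_scale_to_tensorboard)                        # False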