megatron/arguments.py  (+9 −0)

@@ -199,6 +199,15 @@ def _add_regularization_args(parser):
                        help='Weight decay coefficient for L2 regularization.')
     group.add_argument('--clip-grad', type=float, default=1.0,
                        help='Gradient clipping based on global L2 norm.')
+    group.add_argument('--adam-beta1', type=float, default=0.9,
+                       help='First coefficient for computing running averages of '
+                            'gradient and its square')
+    group.add_argument('--adam-beta2', type=float, default=0.999,
+                       help='Second coefficient for computing running averages of '
+                            'gradient and its square')
+    group.add_argument('--adam-eps', type=float, default=1e-08,
+                       help='Term added to the denominator to improve '
+                            'numerical stability')

     return parser
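Note that the defaults chosen here (0.9, 0.999, 1e-08) match torch.optim.Adam's own defaults, so launch scripts that do not pass the new flags keep the previous optimizer behavior.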
megatron/training.py  (+2 −1)

@@ -164,7 +164,8 @@ def get_optimizer(model):
             param.model_parallel = False

     # Use Adam.
-    optimizer = Adam(param_groups, lr=args.lr, weight_decay=args.weight_decay)
+    optimizer = Adam(param_groups, lr=args.lr, weight_decay=args.weight_decay,
+                     betas=(args.adam_beta1, args.adam_beta2), eps=args.adam_eps)

     # Wrap into fp16 optimizer.
     if args.fp16:
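For reference, below is a minimal, self-contained sketch of the wiring these two changes introduce: argparse flags flowing into the optimizer. The parser, parameter list, learning rate, and weight decay here are illustrative stand-ins, not the actual Megatron-LM code.

import argparse
import torch

# Illustrative sketch only: a tiny parser exposing the same three flags
# as the patch, wired into torch.optim.Adam the same way training.py does.
parser = argparse.ArgumentParser()
parser.add_argument('--adam-beta1', type=float, default=0.9)
parser.add_argument('--adam-beta2', type=float, default=0.999)
parser.add_argument('--adam-eps', type=float, default=1e-08)
args = parser.parse_args(['--adam-beta2', '0.95'])  # example override

# Stand-in for the model's parameter groups.
params = [torch.nn.Parameter(torch.zeros(4))]

optimizer = torch.optim.Adam(params, lr=1e-4, weight_decay=0.01,
                             betas=(args.adam_beta1, args.adam_beta2),
                             eps=args.adam_eps)
print(optimizer.defaults['betas'], optimizer.defaults['eps'])
# (0.9, 0.95) 1e-08

Passing the flags through this way means the beta and eps overrides also apply unchanged when the optimizer is subsequently wrapped by the fp16 optimizer under --fp16.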