Loading megatron/arguments.py +1 −1 Original line number Diff line number Diff line Loading @@ -601,7 +601,7 @@ def _add_distributed_args(parser): group.add_argument('--use-cpu-initialization', action='store_true', default=None, help='If set, affine parallel weights ' 'initialization uses CPU' ) group.add_argument('--empty-unused-memory-each-iter', default=0, type=int, group.add_argument('--empty-unused-memory-level', default=0, type=int, choices=[0, 1, 2], help='Call torch.cuda.empty_cache() each iteration ' '(training and eval), to reduce fragmentation.' Loading Loading
megatron/arguments.py +1 −1 Original line number Diff line number Diff line Loading @@ -601,7 +601,7 @@ def _add_distributed_args(parser): group.add_argument('--use-cpu-initialization', action='store_true', default=None, help='If set, affine parallel weights ' 'initialization uses CPU' ) group.add_argument('--empty-unused-memory-each-iter', default=0, type=int, group.add_argument('--empty-unused-memory-level', default=0, type=int, choices=[0, 1, 2], help='Call torch.cuda.empty_cache() each iteration ' '(training and eval), to reduce fragmentation.' Loading