Loading examples/pretrain_bert_distributed.sh +1 −0 Original line number Diff line number Diff line Loading @@ -32,6 +32,7 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS \ --distributed-backend nccl \ --lr 0.0001 \ --lr-decay-style linear \ --min-lr 1.0e-5 \ --lr-decay-iters 990000 \ --weight-decay 1e-2 \ --clip-grad 1.0 \ Loading examples/pretrain_gpt2_distributed.sh +1 −0 Original line number Diff line number Diff line Loading @@ -36,6 +36,7 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS \ --distributed-backend nccl \ --lr 0.00015 \ --lr-decay-style cosine \ --min-lr 1.0e-5 \ --weight-decay 1e-2 \ --clip-grad 1.0 \ --warmup .01 \ Loading Loading
examples/pretrain_bert_distributed.sh +1 −0 Original line number Diff line number Diff line Loading @@ -32,6 +32,7 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS \ --distributed-backend nccl \ --lr 0.0001 \ --lr-decay-style linear \ --min-lr 1.0e-5 \ --lr-decay-iters 990000 \ --weight-decay 1e-2 \ --clip-grad 1.0 \ Loading
examples/pretrain_gpt2_distributed.sh +1 −0 Original line number Diff line number Diff line Loading @@ -36,6 +36,7 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS \ --distributed-backend nccl \ --lr 0.00015 \ --lr-decay-style cosine \ --min-lr 1.0e-5 \ --weight-decay 1e-2 \ --clip-grad 1.0 \ --warmup .01 \ Loading