examples/pretrain_t5.sh  (+3 −2)

@@ -15,7 +15,7 @@ python pretrain_t5.py \
        --encoder-seq-length 512 \
        --decoder-seq-length 128 \
        --micro-batch-size 16 \
-       --global-batch-size 2048 \
+       --global-batch-size 16 \
        --max-position-embeddings 512 \
        --train-iters 1000000 \
        --lr-decay-iters 1000000 \
@@ -35,4 +35,5 @@ python pretrain_t5.py \
        --save-interval 10000 \
        --eval-interval 1000 \
        --eval-iters 10 \
-       --fp16
+       --fp16 \
+       --vocab-extra-ids 100
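For reference, Megatron-LM derives the number of gradient-accumulation micro-steps per iteration from these flags as global-batch-size / (micro-batch-size × data-parallel-size), and requires the division to be exact. The old value of 2048 therefore implied 128 accumulation steps on a single GPU, while the new value of 16 means one micro-batch per iteration; the distributed scripts' new value of 128 likewise works out to no accumulation across 8 data-parallel GPUs (assuming the scripts' usual GPUS_PER_NODE=8). A minimal sketch of the arithmetic, not Megatron's actual source:

# Sketch only: how the batch-size flags relate (assumed formula matching
# Megatron-LM's divisibility requirement, not code quoted from the repo).
GLOBAL_BATCH_SIZE=16
MICRO_BATCH_SIZE=16
DATA_PARALLEL_SIZE=1   # single-GPU script
if (( GLOBAL_BATCH_SIZE % (MICRO_BATCH_SIZE * DATA_PARALLEL_SIZE) )); then
    echo "global batch size must be divisible by micro batch * DP size" >&2
    exit 1
fi
echo "accumulation steps: $(( GLOBAL_BATCH_SIZE / (MICRO_BATCH_SIZE * DATA_PARALLEL_SIZE) ))"
# prints: accumulation steps: 1   (the old value 2048 would print 128)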
examples/pretrain_t5_distributed.sh  (+3 −2)

@@ -24,7 +24,7 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS \
        --encoder-seq-length 512 \
        --decoder-seq-length 128 \
        --micro-batch-size 16 \
-       --global-batch-size 2048 \
+       --global-batch-size 128 \
        --max-position-embeddings 512 \
        --train-iters 1000000 \
        --lr-decay-iters 1000000 \
@@ -44,4 +44,5 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS \
        --save-interval 10000 \
        --eval-interval 1000 \
        --eval-iters 10 \
-       --fp16
+       --fp16 \
+       --vocab-extra-ids 100
examples/pretrain_t5_distributed_with_mp.sh  (+3 −2)

@@ -24,7 +24,7 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS \
        --encoder-seq-length 512 \
        --decoder-seq-length 128 \
        --micro-batch-size 16 \
-       --global-batch-size 2048 \
+       --global-batch-size 128 \
        --seq-length 512 \
        --max-position-embeddings 512 \
        --train-iters 1000000 \
@@ -45,4 +45,5 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS \
        --save-interval 10000 \
        --eval-interval 1000 \
        --eval-iters 10 \
-       --fp16
+       --fp16 \
+       --vocab-extra-ids 100
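All three scripts also gain --vocab-extra-ids 100, which reserves 100 extra vocabulary ids that T5 pretraining uses as span-corruption sentinel tokens (one sentinel per masked span). As an illustration of what gets reserved, assuming the conventional T5 <extra_id_N> spelling rather than quoting Megatron's tokenizer internals:

# Illustration only: the 100 sentinel-style tokens that
# --vocab-extra-ids 100 reserves for T5 span masking.
for i in $(seq 0 99); do
    printf '<extra_id_%d>\n' "$i"
done

Without these extra ids the T5 data pipeline has no sentinel tokens with which to mark masked spans, which is presumably why the flag is added alongside the batch-size fix.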