Commit 4635bdf7 authored Jan 11, 2021 by Jared Casper

Merge branch 'readme_update_jan_2021' into 'main'

Readme update + change gpt2 to gpt

See merge request ADLR/megatron-lm!206

parents 86eb5bd8 152aab30

README.md

+101 −124

File changed.

Preview size limit exceeded, changes collapsed.

examples/evaluate_zeroshot_gpt2.sh→examples/evaluate_zeroshot_gpt.sh

+0 −0

File moved.

examples/finetune_mnli_distributed.sh

+2 −2

Original line number	Diff line number	Diff line
		@@ -28,11 +28,11 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \
		--num-layers 24 \
		--hidden-size 1024 \
		--num-attention-heads 16 \
		- --batch-size 8 \
		+ --micro-batch-size 8 \
		--checkpoint-activations \
		--lr 5.0e-5 \
		--lr-decay-style linear \
		- --warmup 0.065 \
		+ --lr-warmup-fraction 0.065 \
		--seq-length 512 \
		--max-position-embeddings 512 \
		--save-interval 500000 \

examples/finetune_race_distributed.sh

+2 −2

Original line number	Diff line number	Diff line
		@@ -28,11 +28,11 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \
		--num-layers 24 \
		--hidden-size 1024 \
		--num-attention-heads 16 \
		- --batch-size 4 \
		+ --micro-batch-size 4 \
		--checkpoint-activations \
		--lr 1.0e-5 \
		--lr-decay-style linear \
		- --warmup 0.06 \
		+ --lr-warmup-fraction 0.06 \
		--seq-length 512 \
		--max-position-embeddings 512 \
		--save-interval 100000 \

examples/pretrain_bert.sh

+4 −4

Original line number	Diff line number	Diff line
		@@ -9,24 +9,24 @@ python pretrain_bert.py \
		--num-layers 24 \
		--hidden-size 1024 \
		--num-attention-heads 16 \
		--batch-size 4 \
		--micro-batch-size 4 \
		--global-batch-size 8 \
		--seq-length 512 \
		--max-position-embeddings 512 \
		--train-iters 2000000 \
		--lr-decay-iters 990000 \
		--save $CHECKPOINT_PATH \
		--load $CHECKPOINT_PATH \
		--data-path $DATA_PATH \
		--vocab-file bert-vocab.txt \
		--data-impl mmap \
		--split 949,50,1 \
		--distributed-backend nccl \
		--lr 0.0001 \
		--min-lr 0.00001 \
		--lr-decay-style linear \
		--lr-decay-iters 990000 \
		--lr-warmup-fraction .01 \
		--weight-decay 1e-2 \
		--clip-grad 1.0 \
		--warmup .01 \
		--log-interval 100 \
		--save-interval 10000 \
		--eval-interval 1000 \