Commit c1e0689d authored by slym

Checkpoint a set number of individual Transformer layers

consider the case of pipeline model parallelism

clean up arguments

argument naming cleanup

update readme and examples
parent 68797d90
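For context, the "uniform" method used throughout this commit splits the Transformer stack into equally sized chunks and checkpoints each chunk, recomputing its activations during the backward pass instead of keeping them in memory. Below is a minimal PyTorch sketch of that idea, not the Megatron-LM implementation; run_chunk, uniform_checkpointed_forward, and chunk_size (a stand-in for the commit's "set number of individual Transformer layers") are illustrative names.

import torch
from torch.utils.checkpoint import checkpoint

def run_chunk(layers, start, end, hidden):
    # Plain forward over layers[start:end]; re-executed during backward
    # to regenerate the activations dropped in the forward pass.
    for layer in layers[start:end]:
        hidden = layer(hidden)
    return hidden

def uniform_checkpointed_forward(layers, hidden, chunk_size=1):
    # Checkpoint the stack in uniform chunks of chunk_size layers: only
    # chunk-boundary activations are stored, trading compute for memory.
    for start in range(0, len(layers), chunk_size):
        end = min(start + chunk_size, len(layers))
        hidden = checkpoint(run_chunk, layers, start, end, hidden)
    return hidden

layers = torch.nn.ModuleList(torch.nn.Linear(64, 64) for _ in range(12))
x = torch.randn(2, 64, requires_grad=True)
uniform_checkpointed_forward(layers, x, chunk_size=2).sum().backward()

Larger chunks keep fewer boundary activations (less memory, more recomputation); chunk_size=1 checkpoints every layer individually.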
+6 −6
@@ -156,7 +156,7 @@ OUTPUT_ARGS="--log-interval 10 \
             --save-interval 500 \
             --eval-interval 100 \
             --eval-iters 10 \
-             --checkpoint-activations"
+             --activations-checkpoint-method uniform"

python pretrain_bert.py \
       $BERT_ARGS \
@@ -345,7 +345,7 @@ python pretrain_ict.py \
    --max-position-embeddings 256 \
    --ict-head-size 128 \
    --train-iters 100000 \
-    --checkpoint-activations \
+    --activations-checkpoint-method uniform \
    --bert-load /path/to/pretrained_bert \
    --load checkpoints \
    --save checkpoints \
@@ -375,7 +375,7 @@ python tools/create_doc_index.py \
    --ict-head-size 128 \
    --num-attention-heads 12 \
    --batch-size 128 \
-    --checkpoint-activations \
+    --activations-checkpoint-method uniform \
    --seq-length 256 \
    --max-position-embeddings 256 \
    --ict-load /path/to/pretrained_ict \
@@ -482,7 +482,7 @@ python tasks/main.py \
       --merge-file $MERGE_FILE \
       --load $CHECKPOINT_PATH \
       --micro-batch-size 8 \
-       --checkpoint-activations \
+       --activations-checkpoint-method uniform \
       --log-interval 10 \
       --no-load-optim \
       --no-load-rng
@@ -512,7 +512,7 @@ python tasks/main.py \
       --merge-file $MERGE_FILE \
       --load $CHECKPOINT_PATH \
       --micro-batch-size 8 \
-       --checkpoint-activations \
+       --activations-checkpoint-method uniform \
       --log-interval 10 \
       --no-load-optim \
       --no-load-rng
@@ -542,7 +542,7 @@ COMMON_TASK_ARGS="--num-layers 24 \
COMMON_TASK_ARGS_EXT="--train-data $TRAIN_DATA \
                      --valid-data $VALID_DATA \
                      --pretrained-checkpoint $PRETRAINED_CHECKPOINT \
-                      --checkpoint-activations \
+                      --activations-checkpoint-method uniform \
                      --save-interval 10000 \
                      --save $CHECKPOINT_PATH \
                      --log-interval 100 \
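All of the README changes above make the same substitution: the old boolean --checkpoint-activations flag becomes a method selector. A hypothetical argparse sketch of how such an argument pair could be declared; the "block" choice and the --activations-checkpoint-num-layers name are assumptions drawn from the commit message ("a set number of individual Transformer layers", pipeline model parallelism), not from the diff itself.

import argparse

parser = argparse.ArgumentParser()
# Method selector replacing the old boolean flag.
parser.add_argument('--activations-checkpoint-method',
                    choices=['uniform', 'block'], default=None,
                    help='"uniform": checkpoint the Transformer stack in '
                         'equal chunks; "block": checkpoint a set number of '
                         'individual layers per pipeline stage.')
# Companion layer count (hypothetical name).
parser.add_argument('--activations-checkpoint-num-layers',
                    type=int, default=1,
                    help='Layers per checkpointed chunk.')
args = parser.parse_args(['--activations-checkpoint-method', 'uniform'])
print(args.activations_checkpoint_method)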
+1 −1
@@ -20,7 +20,7 @@ python tasks/main.py \
    --num-attention-heads 12 \
    --tensor-model-parallel-size 1 \
    --micro-batch-size 128 \
-    --checkpoint-activations \
+    --activations-checkpoint-method uniform \
    --seq-length 512 \
    --max-position-embeddings 512 \
    --load ${CHECKPOINT_PATH} \
+1 −1
@@ -29,7 +29,7 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \
               --hidden-size 1024 \
               --num-attention-heads 16 \
               --batch-size 8 \
-               --checkpoint-activations \
+               --activations-checkpoint-method uniform \
               --seq-length 1024 \
               --max-position-embeddings 1024 \
               --log-interval 10 \
+1 −1
@@ -29,7 +29,7 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \
               --hidden-size 1024 \
               --num-attention-heads 16 \
               --micro-batch-size 8 \
-               --checkpoint-activations \
+               --activations-checkpoint-method uniform \
               --lr 5.0e-5 \
               --lr-decay-style linear \
               --lr-warmup-fraction 0.065 \
+1 −1
@@ -29,7 +29,7 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \
               --hidden-size 1024 \
               --num-attention-heads 16 \
               --micro-batch-size 4 \
-               --checkpoint-activations \
+               --activations-checkpoint-method uniform \
               --lr 1.0e-5 \
               --lr-decay-style linear \
               --lr-warmup-fraction 0.06 \