Commit 279d8320 authored by rprenger's avatar rprenger
Browse files

Adding files for launching server and CLI but removing cluster specific stuff

parent 59414b33
Loading
Loading
Loading
Loading
+31 −0
Original line number Diff line number Diff line
#!/bin/bash
# Launch the GPT-3 530B text-generation API server with torch.distributed
# across 3 nodes of 16 GPUs (tensor-parallel 16 x pipeline-parallel 3).
# Fill in the three placeholder paths below before running.
#
# NOTE: the original used '/' as the line-continuation character, which is
# not a continuation in shell — every '--flag' line after the first was
# executed as a separate (broken) command. Fixed to '\'.

DISTRIBUTED_ARGS="--nproc_per_node 16 \
                  --nnodes 3 \
                  --node_rank 0 \
                  --master_addr localhost \
                  --master_port 6000"

CHECKPOINT=<Path to checkpoint (e.g /gpt3-530b-megatron_tp16_pp3)>
VOCAB_FILE=<Path to vocab.json (e.g. /gpt2-vocab.json)>
MERGE_FILE=<Path to merges.txt (e.g. /gpt2-merges.txt)>

# Runtime dependency of the Flask-based API server.
pip install flask-restful

# $DISTRIBUTED_ARGS is intentionally unquoted: it is a whitespace-separated
# option string that must word-split into individual arguments.
python -m torch.distributed.launch $DISTRIBUTED_ARGS tools/run_api_server.py \
       --tensor-model-parallel-size 16 \
       --pipeline-model-parallel-size 3 \
       --num-layers 105 \
       --hidden-size 20480 \
       --load "${CHECKPOINT}" \
       --num-attention-heads 128 \
       --max-position-embeddings 2048 \
       --tokenizer-type GPT2BPETokenizer \
       --fp16 \
       --micro-batch-size 1 \
       --seq-length 2048 \
       --out-seq-length 2048 \
       --temperature 1.0 \
       --vocab-file "${VOCAB_FILE}" \
       --merge-file "${MERGE_FILE}" \
       --top_p 0.9 \
       --seed 42
+11 −0
Original line number Diff line number Diff line
#!/bin/bash
# Start the 530B API server, poll http://localhost:5000/generate until it
# responds, then hand off to the interactive CLI.
#
# Fixes vs. original:
#  - 'STATUS = 1' / 'STATUS = $?' had spaces, so they ran a command named
#    'STATUS' instead of assigning the variable.
#  - '[ $STATUS -eq 1]' was missing the space before ']'.
#  - The server was launched in the foreground, so the polling loop could
#    never run; it is now backgrounded.
#  - curl needs -i (--include) to print the 'HTTP/1.x ...' status line the
#    grep pattern matches; with -s alone only the body is printed and the
#    loop would never terminate.
#  - Loop condition is now '-ne 0' so any non-match/error keeps polling.
echo "Loading model and starting server.  May take several minutes"
./run_api_server_530B.sh &

STATUS=1
while [ "$STATUS" -ne 0 ]
do
	sleep 20
	# Probe the endpoint; grep exits 0 once a 2xx/3xx status line appears.
	curl -s -i -m 20 'http://localhost:5000/generate' -X 'PUT' -H 'Content-Type: application/json; charset=UTF-8'  -d '{"sentences":["Test2"], "max_len":30}' | head -n 1 | grep "HTTP/1.[01] [23].." > /dev/null
	STATUS=$?
done
python tools/run_cli.py 'http://localhost:5000/generate'

run_api_server_530B.sh

deleted 100755 → 0
+0 −27
Original line number Diff line number Diff line
#!/bin/bash
# Cluster-specific launcher for the GPT-3 530B API server (deleted by this
# commit in favor of the generic launch script). Builds the
# tools/run_api_server.py command line as an array and submits it to the
# internal `submit_job` scheduler on 3 nodes x 16 GPUs.
# NOTE(review): `submit_job` is an in-house tool not visible here — its
# flag semantics are assumed from the invocation below.

# Hard-coded cluster paths to the checkpoint and GPT-2 BPE vocab/merge files.
CHECKPOINT="/home/universal-lm-data.cosmos549/scratch/jcasper/gpt3-530b-megatron_tp16_pp3"
DATA_PATH="/home/universal-lm-data.cosmos549/scratch/mshoeybi/data/gpt2"
VOCAB_FILE="${DATA_PATH}/bpe/gpt2-vocab.json"
MERGE_FILE="${DATA_PATH}/bpe/gpt2-merges.txt"
# Command assembled as a bash array; inside '(...)' newlines separate
# elements, so no line continuations are needed here.
RUN_CMD=(
python tools/run_api_server.py 
       --tensor-model-parallel-size 16 
       --pipeline-model-parallel-size 3 
       --num-layers 105 
       --hidden-size 20480 
       --load ${CHECKPOINT} 
       --num-attention-heads 128 
       --max-position-embeddings 2048 
       --tokenizer-type GPT2BPETokenizer 
       --fp16 
       --micro-batch-size 1 
       --seq-length 2048 
       --out-seq-length 2048 
       --temperature 1.0 
       --vocab-file $VOCAB_FILE 
       --merge-file $MERGE_FILE 
       --top_p 0.9 
	   --seed 42
)

# "${RUN_CMD[*]}" joins the array into a single string for submit_job's -c
# argument (the scheduler receives one command string, not separate words).
submit_job --duration 168 --nodes 3 --gpu 16 --reservation adlr-530b --partition batch_UN_dgx2_singlenode --mounts /home/universal-lm-data.cosmos549,/home/dcg-adlr-rprenger-source.cosmos352,/home/dcg-adlr-sgodil-data.cosmos233,/home/dcg-adlr-rprenger-output.cosmos349,/home/dcg-adlr-mchrzanowski-chidesign-data --image gitlab-master.nvidia.com/adlr/rprenger/megatron:latest --skip_ib_check --tasks_per_node 16 -c "${RUN_CMD[*]}"