Commit 055a673e authored by rprenger

Addressing comments

parent 279d8320

examples/run_cli_530B.sh
deleted 100755 → 0
+0 −11
-#!/bin/bash
-echo "Loading model and starting server.  May take several minutes"
-./run_api_server_530B.sh
-STATUS=1
-while [ $STATUS -eq 1 ]
-do
-	sleep 20
-	curl -s -m 20 'http://localhost:5000/generate' -X 'PUT' -H 'Content-Type: application/json; charset=UTF-8' -d '{"sentences":["Test2"], "max_len":30}' | head -n 1 | grep "HTTP/1.[01] [23].." > /dev/null
-	STATUS=$?
-done
-python tools/run_cli.py 'http://localhost:5000/generate'
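
The deleted script doubles as documentation of the server's request format: a PUT to /generate with a JSON body carrying sentences and max_len. A minimal Python equivalent of its readiness poll, assuming the requests package is available (the endpoint, test payload, and 20-second cadence are taken from the script above):

    import time

    import requests

    URL = "http://localhost:5000/generate"  # endpoint polled by the deleted script

    def wait_for_server(url=URL, interval=20):
        """Retry a small test request until the generation server answers."""
        payload = {"sentences": ["Test2"], "max_len": 30}
        while True:
            try:
                # Any 2xx/3xx response counts as "up", mirroring the
                # script's grep on the HTTP status line.
                if requests.put(url, json=payload, timeout=20).ok:
                    return
            except requests.RequestException:
                pass  # server not accepting connections yet
            time.sleep(interval)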
+32 −0
#!/bin/bash
-DISTRIBUTED_ARGS="--nproc_per_node 16 \
-                  --nnodes 3 \
+# This example will start serving the 345M model.
+DISTRIBUTED_ARGS="--nproc_per_node 1 \
+                  --nnodes 1 \
                  --node_rank 0 \
                  --master_addr localhost \
                  --master_port 6000"

-CHECKPOINT=<Path to checkpoint (e.g. /gpt3-530b-megatron_tp16_pp3)>
+CHECKPOINT=<Path to checkpoint (e.g. /345m)>
VOCAB_FILE=<Path to vocab.json (e.g. /gpt2-vocab.json)>
MERGE_FILE=<Path to merges.txt (e.g. /gpt2-merges.txt)>

pip install flask-restful

-python -m torch.distributed.launch $DISTRIBUTED_ARGS tools/run_api_server.py \
-       --tensor-model-parallel-size 16 \
-       --pipeline-model-parallel-size 3 \
-       --num-layers 105 \
-       --hidden-size 20480 \
+python -m torch.distributed.launch $DISTRIBUTED_ARGS tools/run_text_generation_server.py \
+       --tensor-model-parallel-size 1 \
+       --pipeline-model-parallel-size 1 \
+       --num-layers 24 \
+       --hidden-size 1024 \
       --load ${CHECKPOINT} \
-       --num-attention-heads 128 \
-       --max-position-embeddings 2048 \
+       --num-attention-heads 16 \
+       --max-position-embeddings 1024 \
       --tokenizer-type GPT2BPETokenizer \
       --fp16 \
       --micro-batch-size 1 \
-       --seq-length 2048 \
-       --out-seq-length 2048 \
+       --seq-length 1024 \
+       --out-seq-length 1024 \
       --temperature 1.0 \
       --vocab-file $VOCAB_FILE \
       --merge-file $MERGE_FILE \
+       --top_p 0.9 \
+       --seed 42
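
Once the server is up, it can be queried the same way the deleted CLI script's health check did. A sketch of a single generation request (the endpoint and JSON fields come from the deleted run_cli_530B.sh; the response schema is not shown in this diff, so the final print is only illustrative):

    import requests

    resp = requests.put(
        "http://localhost:5000/generate",
        json={"sentences": ["Megatron-LM is"], "max_len": 30},
    )
    resp.raise_for_status()
    print(resp.json())  # reply structure is defined by the server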
+32 −0
+#!/bin/bash
+# This example will start serving the 345M model that is partitioned 8-way tensor parallel
+DISTRIBUTED_ARGS="--nproc_per_node 8 \
+                  --nnodes 1 \
+                  --node_rank 0 \
+                  --master_addr localhost \
+                  --master_port 6000"
+
+CHECKPOINT=<Path to checkpoint (e.g. /345m)>
+VOCAB_FILE=<Path to vocab.json (e.g. /gpt2-vocab.json)>
+MERGE_FILE=<Path to merges.txt (e.g. /gpt2-merges.txt)>
+
+pip install flask-restful
+
+python -m torch.distributed.launch $DISTRIBUTED_ARGS tools/run_text_generation_server.py \
+       --tensor-model-parallel-size 8 \
+       --pipeline-model-parallel-size 1 \
+       --num-layers 24 \
+       --hidden-size 1024 \
+       --load ${CHECKPOINT} \
+       --num-attention-heads 16 \
+       --max-position-embeddings 1024 \
+       --tokenizer-type GPT2BPETokenizer \
+       --fp16 \
+       --micro-batch-size 1 \
+       --seq-length 1024 \
+       --out-seq-length 1024 \
+       --temperature 1.0 \
+       --vocab-file $VOCAB_FILE \
+       --merge-file $MERGE_FILE \
+       --top_p 0.9 \
+       --seed 42
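
With --nproc_per_node 8, torch.distributed.launch spawns eight worker processes on the node, and each one must bind to its own GPU before allocating anything. A sketch of the usual startup boilerplate inside the launched script (assumed code, not part of this commit; depending on the PyTorch version the local rank arrives as a --local_rank argument or a LOCAL_RANK environment variable):

    import argparse
    import os

    import torch

    # torch.distributed.launch passes --local_rank to each worker;
    # newer versions set the LOCAL_RANK environment variable instead.
    parser = argparse.ArgumentParser()
    parser.add_argument("--local_rank", type=int,
                        default=int(os.environ.get("LOCAL_RANK", 0)))
    args = parser.parse_args()

    # Bind this process to its GPU so torch.cuda.current_device()
    # resolves per rank, then join the NCCL process group.
    torch.cuda.set_device(args.local_rank)
    torch.distributed.init_process_group(backend="nccl")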
+3 −8
@@ -121,14 +121,14 @@ def receive_generate_info():
    """
    Needs to be synced up with send_generate_info
    """
-    input_info_tensor = torch.empty(3, dtype=torch.int64, device=torch.device("cuda"))
+    input_info_tensor = torch.empty(3, dtype=torch.int64, device=torch.cuda.current_device())
    torch.distributed.broadcast(input_info_tensor, 0)
    batch_size = input_info_tensor[0].item()
    seq_len = input_info_tensor[1].item()
    max_len = input_info_tensor[2].item()
    
-    context_length_tensor = torch.empty(batch_size, dtype=torch.int64, device=torch.device("cuda"))
-    context_tokens_tensor = torch.empty(batch_size, seq_len, dtype=torch.int64, device=torch.device("cuda"))
+    context_length_tensor = torch.empty(batch_size, dtype=torch.int64, device=torch.cuda.current_device())
+    context_tokens_tensor = torch.empty(batch_size, seq_len, dtype=torch.int64, device=torch.cuda.current_device())
    
    # Send variables to all ranks 
    torch.distributed.broadcast(context_length_tensor, 0)
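
The docstring's "needs to be synced up with send_generate_info" describes a two-phase handshake: rank 0 broadcasts the sizes first so every other rank can allocate matching buffers, then broadcasts the tensors themselves, which is exactly what receive_generate_info consumes above. A sketch of what the sending side presumably looks like (names and broadcast order inferred from the receiver in this hunk):

    import torch

    def send_generate_info(context_tokens_tensor, context_length_tensor, max_len):
        # Phase 1: pack batch_size, seq_len and max_len into one int64
        # tensor on this rank's GPU and broadcast it from rank 0.
        input_info_tensor = torch.tensor(
            [context_tokens_tensor.size(0), context_tokens_tensor.size(1), max_len],
            dtype=torch.int64, device=torch.cuda.current_device())
        torch.distributed.broadcast(input_info_tensor, 0)

        # Phase 2: broadcast the payload; receivers have already
        # allocated buffers of exactly these shapes.
        torch.distributed.broadcast(context_length_tensor, 0)
        torch.distributed.broadcast(context_tokens_tensor, 0)

Allocating with torch.cuda.current_device() also makes the per-rank device explicit instead of relying on the bare "cuda" alias, which is what this hunk changes in the receiver.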
@@ -153,9 +153,6 @@ def synced_generate(model, context_tokens_tensor, context_length_tensor, max_len
def generate(model, sentences=None, max_len=0):
    if torch.distributed.get_rank() == 0:
        context_tokens_tensor, context_length_tensor = tokenize_batch(sentences)
-        c = context_length_tensor[0]
-        b = context_tokens_tensor.size(0)
-        start = time.time()
        send_generate_info(context_tokens_tensor, context_length_tensor, max_len)
    else:
        context_length_tensor, context_tokens_tensor, max_len = receive_generate_info()
@@ -169,8 +166,6 @@ def generate(model, sentences=None, max_len=0):
        for i in range(decode_tokens.size(0)):
            decode_token = decode_tokens[i,:].cpu().numpy().tolist()
            resp_sentences.append(tokenizer.detokenize(decode_token))
-        end = time.time()
-        print(str(b)+","+str(c)+","+str(decode_tokens.size(1))+","+str(end-start), flush=True)
        return resp_sentences

def switch(val1, val2, boolean):
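
For context, generate() is written SPMD-style: every rank calls it, but only rank 0 holds the prompts, and the broadcasts above deliver the tokenized batch to everyone else. A hypothetical driver illustrating the calling convention implied by these hunks (generate is the function patched above; model and the prompt list are placeholders):

    import torch

    def handle_request(model, sentences=None, max_len=30):
        if torch.distributed.get_rank() == 0:
            # Rank 0 tokenizes, broadcasts, and is the only rank that
            # returns the detokenized resp_sentences.
            return generate(model, sentences=sentences, max_len=max_len)
        # Other ranks block in receive_generate_info() inside generate().
        return generate(model)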