Commit 055a673e authored by rprenger

Addressing comments

parent 279d8320

examples/run_cli_530B.sh
deleted 100755 → 0
+0 −11
-#!/bin/bash
-echo "Loading model and starting server.  May take several minutes"
-./run_api_server_530B.sh
-STATUS=1
-while [ $STATUS -eq 1 ]
-do
-	sleep 20
-	curl -s -m 20 'http://localhost:5000/generate' -X 'PUT' -H 'Content-Type: application/json; charset=UTF-8' -d '{"sentences":["Test2"], "max_len":30}' | head -n 1 | grep "HTTP/1.[01] [23].." > /dev/null
-	STATUS=$?
-done
-python tools/run_cli.py 'http://localhost:5000/generate'
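
The deleted script doubles as documentation of the server's request format: a PUT to /generate with a JSON body carrying sentences and max_len. A minimal Python equivalent of its readiness poll, assuming the requests package is available (the endpoint, test payload, and 20-second cadence are taken from the script above):

    import time

    import requests

    URL = "http://localhost:5000/generate"  # endpoint polled by the deleted script

    def wait_for_server(url=URL, interval=20):
        """Retry a small test request until the generation server answers."""
        payload = {"sentences": ["Test2"], "max_len": 30}
        while True:
            try:
                # Any 2xx/3xx response counts as "up", mirroring the
                # script's grep on the HTTP status line.
                if requests.put(url, json=payload, timeout=20).ok:
                    return
            except requests.RequestException:
                pass  # server not accepting connections yet
            time.sleep(interval)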
+32 −0
#!/bin/bash
-DISTRIBUTED_ARGS="--nproc_per_node 16 \
-                  --nnodes 3 \
+# This example will start serving the 345M model.
+DISTRIBUTED_ARGS="--nproc_per_node 1 \
+                  --nnodes 1 \
                  --node_rank 0 \
                  --master_addr localhost \
                  --master_port 6000"

-CHECKPOINT=<Path to checkpoint (e.g. /gpt3-530b-megatron_tp16_pp3)>
+CHECKPOINT=<Path to checkpoint (e.g. /345m)>
VOCAB_FILE=<Path to vocab.json (e.g. /gpt2-vocab.json)>
MERGE_FILE=<Path to merges.txt (e.g. /gpt2-merges.txt)>

pip install flask-restful

-python -m torch.distributed.launch $DISTRIBUTED_ARGS tools/run_api_server.py \
-       --tensor-model-parallel-size 16 \
-       --pipeline-model-parallel-size 3 \
-       --num-layers 105 \
-       --hidden-size 20480 \
+python -m torch.distributed.launch $DISTRIBUTED_ARGS tools/run_text_generation_server.py \
+       --tensor-model-parallel-size 1 \
+       --pipeline-model-parallel-size 1 \
+       --num-layers 24 \
+       --hidden-size 1024 \
       --load ${CHECKPOINT} \
-       --num-attention-heads 128 \
-       --max-position-embeddings 2048 \
+       --num-attention-heads 16 \
+       --max-position-embeddings 1024 \
       --tokenizer-type GPT2BPETokenizer \
       --fp16 \
       --micro-batch-size 1 \
-       --seq-length 2048 \
-       --out-seq-length 2048 \
+       --seq-length 1024 \
+       --out-seq-length 1024 \
       --temperature 1.0 \
       --vocab-file $VOCAB_FILE \
       --merge-file $MERGE_FILE \
+       --top_p 0.9 \
+       --seed 42
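
Once the server is up, it can be queried the same way the deleted CLI script's health check did. A sketch of a single generation request (the endpoint and JSON fields come from the deleted run_cli_530B.sh; the response schema is not shown in this diff, so the final print is only illustrative):

    import requests

    resp = requests.put(
        "http://localhost:5000/generate",
        json={"sentences": ["Megatron-LM is"], "max_len": 30},
    )
    resp.raise_for_status()
    print(resp.json())  # reply structure is defined by the server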
+32 −0
+#!/bin/bash
+# This example will start serving the 345M model that is partitioned 8-way tensor parallel
+DISTRIBUTED_ARGS="--nproc_per_node 8 \
+                  --nnodes 1 \
+                  --node_rank 0 \
+                  --master_addr localhost \
+                  --master_port 6000"
+
+CHECKPOINT=<Path to checkpoint (e.g. /345m)>
+VOCAB_FILE=<Path to vocab.json (e.g. /gpt2-vocab.json)>
+MERGE_FILE=<Path to merges.txt (e.g. /gpt2-merges.txt)>
+
+pip install flask-restful
+
+python -m torch.distributed.launch $DISTRIBUTED_ARGS tools/run_text_generation_server.py \
+       --tensor-model-parallel-size 8 \
+       --pipeline-model-parallel-size 1 \
+       --num-layers 24 \
+       --hidden-size 1024 \
+       --load ${CHECKPOINT} \
+       --num-attention-heads 16 \
+       --max-position-embeddings 1024 \
+       --tokenizer-type GPT2BPETokenizer \
+       --fp16 \
+       --micro-batch-size 1 \
+       --seq-length 1024 \
+       --out-seq-length 1024 \
+       --temperature 1.0 \
+       --vocab-file $VOCAB_FILE \
+       --merge-file $MERGE_FILE \
+       --top_p 0.9 \
+       --seed 42
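
With --nproc_per_node 8, torch.distributed.launch spawns eight worker processes on the node, and each one must bind to its own GPU before allocating anything. A sketch of the usual startup boilerplate inside the launched script (assumed code, not part of this commit; depending on the PyTorch version the local rank arrives as a --local_rank argument or a LOCAL_RANK environment variable):

    import argparse
    import os

    import torch

    # torch.distributed.launch passes --local_rank to each worker;
    # newer versions set the LOCAL_RANK environment variable instead.
    parser = argparse.ArgumentParser()
    parser.add_argument("--local_rank", type=int,
                        default=int(os.environ.get("LOCAL_RANK", 0)))
    args = parser.parse_args()

    # Bind this process to its GPU so torch.cuda.current_device()
    # resolves per rank, then join the NCCL process group.
    torch.cuda.set_device(args.local_rank)
    torch.distributed.init_process_group(backend="nccl")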
+3 −8
@@ -121,14 +121,14 @@ def receive_generate_info():
    """
    Needs to be synced up with send_generate_info
    """
-    input_info_tensor = torch.empty(3, dtype=torch.int64, device=torch.device("cuda"))
+    input_info_tensor = torch.empty(3, dtype=torch.int64, device=torch.cuda.current_device())
    torch.distributed.broadcast(input_info_tensor, 0)
    batch_size = input_info_tensor[0].item()
    seq_len = input_info_tensor[1].item()
    max_len = input_info_tensor[2].item()
    
-    context_length_tensor = torch.empty(batch_size, dtype=torch.int64, device=torch.device("cuda"))
-    context_tokens_tensor = torch.empty(batch_size, seq_len, dtype=torch.int64, device=torch.device("cuda"))
+    context_length_tensor = torch.empty(batch_size, dtype=torch.int64, device=torch.cuda.current_device())
+    context_tokens_tensor = torch.empty(batch_size, seq_len, dtype=torch.int64, device=torch.cuda.current_device())
    
    # Send variables to all ranks 
    torch.distributed.broadcast(context_length_tensor, 0)
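
The docstring's "needs to be synced up with send_generate_info" describes a two-phase handshake: rank 0 broadcasts the sizes first so every other rank can allocate matching buffers, then broadcasts the tensors themselves, which is exactly what receive_generate_info consumes above. A sketch of what the sending side presumably looks like (names and broadcast order inferred from the receiver in this hunk):

    import torch

    def send_generate_info(context_tokens_tensor, context_length_tensor, max_len):
        # Phase 1: pack batch_size, seq_len and max_len into one int64
        # tensor on this rank's GPU and broadcast it from rank 0.
        input_info_tensor = torch.tensor(
            [context_tokens_tensor.size(0), context_tokens_tensor.size(1), max_len],
            dtype=torch.int64, device=torch.cuda.current_device())
        torch.distributed.broadcast(input_info_tensor, 0)

        # Phase 2: broadcast the payload; receivers have already
        # allocated buffers of exactly these shapes.
        torch.distributed.broadcast(context_length_tensor, 0)
        torch.distributed.broadcast(context_tokens_tensor, 0)

Allocating with torch.cuda.current_device() also makes the per-rank device explicit instead of relying on the bare "cuda" alias, which is what this hunk changes in the receiver.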
@@ -153,9 +153,6 @@ def synced_generate(model, context_tokens_tensor, context_length_tensor, max_len
def generate(model, sentences=None, max_len=0):
    if torch.distributed.get_rank() == 0:
        context_tokens_tensor, context_length_tensor = tokenize_batch(sentences)
-        c = context_length_tensor[0]
-        b = context_tokens_tensor.size(0)
-        start = time.time()
        send_generate_info(context_tokens_tensor, context_length_tensor, max_len)
    else:
        context_length_tensor, context_tokens_tensor, max_len = receive_generate_info()
@@ -169,8 +166,6 @@ def generate(model, sentences=None, max_len=0):
        for i in range(decode_tokens.size(0)):
            decode_token = decode_tokens[i,:].cpu().numpy().tolist()
            resp_sentences.append(tokenizer.detokenize(decode_token))
-        end = time.time()
-        print(str(b)+","+str(c)+","+str(decode_tokens.size(1))+","+str(end-start), flush=True)
        return resp_sentences

def switch(val1, val2, boolean):
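
For context, generate() is written SPMD-style: every rank calls it, but only rank 0 holds the prompts, and the broadcasts above deliver the tokenized batch to everyone else. A hypothetical driver illustrating the calling convention implied by these hunks (generate is the function patched above; model and the prompt list are placeholders):

    import torch

    def handle_request(model, sentences=None, max_len=30):
        if torch.distributed.get_rank() == 0:
            # Rank 0 tokenizes, broadcasts, and is the only rank that
            # returns the detokenized resp_sentences.
            return generate(model, sentences=sentences, max_len=max_len)
        # Other ranks block in receive_generate_info() inside generate().
        return generate(model)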