examples/run_text_generation_server_345M.sh  0 → 100755  +32 −0

#!/bin/bash
# This example will start serving the 345M model.
DISTRIBUTED_ARGS="--nproc_per_node 1 \
                  --nnodes 1 \
                  --node_rank 0 \
                  --master_addr localhost \
                  --master_port 6000"

CHECKPOINT=<Path to checkpoint (e.g /345m)>
VOCAB_FILE=<Path to vocab.json (e.g. /gpt2-vocab.json)>
MERGE_FILE=<Path to merges.txt (e.g. /gpt2-merges.txt)>

pip install flask-restful

python -m torch.distributed.launch $DISTRIBUTED_ARGS tools/run_text_generation_server.py \
       --tensor-model-parallel-size 1 \
       --pipeline-model-parallel-size 1 \
       --num-layers 24 \
       --hidden-size 1024 \
       --load ${CHECKPOINT} \
       --num-attention-heads 16 \
       --max-position-embeddings 1024 \
       --tokenizer-type GPT2BPETokenizer \
       --fp16 \
       --micro-batch-size 1 \
       --seq-length 1024 \
       --out-seq-length 1024 \
       --temperature 1.0 \
       --vocab-file $VOCAB_FILE \
       --merge-file $MERGE_FILE \
       --top_p 0.9 \
       --seed 42


examples/run_text_generation_server_345M_8_tensor_parallel.sh  0 → 100755  +32 −0

#!/bin/bash
# This example will start serving the 345M model that is partitioned 8 way tensor parallel.
DISTRIBUTED_ARGS="--nproc_per_node 8 \
                  --nnodes 1 \
                  --node_rank 0 \
                  --master_addr localhost \
                  --master_port 6000"

CHECKPOINT=<Path to checkpoint (e.g /345m)>
VOCAB_FILE=<Path to vocab.json (e.g. /gpt2-vocab.json)>
MERGE_FILE=<Path to merges.txt (e.g. /gpt2-merges.txt)>

pip install flask-restful

python -m torch.distributed.launch $DISTRIBUTED_ARGS tools/run_text_generation_server.py \
       --tensor-model-parallel-size 8 \
       --pipeline-model-parallel-size 1 \
       --num-layers 24 \
       --hidden-size 1024 \
       --load ${CHECKPOINT} \
       --num-attention-heads 16 \
       --max-position-embeddings 1024 \
       --tokenizer-type GPT2BPETokenizer \
       --fp16 \
       --micro-batch-size 1 \
       --seq-length 1024 \
       --out-seq-length 1024 \
       --temperature 1.0 \
       --vocab-file $VOCAB_FILE \
       --merge-file $MERGE_FILE \
       --top_p 0.9 \
       --seed 42
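With one of these launch scripts running, the server can be exercised over HTTP. The sketch below is a hypothetical client: the port (Flask's default 5000) and the use of the requests package are assumptions, since the URL actually passed to MegatronServer.run comes from tools/run_text_generation_server.py, which is not part of this diff. The JSON keys "sentences" and "max_len" match what MegatronGenerate.put reads.

# Hypothetical client for the text generation server; host and port are assumptions.
import requests

URL = "http://localhost:5000/generate"   # port assumed (Flask default); adjust to your launch config

payload = {
    "sentences": ["Deep learning is"],   # the server rejects more than 128 sentences
    "max_len": 32,                       # optional; the server defaults to 64
}

response = requests.put(URL, json=payload)
response.raise_for_status()
print(response.json()["sentences"])      # one completed string per prompt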
megatron/initialize.py  +3 −2

@@ -21,6 +21,7 @@
import time

import numpy as np
import torch
from datetime import timedelta

from megatron import fused_kernels
from megatron import get_adlr_autoresume

@@ -175,8 +176,8 @@ def _initialize_distributed():
        # Call the init process
        torch.distributed.init_process_group(
            backend=args.distributed_backend,
            world_size=args.world_size, rank=args.rank)
            world_size=args.world_size, rank=args.rank,
            timeout=timedelta(days=7))

        # Set the tensor model-parallel, pipeline model-parallel, and
        # data-parallel communicators.


megatron/text_generation_server.py  0 → 100644  +66 −0

# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
from flask import Flask, request, jsonify, current_app
from flask_restful import Resource, Api

from megatron import get_args
from megatron import mpu
from megatron.text_generation_utils import generate

GENERATE_NUM = 0


class MegatronGenerate(Resource):
    def __init__(self, model):
        self.model = model

    @staticmethod
    def send_do_generate():
        choice = torch.cuda.LongTensor([GENERATE_NUM])
        torch.distributed.broadcast(choice,
                                    mpu.get_tensor_model_parallel_src_rank(),
                                    group=mpu.get_tensor_model_parallel_group())

    def put(self):
        args = get_args()
        sentences = request.get_json()["sentences"]
        if len(sentences) > 128:
            return "Maximum number of sentences is 128", 400

        max_len = 64  # Choosing hopefully sane default.  Full sequence is slow
        if "max_len" in request.get_json():
            max_len = request.get_json()["max_len"]
            if not isinstance(max_len, int):
                return "max_len must be an integer greater than 0"
            if max_len < 1:
                return "max_len must be an integer greater than 0"

        MegatronGenerate.send_do_generate()  # Tell other ranks we're doing generate
        resp_sentences = generate(self.model, sentences, max_len)
        return jsonify({"sentences": resp_sentences})


def index():
    return current_app.send_static_file('index.html')


class MegatronServer(object):
    def __init__(self, model):
        self.app = Flask(__name__)
        self.app.add_url_rule('/', 'index', index)
        api = Api(self.app)
        api.add_resource(MegatronGenerate, '/generate', resource_class_args=[model])

    def run(self, url):
        self.app.run(url, threaded=False, debug=False)
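Only the rank that owns the Flask app sees HTTP traffic; MegatronGenerate.send_do_generate broadcasts GENERATE_NUM so the remaining tensor-parallel ranks know to enter generate() at the same time. The launcher that drives those ranks, tools/run_text_generation_server.py, is not included in this diff, so the loop below is only a hedged sketch of what the non-server ranks might run; the helper name worker_loop is made up.

# Hedged sketch of the non-server ranks' side of the control protocol
# (the real loop lives in tools/run_text_generation_server.py, not shown here).
import torch

from megatron import mpu
from megatron.text_generation_utils import generate

GENERATE_NUM = 0  # must stay in sync with megatron/text_generation_server.py


def worker_loop(model):
    while True:
        choice = torch.cuda.LongTensor([0])  # placeholder, overwritten by the broadcast
        # Blocks until MegatronGenerate.send_do_generate() broadcasts a command.
        torch.distributed.broadcast(choice,
                                    mpu.get_tensor_model_parallel_src_rank(),
                                    group=mpu.get_tensor_model_parallel_group())
        if choice[0].item() == GENERATE_NUM:
            # Non-zero ranks pass no sentences; generate() picks up the actual
            # batch through the send_generate_info/receive_generate_info broadcasts.
            generate(model)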
megatron/text_generation_utils.py  +104 −351

@@ -40,7 +40,8 @@ def get_batch(context_tokens):
    tokenizer = get_tokenizer()

    # Move to GPU.
    tokens = context_tokens.view(args.micro_batch_size, -1).contiguous().cuda()
    tokens = context_tokens.contiguous().cuda()

    # Get the attention mask and postition ids.
    attention_mask, _, position_ids = get_ltor_masks_and_position_ids(
        tokens,

@@ -84,301 +85,7 @@ def top_k_logits(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):

    return logits


def generate_samples_input_from_file(model):

    args = get_args()
    tokenizer = get_tokenizer()

    # Read the sample file and open the output file.
    assert args.sample_input_file is not None, \
        'sample input file is not provided.'
    if mpu.is_pipeline_first_stage() and mpu.get_tensor_model_parallel_rank() == 0:
        fname = open(args.sample_input_file, "r")
        all_raw_text = fname.readlines()
        input_count = len(all_raw_text)
        input_pos = 0
        if args.sample_output_file is None:
            sample_output_file = args.sample_input_file + ".out"
            print('`sample-output-file` not specified, setting '
                  'it to {}'.format(sample_output_file))
        else:
            sample_output_file = args.sample_output_file
        fname_out = open(sample_output_file, "w+")

    context_count = 0
    model.eval()
    with torch.no_grad():
        while True:
            terminate_runs = 0
            raw_text_len = 0

            if mpu.is_pipeline_first_stage() \
               and mpu.get_tensor_model_parallel_rank() == 0:
                raw_text = all_raw_text[input_pos]
                input_pos += 1
                if input_pos == input_count:
                    raw_text = "stop"
                raw_text_len = len(raw_text)

                if "stop" in raw_text:
                    terminate_runs = 1
                else:
                    context_tokens = tokenizer.tokenize(raw_text)
                    context_length = len(context_tokens)

                    if context_length >= (args.seq_length // 2):
                        print("\nContext length", context_length,
                              "\nPlease give smaller context (half of the "
                              "sequence length)!", flush=True)
                        continue
            else:
                context_tokens = tokenizer.tokenize("EMPTY TEXT")
                context_length = 0

            input_info = [terminate_runs, raw_text_len, context_length]
            input_info_tensor = torch.cuda.LongTensor(input_info)
            torch.distributed.all_reduce(input_info_tensor,
                                         group=mpu.get_model_parallel_group())
            terminate_runs = input_info_tensor[0].item()
            raw_text_len = input_info_tensor[1].item()
            context_length = input_info_tensor[2].item()

            if terminate_runs == 1:
                return

            # For pipeline parallel we send context tokens to other stages
            # so they get the lengths correct
            if mpu.get_tensor_model_parallel_rank() == 0 \
               and args.pipeline_model_parallel_size > 1:
                if mpu.is_pipeline_first_stage():
                    src = mpu.get_pipeline_model_parallel_first_rank()
                    group = mpu.get_pipeline_model_parallel_group()
                    context_tokens_tensor = torch.cuda.LongTensor(context_tokens)
                    torch.distributed.broadcast(context_tokens_tensor, src, group)
                else:
                    src = mpu.get_pipeline_model_parallel_first_rank()
                    group = mpu.get_pipeline_model_parallel_group()
                    context_tokens_tensor = torch.empty(context_length,
                                                        dtype=torch.int64,
                                                        device=torch.device("cuda"))
                    torch.distributed.broadcast(context_tokens_tensor, src, group)
                    context_tokens = context_tokens_tensor.cpu().numpy().tolist()

            token_stream = get_token_stream(model, [context_tokens])
            for _, decode_tokens in enumerate(token_stream):
                pass

            if mpu.get_tensor_model_parallel_rank() == 0:
                if mpu.is_pipeline_first_stage():
                    os.system('clear')
                    print("\nContext:", raw_text, flush=True)

                    fname_out.write("\nContext:")
                    fname_out.write(raw_text)

                    decode_tokens, _ = decode_tokens
                    decode_tokens = decode_tokens[0].cpu().numpy().tolist()
                    trim_decode_tokens = tokenizer.detokenize(
                        decode_tokens)[raw_text_len:]
                    print("\nMegatron-LM:", trim_decode_tokens, flush=True)

                    fname_out.write("\n\nMegatron-LM:")
                    fname_out.write(trim_decode_tokens)
                    fname_out.write("\n")

            raw_text = None
            context_count += 1


# We added this function to support the tasks evaluation such as squad
# and drop in the https://github.com/EleutherAI/lm-evaluation-harness
# codebase. The lm-evaluation-harness code can now call this function
# similar to their current generate function call used for gpt style models.
def generate_samples_eval(model, context, max_gen_length, eos_token_id):
    # Generate samples for lm evaluation
    # NEED TO THINK ABOUT eos token

    args = get_args()
    tokenizer = get_tokenizer()

    raw_text_len = len(context)
    model.eval()

    context_tokens = tokenizer.tokenize(context)
    args.out_seq_length = max_gen_length + len(context_tokens)
    args.eos_id = eos_token_id

    with torch.no_grad():
        token_stream = get_token_stream(model, [context_tokens])
        for counter, decode_tokens in enumerate(token_stream):
            if counter == args.out_seq_length:
                break

    decode_tokens, _ = decode_tokens
    decode_tokens = decode_tokens[0].cpu().numpy().tolist()

    trim_decode_tokens = tokenizer.detokenize(
        decode_tokens)[raw_text_len:]

    return trim_decode_tokens


def generate_samples_interactive(model, print_frequency=24):

    args = get_args()
    tokenizer = get_tokenizer()

    context_count = 0
    model.eval()
    with torch.no_grad():
        while True:
            terminate_runs = 0
            raw_text_len = 0

            if mpu.is_pipeline_first_stage() \
               and mpu.get_tensor_model_parallel_rank() == 0:
                os.system('clear')
                raw_text = input("\nContext prompt (stop to exit) >>> ")
                while not raw_text:
                    print('Prompt should not be empty!')
                    raw_text = input("\nContext prompt (stop to exit) >>> ")
                raw_text_len = len(raw_text)

                if "stop" in raw_text:
                    terminate_runs = 1
                else:
                    context_tokens = tokenizer.tokenize(raw_text)
                    context_length = len(context_tokens)

                    if context_length >= (args.seq_length // 2):
                        print("\nContext length", context_length,
                              "\nPlease give smaller context (half of the "
                              "sequence length)!", flush=True)
                        continue
            else:
                context_tokens = tokenizer.tokenize("EMPTY TEXT")
                context_length = 0

            input_info = [terminate_runs, raw_text_len, context_length]
            input_info_tensor = torch.cuda.LongTensor(input_info)
            torch.distributed.all_reduce(input_info_tensor,
                                         group=mpu.get_model_parallel_group())
            terminate_runs = input_info_tensor[0].item()
            raw_text_len = input_info_tensor[1].item()
            context_length = input_info_tensor[2].item()

            if terminate_runs == 1:
                return

            # For pipeline parallel we send context tokens to other stages
            # so they get the lengths correct
            if mpu.get_tensor_model_parallel_rank() == 0 \
               and args.pipeline_model_parallel_size > 1:
                if mpu.is_pipeline_first_stage():
                    src = mpu.get_pipeline_model_parallel_first_rank()
                    group = mpu.get_pipeline_model_parallel_group()
                    context_tokens_tensor = torch.cuda.LongTensor(context_tokens)
                    torch.distributed.broadcast(context_tokens_tensor, src, group)
                else:
                    src = mpu.get_pipeline_model_parallel_first_rank()
                    group = mpu.get_pipeline_model_parallel_group()
                    context_tokens_tensor = torch.empty(context_length,
                                                        dtype=torch.int64,
                                                        device=torch.device("cuda"))
                    torch.distributed.broadcast(context_tokens_tensor, src, group)
                    context_tokens = context_tokens_tensor.cpu().numpy().tolist()

            token_stream = get_token_stream(model, [context_tokens])

            for counter, decode_tokens in enumerate(token_stream):
                if counter % print_frequency != 0 \
                   or mpu.get_tensor_model_parallel_rank() != 0 \
                   or not mpu.is_pipeline_first_stage():
                    continue

                os.system('clear')
                print("\nContext:", raw_text, flush=True)

                decode_tokens, _ = decode_tokens
                decode_tokens = decode_tokens[0].cpu().numpy().tolist()
                trim_decode_tokens = tokenizer.detokenize(
                    decode_tokens)[raw_text_len:]
                print("\nMegatron-LM:", trim_decode_tokens, flush=True)

            if mpu.is_pipeline_first_stage() \
               and mpu.get_tensor_model_parallel_rank() == 0:
                os.system('clear')
                print("\nContext:", raw_text, flush=True)

                if not isinstance(decode_tokens, list):
                    decode_tokens, _ = decode_tokens
                    decode_tokens = decode_tokens[0].cpu().numpy().tolist()
                trim_decode_tokens = tokenizer.detokenize(
                    decode_tokens)[raw_text_len:]
                print("\nMegatron-LM:", trim_decode_tokens, flush=True)

                input("\nPress Enter to continue >>>")

            raw_text = None
            context_count += 1


def generate_samples_unconditional(model):

    args = get_args()
    tokenizer = get_tokenizer()

    num_samples = args.num_samples
    context_tokens = [[tokenizer.eod]
                      for _ in range(args.micro_batch_size)]
    ctr = 0
    while True:
        start_time = time.time()
        for token_stream in get_token_stream(model,
                                             copy.deepcopy(context_tokens)):
            pass
        if mpu.is_pipeline_last_stage() and \
           mpu.get_tensor_model_parallel_rank() == 0:
            if ctr % args.log_interval == 0:
                print('Avg s/batch:',
                      (time.time() - start_time) / min(args.log_interval, ctr + 1))
                start_time = time.time()
            length = len(token_stream)
            token_batch = token_stream[0].cpu().numpy().tolist()
            length_batch = token_stream[1].cpu().numpy().tolist()
            assert len(length_batch) == args.micro_batch_size
            for tokens, length in zip(token_batch, length_batch):
                tokens = tokens[1:length - 1]
                text = tokenizer.detokenize(tokens)
                is_finished = length < args.seq_length - 1
                datum = {'text': text, 'length': length - 1, 'finished': is_finished}
                yield datum
                ctr += 1
                if ctr >= num_samples:
                    break
        else:
            for _ in range(args.micro_batch_size):
                yield None
                ctr += 1
                if ctr >= num_samples:
                    break
        if ctr >= num_samples:
            break


def generate_and_write_samples_unconditional(model):

    args = get_args()
    assert args.genfile is not None
    with open(args.genfile, 'w') as f:
        for datum in generate_samples_unconditional(model):
            if mpu.is_pipeline_last_stage() and \
               mpu.get_tensor_model_parallel_rank() == 0:
                f.write(json.dumps(datum) + '\n')


def pad_batch(batch, pad_id, args):

    context_lengths = []
    for tokens in batch:
        context_length = len(tokens)

@@ -387,41 +94,94 @@ def pad_batch(batch, pad_id, args):
        context_lengths.append(context_length)
    return batch, context_lengths


def get_token_stream(model, context_tokens):
def tokenize_batch(sentences):
    args = get_args()
    tokenizer = get_tokenizer()

    context_tokens = [tokenizer.tokenize(s) for s in sentences]
    context_tokens, context_lengths = pad_batch(context_tokens,
                                                tokenizer.eod, args)
    context_tokens_tensor = torch.cuda.LongTensor(context_tokens)
    context_length_tensor = torch.cuda.LongTensor(context_lengths)
    return context_tokens_tensor, context_length_tensor


def send_generate_info(context_tokens_tensor, context_length_tensor, max_len):
    """
    Needs to be synced up with receive_generate_info
    """
    # Send the sizes of the tensors
    input_info = [context_tokens_tensor.size(0), context_tokens_tensor.size(1),
                  max_len]
    input_info_tensor = torch.cuda.LongTensor(input_info)
    torch.distributed.broadcast(input_info_tensor, 0)

    # Send variables to all ranks
    torch.distributed.broadcast(context_length_tensor, 0)
    torch.distributed.broadcast(context_tokens_tensor, 0)


def receive_generate_info():
    """
    Needs to be synced up with send_generate_info
    """
    input_info_tensor = torch.empty(3, dtype=torch.int64,
                                    device=torch.cuda.current_device())
    torch.distributed.broadcast(input_info_tensor, 0)
    batch_size = input_info_tensor[0].item()
    seq_len = input_info_tensor[1].item()
    max_len = input_info_tensor[2].item()

    torch.distributed.broadcast(context_length_tensor,
                                mpu.get_tensor_model_parallel_src_rank(),
                                group=mpu.get_tensor_model_parallel_group())
    torch.distributed.broadcast(context_tokens_tensor,
                                mpu.get_tensor_model_parallel_src_rank(),
                                group=mpu.get_tensor_model_parallel_group())
    context_length_tensor = torch.empty(batch_size, dtype=torch.int64,
                                        device=torch.cuda.current_device())
    context_tokens_tensor = torch.empty(batch_size, seq_len,
                                        dtype=torch.int64,
                                        device=torch.cuda.current_device())

    # Send variables to all ranks
    torch.distributed.broadcast(context_length_tensor, 0)
    torch.distributed.broadcast(context_tokens_tensor, 0)
    return context_length_tensor, context_tokens_tensor, max_len


def synced_generate(model, context_tokens_tensor, context_length_tensor, max_len):
    context_length = context_length_tensor.min().item()
    tokens, attention_mask, position_ids = get_batch(context_tokens_tensor)

    batch_token_iterator = sample_sequence_batch(model, context_tokens_tensor,
                                                 context_length_tensor,
                                                 attention_mask, position_ids)
                                                 attention_mask, position_ids,
                                                 max_len)

    for tokens, lengths in batch_token_iterator:
        context_length += 1

    if tokens is not None:
        yield tokens[:, :context_length], lengths
        return tokens[:, :context_length]


def generate(model, sentences=None, max_len=0):
    model.eval()
    if torch.distributed.get_rank() == 0:
        context_tokens_tensor, context_length_tensor = tokenize_batch(sentences)
        send_generate_info(context_tokens_tensor, context_length_tensor, max_len)
    else:
        yield None, None
        context_length_tensor, context_tokens_tensor, max_len = receive_generate_info()

    decode_tokens = synced_generate(model, context_tokens_tensor,
                                    context_length_tensor, max_len)

def switch(val1, val2, boolean):
    if torch.distributed.get_rank() == 0:
        args = get_args()
        tokenizer = get_tokenizer()
        resp_sentences = []
        for i in range(decode_tokens.size(0)):
            decode_token = decode_tokens[i,:].cpu().numpy().tolist()
            resp_sentences.append(tokenizer.detokenize(decode_token))
        return resp_sentences


def generate_samples_eval(model, context, max_gen_length, eos_token_id):
    """
    This function is here to provide an a matching API for a legacy task
    This implementation hasn't been tested yet to make sure it matches
    """
    assert False, "Implementation untested"
    args = get_args()
    args.eos_id = eos_token_id
    raw_text_len = len(context)
    resp_sentences = generate(model, [context], max_gen_length)
    return resp_sentences[0][raw_text_len:]


def switch(val1, val2, boolean):
    boolean = boolean.type_as(val1)
    return (1 - boolean) * val1 + boolean * val2

@@ -435,6 +195,7 @@ def forward_step(model, tokens, position_ids, attention_mask, tokentype_ids,
    args = get_args()
    orig_seq_length = args.seq_length
    args.seq_length = tokens.shape[1]
    args.micro_batch_size = tokens.shape[0]

    input_tensor = recv_forward()

@@ -462,7 +223,6 @@ def forward_step(model, tokens, position_ids, attention_mask, tokentype_ids,

def sample_sequence_batch(model, context_tokens, context_lengths,
                          attention_mask, position_ids,
                          maxlen=None, type_ids=None):

    args = get_args()
    tokenizer = get_tokenizer()

@@ -486,22 +246,15 @@ def sample_sequence_batch(model, context_tokens, context_lengths,
        tokens = context_tokens

        if maxlen is None:
            maxlen = args.seq_length - 1
        maxlen = maxlen + org_context_length
        if maxlen > (org_context_length + args.out_seq_length):
            maxlen = org_context_length + args.out_seq_length

        lengths = torch.ones([batch_size]).long().cuda() * maxlen

        while context_length <= (maxlen):
            if args.recompute:
                output = forward_step(model, tokens,
                                      position_ids,
                                      attention_mask,
                                      tokentype_ids=type_ids,
                                      forward_method_parallel_output=False)
                if mpu.is_pipeline_last_stage():
                    assert output is not None
                    logits = output[:, context_length - 1, :]
            else:
        while context_length < maxlen:
            types2use = None
            if counter == 0:
                tokens2use = tokens[:, :context_length]
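The send_generate_info/receive_generate_info pair above relies on a simple convention: broadcast a small fixed-size tensor of sizes first, then the variable-size tensors, so that non-source ranks can allocate matching buffers before the payload arrives. The standalone snippet below illustrates that pattern outside of Megatron; it uses the CPU gloo backend and made-up shapes, so it is a sketch of the idea, not code from this diff.

# Standalone illustration (not Megatron code) of the size-then-payload broadcast
# pattern used by send_generate_info/receive_generate_info.  Runs on CPU with the
# gloo backend so it can be tried without GPUs; all names and shapes are made up.
import torch
import torch.distributed as dist
import torch.multiprocessing as mp


def run(rank, world_size):
    dist.init_process_group("gloo", init_method="tcp://127.0.0.1:29511",
                            rank=rank, world_size=world_size)
    if rank == 0:
        tokens = torch.randint(0, 50257, (2, 7), dtype=torch.int64)
        # 1) broadcast the sizes so other ranks can allocate matching buffers
        sizes = torch.tensor([tokens.size(0), tokens.size(1)], dtype=torch.int64)
        dist.broadcast(sizes, 0)
        # 2) broadcast the payload itself
        dist.broadcast(tokens, 0)
    else:
        sizes = torch.empty(2, dtype=torch.int64)
        dist.broadcast(sizes, 0)
        tokens = torch.empty(sizes[0].item(), sizes[1].item(), dtype=torch.int64)
        dist.broadcast(tokens, 0)
        print(f"rank {rank} received tokens of shape {tuple(tokens.shape)}")
    dist.destroy_process_group()


if __name__ == "__main__":
    world_size = 2
    mp.spawn(run, args=(world_size,), nprocs=world_size)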