examples/run_gpt2_eval.py — deleted (mode 100644 → 0, +0 −81):

```python
"""
example usage:
python scripts/run_gpt2_eval.py \
       --model-parallel-size 1 \
       --num-layers 12 \
       --hidden-size 768 \
       --num-attention-heads 12 \
       --model-path <gpt2_117_path> \
       --data-path <wikitext_tokens_test_path> \
       --batch-size 16 \
       --cache-dir <cache dir path>
"""

import argparse
import subprocess

parser = argparse.ArgumentParser('run zero shot GPT2 eval')
parser.add_argument('--model-path', type=str, required=True,
                    help='Saved model path for evaluation')
parser.add_argument('--batch-size', type=int, default=4,
                    help='batch size to use for evaluation')
parser.add_argument('--num-attention-heads', type=int, default=12,
                    help='num of transformer attention heads')
parser.add_argument('--hidden-size', type=int, default=768,
                    help='tansformer hidden size')
parser.add_argument('--num-layers', type=int, default=12,
                    help='num decoder layers')
parser.add_argument('--data-path', type=str, required=True,
                    help='Data path for evaluation data')
parser.add_argument('--cloze-eval', action='store_true',
                    help='Run lambada cloze eval instead of perplexity eval.')
parser.add_argument('--easy-lambada', action='store_true',
                    help='use easier formulation of lambada')
parser.add_argument('--model-parallel-size', type=int, default=1,
                    help='model parallel size to use')
args = parser.parse_args()

multinode_args = ''
if args.model_parallel_size > 1:
    multinode_args += ' -m torch.distributed.launch --nproc_per_node {} '.format(
        args.model_parallel_size)

CMD = ' --model-parallel-size {model_par} \
       --num-layers {nlayers} \
       --hidden-size {hidden} \
       --log-interval 100 \
       --load {model} \
       --batch-size {batch} \
       --num-attention-heads {natt} \
       --seq-length 1024 \
       --max-position-embeddings 1024 \
       --tokenizer-type GPT2BPETokenizer \
       --distributed-backend nccl \
       --hidden-dropout 0.1 \
       --attention-dropout 0.1 \
       --fp16 \
       --lr 1 --no-load-optim --no-load-rng --epochs 0 \
       --overlapping-eval 32 \
       --merge-file /home/universal-lm-data.cosmos549/repos/megatron_latest/vocab_cache/merges.txt \
       --vocab-file /home/universal-lm-data.cosmos549/repos/megatron_latest/vocab_cache/vocab.json'.format(
           model_par=args.model_parallel_size,
           nlayers=args.num_layers,
           hidden=args.hidden_size,
           model=args.model_path,
           batch=args.batch_size,
           natt=args.num_attention_heads)

if args.cloze_eval:
    CMD += ' --valid-data {} '.format(args.data_path)
    CMD += ' --task LAMBADA '
    if not args.easy_lambada:
        CMD += ' --strict-lambada '
    CMD = 'main.py' + CMD
    print('Running Lambada Eval Command:', flush=True)
else:
    CMD += ' --valid-data {} '.format(args.data_path)
    CMD += ' --task WIKITEXT103 '
    CMD = 'main.py' + CMD
    print('Running PPL Eval Command:', flush=True)

CMD = 'python3 ' + multinode_args + CMD
print(CMD, flush=True)
subprocess.call(CMD.split())
```
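For reference, the full invocation this wrapper assembles can be read off the CMD template above. Below is a sketch of the expanded command for the docstring's WikiText example (model-parallel size 1, so no torch.distributed.launch prefix is added). The angle-bracket paths are the docstring's own placeholders, the merge/vocab paths are the ones hard-coded in the script, and main.py is whatever evaluation entry point the wrapper assumed to be in the working directory:

```bash
python3 main.py \
    --model-parallel-size 1 \
    --num-layers 12 \
    --hidden-size 768 \
    --log-interval 100 \
    --load <gpt2_117_path> \
    --batch-size 16 \
    --num-attention-heads 12 \
    --seq-length 1024 \
    --max-position-embeddings 1024 \
    --tokenizer-type GPT2BPETokenizer \
    --distributed-backend nccl \
    --hidden-dropout 0.1 \
    --attention-dropout 0.1 \
    --fp16 \
    --lr 1 --no-load-optim --no-load-rng --epochs 0 \
    --overlapping-eval 32 \
    --merge-file /home/universal-lm-data.cosmos549/repos/megatron_latest/vocab_cache/merges.txt \
    --vocab-file /home/universal-lm-data.cosmos549/repos/megatron_latest/vocab_cache/vocab.json \
    --valid-data <wikitext_tokens_test_path> \
    --task WIKITEXT103
```

Passing --cloze-eval instead makes the wrapper append --task LAMBADA (plus --strict-lambada unless --easy-lambada is given) in place of --task WIKITEXT103.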
megatron/arguments.py (+14 −31):

```diff
@@ -35,8 +35,6 @@ def parse_args(extra_args_provider=None, defaults={}):
     parser = _add_validation_args(parser)
     parser = _add_data_args(parser)
     parser = _add_autoresume_args(parser)
-    # TODO: Refactor
-    parser = _add_gpt2_args(parser)
 
     # Custom arguments.
     if extra_args_provider is not None:
@@ -54,6 +52,12 @@ def parse_args(extra_args_provider=None, defaults={}):
                 'defaults can only be overwritten for args with None values.'
             setattr(args, key, defaults[key])
 
+    # Check required arguments.
+    required_args = ['num_layers', 'hidden_size', 'num_attention_heads',
+                     'max_position_embeddings']
+    for req_arg in required_args:
+        _check_arg_is_not_none(args, req_arg)
+
     # Distributed args.
     args.rank = int(os.getenv('RANK', '0'))
     args.world_size = int(os.getenv("WORLD_SIZE", '1'))
@@ -93,16 +97,20 @@ def _print_args(args):
     print('---------------- end of arguments ----------------', flush=True)
 
 
+def _check_arg_is_not_none(args, arg):
+    assert getattr(args, arg) is not None, '{} argument is None'.format(arg)
+
+
 def _add_network_size_args(parser):
     group = parser.add_argument_group(title='network size')
 
-    group.add_argument('--num-layers', type=int, required=True,
+    group.add_argument('--num-layers', type=int, default=None,
                        help='Number of transformer layers.')
-    group.add_argument('--hidden-size', type=int, required=True,
+    group.add_argument('--hidden-size', type=int, default=None,
                        help='Tansformer hidden size.')
-    group.add_argument('--num-attention-heads', type=int, required=True,
+    group.add_argument('--num-attention-heads', type=int, default=None,
                        help='Number of transformer attention heads.')
-    group.add_argument('--max-position-embeddings', type=int, required=True,
+    group.add_argument('--max-position-embeddings', type=int, default=None,
                        help='Maximum number of position embeddings to use. '
                        'This is the size of position embedding.')
     group.add_argument('--make-vocab-size-divisible-by', type=int, default=128,
@@ -342,28 +350,3 @@ def _add_autoresume_args(parser):
                        'termination signal')
 
     return parser
-
-
-########################################################################
-
-
-def _add_gpt2_args(parser):
-    group = parser.add_argument_group(title='gpt2')
-
-    group.add_argument('--input-data-sizes-file', type=str, default='sizes.txt',
-                       help='The filename containing all the shards '
-                       'sizes for numpy data loader')
-
-    return parser
-
-
-def add_data_args_(parser):
-    """Train/valid/test data arguments."""
-
-    group = parser.add_argument_group('data', 'data configurations')
-
-    group.add_argument('--data-loader', type=str, default=None,
-                       choices=['raw', 'lazy', 'tfrecords', 'numpy', 'binary'],
-                       help='Which data loader to use. Default varies by model.')
-
-    return parser
```
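The reason for switching these four arguments from required=True to default=None is visible in the new check inside parse_args: required-ness is now enforced only after the defaults dict has had a chance to fill in None values, so callers (for example an extra_args_provider or a tool that already knows the model size) can supply them programmatically instead of on the command line. Here is a minimal, self-contained sketch of that pattern; the names build_parser and REQUIRED_ARGS are hypothetical stand-ins, and this is a simplification of Megatron's actual parse_args, not its implementation:

```python
import argparse


def build_parser():
    # Simplified stand-in for megatron/arguments.py: the "required" model-size
    # arguments are declared with default=None rather than required=True.
    parser = argparse.ArgumentParser()
    parser.add_argument('--num-layers', type=int, default=None)
    parser.add_argument('--hidden-size', type=int, default=None)
    return parser


REQUIRED_ARGS = ['num_layers', 'hidden_size']


def parse_args(argv=None, defaults=None):
    args = build_parser().parse_args(argv)

    # Defaults may only fill in arguments the user left unset (still None),
    # mirroring the "defaults can only be overwritten for args with None
    # values" assertion in the real parse_args.
    for key, value in (defaults or {}).items():
        if getattr(args, key) is None:
            setattr(args, key, value)

    # Required-ness is enforced only now, after defaults have been applied;
    # this is what the new _check_arg_is_not_none call does in the diff above.
    for name in REQUIRED_ARGS:
        assert getattr(args, name) is not None, '{} argument is None'.format(name)
    return args


if __name__ == '__main__':
    # Neither flag is passed on the command line, yet parsing succeeds
    # because the defaults dict supplies both values after the fact.
    print(parse_args([], defaults={'num_layers': 12, 'hidden_size': 768}))
```

The practical effect is that the hard failure moves from argparse (which can only be satisfied from the command line) to a single post-parse check that also accepts values injected through defaults.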