megatron/arguments.py (+10 −18)

```diff
@@ -35,6 +35,8 @@ def parse_args(extra_args_provider=None, defaults={}):
     parser = _add_validation_args(parser)
     parser = _add_data_args(parser)
     parser = _add_autoresume_args(parser)
+    # TODO: Refactor
+    parser = _add_gpt2_args(parser)
 
     # Custom arguments.
     if extra_args_provider is not None:
@@ -293,6 +295,8 @@ def _add_data_args(parser):
                             'validation and 5% for test.')
     group.add_argument('--vocab-file', type=str, required=True,
                        help='Path to the vocab file.')
+    group.add_argument('--merge-file', type=str, default=None,
+                       help='Path to the BPE merge file.')
     group.add_argument('--seq-length', type=int, required=True,
                        help="Maximum sequence length to process.")
     group.add_argument('--mask-prob', type=float, default=0.15,
@@ -330,19 +334,19 @@ def _add_autoresume_args(parser):
 ########################################################################
 
-def add_training_args_(parser):
-    """Training arguments."""
+def _add_gpt2_args(parser):
+    group = parser.add_argument_group(title='gpt2')
 
-    group = parser.add_argument_group('train', 'training configurations')
-
-    # Batch prodecuer arguments
+    group.add_argument('--input-data-sizes-file', type=str, default='sizes.txt',
+                       help='The filename containing all the shards '
+                       'sizes for numpy data loader')
     group.add_argument('--reset-position-ids', action='store_true',
                        help='Reset posistion ids after end-of-document token.')
     group.add_argument('--reset-attention-mask', action='store_true',
                        help='Reset self attention maske after '
                        'end-of-document token.')
     group.add_argument('--eod-mask-loss', action='store_true',
-                       help='Mask loss for the end of document tokens')
+                       help='Mask loss for the end of document tokens.')
 
     return parser
@@ -411,18 +415,6 @@ def add_data_args_(parser):
                        choices=['raw', 'lazy', 'tfrecords', 'numpy', 'binary'],
                        help='Which data loader to use. Default varies by model.')
-    group.add_argument('--train-data', nargs='+', default=None,
-                       help='Whitespace separated paths or corpora names '
-                       'for training.')
-    group.add_argument('--valid-data', nargs='*', default=None,
-                       help='path(s) to the validation data.')
-    group.add_argument('--test-data', nargs='*', default=None,
-                       help='path(s) to the testing data.')
-
-    # arguments for binary data loader
-    # arguments for numpy data loader
-    group.add_argument('--input-data-sizes-file', type=str, default='sizes.txt',
-                       help='the filename containing all the shards sizes for numpy data loader')
 
     return parser
```
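For orientation, the new `_add_gpt2_args` group plugs into the same chained-parser pattern the other `_add_*_args` helpers use: each helper adds an argument group and returns the parser. Below is a minimal, self-contained sketch of that pattern; only the option names come from the diff, and the surrounding `parse_args` is simplified, omitting the other groups and the `extra_args_provider`/`defaults` handling of the real file.

```python
# Simplified sketch of the chained-parser pattern; not the real
# megatron/arguments.py.  Only the option names mirror the diff.
import argparse


def _add_gpt2_args(parser):
    group = parser.add_argument_group(title='gpt2')
    group.add_argument('--reset-position-ids', action='store_true',
                       help='Reset position ids after end-of-document token.')
    group.add_argument('--reset-attention-mask', action='store_true',
                       help='Reset self attention mask after end-of-document token.')
    group.add_argument('--eod-mask-loss', action='store_true',
                       help='Mask loss for the end of document tokens.')
    return parser


def parse_args(argv=None):
    parser = argparse.ArgumentParser(description='Megatron arguments (sketch)')
    # Data args (subset): the new --merge-file rides along with --vocab-file.
    parser.add_argument('--vocab-file', type=str, required=True,
                        help='Path to the vocab file.')
    parser.add_argument('--merge-file', type=str, default=None,
                        help='Path to the BPE merge file.')
    # Each _add_*_args helper returns the same parser, so calls chain cleanly.
    parser = _add_gpt2_args(parser)
    return parser.parse_args(argv)


if __name__ == '__main__':
    args = parse_args(['--vocab-file', 'gpt2-vocab.json',
                       '--merge-file', 'gpt2-merges.txt',
                       '--eod-mask-loss'])
    print(args.merge_file, args.eod_mask_loss)
```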
megatron/tokenizer/tokenizer.py (+28 −0)

```diff
@@ -19,6 +19,7 @@
 from abc import ABC
 from abc import abstractmethod
 
 from .bert_tokenization import FullTokenizer as FullBertTokenizer
+from .gpt2_tokenization import GPT2Tokenizer
 
 def build_tokenizer(args):
@@ -28,9 +29,13 @@ def build_tokenizer(args):
           flush=True)
 
     # Select and instantiate the tokenizer.
+    assert args.vocab_file is not None
     if args.tokenizer_type == 'BertWordPieceLowerCase':
         tokenizer = _BertWordPieceTokenizer(vocab_file=args.vocab_file,
                                             lower_case=True)
+    elif args.tokenizer_type == 'GPT2BPETokenizer':
+        assert args.merge_file is not None
+        tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file)
     else:
         raise NotImplementedError('{} tokenizer is not '
                                   'implemented.'.format(args.tokenizer_type))
@@ -129,3 +134,26 @@ class _BertWordPieceTokenizer(AbstractTokenizer):
     @property
     def pad(self):
         return self.pad_id
+
+
+class _GPT2BPETokenizer(AbstractTokenizer):
+    """Original GPT2 BPE tokenizer."""
+
+    def __init__(self, vocab_file, merge_file):
+        name = 'GPT2 BPE'
+        super().__init__(name)
+
+        self.tokenizer = GPT2Tokenizer(vocab_file, merge_file, errors='replace',
+                                       special_tokens=[], max_len=None)
+        self.eod_id = self.tokenizer.encoder['<|endoftext|>']
+
+    @property
+    def vocab_size(self):
+        return len(self.tokenizer.encoder)
+
+    def tokenize(self, text):
+        return self.tokenizer.encode(text)
+
+    @property
+    def eod(self):
+        return self.eod_id
```
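The new class follows the same thin-wrapper pattern as `_BertWordPieceTokenizer`: the trainer only ever touches `vocab_size`, `tokenize`, and `eod`. Here is a toy, self-contained illustration of that interface; a whitespace tokenizer stands in for `GPT2Tokenizer` so it runs without vocab/merge files, and nothing in it is Megatron code except the property and method names.

```python
# Toy stand-in for the wrapper interface added in tokenizer.py.
# Only the names (vocab_size, tokenize, eod) mirror the diff.
from abc import ABC, abstractmethod


class _AbstractTokenizerSketch(ABC):
    def __init__(self, name):
        self.name = name

    @property
    @abstractmethod
    def vocab_size(self):
        pass

    @abstractmethod
    def tokenize(self, text):
        pass


class _ToyBPETokenizer(_AbstractTokenizerSketch):
    """Whitespace 'BPE' with an <|endoftext|> end-of-document token."""

    def __init__(self):
        super().__init__('toy BPE')
        # Tiny fixed vocabulary; '<|endoftext|>' plays the role of EOD.
        self.encoder = {'<|endoftext|>': 0, 'hello': 1, 'world': 2}
        self.eod_id = self.encoder['<|endoftext|>']

    @property
    def vocab_size(self):
        return len(self.encoder)

    def tokenize(self, text):
        return [self.encoder[token] for token in text.split()]

    @property
    def eod(self):
        return self.eod_id


tok = _ToyBPETokenizer()
print(tok.vocab_size)                              # 3
print(tok.tokenize('hello world <|endoftext|>'))   # [1, 2, 0]
print(tok.eod)                                     # 0
```

Downstream, the end-of-document id exposed by `eod` is what the GPT2 batch producer uses when `--reset-position-ids`, `--reset-attention-mask`, or `--eod-mask-loss` are set.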
pretrain_gpt2.py (+17 −33)

```diff
@@ -17,20 +17,16 @@
 import torch
 
-from gpt2_data_loader import make_gpt2_dataloaders
 from megatron import get_args
 from megatron import get_timers
-from configure_data import configure_data
+from gpt2_data_loader import make_gpt2_dataloaders
 from megatron import mpu
+from megatron import print_rank_0
 from megatron.model import GPT2Model
+from megatron.training import pretrain
 from megatron.utils import get_ltor_masks_and_position_ids
-from megatron import print_rank_0
 from megatron.utils import reduce_losses
-from megatron.utils import vocab_size_with_padding
-from megatron.training import pretrain
+import os
 
 
 def model_provider():
     """Build the model."""
@@ -97,7 +93,7 @@ def forward_step(data_iterator, model):
     # Get the batch.
     timers('batch generator').start()
     tokens, labels, loss_mask, attention_mask, position_ids = get_batch(
-        data_iterator, args, timers)
+        data_iterator)
     timers('batch generator').stop()
 
     # Forward model.
@@ -121,28 +117,17 @@ def get_train_val_test_data():
     # Data loader only on rank 0 of each model parallel group.
     if mpu.get_model_parallel_rank() == 0:
-        if args.data_loader == 'numpy':
-            assert len(args.train_data) == 1
-            args.train_data = args.train_data[0]
-            assert len(args.valid_data) == 1
-            args.valid_data = args.valid_data[0]
-            assert len(args.test_data) == 1
-            args.test_data = args.test_data[0]
+        args.cache_dir = 'cache'
+        args.train_data = os.path.join(args.data_path, 'train')
+        args.valid_data = os.path.join(args.data_path, 'valid')
+        args.test_data = os.path.join(args.data_path, 'test')
         (train_data, val_data, test_data), num_tokens, \
             eod_token = make_gpt2_dataloaders(args)
-        elif args.data_loader == 'raw' or args.data_loader == 'lazy':
-            data_config = configure_data()
-            data_config.set_defaults(data_set_type='GPT2', transpose=False)
-            (train_data, val_data, test_data), tokenizer = data_config.apply(
-                args)
-            num_tokens = tokenizer.num_tokens
-            eod_token = tokenizer.get_command('eos').Id
-            assert eod_token == tokenizer.get_command('pad').Id
-        else:
-            print("Unsupported data loader for GPT2.")
-            exit(1)
         # pad.
-        num_tokens = vocab_size_with_padding(num_tokens, args)
+        from megatron.tokenizer.tokenizer import _vocab_size_with_padding
+        num_tokens = _vocab_size_with_padding(num_tokens, args)
         print_rank_0('> found end-of-document token: {}'.format(eod_token))
         token_counts = torch.cuda.LongTensor([num_tokens, eod_token,
                                               int(args.do_train),
@@ -161,7 +146,6 @@ def get_train_val_test_data():
     args.do_valid = token_counts[3].item()
     args.do_test = token_counts[4].item()
     args.vocab_size = num_tokens
-    args.eod_token = eod_token
 
     return train_data, val_data, test_data
@@ -169,5 +153,5 @@ def get_train_val_test_data():
 
 if __name__ == "__main__":
 
-    pretrain(get_train_val_test_data, model_provider, forward_step)
+    pretrain(get_train_val_test_data, model_provider, forward_step,
+             args_defaults={'tokenizer_type': 'GPT2BPETokenizer'})
```
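Two details are worth calling out. First, the entry point now passes `args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}` to `pretrain`, so the GPT2 BPE path is selected by default without extra flags. Second, vocab padding moves from `megatron.utils.vocab_size_with_padding` to `_vocab_size_with_padding` next to the tokenizer. The sketch below shows what such a helper typically computes; the exact multiple Megatron uses is an assumption here (`make-vocab-size-divisible-by` times the model-parallel size), but the idea is to round the vocabulary up so the embedding table splits evenly across model-parallel ranks.

```python
# Illustrative sketch only; not the actual Megatron implementation.
def vocab_size_with_padding_sketch(orig_vocab_size,
                                   make_vocab_size_divisible_by=128,
                                   model_parallel_size=1):
    """Pad the vocab size up to the next multiple that partitions
    evenly across model-parallel ranks."""
    multiple = make_vocab_size_divisible_by * model_parallel_size
    padded = orig_vocab_size
    while padded % multiple != 0:
        padded += 1
    return padded


# GPT-2's 50257-entry BPE vocabulary padded for a single rank:
print(vocab_size_with_padding_sketch(50257))  # 50304
```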