Commit 0e5dfd7f authored by Mohammad

added gpt2 tokenizer

parent b6e0377b
+10 −18
@@ -35,6 +35,8 @@ def parse_args(extra_args_provider=None, defaults={}):
     parser = _add_validation_args(parser)
     parser = _add_data_args(parser)
     parser = _add_autoresume_args(parser)
+    # TODO: Refactor
+    parser = _add_gpt2_args(parser)
 
     # Custom arguments.
     if extra_args_provider is not None:
@@ -293,6 +295,8 @@ def _add_data_args(parser):
                        'validation and 5% for test.')
     group.add_argument('--vocab-file', type=str, required=True,
                        help='Path to the vocab file.')
+    group.add_argument('--merge-file', type=str, default=None,
+                       help='Path to the BPE merge file.')
     group.add_argument('--seq-length', type=int, required=True,
                        help="Maximum sequence length to process.")
     group.add_argument('--mask-prob', type=float, default=0.15,
@@ -330,19 +334,19 @@ def _add_autoresume_args(parser):
 ########################################################################
 
 
-def add_training_args_(parser):
-    """Training arguments."""
+def _add_gpt2_args(parser):
+    group = parser.add_argument_group(title='gpt2')
 
-    group = parser.add_argument_group('train', 'training configurations')
-
-    # Batch producer arguments
     group.add_argument('--input-data-sizes-file', type=str, default='sizes.txt',
                        help='The filename containing all the shards '
                        'sizes for numpy data loader')
     group.add_argument('--reset-position-ids', action='store_true',
                        help='Reset position ids after end-of-document token.')
     group.add_argument('--reset-attention-mask', action='store_true',
                        help='Reset self attention mask after '
                        'end-of-document token.')
     group.add_argument('--eod-mask-loss', action='store_true',
-                       help='Mask loss for the end of document tokens')
+                       help='Mask loss for the end of document tokens.')
 
     return parser

@@ -411,18 +415,6 @@ def add_data_args_(parser):
                        choices=['raw', 'lazy', 'tfrecords', 'numpy', 'binary'],
                        help='Which data loader to use. Default varies by model.')
 
-    group.add_argument('--train-data', nargs='+', default=None,
-                       help='Whitespace separated paths or corpora names '
-                       'for training.')
-    group.add_argument('--valid-data', nargs='*', default=None,
-                       help='path(s) to the validation data.')
-    group.add_argument('--test-data', nargs='*', default=None,
-                       help='path(s) to the testing data.')
-
-    # arguments for binary data loader
-    # arguments for numpy data loader
-    group.add_argument('--input-data-sizes-file', type=str, default='sizes.txt',
-                       help='the filename containing all the shards sizes for numpy data loader')
 
     return parser
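
For context, here is a minimal, self-contained sketch of what the relocated 'gpt2' argument group parses. It rebuilds the group on a fresh parser instead of importing the real module, so the other required Megatron flags (e.g. --vocab-file) are omitted:

```python
import argparse

# Rebuild just the new 'gpt2' argument group from the hunk above.
parser = argparse.ArgumentParser()
group = parser.add_argument_group(title='gpt2')
group.add_argument('--input-data-sizes-file', type=str, default='sizes.txt')
group.add_argument('--reset-position-ids', action='store_true')
group.add_argument('--reset-attention-mask', action='store_true')
group.add_argument('--eod-mask-loss', action='store_true')

# argparse converts dashes to underscores on the namespace.
args = parser.parse_args(['--reset-position-ids', '--eod-mask-loss'])
assert args.reset_position_ids and args.eod_mask_loss
assert args.input_data_sizes_file == 'sizes.txt'
```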
+28 −0
@@ -19,6 +19,7 @@ from abc import ABC
 from abc import abstractmethod
 
 from .bert_tokenization import FullTokenizer as FullBertTokenizer
+from .gpt2_tokenization import GPT2Tokenizer
 
 
 def build_tokenizer(args):
@@ -28,9 +29,13 @@ def build_tokenizer(args):
               flush=True)
 
     # Select and instantiate the tokenizer.
+    assert args.vocab_file is not None
     if args.tokenizer_type == 'BertWordPieceLowerCase':
         tokenizer = _BertWordPieceTokenizer(vocab_file=args.vocab_file,
                                             lower_case=True)
+    elif args.tokenizer_type == 'GPT2BPETokenizer':
+        assert args.merge_file is not None
+        tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file)
     else:
         raise NotImplementedError('{} tokenizer is not '
                                   'implemented.'.format(args.tokenizer_type))
@@ -129,3 +134,26 @@ class _BertWordPieceTokenizer(AbstractTokenizer):
     @property
     def pad(self):
         return self.pad_id
+
+
+class _GPT2BPETokenizer(AbstractTokenizer):
+    """Original GPT2 BPE tokenizer."""
+
+    def __init__(self, vocab_file, merge_file):
+        name = 'GPT2 BPE'
+        super().__init__(name)
+
+        self.tokenizer = GPT2Tokenizer(vocab_file, merge_file, errors='replace',
+                                       special_tokens=[], max_len=None)
+        self.eod_id = self.tokenizer.encoder['<|endoftext|>']
+
+    @property
+    def vocab_size(self):
+        return len(self.tokenizer.encoder)
+
+    def tokenize(self, text):
+        return self.tokenizer.encode(text)
+
+    @property
+    def eod(self):
+        return self.eod_id
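
A hedged usage sketch for the new class; the vocab and merge file paths below are placeholders and should point at a real GPT-2 vocab.json/merges.txt pair:

```python
from megatron.tokenizer.tokenizer import _GPT2BPETokenizer

# Placeholder paths; replace with an actual GPT-2 vocab/merge pair.
tokenizer = _GPT2BPETokenizer('gpt2-vocab.json', 'gpt2-merges.txt')

ids = tokenizer.tokenize('Hello world.')   # list of BPE token ids
print(tokenizer.vocab_size)                # size of the BPE encoder table
print(tokenizer.eod)                       # id of '<|endoftext|>'
```

Going through build_tokenizer instead selects this class when args.tokenizer_type is 'GPT2BPETokenizer', after asserting that both --vocab-file and --merge-file were supplied.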
+17 −33
@@ -17,20 +17,16 @@
 
 import torch
 
-
+from gpt2_data_loader import make_gpt2_dataloaders
 from megatron import get_args
 from megatron import get_timers
-
-from configure_data import configure_data
-from gpt2_data_loader import make_gpt2_dataloaders
 from megatron import mpu
+from megatron import print_rank_0
 from megatron.model import GPT2Model
+from megatron.training import pretrain
 from megatron.utils import get_ltor_masks_and_position_ids
-from megatron import print_rank_0
 from megatron.utils import reduce_losses
-from megatron.utils import vocab_size_with_padding
-from megatron.training import pretrain
 
+import os
 
 def model_provider():
     """Build the model."""
@@ -97,7 +93,7 @@ def forward_step(data_iterator, model):
     # Get the batch.
     timers('batch generator').start()
     tokens, labels, loss_mask, attention_mask, position_ids = get_batch(
-        data_iterator, args, timers)
+        data_iterator)
     timers('batch generator').stop()
 
     # Forward model.
@@ -121,28 +117,17 @@ def get_train_val_test_data():
 
     # Data loader only on rank 0 of each model parallel group.
     if mpu.get_model_parallel_rank() == 0:
-        if args.data_loader == 'numpy':
-            assert len(args.train_data) == 1
-            args.train_data = args.train_data[0]
-            assert len(args.valid_data) == 1
-            args.valid_data = args.valid_data[0]
-            assert len(args.test_data) == 1
-            args.test_data = args.test_data[0]
 
+        args.cache_dir = 'cache'
+        args.train_data = os.path.join(args.data_path, 'train')
+        args.valid_data = os.path.join(args.data_path, 'valid')
+        args.test_data = os.path.join(args.data_path, 'test')
+        (train_data, val_data, test_data), num_tokens, \
+            eod_token = make_gpt2_dataloaders(args)
-        elif args.data_loader == 'raw' or args.data_loader == 'lazy':
-            data_config = configure_data()
-            data_config.set_defaults(data_set_type='GPT2', transpose=False)
-            (train_data, val_data, test_data), tokenizer = data_config.apply(
-                args)
-            num_tokens = tokenizer.num_tokens
-            eod_token = tokenizer.get_command('eos').Id
-            assert eod_token == tokenizer.get_command('pad').Id
-        else:
-            print("Unsupported data loader for GPT2.")
-            exit(1)
 
         # pad.
-        num_tokens = vocab_size_with_padding(num_tokens, args)
+        from megatron.tokenizer.tokenizer import _vocab_size_with_padding
+        num_tokens = _vocab_size_with_padding(num_tokens, args)
         print_rank_0('> found end-of-document token: {}'.format(eod_token))
         token_counts = torch.cuda.LongTensor([num_tokens, eod_token,
                                               int(args.do_train),
@@ -161,7 +146,6 @@ def get_train_val_test_data():
     args.do_valid = token_counts[3].item()
     args.do_test = token_counts[4].item()
 
-    args.vocab_size = num_tokens
     args.eod_token = eod_token
 
     return train_data, val_data, test_data
@@ -169,5 +153,5 @@ def get_train_val_test_data():
 
 if __name__ == "__main__":
 
-    pretrain(get_train_val_test_data,
-             model_provider, forward_step)
+    pretrain(get_train_val_test_data, model_provider, forward_step,
+             args_defaults={'tokenizer_type': 'GPT2BPETokenizer'})
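
The _vocab_size_with_padding call imported above replaces the old vocab_size_with_padding helper. As a sketch of the usual Megatron padding rule it implements: the vocabulary is rounded up to a multiple of the divisibility factor times the model-parallel world size, so each partition gets an equal slice. The attribute names below are assumptions, not something this diff shows:

```python
# Round the vocabulary up so it divides evenly across model-parallel
# partitions (assumed attributes: make_vocab_size_divisible_by,
# model_parallel_size).
def vocab_size_with_padding_sketch(orig_vocab_size, args):
    multiple = args.make_vocab_size_divisible_by * args.model_parallel_size
    padded = orig_vocab_size
    while padded % multiple != 0:
        padded += 1
    return padded
```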