Commit 1a1cfcff authored by Jared Casper

Plumbing for new albert dataset, including mods to arguments for data loaders.

parent 0058b1a5
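
For orientation, here is a minimal sketch of the new argument surface this commit introduces. The module path and every value below are assumptions for illustration, not part of the commit; `add_data_args` is the function modified in the diff that follows.

    # Hedged example: exercising the new data arguments (values are made up).
    import argparse
    from arguments import add_data_args  # assumed module path

    parser = add_data_args(argparse.ArgumentParser())
    args = parser.parse_args([
        '--data-loader', 'binary',          # new consolidated loader choice
        '--data-path', 'my-corpus_prefix',  # hypothetical indexed-dataset prefix
        '--split', '1000,1,1',
        '--vocab', 'vocab.txt',
        '--data-impl', 'mmap',
        '--mask-prob', '0.15',
        '--short-seq-prob', '0.1',
    ])
    print(args.data_loader, args.data_path, args.split)
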
+43 −33
@@ -267,23 +267,52 @@ def add_data_args(parser):
    group.add_argument('--shuffle', action='store_true',
                       help='Shuffle data. Shuffling is deterministic '
                       'based on seed and current epoch.')
+    group.add_argument('--data-loader', type=str, default=None,
+                       choices=['raw', 'lazy', 'tfrecords', 'numpy', 'binary'],
+                       help='Which data loader to use. Default varies by model.')

    group.add_argument('--train-data', nargs='+', default=None,
-                       help='Whitespace separated filenames or corpora names '
+                       help='Whitespace separated paths or corpora names '
                       'for training.')
+    group.add_argument('--valid-data', nargs='*', default=None,
+                       help='path(s) to the validation data.')
+    group.add_argument('--test-data', nargs='*', default=None,
+                       help='path(s) to the testing data.')
+    group.add_argument('--data-path', type=str, default=None,
+                       help='path to combined dataset to split')
+    group.add_argument('--split', default='1000,1,1',
+                       help='comma-separated list of proportions for training,'
+                       ' validation, and test split')
+
-    group.add_argument('--use-npy-data-loader', action='store_true',
-                       help='Use the numpy data loader. If set, then'
-                       'train-data-path, val-data-path, and test-data-path'
-                       'should also be provided.')
-    group.add_argument('--train-data-path', type=str, default='',
-                       help='path to the training data')
-    group.add_argument('--val-data-path', type=str, default='',
-                       help='path to the validation data')
-    group.add_argument('--test-data-path', type=str, default='',
-                       help='path to the test data')
+    group.add_argument('--seq-length', type=int, default=512,
+                       help="Maximum sequence length to process")
+    group.add_argument('--max-preds-per-seq', type=int, default=None,
+                       help='Maximum number of predictions to use per sequence.'
+                       'Defaults to math.ceil(`--seq-length`*.15/10)*10.'
+                       'MUST BE SPECIFIED IF `--data-loader tfrecords`.')
+
+    # arguments for binary data loader
+    parser.add_argument('--vocab', type=str, default='vocab.txt',
+                        help='path to vocab file')
+    parser.add_argument('--data-impl', type=str, default='infer',
+                        help='implementation of indexed datasets',
+                        choices=['lazy', 'cached', 'mmap', 'infer'])
+    parser.add_argument('--max-num-samples', type=int, default=None,
+                        help='Maximum number of samples to plan for, defaults to total iters * batch-size.')
+    parser.add_argument('--data-epochs', type=int, default=None,
+                        help='Number of epochs to plan for, defaults to using --max-num-samples')
+    parser.add_argument('--mask-prob', default=0.15, type=float,
+                        help='probability of replacing a token with mask')
+    parser.add_argument('--short-seq-prob', default=0.1, type=float,
+                        help='probability of producing a short sequence')
+    parser.add_argument('--skip-mmap-warmup', action='store_true',
+                        help='skip warming up mmap files')
+
+    # arguments for numpy data loader
    group.add_argument('--input-data-sizes-file', type=str, default='sizes.txt',
-                       help='the filename containing all the shards sizes')
+                       help='the filename containing all the shards sizes for numpy data loader')

+    # arguments for raw/tfrecords data loader
    group.add_argument('--delim', default=',',
                       help='delimiter used to parse csv data files')
    group.add_argument('--text-key', default='sentence',
@@ -291,16 +320,6 @@ def add_data_args(parser):
    group.add_argument('--eval-text-key', default=None,
                       help='key to use to extract text from '
                       'json/csv evaluation datasets')
-    group.add_argument('--valid-data', nargs='*', default=None,
-                       help="""Filename for validation data.""")
-    group.add_argument('--split', default='1000,1,1',
-                       help='comma-separated list of proportions for training,'
-                       ' validation, and test split')
-    group.add_argument('--test-data', nargs='*', default=None,
-                       help="""Filename for testing""")
-
-    group.add_argument('--lazy-loader', action='store_true',
-                       help='whether to lazy read the data set')
    group.add_argument('--loose-json', action='store_true',
                       help='Use loose json (one json-formatted string per '
                       'newline), instead of tight json (data file is one '
@@ -308,6 +327,7 @@ def add_data_args(parser):
    group.add_argument('--presplit-sentences', action='store_true',
                       help='Dataset content consists of documents where '
                       'each document consists of newline separated sentences')

    group.add_argument('--num-workers', type=int, default=2,
                       help="""Number of workers to use for dataloading""")
    group.add_argument('--tokenizer-model-type', type=str,
@@ -328,16 +348,6 @@ def add_data_args(parser):
                       help='what type of tokenizer to use')
    group.add_argument("--cache-dir", default=None, type=str,
                       help="Where to store pre-trained BERT downloads")
-    group.add_argument('--use-tfrecords', action='store_true',
-                       help='load `--train-data`, `--valid-data`, '
-                       '`--test-data` from BERT tf records instead of '
-                       'normal data pipeline')
-    group.add_argument('--seq-length', type=int, default=512,
-                       help="Maximum sequence length to process")
-    group.add_argument('--max-preds-per-seq', type=int, default=None,
-                       help='Maximum number of predictions to use per sequence.'
-                       'Defaults to math.ceil(`--seq-length`*.15/10)*10.'
-                       'MUST BE SPECIFIED IF `--use-tfrecords` is True.')

    return parser

@@ -355,7 +365,7 @@ def get_args():

    args = parser.parse_args()

-    if not args.train_data and not args.train_data_path:
+    if not args.train_data and not args.data_path:
        print('WARNING: No training data specified')

    args.cuda = torch.cuda.is_available()
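
The net effect in the arguments file is that the old boolean selectors (`--use-tfrecords`, `--lazy-loader`, `--use-npy-data-loader`) are folded into the single `--data-loader` choice. A hypothetical compatibility shim (not part of this commit) makes the mapping explicit:

    # Hypothetical helper: map the removed boolean flags onto the new
    # consolidated --data-loader choices. Not in the commit; illustration only.
    def infer_data_loader(args):
        if getattr(args, 'use_tfrecords', False):
            return 'tfrecords'      # was --use-tfrecords
        if getattr(args, 'use_npy_data_loader', False):
            return 'numpy'          # was --use-npy-data-loader
        if getattr(args, 'lazy_loader', False):
            return 'lazy'           # was --lazy-loader
        return args.data_loader or 'raw'
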
+2 −2
@@ -116,7 +116,7 @@ def make_tfrecord_loaders(args):
def make_loaders(args):
    """makes training/val/test"""

-    if args.use_tfrecords:
+    if args.data_loader == 'tfrecords':
        return make_tfrecord_loaders(args)
    world_size = torch.distributed.get_world_size(
        group=mpu.get_data_parallel_group())
@@ -134,7 +134,7 @@ def make_loaders(args):
    data_set_args = {
        'path': args.train_data,
        'seq_length': seq_length,
-        'lazy': args.lazy_loader,
+        'lazy': args.data_loader == 'lazy',
        'delim': args.delim,
        'text_key': args.text_key,
        'label_key': 'label',
+3 −3
@@ -56,9 +56,9 @@ def make_gpt2_dataloaders(args):
                                           num_workers=num_workers,
                                           pin_memory=True)

-    train = make_data_loader_(args.train_data_path)
-    valid = make_data_loader_(args.val_data_path)
-    test = make_data_loader_(args.test_data_path)
+    train = make_data_loader_(args.train_data)
+    valid = make_data_loader_(args.val_data)
+    test = make_data_loader_(args.test_data)

    args.do_train = False
    args.do_valid = False
+1 −1
from . import indexed_dataset
from .bert_tokenization import FullTokenizer as FullBertTokenizer
-from .dataset import AlbertDataset
+from .albert_dataset import AlbertDataset
+12 −2
@@ -29,8 +29,13 @@ class AlbertDataset(Dataset):
        self.indexed_dataset = indexed_dataset

        # Build the samples mapping.
+        if not num_epochs:
+            if not max_num_samples:
+                raise ValueError("Need to specify either max_num_samples or num_epochs")
+            num_epochs = int(max_num_samples / len(indexed_dataset)) + 1
        if not max_num_samples:
            max_num_samples = len(indexed_dataset) * num_epochs
        print(f"Building the sample map for {num_epochs} epochs or {max_num_samples} samples.")
        self.samples_mapping = helpers.build_mapping(
            indexed_dataset.doc_idx,
            indexed_dataset.sizes,
@@ -52,12 +57,17 @@ class AlbertDataset(Dataset):
    @classmethod
    def from_paths(cls, vocab, data_prefix, data_impl,
                   num_epochs, max_num_samples, masked_lm_prob,
-                   max_seq_length, short_seq_prob, seed):
+                   max_seq_length, short_seq_prob, seed, skip_warmup=False):
        tokenizer = FullBertTokenizer(vocab, do_lower_case=True)
-        idx_ds = indexed_dataset.make_dataset(data_prefix, data_impl)
+        print("> Reading dataset index")
+        idx_ds = indexed_dataset.make_dataset(data_prefix, data_impl, skip_warmup)
+        print("> Finished creating indexed dataset")
        return cls(idx_ds, tokenizer, num_epochs, max_num_samples, masked_lm_prob,
                   max_seq_length, short_seq_prob, seed)

+    def num_tokens(self):
+        return self.tokenizer.vocab_size()
+
    def __len__(self):
        return self.samples_mapping.shape[0]

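
Tying the plumbing together, a hedged sketch of how the new arguments might reach `AlbertDataset.from_paths`; the import path and wiring are assumptions, while the parameter order mirrors the signature in the diff above:

    # Wiring sketch (assumed import path; argument order follows the diff).
    from data_utils import AlbertDataset

    def build_albert_dataset(args):
        # At least one of --data-epochs / --max-num-samples must be set,
        # per the defaulting logic added to AlbertDataset above.
        return AlbertDataset.from_paths(
            args.vocab,               # --vocab
            args.data_path,           # --data-path
            args.data_impl,           # --data-impl: lazy/cached/mmap/infer
            args.data_epochs,         # --data-epochs (may be None)
            args.max_num_samples,     # --max-num-samples (may be None)
            args.mask_prob,           # --mask-prob
            args.seq_length,          # --seq-length
            args.short_seq_prob,      # --short-seq-prob
            args.seed,                # global seed (assumed present on args)
            skip_warmup=args.skip_mmap_warmup)  # --skip-mmap-warmup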