Commit a1d04b79 authored by Jared Casper

Updating public repo with latest changes.

parent 93ab4bea

.gitignore

0 → 100644
+1 −0
__pycache__
+46 −2
@@ -114,7 +114,8 @@ def add_training_args(parser):
                       help='report interval')
    group.add_argument('--exit-interval', type=int, default=None,
                       help='Exit the program after this many new iterations.')

    group.add_argument('--tensorboard-dir', type=str, default=None,
                       help='Write TensorBoard logs to this directory')
    group.add_argument('--seed', type=int, default=1234,
                       help='random seed')
    # Batch producer arguments
@@ -123,6 +124,8 @@ def add_training_args(parser):
    group.add_argument('--reset-attention-mask', action='store_true',
                       help='Reset self attention mask after '
                       'end-of-document token.')
    group.add_argument('--eod-mask-loss', action='store_true',
                       help='Mask loss for the end of document tokens')

    # Learning rate.
    group.add_argument('--lr-decay-iters', type=int, default=None,
@@ -133,9 +136,25 @@ def add_training_args(parser):
                       help='learning rate decay function')
    group.add_argument('--lr', type=float, default=1.0e-4,
                       help='initial learning rate')
    group.add_argument('--min-lr', type=float, default=0.0,
                       help='Minimum value for learning rate. The scheduler '
                       'clips values below this threshold.')
    group.add_argument('--warmup', type=float, default=0.01,
                       help='percentage of data to warmup on (.01 = 1% of all '
                       'training iters). Default 0.01')
    group.add_argument('--override-lr-scheduler', action='store_true',
                       help='Reset the values of the scheduler (learning rate, '
                       'warmup iterations, minimum learning rate, maximum '
                       'number of iterations, and decay style) from input '
                       'arguments and ignore values from checkpoints. Note '
                       'that all the above values will be reset.')
    group.add_argument('--use-checkpoint-lr-scheduler', action='store_true',
                       help='Use checkpoint to set the values of the scheduler '
                       '(learning rate, warmup iterations, minimum learning '
                       'rate, maximum number of iterations, and decay style) '
                       'from the checkpoint and ignore input arguments. Note '
                       'that all the above values will be reset.')
    # model checkpointing
    group.add_argument('--save', type=str, default=None,
                       help='Output directory to save checkpoints to.')
@@ -163,8 +182,17 @@ def add_training_args(parser):
    group.add_argument('--distributed-backend', default='nccl',
                       help='which backend to use for distributed '
                       'training. One of [gloo, nccl]')
    group.add_argument('--DDP-impl', default='local',
                       help='which DistributedDataParallel implementation '
                       'to use. One of [local, torch]')
    group.add_argument('--local_rank', type=int, default=None,
                       help='local rank passed from distributed launcher')
    # autoresume
    group.add_argument('--adlr-autoresume', action='store_true',
                       help='enable autoresume on adlr cluster.')
    group.add_argument('--adlr-autoresume-interval', type=int, default=1000,
                       help='intervals over which to check for autoresume '
                       'termination signal')

    return parser

@@ -193,6 +221,8 @@ def add_evaluation_args(parser):
                       help='sliding window for overlapping eval ')
    group.add_argument('--cloze-eval', action='store_true',
                       help='Evaluation dataset from `--valid-data` is a cloze task')
    group.add_argument('--strict-lambada', action='store_true',
                       help='use more difficult formulation of lambada')
    group.add_argument('--eval-hf', action='store_true',
                       help='perform evaluation with huggingface openai model. '
                       'use `--load` to specify weights path to be loaded')
@@ -207,9 +237,23 @@ def add_text_generate_args(parser):

    group = parser.add_argument_group('Text generation', 'configurations')
    group.add_argument("--temperature", type=float, default=1.0)
    group.add_argument("--greedy", action='store_true', default=False)
    group.add_argument("--top_p", type=float, default=0.0)
    group.add_argument("--top_k", type=int, default=0)
    group.add_argument("--out-seq-length", type=int, default=256)
    group.add_argument("--out-seq-length", type=int, default=1024)
    group.add_argument("--sample-input-file", type=str, default="",
                      help='get input from file instead of interactive mode, '
                           'each line is an input' )
    group.add_argument("--sample-output-file", type=str, default="",
                      help='output file for results from --sample-input-file')
    group.add_argument("--num-samples", type=int, default=0,
                       help='number of samples to generate unconditionally, '
                       'defaults to 0 (interactive conditional sampling)')
    group.add_argument("--genfile", type=str,
                       help='output file when generating unconditionally')
    group.add_argument("--recompute", action='store_true',
                       help='during generation recompute all attention '
                       'instead of using previously computed keys/values.')
    return parser
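
The new generation flags above (--greedy, --temperature, --top_k, --top_p) decide how the next token is drawn. For reference, below is a minimal sketch of how such flags are commonly combined into a single sampling step; the function name and exact logic are illustrative assumptions, not code from this commit, and the repository's generation path may differ.

import torch

def sample_next_token(logits, greedy=False, temperature=1.0, top_k=0, top_p=0.0):
    # logits: 1D tensor of vocabulary scores for the next position.
    if greedy:
        return torch.argmax(logits).item()
    logits = logits / temperature
    if top_k > 0:
        # Keep only the k highest-scoring tokens.
        kth_best = torch.topk(logits, top_k).values[-1]
        logits[logits < kth_best] = float('-inf')
    if top_p > 0.0:
        # Nucleus filtering: keep the smallest prefix of sorted tokens whose
        # cumulative probability exceeds top_p, always retaining the best one.
        sorted_logits, sorted_idx = torch.sort(logits, descending=True)
        cumulative = torch.softmax(sorted_logits, dim=-1).cumsum(dim=-1)
        drop = cumulative > top_p
        drop[1:] = drop[:-1].clone()
        drop[0] = False
        logits[sorted_idx[drop]] = float('-inf')
    probs = torch.softmax(logits, dim=-1)
    return torch.multinomial(probs, num_samples=1).item()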


+2 −1
@@ -148,7 +148,8 @@ def make_loaders(args):
        'model_type': args.tokenizer_model_type,
        'cache_dir': args.cache_dir,
        'max_preds_per_seq': args.max_preds_per_seq,
        'presplit_sentences': args.presplit_sentences}
        'presplit_sentences': args.presplit_sentences,
        'parallel_group': mpu.get_data_parallel_group()}

    eval_set_args = copy.copy(data_set_args)
    eval_set_args['split'] = [1.]
+19 −6
@@ -16,6 +16,8 @@
import os
import math

import torch

from .samplers import DistributedBatchSampler
from .datasets import json_dataset, csv_dataset, split_ds, ConcatDataset, SplitDataset, bert_sentencepair_dataset, GPT2Dataset
from .lazy_loader import exists_lazy, make_lazy, lazy_array_loader
@@ -61,7 +63,8 @@ def supported_corpus(corpus_name):
def make_dataset(path, seq_length, text_key, label_key, lazy=False, process_fn=None, split=[1.],
                delim=',', loose=False, binarize_sent=False, drop_unlabeled=False, tokenizer=None,
                tokenizer_type='CharacterLevelTokenizer', tokenizer_model_path=None, vocab_size=None,
                model_type='bpe', pad_token=0, character_converage=1.0, non_binary_cols=None, **kwargs):
                model_type='bpe', pad_token=0, character_converage=1.0, non_binary_cols=None,
                 parallel_group=None, **kwargs):
    """function to create datasets+tokenizers for common options"""
    if isinstance(process_fn, str):
        process_fn = eval(process_fn)
@@ -76,11 +79,19 @@ def make_dataset(path, seq_length, text_key, label_key, lazy=False, process_fn=N
                named_corpora = True
                name = path_
                path_ = corpora.NAMED_CORPORA[path_].PATH
            if not exists_lazy(path_, data_type='data'):
            if torch.distributed.get_rank() == 0 and not exists_lazy(path_, data_type='data'):
                # create cached version of dataset for lazy loading if it doesn't exist
                text = get_dataset(name if named_corpora else path_, text_key=text_key, label_key=label_key, binarize_sent=binarize_sent,
                    delim=delim, drop_unlabeled=drop_unlabeled, loose_json=loose)
                make_lazy(path_, text.X, data_type='data')
            # This should be a barrier but nccl barrier assumes
            # device_index=rank which is not the case for model
            # parallel case
            counts = torch.cuda.LongTensor([1])
            torch.distributed.all_reduce(counts, group=parallel_group)
            assert counts[0].item() == torch.distributed.get_world_size(
                group=parallel_group)

            text = lazy_array_loader(path_, data_type='data', map_fn=process_fn)
        else:
            # get dataset
@@ -107,15 +118,17 @@ def make_dataset(path, seq_length, text_key, label_key, lazy=False, process_fn=N
    # Split dataset into train/val/test (and wrap bert dataset)
    if should_split(split):
        ds = split_ds(ds, split)
        if ds_type.lower() == 'bert':
        if 'bert' in ds_type.lower():
            presplit_sentences = kwargs['presplit_sentences'] if 'presplit_sentences' in kwargs else False
            ds = [bert_sentencepair_dataset(d, max_seq_len=seq_length, presplit_sentences=presplit_sentences)  if d is not None else None  for d in ds]
            dstype = bert_sentencepair_dataset
            ds = [dstype(d, max_seq_len=seq_length, presplit_sentences=presplit_sentences)  if d is not None else None  for d in ds]
        elif ds_type.lower() == 'gpt2':
            ds = [GPT2Dataset(d, max_seq_len=seq_length) if d is not None else None for d in ds]
    else:
        if ds_type.lower() == 'bert':
        if 'bert' in ds_type.lower():
            presplit_sentences = kwargs['presplit_sentences'] if 'presplit_sentences' in kwargs else False
            ds = bert_sentencepair_dataset(ds, max_seq_len=seq_length, presplit_sentences=presplit_sentences)
            dstype = bert_sentencepair_dataset
            ds = dstype(ds, max_seq_len=seq_length, presplit_sentences=presplit_sentences)
        elif ds_type.lower() == 'gpt2':
            ds = GPT2Dataset(ds, max_seq_len=seq_length)
    return ds, tokenizer
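
In the hunk above, only rank 0 materializes the lazy on-disk cache, and the all_reduce that follows stands in for a barrier because, as the in-code comment notes, the NCCL barrier assumes device_index == rank, which does not hold under model parallelism. Condensed into a standalone sketch (the function and argument names here are illustrative, not from the repository):

import torch
import torch.distributed as dist

def build_cache_then_wait(path, build_fn, parallel_group=None):
    # Rank 0 builds the cache; other ranks skip straight to the collective.
    if dist.get_rank() == 0:
        build_fn(path)
    # The all_reduce acts as a stand-in barrier: every rank in the group
    # blocks on the collective until rank 0 joins it, i.e. after the cache
    # has been written.
    counts = torch.cuda.LongTensor([1])
    dist.all_reduce(counts, group=parallel_group)
    assert counts[0].item() == dist.get_world_size(group=parallel_group)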

data_utils/datasets.py

100644 → 100755
+18 −5
@@ -461,6 +461,7 @@ class GPT2Dataset(data.Dataset):
                 weighted=True,
                 sample_across_doc=True,
                 random_across_doc_sampling=True,
                 bias_for_single_doc=False,
                 sentence_start=False, **kwargs):
        self.ds = ds
        self.ds_len = len(self.ds)
@@ -473,6 +474,7 @@ class GPT2Dataset(data.Dataset):
        self.weighted = weighted
        self.sample_across_doc = sample_across_doc
        self.random_across_doc_sampling = random_across_doc_sampling
        self.bias_for_single_doc = bias_for_single_doc
        self.sentence_start = sentence_start
        self.init_weighting()

@@ -510,7 +512,10 @@ class GPT2Dataset(data.Dataset):

        # truncate or pad tokens
        num_tokens = len(tokens)
        if self.bias_for_single_doc:
            tokens_to_strip = num_tokens - self.max_seq_len - 1
        else:
            tokens_to_strip = num_tokens - 1
        if tokens_to_strip > 0:
            strip_left_tokens = rng.randint(tokens_to_strip + 1)
            tokens = tokens[strip_left_tokens:]
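
The effect of the new bias_for_single_doc flag is easiest to see with concrete numbers; the values below are made up for illustration and do not come from the repository.

# How many leading tokens may be stripped before a sample is cut to max_seq_len.
num_tokens, max_seq_len = 2000, 1024   # illustrative values only

# bias_for_single_doc=True: at most num_tokens - max_seq_len - 1 tokens are
# stripped, so at least max_seq_len + 1 tokens of the current document remain
# and the sample tends to come from a single document.
tokens_to_strip_biased = num_tokens - max_seq_len - 1    # 975

# bias_for_single_doc=False (default): up to num_tokens - 1 tokens may be
# stripped, so the remaining chunk can be short and the rest of the sequence
# is filled from other documents via sample_across_doc.
tokens_to_strip_default = num_tokens - 1                 # 1999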
@@ -758,7 +763,8 @@ class bert_sentencepair_dataset(data.Dataset):
        """
        tokens_a, token_types_a = a
        tokens_b, token_types_b = b
        max_num_tokens = max_seq_len - 3
        max_num_tokens = self.calc_seq_len(max_seq_len)
        # max_num_tokens = max_seq_len - 3
        while True:
            len_a = len(tokens_a)
            len_b = len(tokens_b)
@@ -782,6 +788,9 @@ class bert_sentencepair_dataset(data.Dataset):
                trunc_types.pop()
        return (tokens_a, token_types_a), (tokens_b, token_types_b)

    def calc_seq_len(self, max_seq_len):
        return max_seq_len - 3

    def mask_token(self, idx, tokens, types, vocab_words, rng):
        """
        helper function to mask `idx` token from `tokens` according to
@@ -807,6 +816,11 @@ class bert_sentencepair_dataset(data.Dataset):
        seq += [self.tokenizer.get_command('pad').Id] * num_pad
        return seq, pad_mask

    def concat_tokens(self, tokens_a, token_types_a, tokens_b, token_types_b):
        tokens = [self.tokenizer.get_command('ENC').Id] + tokens_a + [self.tokenizer.get_command('sep').Id] + tokens_b + [self.tokenizer.get_command('sep').Id]
        token_types = [token_types_a[0]] + token_types_a + [token_types_a[0]] + token_types_b + [token_types_b[0]]
        return tokens, token_types

    def create_masked_lm_predictions(self, a, b, mask_lm_prob, max_preds_per_seq, vocab_words, rng):
        """
        Mask sequence pair for BERT training according to:
@@ -814,8 +828,7 @@ class bert_sentencepair_dataset(data.Dataset):
        """
        tokens_a, token_types_a = a
        tokens_b, token_types_b = b
        tokens = [self.tokenizer.get_command('ENC').Id] + tokens_a + [self.tokenizer.get_command('sep').Id] + tokens_b + [self.tokenizer.get_command('sep').Id]
        token_types = [token_types_a[0]] + token_types_a + [token_types_a[0]] + token_types_b + [token_types_b[0]]
        tokens, token_types = self.concat_tokens(tokens_a, token_types_a, tokens_b, token_types_b)

        len_a = len(tokens_a)
        len_b = len(tokens_b)