.gitignore  (new file: 0 → 100644, +1 −0)

+__pycache__

arguments.py  (+46 −2)

@@ -114,7 +114,8 @@ def add_training_args(parser):
                        help='report interval')
     group.add_argument('--exit-interval', type=int, default=None,
                        help='Exit the program after this many new iterations.')
+    group.add_argument('--tensorboard-dir', type=str, default=None, help='Write TensorBoard logs to this directory')
     group.add_argument('--seed', type=int, default=1234,
                        help='random seed')

     # Batch producer arguments
@@ -123,6 +124,8 @@ def add_training_args(parser):
     group.add_argument('--reset-attention-mask', action='store_true',
                        help='Reset self attention mask after '
                        'end-of-document token.')
+    group.add_argument('--eod-mask-loss', action='store_true',
+                       help='Mask loss for the end of document tokens')

     # Learning rate.
     group.add_argument('--lr-decay-iters', type=int, default=None,
@@ -133,9 +136,25 @@ def add_training_args(parser):
                        help='learning rate decay function')
     group.add_argument('--lr', type=float, default=1.0e-4,
                        help='initial learning rate')
+    group.add_argument('--min-lr', type=float, default=0.0,
+                       help='Minimum value for learning rate. The scheduler '
+                       'clips values below this threshold.')
     group.add_argument('--warmup', type=float, default=0.01,
                        help='percentage of data to warmup on (.01 = 1% of all '
                        'training iters). Default 0.01')
+    group.add_argument('--override-lr-scheduler', action='store_true',
+                       help='Reset the values of the scheduler (learning rate, '
+                       'warmup iterations, minimum learning rate, maximum '
+                       'number of iterations, and decay style) from input '
+                       'arguments and ignore values from checkpoints. Note '
+                       'that all the above values will be reset.')
+    group.add_argument('--use-checkpoint-lr-scheduler', action='store_true',
+                       help='Use checkpoint to set the values of the scheduler '
+                       '(learning rate, warmup iterations, minimum learning '
+                       'rate, maximum number of iterations, and decay style) '
+                       'and ignore the values from input arguments. Note '
+                       'that all the above values will be reset.')
     # model checkpointing
     group.add_argument('--save', type=str, default=None,
                        help='Output directory to save checkpoints to.')

@@ -163,8 +182,17 @@ def add_training_args(parser):
     group.add_argument('--distributed-backend', default='nccl',
                        help='which backend to use for distributed '
                        'training. One of [gloo, nccl]')
+    group.add_argument('--DDP-impl', default='local',
+                       help='which DistributedDataParallel implementation '
+                       'to use. One of [local, torch]')
     group.add_argument('--local_rank', type=int, default=None,
                        help='local rank passed from distributed launcher')
+    # autoresume
+    group.add_argument('--adlr-autoresume', action='store_true',
+                       help='enable autoresume on adlr cluster.')
+    group.add_argument('--adlr-autoresume-interval', type=int, default=1000,
+                       help='interval over which to check for the autoresume '
+                       'termination signal')

     return parser

@@ -193,6 +221,8 @@ def add_evaluation_args(parser):
                        help='sliding window for overlapping eval')
     group.add_argument('--cloze-eval', action='store_true',
                        help='Evaluation dataset from `--valid-data` is a cloze task')
+    group.add_argument('--strict-lambada', action='store_true',
+                       help='use more difficult formulation of lambada')
     group.add_argument('--eval-hf', action='store_true',
                        help='perform evaluation with huggingface openai model. '
                        'use `--load` to specify weights path to be loaded')

@@ -207,9 +237,23 @@ def add_text_generate_args(parser):
     group = parser.add_argument_group('Text generation', 'configurations')

     group.add_argument("--temperature", type=float, default=1.0)
     group.add_argument("--greedy", action='store_true', default=False)
     group.add_argument("--top_p", type=float, default=0.0)
     group.add_argument("--top_k", type=int, default=0)
-    group.add_argument("--out-seq-length", type=int, default=256)
+    group.add_argument("--out-seq-length", type=int, default=1024)
+    group.add_argument("--sample-input-file", type=str, default="",
+                       help='get input from file instead of interactive mode, '
+                       'each line is an input')
+    group.add_argument("--sample-output-file", type=str, default="",
+                       help='output file for results from --sample-input-file')
+    group.add_argument("--num-samples", type=int, default=0,
+                       help='number of samples to generate unconditionally; '
+                       'defaults to 0 (interactive conditional sampling)')
+    group.add_argument("--genfile", type=str,
+                       help='output file when generating unconditionally')
+    group.add_argument("--recompute", action='store_true',
+                       help='during generation recompute all attention '
+                       'instead of using previously computed keys/values.')

     return parser
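The new sampling flags follow the usual conventions: `--top_k` keeps only the k most likely tokens, `--top_p` keeps the smallest set of tokens whose cumulative probability reaches p, and `--greedy` bypasses sampling in favor of the argmax token. A minimal sketch of how such a filter is commonly applied to logits before sampling (illustrative only; `top_k_top_p_filter` is not a function from this repo):

```python
import torch

def top_k_top_p_filter(logits, top_k=0, top_p=0.0, filter_value=-float('inf')):
    """Mask logits outside the top-k / nucleus (top-p) set with -inf."""
    if top_k > 0:
        # Keep only the k largest logits in each row.
        kth_value = torch.topk(logits, top_k)[0][..., -1, None]
        logits = logits.masked_fill(logits < kth_value, filter_value)
    if top_p > 0.0:
        # Keep the smallest prefix of probability-sorted tokens whose
        # cumulative probability reaches top_p; drop everything after it.
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)
        remove_sorted = cumulative > top_p
        remove_sorted[..., 1:] = remove_sorted[..., :-1].clone()
        remove_sorted[..., 0] = False  # never drop the most likely token
        remove = remove_sorted.scatter(-1, sorted_indices, remove_sorted)
        logits = logits.masked_fill(remove, filter_value)
    return logits
```

A caller would scale the logits by `1 / temperature`, filter, then sample from the softmax of the result (e.g. with `torch.multinomial`).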
configure_data.py  (+2 −1)

@@ -148,7 +148,8 @@ def make_loaders(args):
                      'model_type': args.tokenizer_model_type,
                      'cache_dir': args.cache_dir,
                      'max_preds_per_seq': args.max_preds_per_seq,
-                     'presplit_sentences': args.presplit_sentences}
+                     'presplit_sentences': args.presplit_sentences,
+                     'parallel_group': mpu.get_data_parallel_group()}

     eval_set_args = copy.copy(data_set_args)
     eval_set_args['split'] = [1.]

data_utils/__init__.py  (+19 −6)

@@ -16,6 +16,8 @@
 import os
 import math

+import torch
+
 from .samplers import DistributedBatchSampler
 from .datasets import json_dataset, csv_dataset, split_ds, ConcatDataset, SplitDataset, bert_sentencepair_dataset, GPT2Dataset
 from .lazy_loader import exists_lazy, make_lazy, lazy_array_loader

@@ -61,7 +63,8 @@ def supported_corpus(corpus_name):
 def make_dataset(path, seq_length, text_key, label_key, lazy=False, process_fn=None, split=[1.],
                  delim=',', loose=False, binarize_sent=False, drop_unlabeled=False, tokenizer=None,
                  tokenizer_type='CharacterLevelTokenizer', tokenizer_model_path=None, vocab_size=None,
-                 model_type='bpe', pad_token=0, character_converage=1.0, non_binary_cols=None, **kwargs):
+                 model_type='bpe', pad_token=0, character_converage=1.0, non_binary_cols=None, parallel_group=None, **kwargs):
     """function to create datasets+tokenizers for common options"""
     if isinstance(process_fn, str):
         process_fn = eval(process_fn)

@@ -76,11 +79,19 @@ def make_dataset(path, seq_length, text_key, label_key, lazy=False, process_fn=None,
             named_corpora = True
             name = path_
             path_ = corpora.NAMED_CORPORA[path_].PATH
-        if not exists_lazy(path_, data_type='data'):
+        if torch.distributed.get_rank() == 0 and not exists_lazy(path_, data_type='data'):
             # create cached version of dataset for lazy loading if it doesn't exist
             text = get_dataset(name if named_corpora else path_, text_key=text_key, label_key=label_key, binarize_sent=binarize_sent,
                 delim=delim, drop_unlabeled=drop_unlabeled, loose_json=loose)
             make_lazy(path_, text.X, data_type='data')
+        # This should be a barrier, but the nccl barrier assumes
+        # device_index == rank, which is not the case in the model
+        # parallel setting
+        counts = torch.cuda.LongTensor([1])
+        torch.distributed.all_reduce(counts,
+                                     group=parallel_group)
+        assert counts[0].item() == torch.distributed.get_world_size(
+            group=parallel_group)
         text = lazy_array_loader(path_, data_type='data', map_fn=process_fn)
     else:
         # get dataset

@@ -107,15 +118,17 @@ def make_dataset(path, seq_length, text_key, label_key, lazy=False, process_fn=None,
     # Split dataset into train/val/test (and wrap bert dataset)
     if should_split(split):
         ds = split_ds(ds, split)
-        if ds_type.lower() == 'bert':
+        if 'bert' in ds_type.lower():
             presplit_sentences = kwargs['presplit_sentences'] if 'presplit_sentences' in kwargs else False
-            ds = [bert_sentencepair_dataset(d, max_seq_len=seq_length, presplit_sentences=presplit_sentences) if d is not None else None for d in ds]
+            dstype = bert_sentencepair_dataset
+            ds = [dstype(d, max_seq_len=seq_length, presplit_sentences=presplit_sentences) if d is not None else None for d in ds]
         elif ds_type.lower() == 'gpt2':
             ds = [GPT2Dataset(d, max_seq_len=seq_length) if d is not None else None for d in ds]
     else:
-        if ds_type.lower() == 'bert':
+        if 'bert' in ds_type.lower():
             presplit_sentences = kwargs['presplit_sentences'] if 'presplit_sentences' in kwargs else False
-            ds = bert_sentencepair_dataset(ds, max_seq_len=seq_length, presplit_sentences=presplit_sentences)
+            dstype = bert_sentencepair_dataset
+            ds = dstype(ds, max_seq_len=seq_length, presplit_sentences=presplit_sentences)
         elif ds_type.lower() == 'gpt2':
             ds = GPT2Dataset(ds, max_seq_len=seq_length)
     return ds, tokenizer
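The all_reduce added to `make_dataset` above is a makeshift barrier: with the NCCL backend, `torch.distributed.barrier()` assumes `device_index == rank`, which does not hold under model parallelism, so every rank instead contributes 1 to a sum that must equal the group's world size once all ranks have arrived. A condensed sketch of the pattern, where `cache_exists` and `build_cache` are hypothetical stand-ins for `exists_lazy` and the `get_dataset`/`make_lazy` preprocessing:

```python
import torch

def build_cache_once(cache_exists, build_cache, parallel_group=None):
    """Let rank 0 materialize a shared on-disk cache, then sync all ranks."""
    if torch.distributed.get_rank() == 0 and not cache_exists():
        build_cache()  # only rank 0 writes the cache
    # Poor man's barrier: the reduced value equals the group's world size
    # only once every rank has reached this point.
    counts = torch.cuda.LongTensor([1])
    torch.distributed.all_reduce(counts, group=parallel_group)
    assert counts[0].item() == torch.distributed.get_world_size(
        group=parallel_group)
```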
data_utils/datasets.py  (mode 100644 → 100755, +18 −5)

@@ -461,6 +461,7 @@ class GPT2Dataset(data.Dataset):
                  weighted=True, sample_across_doc=True,
                  random_across_doc_sampling=True,
+                 bias_for_single_doc=False,
                  sentence_start=False, **kwargs):
         self.ds = ds
         self.ds_len = len(self.ds)

@@ -473,6 +474,7 @@ class GPT2Dataset(data.Dataset):
         self.weighted = weighted
         self.sample_across_doc = sample_across_doc
         self.random_across_doc_sampling = random_across_doc_sampling
+        self.bias_for_single_doc = bias_for_single_doc
         self.sentence_start = sentence_start
         self.init_weighting()

@@ -510,7 +512,10 @@ class GPT2Dataset(data.Dataset):
         # truncate or pad tokens
         num_tokens = len(tokens)
-        tokens_to_strip = num_tokens - self.max_seq_len - 1
+        if self.bias_for_single_doc:
+            tokens_to_strip = num_tokens - self.max_seq_len - 1
+        else:
+            tokens_to_strip = num_tokens - 1
         if tokens_to_strip > 0:
             strip_left_tokens = rng.randint(tokens_to_strip + 1)
             tokens = tokens[strip_left_tokens:]

@@ -758,7 +763,8 @@ class bert_sentencepair_dataset(data.Dataset):
         """
         tokens_a, token_types_a = a
         tokens_b, token_types_b = b
-        max_num_tokens = max_seq_len - 3
+        max_num_tokens = self.calc_seq_len(max_seq_len)
+        # max_num_tokens = max_seq_len - 3
         while True:
             len_a = len(tokens_a)
             len_b = len(tokens_b)

@@ -782,6 +788,9 @@ class bert_sentencepair_dataset(data.Dataset):
             trunc_types.pop()
         return (tokens_a, token_types_a), (tokens_b, token_types_b)

+    def calc_seq_len(self, max_seq_len):
+        return max_seq_len - 3
+
     def mask_token(self, idx, tokens, types, vocab_words, rng):
         """
         helper function to mask `idx` token from `tokens` according to

@@ -807,6 +816,11 @@ class bert_sentencepair_dataset(data.Dataset):
         seq += [self.tokenizer.get_command('pad').Id] * num_pad
         return seq, pad_mask

+    def concat_tokens(self, tokens_a, token_types_a, tokens_b, token_types_b):
+        tokens = [self.tokenizer.get_command('ENC').Id] + tokens_a + [self.tokenizer.get_command('sep').Id] + tokens_b + [self.tokenizer.get_command('sep').Id]
+        token_types = [token_types_a[0]] + token_types_a + [token_types_a[0]] + token_types_b + [token_types_b[0]]
+        return tokens, token_types
+
     def create_masked_lm_predictions(self, a, b, mask_lm_prob, max_preds_per_seq, vocab_words, rng):
         """
         Mask sequence pair for BERT training according to:
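The new `concat_tokens` helper packs a sentence pair into the standard BERT single-sequence layout, `[ENC] a [sep] b [sep]`, which is also why `calc_seq_len` reserves three positions (`max_seq_len - 3`). A toy walk-through with made-up command ids (the real ids come from `self.tokenizer.get_command`):

```python
# Hypothetical command ids: ENC = 101, sep = 102.
tokens_a, token_types_a = [7, 8], [0, 0]
tokens_b, token_types_b = [9], [1]

tokens = [101] + tokens_a + [102] + tokens_b + [102]
token_types = ([token_types_a[0]] + token_types_a + [token_types_a[0]]
               + token_types_b + [token_types_b[0]])

assert tokens == [101, 7, 8, 102, 9, 102]
assert token_types == [0, 0, 0, 0, 1, 1]
```

The final hunk below rewires `create_masked_lm_predictions` to call this helper instead of building the sequence inline.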
@@ -814,8 +828,7 @@ class bert_sentencepair_dataset(data.Dataset):
         """
         tokens_a, token_types_a = a
         tokens_b, token_types_b = b
-        tokens = [self.tokenizer.get_command('ENC').Id] + tokens_a + [self.tokenizer.get_command('sep').Id] + tokens_b + [self.tokenizer.get_command('sep').Id]
-        token_types = [token_types_a[0]] + token_types_a + [token_types_a[0]] + token_types_b + [token_types_b[0]]
+        tokens, token_types = self.concat_tokens(tokens_a, token_types_a, tokens_b, token_types_b)
         len_a = len(tokens_a)
         len_b = len(tokens_b)
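Returning to the GPT2Dataset change above: `bias_for_single_doc` controls how much of a document may be randomly stripped from the left before a sample window is cut. With the flag set, only the overhang beyond `max_seq_len` may be stripped, so a single long document tends to fill the whole sample; without it, any start position is allowed and sampling routinely continues across document boundaries (`sample_across_doc`). A hedged mirror of that branch (`rng` is a numpy `RandomState`, whose `randint` upper bound is exclusive):

```python
import numpy as np

def choose_left_strip(num_tokens, max_seq_len, bias_for_single_doc, rng):
    """Pick how many tokens to drop from the left of a document."""
    if bias_for_single_doc:
        # Strip at most the overhang, so >= max_seq_len + 1 tokens remain
        # and one document can fill the whole sample.
        tokens_to_strip = num_tokens - max_seq_len - 1
    else:
        # Any start position is allowed; the remainder may be short.
        tokens_to_strip = num_tokens - 1
    if tokens_to_strip > 0:
        return rng.randint(tokens_to_strip + 1)
    return 0

rng = np.random.RandomState(0)
print(choose_left_strip(2048, 1024, True, rng))   # somewhere in [0, 1023]
print(choose_left_strip(2048, 1024, False, rng))  # somewhere in [0, 2047]
```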