megatron/data/samplers.py  +3 −4

@@ -76,12 +76,11 @@ class RandomSampler(data.sampler.Sampler):
 
 class DistributedBatchSampler(data.sampler.BatchSampler):
-    """ similar to normal implementation of distributed sampler, except
+    """Similar to normal implementation of distributed sampler, except
     implementation is at the batch sampler level, instead of just the
     sampler level. This allows wrapping of arbitrary data samplers
-    (sequential, random, WeightedRandomSampler, etc.) with this batch sampler.
-    """
+    (sequential, random, WeightedRandomSampler, etc.) with this batch sampler."""
 
     def __init__(self, sampler, batch_size, drop_last, rank=-1,
                  world_size=2, wrap_last=False):
         super(DistributedBatchSampler, self).__init__(sampler, batch_size,
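The reworded docstring claims that any torch sampler can be wrapped at the batch-sampler level. The following is a minimal usage sketch, not code from the patch: it assumes batch_size is the global batch size and that each rank iterates over its own slice of every batch, as the docstring suggests; the dataset, rank, and world_size values are placeholders.

import torch
from torch.utils.data import DataLoader, RandomSampler, TensorDataset

from megatron.data.samplers import DistributedBatchSampler

# Any sampler works here: SequentialSampler, WeightedRandomSampler, ...
dataset = TensorDataset(torch.arange(1024).unsqueeze(1))
sampler = RandomSampler(dataset)

# rank/world_size stand in for values normally taken from the distributed setup.
batch_sampler = DistributedBatchSampler(sampler,
                                        batch_size=32,
                                        drop_last=True,
                                        rank=0,
                                        world_size=2)

loader = DataLoader(dataset, batch_sampler=batch_sampler)
for batch, in loader:
    print(batch.shape)  # this rank's share of each global batch
    break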
megatron/global_vars.py  +47 −46

@@ -141,10 +141,7 @@ def _ensure_var_is_not_initialized(var, name):
     assert var is None, '{} is already initialized.'.format(name)
 
 
-class Timers:
-    """Group of timers."""
-
-    class Timer:
+class _Timer:
     """Timer."""
 
     def __init__(self, name):

@@ -188,12 +185,16 @@ class Timers:
         self.start()
         return elapsed_
 
+
+class Timers:
+    """Group of timers."""
+
     def __init__(self):
         self.timers = {}
 
     def __call__(self, name):
         if name not in self.timers:
-            self.timers[name] = self.Timer(name)
+            self.timers[name] = _Timer(name)
         return self.timers[name]
 
     def write(self, names, writer, iteration, normalizer=1.0, reset=False):
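This refactor lifts the nested Timer class out of Timers as a module-level _Timer. A small usage sketch follows, with the API (start() and elapsed()) inferred from the hunks above; the direct import is for illustration only, since in the repository the timer group is usually obtained through a getter rather than constructed by hand.

import time

from megatron.global_vars import Timers  # _Timer itself stays module-private

timers = Timers()
timers('forward').start()            # first call creates a _Timer named 'forward'
time.sleep(0.1)                      # stand-in for real work
print(timers('forward').elapsed())   # later calls return the same _Timer instance

Because call sites only ever go through Timers.__call__, lifting _Timer out of the class changes a single line inside Timers and nothing at the callers.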
megatron/initialize.py  +3 −3

@@ -17,8 +17,8 @@
 import random
 import os
-import numpy as np
 
+import numpy as np
 import torch
 
 from megatron import get_adlr_autoresume

@@ -31,7 +31,7 @@ from megatron.global_vars import set_global_variables
 def initialize_megatron(extra_args_provider=None, args_defaults={}):
     """Set global variables, initialize distributed, and
     set autoresume and random seeds."""
-    # Male sure cuda is avaiable.
+    # Make sure cuda is available.
     assert torch.cuda.is_available(), 'Megatron requires CUDA.'
 
     # Parse args, build tokenizer, and set adlr-autoresume,

@@ -45,7 +45,7 @@ def initialize_megatron(extra_args_provider=None, args_defaults={}):
     # Autoresume.
     _init_autoresume()
 
-    # Random seeds for reproducability.
+    # Random seeds for reproducibility.
     args = get_args()
     if args.rank == 0:
         print('> setting random seeds to {} ...'.format(args.seed))
megatron/training.py  +4 −4

@@ -97,7 +97,7 @@ def pretrain(train_val_test_data_provider, model_provider, forward_step_func,
     print_rank_0('training ...')
 
     iteration = 0
-    if args.train_iters > 0:
+    if args.do_train and args.train_iters > 0:
         if args.do_train:
             iteration, _ = train(forward_step_func,
                                  model, optimizer, lr_scheduler,

@@ -151,7 +151,7 @@ def get_model(model_provider_func):
         model = LocalDDP(model)
         return model
 
-    print_rank_0('Unknown DDP implementation specified: {}. '
+    raise NotImplementedError('Unknown DDP implementation specified: {}. '
                  'Exiting.'.format(args.DDP_impl))
     sys.exit()

@@ -385,8 +385,8 @@
                               report_memory_flag)
 
         # Autoresume
-        if (iteration % args.adlr_autoresume_interval == 0) and \
-           args.adlr_autoresume:
+        if args.adlr_autoresume and \
+           (iteration % args.adlr_autoresume_interval == 0):
             check_adlr_autoresume_termination(iteration, model, optimizer,
                                               lr_scheduler)
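The get_model hunk replaces a print-and-exit with an exception. Below is a hedged sketch of the resulting dispatch pattern, not the repository's code; wrap_model and its branches are illustrative stand-ins for get_model and the real DDP wrappers.

import torch

def wrap_model(model, ddp_impl):
    """Illustrative stand-in for get_model's DDP dispatch."""
    if ddp_impl == 'torch':
        return model  # the real code wraps with torch's DistributedDataParallel
    if ddp_impl == 'local':
        return model  # the real code wraps with the repository's LocalDDP
    # Unknown values now surface as a traceback instead of print + sys.exit().
    raise NotImplementedError('Unknown DDP implementation specified: {}. '
                              'Exiting.'.format(ddp_impl))

model = wrap_model(torch.nn.Linear(4, 4), 'local')   # fine
# wrap_model(model, 'horovod')                       # raises NotImplementedError

The autoresume hunk in the same file only swaps the operands so the cheap args.adlr_autoresume flag is tested before the modulo, letting the check short-circuit when autoresume is disabled.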
pretrain_gpt2.py  +4 −4

@@ -109,13 +109,13 @@ def make_gpt2_dataloaders():
     initial_seed = args.seed
 
     # Build the datasets.
-    def build_dataset_(name):
+    def _build_dataset(name):
         return GPT2Dataset(os.path.join(args.data_path, name),
                            args.input_data_sizes_file,
                            args.seq_length, args.seed)
-    train_ds = build_dataset_('train')
-    valid_ds = build_dataset_('valid')
-    test_ds = build_dataset_('test')
+    train_ds = _build_dataset('train')
+    valid_ds = _build_dataset('valid')
+    test_ds = _build_dataset('test')
 
     # Dataloaders
     train = make_data_loader(train_ds)
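The rename moves the helper to the leading-underscore convention for private, local functions. A small sketch of the pattern follows; make_split_datasets and the dataset_cls parameter are illustrative, not part of the patch, and the real code builds GPT2Dataset objects directly inside make_gpt2_dataloaders.

import os

def make_split_datasets(dataset_cls, data_path, sizes_file, seq_length, seed):
    """Build the train/valid/test splits through one shared helper."""
    def _build_dataset(name):
        return dataset_cls(os.path.join(data_path, name),
                           sizes_file, seq_length, seed)

    return (_build_dataset('train'),
            _build_dataset('valid'),
            _build_dataset('test'))

Passing GPT2Dataset together with args.data_path, args.input_data_sizes_file, args.seq_length, and args.seed reproduces the three calls shown in the diff.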