Commit bf3ce751 authored by Mohammad

addressed comments from raul, neel, and jared

parent 8600642e
+3 −4
@@ -76,12 +76,11 @@ class RandomSampler(data.sampler.Sampler):


class DistributedBatchSampler(data.sampler.BatchSampler):
    """
    similar to normal implementation of distributed sampler, except
    """Similar to normal implementation of distributed sampler, except
    implementation is at the batch sampler level, instead of just the
    sampler level. This allows wrapping of arbitrary data samplers
    (sequential, random, WeightedRandomSampler, etc.) with this batch sampler.
    """
    (sequential, random, WeightedRandomSampler, etc.) with this batch
    sampler."""
    def __init__(self, sampler, batch_size, drop_last, rank=-1,
                 world_size=2, wrap_last=False):
        super(DistributedBatchSampler, self).__init__(sampler, batch_size,
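
As a side note on the class whose docstring is reworded above, here is a minimal usage sketch, assuming a plain torch.utils.data sampler and placeholder values for rank and world_size (in practice these would come from torch.distributed); it is not part of this commit.

    # Minimal sketch: wrap an arbitrary sampler with DistributedBatchSampler
    # using the __init__ signature shown above. `dataset` is a placeholder;
    # rank/world_size are hard-coded purely for illustration.
    import torch.utils.data as data

    dataset = list(range(1000))                  # any map-style dataset
    sampler = data.SequentialSampler(dataset)    # could be random/weighted instead
    batch_sampler = DistributedBatchSampler(sampler,
                                            batch_size=8,
                                            drop_last=True,
                                            rank=0,        # this process's rank
                                            world_size=2)  # total number of processes
    loader = data.DataLoader(dataset, batch_sampler=batch_sampler)
    for batch in loader:
        pass  # each rank iterates over its own share of the data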
+47 −46
@@ -141,10 +141,7 @@ def _ensure_var_is_not_initialized(var, name):
    assert var is None, '{} is already initialized.'.format(name)


class Timers:
    """Group of timers."""

    class Timer:
class _Timer:
    """Timer."""

    def __init__(self, name):
@@ -188,12 +185,16 @@ class Timers:
            self.start()
        return elapsed_


class Timers:
    """Group of timers."""

    def __init__(self):
        self.timers = {}

    def __call__(self, name):
        if name not in self.timers:
            self.timers[name] = self.Timer(name)
            self.timers[name] = _Timer(name)
        return self.timers[name]

    def write(self, names, writer, iteration, normalizer=1.0, reset=False):
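
For orientation on the refactor above (nested Timers.Timer promoted to a module-level _Timer), a brief usage sketch of the timer group: the start/stop method names are assumed from the elapsed() body shown above, and model, batch, writer, and iteration are placeholders.

    # Illustrative sketch only: timers are created lazily by name via __call__.
    timers = Timers()

    timers('forward').start()
    output = model(batch)            # placeholder for the work being timed
    timers('forward').stop()         # stop() assumed alongside start()

    # Log accumulated times to a summary writer; signature taken from the
    # write() definition above. `writer` and `iteration` are placeholders.
    timers.write(['forward'], writer, iteration, normalizer=1.0, reset=True)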
+3 −3
@@ -17,8 +17,8 @@

import random
import os
import numpy as np

import numpy as np
import torch

from megatron import get_adlr_autoresume
@@ -31,7 +31,7 @@ from megatron.global_vars import set_global_variables
def initialize_megatron(extra_args_provider=None, args_defaults={}):
    """Set global variables, initialize distributed, and
    set autoresume and random seeds."""
    # Male sure cuda is avaiable.
    # Make sure cuda is available.
    assert torch.cuda.is_available(), 'Megatron requires CUDA.'

    # Parse args, build tokenizer, and set adlr-autoresume,
@@ -45,7 +45,7 @@ def initialize_megatron(extra_args_provider=None, args_defaults={}):
    # Autoresume.
    _init_autoresume()

    # Random seeds for reproducability.
    # Random seeds for reproducibility.
    args = get_args()
    if args.rank == 0:
        print('> setting random seeds to {} ...'.format(args.seed))
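
For reference, a hedged sketch of how initialize_megatron might be invoked given the signature above; the extra flag and the args_defaults key are illustrative assumptions, and the provider is assumed to take and return the argparse parser.

    # Sketch under assumptions, not part of this commit.
    def my_extra_args(parser):
        # hypothetical extra flag layered on top of Megatron's own arguments
        group = parser.add_argument_group(title='my extras')
        group.add_argument('--my-flag', action='store_true',
                           help='illustrative example argument')
        return parser

    initialize_megatron(extra_args_provider=my_extra_args,
                        args_defaults={'seed': 1234})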
+4 −4
@@ -97,7 +97,7 @@ def pretrain(train_val_test_data_provider, model_provider, forward_step_func,
    print_rank_0('training ...')

    iteration = 0
    if args.train_iters > 0:
    if args.do_train and args.train_iters > 0:
        if args.do_train:
            iteration, _ = train(forward_step_func,
                                 model, optimizer, lr_scheduler,
@@ -151,7 +151,7 @@ def get_model(model_provider_func):
        model = LocalDDP(model)
        return model

    print_rank_0('Unknown DDP implementation specified: {}. '
    raise NotImplementedError('Unknown DDP implementation specified: {}. '
                 'Exiting.'.format(args.DDP_impl))
    sys.exit()

@@ -385,8 +385,8 @@ def train(forward_step_func, model, optimizer, lr_scheduler,
                                          report_memory_flag)

        # Autoresume
        if (iteration % args.adlr_autoresume_interval == 0) and \
           args.adlr_autoresume:
        if args.adlr_autoresume and \
           (iteration % args.adlr_autoresume_interval == 0):
            check_adlr_autoresume_termination(iteration, model, optimizer,
                                              lr_scheduler)

+4 −4
@@ -109,13 +109,13 @@ def make_gpt2_dataloaders():
    initial_seed = args.seed

    # Build the datasets.
    def build_dataset_(name):
    def _build_dataset(name):
        return GPT2Dataset(os.path.join(args.data_path, name),
                           args.input_data_sizes_file,
                           args.seq_length, args.seed)
    train_ds = build_dataset_('train')
    valid_ds = build_dataset_('valid')
    test_ds = build_dataset_('test')
    train_ds = _build_dataset('train')
    valid_ds = _build_dataset('valid')
    test_ds = _build_dataset('test')

    # Dataloaders
    train = make_data_loader(train_ds)