Commit a0368ddf authored by Raul Puri

eval+numeric update

parent abe36e2e
+1 −4
@@ -34,10 +34,7 @@ from model import DistributedDataParallel as DDP
 import mpu
 from apex.optimizers import FusedAdam as Adam
 from utils import Timers
-from utils import save_checkpoint
-from utils import save_checkpoint_model_parallel
 from utils import load_checkpoint
-from utils import load_checkpoint_model_parallel
 from utils import report_memory
 from utils import print_params_min_max_norm
 from utils import print_rank_0
@@ -84,7 +81,7 @@ def setup_model(args):
     model = get_model(args)

     if args.load is not None:
-        _ = load_checkpoint_model_parallel(
+        _ = load_checkpoint(
             model, None, None, args)

     return model
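The eval path now goes through the single load_checkpoint entry point rather than a separate *_model_parallel variant, and the save_checkpoint imports are dropped because evaluation never writes checkpoints. A minimal sketch of how one unified loader can serve both cases by resolving a per-model-parallel-rank checkpoint path; the directory layout and helper bodies below are assumptions for illustration, not the repository's actual utils.load_checkpoint.

import os
import torch
import mpu  # Megatron's model-parallel utilities (assumed importable here)

def load_checkpoint(model, optimizer, lr_scheduler, args):
    # Hypothetical sketch: each model-parallel rank reads its own shard.
    rank = mpu.get_model_parallel_rank()
    path = os.path.join(args.load, 'mp_rank_{:02d}'.format(rank),
                        'model_optim_rng.pt')  # assumed checkpoint layout
    state = torch.load(path, map_location='cpu')
    model.load_state_dict(state['model'])
    # Evaluation passes None for optimizer and scheduler, so guard both.
    if optimizer is not None:
        optimizer.load_state_dict(state['optimizer'])
    if lr_scheduler is not None:
        lr_scheduler.load_state_dict(state['lr_scheduler'])
    return state.get('iteration', 0)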
+13 −1
@@ -60,6 +60,17 @@ def make_gpt2_dataloaders(args):
     valid = make_data_loader_(args.val_data_path)
     test = make_data_loader_(args.test_data_path)

+    args.do_train = False
+    args.do_valid = False
+    args.do_test = False
+
+    if train is not None:
+        args.do_train = True
+    if valid is not None:
+        args.do_valid = True
+    if test is not None:
+        args.do_test = True
+
     # Tokenizer.
     tokenizer = GPT2Tokenizer.from_pretrained('gpt2', cache_dir=args.cache_dir)
     eod_token = tokenizer.encoder['<|endoftext|>']
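The new flags record which splits actually produced a dataloader, so downstream code can skip missing splits. The same logic compresses to three lines; an equivalent form, not part of this commit:

    args.do_train = train is not None
    args.do_valid = valid is not None
    args.do_test = test is not None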
@@ -126,7 +137,8 @@ class GPT2Dataset(Dataset):
     def build_dataset_(self, shard_index):
         # Garbage collect so we don't use a lot of memory.
         # Leave the last one in case other threads have not caught up yet.
-        for i in range(shard_index - 1):
+        #for i in range(shard_index - 1):
+        for i in range(shard_index):
             self.shards_data[i] = None
             self.shards_sample_index[i] = None
         # Read the shard.
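With range(shard_index) the loop now frees every shard before the current one, whereas the commented-out range(shard_index - 1) kept the immediately preceding shard alive for slower threads, so the "leave the last one" caveat in the comment applies only to the old loop. A standalone sketch of this sliding-window pattern, with hypothetical names and a simplified loader:

import numpy as np

class ShardedTokens:
    # Illustrative only; structure and names are not the repository's code.
    def __init__(self, shard_paths):
        self.shard_paths = shard_paths
        self.shards_data = [None] * len(shard_paths)

    def build_shard(self, shard_index):
        # Drop all earlier shards so resident memory stays near one shard.
        for i in range(shard_index):
            self.shards_data[i] = None
        # Lazily load the requested shard on first access.
        if self.shards_data[shard_index] is None:
            self.shards_data[shard_index] = np.load(self.shard_paths[shard_index])
        return self.shards_data[shard_index]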
+3 −4
@@ -480,10 +480,9 @@ class BertParallelSelfAttention(torch.nn.Module):
         value_layer = self._transpose_for_scores(mixed_value_layer)

         # Raw attention scores. [b, np, s, s]
-        attention_scores = torch.matmul(query_layer,
-                                        key_layer.transpose(-1, -2))
-        attention_scores = attention_scores / math.sqrt(
-            self.hidden_size_per_attention_head)
+        norm_factor = math.sqrt(math.sqrt(self.hidden_size_per_attention_head))
+        attention_scores = torch.matmul(query_layer/norm_factor,
+                                        key_layer.transpose(-1, -2)/norm_factor)
         # Apply the attention mask.
         attention_scores += attention_mask
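This is the "numeric" half of the commit. Since QK^T / sqrt(d) = (Q / d^{1/4})(K / d^{1/4})^T, scaling query and key before the matmul is mathematically equivalent to scaling the scores afterwards, but it keeps the intermediate products smaller, which helps avoid overflow in fp16, where the accumulated dot products can approach the half-precision range. A quick self-contained equivalence check (the head size is a made-up example value):

import math
import torch

d = 64                                    # hidden size per attention head (example)
q = torch.randn(2, 4, 16, d)              # [batch, heads, seq, head_dim]
k = torch.randn(2, 4, 16, d)

# Old formulation: scale the raw scores after the matmul.
after = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(d)

# New formulation: divide q and k by d**0.25 each before the matmul.
norm_factor = math.sqrt(math.sqrt(d))
before = torch.matmul(q / norm_factor, k.transpose(-1, -2) / norm_factor)

print(torch.allclose(after, before, atol=1e-5))  # True, up to rounding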

+0 −1
@@ -221,7 +221,6 @@ def forward_step(data_iterator, model, args, timers):
     losses = mpu.vocab_parallel_cross_entropy(
         output.contiguous().float(), lm_labels.contiguous())
     loss_mask = loss_mask.contiguous()
-    loss_mask = loss_mask.view(-1)
     lm_loss = torch.sum(
         losses.view(-1) * loss_mask.view(-1).float()) / loss_mask.sum()
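The deleted line was redundant: the loss expression already flattens the mask inline, and loss_mask.sum() gives the same count whether the mask is flat or 2-D. The surviving expression is a masked mean, so padded positions neither contribute loss nor dilute the average. A tiny worked example with invented values:

import torch

losses = torch.tensor([2.0, 4.0, 6.0, 8.0])      # per-token cross-entropy (made up)
loss_mask = torch.tensor([1.0, 1.0, 0.0, 1.0])   # 0 marks padded/ignored positions

lm_loss = torch.sum(losses * loss_mask) / loss_mask.sum()
print(lm_loss)  # (2 + 4 + 8) / 3 = tensor(4.6667)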