evaluate_gpt2.py  +1 −4

@@ -34,10 +34,7 @@ from model import DistributedDataParallel as DDP
 import mpu
 from apex.optimizers import FusedAdam as Adam
 from utils import Timers
-from utils import save_checkpoint
-from utils import save_checkpoint_model_parallel
 from utils import load_checkpoint
-from utils import load_checkpoint_model_parallel
 from utils import report_memory
 from utils import print_params_min_max_norm
 from utils import print_rank_0

@@ -84,7 +81,7 @@ def setup_model(args):
     model = get_model(args)

     if args.load is not None:
-        _ = load_checkpoint_model_parallel(
+        _ = load_checkpoint(
             model, None, None, args)

     return model
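The evaluation script now goes through the single load_checkpoint helper. Its interface is only inferred here from the call site `load_checkpoint(model, None, None, args)`; the sketch below is a hypothetical shape for such a helper, not the actual utils.py implementation, and the parameter names and checkpoint-dict keys are assumptions.

```python
import torch

# Hypothetical sketch inferred from the call site in setup_model(); the real
# utils.load_checkpoint may differ. Evaluation passes optimizer and
# lr_scheduler as None, so their state is simply skipped.
def load_checkpoint(model, optimizer, lr_scheduler, args):
    state = torch.load(args.load, map_location='cpu')  # assumption: args.load is the checkpoint path
    model.load_state_dict(state['model'])
    if optimizer is not None and 'optimizer' in state:
        optimizer.load_state_dict(state['optimizer'])
    if lr_scheduler is not None and 'lr_scheduler' in state:
        lr_scheduler.load_state_dict(state['lr_scheduler'])
    # Return the stored iteration so callers can resume; ignored by evaluation.
    return state.get('iteration', 0)
```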
gpt2_data_loader.py  +13 −1

@@ -60,6 +60,17 @@ def make_gpt2_dataloaders(args):
     valid = make_data_loader_(args.val_data_path)
     test = make_data_loader_(args.test_data_path)

+    args.do_train = False
+    args.do_valid = False
+    args.do_test = False
+    if train is not None:
+        args.do_train = True
+    if valid is not None:
+        args.do_valid = True
+    if test is not None:
+        args.do_test = True
+
     # Tokenizer.
     tokenizer = GPT2Tokenizer.from_pretrained('gpt2', cache_dir=args.cache_dir)
     eod_token = tokenizer.encoder['<|endoftext|>']

@@ -126,7 +137,8 @@ class GPT2Dataset(Dataset):
     def build_dataset_(self, shard_index):
         # Garbage collect so we don't use a lot of memory.
         # Leave the last one in case other threads have not catche up yet.
-        for i in range(shard_index - 1):
+        #for i in range(shard_index - 1):
+        for i in range(shard_index):
             self.shards_data[i] = None
             self.shards_sample_index[i] = None
         # Read the shard.
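The second hunk changes which shards get released before a new shard is read. A small stand-alone sketch of the difference, using a plain list in place of self.shards_data (the names and values below are illustrative only): with the old `range(shard_index - 1)`, the previously built shard stayed resident alongside the one being built; with `range(shard_index)`, every earlier shard is dropped.

```python
shards_data = ['shard0', 'shard1', 'shard2', 'shard3']
shard_index = 3  # the shard about to be (re)built

# Old behaviour: range(shard_index - 1) frees shards 0..1, keeping shard 2.
# New behaviour: range(shard_index) frees shards 0..2 as well.
for i in range(shard_index):
    shards_data[i] = None

print(shards_data)  # [None, None, None, 'shard3']
```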
mpu/transformer.py  +3 −4

@@ -480,10 +480,9 @@ class BertParallelSelfAttention(torch.nn.Module):
         value_layer = self._transpose_for_scores(mixed_value_layer)

         # Raw attention scores. [b, np, s, s]
-        attention_scores = torch.matmul(query_layer,
-                                        key_layer.transpose(-1, -2))
-        attention_scores = attention_scores / math.sqrt(
-            self.hidden_size_per_attention_head)
+        norm_factor = math.sqrt(math.sqrt(self.hidden_size_per_attention_head))
+        attention_scores = torch.matmul(query_layer/norm_factor,
+                                        key_layer.transpose(-1, -2)/norm_factor)

         # Apply the attention mask.
         attention_scores += attention_mask
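The new scaling is algebraically equivalent to the old one: dividing query and key each by d^(1/4) divides their product by d^(1/2). Applying the factor before the matmul keeps the intermediate values smaller, which presumably matters for fp16 training; that motivation is an inference here, not stated in the diff. A quick stand-alone check of the equivalence, with made-up tensor shapes:

```python
import math
import torch

b, heads, s, d = 2, 4, 16, 64  # batch, attention heads, sequence length, head dim
query = torch.randn(b, heads, s, d)
key = torch.randn(b, heads, s, d)

norm_factor = math.sqrt(math.sqrt(d))
# Old: scale the scores after the matmul.
old_scores = torch.matmul(query, key.transpose(-1, -2)) / math.sqrt(d)
# New: scale query and key before the matmul.
new_scores = torch.matmul(query / norm_factor, key.transpose(-1, -2) / norm_factor)

print(torch.allclose(old_scores, new_scores, atol=1e-5))  # True, up to float rounding
```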
pretrain_bert.py  +0 −1

@@ -221,7 +221,6 @@ def forward_step(data_iterator, model, args, timers):
     losses = mpu.vocab_parallel_cross_entropy(
         output.contiguous().float(), lm_labels.contiguous())
     loss_mask = loss_mask.contiguous()
-    loss_mask = loss_mask.view(-1)
     lm_loss = torch.sum(
         losses.view(-1) * loss_mask.view(-1).float()) / loss_mask.sum()
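The deleted line was redundant rather than a behaviour change: the loss expression that follows already flattens the mask inline with .view(-1), and .sum() is shape-independent. A tiny stand-alone example of the masked averaging with made-up numbers:

```python
import torch

losses = torch.tensor([[1.0, 2.0], [3.0, 4.0]])  # per-token LM losses
loss_mask = torch.tensor([[1, 1], [0, 1]])       # 1 = count this token in the loss

# Same expression as in forward_step: mean loss over unmasked tokens only.
lm_loss = torch.sum(losses.view(-1) * loss_mask.view(-1).float()) / loss_mask.sum()
print(lm_loss)  # (1 + 2 + 4) / 3 = 2.3333
```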