megatron/data/albert_dataset.py (+98 −98)

 """TO BE ADDED """

-import os
 import random
 import time
+import os

 import numpy as np
 import torch
 from torch.utils.data import Dataset

-from .dataset_utils import build_training_sample
-#from data.mapping import build_training_samples_mapping
-from . import helpers
+from megatron.data import helpers
+from megatron.data import FullBertTokenizer
+from megatron.data.dataset_utils import build_training_sample
+from megatron.data.indexed_dataset import make_dataset as make_indexed_dataset
+from megatron.utils import print_rank_0


 class AlbertDataset(Dataset):

     def __init__(self, vocab_file, data_prefix, data_impl, skip_warmup,
                  num_epochs, max_num_samples, masked_lm_prob,
                  max_seq_length, short_seq_prob, seed):

         # Params to store.
         self.seed = seed

@@ -32,11 +28,12 @@ class AlbertDataset(Dataset):
         self.tokenizer = FullBertTokenizer(vocab_file, do_lower_case=True)

         # Indexed dataset.
-        self.indexed_dataset = self._get_indexed_dataset(data_prefix, data_impl,
+        self.indexed_dataset = get_indexed_dataset_(data_prefix, data_impl,
                                                          skip_warmup)

         # Build the samples mapping.
-        self.samples_mapping = self._get_samples_mapping(self.indexed_dataset,
+        self.samples_mapping = get_samples_mapping_(self.indexed_dataset,
                                                          data_prefix,
                                                          num_epochs,
                                                          max_num_samples,

@@ -45,12 +42,12 @@ class AlbertDataset(Dataset):
                                                          self.seed)

         # Vocab stuff.
-        self.vocab_id_list = list(tokenizer.inv_vocab.keys())
-        self.vocab_id_to_token_dict = tokenizer.inv_vocab
-        self.cls_id = tokenizer.vocab['[CLS]']
-        self.sep_id = tokenizer.vocab['[SEP]']
-        self.mask_id = tokenizer.vocab['[MASK]']
-        self.pad_id = tokenizer.vocab['[PAD]']
+        self.vocab_id_list = list(self.tokenizer.inv_vocab.keys())
+        self.vocab_id_to_token_dict = self.tokenizer.inv_vocab
+        self.cls_id = self.tokenizer.vocab['[CLS]']
+        self.sep_id = self.tokenizer.vocab['[SEP]']
+        self.mask_id = self.tokenizer.vocab['[MASK]']
+        self.pad_id = self.tokenizer.vocab['[PAD]']
         exit()

@@ -64,6 +61,8 @@ class AlbertDataset(Dataset):
     def __getitem__(self, idx):
+        # Note that this rng state should be python and not numpy since
+        # python randint is inclusive whereas the numpy one is exclusive.
         rng = random.Random(self.seed + idx)
         start_index, end_index, seq_length = self.samples_mapping[idx]
         sample = []

@@ -82,7 +81,7 @@ class AlbertDataset(Dataset):
-    def _get_indexed_dataset(self, data_prefix, data_impl, skip_warmup):
+def get_indexed_dataset_(data_prefix, data_impl, skip_warmup):
     start_time = time.time()
     print_rank_0("> Reading dataset index ...")
     indexed_dataset = make_indexed_dataset(data_prefix,

@@ -93,8 +92,7 @@ class AlbertDataset(Dataset):
     return indexed_dataset

-    def _get_samples_mapping(self, indexed_dataset,
+def get_samples_mapping_(indexed_dataset,
                          data_prefix,
                          num_epochs,
                          max_num_samples,

@@ -274,6 +272,7 @@ def build_training_samples_mapping(indexed_dataset, num_epochs, max_seq_length,
     return samples_np
 '''

+'''
 # WILL BE REPLACED WITH JARED'S
 class JaredDataset(object):

@@ -395,3 +394,4 @@ if __name__ == '__main__':
                            max_seq_length=512,
                            short_seq_prob=0.1,
                            seed=1234)
+'''
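Note on the comment added to __getitem__ (and the matching docstring note in dataset_utils.py below): the per-sample generator is a Python random.Random rather than a NumPy one because the two randint APIs treat the upper bound differently. A minimal illustration of that difference and of the per-sample seeding pattern used here (illustrative only, not part of the diff):

import random
import numpy as np

seed, idx = 1234, 0

# Python's randint includes the upper bound: randint(0, 9) can return 9.
py_rng = random.Random(seed + idx)
assert 0 <= py_rng.randint(0, 9) <= 9

# NumPy's randint excludes it: randint(0, 9) never returns 9.
np_rng = np.random.RandomState(seed + idx)
assert 0 <= np_rng.randint(0, 9) <= 8

# Seeding a fresh Random(seed + idx) per sample, as AlbertDataset.__getitem__
# does, keeps each sample's masking/truncation decisions reproducible and
# independent of iteration order.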
megatron/data/dataset_utils.py (+13 −10)

@@ -24,7 +24,9 @@ def build_training_sample(sample,
         mask_id: Mask token id.
         pad_id: Padding token id.
         masked_lm_prob: Probability to mask tokens.
-        rng: Random number generator.
+        rng: Random number generator. Note that this rng state should be
+             python and not numpy since python randint is inclusive for
+             the upper bound whereas the numpy one is exclusive.
     """

     # We assume that we have at least two sentences in the sample

@@ -36,8 +38,8 @@ def build_training_sample(sample,
     # Truncate to `target_sequence_length`.
     max_num_tokens = target_seq_length
-    truncated = truncate_segments(tokens_a, tokens_b, len(tokens_a), len(tokens_b), max_num_tokens, rng)
+    truncated = truncate_segments(tokens_a, tokens_b, len(tokens_a),
+                                  len(tokens_b), max_num_tokens, rng)

     # Build tokens and tokentypes.
     tokens, tokentypes = create_tokens_and_tokentypes(tokens_a, tokens_b,

@@ -50,17 +52,17 @@ def build_training_sample(sample,
                       cls_id, sep_id, mask_id, max_predictions_per_seq, rng)

     # Padding.
-    tokens_np, tokentypes_np, labels, padding_mask, loss_mask \
+    tokens_np, tokentypes_np, labels_np, padding_mask_np, loss_mask_np \
         = pad_and_convert_to_numpy(tokens, tokentypes, masked_positions,
                                    masked_labels, pad_id, max_seq_length)

     train_sample = {
         'text': tokens_np,
         'types': tokentypes_np,
-        'labels': labels,
+        'labels': labels_np,
         'is_random': int(is_next_random),
-        'loss_mask': loss_mask,
-        'padding_mask': padding_mask,
+        'loss_mask': loss_mask_np,
+        'padding_mask': padding_mask_np,
         'truncated': int(truncated)}
     return train_sample

@@ -357,7 +359,8 @@ def pad_and_convert_to_numpy(tokens, tokentypes, masked_positions,
     tokentypes_np = np.array(tokentypes + filler, dtype=np.int64)

     # Padding mask.
-    padding_mask_np = np.array([1]*num_tokens + [0]*padding_length, dtype=np.int64)
+    padding_mask_np = np.array([1]*num_tokens + [0]*padding_length,
+                               dtype=np.int64)

     # Labels and loss mask.
     labels = [-1] * max_seq_length

@@ -372,8 +375,7 @@ def pad_and_convert_to_numpy(tokens, tokentypes, masked_positions,
     return tokens_np, tokentypes_np, labels_np, padding_mask_np, loss_mask_np

+'''
 if __name__ == '__main__':

@@ -469,3 +471,4 @@ if __name__ == '__main__':
     string += '{:5d}'.format(tokentype)
     string += '{:5d}'.format(padding_mask)
     print(string)
+'''
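The renames above (labels → labels_np, etc.) make it explicit that pad_and_convert_to_numpy returns NumPy arrays. For orientation, a simplified sketch of the padding logic visible in these hunks; the real function also scatters masked_labels into labels and sets the loss mask at the masked positions, which is omitted here:

import numpy as np

def pad_and_convert_sketch(tokens, tokentypes, pad_id, max_seq_length):
    """Simplified sketch of pad_and_convert_to_numpy (masked-label handling omitted)."""
    num_tokens = len(tokens)
    padding_length = max_seq_length - num_tokens
    assert padding_length >= 0

    # Tokens and token types, padded out to max_seq_length with pad_id.
    filler = [pad_id] * padding_length
    tokens_np = np.array(tokens + filler, dtype=np.int64)
    tokentypes_np = np.array(tokentypes + filler, dtype=np.int64)

    # Padding mask: 1 over real tokens, 0 over padding.
    padding_mask_np = np.array([1] * num_tokens + [0] * padding_length,
                               dtype=np.int64)

    # Labels default to -1 (ignored by the LM loss); loss mask defaults to 0.
    labels_np = np.array([-1] * max_seq_length, dtype=np.int64)
    loss_mask_np = np.zeros(max_seq_length, dtype=np.int64)

    return tokens_np, tokentypes_np, labels_np, padding_mask_np, loss_mask_np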
megatron/model/bert_model.py (+1 −1)

@@ -145,7 +145,7 @@ class BertModel(MegatronModule):
             init_method=init_method,
             scaled_init_method=scaled_init_method_normal(init_method_std,
                                                          num_layers),
-            residual_connection_post_layernorm=True)
+            residual_connection_post_layernorm=False)

         self.lm_head = BertLMHead(
             self.language_model.embedding.word_embeddings.weight.size(0),
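For context, residual_connection_post_layernorm controls where each sublayer's skip connection is taken relative to its LayerNorm: with the old value (True) the residual starts from the LayerNorm output, with the new value (False) it starts from the raw sublayer input. A rough, self-contained sketch of the two orderings; the names are illustrative and this is not the actual Megatron transformer code:

import torch
import torch.nn as nn

class SublayerSketch(nn.Module):
    """Illustrative sublayer wrapper (not the actual Megatron code)."""

    def __init__(self, hidden_size, sublayer, post_layernorm_residual):
        super().__init__()
        self.layernorm = nn.LayerNorm(hidden_size)
        self.sublayer = sublayer  # e.g. self-attention or the MLP block
        self.post_layernorm_residual = post_layernorm_residual

    def forward(self, x):
        ln_out = self.layernorm(x)
        out = self.sublayer(ln_out)
        if self.post_layernorm_residual:
            # Old setting (True): residual taken from the LayerNorm output.
            return ln_out + out
        # New setting (False): residual taken from the raw sublayer input.
        return x + out

# Example: wrap a simple MLP sublayer and compare the two orderings.
mlp = nn.Sequential(nn.Linear(16, 64), nn.GELU(), nn.Linear(64, 16))
x = torch.randn(2, 8, 16)
y_post = SublayerSketch(16, mlp, post_layernorm_residual=True)(x)
y_pre = SublayerSketch(16, mlp, post_layernorm_residual=False)(x)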
pretrain_albert.py (+2 −2)

@@ -73,7 +73,7 @@ def get_batch(data_iterator, timers):
     sentence_order = data_b['is_random'].long()
     loss_mask = data_b['loss_mask'].float()
     lm_labels = data_b['labels'].long()
-    padding_mask = data_b['padding_mask'].byte()
+    padding_mask = data_b['padding_mask'].long()

     return tokens, types, sentence_order, loss_mask, lm_labels, padding_mask

@@ -88,7 +88,7 @@ def forward_step(data_iterator, model, args, timers):
     timers('batch generator').stop()

     # Forward model.
-    lm_logits, sop_logits = model(tokens, 1-padding_mask, tokentype_ids=types)
+    lm_logits, sop_logits = model(tokens, padding_mask, tokentype_ids=types)

     sop_loss = F.cross_entropy(sop_logits.view(-1, 2).contiguous().float(),
                                sentence_order.view(-1).contiguous(),
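Since pad_and_convert_to_numpy produces a mask that is 1 over real tokens and 0 over padding, forward_step no longer inverts it (1-padding_mask → padding_mask), and get_batch casts it to long instead of byte. A small illustration of the convention change; how BertModel consumes the mask internally is not shown here:

import torch

# padding_mask as produced by the dataset: 1 over real tokens, 0 over padding
# (batch of one 6-token sequence padded to length 8).
padding_mask = torch.tensor([[1, 1, 1, 1, 1, 1, 0, 0]], dtype=torch.long)

# Old forward_step: the mask was flipped before the model call, so the model
# received 1 over *padding* positions.
old_model_input = 1 - padding_mask

# New forward_step: the model receives padding_mask as-is, i.e. 1 marks the
# positions that may be attended to.
new_model_input = padding_mask

# One common way such a mask becomes an additive attention bias
# (illustrative only; not necessarily what BertModel does internally):
attention_bias = (1 - new_model_input)[:, None, None, :].float() * -10000.0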
pretrain_bert.py (+1 −1)

@@ -72,7 +72,7 @@ def get_batch(data_iterator, timers):
     next_sentence = data_b['is_random'].long()
     loss_mask = data_b['mask'].float()
     lm_labels = data_b['mask_labels'].long()
-    padding_mask = data_b['pad_mask'].byte()
+    padding_mask = data_b['pad_mask'].long()

     return tokens, types, next_sentence, loss_mask, lm_labels, padding_mask