megatron/data/dataset.py  +72 −57

@@ -7,18 +7,55 @@
 import numpy as np
 import torch
 from torch.utils.data import Dataset
 
 from dataset_utils import build_training_sample
 
 
-# WILL BE REPLACED WITH JARED'S
-class JaredDataset(object):
-
-    def __init__(self, doc_idx, sizes, sentences):
-        self.doc_idx = doc_idx
-        self.num_docs = len(self.doc_idx) - 1
-        self.sizes = sizes
-        self.sentences = sentences
+class AlbertDataSet(Dataset):
+
+    def __init__(self, indexed_dataset, tokenizer, num_epochs,
+                 masked_lm_prob, max_seq_length, short_seq_prob, seed):
+        # Params to store.
+        self.seed = seed
+        self.masked_lm_prob = masked_lm_prob
+        self.max_seq_length = max_seq_length
+        # Indexed dataset.
+        self.indexed_dataset = indexed_dataset
+        # Build the samples mapping.
+        self.samples_mapping = build_training_samples_mapping(
+            indexed_dataset, num_epochs, self.max_seq_length,
+            short_seq_prob, self.seed)
+        # Vocab stuff.
+        self.vocab_id_list = list(tokenizer.inv_vocab.keys())
+        self.vocab_id_to_token_dict = tokenizer.inv_vocab
+        self.cls_id = tokenizer.vocab['[CLS]']
+        self.sep_id = tokenizer.vocab['[SEP]']
+        self.mask_id = tokenizer.vocab['[MASK]']
+        self.pad_id = tokenizer.vocab['[PAD]']
+
+    def __len__(self):
+        return self.samples.shape[0]
 
     def __getitem__(self, idx):
-        return self.sentences[idx]
+        rng = random.Random(self.seed + idx)
+        start_index, end_index, seq_length = self.samples_mapping[idx]
+        sample = []
+        for index in range(start_index, end_index):
+            sample.append(self.indexed_dataset[index])
+        return build_training_sample(sample, seq_length, self.max_seq_length,
+                                     self.vocab_id_list,
+                                     self.vocab_id_to_token_dict,
+                                     self.cls_id, self.sep_id,
+                                     self.mask_id, self.pad_id,
+                                     self.masked_lm_prob, rng)
 
 
 def get_target_seq_length(max_num_tokens, short_seq_prob, np_rng):

@@ -87,6 +124,7 @@ def build_training_samples_mapping(indexed_dataset, num_epochs, max_seq_length,
         while sent_index < sent_index_last:
             # Get the size.
+            assert indexed_dataset.sizes[sent_index] > 0
             size += indexed_dataset.sizes[sent_index]
             sent_index += 1

@@ -133,51 +171,17 @@ def build_training_samples_mapping(indexed_dataset, num_epochs, max_seq_length,
 
     return samples_np
 
 
-class AlbertDataSet(Dataset):
-
-    def __init__(self, indexed_dataset, tokenizer, num_epochs,
-                 masked_lm_prob, max_seq_length, short_seq_prob, seed):
-        # Params to store.
-        self.seed = seed
-        self.masked_lm_prob = masked_lm_prob
-        self.max_seq_length = max_seq_length
-        # Indexed dataset.
-        self.indexed_dataset = indexed_dataset
-        # Build the samples mapping.
-        self.samples_mapping = build_training_samples_mapping(
-            indexed_dataset, num_epochs, self.max_seq_length,
-            short_seq_prob, self.seed)
-        # Vocab stuff.
-        self.vocab_id_list = list(tokenizer.inv_vocab.keys())
-        self.vocab_id_to_token_dict = tokenizer.inv_vocab
-        self.cls_id = tokenizer.vocab['[CLS]']
-        self.sep_id = tokenizer.vocab['[SEP]']
-        self.mask_id = tokenizer.vocab['[MASK]']
-        self.pad_id = tokenizer.vocab['[PAD]']
+# WILL BE REPLACED WITH JARED'S
+class JaredDataset(object):
 
-    def __len__(self):
-        return self.samples.shape[0]
+    def __init__(self, doc_idx, sizes, sentences):
+        self.doc_idx = doc_idx
+        self.num_docs = len(self.doc_idx) - 1
+        self.sizes = sizes
+        self.sentences = sentences
 
     def __getitem__(self, idx):
-        rng = random.Random(self.seed + idx)
-        start_index, end_index = self.samples_mapping[idx]
-        sample = []
-        for index in range(start_index, end_index):
-            sample.append(self.indexed_dataset[index])
-        return build_training_sample(sample, self.vocab_id_list,
-                                     self.vocab_id_to_token_dict,
-                                     self.cls_id, self.sep_id,
-                                     self.mask_id, self.pad_id,
-                                     self.masked_lm_prob, self.max_seq_length,
-                                     rng)
+        return self.sentences[idx]

@@ -198,10 +202,12 @@ if __name__ == '__main__':
             sentences = []
             for line in text.split('\n'):
                 if line != '\n':
-                    sentences.extend(nltk.tokenize.sent_tokenize(line))
+                    sent = nltk.tokenize.sent_tokenize(line)
+                    if sent:
+                        sentences.extend(sent)
             yield sentences
 
-    input_file = '/raid/mshoeybi/data/albert/sample/samples_11.json'
+    input_file = '/raid/mshoeybi/data/albert/sample/samples_1000.json'
     vocab_file = '/raid/mshoeybi/data/albert/bert_vocab/vocab.txt'
     tokenizer = FullTokenizer(vocab_file, do_lower_case=True)

@@ -212,19 +218,28 @@ if __name__ == '__main__':
     sentences_list = []
     for sentences in document_generator:
-        doc_idx.append(len(sentences))
+        num_sent = 0
         for sentence in sentences:
             tokens = tokenizer.tokenize(sentence)
+            if tokens:
                 ids = tokenizer.convert_tokens_to_ids(tokens)
+                if len(ids) == 0:
+                    print('****************')
+                    print(sentence)
+                    print(tokens)
+                    print(ids)
+                    print('****************')
                 sizes.append(len(ids))
                 sentences_list.append(ids)
+                num_sent += 1
+        doc_idx.append(num_sent)
     for i in range(1, len(doc_idx)):
         doc_idx[i] += doc_idx[i-1]
 
     indexed_dataset = JaredDataset(doc_idx, sizes, sentences_list)
 
     dataset = AlbertDataSet(indexed_dataset=indexed_dataset,
                             tokenizer=tokenizer,
-                            num_epochs=3,
+                            num_epochs=10,
                             masked_lm_prob=0.15,
                             max_seq_length=512,
                             short_seq_prob=0.1,


megatron/data/dataset_utils.py  +13 −7

@@ -5,13 +5,18 @@
 import collections
 
 import numpy as np
 
 
-def build_training_sample(sample, vocab_id_list, vocab_id_to_token_dict,
+def build_training_sample(sample, target_seq_length, max_seq_length,
+                          vocab_id_list, vocab_id_to_token_dict,
                           cls_id, sep_id, mask_id, pad_id,
-                          masked_lm_prob, max_seq_length, rng):
+                          masked_lm_prob, rng):
     """Build training sample.
 
     Arguments:
        sample: A list of sentences in which each sentence is a list of token ids.
+       target_seq_length: Desired sequence length.
+       max_seq_length: Maximum length of the sequence. All values are padded
+                       to this length.
        vocab_id_list: List of vocabulary ids. Used to pick a random id.
        vocab_id_to_token_dict: A dictionary from vocab ids to text tokens.
        cls_id: Start of example id.

@@ -19,20 +24,19 @@ def build_training_sample(sample, vocab_id_list, vocab_id_to_token_dict,
        mask_id: Mask token id.
        pad_id: Padding token id.
        masked_lm_prob: Probability to mask tokens.
-       max_seq_length: Maximum length of the sequence. All values are padded
-                       to this length.
        rng: Random number generator.
""" # We assume that we have at least two sentences in the sample assert len(sample) > 1 assert target_seq_length <= max_seq_length # Divide sample into two segments (A and B). tokens_a, tokens_b, is_next_random = get_a_and_b_segments(sample, rng) # Truncate to `max_sequence_length`. # Truncate to `target_sequence_length`. # Note that we have account for [CLS] A [SEP] B [SEP] max_num_tokens = max_seq_length - 3 max_num_tokens = target_seq_length - 3 truncate_segments(tokens_a, tokens_b, len(tokens_a), len(tokens_b), max_num_tokens, rng) Loading Loading @@ -421,11 +425,13 @@ if __name__ == '__main__': for s in samples[0]: sample.append(tokenizer.convert_tokens_to_ids(s)) max_seq_length = 512 target_seq_length = 444 masked_lm_prob = 0.15 example = build_training_sample(sample, target_seq_length, max_seq_length, vocab_id_list, vocab_id_to_token_dict, cls_id, sep_id, mask_id, pad_id, masked_lm_prob, max_seq_length, rng) masked_lm_prob, rng) orig_tokens = [] for s in samples[0]: Loading Loading
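
The other substantive change is that `build_training_sample` now takes a per-sample `target_seq_length` alongside the global `max_seq_length`: the target sets the token budget for "[CLS] A [SEP] B [SEP]", while the maximum only sets the padded output size. The body of `get_target_seq_length` is not visible in these hunks, so the sketch below is an assumed BERT-style stand-in (occasionally drawing a shorter target), shown only to illustrate how the two lengths relate; it is not code from the diff.

import numpy as np

def get_target_seq_length(max_num_tokens, short_seq_prob, np_rng):
    # Assumed implementation: with probability `short_seq_prob`, train on a
    # shorter sequence so the model also sees inputs well below the padded
    # length; otherwise use the full token budget.
    if np_rng.rand() < short_seq_prob:
        return np_rng.randint(2, max_num_tokens + 1)
    return max_num_tokens

np_rng = np.random.RandomState(1234)
max_seq_length = 512

# Per sample, the mapping stores a target length; build_training_sample then
# budgets target_seq_length - 3 tokens for "[CLS] A [SEP] B [SEP]" and pads
# the emitted arrays out to max_seq_length.
target_seq_length = get_target_seq_length(max_seq_length, 0.1, np_rng)
assert target_seq_length <= max_seq_length   # the new assert in build_training_sample
max_num_tokens = target_seq_length - 3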