megatron/data/albert_dataset.py  (+4 −4)

@@ -137,9 +137,6 @@ class AlbertDataset(Dataset):
     def __getitem__(self, idx):

-        # Note that this rng state should be python and not numpy since
-        # python randint is inclusive whereas the numpy one is exclusive.
-        rng = random.Random(self.seed + idx)
         start_index, end_index, seq_length = self.samples_mapping[idx]
         sample = []
         for index in range(start_index, end_index):
@@ -149,13 +146,16 @@ class AlbertDataset(Dataset):
             if len(s) > 1000:
                 print(self.tokenizer.convert_ids_to_tokens(s))
             '''
+        # Note that this rng state should be numpy and not python since
+        # python randint is inclusive whereas the numpy one is exclusive.
+        np_rng = np.random.RandomState(seed=(self.seed + idx))
         return build_training_sample(sample, seq_length,
                                      self.max_seq_length,  # needed for padding
                                      self.vocab_id_list,
                                      self.vocab_id_to_token_dict,
                                      self.cls_id, self.sep_id,
                                      self.mask_id, self.pad_id,
-                                     self.masked_lm_prob, rng)
+                                     self.masked_lm_prob, np_rng)


 def get_indexed_dataset_(data_prefix, data_impl, skip_warmup):


megatron/data/dataset_utils.py  (+22 −22)

@@ -9,7 +9,7 @@
 def build_training_sample(sample,
                           target_seq_length, max_seq_length,
                           vocab_id_list, vocab_id_to_token_dict,
                           cls_id, sep_id, mask_id, pad_id,
-                          masked_lm_prob, rng):
+                          masked_lm_prob, np_rng):
     """Biuld training sample.

     Arguments:
@@ -24,8 +24,8 @@ def build_training_sample(sample,
         mask_id: Mask token id.
         pad_id: Padding token id.
         masked_lm_prob: Probability to mask tokens.
-        rng: Random number genenrator. Note that this rng state should be
-             python and not numpy since python randint is inclusive for
+        np_rng: Random number genenrator. Note that this rng state should be
+             numpy and not python since python randint is inclusive for
             the opper bound whereas the numpy one is exclusive.
     """
@@ -34,12 +34,12 @@ def build_training_sample(sample,
     assert target_seq_length <= max_seq_length

     # Divide sample into two segments (A and B).
-    tokens_a, tokens_b, is_next_random = get_a_and_b_segments(sample, rng)
+    tokens_a, tokens_b, is_next_random = get_a_and_b_segments(sample, np_rng)

     # Truncate to `target_sequence_length`.
     max_num_tokens = target_seq_length
     truncated = truncate_segments(tokens_a, tokens_b, len(tokens_a),
-                                  len(tokens_b), max_num_tokens, rng)
+                                  len(tokens_b), max_num_tokens, np_rng)

     # Build tokens and toketypes.
     tokens, tokentypes = create_tokens_and_tokentypes(tokens_a, tokens_b,
@@ -49,7 +49,7 @@ def build_training_sample(sample,
     max_predictions_per_seq = masked_lm_prob * max_num_tokens
     (tokens, masked_positions, masked_labels, _) = create_masked_lm_predictions(
         tokens, vocab_id_list, vocab_id_to_token_dict, masked_lm_prob,
-        cls_id, sep_id, mask_id, max_predictions_per_seq, rng)
+        cls_id, sep_id, mask_id, max_predictions_per_seq, np_rng)

     # Padding.
     tokens_np, tokentypes_np, labels_np, padding_mask_np, loss_mask_np \
@@ -67,7 +67,7 @@ def build_training_sample(sample,
     return train_sample


-def get_a_and_b_segments(sample, rng):
+def get_a_and_b_segments(sample, np_rng):
     """Divide sample into a and b segments."""

     # Number of sentences in the sample.
@@ -79,8 +79,8 @@ def get_a_and_b_segments(sample, rng):
     # `a_end` is how many sentences go into the `A`.
     a_end = 1
     if n_sentences >= 3:
-        # Note that randin in python is inclusive.
-        a_end = rng.randint(1, n_sentences - 1)
+        # Note that randin in numpy is exclusive.
+        a_end = np_rng.randint(1, n_sentences)
     tokens_a = []
     for j in range(a_end):
         tokens_a.extend(sample[j])
@@ -92,14 +92,14 @@ def get_a_and_b_segments(sample, rng):
     # Random next:
     is_next_random = False
-    if rng.random() < 0.5:
+    if np_rng.random() < 0.5:
         is_next_random = True
         tokens_a, tokens_b = tokens_b, tokens_a

     return tokens_a, tokens_b, is_next_random


-def truncate_segments(tokens_a, tokens_b, len_a, len_b, max_num_tokens, rng):
+def truncate_segments(tokens_a, tokens_b, len_a, len_b, max_num_tokens, np_rng):
     """Truncates a pair of sequences to a maximum sequence length."""
     #print(len_a, len_b, max_num_tokens)
     assert len_a > 0
@@ -113,7 +113,7 @@ def truncate_segments(tokens_a, tokens_b, len_a, len_b, max_num_tokens, rng):
         else:
             len_b -= 1
             tokens = tokens_b
-        if rng.random() < 0.5:
+        if np_rng.random() < 0.5:
             del tokens[0]
         else:
             tokens.pop()
@@ -163,7 +163,7 @@ def create_masked_lm_predictions(tokens,
                                  masked_lm_prob,
                                  cls_id, sep_id, mask_id,
                                  max_predictions_per_seq,
-                                 rng,
+                                 np_rng,
                                  max_ngrams=3,
                                  do_whole_word_mask=True,
                                  favor_longer_ngram=False,
@@ -223,7 +223,7 @@ def create_masked_lm_predictions(tokens,
             ngram_index.append(cand_indexes[idx:idx+n])
         ngram_indexes.append(ngram_index)

-    rng.shuffle(ngram_indexes)
+    np_rng.shuffle(ngram_indexes)

     masked_lms = []
     covered_indexes = set()
@@ -239,7 +239,7 @@ def create_masked_lm_predictions(tokens,
             if index in covered_indexes:
                 continue

-        n = np.random.choice(ngrams[:len(cand_index_set)],
+        n = np_rng.choice(ngrams[:len(cand_index_set)],
                              p=pvals[:len(cand_index_set)] /
                              pvals[:len(cand_index_set)].sum(keepdims=True))
         index_set = sum(cand_index_set[n - 1], [])
@@ -268,22 +268,22 @@ def create_masked_lm_predictions(tokens,
             masked_token = None
             # 80% of the time, replace with [MASK]
-            if rng.random() < 0.8:
+            if np_rng.random() < 0.8:
                 masked_token = mask_id
             else:
                 # 10% of the time, keep original
-                if rng.random() < 0.5:
+                if np_rng.random() < 0.5:
                     masked_token = tokens[index]
                 # 10% of the time, replace with random word
                 else:
-                    masked_token = vocab_id_list[rng.randint(0, len(vocab_id_list) - 1)]
+                    masked_token = vocab_id_list[np_rng.randint(0, len(vocab_id_list))]

             output_tokens[index] = masked_token
             masked_lms.append(MaskedLmInstance(index=index, label=tokens[index]))
     assert len(masked_lms) <= num_to_predict

-    rng.shuffle(ngram_indexes)
+    np_rng.shuffle(ngram_indexes)

     select_indexes = set()
     if do_permutation:
@@ -327,7 +327,7 @@ def create_masked_lm_predictions(tokens,
         select_indexes = sorted(select_indexes)
         permute_indexes = list(select_indexes)
-        rng.shuffle(permute_indexes)
+        np_rng.shuffle(permute_indexes)
         orig_token = list(output_tokens)

         for src_i, tgt_i in zip(select_indexes, permute_indexes):
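
The comments carried through both files hinge on one semantic difference between the two RNG APIs: Python's `random.Random.randint(a, b)` includes the upper bound, while NumPy's `RandomState.randint(low, high)` excludes it, which is why every `randint` call site drops its `- 1` when it switches to `np_rng`. Below is a minimal standalone sketch of that difference (not part of the patch; the seed and bound values are illustrative only):

    import random

    import numpy as np

    seed, idx = 1234, 0  # illustrative values; the dataset uses self.seed + idx
    n_sentences = 5      # illustrative sample size

    # Old behaviour: Python rng, upper bound inclusive.
    py_rng = random.Random(seed + idx)
    a_end_old = py_rng.randint(1, n_sentences - 1)   # draws from {1, 2, 3, 4}

    # New behaviour: NumPy rng, upper bound exclusive.
    np_rng = np.random.RandomState(seed=(seed + idx))
    a_end_new = np_rng.randint(1, n_sentences)       # also draws from {1, 2, 3, 4}

    # The paired rewrites in the diff keep the sampled range identical:
    #   rng.randint(1, n_sentences - 1)         ->  np_rng.randint(1, n_sentences)
    #   rng.randint(0, len(vocab_id_list) - 1)  ->  np_rng.randint(0, len(vocab_id_list))

Seeding a fresh `RandomState` with `self.seed + idx` preserves the old per-sample determinism of `random.Random(self.seed + idx)`, and routing the former `np.random.choice` call through the same `np_rng` takes the n-gram length draw off the global NumPy state, so it too becomes reproducible per sample.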