megatron/data/albert_dataset.py  +16 −245

"""TO BE ADDED"""
# coding=utf-8
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""ALBERT Style dataset."""

import os
import time

@@ -140,11 +155,6 @@ class AlbertDataset(Dataset):
        sample = []
        for index in range(start_index, end_index):
            sample.append(self.indexed_dataset[index])
        '''
        for s in sample:
            if len(s) > 1000:
                print(self.tokenizer.convert_ids_to_tokens(s))
        '''
        # Note that this rng state should be numpy and not python since
        # python randint is inclusive whereas the numpy one is exclusive.
        np_rng = np.random.RandomState(seed=(self.seed + idx))

@@ -285,242 +295,3 @@ def get_samples_mapping_(indexed_dataset,
            samples_mapping.shape[0]))

    return samples_mapping

'''
def get_target_seq_length(max_num_tokens, short_seq_prob, np_rng):
    """With probability `short_seq_prob` generate a smaller sequence length."""
    if np_rng.random() < short_seq_prob:
        return np_rng.randint(2, max_num_tokens + 1)
    return max_num_tokens


def build_training_samples_mapping(indexed_dataset, num_epochs,
                                   max_seq_length, short_seq_prob, seed):
    """Build a mapping to reconstruct training samples."""

    start_time = time.time()
    print('> building training samples mapping ...')

    # RNG:
    np_rng = np.random.RandomState(seed=seed)
    # List of start sentence index and end sentence index (end is exclusive)
    # to retrieve.
    samples = []
    # Account for [CLS], [SEP], [SEP]
    max_num_tokens = max_seq_length - 3
    # Number of documents processed:
    total_docs = 0
    # Number of documents that are skipped:
    skipped_docs = 0
    # Number of empty documents:
    empty_docs = 0

    # For each epoch:
    for epoch in range(num_epochs):
        # For each document:
        for doc_index in range(indexed_dataset.num_docs):
            if epoch == 0:
                total_docs += 1
            # Document sentences are in [sent_index_first, sent_index_last).
            sent_index_first = indexed_dataset.doc_idx[doc_index]
            sent_index_last = indexed_dataset.doc_idx[doc_index + 1]
            assert sent_index_last >= sent_index_first
            # Empty docs.
            if (sent_index_last - sent_index_first) == 0:
                if epoch == 0:
                    print('***WARNING*** document {} is empty'.format(
                        doc_index))
                    empty_docs += 1
                continue
            # Skip documents that only have one sentence.
            if (sent_index_last - sent_index_first) == 1:
                if epoch == 0:
                    print('***WARNING*** document {} has only one sentence, '
                          'skipping ...'.format(doc_index))
                    skipped_docs += 1
                continue

            # Loop through sentences.
            sent_index = sent_index_first
            target_seq_length = get_target_seq_length(max_num_tokens,
                                                      short_seq_prob, np_rng)
            size = 0
            while sent_index < sent_index_last:
                # Get the size.
                assert indexed_dataset.sizes[sent_index] > 0
                size += indexed_dataset.sizes[sent_index]
                sent_index += 1
                # If we have reached the target length.
                exceeded_target_size = (size >= target_seq_length)
                # If only one sentence is left in the document.
                only_one_sent_left = (sent_index == (sent_index_last - 1))
                # If we have at least two sentences.
                have_more_than_one_sent = (sent_index - sent_index_first) > 1
                # If we have reached the end of the document.
                reached_end_of_doc = (sent_index == sent_index_last)

                if (exceeded_target_size and not only_one_sent_left and
                    have_more_than_one_sent) or reached_end_of_doc:
                    assert (sent_index - sent_index_first) > 1
                    assert size > 1
                    # Add the sample.
                    samples.append([sent_index_first, sent_index,
                                    target_seq_length])
                    # Reset indices
                    sent_index_first = sent_index
                    target_seq_length = get_target_seq_length(max_num_tokens,
                                                              short_seq_prob,
                                                              np_rng)
                    size = 0
                    num_sentences = 0

    # Convert to numpy array.
    samples_np = np.array(samples, dtype=np.int64)
    # Shuffle.
    np_rng.shuffle(samples_np)

    elapsed_time = time.time() - start_time
    # Print some stats:
    print('\n***************************** info *****************************')
    print(' elapsed time (sec) ..................... {}'.format(elapsed_time))
    print(' number of epochs ....................... {}'.format(num_epochs))
    print(' number of samples ...................... {}'.format(
        samples_np.shape[0]))
    print(' number of documents .................... {}'.format(total_docs))
    print(' number of empty documents .............. {}'.format(empty_docs))
    print(' number of documents with one sentence .. {}'.format(skipped_docs))
    print('****************************************************************\n')

    return samples_np
'''
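For illustration only (editor's sketch, not part of this change): the removed block above draws a per-sample target length that is usually `max_num_tokens` but, with probability `short_seq_prob`, a shorter value in `[2, max_num_tokens]`. The snippet below reproduces just that draw with plain numpy so the roughly 10% short-sequence fraction is visible; the function name and the constants are illustrative.

    # Editor's sketch: mirrors the target-length draw from the removed
    # get_target_seq_length above. Values (509, 0.1, 1234) are illustrative.
    import numpy as np

    def target_seq_length(max_num_tokens, short_seq_prob, np_rng):
        # With probability short_seq_prob, pick a shorter target length.
        if np_rng.random() < short_seq_prob:
            # numpy randint's upper bound is exclusive, hence the +1.
            return np_rng.randint(2, max_num_tokens + 1)
        return max_num_tokens

    np_rng = np.random.RandomState(1234)
    lengths = [target_seq_length(509, 0.1, np_rng) for _ in range(10000)]
    print(sum(l < 509 for l in lengths) / len(lengths))  # roughly 0.1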

'''
# WILL BE REPLACED WITH JARED'S
class JaredDataset(object):

    def __init__(self, doc_idx, sizes, sentences):
        self.doc_idx = doc_idx
        self.num_docs = len(self.doc_idx) - 1
        self.sizes = sizes
        self.sentences = sentences

    def __getitem__(self, idx):
        return self.sentences[idx]


if __name__ == '__main__':

    print('dataset ...')

    from bert_tokenization import FullTokenizer
    import json
    import nltk
    nltk.download('punkt')

    def document_generator_provider(input_file):
        with open(input_file, 'r') as ifile:
            for document in ifile:
                data = json.loads(document)
                text = data['text']
                sentences = []
                for line in text.split('\n'):
                    if line != '\n':
                        sent = nltk.tokenize.sent_tokenize(line)
                        if sent:
                            sentences.extend(sent)
                yield sentences

    input_file = 'test/samples_10000.json'
    vocab_file = 'test/vocab.txt'

    tokenizer = FullTokenizer(vocab_file, do_lower_case=True)
    document_generator = document_generator_provider(input_file)

    doc_idx = [0]
    sizes = []
    sentences_list = []
    for sentences in document_generator:
        num_sent = 0
        for sentence in sentences:
            tokens = tokenizer.tokenize(sentence)
            if tokens:
                ids = tokenizer.convert_tokens_to_ids(tokens)
                if len(ids) == 0:
                    print('****************')
                    print(sentence)
                    print(tokens)
                    print(ids)
                    print('****************')
                sizes.append(len(ids))
                sentences_list.append(ids)
                num_sent += 1
        doc_idx.append(num_sent)
    for i in range(1, len(doc_idx)):
        doc_idx[i] += doc_idx[i - 1]

    # max_size = np.iinfo(np.int32).max // 32
    import time
    docs_np = np.array(doc_idx, dtype=np.uint32)
    sizes_np = np.array(sizes, dtype=np.uint16)

    start_time = time.time()
    max_seq_length = 512
    max_size = docs_np.shape[0]
    lens = np.full(max_size, max_seq_length - 3, dtype=np.uint16)
    lens_rand = np.random.randint(low=2, high=(max_seq_length - 2),
                                  size=max_size // 10, dtype=np.uint16)
    lens_view = lens[:max_size // 10]
    np.copyto(lens_view, lens_rand)
    np.random.shuffle(lens)
    print('num docs', max_size)
    print('lens time', time.time() - start_time)

    import helpers
    start_time = time.time()
    maps = helpers.build_mapping(docs_np, sizes_np, 10, 100, 509, 0.1, 1234)
    print('maps time', time.time() - start_time)
    print(maps)
    exit()

    start_time = time.time()
    max_size = 10  # np.iinfo(np.int32).max // 32
    docs = np.arange(10, dtype=np.uint32)
    print(docs)
    a = example.doit(docs, max_size)
    print(type(a))
    print(a.shape)
    print(a)
    print(time.time() - start_time)
    exit()

    # start_time = time.time()
    count = doit(maps, docs_np, sizes_np, lens, docs_np.shape[0] - 1, 10)
    print(count)
    maps = maps[:count]
    np.random.shuffle(maps)
    print(time.time() - start_time)
    exit()

    indexed_dataset = JaredDataset(doc_idx, sizes, sentences_list)
    dataset = AlbertDataSet(indexed_dataset=indexed_dataset,
                            tokenizer=tokenizer,
                            num_epochs=10,
                            masked_lm_prob=0.15,
                            max_seq_length=512,
                            short_seq_prob=0.1,
                            seed=1234)
'''
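For orientation (editor's sketch, not part of this change): each row of the mapping returned by get_samples_mapping_ is a [start_sentence_index, end_sentence_index, target_sequence_length] triple, and __getitem__ gathers the sentences in that range and seeds a per-sample numpy RNG, as in the retained snippet near the top of the file. A toy version with a list of token-id lists standing in for the memory-mapped indexed dataset:

    # Editor's sketch with a toy in-memory "indexed dataset"; the real class
    # wraps a binary index file, so the data below is illustrative only.
    import numpy as np

    indexed_dataset = [[101, 7592], [2023, 2003, 1037], [7099, 6251]]
    samples_mapping = np.array([[0, 2, 32], [1, 3, 32]], dtype=np.int64)

    idx = 0
    seed = 1234
    start_index, end_index, target_seq_length = samples_mapping[idx]
    sample = []
    for index in range(start_index, end_index):
        sample.append(indexed_dataset[index])
    # Numpy RNG: randint's upper bound is exclusive, unlike python's randint.
    np_rng = np.random.RandomState(seed=(seed + idx))
    print(sample, int(target_seq_length))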
megatron/data/dataset_utils.py  +14 −100

"""TO BE ADDED"""
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors, and NVIDIA.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import collections

@@ -373,102 +386,3 @@ def pad_and_convert_to_numpy(tokens, tokentypes, masked_positions,
    loss_mask_np = np.array(loss_mask, dtype=np.int64)

    return tokens_np, tokentypes_np, labels_np, padding_mask_np, loss_mask_np

'''
if __name__ == '__main__':

    print('building the dataset ...')

    from bert_tokenization import FullTokenizer
    import json
    import nltk
    nltk.download('punkt')

    def document_generator_provider(input_file):
        with open(input_file, 'r') as ifile:
            for document in ifile:
                data = json.loads(document)
                text = data['text']
                sentences = []
                for line in text.split('\n'):
                    if line != '\n':
                        sentences.extend(nltk.tokenize.sent_tokenize(line))
                yield sentences

    input_file = '/raid/mshoeybi/data/albert/sample/samples_11.json'
    vocab_file = '/raid/mshoeybi/data/albert/bert_vocab/vocab.txt'

    tokenizer = FullTokenizer(vocab_file, do_lower_case=True)
    document_generator = document_generator_provider(input_file)

    samples = []
    sizes = []
    for sentences in document_generator:
        tokens_list = []
        size = 0
        for sentence in sentences:
            tokens = tokenizer.tokenize(sentence)
            tokens_list.append(tokens)
            size += len(tokens)
        samples.append(tokens_list)
        sizes.append(size)
    print(sizes)

    import random
    rng = random.Random(123567)

    vocab_id_list = list(tokenizer.inv_vocab.keys())
    cls_id = tokenizer.vocab['[CLS]']
    sep_id = tokenizer.vocab['[SEP]']
    mask_id = tokenizer.vocab['[MASK]']
    pad_id = tokenizer.vocab['[PAD]']
    vocab_id_to_token_dict = tokenizer.inv_vocab

    sample = []
    for s in samples[0]:
        sample.append(tokenizer.convert_tokens_to_ids(s))
    max_seq_length = 512
    target_seq_length = 444
    masked_lm_prob = 0.15
    example = build_training_sample(sample, target_seq_length, max_seq_length,
                                    vocab_id_list, vocab_id_to_token_dict,
                                    cls_id, sep_id, mask_id, pad_id,
                                    masked_lm_prob, rng)

    orig_tokens = []
    for s in samples[0]:
        orig_tokens.extend(s)

    is_random = example['is_random']
    if is_random:
        print('random')
    else:
        print('not-random')
    # exit()

    ii = 0
    for i in range(max_seq_length):
        token = tokenizer.inv_vocab[example['text'][i]]
        if token in ['[CLS]', '[SEP]']:
            orig_token = token
        elif ii < len(orig_tokens):
            orig_token = orig_tokens[ii]
            ii += 1
        else:
            orig_token = 'EMPTY'
        tokentype = example['types'][i]
        label_id = example['labels'][i]
        label = 'NONE'
        if label_id >= 0:
            label = tokenizer.inv_vocab[label_id]
        loss_mask = example['loss_mask'][i]
        padding_mask = example['padding_mask'][i]
        string = ''
        string += '{:15s}'.format(orig_token)
        string += '{:15s}'.format(token)
        string += '{:15s}'.format(label)
        string += '{:5d}'.format(loss_mask)
        string += '{:5d}'.format(tokentype)
        string += '{:5d}'.format(padding_mask)
        print(string)
'''
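To make the per-position dump above easier to read (editor's note, not part of the diff): the five arrays returned by pad_and_convert_to_numpy line up index by index as token id, token type, MLM label, padding mask, and loss mask, and the removed harness treats negative label ids as "no prediction here". The sketch below is not the actual implementation; it is a minimal hand-rolled version of that padding convention, assuming -1 marks unlabeled positions and that the [PAD] id fills the tail.

    # Editor's sketch only -- illustrates the shapes/semantics of the
    # returned arrays, not the real pad_and_convert_to_numpy.
    import numpy as np

    def pad_example(tokens, tokentypes, masked_positions, masked_labels,
                    pad_id, max_seq_length):
        num_pad = max_seq_length - len(tokens)
        tokens_np = np.array(tokens + [pad_id] * num_pad, dtype=np.int64)
        tokentypes_np = np.array(tokentypes + [pad_id] * num_pad,
                                 dtype=np.int64)
        # 1 for real tokens, 0 for padding.
        padding_mask_np = np.array([1] * len(tokens) + [0] * num_pad,
                                   dtype=np.int64)
        # -1 everywhere except masked positions, which keep the original id.
        labels = [-1] * max_seq_length
        loss_mask = [0] * max_seq_length
        for pos, lab in zip(masked_positions, masked_labels):
            labels[pos] = lab
            loss_mask[pos] = 1
        return (tokens_np, tokentypes_np, np.array(labels, dtype=np.int64),
                padding_mask_np, np.array(loss_mask, dtype=np.int64))

    # [CLS] a [MASK] c [SEP], with the token at position 2 masked out.
    out = pad_example([101, 1037, 103, 1039, 102], [0] * 5, [2], [1038], 0, 8)
    print([a.tolist() for a in out])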
megatron/data/helpers.cpp  +18 −0

/* coding=utf-8
   Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License. */

/* Helper methods for fast index mapping builds */

#include <algorithm>
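The removed experiment in albert_dataset.py above calls this compiled helper as helpers.build_mapping(docs_np, sizes_np, ...). As an editor's sketch (not part of the change), the snippet below builds the two index arrays that call consumes for a toy corpus; the interpretation of the remaining positional arguments is inferred from the surrounding Python and is not spelled out in this diff.

    # Editor's sketch of the index arrays handed to the C++ helper.
    import numpy as np

    # Toy corpus: 3 documents with 2, 3, and 1 sentences.
    sentence_token_counts = [5, 7, 4, 9, 12, 6]
    sentences_per_doc = [2, 3, 1]

    # sizes[i] = number of tokens in sentence i.
    sizes_np = np.array(sentence_token_counts, dtype=np.uint16)
    # doc_idx is a cumulative sentence count, so document d owns sentences
    # [doc_idx[d], doc_idx[d + 1]).
    doc_idx = [0]
    for num_sent in sentences_per_doc:
        doc_idx.append(doc_idx[-1] + num_sent)
    docs_np = np.array(doc_idx, dtype=np.uint32)
    print(docs_np)   # [0 2 5 6]
    print(sizes_np)  # [ 5  7  4  9 12  6]

    # The removed experiment then hands these to the compiled extension:
    #   maps = helpers.build_mapping(docs_np, sizes_np, 10, 100, 509, 0.1, 1234)
    # where 509 appears to be max_seq_length - 3 and 0.1 the short-sequence
    # probability; the other positional arguments are not documented here.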