megatron/data/albert_dataset.py  +16 −245

"""TO BE ADDED"""
# coding=utf-8
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""ALBERT Style dataset."""

import os
import time

@@ -140,11 +155,6 @@ class AlbertDataset(Dataset):
        sample = []
        for index in range(start_index, end_index):
            sample.append(self.indexed_dataset[index])
        '''
        for s in sample:
            if len(s) > 1000:
                print(self.tokenizer.convert_ids_to_tokens(s))
        '''
        # Note that this rng state should be numpy and not python since
        # python randint is inclusive whereas the numpy one is exclusive.
        np_rng = np.random.RandomState(seed=(self.seed + idx))

@@ -285,242 +295,3 @@ def get_samples_mapping_(indexed_dataset,
            samples_mapping.shape[0]))

    return samples_mapping

'''
def get_target_seq_length(max_num_tokens, short_seq_prob, np_rng):
    """With probability `short_seq_prob` generate a smaller sequence length."""
    if np_rng.random() < short_seq_prob:
        return np_rng.randint(2, max_num_tokens + 1)
    return max_num_tokens


def build_training_samples_mapping(indexed_dataset, num_epochs,
                                   max_seq_length, short_seq_prob, seed):
    """Build a mapping to reconstruct training samples."""

    start_time = time.time()
    print('> building training samples mapping ...')

    # RNG:
    np_rng = np.random.RandomState(seed=seed)
    # List of start sentence index and end sentence index (end is exclusive)
    # to retrieve.
    samples = []
    # Account for [CLS], [SEP], [SEP]
    max_num_tokens = max_seq_length - 3
    # Number of documents processed:
    total_docs = 0
    # Number of documents that are skipped:
    skipped_docs = 0
    # Number of empty documents:
    empty_docs = 0

    # For each epoch:
    for epoch in range(num_epochs):
        # For each document:
        for doc_index in range(indexed_dataset.num_docs):
            if epoch == 0:
                total_docs += 1
            # Document sentences are in [sent_index_first, sent_index_last).
            sent_index_first = indexed_dataset.doc_idx[doc_index]
            sent_index_last = indexed_dataset.doc_idx[doc_index + 1]
            assert sent_index_last >= sent_index_first
            # Empty docs.
            if (sent_index_last - sent_index_first) == 0:
                if epoch == 0:
                    print('***WARNING*** document {} is empty'.format(
                        doc_index))
                    empty_docs += 1
                continue
            # Skip documents that only have one sentence.
            if (sent_index_last - sent_index_first) == 1:
                if epoch == 0:
                    print('***WARNING*** document {} has only one sentence, '
                          'skipping ...'.format(doc_index))
                    skipped_docs += 1
                continue

            # Loop through sentences.
            sent_index = sent_index_first
            target_seq_length = get_target_seq_length(max_num_tokens,
                                                      short_seq_prob, np_rng)
            size = 0
            while sent_index < sent_index_last:
                # Get the size.
                assert indexed_dataset.sizes[sent_index] > 0
                size += indexed_dataset.sizes[sent_index]
                sent_index += 1
                # If we have reached the target length.
                exceeded_target_size = (size >= target_seq_length)
                # If only one sentence is left in the document.
                only_one_sent_left = (sent_index == (sent_index_last - 1))
                # If we have at least two sentences.
                have_more_than_one_sent = (sent_index - sent_index_first) > 1
                # If we have reached the end of the document.
                reached_end_of_doc = (sent_index == sent_index_last)

                if (exceeded_target_size and not only_one_sent_left and
                    have_more_than_one_sent) or reached_end_of_doc:
                    assert (sent_index - sent_index_first) > 1
                    assert size > 1
                    # Add the sample.
                    samples.append([sent_index_first, sent_index,
                                    target_seq_length])
                    # Reset indices
                    sent_index_first = sent_index
                    target_seq_length = get_target_seq_length(max_num_tokens,
                                                              short_seq_prob,
                                                              np_rng)
                    size = 0
                    num_sentences = 0

    # Convert to numpy array.
    samples_np = np.array(samples, dtype=np.int64)
    # Shuffle.
    np_rng.shuffle(samples_np)

    elapsed_time = time.time() - start_time
    # Print some stats:
    print('\n***************************** info *****************************')
    print(' elapsed time (sec) ..................... {}'.format(elapsed_time))
    print(' number of epochs ....................... {}'.format(num_epochs))
    print(' number of samples ...................... {}'.format(
        samples_np.shape[0]))
    print(' number of documents .................... {}'.format(total_docs))
    print(' number of empty documents .............. {}'.format(empty_docs))
    print(' number of documents with one sentence .. {}'.format(skipped_docs))
    print('****************************************************************\n')

    return samples_np
'''
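For illustration only (editor's sketch, not part of this change): the removed block above draws a per-sample target length that is usually `max_num_tokens` but, with probability `short_seq_prob`, a shorter value in `[2, max_num_tokens]`. The snippet below reproduces just that draw with plain numpy so the roughly 10% short-sequence fraction is visible; the function name and the constants are illustrative.

    # Editor's sketch: mirrors the target-length draw from the removed
    # get_target_seq_length above. Values (509, 0.1, 1234) are illustrative.
    import numpy as np

    def target_seq_length(max_num_tokens, short_seq_prob, np_rng):
        # With probability short_seq_prob, pick a shorter target length.
        if np_rng.random() < short_seq_prob:
            # numpy randint's upper bound is exclusive, hence the +1.
            return np_rng.randint(2, max_num_tokens + 1)
        return max_num_tokens

    np_rng = np.random.RandomState(1234)
    lengths = [target_seq_length(509, 0.1, np_rng) for _ in range(10000)]
    print(sum(l < 509 for l in lengths) / len(lengths))  # roughly 0.1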

'''
# WILL BE REPLACED WITH JARED'S
class JaredDataset(object):

    def __init__(self, doc_idx, sizes, sentences):
        self.doc_idx = doc_idx
        self.num_docs = len(self.doc_idx) - 1
        self.sizes = sizes
        self.sentences = sentences

    def __getitem__(self, idx):
        return self.sentences[idx]


if __name__ == '__main__':

    print('dataset ...')

    from bert_tokenization import FullTokenizer
    import json
    import nltk
    nltk.download('punkt')

    def document_generator_provider(input_file):
        with open(input_file, 'r') as ifile:
            for document in ifile:
                data = json.loads(document)
                text = data['text']
                sentences = []
                for line in text.split('\n'):
                    if line != '\n':
                        sent = nltk.tokenize.sent_tokenize(line)
                        if sent:
                            sentences.extend(sent)
                yield sentences

    input_file = 'test/samples_10000.json'
    vocab_file = 'test/vocab.txt'

    tokenizer = FullTokenizer(vocab_file, do_lower_case=True)
    document_generator = document_generator_provider(input_file)

    doc_idx = [0]
    sizes = []
    sentences_list = []
    for sentences in document_generator:
        num_sent = 0
        for sentence in sentences:
            tokens = tokenizer.tokenize(sentence)
            if tokens:
                ids = tokenizer.convert_tokens_to_ids(tokens)
                if len(ids) == 0:
                    print('****************')
                    print(sentence)
                    print(tokens)
                    print(ids)
                    print('****************')
                sizes.append(len(ids))
                sentences_list.append(ids)
                num_sent += 1
        doc_idx.append(num_sent)
    for i in range(1, len(doc_idx)):
        doc_idx[i] += doc_idx[i - 1]

    # max_size = np.iinfo(np.int32).max // 32
    import time
    docs_np = np.array(doc_idx, dtype=np.uint32)
    sizes_np = np.array(sizes, dtype=np.uint16)

    start_time = time.time()
    max_seq_length = 512
    max_size = docs_np.shape[0]
    lens = np.full(max_size, max_seq_length - 3, dtype=np.uint16)
    lens_rand = np.random.randint(low=2, high=(max_seq_length - 2),
                                  size=max_size // 10, dtype=np.uint16)
    lens_view = lens[:max_size // 10]
    np.copyto(lens_view, lens_rand)
    np.random.shuffle(lens)
    print('num docs', max_size)
    print('lens time', time.time() - start_time)

    import helpers
    start_time = time.time()
    maps = helpers.build_mapping(docs_np, sizes_np, 10, 100, 509, 0.1, 1234)
    print('maps time', time.time() - start_time)
    print(maps)
    exit()

    start_time = time.time()
    max_size = 10  # np.iinfo(np.int32).max // 32
    docs = np.arange(10, dtype=np.uint32)
    print(docs)
    a = example.doit(docs, max_size)
    print(type(a))
    print(a.shape)
    print(a)
    print(time.time() - start_time)
    exit()

    # start_time = time.time()
    count = doit(maps, docs_np, sizes_np, lens, docs_np.shape[0] - 1, 10)
    print(count)
    maps = maps[:count]
    np.random.shuffle(maps)
    print(time.time() - start_time)
    exit()

    indexed_dataset = JaredDataset(doc_idx, sizes, sentences_list)
    dataset = AlbertDataSet(indexed_dataset=indexed_dataset,
                            tokenizer=tokenizer,
                            num_epochs=10,
                            masked_lm_prob=0.15,
                            max_seq_length=512,
                            short_seq_prob=0.1,
                            seed=1234)
'''
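For orientation (editor's sketch, not part of this change): each row of the mapping returned by get_samples_mapping_ is a [start_sentence_index, end_sentence_index, target_sequence_length] triple, and __getitem__ gathers the sentences in that range and seeds a per-sample numpy RNG, as in the retained snippet near the top of the file. A toy version with a list of token-id lists standing in for the memory-mapped indexed dataset:

    # Editor's sketch with a toy in-memory "indexed dataset"; the real class
    # wraps a binary index file, so the data below is illustrative only.
    import numpy as np

    indexed_dataset = [[101, 7592], [2023, 2003, 1037], [7099, 6251]]
    samples_mapping = np.array([[0, 2, 32], [1, 3, 32]], dtype=np.int64)

    idx = 0
    seed = 1234
    start_index, end_index, target_seq_length = samples_mapping[idx]
    sample = []
    for index in range(start_index, end_index):
        sample.append(indexed_dataset[index])
    # Numpy RNG: randint's upper bound is exclusive, unlike python's randint.
    np_rng = np.random.RandomState(seed=(seed + idx))
    print(sample, int(target_seq_length))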
megatron/data/dataset_utils.py  +14 −100

"""TO BE ADDED"""
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors, and NVIDIA.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import collections

@@ -373,102 +386,3 @@ def pad_and_convert_to_numpy(tokens, tokentypes, masked_positions,
    loss_mask_np = np.array(loss_mask, dtype=np.int64)

    return tokens_np, tokentypes_np, labels_np, padding_mask_np, loss_mask_np

'''
if __name__ == '__main__':

    print('building the dataset ...')

    from bert_tokenization import FullTokenizer
    import json
    import nltk
    nltk.download('punkt')

    def document_generator_provider(input_file):
        with open(input_file, 'r') as ifile:
            for document in ifile:
                data = json.loads(document)
                text = data['text']
                sentences = []
                for line in text.split('\n'):
                    if line != '\n':
                        sentences.extend(nltk.tokenize.sent_tokenize(line))
                yield sentences

    input_file = '/raid/mshoeybi/data/albert/sample/samples_11.json'
    vocab_file = '/raid/mshoeybi/data/albert/bert_vocab/vocab.txt'

    tokenizer = FullTokenizer(vocab_file, do_lower_case=True)
    document_generator = document_generator_provider(input_file)

    samples = []
    sizes = []
    for sentences in document_generator:
        tokens_list = []
        size = 0
        for sentence in sentences:
            tokens = tokenizer.tokenize(sentence)
            tokens_list.append(tokens)
            size += len(tokens)
        samples.append(tokens_list)
        sizes.append(size)
    print(sizes)

    import random
    rng = random.Random(123567)

    vocab_id_list = list(tokenizer.inv_vocab.keys())
    cls_id = tokenizer.vocab['[CLS]']
    sep_id = tokenizer.vocab['[SEP]']
    mask_id = tokenizer.vocab['[MASK]']
    pad_id = tokenizer.vocab['[PAD]']
    vocab_id_to_token_dict = tokenizer.inv_vocab

    sample = []
    for s in samples[0]:
        sample.append(tokenizer.convert_tokens_to_ids(s))
    max_seq_length = 512
    target_seq_length = 444
    masked_lm_prob = 0.15
    example = build_training_sample(sample, target_seq_length, max_seq_length,
                                    vocab_id_list, vocab_id_to_token_dict,
                                    cls_id, sep_id, mask_id, pad_id,
                                    masked_lm_prob, rng)

    orig_tokens = []
    for s in samples[0]:
        orig_tokens.extend(s)

    is_random = example['is_random']
    if is_random:
        print('random')
    else:
        print('not-random')
    # exit()

    ii = 0
    for i in range(max_seq_length):
        token = tokenizer.inv_vocab[example['text'][i]]
        if token in ['[CLS]', '[SEP]']:
            orig_token = token
        elif ii < len(orig_tokens):
            orig_token = orig_tokens[ii]
            ii += 1
        else:
            orig_token = 'EMPTY'
        tokentype = example['types'][i]
        label_id = example['labels'][i]
        label = 'NONE'
        if label_id >= 0:
            label = tokenizer.inv_vocab[label_id]
        loss_mask = example['loss_mask'][i]
        padding_mask = example['padding_mask'][i]
        string = ''
        string += '{:15s}'.format(orig_token)
        string += '{:15s}'.format(token)
        string += '{:15s}'.format(label)
        string += '{:5d}'.format(loss_mask)
        string += '{:5d}'.format(tokentype)
        string += '{:5d}'.format(padding_mask)
        print(string)
'''
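To make the per-position dump above easier to read (editor's note, not part of the diff): the five arrays returned by pad_and_convert_to_numpy line up index by index as token id, token type, MLM label, padding mask, and loss mask, and the removed harness treats negative label ids as "no prediction here". The sketch below is not the actual implementation; it is a minimal hand-rolled version of that padding convention, assuming -1 marks unlabeled positions and that the [PAD] id fills the tail.

    # Editor's sketch only -- illustrates the shapes/semantics of the
    # returned arrays, not the real pad_and_convert_to_numpy.
    import numpy as np

    def pad_example(tokens, tokentypes, masked_positions, masked_labels,
                    pad_id, max_seq_length):
        num_pad = max_seq_length - len(tokens)
        tokens_np = np.array(tokens + [pad_id] * num_pad, dtype=np.int64)
        tokentypes_np = np.array(tokentypes + [pad_id] * num_pad,
                                 dtype=np.int64)
        # 1 for real tokens, 0 for padding.
        padding_mask_np = np.array([1] * len(tokens) + [0] * num_pad,
                                   dtype=np.int64)
        # -1 everywhere except masked positions, which keep the original id.
        labels = [-1] * max_seq_length
        loss_mask = [0] * max_seq_length
        for pos, lab in zip(masked_positions, masked_labels):
            labels[pos] = lab
            loss_mask[pos] = 1
        return (tokens_np, tokentypes_np, np.array(labels, dtype=np.int64),
                padding_mask_np, np.array(loss_mask, dtype=np.int64))

    # [CLS] a [MASK] c [SEP], with the token at position 2 masked out.
    out = pad_example([101, 1037, 103, 1039, 102], [0] * 5, [2], [1038], 0, 8)
    print([a.tolist() for a in out])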
megatron/data/helpers.cpp  +18 −0

/* coding=utf-8
   Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License. */

/* Helper methods for fast index mapping builds */

#include <algorithm>
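The removed experiment in albert_dataset.py above calls this compiled helper as helpers.build_mapping(docs_np, sizes_np, ...). As an editor's sketch (not part of the change), the snippet below builds the two index arrays that call consumes for a toy corpus; the interpretation of the remaining positional arguments is inferred from the surrounding Python and is not spelled out in this diff.

    # Editor's sketch of the index arrays handed to the C++ helper.
    import numpy as np

    # Toy corpus: 3 documents with 2, 3, and 1 sentences.
    sentence_token_counts = [5, 7, 4, 9, 12, 6]
    sentences_per_doc = [2, 3, 1]

    # sizes[i] = number of tokens in sentence i.
    sizes_np = np.array(sentence_token_counts, dtype=np.uint16)
    # doc_idx is a cumulative sentence count, so document d owns sentences
    # [doc_idx[d], doc_idx[d + 1]).
    doc_idx = [0]
    for num_sent in sentences_per_doc:
        doc_idx.append(doc_idx[-1] + num_sent)
    docs_np = np.array(doc_idx, dtype=np.uint32)
    print(docs_np)   # [0 2 5 6]
    print(sizes_np)  # [ 5  7  4  9 12  6]

    # The removed experiment then hands these to the compiled extension:
    #   maps = helpers.build_mapping(docs_np, sizes_np, 10, 100, 509, 0.1, 1234)
    # where 509 appears to be max_seq_length - 3 and 0.1 the short-sequence
    # probability; the other positional arguments are not documented here.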