Commit 75a86a1d authored by Mohammad Shoeybi's avatar Mohammad Shoeybi
Browse files

minor cleaning

parent 5fc301aa
Loading
Loading
Loading
Loading
+16 −245
Original line number Diff line number Diff line
"""TO BE ADDED """
# coding=utf-8
# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""ALBERT Style dataset."""

import os
import time
@@ -140,11 +155,6 @@ class AlbertDataset(Dataset):
        sample = []
        for index in range(start_index, end_index):
            sample.append(self.indexed_dataset[index])
        '''
        for s in sample:
            if len(s) > 1000:
                print(self.tokenizer.convert_ids_to_tokens(s))
        '''
        # Note that this rng state should be numpy and not python since
        # python randint is inclusive whereas the numpy one is exclusive.
        np_rng = np.random.RandomState(seed=(self.seed + idx))
@@ -285,242 +295,3 @@ def get_samples_mapping_(indexed_dataset,
        samples_mapping.shape[0]))

    return samples_mapping


'''
def get_target_seq_length(max_num_tokens, short_seq_prob, np_rng):
    """Sample the target sequence length for one training sample.

    With probability `short_seq_prob` a shorter length is drawn uniformly
    from [2, max_num_tokens]; otherwise the full `max_num_tokens` is used.

    Arguments:
        max_num_tokens: maximum number of tokens a sample may hold.
        short_seq_prob: probability in [0, 1] of drawing a short length.
        np_rng: numpy RandomState used for the draws, so a fixed seed
            reproduces the same lengths.

    Returns:
        An int in [2, max_num_tokens], or `max_num_tokens` itself when it
        is already <= 2.
    """
    # Guard the degenerate case: randint requires low < high, so with
    # max_num_tokens <= 2 there is nothing shorter to draw.
    if max_num_tokens <= 2:
        return max_num_tokens
    # random_sample() is the stable RandomState spelling (.random() is a
    # newer alias); both draw a float in [0, 1).
    if np_rng.random_sample() < short_seq_prob:
        # randint's upper bound is exclusive, hence the +1 so that
        # max_num_tokens itself can be drawn.
        return np_rng.randint(2, max_num_tokens + 1)
    return max_num_tokens


def build_training_samples_mapping(indexed_dataset, num_epochs, max_seq_length,
                                   short_seq_prob, seed):
    """Build a mapping to reconstruct training samples.

    Walks every document `num_epochs` times, greedily packing consecutive
    sentences until a (possibly shortened) target length is reached, and
    records each sample as a triple [first sentence index, end sentence
    index (exclusive), target sequence length]. The result is shuffled
    with a seeded RNG so builds are reproducible for a fixed `seed`.

    Returns a shuffled int64 numpy array of shape [num_samples, 3].
    """

    start_time = time.time()
    print('> building training samples mapping ...')

    # RNG (seeded so the mapping is deterministic for a given seed):
    np_rng = np.random.RandomState(seed=seed)

    # List of start sentence index and end sentence index (end is exclusive)
    # to retrieve.
    samples = []

    # Account for [CLS], [SEP], [SEP]
    max_num_tokens = max_seq_length - 3

    # Number of documents processed:
    total_docs = 0
    # Number of documents that are skipped:
    skipped_docs = 0
    # Number of empty documents:
    empty_docs = 0

    # For each epoch:
    for epoch in range(num_epochs):
        # For each document:
        for doc_index in range(indexed_dataset.num_docs):
            # Stats (and warnings below) are only accumulated on the first
            # epoch; later epochs revisit the same documents.
            if epoch == 0:
                total_docs += 1

            # Document sentences are in [sent_index_first, sent_index_last).
            sent_index_first = indexed_dataset.doc_idx[doc_index]
            sent_index_last = indexed_dataset.doc_idx[doc_index+1]
            assert sent_index_last >= sent_index_first

            # Empty docs.
            if (sent_index_last - sent_index_first) == 0:
                if epoch == 0:
                    print('***WARNING*** document {} is empty'.format(
                        doc_index))
                    empty_docs += 1
                continue
            # Skip documents that only have one sentence: every emitted
            # sample must span at least two sentences (see asserts below).
            if (sent_index_last - sent_index_first) == 1:
                if epoch == 0:
                    print('***WARNING*** document {} has only one sentnece, '
                          'skipping ...'.format(doc_index))
                    skipped_docs += 1
                continue

            # Loop through sentences, accumulating them into samples.
            sent_index = sent_index_first
            target_seq_length = get_target_seq_length(max_num_tokens,
                                                      short_seq_prob, np_rng)
            size = 0
            while sent_index < sent_index_last:

                # Get the size (in tokens) of the current sentence.
                assert indexed_dataset.sizes[sent_index] > 0
                size += indexed_dataset.sizes[sent_index]
                sent_index += 1

                # If we have reached the target length.
                exceeded_target_size = (size >= target_seq_length)
                # If only one sentence is left in the document.
                only_one_sent_left = (sent_index == (sent_index_last - 1))
                # If we have at least two sentences.
                have_more_than_one_sent = (sent_index - sent_index_first) > 1
                # If we have reached end of the document.
                reached_end_of_doc = (sent_index == sent_index_last)
                # Close the sample either when it is long enough — but never
                # in a way that strands a single orphan sentence at the end
                # of the document — or unconditionally at document end.
                if (exceeded_target_size and not only_one_sent_left and
                    have_more_than_one_sent) or reached_end_of_doc:
                    assert (sent_index - sent_index_first) > 1
                    assert size > 1
                    # Add the sample.
                    samples.append([sent_index_first, sent_index,
                                    target_seq_length])
                    # Reset indices for the next sample in this document.
                    sent_index_first = sent_index
                    target_seq_length = get_target_seq_length(max_num_tokens,
                                                              short_seq_prob,
                                                              np_rng)
                    size = 0
                    # NOTE(review): num_sentences is assigned but never read
                    # anywhere in this function — looks like dead code.
                    num_sentences = 0

    # Convert to numpy array.
    samples_np = np.array(samples, dtype=np.int64)
    # Shuffle (rows are permuted; each [start, end, length] row is intact).
    np_rng.shuffle(samples_np)
    elapsed_time = time.time() - start_time

    # Print some stats:
    print('\n***************************** info *****************************')
    print('   elapsed time (sec) ..................... {}'.format(elapsed_time))
    print('   number of epochs ....................... {}'.format(num_epochs))
    print('   number of samples ...................... {}'.format(
        samples_np.shape[0]))
    print('   number of documents .................... {}'.format(total_docs))
    print('   number of empty documents .............. {}'.format(empty_docs))
    print('   number of documents with one sentence .. {}'.format(skipped_docs))
    print('****************************************************************\n')

    return samples_np
'''

'''
# WILL BE REPLACED WITH JARED'S
class JaredDataset(object):
    """Minimal stand-in for an indexed sentence dataset.

    Exposes the interface the mapping builders expect: `doc_idx[i]` is the
    index of document i's first sentence (with a trailing boundary entry),
    `sizes[j]` is the token count of sentence j, and indexing the dataset
    returns sentence j itself.
    """

    def __init__(self, doc_idx, sizes, sentences):
        # Document boundary list: len(doc_idx) == number of documents + 1.
        self.doc_idx = doc_idx
        self.sentences = sentences
        self.sizes = sizes
        # One fewer document than boundary entries.
        self.num_docs = len(doc_idx) - 1

    def __getitem__(self, idx):
        """Return the sentence stored at position `idx`."""
        return self.sentences[idx]



if __name__ == '__main__':
    print('dataset ...')

    from bert_tokenization import FullTokenizer
    import json
    import nltk
    nltk.download('punkt')

    def document_generator_provider(input_file):
        with open(input_file, 'r') as ifile:
            for document in ifile:
                data = json.loads(document)
                text = data['text']
                sentences = []
                for line in text.split('\n'):
                    if line != '\n':
                        sent = nltk.tokenize.sent_tokenize(line)
                        if sent:
                            sentences.extend(sent)
                yield sentences

    input_file = 'test/samples_10000.json'
    vocab_file = 'test/vocab.txt'

    tokenizer = FullTokenizer(vocab_file, do_lower_case=True)
    document_generator = document_generator_provider(input_file)

    doc_idx = [0]
    sizes = []
    sentences_list = []

    for sentences in document_generator:
        num_sent = 0
        for sentence in sentences:
            tokens = tokenizer.tokenize(sentence)
            if tokens:
                ids = tokenizer.convert_tokens_to_ids(tokens)
                if len(ids) == 0:
                    print('****************')
                    print(sentence)
                    print(tokens)
                    print(ids)
                    print('****************')
                sizes.append(len(ids))
                sentences_list.append(ids)
                num_sent += 1
        doc_idx.append(num_sent)
    for i in range(1, len(doc_idx)):
        doc_idx[i] += doc_idx[i-1]

    #max_size = np.iinfo(np.int32).max // 32

    import time

    docs_np = np.array(doc_idx, dtype=np.uint32)
    sizes_np = np.array(sizes, dtype=np.uint16)

    start_time = time.time()
    max_seq_length = 512
    max_size = docs_np.shape[0]
    lens = np.full(max_size, max_seq_length-3, dtype=np.uint16)
    lens_rand = np.random.randint(low=2, high=(max_seq_length-2),
                                  size=max_size//10, dtype=np.uint16)
    lens_view = lens[:max_size//10]
    np.copyto(lens_view, lens_rand)
    np.random.shuffle(lens)
    print('num docs', max_size)
    print('lens time', time.time() - start_time)

    import helpers
    start_time = time.time()
    maps = helpers.build_mapping(docs_np, sizes_np, 10, 100, 509, 0.1, 1234)
    print('maps time', time.time() - start_time)
    print(maps)
    exit()

    start_time = time.time()
    max_size = 10 #np.iinfo(np.int32).max 32
    docs = np.arange(10, dtype=np.uint32)
    print(docs)

    a = example.doit(docs, max_size)
    print(type(a))
    print(a.shape)
    print(a)
    print(time.time() - start_time)
    exit()


    #start_time = time.time()
    count = doit(maps, docs_np, sizes_np, lens,docs_np.shape[0]-1, 10)
    print(count)
    maps = maps[:count]
    np.random.shuffle(maps)
    print(time.time() - start_time)


    exit()

    indexed_dataset = JaredDataset(doc_idx, sizes, sentences_list)
    dataset = AlbertDataSet(indexed_dataset=indexed_dataset,
                            tokenizer=tokenizer,
                            num_epochs=10,
                            masked_lm_prob=0.15,
                            max_seq_length=512,
                            short_seq_prob=0.1,
                            seed=1234)
'''
+14 −100
Original line number Diff line number Diff line
"""TO BE ADDED"""
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors, and NVIDIA.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import collections
@@ -373,102 +386,3 @@ def pad_and_convert_to_numpy(tokens, tokentypes, masked_positions,
    loss_mask_np = np.array(loss_mask, dtype=np.int64)

    return tokens_np, tokentypes_np, labels_np, padding_mask_np, loss_mask_np


'''
if __name__ == '__main__':


    print('building the dataset ...')

    from bert_tokenization import FullTokenizer
    import json
    import nltk
    nltk.download('punkt')

    def document_generator_provider(input_file):
        with open(input_file, 'r') as ifile:
            for document in ifile:
                data = json.loads(document)
                text = data['text']
                sentences = []
                for line in text.split('\n'):
                    if line != '\n':
                        sentences.extend(nltk.tokenize.sent_tokenize(line))
                yield sentences

    input_file = '/raid/mshoeybi/data/albert/sample/samples_11.json'
    vocab_file = '/raid/mshoeybi/data/albert/bert_vocab/vocab.txt'

    tokenizer = FullTokenizer(vocab_file, do_lower_case=True)

    document_generator = document_generator_provider(input_file)
    samples = []
    sizes = []
    for sentences in document_generator:
        tokens_list = []
        size = 0
        for sentence in sentences:
            tokens = tokenizer.tokenize(sentence)
            tokens_list.append(tokens)
            size += len(tokens)
        samples.append(tokens_list)
        sizes.append(size)
    print(sizes)

    import random
    rng = random.Random(123567)
    vocab_id_list = list(tokenizer.inv_vocab.keys())
    cls_id = tokenizer.vocab['[CLS]']
    sep_id = tokenizer.vocab['[SEP]']
    mask_id = tokenizer.vocab['[MASK]']
    pad_id = tokenizer.vocab['[PAD]']
    vocab_id_to_token_dict = tokenizer.inv_vocab
    sample = []
    for s in samples[0]:
        sample.append(tokenizer.convert_tokens_to_ids(s))
    max_seq_length = 512
    target_seq_length = 444
    masked_lm_prob = 0.15
    example = build_training_sample(sample,
                                    target_seq_length, max_seq_length,
                                    vocab_id_list, vocab_id_to_token_dict,
                                    cls_id, sep_id, mask_id, pad_id,
                                    masked_lm_prob, rng)

    orig_tokens = []
    for s in samples[0]:
        orig_tokens.extend(s)
    is_random = example['is_random']
    if is_random:
        print('random')
    else:
        print('not-random')
    #exit()
    ii = 0
    for i in range(max_seq_length):
        token = tokenizer.inv_vocab[example['text'][i]]
        if token in ['[CLS]', '[SEP]'] :
            orig_token = token
        elif ii < len(orig_tokens):
            orig_token = orig_tokens[ii]
            ii += 1
        else:
            orig_token = 'EMPTY'
        tokentype = example['types'][i]
        label_id = example['labels'][i]
        label = 'NONE'
        if label_id >= 0:
            label = tokenizer.inv_vocab[label_id]
        loss_mask = example['loss_mask'][i]
        padding_mask = example['padding_mask'][i]

        string = ''
        string += '{:15s}'.format(orig_token)
        string += '{:15s}'.format(token)
        string += '{:15s}'.format(label)
        string += '{:5d}'.format(loss_mask)
        string += '{:5d}'.format(tokentype)
        string += '{:5d}'.format(padding_mask)
        print(string)
'''
+18 −0
Original line number Diff line number Diff line
/*
 coding=utf-8
 Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */


/* Helper methods for fast index mapping builds */

#include <algorithm>