Commit 6140718f authored by Mohammad Shoeybi

before optimization

parent c125d247
+72 −57
@@ -7,18 +7,55 @@ import numpy as np
import torch
from torch.utils.data import Dataset

from dataset_utils import build_training_sample

# WILL BE REPLACED WITH JARED'S
class JaredDataset(object):

    def __init__(self, doc_idx, sizes, sentences):
        self.doc_idx = doc_idx
        self.num_docs = len(self.doc_idx) - 1
        self.sizes = sizes
        self.sentences = sentences
class AlbertDataSet(Dataset):

    def __init__(self, indexed_dataset, tokenizer, num_epochs,
                 masked_lm_prob, max_seq_length, short_seq_prob, seed):

        # Params to store.
        self.seed = seed
        self.masked_lm_prob = masked_lm_prob
        self.max_seq_length = max_seq_length

        # Indexed dataset.
        self.indexed_dataset = indexed_dataset

        # Build the samples mapping.
        self.samples_mapping = build_training_samples_mapping(
            indexed_dataset,
            num_epochs,
            self.max_seq_length,
            short_seq_prob,
            self.seed)

        # Vocab stuff.
        self.vocab_id_list = list(tokenizer.inv_vocab.keys())
        self.vocab_id_to_token_dict = tokenizer.inv_vocab
        self.cls_id = tokenizer.vocab['[CLS]']
        self.sep_id = tokenizer.vocab['[SEP]']
        self.mask_id = tokenizer.vocab['[MASK]']
        self.pad_id = tokenizer.vocab['[PAD]']


    def __len__(self):
        return self.samples_mapping.shape[0]

    def __getitem__(self, idx):
        return self.sentences[idx]
        rng = random.Random(self.seed + idx)
        start_index, end_index, seq_length = self.samples_mapping[idx]
        sample = []
        for index in range(start_index, end_index):
            sample.append(self.indexed_dataset[index])
        return build_training_sample(sample, seq_length,
                                     self.max_seq_length,
                                     self.vocab_id_list,
                                     self.vocab_id_to_token_dict,
                                     self.cls_id, self.sep_id,
                                     self.mask_id, self.pad_id,
                                     self.masked_lm_prob, rng)
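# Illustration (not part of the commit): a toy walk-through of the indexing
# pattern used by __getitem__ above; the sentence ids and the mapping row are
# made up, only the unpack-and-assemble shape mirrors this file.
toy_indexed_dataset = [[101, 7592], [2023, 2003, 1037], [3231, 102]]
toy_samples_mapping = [(0, 3, 8)]  # (start sentence, end sentence (exclusive), target length)
toy_start, toy_end, toy_len = toy_samples_mapping[0]
toy_sample = [toy_indexed_dataset[i] for i in range(toy_start, toy_end)]
# toy_sample would then be handed to
# build_training_sample(toy_sample, toy_len, max_seq_length, ...).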


def get_target_seq_length(max_num_tokens, short_seq_prob, np_rng):
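# Sketch only: the body of get_target_seq_length is cut off by this hunk. The
# conventional BERT-style reading of its arguments (an assumption, not
# necessarily this file's implementation) is to occasionally draw a shorter
# target so the model also sees short sequences:
import numpy as np
def sketch_target_seq_length(max_num_tokens, short_seq_prob, np_rng):
    # With probability short_seq_prob pick a length in [2, max_num_tokens],
    # otherwise use max_num_tokens itself.
    if np_rng.rand() < short_seq_prob:
        return int(np_rng.randint(2, max_num_tokens + 1))
    return max_num_tokens
# e.g. sketch_target_seq_length(509, 0.1, np.random.RandomState(1234))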
@@ -87,6 +124,7 @@ def build_training_samples_mapping(indexed_dataset, num_epochs, max_seq_length,
            while sent_index < sent_index_last:

                # Get the size.
                assert indexed_dataset.sizes[sent_index] > 0
                size += indexed_dataset.sizes[sent_index]
                sent_index += 1

@@ -133,51 +171,17 @@ def build_training_samples_mapping(indexed_dataset, num_epochs, max_seq_length,
    return samples_np
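# Illustration (layout assumed from the three-way unpack in __getitem__ above,
# values made up): one row per training sample with columns
# [start sentence index, end sentence index (exclusive), target sequence length].
import numpy as np
toy_samples_np = np.array([[0, 3, 128],
                           [3, 9, 512]], dtype=np.int64)
toy_start, toy_end, toy_len = toy_samples_np[1]  # -> 3, 9, 512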


class AlbertDataSet(Dataset):

    def __init__(self, indexed_dataset, tokenizer, num_epochs,
                 masked_lm_prob, max_seq_length, short_seq_prob, seed):

        # Params to store.
        self.seed = seed
        self.masked_lm_prob = masked_lm_prob
        self.max_seq_length = max_seq_length

        # Indexed dataset.
        self.indexed_dataset = indexed_dataset

        # Build the samples mapping.
        self.samples_mapping = build_training_samples_mapping(
            indexed_dataset,
            num_epochs,
            self.max_seq_length,
            short_seq_prob,
            self.seed)

        # Vocab stuff.
        self.vocab_id_list = list(tokenizer.inv_vocab.keys())
        self.vocab_id_to_token_dict = tokenizer.inv_vocab
        self.cls_id = tokenizer.vocab['[CLS]']
        self.sep_id = tokenizer.vocab['[SEP]']
        self.mask_id = tokenizer.vocab['[MASK]']
        self.pad_id = tokenizer.vocab['[PAD]']

# WILL BE REPLACED WITH JARED'S
class JaredDataset(object):

    def __len__(self):
        return self.samples.shape[0]
    def __init__(self, doc_idx, sizes, sentences):
        self.doc_idx = doc_idx
        self.num_docs = len(self.doc_idx) - 1
        self.sizes = sizes
        self.sentences = sentences

    def __getitem__(self, idx):
        rng = random.Random(self.seed + idx)
        start_index, end_index = self.samples_mapping[idx]
        sample = []
        for index in range(start_index, end_index):
            sample.append(self.indexed_dataset[index])
        return build_training_sample(sample, self.vocab_id_list,
                                     self.vocab_id_to_token_dict,
                                     self.cls_id, self.sep_id,
                                     self.mask_id, self.pad_id,
                                     self.masked_lm_prob, self.max_seq_length,
                                     rng)
        return self.sentences[idx]



@@ -198,10 +202,12 @@ if __name__ == '__main__':
                sentences = []
                for line in text.split('\n'):
                    if line != '\n':
                        sentences.extend(nltk.tokenize.sent_tokenize(line))
                        sent = nltk.tokenize.sent_tokenize(line)
                        if sent:
                            sentences.extend(sent)
                yield sentences
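    # Illustration (not part of the commit): the `if sent:` guard added above
    # skips lines for which nltk produces no sentences at all.
    import nltk
    example_empty = nltk.tokenize.sent_tokenize('')                # -> []
    example_sents = nltk.tokenize.sent_tokenize('Hi there. Bye.')  # -> ['Hi there.', 'Bye.']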

    input_file = '/raid/mshoeybi/data/albert/sample/samples_11.json'
    input_file = '/raid/mshoeybi/data/albert/sample/samples_1000.json'
    vocab_file = '/raid/mshoeybi/data/albert/bert_vocab/vocab.txt'

    tokenizer = FullTokenizer(vocab_file, do_lower_case=True)
@@ -212,19 +218,28 @@ if __name__ == '__main__':
    sentences_list = []

    for sentences in document_generator:
        doc_idx.append(len(sentences))
        num_sent = 0
        for sentence in sentences:
            tokens = tokenizer.tokenize(sentence)
            if tokens:
                ids = tokenizer.convert_tokens_to_ids(tokens)
                if len(ids) == 0:
                    print('****************')
                    print(sentence)
                    print(tokens)
                    print(ids)
                    print('****************')
                sizes.append(len(ids))
                sentences_list.append(ids)
                num_sent += 1
        doc_idx.append(num_sent)
    for i in range(1, len(doc_idx)):
        doc_idx[i] += doc_idx[i-1]
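    # Illustration (counts made up): the loop above converts per-document kept
    # sentence counts into a running total, presumably so consecutive entries
    # bound each document's range in the flat sentences_list.
    toy_doc_idx = [2, 3, 1]
    for i in range(1, len(toy_doc_idx)):
        toy_doc_idx[i] += toy_doc_idx[i - 1]
    # toy_doc_idx -> [2, 5, 6]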

    indexed_dataset = JaredDataset(doc_idx, sizes, sentences_list)
    dataset = AlbertDataSet(indexed_dataset=indexed_dataset,
                            tokenizer=tokenizer,
                            num_epochs=3,
                            num_epochs=10,
                            masked_lm_prob=0.15,
                            max_seq_length=512,
                            short_seq_prob=0.1,
+13 −7
@@ -5,13 +5,18 @@ import collections
import numpy as np


def build_training_sample(sample, vocab_id_list, vocab_id_to_token_dict,
def build_training_sample(sample,
                          target_seq_length, max_seq_length,
                          vocab_id_list, vocab_id_to_token_dict,
                          cls_id, sep_id, mask_id, pad_id,
                          masked_lm_prob, max_seq_length, rng):
                          masked_lm_prob, rng):
    """Biuld training sample.

    Arguments:
        sample: A list of sentences in which each sentence is a list of token ids.
        target_seq_length: Desired sequence length.
        max_seq_length: Maximum length of the sequence. All values are padded to
            this length.
        vocab_id_list: List of vocabulary ids. Used to pick a random id.
        vocab_id_to_token_dict: A dictionary from vocab ids to text tokens.
        cls_id: Start of example id.
@@ -19,20 +24,19 @@ def build_training_sample(sample, vocab_id_list, vocab_id_to_token_dict,
        mask_id: Mask token id.
        pad_id: Padding token id.
        masked_lm_prob: Probability to mask tokens.
        max_seq_length: Maximum length of the sequence. All values are padded to
            this length.
        rng: Random number generator.
    """

    # We assume that we have at least two sentences in the sample
    assert len(sample) > 1
    assert target_seq_length <= max_seq_length

    # Divide sample into two segments (A and B).
    tokens_a, tokens_b, is_next_random = get_a_and_b_segments(sample, rng)

    # Truncate to `max_sequence_length`.
    # Truncate to `target_sequence_length`.
    # Note that we have to account for [CLS] A [SEP] B [SEP]
    max_num_tokens = max_seq_length - 3
    max_num_tokens = target_seq_length - 3
    truncate_segments(tokens_a, tokens_b, len(tokens_a), len(tokens_b),
                      max_num_tokens, rng)

@@ -421,11 +425,13 @@ if __name__ == '__main__':
    for s in samples[0]:
        sample.append(tokenizer.convert_tokens_to_ids(s))
    max_seq_length = 512
    target_seq_length = 444
    masked_lm_prob = 0.15
    example = build_training_sample(sample,
                                    target_seq_length, max_seq_length,
                                    vocab_id_list, vocab_id_to_token_dict,
                                    cls_id, sep_id, mask_id, pad_id,
                                    masked_lm_prob, max_seq_length, rng)
                                    masked_lm_prob, rng)

    orig_tokens = []
    for s in samples[0]: