Commit f51ceb7c authored by Mohammad Shoeybi's avatar Mohammad Shoeybi
Browse files

c++ code working

parent 6140718f
Loading
Loading
Loading
Loading
+53 −4
Original line number Diff line number Diff line
@@ -8,7 +8,7 @@ import torch
from torch.utils.data import Dataset

from dataset_utils import build_training_sample

#from data.mapping import build_training_samples_mapping

class AlbertDataSet(Dataset):

@@ -57,7 +57,7 @@ class AlbertDataSet(Dataset):
                                     self.mask_id, self.pad_id,
                                     self.masked_lm_prob, rng)


'''
def get_target_seq_length(max_num_tokens, short_seq_prob, np_rng):
    """With probability `short_seq_prob` generate a smaller sequence lenght."""
    if np_rng.random() < short_seq_prob:
@@ -169,7 +169,7 @@ def build_training_samples_mapping(indexed_dataset, num_epochs, max_seq_length,
    print('****************************************************************\n')

    return samples_np

'''

# WILL BE REPLACED WITH JARED'S
class JaredDataset(object):
@@ -207,7 +207,7 @@ if __name__ == '__main__':
                            sentences.extend(sent)
                yield sentences

    input_file = '/raid/mshoeybi/data/albert/sample/samples_1000.json'
    input_file = '/raid/mshoeybi/data/albert/sample/samples_11.json'
    vocab_file = '/raid/mshoeybi/data/albert/bert_vocab/vocab.txt'

    tokenizer = FullTokenizer(vocab_file, do_lower_case=True)
@@ -236,6 +236,55 @@ if __name__ == '__main__':
    for i in range(1, len(doc_idx)):
        doc_idx[i] += doc_idx[i-1]

    #max_size = np.iinfo(np.int32).max // 32

    import time

    docs_np = np.array(doc_idx, dtype=np.uint32)
    sizes_np = np.array(sizes, dtype=np.uint16)

    start_time = time.time()
    max_seq_length = 512
    max_size = docs_np.shape[0]
    lens = np.full(max_size, max_seq_length-3, dtype=np.uint16)
    lens_rand = np.random.randint(low=2, high=(max_seq_length-2),
                                  size=max_size//10, dtype=np.uint16)
    lens_view = lens[:max_size//10]
    np.copyto(lens_view, lens_rand)
    np.random.shuffle(lens)
    print('num docs', max_size)
    print('lens time', time.time() - start_time)

    import helpers
    start_time = time.time()
    maps = helpers.build_mapping(docs_np, sizes_np, 10, 100, 509, 0.1, 1234)
    print('maps time', time.time() - start_time)
    print(maps)
    exit()

    start_time = time.time()
    max_size = 10 #np.iinfo(np.int32).max 32
    docs = np.arange(10, dtype=np.uint32)
    print(docs)

    a = example.doit(docs, max_size)
    print(type(a))
    print(a.shape)
    print(a)
    print(time.time() - start_time)
    exit()


    #start_time = time.time()
    count = doit(maps, docs_np, sizes_np, lens,docs_np.shape[0]-1, 10)
    print(count)
    maps = maps[:count]
    np.random.shuffle(maps)
    print(time.time() - start_time)


    exit()

    indexed_dataset = JaredDataset(doc_idx, sizes, sentences_list)
    dataset = AlbertDataSet(indexed_dataset=indexed_dataset,
                            tokenizer=tokenizer,
+202 −0
Original line number Diff line number Diff line

#include <algorithm>
#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <limits>
#include <math.h>
#include <stdexcept>

#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>

namespace py = pybind11;
using namespace std;


inline uint32_t get_sample_len(const int short_seq_ratio,
			       const uint32_t max_length) {
  /* Training sample length in tokens.
     With probability 1/short_seq_ratio return a "short" length drawn
     from [2, max_length]; otherwise return max_length. */
  if ((rand() % short_seq_ratio) == 0) {
    // Draw a *fresh* random number for the length.  The original code
    // reused the draw from the probability test above, which is then
    // always a multiple of short_seq_ratio and makes the short-length
    // distribution non-uniform (biased toward specific residues).
    return 2 + rand() % (max_length - 1);
  }
  return max_length;
}


py::array_t<uint32_t> build_mapping(const py::array_t<uint32_t>& docs_,
				    const py::array_t<uint16_t>& sizes_,
				    const int num_epochs,
				    const int max_num_samples,
				    const int max_seq_length,
				    const double short_seq_prob,
				    const int seed) {
  /* Build the training-sample mapping for the dataset.

     docs_:           cumulative sentence offsets, length num-docs + 1;
                      document `d` owns sentences [docs_[d], docs_[d+1]).
     sizes_:          token count of each sentence (uint16).
     num_epochs:      maximum number of passes over the documents.
     max_num_samples: stop (checked between epochs only) once this many
                      samples have been generated.
     max_seq_length:  target sample length; with probability
                      `short_seq_prob` a shorter target is used instead.
     seed:            seeds rand() so counting pass, filling pass and the
                      final shuffle are all reproducible.

     Returns a [num_samples, 3] uint32 array whose rows are
     (first sentence index, one-past-last sentence index, target length),
     shuffled in place.  Throws std::invalid_argument /
     std::runtime_error (surfaced to Python by pybind11) on bad input. */

  cout << "> building dataset mapping for " << docs_.shape(0) - 1 <<
    " documents with " << sizes_.shape(0) << " sentences ..." << endl;

  // Guard against division by zero / nonsensical probabilities before
  // converting the probability to a ratio.
  if (short_seq_prob <= 0.0 || short_seq_prob > 1.0) {
    throw std::invalid_argument("short_seq_prob must be in (0, 1]");
  }
  // For efficiency, convert probability to ratio: roughly one in
  // `short_seq_ratio` samples gets a shortened target length.
  const int short_seq_ratio = int(round(1.0 / short_seq_prob));

  // Remove bound checks.
  auto docs = docs_.unchecked<1>();
  auto sizes = sizes_.unchecked<1>();

  // Check for consistency: the last cumulative offset must equal the
  // total number of sentences.
  if (docs[docs.shape(0) - 1] != sizes.shape(0)) {
    cout << "document values is not consistent with length of sizes: " <<
      docs[docs.shape(0) - 1] << " != " << sizes.shape(0) << endl;
    // std::runtime_error (not a raw int) so pybind11 raises a proper
    // RuntimeError instead of "caught an unknown exception".
    throw std::runtime_error("docs/sizes arrays are inconsistent");
  }

  // Mapping and its length (1D, 3 values per sample).
  int num_samples = -1;
  uint32_t* maps = nullptr;

  // Perform two iterations, in the first iteration get the size
  // and allocate memory and in the second iteration populate the map.
  bool second = false;
  for (int iteration=0; iteration < 2; ++iteration) {

    // Set the seed so both iterations produce the same results.
    srand(seed);

    // Set the flag on second iteration.
    if (iteration == 1) {
      second = true;
    }

    // Counters:
    uint32_t empty_docs = 0;
    uint32_t one_sent_docs = 0;

    // Current map index.
    uint64_t map_index = 0;

    // For each epoch:
    for (int epoch=0; epoch < num_epochs; ++epoch) {
      // The cap is only checked between epochs, so the final count may
      // exceed max_num_samples; both passes stop at the same point.
      if (map_index >= static_cast<uint64_t>(max_num_samples)) {
	cout << " > reached " << max_num_samples << " samples after " <<
	  epoch << " epochs ..." << endl;
	break;
      }
      // For each document:
      for (int doc=0; doc < (docs.shape(0) - 1); ++doc) {

	// Document sentences are in [sent_index_first, sent_index_last).
	const uint32_t sent_index_first = docs[doc];
	const uint32_t sent_index_last = docs[doc + 1];

	// At the beginning of the document previous index is the start index.
	uint32_t prev_start_index = sent_index_first;

	// Remaining sentences in this document.
	uint32_t num_remain_sent = sent_index_last - sent_index_first;

	// Some bookkeeping: report degenerate documents once, during the
	// first epoch of the counting pass only.
	if ((epoch == 0) && (!second)) {
	  if (num_remain_sent == 0) {
	    cout << "***WARNING*** document " << doc << " is empty" << endl;
	    empty_docs += 1;
	  }
	  if (num_remain_sent == 1) {
	    cout << "***WARNING*** document " << doc <<
	      " has one sentence" << endl;
	    one_sent_docs += 1;
	  }
	}

	// If we have more than two sentences.
	if (num_remain_sent > 1) {

	  // Set values.
	  uint32_t size = 0;
	  uint32_t num_sent = 0;
	  uint32_t seq_len = get_sample_len(short_seq_ratio, max_seq_length);

	  // Loop through sentences.
	  for (uint32_t sent_index=sent_index_first;
	       sent_index < sent_index_last; ++sent_index) {

	    // Add the size and number of sentences.
	    size += sizes[sent_index];
	    num_sent += 1;
	    num_remain_sent -= 1;

	    // Emit a sample when the target length is reached
	    //   and more than one sentence remains in the document
	    //   and the sample has at least two sentences,
	    // or when the end of the document is reached.
	    if (((size >= seq_len) && (num_remain_sent > 1) &&
		 (num_sent > 1) ) || (num_remain_sent == 0)) {

	      // Populate the map (second pass only; the first pass only
	      // counts so the buffer can be sized exactly).
	      if (second) {
		const uint64_t map_index_0 = 3 * map_index;
		maps[map_index_0] = prev_start_index;
		maps[map_index_0 + 1] = sent_index + 1;
		maps[map_index_0 + 2] = seq_len;
	      }

	      // Update indices / counters.
	      map_index += 1;
	      prev_start_index = sent_index + 1;
	      seq_len = get_sample_len(short_seq_ratio, max_seq_length);
	      size = 0;
	      num_sent = 0;
	    }
	  }

	} // if (num_remain_sent > 1) {
      } // for (int doc=0; doc < num_docs; ++doc) {
    } // for (int epoch=0; epoch < num_epochs; ++epoch) {

    // For now only support mappings up to MAX_INT.
    if (map_index > std::numeric_limits<int>::max()) {
      cout << "number of samples ("<< map_index <<") exceeded MAX_INT" << endl;
      throw std::runtime_error("number of samples exceeded MAX_INT");
    }
    else if (!second) {
      cout << "    number of samples:                      " <<
	map_index << endl;
      cout << "    number of empty documents:              " <<
	empty_docs << endl;
      cout << "    number of documents with one sentence:  " <<
	one_sent_docs << endl;
      maps = new uint32_t[3*map_index];
      num_samples = int(map_index);
    }

  } // for (int iteration=0; iteration < 2; ++iteration) {

  // Fisher-Yates shuffle of the sample rows (rand()-based, so it is
  // reproducible from `seed`; note the modulo introduces a slight bias).
  for (int i=(num_samples - 1); i > 0; --i) {
    const int j = rand() % (i + 1);
    const uint64_t i0 = 3 * static_cast<uint64_t>(i);
    const uint64_t j0 = 3 * static_cast<uint64_t>(j);
    // Swap values.
    swap(maps[i0], maps[j0]);
    swap(maps[i0 + 1], maps[j0 + 1]);
    swap(maps[i0 + 2], maps[j0 + 2]);
  }

  cout << " > done building the mapping." << endl;

  // Capsule deallocates the buffer when the numpy array is garbage
  // collected on the Python side.
  py::capsule free_when_done(maps, [](void *mem_) {
      uint32_t *mem = reinterpret_cast<uint32_t *>(mem_);
      cout << "freeing memory for the dataset mapping" << endl;
      delete[] mem;
    });

  // Return the numpy array (zero-copy view over `maps`).
  return py::array_t<uint32_t>({num_samples, 3}, // shape
			       {3*4, 4}, // C-style contiguous strides
			       maps, // the data pointer
			       free_when_done); // numpy array references

}


// Python extension module `helpers`: exposes build_mapping() (used by the
// dataset-preparation script as `helpers.build_mapping(...)`).
PYBIND11_MODULE(helpers, m) {
  m.def("build_mapping", &build_mapping);
}