Commit dfde23ea authored by D'azevedo, Ed
Browse files

Initial check-in of code to estimate the amount of GPU memory

parent c37365cf
Loading
Loading
Loading
Loading
+13 −4
Original line number Diff line number Diff line
@@ -25,12 +25,15 @@ void apply_Htarget_pvbatch(
#define Y(i) Y_[(i)-1]
{
 const int ialign = 32;
 const double giga = 1000.0 * 1000.0 * 1000.0;

 double gflops1 = (double) 0.0;
 double gflops2 = (double) 0.0;
 double time_1st_vbatch = (double) 0.0;
 double time_2nd_vbatch = (double) 0.0;

 size_t nbytes_BX = 0;

/*
 ------------------
 compute  Y = H * X
@@ -140,7 +143,8 @@ void apply_Htarget_pvbatch(
 int ld_Abatch = descAbatch_[LLD_];
 int ld_Bbatch = descBbatch_[LLD_];

 double *BX_ = (double *) dmrg_malloc( (sizeof(double) * ld_BX) * Locq_BX );
 nbytes_BX = ( (sizeof(double) * ld_BX) * Locq_BX );
 double *BX_ = (double *) dmrg_malloc( nbytes_BX );
 assert( BX_ != NULL );

#define BX(i,j) BX_[ indx2f(i,j,ld_BX) ]
@@ -287,7 +291,7 @@ void apply_Htarget_pvbatch(
#ifdef _OPENMP
    time_1st_vbatch += omp_get_wtime();
#endif
    gflops1 = gflops1/(1000.0*1000.0*1000.0);
    gflops1 = gflops1/(giga);
   


@@ -362,14 +366,19 @@ void apply_Htarget_pvbatch(
                      ngroups, group_size_ );
#ifdef _OPENMP
   time_2nd_vbatch += omp_get_wtime();
   gflops2 = gflops2/(1000.0*1000.0*1000.0);
   gflops2 = gflops2/(giga);

   printf("1st vbatch %f gflops (gflops1=%lf,time=%lf)\n", 
          gflops1/time_1st_vbatch,  gflops1, time_1st_vbatch );
   printf("2nd vbatch %f gflops (gflops2=%lf,time=%lf)\n", 
          gflops2/time_2nd_vbatch, gflops2, time_2nd_vbatch );

   printf("overall %f gflops\n", (gflops1+gflops2)/(time_1st_vbatch + time_2nd_vbatch) );
   printf("overall %f gflops\n", 
          (gflops1+gflops2)/(time_1st_vbatch + time_2nd_vbatch) );

   printf("apply_Htarget_pvbatch:memory BX (%f GBytes)\n",
          (double) nbytes_BX/(giga) );

#endif
     

+18 −5
Original line number Diff line number Diff line
@@ -41,11 +41,16 @@ void apply_Htarget_sparse(
#define Y(i) Y_[(i)-1]
{

 const double giga = 1000.0*1000.0*1000.0;
 const int idebug = 1;
 const int ialign = 32;

 double total_time = -dmrg_get_wtime();

 size_t nbytes_X = 0;
 size_t nbytes_Y = 0;
 size_t nbytes_BX = 0;

 double gflops1 =  0.0;
 double gflops2 =  0.0;
 double time_1st_vbatch =  0.0;
@@ -104,7 +109,8 @@ void apply_Htarget_sparse(
  * -----------------------
  */
  if (need_allocate_X) {
    X_ = (FpType *) dmrg_malloc( sizeof(FpType) * xy_size_dim );
    nbytes_X = sizeof(FpType) * xy_size_dim;
    X_ = (FpType *) dmrg_malloc( nbytes_X );
    assert( X_ != NULL );
    void *dest = (void *) &(X_[0]);
    void *src = (void *) &(Xin_[0]);
@@ -113,7 +119,8 @@ void apply_Htarget_sparse(
    };

  if (need_allocate_Y) {
    Y_ = (FpType *) dmrg_malloc( sizeof(FpType) * xy_size_dim );
    nbytes_Y = sizeof(FpType) * xy_size_dim;
    Y_ = (FpType *) dmrg_malloc( nbytes_Y );
    assert( Y_ != NULL );
    };

@@ -173,7 +180,8 @@ void apply_Htarget_sparse(
#define gBXbatch(ipatch) gBXbatch_[(ipatch)-1]


 FpType *pBXmem = (FpType *) dmrg_malloc( sizeof(FpType) * sum_BX_sizes );
 nbytes_BX = sizeof(FpType) * sum_BX_sizes;
 FpType *pBXmem = (FpType *) dmrg_malloc( nbytes_BX );
 if (pBXmem == NULL) {
    printf("apply_Htarget_sparse: sum_BX_sizes=%le\n", (double) sum_BX_sizes);
    printf("max_nC=%d, sum_nC=%d, nnz_nC=%d\n",
@@ -357,7 +365,7 @@ void apply_Htarget_sparse(
                      beta_array_,   c_array_, ldc_array_,
                      ngroups, group_size_ );
    time_1st_vbatch += dmrg_get_wtime();
    gflops1 = gflops1/(1000.0*1000.0*1000.0);
    gflops1 = gflops1/giga;

        
/*
@@ -478,7 +486,7 @@ void apply_Htarget_sparse(
                      beta_array_,   c_array_, ldc_array_,
                      ngroups, group_size_ );
   time_2nd_vbatch += dmrg_get_wtime();
   gflops2 = gflops2/(1000.0*1000.0*1000.0);
   gflops2 = gflops2/giga;

   if (idebug >= 1) {
   printf("1st vbatch %lf gflops/sec (gflops1=%lf,time=%lf)\n", 
@@ -488,6 +496,7 @@ void apply_Htarget_sparse(

   printf("overall %lf gflops/sec\n", 
           (gflops1+gflops2)/(time_1st_vbatch + time_2nd_vbatch) );

   };
     

@@ -519,6 +528,10 @@ void apply_Htarget_sparse(
  if (idebug >= 1) {
          printf("apply_Htarget_sparse: total_time=%lf \n", 
                                        total_time);
          printf("apply_Htarget_sparse:memory BX (%f GBytes) X (%f GBytes) Y (%f GBytes) \n",
                  (double) nbytes_BX/(giga),
                  (double) nbytes_X/(giga),
                  (double) nbytes_Y/(giga));
  };

}
+9 −3
Original line number Diff line number Diff line
@@ -27,12 +27,15 @@ void apply_Htarget_vbatch(
{
 const int idebug = 1;
 const int ialign = 32;
 const double giga = 1000.0*1000.0*1000.0;

 double gflops1 = (FpType) 0.0;
 double gflops2 = (FpType) 0.0;
 double time_1st_vbatch = (FpType) 0.0;
 double time_2nd_vbatch = (FpType) 0.0;

 size_t nbytes_BX = 0;

/*
 ------------------
 compute  Y = H * X
@@ -109,7 +112,8 @@ void apply_Htarget_vbatch(
 int ncolBX = (ncolA * noperator );
 int ld_BX = ialign * ICEIL(nrowBX,ialign);

 FpType *BX_ = (FpType *) dmrg_malloc( (sizeof(FpType) * ld_BX) * (ncolA * noperator) );
 nbytes_BX = ( (sizeof(FpType) * ld_BX) * (ncolA * noperator) );
 FpType *BX_ = (FpType *) dmrg_malloc( nbytes_BX );
 assert( BX_ != NULL );

#define BX(i,j) BX_[ indx2f(i,j,ld_BX) ]
@@ -196,7 +200,7 @@ void apply_Htarget_vbatch(
                      beta_array_,   c_array_, ldc_array_,
                      ngroups, group_size_ );
    time_1st_vbatch += dmrg_get_wtime();
    gflops1 = gflops1/(1000.0*1000.0*1000.0);
    gflops1 = gflops1/(giga);
   


@@ -268,7 +272,7 @@ void apply_Htarget_vbatch(
                      beta_array_,   c_array_, ldc_array_,
                      ngroups, group_size_ );
   time_2nd_vbatch += dmrg_get_wtime();
   gflops2 = gflops2/(1000.0*1000.0*1000.0);
   gflops2 = gflops2/(giga);

   if (idebug >= 1) {
   printf("1st vbatch %lf gflops/sec (gflops1=%lf,time=%lf)\n", 
@@ -278,6 +282,8 @@ void apply_Htarget_vbatch(

   printf("overall %lf gflops/sec\n", 
           (gflops1+gflops2)/(time_1st_vbatch + time_2nd_vbatch) );
   printf("memory BX(%lf GBytes)\n",
          (double) nbytes_BX/(giga) );
   };
     

+45 −16
Original line number Diff line number Diff line
@@ -124,9 +124,13 @@ void dmrg_Xgemm_vbatch( char ctransa_array[],
{

const int idebug = 0;
const double giga = 1000.0*1000.0*1000.0;
double gflops = 0;
double elapsed_time = 0;

size_t nbytes = 0;
size_t nbytes_total = 0;

 if (idebug >= 1) {
  elapsed_time = -dmrg_get_wtime();
  int igroup = 0;
@@ -139,7 +143,7 @@ double elapsed_time = 0;
                ( (double) group_size[igroup]) *
                2.0;
      };
   gflops = gflops/(1000.0*1000.0*1000.0);
   gflops = gflops/(giga);
   };


@@ -215,21 +219,44 @@ double elapsed_time = 0;
  const int ialign = 32;
  int vbatch_dim = ialign * ICEIL( (batch_size+1),ialign );
#ifdef USE_MALLOC
  FpType *alpha_vbatch = (FpType *) dmrg_malloc( sizeof(FpType) * (vbatch_dim));

  FpType *beta_vbatch = (FpType *) dmrg_malloc( sizeof(FpType) * (vbatch_dim));
  int *m_vbatch = (int *) dmrg_malloc(sizeof(int) * (vbatch_dim));
  int *n_vbatch = (int *) dmrg_malloc(sizeof(int) * (vbatch_dim));
  int *k_vbatch = (int *) dmrg_malloc(sizeof(int) * (vbatch_dim));
  char *transa_vbatch = (char *) dmrg_malloc(sizeof(char) *(vbatch_dim));
  char *transb_vbatch = (char *) dmrg_malloc(sizeof(char) *(vbatch_dim));
  int *lda_vbatch = (int *) dmrg_malloc(sizeof(int)*(vbatch_dim));
  int *ldb_vbatch = (int *) dmrg_malloc(sizeof(int)*(vbatch_dim));
  int *ldc_vbatch = (int *) dmrg_malloc(sizeof(int)*(vbatch_dim));

  FpType **a_vbatch = (FpType **) dmrg_malloc( sizeof(FpType *) * (vbatch_dim));
  FpType **b_vbatch = (FpType **) dmrg_malloc( sizeof(FpType *) * (vbatch_dim));
  FpType **c_vbatch = (FpType **) dmrg_malloc( sizeof(FpType *) * (vbatch_dim));
  nbytes = sizeof(FpType) * (vbatch_dim); nbytes_total += nbytes;
  FpType *alpha_vbatch = (FpType *) dmrg_malloc( nbytes );

  nbytes = sizeof(FpType) * (vbatch_dim); nbytes_total += nbytes;
  FpType *beta_vbatch = (FpType *) dmrg_malloc( nbytes );

  nbytes = sizeof(int) * (vbatch_dim); nbytes_total += nbytes;
  int *m_vbatch = (int *) dmrg_malloc( nbytes );

  nbytes = sizeof(int) * (vbatch_dim); nbytes_total += nbytes;
  int *n_vbatch = (int *) dmrg_malloc( nbytes );

  nbytes = sizeof(int) * (vbatch_dim); nbytes_total += nbytes;
  int *k_vbatch = (int *) dmrg_malloc( nbytes );

  nbytes = sizeof(char) *(vbatch_dim); nbytes_total += nbytes;
  char *transa_vbatch = (char *) dmrg_malloc( nbytes );

  nbytes = sizeof(char) *(vbatch_dim); nbytes_total += nbytes;
  char *transb_vbatch = (char *) dmrg_malloc( nbytes );

  nbytes = sizeof(int)*(vbatch_dim); nbytes_total += nbytes;
  int *lda_vbatch = (int *) dmrg_malloc( nbytes );

  nbytes = sizeof(int)*(vbatch_dim); nbytes_total += nbytes;
  int *ldb_vbatch = (int *) dmrg_malloc( nbytes );

  nbytes = sizeof(int)*(vbatch_dim); nbytes_total += nbytes;
  int *ldc_vbatch = (int *) dmrg_malloc( nbytes );

  nbytes = sizeof(FpType *) * (vbatch_dim); nbytes_total += nbytes;
  FpType **a_vbatch = (FpType **) dmrg_malloc( nbytes );

  nbytes = sizeof(FpType *) * (vbatch_dim); nbytes_total += nbytes;
  FpType **b_vbatch = (FpType **) dmrg_malloc( nbytes );

  nbytes = sizeof(FpType *) * (vbatch_dim); nbytes_total += nbytes;
  FpType **c_vbatch = (FpType **) dmrg_malloc( nbytes );

  assert( alpha_vbatch != NULL );
  assert( beta_vbatch != NULL );
@@ -472,6 +499,8 @@ double elapsed_time = 0;

    printf("dmrg_vbatch: gflops=%lf, elapsed_time=%lf, gflops/sec=%lf\n",
                         gflops,     elapsed_time, gflops_per_sec );
    printf("dmrg_vbatch need %lf GBytes\n",
             (double) nbytes_total/(giga) );
    };
                         
}
+12 −2
Original line number Diff line number Diff line
@@ -80,9 +80,13 @@ void setup_sparse_batch(
  const int false = 0;
  const int true = !false;
  const int use_Xlacpy = true ;
  const double giga = 1000.0*1000.0*1000.0;

  double total_time = -dmrg_get_wtime();

  size_t nbytes_Abatch = 0;
  size_t nbytes_Bbatch = 0;

  int ipatch = 0;

  int gnnz_A_[npatches*npatches*noperator];
@@ -341,8 +345,10 @@ void setup_sparse_batch(
  assert( gAbatch_ != NULL );
  assert( gBbatch_ != NULL );

  FpType *pAmem = (FpType *) dmrg_malloc( sizeof(FpType) * sum_Abatch_sizes);
  FpType *pBmem = (FpType *) dmrg_malloc( sizeof(FpType) * sum_Bbatch_sizes);
  nbytes_Abatch = sizeof(FpType) * sum_Abatch_sizes;
  nbytes_Bbatch = sizeof(FpType) * sum_Bbatch_sizes;
  FpType *pAmem = (FpType *) dmrg_malloc( nbytes_Abatch );
  FpType *pBmem = (FpType *) dmrg_malloc( nbytes_Bbatch );
  assert( pAmem != NULL );
  assert( pBmem != NULL );
  {
@@ -472,6 +478,10 @@ void setup_sparse_batch(
  if (idebug >= 1) {
          printf("setup_sparse_batch: total_time = %lf \n", 
                                      total_time );
          printf("setup_sparse_batch:memory Abatch (%lf GBytes) Bbatch (%lf GBytes)\n",
                 (double) nbytes_Abatch/(giga),
                 (double) nbytes_Bbatch/(giga) );

  };
}
          
Loading