Loading src/dmrg_vbatch.c +119 −4 Original line number Diff line number Diff line Loading @@ -17,8 +17,14 @@ #include "dmrg_magma.h" #define MAXGPUS 8 static magma_queue_t queue_array[MAXGPUS]; static magma_device_t device_array[MAXGPUS]; static int ngpu = 0; static magma_queue_t queue = 0; static int device = 0; #endif Loading @@ -28,11 +34,35 @@ void dmrg_init() is_initialized = 1; #ifdef USE_MAGMA const int idebug = 1; device = 0; magma_init(); magma_getdevices( device_array, MAXGPUS, &ngpu ); assert( ngpu >= 1 ); if (idebug >= 1) { printf("dmrg_init: ngpu = %d \n",ngpu); }; int idev = 0; for (idev = 0; idev < ngpu; idev++) { device = device_array[idev]; magma_setdevice( device ); magma_queue_create( device, &queue ); assert( queue != 0 ); queue_array[idev] = queue; }; idev = 0; queue = queue_array[idev]; device = device_array[idev]; #endif }; Loading Loading @@ -355,7 +385,7 @@ double elapsed_time = 0; FpType beta = beta_vbatch[0]; if (ngpu == 1) { magmablas_Xgemm_vbatched( transA, transB, m_vbatch, n_vbatch, k_vbatch, Loading @@ -365,9 +395,94 @@ double elapsed_time = 0; beta, c_vbatch, ldc_vbatch, batch_size, queue ); } else { /* * -------------------------------------------- * simple partitioning of work to multiple GPUs * -------------------------------------------- */ int inc = (batch_size + (ngpu-1))/ngpu; int idev = 0; for(idev = 0; idev < ngpu; idev++) { int istart = idev * inc; int iend = istart+inc-1; if (iend >= (batch_size-1)) { iend = batch_size-1; }; int isize = (iend - istart + 1); device = device_array[idev]; queue = queue_array[idev]; magma_setdevice( device ); int *pm_vbatch = &(m_vbatch[istart]); int *pn_vbatch = &(n_vbatch[istart]); int *pk_vbatch = &(k_vbatch[istart]); FpType **pa_vbatch = &(a_vbatch[istart]); FpType **pb_vbatch = &(b_vbatch[istart]); FpType **pc_vbatch = &(c_vbatch[istart]); int *plda_vbatch = &(lda_vbatch[istart]); int *pldb_vbatch = &(ldb_vbatch[istart]); int *pldc_vbatch = &(ldc_vbatch[istart]); /* * ------------ * extra checks * ------------ */ int i = 0; for(i=0; i < isize; i++) { int mm = pm_vbatch[i]; int nn = pn_vbatch[i]; int kk = pk_vbatch[i]; int lda = plda_vbatch[i]; int ldb = pldb_vbatch[i]; int ldc = pldc_vbatch[i]; FpType *Amat = a_vbatch[i]; FpType *Bmat = b_vbatch[i]; FpType *Cmat = c_vbatch[i]; assert( mm >= 1 ); assert( nn >= 1 ); assert( kk >= 1 ); assert( lda >= 1 ); assert( ldb >= 1 ); assert( ldc >= 1 ); assert( Amat != 0 ); assert( Bmat != 0 ); assert( Cmat != 0 ); }; int pbatch_size = isize; if (pbatch_size >= 1) { magmablas_Xgemm_vbatched( transA, transB, pm_vbatch, pn_vbatch, pk_vbatch, alpha, (FpType const * const *) pa_vbatch, plda_vbatch, (FpType const * const *) pb_vbatch, pldb_vbatch, beta, pc_vbatch, pldc_vbatch, pbatch_size, queue ); }; }; /* end for idev */ idev = 0; device = device_array[idev]; queue = queue_array[idev]; }; /* end if (ngpu > 1) */ }; #else { Loading Loading
src/dmrg_vbatch.c +119 −4 Original line number Diff line number Diff line Loading @@ -17,8 +17,14 @@ #include "dmrg_magma.h" #define MAXGPUS 8 static magma_queue_t queue_array[MAXGPUS]; static magma_device_t device_array[MAXGPUS]; static int ngpu = 0; static magma_queue_t queue = 0; static int device = 0; #endif Loading @@ -28,11 +34,35 @@ void dmrg_init() is_initialized = 1; #ifdef USE_MAGMA const int idebug = 1; device = 0; magma_init(); magma_getdevices( device_array, MAXGPUS, &ngpu ); assert( ngpu >= 1 ); if (idebug >= 1) { printf("dmrg_init: ngpu = %d \n",ngpu); }; int idev = 0; for (idev = 0; idev < ngpu; idev++) { device = device_array[idev]; magma_setdevice( device ); magma_queue_create( device, &queue ); assert( queue != 0 ); queue_array[idev] = queue; }; idev = 0; queue = queue_array[idev]; device = device_array[idev]; #endif }; Loading Loading @@ -355,7 +385,7 @@ double elapsed_time = 0; FpType beta = beta_vbatch[0]; if (ngpu == 1) { magmablas_Xgemm_vbatched( transA, transB, m_vbatch, n_vbatch, k_vbatch, Loading @@ -365,9 +395,94 @@ double elapsed_time = 0; beta, c_vbatch, ldc_vbatch, batch_size, queue ); } else { /* * -------------------------------------------- * simple partitioning of work to multiple GPUs * -------------------------------------------- */ int inc = (batch_size + (ngpu-1))/ngpu; int idev = 0; for(idev = 0; idev < ngpu; idev++) { int istart = idev * inc; int iend = istart+inc-1; if (iend >= (batch_size-1)) { iend = batch_size-1; }; int isize = (iend - istart + 1); device = device_array[idev]; queue = queue_array[idev]; magma_setdevice( device ); int *pm_vbatch = &(m_vbatch[istart]); int *pn_vbatch = &(n_vbatch[istart]); int *pk_vbatch = &(k_vbatch[istart]); FpType **pa_vbatch = &(a_vbatch[istart]); FpType **pb_vbatch = &(b_vbatch[istart]); FpType **pc_vbatch = &(c_vbatch[istart]); int *plda_vbatch = &(lda_vbatch[istart]); int *pldb_vbatch = &(ldb_vbatch[istart]); int *pldc_vbatch = &(ldc_vbatch[istart]); /* * ------------ * extra checks * ------------ */ int i = 0; for(i=0; i < isize; i++) { int mm = pm_vbatch[i]; int nn = pn_vbatch[i]; int kk = pk_vbatch[i]; int lda = plda_vbatch[i]; int ldb = pldb_vbatch[i]; int ldc = pldc_vbatch[i]; FpType *Amat = a_vbatch[i]; FpType *Bmat = b_vbatch[i]; FpType *Cmat = c_vbatch[i]; assert( mm >= 1 ); assert( nn >= 1 ); assert( kk >= 1 ); assert( lda >= 1 ); assert( ldb >= 1 ); assert( ldc >= 1 ); assert( Amat != 0 ); assert( Bmat != 0 ); assert( Cmat != 0 ); }; int pbatch_size = isize; if (pbatch_size >= 1) { magmablas_Xgemm_vbatched( transA, transB, pm_vbatch, pn_vbatch, pk_vbatch, alpha, (FpType const * const *) pa_vbatch, plda_vbatch, (FpType const * const *) pb_vbatch, pldb_vbatch, beta, pc_vbatch, pldc_vbatch, pbatch_size, queue ); }; }; /* end for idev */ idev = 0; device = device_array[idev]; queue = queue_array[idev]; }; /* end if (ngpu > 1) */ }; #else { Loading