Loading src/Makefile +1 −0 Original line number Diff line number Diff line Loading @@ -12,6 +12,7 @@ OBJS=\ setup_sparse_batch.o \ unsetup_sparse_batch.o \ apply_Htarget_sparse.o \ estimate_work.o \ BatchedGemm.o libdmrgppPluginSc.a: $(OBJS) Loading src/apply_Htarget_sparse.c +8 −2 Original line number Diff line number Diff line Loading @@ -103,7 +103,10 @@ void apply_Htarget_sparse( if (need_allocate_X) { X_ = (FpType *) dmrg_malloc( sizeof(FpType) * xy_size_dim ); assert( X_ != NULL ); memcpy( &(X_[0]), &(Xin_[0]), sizeof(FpType) * xy_size ); void *dest = (void *) &(X_[0]); void *src = (void *) &(Xin_[0]); size_t count = sizeof(FpType) * xy_size; dmrg_memcpy( dest, src, count ); }; if (need_allocate_Y) { Loading Loading @@ -501,7 +504,10 @@ void apply_Htarget_sparse( dmrg_free( X_ ); X_ = NULL; }; if (need_allocate_Y) { memcpy( &(Yout_[0]), &(Y_[0]), sizeof(FpType) * xy_size ); void *dest = &(Yout_[0]); void *src = &(Y_[0]); size_t count = sizeof(FpType) * xy_size; dmrg_memcpy( dest, src, count ); dmrg_free( Y_ ); Y_ = NULL; }; #endif Loading src/dmrg_malloc.c +12 −0 Original line number Diff line number Diff line Loading @@ -20,6 +20,18 @@ int dmrg_is_managed( const void *ptr ) return( is_managed ); } /* dmrg_memcpy: copies count bytes from src to dest. When USE_MAGMA is defined it calls cudaMemcpy with cudaMemcpyDefault, prints the CUDA error string to stderr on failure and then asserts success; otherwise it calls plain memcpy. */ void dmrg_memcpy(void *dest, const void *src, size_t count) { #ifdef USE_MAGMA cudaError_t istat = cudaMemcpy( dest, src, count, cudaMemcpyDefault ); if (istat != cudaSuccess) { fprintf(stderr,"dmrg_memcpy: %s\n", cudaGetErrorString(istat)); }; assert( istat == cudaSuccess ); #else memcpy( dest, src, count ); #endif } void *dmrg_malloc(const size_t alloc_size ) { Loading src/dmrg_vbatch.h +2 −0 Original line number Diff line number Diff line Loading @@ -45,6 +45,8 @@ extern "C" { extern void dmrg_init(); extern void dmrg_memcpy(void *dest, const void *src, size_t n); extern int dmrg_is_managed( const void *ptr ); Loading src/estimate_work.c 0 → 100644 +76 −0 Original line number Diff line number Diff line #include "test_vbatch.h" /* estimate_work: visits every (ipatch,jpatch) pair with 1-based indices, skips pairs whose nC entry is <= 0, and accumulates element counts for A, B and B*X plus nop times the flops_method1 value filled in by cal_kron_flops. Results are written through the four output pointers, which are asserted non-NULL. */ void estimate_work( int npatches, 
int left_patch_size_[], int right_patch_size_[], int nC_[], double *ptotal_gflops, double *pgmemA, double *pgmemB, double *pgmemBX ) #define nC(ipatch,jpatch) nC_[ ((ipatch)-1) + ((jpatch)-1)*npatches ] #define left_patch_size(ipatch) left_patch_size_[(ipatch)-1] #define right_patch_size(ipatch) right_patch_size_[(ipatch)-1] { /* ------------------- estimate total work ------------------- */ assert( ptotal_gflops != NULL ); assert( pgmemA != NULL ); assert( pgmemB != NULL ); assert( pgmemBX != NULL ); double gmemA = 0.0; double gmemB = 0.0; double gmemBX = 0.0; double total_flops = 0.0; { int ipatch = 0; int jpatch = 0; for(jpatch=1; jpatch <= npatches; jpatch++) { for(ipatch=1; ipatch <= npatches; ipatch++) { int nop = nC(ipatch,jpatch); if (nop <= 0) continue; double flops_total = 0.0; double flops_method1 = 0.0; double flops_method2 = 0.0; /* -------------------------------------- Note: evaluate (B * X ) * transpose(A) -------------------------------------- */ int nrowA = left_patch_size(ipatch); int ncolA = left_patch_size(jpatch); int nrowB = right_patch_size(ipatch); int ncolB = right_patch_size(jpatch); int ncolX = ncolA; /* NOTE(review): the three products below are computed in int before being added to the double accumulators -- confirm they cannot overflow for large patches */ gmemA += nop * nrowA * ncolA; gmemB += nop * nrowB * ncolB; gmemBX += nop * nrowB * ncolX; cal_kron_flops( nrowA, nrowB, ncolA, ncolB, &flops_total, &flops_method1, &flops_method2); total_flops += flops_method1*nop; }; }; }; /* flop count is reported in units of 1e9 */ double total_gflops = total_flops/(1000.0*1000.0*1000.0); *ptotal_gflops = total_gflops; *pgmemA = gmemA; *pgmemB = gmemB; *pgmemBX = gmemBX; } Loading
src/Makefile +1 −0 Original line number Diff line number Diff line Loading @@ -12,6 +12,7 @@ OBJS=\ setup_sparse_batch.o \ unsetup_sparse_batch.o \ apply_Htarget_sparse.o \ estimate_work.o \ BatchedGemm.o libdmrgppPluginSc.a: $(OBJS) Loading
src/apply_Htarget_sparse.c +8 −2 Original line number Diff line number Diff line Loading @@ -103,7 +103,10 @@ void apply_Htarget_sparse( if (need_allocate_X) { X_ = (FpType *) dmrg_malloc( sizeof(FpType) * xy_size_dim ); assert( X_ != NULL ); memcpy( &(X_[0]), &(Xin_[0]), sizeof(FpType) * xy_size ); void *dest = (void *) &(X_[0]); void *src = (void *) &(Xin_[0]); size_t count = sizeof(FpType) * xy_size; dmrg_memcpy( dest, src, count ); }; if (need_allocate_Y) { Loading Loading @@ -501,7 +504,10 @@ void apply_Htarget_sparse( dmrg_free( X_ ); X_ = NULL; }; if (need_allocate_Y) { memcpy( &(Yout_[0]), &(Y_[0]), sizeof(FpType) * xy_size ); void *dest = &(Yout_[0]); void *src = &(Y_[0]); size_t count = sizeof(FpType) * xy_size; dmrg_memcpy( dest, src, count ); dmrg_free( Y_ ); Y_ = NULL; }; #endif Loading
/*
 * dmrg_memcpy -- copy `count` bytes from `src` to `dest`.
 *
 * With USE_MAGMA defined the copy goes through cudaMemcpy using
 * cudaMemcpyDefault, so the CUDA runtime picks the transfer direction.
 * A failed copy is reported on stderr with the CUDA error string and
 * then trips an assert (which compiles away under NDEBUG).
 *
 * Without USE_MAGMA this is a plain memcpy, so the usual memcpy rules
 * apply (the two regions must not overlap).
 */
void dmrg_memcpy(void *dest, const void *src, size_t count)
{
#ifdef USE_MAGMA
    cudaError_t istat = cudaMemcpy(dest, src, count, cudaMemcpyDefault);

    /* Fast path: nothing to report when the copy succeeded. */
    if (istat == cudaSuccess) {
        return;
    }

    fprintf(stderr,"dmrg_memcpy: %s\n", cudaGetErrorString(istat));
    assert( istat == cudaSuccess );
#else
    memcpy(dest, src, count);
#endif
}
src/dmrg_vbatch.h +2 −0 Original line number Diff line number Diff line Loading @@ -45,6 +45,8 @@ extern "C" { extern void dmrg_init(); extern void dmrg_memcpy(void *dest, const void *src, size_t n); extern int dmrg_is_managed( const void *ptr ); Loading
src/estimate_work.c 0 → 100644 +76 −0 Original line number Diff line number Diff line #include "test_vbatch.h" void estimate_work( int npatches, int left_patch_size_[], int right_patch_size_[], int nC_[], double *ptotal_gflops, double *pgmemA, double *pgmemB, double *pgmemBX ) #define nC(ipatch,jpatch) nC_[ ((ipatch)-1) + ((jpatch)-1)*npatches ] #define left_patch_size(ipatch) left_patch_size_[(ipatch)-1] #define right_patch_size(ipatch) right_patch_size_[(ipatch)-1] { /* ------------------- estimate total work ------------------- */ assert( ptotal_gflops != NULL ); assert( pgmemA != NULL ); assert( pgmemB != NULL ); assert( pgmemBX != NULL ); double gmemA = 0.0; double gmemB = 0.0; double gmemBX = 0.0; double total_flops = 0.0; { int ipatch = 0; int jpatch = 0; for(jpatch=1; jpatch <= npatches; jpatch++) { for(ipatch=1; ipatch <= npatches; ipatch++) { int nop = nC(ipatch,jpatch); if (nop <= 0) continue; double flops_total = 0.0; double flops_method1 = 0.0; double flops_method2 = 0.0; /* -------------------------------------- Note: evaluate (B * X ) * transpose(A) -------------------------------------- */ int nrowA = left_patch_size(ipatch); int ncolA = left_patch_size(jpatch); int nrowB = right_patch_size(ipatch); int ncolB = right_patch_size(jpatch); int ncolX = ncolA; gmemA += nop * nrowA * ncolA; gmemB += nop * nrowB * ncolB; gmemBX += nop * nrowB * ncolX; cal_kron_flops( nrowA, nrowB, ncolA, ncolB, &flops_total, &flops_method1, &flops_method2); total_flops += flops_method1*nop; }; }; }; double total_gflops = total_flops/(1000.0*1000.0*1000.0); *ptotal_gflops = total_gflops; *pgmemA = gmemA; *pgmemB = gmemB; *pgmemBX = gmemBX; }