Unverified Commit ef0deee4 authored by Kabelitz, Matthew Edward, committed by GitHub

Native SpMV Kernel Update and nnz changes (#316)

* Native matrices now determine indices for data transfer at assemble-time

* Changed to MPI_WaitAny for nonblocking communication

* Moved local computation to the end of the SpMV kernel

* Updated nnz to be compatible with PETSc o/dnnz parameters (see the sketch below)

* Resolved CI Python version mismatch

* Fixed a bug where too much memory was being allocated by Native matrix init
parent 81fdbc2c
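For context on the nnz bullet above: PETSc-style preallocation gives, for each locally owned row, the nonzeros that fall inside the diagonal block (dnnz) and outside it (onnz). The following is a minimal, self-contained sketch of reducing those arrays to a single 64-bit count; the program name and sample values are invented, it is not Futility code, and the real init path additionally combines the per-rank sums with an MPI reduction.

! Illustrative sketch only -- not Futility code.  dnnz(i)/onnz(i) follow the
! PETSc d_nnz/o_nnz convention: nonzeros of local row i inside/outside the
! diagonal block.  The total is accumulated in a 64-bit integer.
PROGRAM nnz_from_dnnz_onnz
  USE ISO_FORTRAN_ENV,ONLY : INT64
  IMPLICIT NONE
  INTEGER :: i
  INTEGER :: dnnz(4),onnz(4)
  INTEGER(INT64) :: nnzLocal

  dnnz=[3,3,3,3]   ! hypothetical per-row counts inside the diagonal block
  onnz=[1,2,2,1]   ! hypothetical per-row counts outside the diagonal block
  nnzLocal=0_INT64
  DO i=1,SIZE(dnnz)
    ! Widen before adding so very large totals cannot overflow a default integer
    nnzLocal=nnzLocal+INT(dnnz(i),INT64)+INT(onnz(i),INT64)
  ENDDO
  WRITE(*,*) 'local nnz =',nnzLocal   ! the real code then sums this across ranks
ENDPROGRAM nnz_from_dnnz_onnz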
@@ -111,7 +111,7 @@ PUBLIC :: BandedMatrixType
 PUBLIC :: DistributedBandedMatrixType
 PUBLIC :: DistributedBlockBandedMatrixType
 PUBLIC :: SparseMatrixType
-INTEGER(SIK),PARAMETER,PUBLIC :: MATVEC_SLOTS=4
+INTEGER(SIK),PARAMETER,PUBLIC :: MATVEC_SLOTS=10
 ! PETSc implementations
 #ifdef FUTILITY_HAVE_PETSC
 PUBLIC :: PETScMatrixType
@@ -698,18 +698,17 @@ SUBROUTINE matvec_DistrBandedMatrixType(thisMatrix,x,y,t,ul,d,incx,a,b)
 CHARACTER(LEN=1),INTENT(IN) :: ul
 CHARACTER(LEN=1),INTENT(IN) :: d
 INTEGER(SIK),INTENT(IN) :: incx
-INTEGER(SIK) :: rank,k
-INTEGER(SIK),ALLOCATABLE :: recvIdx(:,:), sendIdx(:,:)
+INTEGER(SIK) :: rank,k,lowIdx,highIdx
 REAL(SRK),ALLOCATABLE :: recvResult(:,:),sendResult(:,:),tmpProduct(:)
-INTEGER(SIK) :: sendRequests(MATVEC_SLOTS),sendIdxRequests(MATVEC_SLOTS)
-INTEGER(SIK) :: recvRequests(MATVEC_SLOTS),recvIdxRequests(MATVEC_SLOTS)
-INTEGER(SIK) :: lowIdx,highIdx,sendCounter,recvCounter
 #ifdef HAVE_MPI
 ! Get rank
+INTEGER(SIK) :: sendRequests(MATVEC_SLOTS),recvRequests(MATVEC_SLOTS)
 INTEGER(SIK) :: ctRecv(MATVEC_SLOTS),srcRank,destRank
-INTEGER(SIK) :: i,idxTmp,cnt,ctDefault,mpierr,nproc
+INTEGER(SIK) :: i,idxTmp,ctDefault,mpierr,nproc
 CALL MPI_Comm_rank(thisMatrix%comm,rank,mpierr)
 CALL MPI_Comm_Size(thisMatrix%comm,nProc,mpierr)
+sendRequests=MPI_REQUEST_NULL
+recvRequests=MPI_REQUEST_NULL
 #else
 rank=0
 #endif
@@ -719,55 +718,13 @@ SUBROUTINE matvec_DistrBandedMatrixType(thisMatrix,x,y,t,ul,d,incx,a,b)
 ! This allows one to be used for computation and the rest for
 ! communication.
 ! Which one to write to will be determined by *Counter MOD MATVEC_SLOTS
-sendCounter=0
-recvCounter=0
-sendRequests=0
-sendIdxRequests=0
-recvRequests=0
-recvIdxRequests=0
 ! The recv/sendResult array will sometimes hold it's full length,
 ! and sometimes less.
-! The variable "cnt" will be used to determine this at
-! each iteration
-ALLOCATE(recvIdx(thisMatrix%iOffsets(2),MATVEC_SLOTS))
 ALLOCATE(recvResult(thisMatrix%iOffsets(2),MATVEC_SLOTS))
-ALLOCATE(sendIdx(thisMatrix%iOffsets(2),MATVEC_SLOTS))
 ALLOCATE(sendResult(thisMatrix%iOffsets(2),MATVEC_SLOTS))
 ALLOCATE(tmpProduct(thisMatrix%iOffsets(rank+2)- &
 thisMatrix%iOffsets(rank+1)))
+tmpProduct=0.0_SRK
-! First, take care of locally held data.
-SELECT TYPE(thisMatrix)
-TYPE IS(DistributedBlockBandedMatrixType)
-IF(.NOT. thisMatrix%blockMask) THEN
-DO k=1,thisMatrix%nlocalBlocks
-lowIdx=(k-1)*thisMatrix%blockSize+1
-highIdx=lowIdx-1+thisMatrix%blockSize
-CALL matvec_MatrixType(THISMATRIX=thisMatrix%blocks(k),X=x(lowIdx:highIdx), &
-Y=tmpProduct(lowIdx:highIdx),ALPHA=1.0_SRK,BETA=0.0_SRK)
-ENDDO
-IF(thisMatrix%chunks(rank+1)%isInit) THEN
-CALL BLAS_matvec(THISMATRIX=thisMatrix%chunks(rank+1),X=x, &
-y=tmpProduct,ALPHA=1.0_SRK,BETA=1.0_SRK)
-ENDIF
-ELSE
-IF(thisMatrix%chunks(rank+1)%isInit) THEN
-CALL BLAS_matvec(THISMATRIX=thisMatrix%chunks(rank+1),X=x, &
-y=tmpProduct,ALPHA=1.0_SRK,BETA=0.0_SRK)
-ELSE
-tmpProduct=0.0_SRK
-ENDIF
-ENDIF
-TYPE IS(DistributedBandedMatrixType)
-IF(thisMatrix%chunks(rank+1)%isInit) THEN
-CALL BLAS_matvec(THISMATRIX=thisMatrix%chunks(rank+1),X=x, &
-y=tmpProduct,ALPHA=1.0_SRK,BETA=0.0_SRK)
-ELSE
-tmpProduct=0.0_SRK
-ENDIF
-CLASS DEFAULT
-tmpProduct = 0.0_SRK
-ENDSELECT
 #ifdef HAVE_MPI
 ! On each rank, loop over the chunks held (on diagonal moving down)
@@ -778,79 +735,82 @@ SUBROUTINE matvec_DistrBandedMatrixType(thisMatrix,x,y,t,ul,d,incx,a,b)
 ! First do local computation and send
 IF (thismatrix%chunks(destRank+1)%isInit) THEN
 ! Decide whether to send whole vector or multiple sparse
-IF (2*thisMatrix%chunks(destRank+1)%nnz/3 >= thisMatrix%chunks(destRank+1)%n) THEN
-sendCounter=sendCounter+1
-idxTmp=MOD(sendCounter,MATVEC_SLOTS)+1
+IF (thisMatrix%chunks(destRank+1)%nnz >= thisMatrix%chunks(destRank+1)%n) THEN
 ! Check if we can safely write to sendRequests
-CALL pop_send(sendResult,sendIdx,idxTmp,sendRequests,sendIdxRequests)
+CALL pop_send(sendResult,idxTmp,sendRequests)
 CALL BLAS_matvec(THISMATRIX=thisMatrix%chunks(destRank+1),X=x,&
 y=sendResult(1:ctDefault,idxTmp),ALPHA=1.0_SRK,BETA=0.0_SRK)
 CALL MPI_ISend(sendResult(1:ctDefault,idxTmp),ctDefault, &
 MPI_DOUBLE_PRECISION,destRank,0,thisMatrix%comm,sendRequests(idxTmp),mpierr)
 ELSE
-! Send several sparse vectors
+! Gather and send several sparse vectors
+CALL pop_send(sendResult,idxTmp,sendRequests)
+lowIdx=1
 DO k=1,SIZE(thisMatrix%chunks(destRank+1)%bandIdx)
-sendCounter=sendCounter+1
-idxTmp=MOD(sendCounter,MATVEC_SLOTS)+1
-CALL pop_send(sendResult,sendIdx,idxTmp,sendRequests,sendIdxRequests)
-! compute destination indices
-! perform special-case multiplication from single band here
-cnt=SIZE(thisMatrix%chunks(destRank+1)%bands(k)%jIdx)
-sendIdx(1:cnt,idxTmp)=thisMatrix%chunks(destRank+1)%bands(k)%jIdx &
--thisMatrix%chunks(destRank+1)%bandIdx(k)
-CALL MPI_ISend(sendIdx(1:cnt,idxTmp),cnt,MPI_INTEGER,destRank,0, &
-thisMatrix%comm,sendIdxRequests(idxTmp), mpierr)
-sendResult(1:cnt,idxTmp)=thisMatrix%chunks(destRank+1)%bands(k)%elem &
+highidx=lowIdx+SIZE(thisMatrix%chunks(destRank+1)%bands(k)%jIdx)-1
+sendResult(lowIdx:highIdx,idxTmp)=thisMatrix%chunks(destRank+1)%bands(k)%elem &
 *x(thisMatrix%chunks(destRank+1)%bands(k)%jIdx)
-CALL MPI_ISend(sendResult(1:cnt,idxTmp),cnt,MPI_DOUBLE_PRECISION, &
-destRank,0,thisMatrix%comm,sendRequests(idxTmp), mpierr)
+lowIdx=highIdx+1
 ENDDO
+CALL MPI_ISend(sendResult(1:highIdx,idxTmp),highIdx,MPI_DOUBLE_PRECISION, &
+destRank,0,thisMatrix%comm,sendRequests(idxTmp), mpierr)
 ENDIF
 ENDIF
 ! We might receive data from rank MOD(rank-i,nproc)
 srcRank = MODULO(rank-i,nProc)
-IF (ALLOCATED(thisMatrix%bandSizes(srcRank+1)%p)) THEN
-IF (thisMatrix%bandSizes(srcRank+1)%p(1) < 0) THEN
-! We are receiving a whole vector at once
-recvCounter=recvCounter+1
-idxTmp=MOD(recvCounter,MATVEC_SLOTS)+1
-! If we've filled up the available storage, we need
-! to wait for communication to finish up
-CALL pop_recv(tmpProduct,recvResult,recvIdx,ctRecv,idxTmp, &
-recvRequests, recvIdxRequests)
-ctRecv(MOD(recvCounter,MATVEC_SLOTS)+1)=ctDefault
-CALL MPI_IRecv(recvResult(1:ctDefault,idxTmp), ctDefault, &
-MPI_DOUBLE_PRECISION,srcRank,0,thisMatrix%comm, &
-recvRequests(idxTmp),mpierr)
-ELSE
-! We are receiving multiple sparse chunks
-DO k=1,SIZE(thisMatrix%bandSizes(srcRank+1)%p)
-recvCounter=recvCounter+1
-idxTmp=MOD(recvCounter,MATVEC_SLOTS)+1
-! If we've filled up the available storage, we need
-! to wait for communication to finish up
-CALL pop_recv(tmpProduct,recvResult,recvIdx,ctRecv,idxTmp, &
-recvRequests, recvIdxRequests)
-ctRecv(idxTmp)=-thisMatrix%bandSizes(srcRank+1)%p(k)
-CALL MPI_IRecv(recvIdx(1:ctRecv(idxTmp),idxTmp),-ctRecv(idxTmp), &
-MPI_INTEGER,srcRank,0,thisMatrix%comm,recvIdxRequests(idxTmp), mpierr)
-CALL MPI_IRecv(recvResult(1:ctRecv(idxTmp),idxTmp),-ctRecv(idxTmp), &
-MPI_DOUBLE_PRECISION,srcRank,0,thisMatrix%comm,recvRequests(idxTmp), mpierr)
-ENDDO
-ENDIF
+IF (thisMatrix%incIdxStt(srcRank+1) == -1) THEN
+! We are receiving a whole vector at once
+! If we've filled up the available storage, we need
+! to wait for communication to finish up
+CALL pop_recv(tmpProduct,recvResult,thisMatrix, &
+ctRecv,idxTmp,recvRequests)
+ctRecv(idxTmp)=SIZE(tmpProduct)
+CALL MPI_IRecv(recvResult(1:ctRecv(idxTmp),idxTmp), ctRecv(idxTmp), &
+MPI_DOUBLE_PRECISION,srcRank,0,thisMatrix%comm, &
+recvRequests(idxTmp),mpierr)
+ELSEIF (thisMatrix%incIdxStt(srcRank+1) > 0) THEN
+! We are receiving multiple sparse chunks gathered together
+CALL pop_recv(tmpProduct,recvResult,thisMatrix, &
+ctRecv,idxTmp,recvRequests)
+ctRecv(idxTmp)= -srcRank-1
+lowIdx=thisMatrix%incIdxStt(srcRank+1)
+highIdx=thisMatrix%incIdxStp(srcRank+1)
+CALL MPI_IRecv(recvResult(1:(highIdx-lowIdx+1),idxTmp),highIdx-lowIdx+1, &
+MPI_DOUBLE_PRECISION,srcRank,0,thisMatrix%comm,recvRequests(idxTmp), mpierr)
 ENDIF
 ENDDO
-! We've finished calling irecv. Wait for remaining
-! requests to finish:
-DO idxTmp=1,MATVEC_SLOTS
-CALL pop_recv(tmpProduct, recvResult, recvIdx, ctRecv, idxTmp, &
-recvRequests, recvIdxRequests)
+#endif
+! Now, take care of locally held data.
+SELECT TYPE(thisMatrix)
+TYPE IS(DistributedBlockBandedMatrixType)
+IF(thisMatrix%chunks(rank+1)%isInit) THEN
+CALL BLAS_matvec(THISMATRIX=thisMatrix%chunks(rank+1),X=x, &
+y=tmpProduct,ALPHA=1.0_SRK,BETA=1.0_SRK)
+ENDIF
+IF(.NOT. thisMatrix%blockMask) THEN
+DO k=1,thisMatrix%nlocalBlocks
+lowIdx=(k-1)*thisMatrix%blockSize+1
+highIdx=lowIdx-1+thisMatrix%blockSize
+CALL matvec_MatrixType(THISMATRIX=thisMatrix%blocks(k),X=x(lowIdx:highIdx), &
+Y=tmpProduct(lowIdx:highIdx),ALPHA=1.0_SRK,BETA=1.0_SRK)
+ENDDO
+ENDIF
+TYPE IS(DistributedBandedMatrixType)
+IF(thisMatrix%chunks(rank+1)%isInit) THEN
+CALL BLAS_matvec(THISMATRIX=thisMatrix%chunks(rank+1),X=x, &
+y=tmpProduct,ALPHA=1.0_SRK,BETA=1.0_SRK)
+ENDIF
+ENDSELECT
+#if HAVE_MPI
+! Wait for remaining requests to finish:
+DO k=1,MATVEC_SLOTS
+idxTmp=k
+CALL pop_recv(tmpProduct,recvResult,thisMatrix,ctRecv,idxTmp,recvRequests,.TRUE.)
 ENDDO
-DO idxTmp=1,MATVEC_SLOTS
-CALL pop_send(sendResult,sendIdx,idxTmp,sendRequests,sendIdxRequests)
+DO k=1,MATVEC_SLOTS
+idxTmp=k
+CALL pop_send(sendResult,idxTmp,sendRequests,.TRUE.)
 ENDDO
 #endif
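The hunk above replaces the per-band index traffic with a fixed pool of MATVEC_SLOTS send/receive buffers whose requests start at MPI_REQUEST_NULL and are recycled with MPI_WaitAny. The sketch below is a minimal, self-contained illustration of that pattern under simplified assumptions (a one-element ring exchange; program and helper names are invented); it is not the Futility routines themselves, which also carry per-slot counts and an index map.

! Illustrative sketch only -- not Futility code.
PROGRAM slot_pool_demo
  USE mpi
  IMPLICIT NONE
  INTEGER,PARAMETER :: NSLOTS=4,NMSG=16
  INTEGER :: sendReq(NSLOTS),recvReq(NSLOTS)
  INTEGER :: rank,nproc,left,right,i,idx,mpierr
  DOUBLE PRECISION :: sendBuf(NSLOTS),recvBuf(NSLOTS),total

  CALL MPI_Init(mpierr)
  CALL MPI_Comm_rank(MPI_COMM_WORLD,rank,mpierr)
  CALL MPI_Comm_size(MPI_COMM_WORLD,nproc,mpierr)
  right=MOD(rank+1,nproc)
  left=MOD(rank-1+nproc,nproc)
  sendReq=MPI_REQUEST_NULL   ! empty slots are flagged with MPI_REQUEST_NULL
  recvReq=MPI_REQUEST_NULL
  total=0.0D0

  DO i=1,NMSG
    ! Post this iteration's receive into a free slot.
    CALL get_recv_slot(idx)
    CALL MPI_Irecv(recvBuf(idx),1,MPI_DOUBLE_PRECISION,left,0, &
        MPI_COMM_WORLD,recvReq(idx),mpierr)
    ! Post the matching send into a free send slot.
    CALL get_send_slot(idx)
    sendBuf(idx)=1.0D0
    CALL MPI_Isend(sendBuf(idx),1,MPI_DOUBLE_PRECISION,right,0, &
        MPI_COMM_WORLD,sendReq(idx),mpierr)
  ENDDO

  ! Flush: consume every outstanding receive and complete every send.
  DO idx=1,NSLOTS
    IF(recvReq(idx) /= MPI_REQUEST_NULL) THEN
      CALL MPI_Wait(recvReq(idx),MPI_STATUS_IGNORE,mpierr)
      total=total+recvBuf(idx)
    ENDIF
    IF(sendReq(idx) /= MPI_REQUEST_NULL) &
        CALL MPI_Wait(sendReq(idx),MPI_STATUS_IGNORE,mpierr)
  ENDDO
  WRITE(*,*) 'rank',rank,'received total',total   ! expect NMSG on every rank
  CALL MPI_Finalize(mpierr)

CONTAINS
  SUBROUTINE get_recv_slot(slot)
    ! Return a receive slot that is safe to overwrite.  A slot with
    ! MPI_REQUEST_NULL holds no unconsumed data; otherwise MPI_Waitany picks
    ! whichever receive finished first and its value is folded into total
    ! before the slot is reused.
    INTEGER,INTENT(OUT) :: slot
    INTEGER :: j,ierr
    DO j=1,NSLOTS
      IF(recvReq(j) == MPI_REQUEST_NULL) THEN
        slot=j
        RETURN
      ENDIF
    ENDDO
    CALL MPI_Waitany(NSLOTS,recvReq,slot,MPI_STATUS_IGNORE,ierr)
    total=total+recvBuf(slot)
  ENDSUBROUTINE get_recv_slot

  SUBROUTINE get_send_slot(slot)
    ! Same idea for sends; a completed send just frees its buffer.
    INTEGER,INTENT(OUT) :: slot
    INTEGER :: j,ierr
    DO j=1,NSLOTS
      IF(sendReq(j) == MPI_REQUEST_NULL) THEN
        slot=j
        RETURN
      ENDIF
    ENDDO
    CALL MPI_Waitany(NSLOTS,sendReq,slot,MPI_STATUS_IGNORE,ierr)
  ENDSUBROUTINE get_send_slot
ENDPROGRAM slot_pool_demo

As the comment in the hunk above puts it, one slot can be used for computation while the rest overlap communication; raising MATVEC_SLOTS from 4 to 10 simply deepens that pool.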
@@ -904,66 +864,102 @@ ENDSUBROUTINE matvec_DistrBandedMatrixType
 !> @brief Helper routine that handles the movement of data out of the
 !> recieving buffer. This buffer has multiple "slots" that are either
 !> being worked on directly or receiving data from MPI in background.
+!> Empty slots are denoted with MPI_REQUEST_NULL
 !> @param acc The accumulating vector
 !> @param valBuf The recv buffer for values
+!> @param thisMat The matrixtype used in multiplication
 !> @param ctBuf Count of elements in each buffer slot
-!> @param idxBuf The recv buffer for indices
-!> @param idx The buffer slot to pop
+!> @param idx The empty buffer slot that was popped
 !> @param req The array of active requests for data
-!> @param idxReq The array of requests for indices
+!> @param f Optional bool to flush all requests
 !>
-SUBROUTINE pop_recv(acc,valBuf,idxBuf,ctBuf,idx,req,idxReq)
+SUBROUTINE pop_recv(acc,valBuf,thisMat,ctBuf,idx,req,f)
+!> Local vector data
 REAL(SRK), INTENT(INOUT) :: acc(:)
+!> List of buffers
 REAL(SRK), INTENT(INOUT) :: valBuf(:,:)
+!> Matrix used in SpMV
+CLASS(DistributedBandedMatrixType),INTENT(INOUT) :: thisMat
+!> Array of buffer sizes
 INTEGER(SIK), INTENT(INOUT) :: ctBuf(MATVEC_SLOTS)
-INTEGER(SIK), INTENT(INOUT) :: idxBuf(:,:)
-INTEGER(SIK), INTENT(IN) :: idx
+!> The buffer slot popped
+INTEGER(SIK), INTENT(OUT) :: idx
+!> List of MPI Requests
 INTEGER(SIK), INTENT(INOUT) :: req(MATVEC_SLOTS)
-INTEGER(SIK), INTENT(INOUT) :: idxReq(MATVEC_SLOTS)
-INTEGER(SIK) :: mpierr
+!> Optional argument to flush all buffers
+LOGICAL(SBK),OPTIONAL,INTENT(IN) :: f
+INTEGER(SIK) :: stt,stp,mpierr,i,rank
+LOGICAL(SBK) :: force
+CALL MPI_Comm_Rank(thisMat%comm,rank,mpierr)
-IF(req(idx) == 0 .AND. idxReq(idx) == 0) RETURN
-CALL MPI_Wait(req(idx),MPI_STATUS_IGNORE,mpierr)
+force=.FALSE.
+IF (PRESENT(f)) force=f
+IF (force) THEN
+IF (req(idx) == MPI_REQUEST_NULL) RETURN
+CALL MPI_Wait(req(idx),MPI_STATUS_IGNORE,mpierr)
+ELSE
+IF (ANY(req == MPI_REQUEST_NULL)) THEN
+DO i=1,SIZE(req)
+IF (req(i) == MPI_REQUEST_NULL) THEN
+idx=i
+RETURN
+ENDIF
+ENDDO
+ELSE
+CALL MPI_WaitAny(SIZE(req),req,idx,MPI_STATUS_IGNORE,mpierr)
+ENDIF
+ENDIF
 IF(ctBuf(idx) > 0) THEN
 acc(1:ctBuf(idx))=acc(1:ctBuf(idx))+valBuf(1:ctBuf(idx),idx)
 ELSE
-CALL MPI_Wait(idxReq(idx),MPI_STATUS_IGNORE,mpierr)
-idxReq(idx)=0
-acc(idxBuf(1:-ctBuf(idx),idx))=acc(idxBuf(1:-ctBuf(idx),idx)) &
-+valBuf(1:-ctBuf(idx),idx)
+stt=thisMat%incIdxStt(-ctBuf(idx))
+stp=thisMat%incIdxStp(-ctBuf(idx))
+DO i=stt,stp
+acc(thisMat%incIdxMap(i))=acc(thisMat%incIdxMap(i))+valBuf(i-stt+1,idx)
+ENDDO
 ENDIF
 ENDSUBROUTINE pop_recv
 !
 !-------------------------------------------------------------------------------
-!> @brief Helper routine that keeps of data in the send buffer while
+!> @brief Helper routine that keeps track of data in the send buffer while
 !> MPI is still working. This buffer has multiple "slots" that are
 !> either being worked on directly or contain data being sent by
-!> MPI in background.
+!> MPI in background. Empty slots are denoted with MPI_REQUEST_NULL
 !> @param valBuf The send buffer for values
-!> @param idxBuf The send buffer for indices
-!> @param idx The buffer slot to pop
+!> @param idx The buffer slot popped
 !> @param req The array of requests for data
-!> @param idxReq The array of requests for indices
+!> @param f Optional bool to flush all requests
 !>
-SUBROUTINE pop_send(valBuf, idxBuf, idx, req, idxReq)
+SUBROUTINE pop_send(valBuf,idx,req,f)
+!> Set of data buffers
 REAL(SRK), INTENT(INOUT) :: valBuf(:,:)
-!> The send buffer for indices
-INTEGER(SIK), INTENT(INOUT) :: idxBuf(:,:)
-!> The buffer slot to pop
-INTEGER(SIK), INTENT(IN) :: idx
-!> The array of requests for data
+!> The buffer slot popped
+INTEGER(SIK), INTENT(OUT) :: idx
+!> The array of MPI requests
 INTEGER(SIK), INTENT(INOUT) :: req(MATVEC_SLOTS)
-!> The array of requests for indices
-INTEGER(SIK), INTENT(INOUT) :: idxReq(MATVEC_SLOTS)
-INTEGER(SIK) :: mpierr
+!> Optional argument to flush all buffers
+LOGICAL(SBK),OPTIONAL,INTENT(IN) :: f
+INTEGER(SIK) :: mpierr,i
+LOGICAL(SBK) :: force
-IF(req(idx) == 0 .AND. idxReq(idx) == 0) RETURN
-CALL MPI_Wait(req(idx),MPI_STATUS_IGNORE,mpierr)
-IF(idxReq(idx) /= 0) THEN
-CALL MPI_Wait(idxReq(idx),MPI_STATUS_IGNORE,mpierr)
-idxReq(idx)=0
+force=.FALSE.
+IF (PRESENT(f)) force=f
+IF (force) THEN
+IF (req(idx) == MPI_REQUEST_NULL) RETURN
+CALL MPI_Wait(req(idx),MPI_STATUS_IGNORE,mpierr)
+ELSE
+IF (ANY(req == MPI_REQUEST_NULL)) THEN
+DO i=1,SIZE(req)
+IF (req(i) == MPI_REQUEST_NULL) THEN
+idx=i
+RETURN
+ENDIF
+ENDDO
+ELSE
+CALL MPI_WaitAny(SIZE(req),req,idx,MPI_STATUS_IGNORE,mpierr)
+ENDIF
 ENDIF
 ENDSUBROUTINE pop_send
 #endif
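In the new pop_recv above, a negative ctBuf entry selects the sparse path: the destination rows for values from a given source rank are read from incIdxMap between incIdxStt and incIdxStp, which the matrix now fills at assemble time. A minimal stand-alone sketch of that scatter-add follows; the array contents and program name are invented for illustration and are not part of Futility.

! Illustrative sketch only -- not Futility code.
PROGRAM scatter_add_demo
  IMPLICIT NONE
  DOUBLE PRECISION :: acc(6),valBuf(3)
  INTEGER :: incIdxMap(5),incIdxStt(2),incIdxStp(2)
  INTEGER :: src,i,stt,stp

  acc=0.0D0
  ! Entries 1:3 of the flat map belong to source rank 1, entries 4:5 to rank 2.
  incIdxMap=[2,4,6,1,5]
  incIdxStt=[1,4]
  incIdxStp=[3,5]

  src=1                        ! pretend this message came from rank 1
  valBuf=[10.0D0,20.0D0,30.0D0]
  stt=incIdxStt(src)
  stp=incIdxStp(src)
  DO i=stt,stp
    ! Scatter the received values into their precomputed destination rows
    acc(incIdxMap(i))=acc(incIdxMap(i))+valBuf(i-stt+1)
  ENDDO
  WRITE(*,*) acc               ! expect 10 at row 2, 20 at row 4, 30 at row 6
ENDPROGRAM scatter_add_demo

Precomputing the map once at assembly is what lets the kernel drop the old per-message index sends and their extra requests.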
@@ -132,11 +132,6 @@ TYPE Band
 REAL(SRK), ALLOCATABLE :: elem(:)
 ENDTYPE Band
-!> @brief Integer array used for 'ragged' storage of band contents
-TYPE IntPtr
-INTEGER(SIK),ALLOCATABLE :: p(:)
-ENDTYPE IntPtr
 !> @brief The basic banded matrix type
 TYPE,EXTENDS(MatrixType) :: BandedMatrixType
 !> Map of band indices stored (-m to n)
@@ -194,13 +189,15 @@ TYPE,EXTENDS(DistributedMatrixType) :: DistributedBandedMatrixType
 !> The column of banded matrix 'chunks' stored locally
 TYPE(BandedMatrixType),ALLOCATABLE :: chunks(:)
 !> Number of nonzero elements
-INTEGER(SIK) :: nnz
+INTEGER(SLK) :: nnz
 !> Number of columns
 INTEGER(SIK) :: m
 !> Block size (smallest indivisble unit)
 INTEGER(SIK) :: blockSize
 !> Array of band sizes used to determine optimal communication
-TYPE(IntPtr), ALLOCATABLE :: bandSizes(:)
+INTEGER(SIK), ALLOCATABLE :: incIdxMap(:)
+INTEGER(SIK), ALLOCATABLE :: incIdxStt(:)
+INTEGER(SIK), ALLOCATABLE :: incIdxStp(:)
 !> Temporary containers used before (and deallocated after) assembly
 INTEGER(SIK), ALLOCATABLE :: iTmp(:),jTmp(:)
 REAL(SRK),ALLOCATABLE :: elemTmp(:)
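The nnz component above moves from INTEGER(SIK) to INTEGER(SLK) because a global nonzero count can exceed the range of a default 32-bit integer. A tiny stand-alone check (hypothetical sizes, not tied to any Futility matrix) makes the limit concrete:

! Illustrative sketch only -- not Futility code.
PROGRAM nnz_overflow_check
  USE ISO_FORTRAN_ENV,ONLY : INT32,INT64
  IMPLICIT NONE
  INTEGER(INT64) :: n,avgPerRow,nnz

  n=60000_INT64          ! hypothetical global number of rows
  avgPerRow=40000_INT64  ! hypothetical average nonzeros per row
  nnz=n*avgPerRow        ! 2.4e9, already past the 32-bit limit
  WRITE(*,*) 'nnz               =',nnz
  WRITE(*,*) '32-bit integer max=',HUGE(1_INT32)
  WRITE(*,*) 'needs 64-bit?      ',nnz > INT(HUGE(1_INT32),INT64)
ENDPROGRAM nnz_overflow_check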
@@ -482,7 +479,9 @@ SUBROUTINE init_DistributedBandedMatrixParam(matrix,Params)
 #ifdef HAVE_MPI
 CHARACTER(LEN=*),PARAMETER :: myName='init_DistributedBandedMatrixParam'
 TYPE(ParamType) :: validParams
-INTEGER(SIK) :: n,m,nnz,commID,rank,mpierr,nproc,i,blocksize,nlocal
+INTEGER(SIK),ALLOCATABLE :: onnz(:),dnnz(:)
+INTEGER(SIK) :: n,m,commID,rank,mpierr,nproc,i,blocksize,nlocal,nnz_int
+INTEGER(SLK) :: nnz
 !Check to set up required and optional param lists.
 IF(.NOT.MatrixType_Paramsflag) CALL MatrixTypes_Declare_ValidParams()
@@ -496,10 +495,22 @@ SUBROUTINE init_DistributedBandedMatrixParam(matrix,Params)
 CALL validParams%get('MatrixType->n',n)
 CALL validParams%get('MatrixType->m',m)
 CALL validParams%get('MatrixType->MPI_Comm_ID',commID)
-CALL validParams%get('MatrixType->nnz',nnz)
+CALL validParams%get('MatrixType->nnz',nnz_int)
 CALL validParams%get('MatrixType->blockSize',blockSize)
 CALL validParams%get('MatrixType->nlocal',nlocal)
-CALL validParams%clear()
+nnz=INT(nnz_int,KIND=8)
+IF(nnz_int <= 1) THEN
+IF (validParams%has("MatrixType->dnnz") .AND. validParams%has("MatrixType->onnz")) THEN
+nnz=0_SLK
+CALL validParams%get("MatrixType->onnz",onnz)
+CALL validParams%get("MatrixType->dnnz",dnnz)
+DO i=1,SIZE(onnz)
+nnz=nnz+INT(onnz(i),KIND=SLK)
+nnz=nnz+INT(dnnz(i),KIND=SLK)
+ENDDO
+CALL MPI_AllReduce(nnz,MPI_IN_PLACE,1,MPI_LONG,MPI_SUM,commID,mpierr)
+ENDIF
+ENDIF
 IF(.NOT. matrix%isInit) THEN
 IF(n <= 1) THEN
@@ -575,12 +586,12 @@ SUBROUTINE init_DistributedBandedMatrixParam(matrix,Params)
 matrix%nnz=nnz
 matrix%blockSize = blockSize
 matrix%comm=commID
-ALLOCATE(matrix%bandSizes(nProc))
 ENDIF
 ELSE
 CALL eMatrixType%raiseError('Incorrect call to '// &
 modName//'::'//myName//' - MatrixType already initialized')
 ENDIF
+CALL validParams%clear()
 #endif
 ENDSUBROUTINE init_DistributedBandedMatrixParam
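The new init path above reduces per-rank 64-bit counts to a global total across the communicator. Below is a minimal, self-contained example of an in-place MPI_Allreduce of a 64-bit integer, written in the standard form (MPI_IN_PLACE as the send buffer, MPI_INTEGER8 as the Fortran datatype); the rank-dependent local count is invented for illustration and the snippet is independent of the Futility call shown above.

! Illustrative sketch only -- not Futility code.
PROGRAM allreduce_nnz_demo
  USE mpi
  USE ISO_FORTRAN_ENV,ONLY : INT64
  IMPLICIT NONE
  INTEGER :: rank,mpierr
  INTEGER(INT64) :: nnz

  CALL MPI_Init(mpierr)
  CALL MPI_Comm_rank(MPI_COMM_WORLD,rank,mpierr)
  nnz=INT(1000*(rank+1),INT64)          ! pretend local nonzero count on this rank
  ! MPI_IN_PLACE as the send buffer sums directly into nnz on every rank
  CALL MPI_Allreduce(MPI_IN_PLACE,nnz,1,MPI_INTEGER8,MPI_SUM, &
      MPI_COMM_WORLD,mpierr)
  WRITE(*,*) 'rank',rank,'global nnz =',nnz
  CALL MPI_Finalize(mpierr)
ENDPROGRAM allreduce_nnz_demo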
@@ -690,10 +701,10 @@ SUBROUTINE assemble_BandedMatrixType(thisMatrix)
 ALLOCATE(idxOrig(thisMatrix%counter))
 IF (thisMatrix%isAssembled) RETURN
 DO i=1,thisMatrix%counter
-iLong=INT(thisMatrix%iTmp(i),kind=SLK)
-jLong=INT(thisMatrix%jTmp(i),kind=SLK)
-nLong=INT(thisMatrix%n,kind=SLK)
-mLong=INT(thisMatrix%m,kind=SLK)
+iLong=INT(thisMatrix%iTmp(i),KIND=SLK)
+jLong=INT(thisMatrix%jTmp(i),KIND=SLK)
+nLong=INT(thisMatrix%n,KIND=SLK)
+mLong=INT(thisMatrix%m,KIND=SLK)
 idxOrig(i)=i
 ! The diagonal rank counts upwards as one moves southeast in the matrix,
 ! starting at a large negative number in the bottom left, and reaching
@@ -764,14 +775,14 @@ SUBROUTINE assemble_DistributedBandedMatrixType(thisMatrix,ierr)
 INTEGER(SIK),INTENT(OUT),OPTIONAL :: ierr
 #ifdef HAVE_MPI
 TYPE(ParamType) :: bandedPList
-INTEGER(SIK) :: mpierr, rank, nproc, i, j, nBandLocal
-INTEGER(SIK),ALLOCATABLE :: nnzPerChunk(:),nband(:),bandSizeTmp(:)
+INTEGER(SIK) :: mpierr,rank,nproc,i,j,nTransmit,stt,stp
+INTEGER(SIK),ALLOCATABLE :: nnzPerChunk(:),nRecv(:),idxMapTmp(:)
 REQUIRE(thisMatrix%isInit)
 CALL MPI_Comm_rank(thisMatrix%comm,rank,mpierr)
 CALL MPI_Comm_size(thisMatrix%comm,nproc,mpierr)
 ALLOCATE(nnzPerChunk(nProc))
-ALLOCATE(nBand(nProc))
+ALLOCATE(nRecv(nProc))
 nnzPerChunk=0
 DO i=1,thisMatrix%nLocal
 DO j=1,nproc
@@ -804,45 +815,77 @@ SUBROUTINE assemble_DistributedBandedMatrixType(thisMatrix,ierr)
 CALL thisMatrix%chunks(i)%assemble()