Unverified commit ef0deee4 authored by Kabelitz, Matthew Edward, committed by GitHub

Native SpMV Kernel Update and nnz changes (#316)

* Native matrices now determine indices for data transfer at assemble-time

* Changed to MPI_Waitany for nonblocking progress (see the sketch below)

* Moved local computation to end

* Updated NNZ to be compatible with PETSc o/dnnz parameters

* Resolved CI python version mismatch

* Fixed a bug where too much memory was being allocated by Native matrix init
parent 81fdbc2c
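The switch to MPI_Waitany replaces the old fixed rotation through MOD(counter,MATVEC_SLOTS): a buffer slot is claimed either by scanning the request pool for MPI_REQUEST_NULL or, when every slot is busy, by waiting for whichever in-flight request finishes first. The standalone program below is only a minimal sketch of that pattern under simplified assumptions (a ring of sends, fixed-size buffers); none of its names come from this commit.

PROGRAM waitany_slot_sketch
  USE mpi
  IMPLICIT NONE
  INTEGER,PARAMETER :: NSLOTS=4
  INTEGER :: req(NSLOTS),idx,i,rank,nproc,dest,src,mpierr
  REAL(KIND=8) :: sendBuf(8,NSLOTS),recvBuf(8)

  CALL MPI_Init(mpierr)
  CALL MPI_Comm_rank(MPI_COMM_WORLD,rank,mpierr)
  CALL MPI_Comm_size(MPI_COMM_WORLD,nproc,mpierr)
  req=MPI_REQUEST_NULL
  dest=MODULO(rank+1,nproc)
  src=MODULO(rank-1,nproc)

  DO i=1,3*NSLOTS
    ! Claim a slot: reuse an idle one if available, otherwise block until any
    ! outstanding send finishes (MPI_Waitany returns its 1-based index and
    ! resets that entry to MPI_REQUEST_NULL).
    IF(ANY(req == MPI_REQUEST_NULL)) THEN
      DO idx=1,NSLOTS
        IF(req(idx) == MPI_REQUEST_NULL) EXIT
      ENDDO
    ELSE
      CALL MPI_Waitany(NSLOTS,req,idx,MPI_STATUS_IGNORE,mpierr)
    ENDIF
    ! The slot's buffer is now safe to overwrite and post again.
    sendBuf(:,idx)=REAL(i,KIND=8)
    CALL MPI_Isend(sendBuf(:,idx),SIZE(sendBuf,1),MPI_DOUBLE_PRECISION,dest,0, &
        MPI_COMM_WORLD,req(idx),mpierr)
    ! Matching receive so the ring of sends always completes.
    CALL MPI_Recv(recvBuf,SIZE(recvBuf),MPI_DOUBLE_PRECISION,src,0, &
        MPI_COMM_WORLD,MPI_STATUS_IGNORE,mpierr)
  ENDDO

  ! Drain the pool before the buffers go out of scope.
  CALL MPI_Waitall(NSLOTS,req,MPI_STATUSES_IGNORE,mpierr)
  CALL MPI_Finalize(mpierr)
ENDPROGRAM waitany_slot_sketch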
......@@ -111,7 +111,7 @@ PUBLIC :: BandedMatrixType
PUBLIC :: DistributedBandedMatrixType
PUBLIC :: DistributedBlockBandedMatrixType
PUBLIC :: SparseMatrixType
INTEGER(SIK),PARAMETER,PUBLIC :: MATVEC_SLOTS=4
INTEGER(SIK),PARAMETER,PUBLIC :: MATVEC_SLOTS=10
! PETSc implementations
#ifdef FUTILITY_HAVE_PETSC
PUBLIC :: PETScMatrixType
......@@ -698,18 +698,17 @@ SUBROUTINE matvec_DistrBandedMatrixType(thisMatrix,x,y,t,ul,d,incx,a,b)
CHARACTER(LEN=1),INTENT(IN) :: ul
CHARACTER(LEN=1),INTENT(IN) :: d
INTEGER(SIK),INTENT(IN) :: incx
INTEGER(SIK) :: rank,k
INTEGER(SIK),ALLOCATABLE :: recvIdx(:,:), sendIdx(:,:)
INTEGER(SIK) :: rank,k,lowIdx,highIdx
REAL(SRK),ALLOCATABLE :: recvResult(:,:),sendResult(:,:),tmpProduct(:)
INTEGER(SIK) :: sendRequests(MATVEC_SLOTS),sendIdxRequests(MATVEC_SLOTS)
INTEGER(SIK) :: recvRequests(MATVEC_SLOTS),recvIdxRequests(MATVEC_SLOTS)
INTEGER(SIK) :: lowIdx,highIdx,sendCounter,recvCounter
#ifdef HAVE_MPI
! Get rank
INTEGER(SIK) :: sendRequests(MATVEC_SLOTS),recvRequests(MATVEC_SLOTS)
INTEGER(SIK) :: ctRecv(MATVEC_SLOTS),srcRank,destRank
INTEGER(SIK) :: i,idxTmp,cnt,ctDefault,mpierr,nproc
INTEGER(SIK) :: i,idxTmp,ctDefault,mpierr,nproc
CALL MPI_Comm_rank(thisMatrix%comm,rank,mpierr)
CALL MPI_Comm_Size(thisMatrix%comm,nProc,mpierr)
sendRequests=MPI_REQUEST_NULL
recvRequests=MPI_REQUEST_NULL
#else
rank=0
#endif
......@@ -719,55 +718,13 @@ SUBROUTINE matvec_DistrBandedMatrixType(thisMatrix,x,y,t,ul,d,incx,a,b)
! This allows one to be used for computation and the rest for
! communication.
! Which one to write to will be determined by *Counter MOD MATVEC_SLOTS
sendCounter=0
recvCounter=0
sendRequests=0
sendIdxRequests=0
recvRequests=0
recvIdxRequests=0
! The recv/sendResult array will sometimes hold its full length,
! and sometimes less.
! The variable "cnt" will be used to determine this at
! each iteration
ALLOCATE(recvIdx(thisMatrix%iOffsets(2),MATVEC_SLOTS))
ALLOCATE(recvResult(thisMatrix%iOffsets(2),MATVEC_SLOTS))
ALLOCATE(sendIdx(thisMatrix%iOffsets(2),MATVEC_SLOTS))
ALLOCATE(sendResult(thisMatrix%iOffsets(2),MATVEC_SLOTS))
ALLOCATE(tmpProduct(thisMatrix%iOffsets(rank+2)- &
thisMatrix%iOffsets(rank+1)))
! First, take care of locally held data.
SELECT TYPE(thisMatrix)
TYPE IS(DistributedBlockBandedMatrixType)
IF(.NOT. thisMatrix%blockMask) THEN
DO k=1,thisMatrix%nlocalBlocks
lowIdx=(k-1)*thisMatrix%blockSize+1
highIdx=lowIdx-1+thisMatrix%blockSize
CALL matvec_MatrixType(THISMATRIX=thisMatrix%blocks(k),X=x(lowIdx:highIdx), &
Y=tmpProduct(lowIdx:highIdx),ALPHA=1.0_SRK,BETA=0.0_SRK)
ENDDO
IF(thisMatrix%chunks(rank+1)%isInit) THEN
CALL BLAS_matvec(THISMATRIX=thisMatrix%chunks(rank+1),X=x, &
y=tmpProduct,ALPHA=1.0_SRK,BETA=1.0_SRK)
ENDIF
ELSE
IF(thisMatrix%chunks(rank+1)%isInit) THEN
CALL BLAS_matvec(THISMATRIX=thisMatrix%chunks(rank+1),X=x, &
y=tmpProduct,ALPHA=1.0_SRK,BETA=0.0_SRK)
ELSE
tmpProduct=0.0_SRK
ENDIF
ENDIF
TYPE IS(DistributedBandedMatrixType)
IF(thisMatrix%chunks(rank+1)%isInit) THEN
CALL BLAS_matvec(THISMATRIX=thisMatrix%chunks(rank+1),X=x, &
y=tmpProduct,ALPHA=1.0_SRK,BETA=0.0_SRK)
ELSE
tmpProduct=0.0_SRK
ENDIF
CLASS DEFAULT
tmpProduct = 0.0_SRK
ENDSELECT
#ifdef HAVE_MPI
! On each rank, loop over the chunks held (on diagonal moving down)
......@@ -778,79 +735,82 @@ SUBROUTINE matvec_DistrBandedMatrixType(thisMatrix,x,y,t,ul,d,incx,a,b)
! First do local computation and send
IF (thismatrix%chunks(destRank+1)%isInit) THEN
! Decide whether to send whole vector or multiple sparse
IF (2*thisMatrix%chunks(destRank+1)%nnz/3 >= thisMatrix%chunks(destRank+1)%n) THEN
sendCounter=sendCounter+1
idxTmp=MOD(sendCounter,MATVEC_SLOTS)+1
IF (thisMatrix%chunks(destRank+1)%nnz >= thisMatrix%chunks(destRank+1)%n) THEN
! Check if we can safely write to sendRequests
CALL pop_send(sendResult,sendIdx,idxTmp,sendRequests,sendIdxRequests)
CALL pop_send(sendResult,idxTmp,sendRequests)
CALL BLAS_matvec(THISMATRIX=thisMatrix%chunks(destRank+1),X=x,&
y=sendResult(1:ctDefault,idxTmp),ALPHA=1.0_SRK,BETA=0.0_SRK)
CALL MPI_ISend(sendResult(1:ctDefault,idxTmp),ctDefault, &
MPI_DOUBLE_PRECISION,destRank,0,thisMatrix%comm,sendRequests(idxTmp),mpierr)
ELSE
! Send several sparse vectors
! Gather and send several sparse vectors
CALL pop_send(sendResult,idxTmp,sendRequests)
lowIdx=1
DO k=1,SIZE(thisMatrix%chunks(destRank+1)%bandIdx)
sendCounter=sendCounter+1
idxTmp=MOD(sendCounter,MATVEC_SLOTS)+1
CALL pop_send(sendResult,sendIdx,idxTmp,sendRequests,sendIdxRequests)
! compute destination indices
! perform special-case multiplication from single band here
cnt=SIZE(thisMatrix%chunks(destRank+1)%bands(k)%jIdx)
sendIdx(1:cnt,idxTmp)=thisMatrix%chunks(destRank+1)%bands(k)%jIdx &
-thisMatrix%chunks(destRank+1)%bandIdx(k)
CALL MPI_ISend(sendIdx(1:cnt,idxTmp),cnt,MPI_INTEGER,destRank,0, &
thisMatrix%comm,sendIdxRequests(idxTmp), mpierr)
sendResult(1:cnt,idxTmp)=thisMatrix%chunks(destRank+1)%bands(k)%elem &
highidx=lowIdx+SIZE(thisMatrix%chunks(destRank+1)%bands(k)%jIdx)-1
sendResult(lowIdx:highIdx,idxTmp)=thisMatrix%chunks(destRank+1)%bands(k)%elem &
*x(thisMatrix%chunks(destRank+1)%bands(k)%jIdx)
CALL MPI_ISend(sendResult(1:cnt,idxTmp),cnt,MPI_DOUBLE_PRECISION, &
destRank,0,thisMatrix%comm,sendRequests(idxTmp), mpierr)
lowIdx=highIdx+1
ENDDO
CALL MPI_ISend(sendResult(1:highIdx,idxTmp),highIdx,MPI_DOUBLE_PRECISION, &
destRank,0,thisMatrix%comm,sendRequests(idxTmp), mpierr)
ENDIF
ENDIF
! We might receive data from rank MOD(rank-i,nproc)
srcRank = MODULO(rank-i,nProc)
IF (ALLOCATED(thisMatrix%bandSizes(srcRank+1)%p)) THEN
IF (thisMatrix%bandSizes(srcRank+1)%p(1) < 0) THEN
IF (thisMatrix%incIdxStt(srcRank+1) == -1) THEN
! We are receiving a whole vector at once
recvCounter=recvCounter+1
idxTmp=MOD(recvCounter,MATVEC_SLOTS)+1
! If we've filled up the available storage, we need
! to wait for communication to finish up
CALL pop_recv(tmpProduct,recvResult,recvIdx,ctRecv,idxTmp, &
recvRequests, recvIdxRequests)
ctRecv(MOD(recvCounter,MATVEC_SLOTS)+1)=ctDefault
CALL MPI_IRecv(recvResult(1:ctDefault,idxTmp), ctDefault, &
CALL pop_recv(tmpProduct,recvResult,thisMatrix, &
ctRecv,idxTmp,recvRequests)
ctRecv(idxTmp)=SIZE(tmpProduct)
CALL MPI_IRecv(recvResult(1:ctRecv(idxTmp),idxTmp), ctRecv(idxTmp), &
MPI_DOUBLE_PRECISION,srcRank,0,thisMatrix%comm, &
recvRequests(idxTmp),mpierr)
ELSE
! We are receiving multiple sparse chunks
DO k=1,SIZE(thisMatrix%bandSizes(srcRank+1)%p)
recvCounter=recvCounter+1
idxTmp=MOD(recvCounter,MATVEC_SLOTS)+1
! If we've filled up the available storage, we need
! to wait for communication to finish up
CALL pop_recv(tmpProduct,recvResult,recvIdx,ctRecv,idxTmp, &
recvRequests, recvIdxRequests)
ctRecv(idxTmp)=-thisMatrix%bandSizes(srcRank+1)%p(k)
CALL MPI_IRecv(recvIdx(1:ctRecv(idxTmp),idxTmp),-ctRecv(idxTmp), &
MPI_INTEGER,srcRank,0,thisMatrix%comm,recvIdxRequests(idxTmp), mpierr)
CALL MPI_IRecv(recvResult(1:ctRecv(idxTmp),idxTmp),-ctRecv(idxTmp), &
ELSEIF (thisMatrix%incIdxStt(srcRank+1) > 0) THEN
! We are receiving multiple sparse chunks gathered together
CALL pop_recv(tmpProduct,recvResult,thisMatrix, &
ctRecv,idxTmp,recvRequests)
ctRecv(idxTmp)= -srcRank-1
lowIdx=thisMatrix%incIdxStt(srcRank+1)
highIdx=thisMatrix%incIdxStp(srcRank+1)
CALL MPI_IRecv(recvResult(1:(highIdx-lowIdx+1),idxTmp),highIdx-lowIdx+1, &
MPI_DOUBLE_PRECISION,srcRank,0,thisMatrix%comm,recvRequests(idxTmp), mpierr)
ENDDO
ENDIF
ENDDO
#endif
! Now, take care of locally held data.
SELECT TYPE(thisMatrix)
TYPE IS(DistributedBlockBandedMatrixType)
IF(thisMatrix%chunks(rank+1)%isInit) THEN
CALL BLAS_matvec(THISMATRIX=thisMatrix%chunks(rank+1),X=x, &
y=tmpProduct,ALPHA=1.0_SRK,BETA=1.0_SRK)
ENDIF
IF(.NOT. thisMatrix%blockMask) THEN
DO k=1,thisMatrix%nlocalBlocks
lowIdx=(k-1)*thisMatrix%blockSize+1
highIdx=lowIdx-1+thisMatrix%blockSize
CALL matvec_MatrixType(THISMATRIX=thisMatrix%blocks(k),X=x(lowIdx:highIdx), &
Y=tmpProduct(lowIdx:highIdx),ALPHA=1.0_SRK,BETA=1.0_SRK)
ENDDO
! We've finished calling irecv. Wait for remaining
! requests to finish:
DO idxTmp=1,MATVEC_SLOTS
CALL pop_recv(tmpProduct, recvResult, recvIdx, ctRecv, idxTmp, &
recvRequests, recvIdxRequests)
ENDIF
TYPE IS(DistributedBandedMatrixType)
IF(thisMatrix%chunks(rank+1)%isInit) THEN
CALL BLAS_matvec(THISMATRIX=thisMatrix%chunks(rank+1),X=x, &
y=tmpProduct,ALPHA=1.0_SRK,BETA=1.0_SRK)
ENDIF
ENDSELECT
#ifdef HAVE_MPI
! Wait for remaining requests to finish:
DO k=1,MATVEC_SLOTS
idxTmp=k
CALL pop_recv(tmpProduct,recvResult,thisMatrix,ctRecv,idxTmp,recvRequests,.TRUE.)
ENDDO
DO idxTmp=1,MATVEC_SLOTS
CALL pop_send(sendResult,sendIdx,idxTmp,sendRequests,sendIdxRequests)
DO k=1,MATVEC_SLOTS
idxTmp=k
CALL pop_send(sendResult,idxTmp,sendRequests,.TRUE.)
ENDDO
#endif
......@@ -904,66 +864,102 @@ ENDSUBROUTINE matvec_DistrBandedMatrixType
!> @brief Helper routine that handles the movement of data out of the
!> receiving buffer. This buffer has multiple "slots" that are either
!> being worked on directly or receiving data from MPI in background.
!> Empty slots are denoted with MPI_REQUEST_NULL
!> @param acc The accumulating vector
!> @param valBuf The recv buffer for values
!> @param thisMat The matrixtype used in multiplication
!> @param ctBuf Count of elements in each buffer slot
!> @param idxBuf The recv buffer for indices
!> @param idx The buffer slot to pop
!> @param idx The empty buffer slot that was popped
!> @param req The array of active requests for data
!> @param idxReq The array of requests for indices
!> @param f Optional bool to flush all requests
!>
SUBROUTINE pop_recv(acc,valBuf,idxBuf,ctBuf,idx,req,idxReq)
SUBROUTINE pop_recv(acc,valBuf,thisMat,ctBuf,idx,req,f)
!> Local vector data
REAL(SRK), INTENT(INOUT) :: acc(:)
!> List of buffers
REAL(SRK), INTENT(INOUT) :: valBuf(:,:)
!> Matrix used in SpMV
CLASS(DistributedBandedMatrixType),INTENT(INOUT) :: thisMat
!> Array of buffer sizes
INTEGER(SIK), INTENT(INOUT) :: ctBuf(MATVEC_SLOTS)
INTEGER(SIK), INTENT(INOUT) :: idxBuf(:,:)
INTEGER(SIK), INTENT(IN) :: idx
!> The buffer slot popped
INTEGER(SIK), INTENT(OUT) :: idx
!> List of MPI Requests
INTEGER(SIK), INTENT(INOUT) :: req(MATVEC_SLOTS)
INTEGER(SIK), INTENT(INOUT) :: idxReq(MATVEC_SLOTS)
INTEGER(SIK) :: mpierr
IF(req(idx) == 0 .AND. idxReq(idx) == 0) RETURN
!> Optional argument to flush all buffers
LOGICAL(SBK),OPTIONAL,INTENT(IN) :: f
INTEGER(SIK) :: stt,stp,mpierr,i,rank
LOGICAL(SBK) :: force
CALL MPI_Comm_Rank(thisMat%comm,rank,mpierr)
force=.FALSE.
IF (PRESENT(f)) force=f
IF (force) THEN
IF (req(idx) == MPI_REQUEST_NULL) RETURN
CALL MPI_Wait(req(idx),MPI_STATUS_IGNORE,mpierr)
ELSE
IF (ANY(req == MPI_REQUEST_NULL)) THEN
DO i=1,SIZE(req)
IF (req(i) == MPI_REQUEST_NULL) THEN
idx=i
RETURN
ENDIF
ENDDO
ELSE
CALL MPI_WaitAny(SIZE(req),req,idx,MPI_STATUS_IGNORE,mpierr)
ENDIF
ENDIF
IF(ctBuf(idx) > 0) THEN
acc(1:ctBuf(idx))=acc(1:ctBuf(idx))+valBuf(1:ctBuf(idx),idx)
ELSE
CALL MPI_Wait(idxReq(idx),MPI_STATUS_IGNORE,mpierr)
idxReq(idx)=0
acc(idxBuf(1:-ctBuf(idx),idx))=acc(idxBuf(1:-ctBuf(idx),idx)) &
+valBuf(1:-ctBuf(idx),idx)
stt=thisMat%incIdxStt(-ctBuf(idx))
stp=thisMat%incIdxStp(-ctBuf(idx))
DO i=stt,stp
acc(thisMat%incIdxMap(i))=acc(thisMat%incIdxMap(i))+valBuf(i-stt+1,idx)
ENDDO
ENDIF
ENDSUBROUTINE pop_recv
!
!-------------------------------------------------------------------------------
!> @brief Helper routine that keeps of data in the send buffer while
!> @brief Helper routine that keeps track of data in the send buffer while
!> MPI is still working. This buffer has multiple "slots" that are
!> either being worked on directly or contain data being sent by
!> MPI in background.
!> MPI in background. Empty slots are denoted with MPI_REQUEST_NULL
!> @param valBuf The send buffer for values
!> @param idxBuf The send buffer for indices
!> @param idx The buffer slot to pop
!> @param idx The buffer slot popped
!> @param req The array of requests for data
!> @param idxReq The array of requests for indices
!> @param f Optional bool to flush all requests
!>
SUBROUTINE pop_send(valBuf, idxBuf, idx, req, idxReq)
SUBROUTINE pop_send(valBuf,idx,req,f)
!> Set of data buffers
REAL(SRK), INTENT(INOUT) :: valBuf(:,:)
!> The send buffer for indices
INTEGER(SIK), INTENT(INOUT) :: idxBuf(:,:)
!> The buffer slot to pop
INTEGER(SIK), INTENT(IN) :: idx
!> The array of requests for data
!> The buffer slot popped
INTEGER(SIK), INTENT(OUT) :: idx
!> The array of MPI requests
INTEGER(SIK), INTENT(INOUT) :: req(MATVEC_SLOTS)
!> The array of requests for indices
INTEGER(SIK), INTENT(INOUT) :: idxReq(MATVEC_SLOTS)
INTEGER(SIK) :: mpierr
IF(req(idx) == 0 .AND. idxReq(idx) == 0) RETURN
!> Optional argument to flush all buffers
LOGICAL(SBK),OPTIONAL,INTENT(IN) :: f
INTEGER(SIK) :: mpierr,i
LOGICAL(SBK) :: force
force=.FALSE.
IF (PRESENT(f)) force=f
IF (force) THEN
IF (req(idx) == MPI_REQUEST_NULL) RETURN
CALL MPI_Wait(req(idx),MPI_STATUS_IGNORE,mpierr)
IF(idxReq(idx) /= 0) THEN
CALL MPI_Wait(idxReq(idx),MPI_STATUS_IGNORE,mpierr)
idxReq(idx)=0
ELSE
IF (ANY(req == MPI_REQUEST_NULL)) THEN
DO i=1,SIZE(req)
IF (req(i) == MPI_REQUEST_NULL) THEN
idx=i
RETURN
ENDIF
ENDDO
ELSE
CALL MPI_WaitAny(SIZE(req),req,idx,MPI_STATUS_IGNORE,mpierr)
ENDIF
ENDIF
ENDSUBROUTINE pop_send
#endif
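! Illustrative sketch (not code from this commit) of the f=.TRUE. flush path the
! two helpers above share: every slot still holding a live request is waited on;
! MPI_Wait resets completed entries to MPI_REQUEST_NULL, so the buffers can then
! be reused or freed safely.
SUBROUTINE flush_slots_sketch(req)
  USE mpi
  IMPLICIT NONE
  INTEGER,INTENT(INOUT) :: req(:)
  INTEGER :: i,mpierr
  DO i=1,SIZE(req)
    IF(req(i) /= MPI_REQUEST_NULL) CALL MPI_Wait(req(i),MPI_STATUS_IGNORE,mpierr)
  ENDDO
ENDSUBROUTINE flush_slots_sketch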
......
......@@ -132,11 +132,6 @@ TYPE Band
REAL(SRK), ALLOCATABLE :: elem(:)
ENDTYPE Band
!> @brief Integer array used for 'ragged' storage of band contents
TYPE IntPtr
INTEGER(SIK),ALLOCATABLE :: p(:)
ENDTYPE IntPtr
!> @brief The basic banded matrix type
TYPE,EXTENDS(MatrixType) :: BandedMatrixType
!> Map of band indices stored (-m to n)
......@@ -194,13 +189,15 @@ TYPE,EXTENDS(DistributedMatrixType) :: DistributedBandedMatrixType
!> The column of banded matrix 'chunks' stored locally
TYPE(BandedMatrixType),ALLOCATABLE :: chunks(:)
!> Number of nonzero elements
INTEGER(SIK) :: nnz
INTEGER(SLK) :: nnz
!> Number of columns
INTEGER(SIK) :: m
!> Block size (smallest indivisible unit)
INTEGER(SIK) :: blockSize
!> Array of band sizes used to determine optimal communication
TYPE(IntPtr), ALLOCATABLE :: bandSizes(:)
INTEGER(SIK), ALLOCATABLE :: incIdxMap(:)
INTEGER(SIK), ALLOCATABLE :: incIdxStt(:)
INTEGER(SIK), ALLOCATABLE :: incIdxStp(:)
!> Temporary containers used before (and deallocated after) assembly
INTEGER(SIK), ALLOCATABLE :: iTmp(:),jTmp(:)
REAL(SRK),ALLOCATABLE :: elemTmp(:)
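! Illustrative sketch (not code from this commit): the three incIdx* arrays
! replace the old ragged IntPtr bandSizes with a flat, CSR-style layout. For data
! arriving from rank j, incIdxStt(j+1)..incIdxStp(j+1) bounds the slice of
! incIdxMap whose entries are the local rows each received value accumulates
! into, as in this simplified scatter-add (declarations are illustrative only).
SUBROUTINE scatter_add_sketch(acc,recvd,incIdxMap,incIdxStt,incIdxStp,srcRank)
  IMPLICIT NONE
  REAL(KIND=8),INTENT(INOUT) :: acc(:)    ! local accumulator (tmpProduct)
  REAL(KIND=8),INTENT(IN) :: recvd(:)     ! packed values received from srcRank
  INTEGER,INTENT(IN) :: incIdxMap(:),incIdxStt(:),incIdxStp(:),srcRank
  INTEGER :: i,stt,stp
  stt=incIdxStt(srcRank+1)
  stp=incIdxStp(srcRank+1)
  ! Each packed value lands in the local row recorded at assemble time.
  DO i=stt,stp
    acc(incIdxMap(i))=acc(incIdxMap(i))+recvd(i-stt+1)
  ENDDO
ENDSUBROUTINE scatter_add_sketch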
......@@ -482,7 +479,9 @@ SUBROUTINE init_DistributedBandedMatrixParam(matrix,Params)
#ifdef HAVE_MPI
CHARACTER(LEN=*),PARAMETER :: myName='init_DistributedBandedMatrixParam'
TYPE(ParamType) :: validParams
INTEGER(SIK) :: n,m,nnz,commID,rank,mpierr,nproc,i,blocksize,nlocal
INTEGER(SIK),ALLOCATABLE :: onnz(:),dnnz(:)
INTEGER(SIK) :: n,m,commID,rank,mpierr,nproc,i,blocksize,nlocal,nnz_int
INTEGER(SLK) :: nnz
!Check to set up required and optional param lists.
IF(.NOT.MatrixType_Paramsflag) CALL MatrixTypes_Declare_ValidParams()
......@@ -496,10 +495,22 @@ SUBROUTINE init_DistributedBandedMatrixParam(matrix,Params)
CALL validParams%get('MatrixType->n',n)
CALL validParams%get('MatrixType->m',m)
CALL validParams%get('MatrixType->MPI_Comm_ID',commID)
CALL validParams%get('MatrixType->nnz',nnz)
CALL validParams%get('MatrixType->nnz',nnz_int)
CALL validParams%get('MatrixType->blockSize',blockSize)
CALL validParams%get('MatrixType->nlocal',nlocal)
CALL validParams%clear()
nnz=INT(nnz_int,KIND=8)
IF(nnz_int <= 1) THEN
IF (validParams%has("MatrixType->dnnz") .AND. validParams%has("MatrixType->onnz")) THEN
nnz=0_SLK
CALL validParams%get("MatrixType->onnz",onnz)
CALL validParams%get("MatrixType->dnnz",dnnz)
DO i=1,SIZE(onnz)
nnz=nnz+INT(onnz(i),KIND=SLK)
nnz=nnz+INT(dnnz(i),KIND=SLK)
ENDDO
CALL MPI_AllReduce(MPI_IN_PLACE,nnz,1,MPI_INTEGER8,MPI_SUM,commID,mpierr)
ENDIF
ENDIF
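! Illustrative restatement (not code from this commit): in PETSc's preallocation
! convention, dnnz(i) and onnz(i) count the nonzeros of local row i inside and
! outside the diagonal block, so the global nonzero count is the sum of both
! arrays reduced over all ranks, which is what the branch above computes. A
! compact standalone form, with illustrative names:
FUNCTION global_nnz_sketch(dnnz,onnz,comm) RESULT(nnz)
  USE mpi
  IMPLICIT NONE
  INTEGER,INTENT(IN) :: dnnz(:),onnz(:)  ! PETSc-style per-local-row counts
  INTEGER,INTENT(IN) :: comm
  INTEGER(KIND=8) :: nnz
  INTEGER :: mpierr
  ! Accumulate locally in 64 bits to avoid overflow on large problems.
  nnz=SUM(INT(dnnz,KIND=8))+SUM(INT(onnz,KIND=8))
  CALL MPI_Allreduce(MPI_IN_PLACE,nnz,1,MPI_INTEGER8,MPI_SUM,comm,mpierr)
ENDFUNCTION global_nnz_sketch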
IF(.NOT. matrix%isInit) THEN
IF(n <= 1) THEN
......@@ -575,12 +586,12 @@ SUBROUTINE init_DistributedBandedMatrixParam(matrix,Params)
matrix%nnz=nnz
matrix%blockSize = blockSize
matrix%comm=commID
ALLOCATE(matrix%bandSizes(nProc))
ENDIF
ELSE
CALL eMatrixType%raiseError('Incorrect call to '// &
modName//'::'//myName//' - MatrixType already initialized')
ENDIF
CALL validParams%clear()
#endif
ENDSUBROUTINE init_DistributedBandedMatrixParam
......@@ -690,10 +701,10 @@ SUBROUTINE assemble_BandedMatrixType(thisMatrix)
ALLOCATE(idxOrig(thisMatrix%counter))
IF (thisMatrix%isAssembled) RETURN
DO i=1,thisMatrix%counter
iLong=INT(thisMatrix%iTmp(i),kind=SLK)
jLong=INT(thisMatrix%jTmp(i),kind=SLK)
nLong=INT(thisMatrix%n,kind=SLK)
mLong=INT(thisMatrix%m,kind=SLK)
iLong=INT(thisMatrix%iTmp(i),KIND=SLK)
jLong=INT(thisMatrix%jTmp(i),KIND=SLK)
nLong=INT(thisMatrix%n,KIND=SLK)
mLong=INT(thisMatrix%m,KIND=SLK)
idxOrig(i)=i
! The diagonal rank counts upwards as one moves southeast in the matrix,
! starting at a large negative number in the bottom left, and reaching
......@@ -764,14 +775,14 @@ SUBROUTINE assemble_DistributedBandedMatrixType(thisMatrix,ierr)
INTEGER(SIK),INTENT(OUT),OPTIONAL :: ierr
#ifdef HAVE_MPI
TYPE(ParamType) :: bandedPList
INTEGER(SIK) :: mpierr, rank, nproc, i, j, nBandLocal
INTEGER(SIK),ALLOCATABLE :: nnzPerChunk(:),nband(:),bandSizeTmp(:)
INTEGER(SIK) :: mpierr,rank,nproc,i,j,nTransmit,stt,stp
INTEGER(SIK),ALLOCATABLE :: nnzPerChunk(:),nRecv(:),idxMapTmp(:)
REQUIRE(thisMatrix%isInit)
CALL MPI_Comm_rank(thisMatrix%comm,rank,mpierr)
CALL MPI_Comm_size(thisMatrix%comm,nproc,mpierr)
ALLOCATE(nnzPerChunk(nProc))
ALLOCATE(nBand(nProc))
ALLOCATE(nRecv(nProc))
nnzPerChunk=0
DO i=1,thisMatrix%nLocal
DO j=1,nproc
......@@ -804,45 +815,77 @@ SUBROUTINE assemble_DistributedBandedMatrixType(thisMatrix,ierr)
CALL thisMatrix%chunks(i)%assemble()
CALL bandedPList%clear()
ENDIF
! Set nBandLocal (to be gathered into nBand)
! Decide the sending method to use. 'Full' chunks will send their whole
! contents while 'sparse' chunks will send band products.
! Set nTransmit to the number of REALs to be sent from that processor to
! this one.
! >0: This many
! 0: None (chunk empty)
! -1: All
nTransmit=0
IF(thisMatrix%chunks(i)%isInit) THEN
IF(2*thisMatrix%chunks(i)%nnz/3 < thisMatrix%chunks(i)%n) THEN
nBandLocal=SIZE(thisMatrix%chunks(i)%bandIdx)
IF(thisMatrix%chunks(i)%nnz < thisMatrix%chunks(i)%n) THEN
DO j=1,SIZE(thisMatrix%chunks(i)%bandIdx)
nTransmit=nTransmit+SIZE(thisMatrix%chunks(i)%bands(j)%jIdx)
ENDDO
ELSE
nBandLocal=-1
nTransmit=-1
ENDIF
ELSE
nBandLocal=0
ENDIF
! This info must be disseminated through all procs
! Gather to rank i-1 into nBand(:)
CALL MPI_Gather(nBandLocal,1,MPI_INTEGER,nband(1),1,MPI_INTEGER,i-1, &
! Gather to rank i-1 into nRecv(:)
CALL MPI_Gather(nTransmit,1,MPI_INTEGER,nRecv,1,MPI_INTEGER,i-1, &
thisMatrix%comm,mpierr)
! Have rank i-1 allocate
IF(rank == i-1) THEN
nRecv(i)=0
nTransmit=0
DO j=1,SIZE(nRecv)
IF (nRecv(j) > 0) THEN
nTransmit=nTransmit+nRecv(j)
ENDIF
ENDDO
ALLOCATE(thisMatrix%incIdxMap(nTransmit))
ALLOCATE(thisMatrix%incIdxStt(nProc))
ALLOCATE(thisMatrix%incIdxStp(nProc))
thisMatrix%incIdxStt=0
thisMatrix%incIdxStp=0
! Use nTransmit as counter
nTransmit=1
DO j=1,nProc
IF(j /= i) THEN
IF(nBand(j) > 0) THEN
ALLOCATE(thisMatrix%bandSizes(j)%p(nBand(j)))
CALL MPI_Recv(thisMatrix%bandSizes(j)%p(1),nBand(j),MPI_INTEGER, &
j-1, 0, thisMatrix%comm,MPI_STATUS_IGNORE, mpierr)
IF(j == i) THEN
thisMatrix%incIdxStt(j)=0
thisMatrix%incIdxStp(j)=0
ELSE
IF(nBand(j) /= 0) THEN
ALLOCATE(thisMatrix%bandSizes(j)%p(1))
thisMatrix%bandSizes(j)%p(1)=-1
ENDIF
IF(nRecv(j) > 0) THEN
thisMatrix%incIdxStt(j)=nTransmit
thisMatrix%incIdxStp(j)=nTransmit+nRecv(j)-1
! We receive data (and don't want to bother sending idx's at mvmult
! time)
CALL MPI_Recv(thisMatrix%incIdxMap(nTransmit:nTransmit+nRecv(j)-1), &
nRecv(j),MPI_INTEGER,j-1,0,thisMatrix%comm,MPI_STATUS_IGNORE, mpierr)
nTransmit=nTransmit+nRecv(j)
ELSE IF (nRecv(j) == -1) THEN
thisMatrix%incIdxStt(j)=-1
thisMatrix%incIdxStp(j)=-1
ENDIF
ENDIF
ENDDO
ELSE
IF(nBandLocal > 0) THEN
ALLOCATE(bandSizeTmp(nBandLocal))
DO j=1,nBandLocal
bandSizeTmp(j)=SIZE(thisMatrix%chunks(i)%bands(j)%elem)
! Build the index map
IF(nTransmit > 0) THEN
ALLOCATE(idxMapTmp(nTransmit))
stt=1
DO j=1,SIZE(thisMatrix%chunks(i)%bandIdx)
stp=stt+SIZE(thisMatrix%chunks(i)%bands(j)%jIdx)-1
idxMapTmp(stt:stp)=thisMatrix%chunks(i)%bands(j)%jIdx-thisMatrix%chunks(i)%bandIdx(j)
stt=stp+1
ENDDO
CALL MPI_Send(bandSizeTmp(:),nBandLocal,MPI_INTEGER,i-1,0, &
CALL MPI_Send(idxMapTmp,nTransmit,MPI_INTEGER,i-1,0, &
thisMatrix%comm,mpierr)
DEALLOCATE(bandSizeTmp)
DEALLOCATE(idxMapTmp)
ENDIF
ENDIF
ENDDO
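! Illustrative restatement (not code from this commit) of the three-way encoding
! used above: nTransmit tells the receiving rank how many REALs to expect from a
! given chunk, with -1 meaning the whole dense result vector and 0 meaning the
! chunk is empty. Names below are illustrative only.
FUNCTION transmit_count_sketch(chunkNnz,chunkN,bandLens) RESULT(nTransmit)
  IMPLICIT NONE
  INTEGER,INTENT(IN) :: chunkNnz    ! nonzeros in this chunk (0 if uninitialized)
  INTEGER,INTENT(IN) :: chunkN      ! rows spanned by the chunk
  INTEGER,INTENT(IN) :: bandLens(:) ! length of each band's index list
  INTEGER :: nTransmit
  IF(chunkNnz == 0) THEN
    nTransmit=0                     ! empty chunk: nothing is sent
  ELSEIF(chunkNnz < chunkN) THEN
    nTransmit=SUM(bandLens)         ! sparse: send only the gathered band products
  ELSE
    nTransmit=-1                    ! dense enough: send the full result vector
  ENDIF
ENDFUNCTION transmit_count_sketch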
......@@ -1039,12 +1082,9 @@ SUBROUTINE clear_DistributedBandedMatrixType(matrix)
ENDDO
DEALLOCATE(matrix%chunks)
ENDIF
IF(ALLOCATED(matrix%bandSizes)) THEN
DO i=1,SIZE(matrix%bandSizes)
IF(ALLOCATED(matrix%bandSizes(i)%p)) DEALLOCATE(matrix%bandSizes(i)%p)
ENDDO
DEALLOCATE(matrix%bandSizes)
ENDIF
IF(ALLOCATED(matrix%incIdxMap)) DEALLOCATE(matrix%