Native matrices now determine indices for data transfer at assemble-time (2eed2386) · Commits · Futility / Futility

src/MatrixTypes.f90

+50 −78

Original line number	Diff line number	Diff line
		@@ -699,15 +699,13 @@ SUBROUTINE matvec_DistrBandedMatrixType(thisMatrix,x,y,t,ul,d,incx,a,b)
		CHARACTER(LEN=1),INTENT(IN) :: d
		INTEGER(SIK),INTENT(IN) :: incx
		INTEGER(SIK) :: rank,k
		INTEGER(SIK),ALLOCATABLE :: recvIdx(:,:), sendIdx(:,:)
		REAL(SRK),ALLOCATABLE :: recvResult(:,:),sendResult(:,:),tmpProduct(:)
		INTEGER(SIK) :: sendRequests(MATVEC_SLOTS),sendIdxRequests(MATVEC_SLOTS)
		INTEGER(SIK) :: recvRequests(MATVEC_SLOTS),recvIdxRequests(MATVEC_SLOTS)
		INTEGER(SIK) :: sendRequests(MATVEC_SLOTS),recvRequests(MATVEC_SLOTS)
		INTEGER(SIK) :: lowIdx,highIdx,sendCounter,recvCounter
		#ifdef HAVE_MPI
		! Get rank
		INTEGER(SIK) :: ctRecv(MATVEC_SLOTS),srcRank,destRank
		INTEGER(SIK) :: i,idxTmp,cnt,ctDefault,mpierr,nproc
		INTEGER(SIK) :: i,idxTmp,ctDefault,mpierr,nproc
		CALL MPI_Comm_rank(thisMatrix%comm,rank,mpierr)
		CALL MPI_Comm_Size(thisMatrix%comm,nProc,mpierr)
		#else
		@@ -722,16 +720,10 @@ SUBROUTINE matvec_DistrBandedMatrixType(thisMatrix,x,y,t,ul,d,incx,a,b)
		sendCounter=0
		recvCounter=0
		sendRequests=0
		sendIdxRequests=0
		recvRequests=0
		recvIdxRequests=0
		! The recv/sendResult array will sometimes hold it's full length,
		! and sometimes less.
		! The variable "cnt" will be used to determine this at
		! each iteration
		ALLOCATE(recvIdx(thisMatrix%iOffsets(2),MATVEC_SLOTS))
		ALLOCATE(recvResult(thisMatrix%iOffsets(2),MATVEC_SLOTS))
		ALLOCATE(sendIdx(thisMatrix%iOffsets(2),MATVEC_SLOTS))
		ALLOCATE(sendResult(thisMatrix%iOffsets(2),MATVEC_SLOTS))
		ALLOCATE(tmpProduct(thisMatrix%iOffsets(rank+2)- &
		thisMatrix%iOffsets(rank+1)))
		@@ -784,73 +776,61 @@ SUBROUTINE matvec_DistrBandedMatrixType(thisMatrix,x,y,t,ul,d,incx,a,b)
		sendCounter=sendCounter+1
		idxTmp=MOD(sendCounter,MATVEC_SLOTS)+1
		! Check if we can safely write to sendRequests
		CALL pop_send(sendResult,sendIdx,idxTmp,sendRequests,sendIdxRequests)
		CALL pop_send(sendResult,idxTmp,sendRequests)
		CALL BLAS_matvec(THISMATRIX=thisMatrix%chunks(destRank+1),X=x,&
		y=sendResult(1:ctDefault,idxTmp),ALPHA=1.0_SRK,BETA=0.0_SRK)
		CALL MPI_ISend(sendResult(1:ctDefault,idxTmp),ctDefault, &
		MPI_DOUBLE_PRECISION,destRank,0,thisMatrix%comm,sendRequests(idxTmp),mpierr)
		ELSE
		! Send several sparse vectors
		DO k=1,SIZE(thisMatrix%chunks(destRank+1)%bandIdx)
		! Gather and send several sparse vectors
		sendCounter=sendCounter+1
		idxTmp=MOD(sendCounter,MATVEC_SLOTS)+1
		CALL pop_send(sendResult,sendIdx,idxTmp,sendRequests,sendIdxRequests)

		! compute destination indices
		! perform special-case multiplication from single band here
		cnt=SIZE(thisMatrix%chunks(destRank+1)%bands(k)%jIdx)
		sendIdx(1:cnt,idxTmp)=thisMatrix%chunks(destRank+1)%bands(k)%jIdx &
		-thisMatrix%chunks(destRank+1)%bandIdx(k)
		CALL MPI_ISend(sendIdx(1:cnt,idxTmp),cnt,MPI_INTEGER,destRank,0, &
		thisMatrix%comm,sendIdxRequests(idxTmp), mpierr)
		sendResult(1:cnt,idxTmp)=thisMatrix%chunks(destRank+1)%bands(k)%elem &
		CALL pop_send(sendResult,idxTmp,sendRequests)
		lowIdx=1
		DO k=1,SIZE(thisMatrix%chunks(destRank+1)%bandIdx)
		highidx=lowIdx+SIZE(thisMatrix%chunks(destRank+1)%bands(k)%jIdx)-1
		sendResult(lowIdx:highIdx,idxTmp)=thisMatrix%chunks(destRank+1)%bands(k)%elem &
		*x(thisMatrix%chunks(destRank+1)%bands(k)%jIdx)
		CALL MPI_ISend(sendResult(1:cnt,idxTmp),cnt,MPI_DOUBLE_PRECISION, &
		destRank,0,thisMatrix%comm,sendRequests(idxTmp), mpierr)
		lowIdx=highIdx+1
		ENDDO
		CALL MPI_ISend(sendResult(1:highIdx,idxTmp),highIdx,MPI_DOUBLE_PRECISION, &
		destRank,0,thisMatrix%comm,sendRequests(idxTmp), mpierr)
		ENDIF
		ENDIF
		! We might receive data from rank MOD(rank-i,nproc)
		srcRank = MODULO(rank-i,nProc)
		IF (ALLOCATED(thisMatrix%bandSizes(srcRank+1)%p)) THEN
		IF (thisMatrix%bandSizes(srcRank+1)%p(1) < 0) THEN
		IF (thisMatrix%incIdxStt(srcRank+1) == -1) THEN
		! We are receiving a whole vector at once
		recvCounter=recvCounter+1
		idxTmp=MOD(recvCounter,MATVEC_SLOTS)+1
		! If we've filled up the available storage, we need
		! to wait for communication to finish up
		CALL pop_recv(tmpProduct,recvResult,recvIdx,ctRecv,idxTmp, &
		recvRequests, recvIdxRequests)
		CALL pop_recv(tmpProduct,recvResult,thisMatrix, &
		ctRecv,idxTmp,recvRequests)
		ctRecv(MOD(recvCounter,MATVEC_SLOTS)+1)=ctDefault
		CALL MPI_IRecv(recvResult(1:ctDefault,idxTmp), ctDefault, &
		MPI_DOUBLE_PRECISION,srcRank,0,thisMatrix%comm, &
		recvRequests(idxTmp),mpierr)
		ELSE
		! We are receiving multiple sparse chunks
		DO k=1,SIZE(thisMatrix%bandSizes(srcRank+1)%p)
		ELSEIF (thisMatrix%incIdxStt(srcRank+1) > 0) THEN
		! We are receiving multiple sparse chunks gathered together
		recvCounter=recvCounter+1
		idxTmp=MOD(recvCounter,MATVEC_SLOTS)+1
		! If we've filled up the available storage, we need
		! to wait for communication to finish up
		CALL pop_recv(tmpProduct,recvResult,recvIdx,ctRecv,idxTmp, &
		recvRequests, recvIdxRequests)
		ctRecv(idxTmp)=-thisMatrix%bandSizes(srcRank+1)%p(k)
		CALL MPI_IRecv(recvIdx(1:ctRecv(idxTmp),idxTmp),-ctRecv(idxTmp), &
		MPI_INTEGER,srcRank,0,thisMatrix%comm,recvIdxRequests(idxTmp), mpierr)
		CALL MPI_IRecv(recvResult(1:ctRecv(idxTmp),idxTmp),-ctRecv(idxTmp), &
		CALL pop_recv(tmpProduct,recvResult,thisMatrix, &
		ctRecv,idxTmp,recvRequests)
		ctRecv(idxTmp)= -srcRank
		lowIdx=thisMatrix%incIdxStt(srcRank+1)
		highIdx=thisMatrix%incIdxStp(srcRank+1)
		CALL MPI_IRecv(recvResult(1:(highIdx-lowIdx+1),idxTmp),highIdx-lowIdx+1, &
		MPI_DOUBLE_PRECISION,srcRank,0,thisMatrix%comm,recvRequests(idxTmp), mpierr)
		ENDDO
		ENDIF
		ENDIF
		ENDDO
		! We've finished calling irecv. Wait for remaining
		! requests to finish:
		DO idxTmp=1,MATVEC_SLOTS
		CALL pop_recv(tmpProduct, recvResult, recvIdx, ctRecv, idxTmp, &
		recvRequests, recvIdxRequests)
		CALL pop_recv(tmpProduct,recvResult,thisMatrix,ctRecv,idxTmp,recvRequests)
		ENDDO
		DO idxTmp=1,MATVEC_SLOTS
		CALL pop_send(sendResult,sendIdx,idxTmp,sendRequests,sendIdxRequests)
		CALL pop_send(sendResult,idxTmp,sendRequests)
		ENDDO
		#endif

		@@ -912,28 +892,29 @@ ENDSUBROUTINE matvec_DistrBandedMatrixType
		!> @param req The array of active requests for data
		!> @param idxReq The array of requests for indices
		!>
		SUBROUTINE pop_recv(acc,valBuf,idxBuf,ctBuf,idx,req,idxReq)
		SUBROUTINE pop_recv(acc,valBuf,thisMat,ctBuf,idx,req)
		REAL(SRK), INTENT(INOUT) :: acc(:)
		REAL(SRK), INTENT(INOUT) :: valBuf(:,:)
		CLASS(DistributedBandedMatrixType),INTENT(INOUT) :: thisMat
		INTEGER(SIK), INTENT(INOUT) :: ctBuf(MATVEC_SLOTS)
		INTEGER(SIK), INTENT(INOUT) :: idxBuf(:,:)
		INTEGER(SIK), INTENT(IN) :: idx
		INTEGER(SIK), INTENT(INOUT) :: req(MATVEC_SLOTS)
		INTEGER(SIK), INTENT(INOUT) :: idxReq(MATVEC_SLOTS)
		INTEGER(SIK) :: mpierr
		INTEGER(SIK) :: stt,stp,mpierr,i

		IF(req(idx) == 0 .AND. idxReq(idx) == 0) RETURN
		IF(req(idx) == 0) RETURN

		CALL MPI_Wait(req(idx),MPI_STATUS_IGNORE,mpierr)
		IF(ctBuf(idx) > 0) THEN
		acc(1:ctBuf(idx))=acc(1:ctBuf(idx))+valBuf(1:ctBuf(idx),idx)
		ELSE
		CALL MPI_Wait(idxReq(idx),MPI_STATUS_IGNORE,mpierr)
		idxReq(idx)=0
		acc(idxBuf(1:-ctBuf(idx),idx))=acc(idxBuf(1:-ctBuf(idx),idx)) &
		+valBuf(1:-ctBuf(idx),idx)
		stt=thisMat%incIdxStt(-ctBuf(idx)+1)
		stp=thisMat%incIdxStp(-ctBuf(idx)+1)
		DO i=stt,stp
		acc(thisMat%incIdxMap(i))=acc(thisMat%incIdxMap(i))+valBuf(i,idx)
		ENDDO
		ENDIF
		ENDSUBROUTINE pop_recv

		!
		!-------------------------------------------------------------------------------
		!> @brief Helper routine that keeps of data in the send buffer while
		@@ -946,25 +927,16 @@ ENDSUBROUTINE pop_recv
		!> @param req The array of requests for data
		!> @param idxReq The array of requests for indices
		!>
		SUBROUTINE pop_send(valBuf, idxBuf, idx, req, idxReq)
		SUBROUTINE pop_send(valBuf, idx, req)
		REAL(SRK), INTENT(INOUT) :: valBuf(:,:)
		!> The send buffer for indices
		INTEGER(SIK), INTENT(INOUT) :: idxBuf(:,:)
		!> The buffer slot to pop
		INTEGER(SIK), INTENT(IN) :: idx
		!> The array of requests for data
		INTEGER(SIK), INTENT(INOUT) :: req(MATVEC_SLOTS)
		!> The array of requests for indices
		INTEGER(SIK), INTENT(INOUT) :: idxReq(MATVEC_SLOTS)
		INTEGER(SIK) :: mpierr

		IF(req(idx) == 0 .AND. idxReq(idx) == 0) RETURN

		IF(req(idx) == 0) RETURN
		CALL MPI_Wait(req(idx),MPI_STATUS_IGNORE,mpierr)
		IF(idxReq(idx) /= 0) THEN
		CALL MPI_Wait(idxReq(idx),MPI_STATUS_IGNORE,mpierr)
		idxReq(idx)=0
		ENDIF
		ENDSUBROUTINE pop_send
		#endif
		!

src/MatrixTypes_Native.f90

+63 −39

Original line number	Diff line number	Diff line
		@@ -132,11 +132,6 @@ TYPE Band
		REAL(SRK), ALLOCATABLE :: elem(:)
		ENDTYPE Band

		!> @brief Integer array used for 'ragged' storage of band contents
		TYPE IntPtr
		INTEGER(SIK),ALLOCATABLE :: p(:)
		ENDTYPE IntPtr

		!> @brief The basic banded matrix type
		TYPE,EXTENDS(MatrixType) :: BandedMatrixType
		!> Map of band indices stored (-m to n)
		@@ -200,7 +195,9 @@ TYPE,EXTENDS(DistributedMatrixType) :: DistributedBandedMatrixType
		!> Block size (smallest indivisble unit)
		INTEGER(SIK) :: blockSize
		!> Array of band sizes used to determine optimal communication
		TYPE(IntPtr), ALLOCATABLE :: bandSizes(:)
		INTEGER(SIK), ALLOCATABLE :: incIdxMap(:)
		INTEGER(SIK), ALLOCATABLE :: incIdxStt(:)
		INTEGER(SIK), ALLOCATABLE :: incIdxStp(:)
		!> Temporary containers used before (and deallocated after) assembly
		INTEGER(SIK), ALLOCATABLE :: iTmp(:),jTmp(:)
		REAL(SRK),ALLOCATABLE :: elemTmp(:)
		@@ -575,7 +572,6 @@ SUBROUTINE init_DistributedBandedMatrixParam(matrix,Params)
		matrix%nnz=nnz
		matrix%blockSize = blockSize
		matrix%comm=commID
		ALLOCATE(matrix%bandSizes(nProc))
		ENDIF
		ELSE
		CALL eMatrixType%raiseError('Incorrect call to '// &
		@@ -764,14 +760,14 @@ SUBROUTINE assemble_DistributedBandedMatrixType(thisMatrix,ierr)
		INTEGER(SIK),INTENT(OUT),OPTIONAL :: ierr
		#ifdef HAVE_MPI
		TYPE(ParamType) :: bandedPList
		INTEGER(SIK) :: mpierr, rank, nproc, i, j, nBandLocal
		INTEGER(SIK),ALLOCATABLE :: nnzPerChunk(:),nband(:),bandSizeTmp(:)
		INTEGER(SIK) :: mpierr,rank,nproc,i,j,nTransmit,stt,stp
		INTEGER(SIK),ALLOCATABLE :: nnzPerChunk(:),nRecv(:),idxMapTmp(:)

		REQUIRE(thisMatrix%isInit)
		CALL MPI_Comm_rank(thisMatrix%comm,rank,mpierr)
		CALL MPI_Comm_size(thisMatrix%comm,nproc,mpierr)
		ALLOCATE(nnzPerChunk(nProc))
		ALLOCATE(nBand(nProc))
		ALLOCATE(nRecv(nProc))
		nnzPerChunk=0
		DO i=1,thisMatrix%nLocal
		DO j=1,nproc
		@@ -804,45 +800,76 @@ SUBROUTINE assemble_DistributedBandedMatrixType(thisMatrix,ierr)
		CALL thisMatrix%chunks(i)%assemble()
		CALL bandedPList%clear()
		ENDIF
		! Set nbandlocal (to be gathered into nband

		! Decide the sending method to use. 'Full' chunks will send their whole
		! contents while 'sparse' chunks will send band products.
		! Set nTransmit to the number of REAL's to be sent from that processor to
		! this one.
		! >0: This many
		! 0: None (chunk empty)
		! -1: All
		nTransmit=0
		IF(thisMatrix%chunks(i)%isInit) THEN
		IF(2*thisMatrix%chunks(i)%nnz/3 < thisMatrix%chunks(i)%n) THEN
		nBandLocal=SIZE(thisMatrix%chunks(i)%bandIdx)
		DO j=1,SIZE(thisMatrix%chunks(i)%bandIdx)
		nTransmit=nTransmit+SIZE(thisMatrix%chunks(i)%bands(j)%jIdx)
		ENDDO
		ELSE
		nBandLocal=-1
		nTransmit=-1
		ENDIF
		ELSE
		nBandLocal=0
		ENDIF
		! This info must be disseminated through all procs

		! Gather to rank i-1 into nBand(:)
		CALL MPI_Gather(nBandLocal,1,MPI_INTEGER,nband(1),1,MPI_INTEGER,i-1, &
		! Gather to rank i-1 into nRecv(:)
		CALL MPI_Gather(nTransmit,1,MPI_INTEGER,nRecv,1,MPI_INTEGER,i-1, &
		thisMatrix%comm,mpierr)
		! Have rank i-1 allocate
		IF(rank == i-1) THEN
		nTransmit=0
		DO j=1,SIZE(nRecv)
		IF (nRecv(j) > 0) THEN
		nTransmit=nTransmit+nRecv(j)
		ENDIF
		ENDDO
		ALLOCATE(thisMatrix%incIdxMap(nTransmit))
		ALLOCATE(thisMatrix%incIdxStt(nProc))
		ALLOCATE(thisMatrix%incIdxStp(nProc))
		thisMatrix%incIdxStt=0
		thisMatrix%incIdxStp=0
		DO j=1,nProc
		IF(j /= i) THEN
		IF(nBand(j) > 0) THEN
		ALLOCATE(thisMatrix%bandSizes(j)%p(nBand(j)))
		CALL MPI_Recv(thisMatrix%bandSizes(j)%p(1),nBand(j),MPI_INTEGER, &
		j-1, 0, thisMatrix%comm,MPI_STATUS_IGNORE, mpierr)
		! Use nTransmit as counter
		nTransmit=1
		IF(j == i) THEN
		thisMatrix%incIdxStt(j)=-1
		thisMatrix%incIdxStp(j)=-1
		ELSE
		IF(nBand(j) /= 0) THEN
		ALLOCATE(thisMatrix%bandSizes(j)%p(1))
		thisMatrix%bandSizes(j)%p(1)=-1
		ENDIF
		IF(nRecv(j) > 0) THEN
		thisMatrix%incIdxStt(j)=nTransmit
		thisMatrix%incIdxStp(j)=nTransmit+nRecv(j)-1
		! We receive data (and don't want to bother sending idx's at mvmult
		! time)
		CALL MPI_Recv(thisMatrix%incIdxMap(nTransmit),nRecv(j),MPI_INTEGER, &
		j-1,0,thisMatrix%comm,MPI_STATUS_IGNORE, mpierr)
		nTransmit=nTransmit+nRecv(j)
		ELSE IF (nRecv(j) == -1) THEN
		thisMatrix%incIdxStt(j)=-1
		thisMatrix%incIdxStp(j)=-1
		ENDIF
		ENDIF
		ENDDO
		ELSE
		IF(nBandLocal > 0) THEN
		ALLOCATE(bandSizeTmp(nBandLocal))
		DO j=1,nBandLocal
		bandSizeTmp(j)=SIZE(thisMatrix%chunks(i)%bands(j)%elem)
		! Build the index map
		IF(nTransmit > 0) THEN
		ALLOCATE(idxMapTmp(nTransmit))
		stt=1
		DO j=1,SIZE(thisMatrix%chunks(i)%bandIdx)
		stp=stt+SIZE(thisMatrix%chunks(i)%bands(j)%jIdx)-1
		idxMapTmp(stt:stp)=thisMatrix%chunks(i)%bands(j)%jIdx-thisMatrix%chunks(i)%bandIdx(j)
		stt=stp+1
		ENDDO
		CALL MPI_Send(bandSizeTmp(:),nBandLocal,MPI_INTEGER,i-1,0, &
		CALL MPI_Send(idxMapTmp,nTransmit,MPI_INTEGER,i-1,0, &
		thisMatrix%comm,mpierr)
		DEALLOCATE(bandSizeTmp)
		DEALLOCATE(idxMapTmp)
		ENDIF
		ENDIF
		ENDDO
		@@ -1039,12 +1066,9 @@ SUBROUTINE clear_DistributedBandedMatrixType(matrix)
		ENDDO
		DEALLOCATE(matrix%chunks)
		ENDIF
		IF(ALLOCATED(matrix%bandSizes)) THEN
		DO i=1,SIZE(matrix%bandSizes)
		IF(ALLOCATED(matrix%bandSizes(i)%p)) DEALLOCATE(matrix%bandSizes(i)%p)
		ENDDO
		DEALLOCATE(matrix%bandSizes)
		ENDIF
		IF(ALLOCATED(matrix%incIdxMap)) DEALLOCATE(matrix%incIdxMap)
		IF(ALLOCATED(matrix%incIdxStt)) DEALLOCATE(matrix%incIdxStt)
		IF(ALLOCATED(matrix%incIdxStp)) DEALLOCATE(matrix%incIdxStp)
		IF(ALLOCATED(matrix%iOffsets)) DEALLOCATE(matrix%iOffsets)
		IF(ALLOCATED(matrix%jOffsets)) DEALLOCATE(matrix%jOffsets)
		IF(ALLOCATED(matrix%iTmp)) DEALLOCATE(matrix%iTmp)

src/VectorTypes.f90

+6 −0

Original line number	Diff line number	Diff line
		@@ -420,6 +420,9 @@ SUBROUTINE VectorResembleAllocScal(dest, source, inparams)
		ENDIF
		#ifdef HAVE_MPI
		TYPE IS(NativeDistributedVectorType)
		IF(.NOT. params%has("VectorType->nlocal")) THEN
		CALL params%add("VectorType->nlocal",SIZE(source%b))
		ENDIF
		IF(.NOT. params%has("VectorType->chunkSize")) THEN
		CALL params%add("VectorType->chunkSize",source%chunkSize)
		ENDIF
		@@ -497,6 +500,9 @@ SUBROUTINE VectorResembleAllocArray(dest, source, nvec, inparams)
		ENDIF
		#ifdef HAVE_MPI
		TYPE IS(NativeDistributedVectorType)
		IF(.NOT. params%has("VectorType->nlocal")) THEN
		CALL params%add("VectorType->nlocal",SIZE(source%b))
		ENDIF
		IF(.NOT. params%has("VectorType->chunkSize")) THEN
		CALL params%add("VectorType->chunkSize",source%chunkSize)
		ENDIF