src/MatrixTypes.f90 (+62 −34)

@@ -701,7 +701,7 @@ SUBROUTINE matvec_DistrBandedMatrixType(thisMatrix,x,y,t,ul,d,incx,a,b)
     INTEGER(SIK) :: rank,k
     REAL(SRK),ALLOCATABLE :: recvResult(:,:),sendResult(:,:),tmpProduct(:)
     INTEGER(SIK) :: sendRequests(MATVEC_SLOTS),recvRequests(MATVEC_SLOTS)
-    INTEGER(SIK) :: lowIdx,highIdx,sendCounter,recvCounter
+    INTEGER(SIK) :: lowIdx,highIdx
 #ifdef HAVE_MPI
     ! Get rank
     INTEGER(SIK) :: ctRecv(MATVEC_SLOTS),srcRank,destRank
@@ -717,10 +717,8 @@ SUBROUTINE matvec_DistrBandedMatrixType(thisMatrix,x,y,t,ul,d,incx,a,b)
     ! This allows one to be used for computation and the rest for
     ! communication.
     ! Which one to write to will be determined by *Counter MOD MATVEC_SLOTS
-    sendCounter=0
-    recvCounter=0
-    sendRequests=0
-    recvRequests=0
+    sendRequests=MPI_REQUEST_NULL
+    recvRequests=MPI_REQUEST_NULL
     ! The recv/sendResult array will sometimes hold it's full length,
     ! and sometimes less.
     ALLOCATE(recvResult(thisMatrix%iOffsets(2),MATVEC_SLOTS))
@@ -772,9 +770,7 @@
       IF (thismatrix%chunks(destRank+1)%isInit) THEN
         ! Decide whether to send whole vector or multiple sparse
-        IF (2*thisMatrix%chunks(destRank+1)%nnz/3 >= thisMatrix%chunks(destRank+1)%n) THEN
-          sendCounter=sendCounter+1
-          idxTmp=MOD(sendCounter,MATVEC_SLOTS)+1
+        IF (thisMatrix%chunks(destRank+1)%nnz >= thisMatrix%chunks(destRank+1)%n) THEN
           ! Check if we can safely write to sendRequests
           CALL pop_send(sendResult,idxTmp,sendRequests)
           CALL BLAS_matvec(THISMATRIX=thisMatrix%chunks(destRank+1),X=x,&
@@ -783,8 +779,6 @@
             MPI_DOUBLE_PRECISION,destRank,0,thisMatrix%comm,sendRequests(idxTmp),mpierr)
         ELSE
           ! Gather and send several sparse vectors
-          sendCounter=sendCounter+1
-          idxTmp=MOD(sendCounter,MATVEC_SLOTS)+1
           CALL pop_send(sendResult,idxTmp,sendRequests)
           lowIdx=1
           DO k=1,SIZE(thisMatrix%chunks(destRank+1)%bandIdx)
@@ -801,23 +795,19 @@
       srcRank = MODULO(rank-i,nProc)
       IF (thisMatrix%incIdxStt(srcRank+1) == -1) THEN
         ! We are receiving a whole vector at once
-        recvCounter=recvCounter+1
-        idxTmp=MOD(recvCounter,MATVEC_SLOTS)+1
         ! If we've filled up the available storage, we need
         ! to wait for communication to finish up
         CALL pop_recv(tmpProduct,recvResult,thisMatrix, &
            ctRecv,idxTmp,recvRequests)
-        ctRecv(MOD(recvCounter,MATVEC_SLOTS)+1)=ctDefault
-        CALL MPI_IRecv(recvResult(1:ctDefault,idxTmp), ctDefault, &
+        ctRecv(idxTmp)=SIZE(tmpProduct)
+        CALL MPI_IRecv(recvResult(1:ctRecv(idxTmp),idxTmp), ctRecv(idxTmp), &
           MPI_DOUBLE_PRECISION,srcRank,0,thisMatrix%comm, &
           recvRequests(idxTmp),mpierr)
       ELSEIF (thisMatrix%incIdxStt(srcRank+1) > 0) THEN
         ! We are receiving multiple sparse chunks gathered together
-        recvCounter=recvCounter+1
-        idxTmp=MOD(recvCounter,MATVEC_SLOTS)+1
         CALL pop_recv(tmpProduct,recvResult,thisMatrix, &
           ctRecv,idxTmp,recvRequests)
-        ctRecv(idxTmp)= -srcRank
+        ctRecv(idxTmp)= -srcRank-1
         lowIdx=thisMatrix%incIdxStt(srcRank+1)
         highIdx=thisMatrix%incIdxStp(srcRank+1)
         CALL MPI_IRecv(recvResult(1:(highIdx-lowIdx+1),idxTmp),highIdx-lowIdx+1, &
@@ -826,11 +816,13 @@
     ENDDO
     ! We've finished calling irecv. Wait for remaining
     ! requests to finish:
-    DO idxTmp=1,MATVEC_SLOTS
-      CALL pop_recv(tmpProduct,recvResult,thisMatrix,ctRecv,idxTmp,recvRequests)
+    DO k=1,MATVEC_SLOTS
+      idxTmp=k
+      CALL pop_recv(tmpProduct,recvResult,thisMatrix,ctRecv,idxTmp,recvRequests,.TRUE.)
     ENDDO
-    DO idxTmp=1,MATVEC_SLOTS
-      CALL pop_send(sendResult,idxTmp,sendRequests)
+    DO k=1,MATVEC_SLOTS
+      idxTmp=k
+      CALL pop_send(sendResult,idxTmp,sendRequests,.TRUE.)
     ENDDO
 #endif
@@ -892,25 +884,43 @@ ENDSUBROUTINE matvec_DistrBandedMatrixType
 !> @param req The array of active requests for data
 !> @param idxReq The array of requests for indices
 !>
-    SUBROUTINE pop_recv(acc,valBuf,thisMat,ctBuf,idx,req)
+    SUBROUTINE pop_recv(acc,valBuf,thisMat,ctBuf,idx,req,f)
       REAL(SRK), INTENT(INOUT) :: acc(:)
       REAL(SRK), INTENT(INOUT) :: valBuf(:,:)
       CLASS(DistributedBandedMatrixType),INTENT(INOUT) :: thisMat
       INTEGER(SIK), INTENT(INOUT) :: ctBuf(MATVEC_SLOTS)
-      INTEGER(SIK), INTENT(IN) :: idx
+      INTEGER(SIK), INTENT(INOUT) :: idx
       INTEGER(SIK), INTENT(INOUT) :: req(MATVEC_SLOTS)
-      INTEGER(SIK) :: stt,stp,mpierr,i
-      IF(req(idx) == 0) RETURN
+      LOGICAL(SBK),OPTIONAL,INTENT(IN) :: f
+      INTEGER(SIK) :: stt,stp,mpierr,i,rank
+      LOGICAL(SBK) :: force
+
+      CALL MPI_Comm_Rank(thisMat%comm,rank,mpierr)
+      force=.FALSE.
+      IF (PRESENT(f)) force=f
+
+      IF (force) THEN
+        IF (req(idx) == MPI_REQUEST_NULL) RETURN
+        CALL MPI_Wait(req(idx),MPI_STATUS_IGNORE,mpierr)
+      ELSE
+        IF (ANY(req == MPI_REQUEST_NULL)) THEN
+          DO i=1,SIZE(req)
+            IF (req(i) == MPI_REQUEST_NULL) THEN
+              idx=i
+              RETURN
+            ENDIF
+          ENDDO
+        ELSE
+          CALL MPI_WaitAny(SIZE(req),req,idx,MPI_STATUS_IGNORE,mpierr)
+        ENDIF
+      ENDIF
       IF(ctBuf(idx) > 0) THEN
         acc(1:ctBuf(idx))=acc(1:ctBuf(idx))+valBuf(1:ctBuf(idx),idx)
       ELSE
-        stt=thisMat%incIdxStt(-ctBuf(idx)+1)
-        stp=thisMat%incIdxStp(-ctBuf(idx)+1)
+        stt=thisMat%incIdxStt(-ctBuf(idx))
+        stp=thisMat%incIdxStp(-ctBuf(idx))
         DO i=stt,stp
-          acc(thisMat%incIdxMap(i))=acc(thisMat%incIdxMap(i))+valBuf(i,idx)
+          acc(thisMat%incIdxMap(i))=acc(thisMat%incIdxMap(i))+valBuf(i-stt+1,idx)
         ENDDO
       ENDIF
     ENDSUBROUTINE pop_recv
@@ -927,16 +937,34 @@ ENDSUBROUTINE pop_recv
 !> @param req The array of requests for data
 !> @param idxReq The array of requests for indices
 !>
-    SUBROUTINE pop_send(valBuf, idx, req)
+    SUBROUTINE pop_send(valBuf,idx,req,f)
       REAL(SRK), INTENT(INOUT) :: valBuf(:,:)
       !> The buffer slot to pop
-      INTEGER(SIK), INTENT(IN) :: idx
+      INTEGER(SIK), INTENT(INOUT) :: idx
       !> The array of requests for data
       INTEGER(SIK), INTENT(INOUT) :: req(MATVEC_SLOTS)
-      INTEGER(SIK) :: mpierr
+      LOGICAL(SBK),OPTIONAL,INTENT(IN) :: f
+      INTEGER(SIK) :: mpierr,i
+      LOGICAL(SBK) :: force
-      IF(req(idx) == 0) RETURN
+      force=.FALSE.
+      IF (PRESENT(f)) force=f
+
+      !IF(req(idx) == MPI_REQUEST_NULL) RETURN
+      IF (force) THEN
+        IF (req(idx) == MPI_REQUEST_NULL) RETURN
+        CALL MPI_Wait(req(idx),MPI_STATUS_IGNORE,mpierr)
+      ELSE
+        IF (ANY(req == MPI_REQUEST_NULL)) THEN
+          DO i=1,SIZE(req)
+            IF (req(i) == MPI_REQUEST_NULL) THEN
+              idx=i
+              RETURN
+            ENDIF
+          ENDDO
+        ELSE
+          CALL MPI_WaitAny(SIZE(req),req,idx,MPI_STATUS_IGNORE,mpierr)
+        ENDIF
+      ENDIF
     ENDSUBROUTINE pop_send
 #endif
 !
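The core change in matvec_DistrBandedMatrixType is replacing the round-robin sendCounter/recvCounter slot selection with request-driven slot management: every request starts as MPI_REQUEST_NULL (the "free" sentinel), pop_send/pop_recv either hand back an already-free slot or block in MPI_WaitAny until one completes, and a final forced pass (f=.TRUE.) drains whatever is still in flight with MPI_Wait. The standalone program below is a minimal sketch of that pattern, not code from this PR: the names slot_pool_sketch, NSLOTS, NMSG, and acquire_slot are illustrative, and plain INTEGER/DOUBLE PRECISION stand in for the library's SIK/SRK kinds. Run with at least two ranks.

PROGRAM slot_pool_sketch
  USE mpi
  IMPLICIT NONE
  INTEGER,PARAMETER :: NSLOTS=4,NMSG=16
  INTEGER :: requests(NSLOTS),rank,nproc,mpierr,i,slot
  DOUBLE PRECISION :: sendBuf(NSLOTS),recvVal

  CALL MPI_Init(mpierr)
  CALL MPI_Comm_rank(MPI_COMM_WORLD,rank,mpierr)
  CALL MPI_Comm_size(MPI_COMM_WORLD,nproc,mpierr)
  requests=MPI_REQUEST_NULL   ! all slots begin free, as in the new matvec setup

  IF(nproc >= 2) THEN
    IF(rank == 0) THEN
      DO i=1,NMSG
        ! Claim a buffer slot; blocks only when every slot is still in flight
        slot=acquire_slot(requests)
        sendBuf(slot)=REAL(i,KIND(sendBuf))
        CALL MPI_Isend(sendBuf(slot),1,MPI_DOUBLE_PRECISION,1,0, &
            MPI_COMM_WORLD,requests(slot),mpierr)
      ENDDO
      ! Forced drain, analogous to pop_send(...,.TRUE.) at the end of matvec
      DO slot=1,NSLOTS
        IF(requests(slot) /= MPI_REQUEST_NULL) &
            CALL MPI_Wait(requests(slot),MPI_STATUS_IGNORE,mpierr)
      ENDDO
    ELSEIF(rank == 1) THEN
      DO i=1,NMSG
        CALL MPI_Recv(recvVal,1,MPI_DOUBLE_PRECISION,0,0, &
            MPI_COMM_WORLD,MPI_STATUS_IGNORE,mpierr)
      ENDDO
    ENDIF
  ENDIF
  CALL MPI_Finalize(mpierr)

CONTAINS

  FUNCTION acquire_slot(req) RESULT(idx)
    INTEGER,INTENT(INOUT) :: req(:)
    INTEGER :: idx,ierr
    ! Prefer a slot already marked free by MPI_REQUEST_NULL; otherwise
    ! MPI_Waitany blocks until some request completes, resets that entry
    ! to MPI_REQUEST_NULL, and returns its (1-based) index.
    DO idx=1,SIZE(req)
      IF(req(idx) == MPI_REQUEST_NULL) RETURN
    ENDDO
    CALL MPI_Waitany(SIZE(req),req,idx,MPI_STATUS_IGNORE,ierr)
  ENDFUNCTION acquire_slot
ENDPROGRAM slot_pool_sketch

A design note on the related ctRecv change above: storing -srcRank-1 instead of -srcRank keeps the sparse-chunk marker strictly negative even for source rank 0, so it can never collide with a positive dense count (or with 0 in an untouched slot), and pop_recv decodes it directly as the 1-based index into incIdxStt/incIdxStp.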
src/MatrixTypes_Native.f90 (+8 −7)

@@ -810,7 +810,7 @@ SUBROUTINE assemble_DistributedBandedMatrixType(thisMatrix,ierr)
       ! -1: All
       nTransmit=0
       IF(thisMatrix%chunks(i)%isInit) THEN
-        IF(2*thisMatrix%chunks(i)%nnz/3 < thisMatrix%chunks(i)%n) THEN
+        IF(thisMatrix%chunks(i)%nnz < thisMatrix%chunks(i)%n) THEN
           DO j=1,SIZE(thisMatrix%chunks(i)%bandIdx)
             nTransmit=nTransmit+SIZE(thisMatrix%chunks(i)%bands(j)%jIdx)
           ENDDO
@@ -825,6 +825,7 @@
         thisMatrix%comm,mpierr)
       ! Have rank i-1 allocate
       IF(rank == i-1) THEN
+        nRecv(i)=0
         nTransmit=0
         DO j=1,SIZE(nRecv)
           IF (nRecv(j) > 0) THEN
@@ -836,20 +837,20 @@
         ALLOCATE(thisMatrix%incIdxStp(nProc))
         thisMatrix%incIdxStt=0
         thisMatrix%incIdxStp=0
-        DO j=1,nProc
         ! Use nTransmit as counter
         nTransmit=1
+        DO j=1,nProc
           IF(j == i) THEN
-            thisMatrix%incIdxStt(j)=-1
-            thisMatrix%incIdxStp(j)=-1
+            thisMatrix%incIdxStt(j)=0
+            thisMatrix%incIdxStp(j)=0
           ELSE IF(nRecv(j) > 0) THEN
             thisMatrix%incIdxStt(j)=nTransmit
             thisMatrix%incIdxStp(j)=nTransmit+nRecv(j)-1
             ! We receive data (and don't want to bother sending idx's at mvmult
             ! time)
-            CALL MPI_Recv(thisMatrix%incIdxMap(nTransmit),nRecv(j),MPI_INTEGER, &
-              j-1,0,thisMatrix%comm,MPI_STATUS_IGNORE, mpierr)
+            CALL MPI_Recv(thisMatrix%incIdxMap(nTransmit:nTransmit+nRecv(j)-1), &
+              nRecv(j),MPI_INTEGER,j-1,0,thisMatrix%comm,MPI_STATUS_IGNORE, mpierr)
             nTransmit=nTransmit+nRecv(j)
           ELSE IF (nRecv(j) == -1) THEN
             thisMatrix%incIdxStt(j)=-1
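On the assemble side, incIdxMap is laid out as a single packed array: each peer that sends sparse indices gets one contiguous segment, recorded by incIdxStt/incIdxStp, with 0 now marking "no segment" for the local rank and the running offset nTransmit initialized once before the loop instead of being reset on every iteration (the bug this hunk fixes). Below is a small, self-contained sketch of that bookkeeping only; the names pos, stt, stp, map and the literal counts are made up for illustration, and the array assignment stands in for the MPI_Recv that fills each segment in the real routine.

PROGRAM pack_segments_sketch
  IMPLICIT NONE
  INTEGER,PARAMETER :: nProc=3
  INTEGER :: nRecv(nProc)=[2,0,3]   ! counts gathered over MPI in the real code
  INTEGER :: stt(nProc),stp(nProc),map(5),pos,j

  stt=0                              ! 0 means "no sparse data from this peer"
  stp=0
  pos=1                              ! running offset ("nTransmit" in the diff)
  DO j=1,nProc
    IF(nRecv(j) > 0) THEN
      stt(j)=pos
      stp(j)=pos+nRecv(j)-1
      ! The real code fills map(pos:pos+nRecv(j)-1) via MPI_Recv with an
      ! explicit array slice; here we just tag the segment with its owner.
      map(stt(j):stp(j))=j
      pos=pos+nRecv(j)
    ENDIF
  ENDDO
  PRINT '(A,*(I3))','stt:',stt       ! stt: 1 0 3
  PRINT '(A,*(I3))','stp:',stp       ! stp: 2 0 5
  PRINT '(A,*(I3))','map:',map       ! map: 1 1 3 3 3
ENDPROGRAM pack_segments_sketch

The explicit slice in the new MPI_Recv call serves the same purpose as the slice above: it states exactly which nRecv(j) elements of incIdxMap the message fills, rather than relying on the start-element-plus-count convention.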