Loading src/MatrixTypes.f90 +50 −78 Original line number Diff line number Diff line Loading @@ -699,15 +699,13 @@ SUBROUTINE matvec_DistrBandedMatrixType(thisMatrix,x,y,t,ul,d,incx,a,b) CHARACTER(LEN=1),INTENT(IN) :: d INTEGER(SIK),INTENT(IN) :: incx INTEGER(SIK) :: rank,k INTEGER(SIK),ALLOCATABLE :: recvIdx(:,:), sendIdx(:,:) REAL(SRK),ALLOCATABLE :: recvResult(:,:),sendResult(:,:),tmpProduct(:) INTEGER(SIK) :: sendRequests(MATVEC_SLOTS),sendIdxRequests(MATVEC_SLOTS) INTEGER(SIK) :: recvRequests(MATVEC_SLOTS),recvIdxRequests(MATVEC_SLOTS) INTEGER(SIK) :: sendRequests(MATVEC_SLOTS),recvRequests(MATVEC_SLOTS) INTEGER(SIK) :: lowIdx,highIdx,sendCounter,recvCounter #ifdef HAVE_MPI ! Get rank INTEGER(SIK) :: ctRecv(MATVEC_SLOTS),srcRank,destRank INTEGER(SIK) :: i,idxTmp,cnt,ctDefault,mpierr,nproc INTEGER(SIK) :: i,idxTmp,ctDefault,mpierr,nproc CALL MPI_Comm_rank(thisMatrix%comm,rank,mpierr) CALL MPI_Comm_Size(thisMatrix%comm,nProc,mpierr) #else Loading @@ -722,16 +720,10 @@ SUBROUTINE matvec_DistrBandedMatrixType(thisMatrix,x,y,t,ul,d,incx,a,b) sendCounter=0 recvCounter=0 sendRequests=0 sendIdxRequests=0 recvRequests=0 recvIdxRequests=0 ! The recv/sendResult array will sometimes hold it's full length, ! and sometimes less. ! The variable "cnt" will be used to determine this at ! each iteration ALLOCATE(recvIdx(thisMatrix%iOffsets(2),MATVEC_SLOTS)) ALLOCATE(recvResult(thisMatrix%iOffsets(2),MATVEC_SLOTS)) ALLOCATE(sendIdx(thisMatrix%iOffsets(2),MATVEC_SLOTS)) ALLOCATE(sendResult(thisMatrix%iOffsets(2),MATVEC_SLOTS)) ALLOCATE(tmpProduct(thisMatrix%iOffsets(rank+2)- & thisMatrix%iOffsets(rank+1))) Loading Loading @@ -784,73 +776,61 @@ SUBROUTINE matvec_DistrBandedMatrixType(thisMatrix,x,y,t,ul,d,incx,a,b) sendCounter=sendCounter+1 idxTmp=MOD(sendCounter,MATVEC_SLOTS)+1 ! Check if we can safely write to sendRequests CALL pop_send(sendResult,sendIdx,idxTmp,sendRequests,sendIdxRequests) CALL pop_send(sendResult,idxTmp,sendRequests) CALL BLAS_matvec(THISMATRIX=thisMatrix%chunks(destRank+1),X=x,& y=sendResult(1:ctDefault,idxTmp),ALPHA=1.0_SRK,BETA=0.0_SRK) CALL MPI_ISend(sendResult(1:ctDefault,idxTmp),ctDefault, & MPI_DOUBLE_PRECISION,destRank,0,thisMatrix%comm,sendRequests(idxTmp),mpierr) ELSE ! Send several sparse vectors DO k=1,SIZE(thisMatrix%chunks(destRank+1)%bandIdx) ! Gather and send several sparse vectors sendCounter=sendCounter+1 idxTmp=MOD(sendCounter,MATVEC_SLOTS)+1 CALL pop_send(sendResult,sendIdx,idxTmp,sendRequests,sendIdxRequests) ! compute destination indices ! perform special-case multiplication from single band here cnt=SIZE(thisMatrix%chunks(destRank+1)%bands(k)%jIdx) sendIdx(1:cnt,idxTmp)=thisMatrix%chunks(destRank+1)%bands(k)%jIdx & -thisMatrix%chunks(destRank+1)%bandIdx(k) CALL MPI_ISend(sendIdx(1:cnt,idxTmp),cnt,MPI_INTEGER,destRank,0, & thisMatrix%comm,sendIdxRequests(idxTmp), mpierr) sendResult(1:cnt,idxTmp)=thisMatrix%chunks(destRank+1)%bands(k)%elem & CALL pop_send(sendResult,idxTmp,sendRequests) lowIdx=1 DO k=1,SIZE(thisMatrix%chunks(destRank+1)%bandIdx) highidx=lowIdx+SIZE(thisMatrix%chunks(destRank+1)%bands(k)%jIdx)-1 sendResult(lowIdx:highIdx,idxTmp)=thisMatrix%chunks(destRank+1)%bands(k)%elem & *x(thisMatrix%chunks(destRank+1)%bands(k)%jIdx) CALL MPI_ISend(sendResult(1:cnt,idxTmp),cnt,MPI_DOUBLE_PRECISION, & destRank,0,thisMatrix%comm,sendRequests(idxTmp), mpierr) lowIdx=highIdx+1 ENDDO CALL MPI_ISend(sendResult(1:highIdx,idxTmp),highIdx,MPI_DOUBLE_PRECISION, & destRank,0,thisMatrix%comm,sendRequests(idxTmp), mpierr) ENDIF ENDIF ! We might receive data from rank MOD(rank-i,nproc) srcRank = MODULO(rank-i,nProc) IF (ALLOCATED(thisMatrix%bandSizes(srcRank+1)%p)) THEN IF (thisMatrix%bandSizes(srcRank+1)%p(1) < 0) THEN IF (thisMatrix%incIdxStt(srcRank+1) == -1) THEN ! We are receiving a whole vector at once recvCounter=recvCounter+1 idxTmp=MOD(recvCounter,MATVEC_SLOTS)+1 ! If we've filled up the available storage, we need ! to wait for communication to finish up CALL pop_recv(tmpProduct,recvResult,recvIdx,ctRecv,idxTmp, & recvRequests, recvIdxRequests) CALL pop_recv(tmpProduct,recvResult,thisMatrix, & ctRecv,idxTmp,recvRequests) ctRecv(MOD(recvCounter,MATVEC_SLOTS)+1)=ctDefault CALL MPI_IRecv(recvResult(1:ctDefault,idxTmp), ctDefault, & MPI_DOUBLE_PRECISION,srcRank,0,thisMatrix%comm, & recvRequests(idxTmp),mpierr) ELSE ! We are receiving multiple sparse chunks DO k=1,SIZE(thisMatrix%bandSizes(srcRank+1)%p) ELSEIF (thisMatrix%incIdxStt(srcRank+1) > 0) THEN ! We are receiving multiple sparse chunks gathered together recvCounter=recvCounter+1 idxTmp=MOD(recvCounter,MATVEC_SLOTS)+1 ! If we've filled up the available storage, we need ! to wait for communication to finish up CALL pop_recv(tmpProduct,recvResult,recvIdx,ctRecv,idxTmp, & recvRequests, recvIdxRequests) ctRecv(idxTmp)=-thisMatrix%bandSizes(srcRank+1)%p(k) CALL MPI_IRecv(recvIdx(1:ctRecv(idxTmp),idxTmp),-ctRecv(idxTmp), & MPI_INTEGER,srcRank,0,thisMatrix%comm,recvIdxRequests(idxTmp), mpierr) CALL MPI_IRecv(recvResult(1:ctRecv(idxTmp),idxTmp),-ctRecv(idxTmp), & CALL pop_recv(tmpProduct,recvResult,thisMatrix, & ctRecv,idxTmp,recvRequests) ctRecv(idxTmp)= -srcRank lowIdx=thisMatrix%incIdxStt(srcRank+1) highIdx=thisMatrix%incIdxStp(srcRank+1) CALL MPI_IRecv(recvResult(1:(highIdx-lowIdx+1),idxTmp),highIdx-lowIdx+1, & MPI_DOUBLE_PRECISION,srcRank,0,thisMatrix%comm,recvRequests(idxTmp), mpierr) ENDDO ENDIF ENDIF ENDDO ! We've finished calling irecv. Wait for remaining ! requests to finish: DO idxTmp=1,MATVEC_SLOTS CALL pop_recv(tmpProduct, recvResult, recvIdx, ctRecv, idxTmp, & recvRequests, recvIdxRequests) CALL pop_recv(tmpProduct,recvResult,thisMatrix,ctRecv,idxTmp,recvRequests) ENDDO DO idxTmp=1,MATVEC_SLOTS CALL pop_send(sendResult,sendIdx,idxTmp,sendRequests,sendIdxRequests) CALL pop_send(sendResult,idxTmp,sendRequests) ENDDO #endif Loading Loading @@ -912,28 +892,29 @@ ENDSUBROUTINE matvec_DistrBandedMatrixType !> @param req The array of active requests for data !> @param idxReq The array of requests for indices !> SUBROUTINE pop_recv(acc,valBuf,idxBuf,ctBuf,idx,req,idxReq) SUBROUTINE pop_recv(acc,valBuf,thisMat,ctBuf,idx,req) REAL(SRK), INTENT(INOUT) :: acc(:) REAL(SRK), INTENT(INOUT) :: valBuf(:,:) CLASS(DistributedBandedMatrixType),INTENT(INOUT) :: thisMat INTEGER(SIK), INTENT(INOUT) :: ctBuf(MATVEC_SLOTS) INTEGER(SIK), INTENT(INOUT) :: idxBuf(:,:) INTEGER(SIK), INTENT(IN) :: idx INTEGER(SIK), INTENT(INOUT) :: req(MATVEC_SLOTS) INTEGER(SIK), INTENT(INOUT) :: idxReq(MATVEC_SLOTS) INTEGER(SIK) :: mpierr INTEGER(SIK) :: stt,stp,mpierr,i IF(req(idx) == 0 .AND. idxReq(idx) == 0) RETURN IF(req(idx) == 0) RETURN CALL MPI_Wait(req(idx),MPI_STATUS_IGNORE,mpierr) IF(ctBuf(idx) > 0) THEN acc(1:ctBuf(idx))=acc(1:ctBuf(idx))+valBuf(1:ctBuf(idx),idx) ELSE CALL MPI_Wait(idxReq(idx),MPI_STATUS_IGNORE,mpierr) idxReq(idx)=0 acc(idxBuf(1:-ctBuf(idx),idx))=acc(idxBuf(1:-ctBuf(idx),idx)) & +valBuf(1:-ctBuf(idx),idx) stt=thisMat%incIdxStt(-ctBuf(idx)+1) stp=thisMat%incIdxStp(-ctBuf(idx)+1) DO i=stt,stp acc(thisMat%incIdxMap(i))=acc(thisMat%incIdxMap(i))+valBuf(i,idx) ENDDO ENDIF ENDSUBROUTINE pop_recv ! !------------------------------------------------------------------------------- !> @brief Helper routine that keeps of data in the send buffer while Loading @@ -946,25 +927,16 @@ ENDSUBROUTINE pop_recv !> @param req The array of requests for data !> @param idxReq The array of requests for indices !> SUBROUTINE pop_send(valBuf, idxBuf, idx, req, idxReq) SUBROUTINE pop_send(valBuf, idx, req) REAL(SRK), INTENT(INOUT) :: valBuf(:,:) !> The send buffer for indices INTEGER(SIK), INTENT(INOUT) :: idxBuf(:,:) !> The buffer slot to pop INTEGER(SIK), INTENT(IN) :: idx !> The array of requests for data INTEGER(SIK), INTENT(INOUT) :: req(MATVEC_SLOTS) !> The array of requests for indices INTEGER(SIK), INTENT(INOUT) :: idxReq(MATVEC_SLOTS) INTEGER(SIK) :: mpierr IF(req(idx) == 0 .AND. idxReq(idx) == 0) RETURN IF(req(idx) == 0) RETURN CALL MPI_Wait(req(idx),MPI_STATUS_IGNORE,mpierr) IF(idxReq(idx) /= 0) THEN CALL MPI_Wait(idxReq(idx),MPI_STATUS_IGNORE,mpierr) idxReq(idx)=0 ENDIF ENDSUBROUTINE pop_send #endif ! Loading src/MatrixTypes_Native.f90 +63 −39 Original line number Diff line number Diff line Loading @@ -132,11 +132,6 @@ TYPE Band REAL(SRK), ALLOCATABLE :: elem(:) ENDTYPE Band !> @brief Integer array used for 'ragged' storage of band contents TYPE IntPtr INTEGER(SIK),ALLOCATABLE :: p(:) ENDTYPE IntPtr !> @brief The basic banded matrix type TYPE,EXTENDS(MatrixType) :: BandedMatrixType !> Map of band indices stored (-m to n) Loading Loading @@ -200,7 +195,9 @@ TYPE,EXTENDS(DistributedMatrixType) :: DistributedBandedMatrixType !> Block size (smallest indivisble unit) INTEGER(SIK) :: blockSize !> Array of band sizes used to determine optimal communication TYPE(IntPtr), ALLOCATABLE :: bandSizes(:) INTEGER(SIK), ALLOCATABLE :: incIdxMap(:) INTEGER(SIK), ALLOCATABLE :: incIdxStt(:) INTEGER(SIK), ALLOCATABLE :: incIdxStp(:) !> Temporary containers used before (and deallocated after) assembly INTEGER(SIK), ALLOCATABLE :: iTmp(:),jTmp(:) REAL(SRK),ALLOCATABLE :: elemTmp(:) Loading Loading @@ -575,7 +572,6 @@ SUBROUTINE init_DistributedBandedMatrixParam(matrix,Params) matrix%nnz=nnz matrix%blockSize = blockSize matrix%comm=commID ALLOCATE(matrix%bandSizes(nProc)) ENDIF ELSE CALL eMatrixType%raiseError('Incorrect call to '// & Loading Loading @@ -764,14 +760,14 @@ SUBROUTINE assemble_DistributedBandedMatrixType(thisMatrix,ierr) INTEGER(SIK),INTENT(OUT),OPTIONAL :: ierr #ifdef HAVE_MPI TYPE(ParamType) :: bandedPList INTEGER(SIK) :: mpierr, rank, nproc, i, j, nBandLocal INTEGER(SIK),ALLOCATABLE :: nnzPerChunk(:),nband(:),bandSizeTmp(:) INTEGER(SIK) :: mpierr,rank,nproc,i,j,nTransmit,stt,stp INTEGER(SIK),ALLOCATABLE :: nnzPerChunk(:),nRecv(:),idxMapTmp(:) REQUIRE(thisMatrix%isInit) CALL MPI_Comm_rank(thisMatrix%comm,rank,mpierr) CALL MPI_Comm_size(thisMatrix%comm,nproc,mpierr) ALLOCATE(nnzPerChunk(nProc)) ALLOCATE(nBand(nProc)) ALLOCATE(nRecv(nProc)) nnzPerChunk=0 DO i=1,thisMatrix%nLocal DO j=1,nproc Loading Loading @@ -804,45 +800,76 @@ SUBROUTINE assemble_DistributedBandedMatrixType(thisMatrix,ierr) CALL thisMatrix%chunks(i)%assemble() CALL bandedPList%clear() ENDIF ! Set nbandlocal (to be gathered into nband ! Decide the sending method to use. 'Full' chunks will send their whole ! contents while 'sparse' chunks will send band products. ! Set nTransmit to the number of REAL's to be sent from that processor to ! this one. ! >0: This many ! 0: None (chunk empty) ! -1: All nTransmit=0 IF(thisMatrix%chunks(i)%isInit) THEN IF(2*thisMatrix%chunks(i)%nnz/3 < thisMatrix%chunks(i)%n) THEN nBandLocal=SIZE(thisMatrix%chunks(i)%bandIdx) DO j=1,SIZE(thisMatrix%chunks(i)%bandIdx) nTransmit=nTransmit+SIZE(thisMatrix%chunks(i)%bands(j)%jIdx) ENDDO ELSE nBandLocal=-1 nTransmit=-1 ENDIF ELSE nBandLocal=0 ENDIF ! This info must be disseminated through all procs ! Gather to rank i-1 into nBand(:) CALL MPI_Gather(nBandLocal,1,MPI_INTEGER,nband(1),1,MPI_INTEGER,i-1, & ! Gather to rank i-1 into nRecv(:) CALL MPI_Gather(nTransmit,1,MPI_INTEGER,nRecv,1,MPI_INTEGER,i-1, & thisMatrix%comm,mpierr) ! Have rank i-1 allocate IF(rank == i-1) THEN nTransmit=0 DO j=1,SIZE(nRecv) IF (nRecv(j) > 0) THEN nTransmit=nTransmit+nRecv(j) ENDIF ENDDO ALLOCATE(thisMatrix%incIdxMap(nTransmit)) ALLOCATE(thisMatrix%incIdxStt(nProc)) ALLOCATE(thisMatrix%incIdxStp(nProc)) thisMatrix%incIdxStt=0 thisMatrix%incIdxStp=0 DO j=1,nProc IF(j /= i) THEN IF(nBand(j) > 0) THEN ALLOCATE(thisMatrix%bandSizes(j)%p(nBand(j))) CALL MPI_Recv(thisMatrix%bandSizes(j)%p(1),nBand(j),MPI_INTEGER, & j-1, 0, thisMatrix%comm,MPI_STATUS_IGNORE, mpierr) ! Use nTransmit as counter nTransmit=1 IF(j == i) THEN thisMatrix%incIdxStt(j)=-1 thisMatrix%incIdxStp(j)=-1 ELSE IF(nBand(j) /= 0) THEN ALLOCATE(thisMatrix%bandSizes(j)%p(1)) thisMatrix%bandSizes(j)%p(1)=-1 ENDIF IF(nRecv(j) > 0) THEN thisMatrix%incIdxStt(j)=nTransmit thisMatrix%incIdxStp(j)=nTransmit+nRecv(j)-1 ! We receive data (and don't want to bother sending idx's at mvmult ! time) CALL MPI_Recv(thisMatrix%incIdxMap(nTransmit),nRecv(j),MPI_INTEGER, & j-1,0,thisMatrix%comm,MPI_STATUS_IGNORE, mpierr) nTransmit=nTransmit+nRecv(j) ELSE IF (nRecv(j) == -1) THEN thisMatrix%incIdxStt(j)=-1 thisMatrix%incIdxStp(j)=-1 ENDIF ENDIF ENDDO ELSE IF(nBandLocal > 0) THEN ALLOCATE(bandSizeTmp(nBandLocal)) DO j=1,nBandLocal bandSizeTmp(j)=SIZE(thisMatrix%chunks(i)%bands(j)%elem) ! Build the index map IF(nTransmit > 0) THEN ALLOCATE(idxMapTmp(nTransmit)) stt=1 DO j=1,SIZE(thisMatrix%chunks(i)%bandIdx) stp=stt+SIZE(thisMatrix%chunks(i)%bands(j)%jIdx)-1 idxMapTmp(stt:stp)=thisMatrix%chunks(i)%bands(j)%jIdx-thisMatrix%chunks(i)%bandIdx(j) stt=stp+1 ENDDO CALL MPI_Send(bandSizeTmp(:),nBandLocal,MPI_INTEGER,i-1,0, & CALL MPI_Send(idxMapTmp,nTransmit,MPI_INTEGER,i-1,0, & thisMatrix%comm,mpierr) DEALLOCATE(bandSizeTmp) DEALLOCATE(idxMapTmp) ENDIF ENDIF ENDDO Loading Loading @@ -1039,12 +1066,9 @@ SUBROUTINE clear_DistributedBandedMatrixType(matrix) ENDDO DEALLOCATE(matrix%chunks) ENDIF IF(ALLOCATED(matrix%bandSizes)) THEN DO i=1,SIZE(matrix%bandSizes) IF(ALLOCATED(matrix%bandSizes(i)%p)) DEALLOCATE(matrix%bandSizes(i)%p) ENDDO DEALLOCATE(matrix%bandSizes) ENDIF IF(ALLOCATED(matrix%incIdxMap)) DEALLOCATE(matrix%incIdxMap) IF(ALLOCATED(matrix%incIdxStt)) DEALLOCATE(matrix%incIdxStt) IF(ALLOCATED(matrix%incIdxStp)) DEALLOCATE(matrix%incIdxStp) IF(ALLOCATED(matrix%iOffsets)) DEALLOCATE(matrix%iOffsets) IF(ALLOCATED(matrix%jOffsets)) DEALLOCATE(matrix%jOffsets) IF(ALLOCATED(matrix%iTmp)) DEALLOCATE(matrix%iTmp) Loading src/VectorTypes.f90 +6 −0 Original line number Diff line number Diff line Loading @@ -420,6 +420,9 @@ SUBROUTINE VectorResembleAllocScal(dest, source, inparams) ENDIF #ifdef HAVE_MPI TYPE IS(NativeDistributedVectorType) IF(.NOT. params%has("VectorType->nlocal")) THEN CALL params%add("VectorType->nlocal",SIZE(source%b)) ENDIF IF(.NOT. params%has("VectorType->chunkSize")) THEN CALL params%add("VectorType->chunkSize",source%chunkSize) ENDIF Loading Loading @@ -497,6 +500,9 @@ SUBROUTINE VectorResembleAllocArray(dest, source, nvec, inparams) ENDIF #ifdef HAVE_MPI TYPE IS(NativeDistributedVectorType) IF(.NOT. params%has("VectorType->nlocal")) THEN CALL params%add("VectorType->nlocal",SIZE(source%b)) ENDIF IF(.NOT. params%has("VectorType->chunkSize")) THEN CALL params%add("VectorType->chunkSize",source%chunkSize) ENDIF Loading Loading
src/MatrixTypes.f90 +50 −78 Original line number Diff line number Diff line Loading @@ -699,15 +699,13 @@ SUBROUTINE matvec_DistrBandedMatrixType(thisMatrix,x,y,t,ul,d,incx,a,b) CHARACTER(LEN=1),INTENT(IN) :: d INTEGER(SIK),INTENT(IN) :: incx INTEGER(SIK) :: rank,k INTEGER(SIK),ALLOCATABLE :: recvIdx(:,:), sendIdx(:,:) REAL(SRK),ALLOCATABLE :: recvResult(:,:),sendResult(:,:),tmpProduct(:) INTEGER(SIK) :: sendRequests(MATVEC_SLOTS),sendIdxRequests(MATVEC_SLOTS) INTEGER(SIK) :: recvRequests(MATVEC_SLOTS),recvIdxRequests(MATVEC_SLOTS) INTEGER(SIK) :: sendRequests(MATVEC_SLOTS),recvRequests(MATVEC_SLOTS) INTEGER(SIK) :: lowIdx,highIdx,sendCounter,recvCounter #ifdef HAVE_MPI ! Get rank INTEGER(SIK) :: ctRecv(MATVEC_SLOTS),srcRank,destRank INTEGER(SIK) :: i,idxTmp,cnt,ctDefault,mpierr,nproc INTEGER(SIK) :: i,idxTmp,ctDefault,mpierr,nproc CALL MPI_Comm_rank(thisMatrix%comm,rank,mpierr) CALL MPI_Comm_Size(thisMatrix%comm,nProc,mpierr) #else Loading @@ -722,16 +720,10 @@ SUBROUTINE matvec_DistrBandedMatrixType(thisMatrix,x,y,t,ul,d,incx,a,b) sendCounter=0 recvCounter=0 sendRequests=0 sendIdxRequests=0 recvRequests=0 recvIdxRequests=0 ! The recv/sendResult array will sometimes hold it's full length, ! and sometimes less. ! The variable "cnt" will be used to determine this at ! each iteration ALLOCATE(recvIdx(thisMatrix%iOffsets(2),MATVEC_SLOTS)) ALLOCATE(recvResult(thisMatrix%iOffsets(2),MATVEC_SLOTS)) ALLOCATE(sendIdx(thisMatrix%iOffsets(2),MATVEC_SLOTS)) ALLOCATE(sendResult(thisMatrix%iOffsets(2),MATVEC_SLOTS)) ALLOCATE(tmpProduct(thisMatrix%iOffsets(rank+2)- & thisMatrix%iOffsets(rank+1))) Loading Loading @@ -784,73 +776,61 @@ SUBROUTINE matvec_DistrBandedMatrixType(thisMatrix,x,y,t,ul,d,incx,a,b) sendCounter=sendCounter+1 idxTmp=MOD(sendCounter,MATVEC_SLOTS)+1 ! Check if we can safely write to sendRequests CALL pop_send(sendResult,sendIdx,idxTmp,sendRequests,sendIdxRequests) CALL pop_send(sendResult,idxTmp,sendRequests) CALL BLAS_matvec(THISMATRIX=thisMatrix%chunks(destRank+1),X=x,& y=sendResult(1:ctDefault,idxTmp),ALPHA=1.0_SRK,BETA=0.0_SRK) CALL MPI_ISend(sendResult(1:ctDefault,idxTmp),ctDefault, & MPI_DOUBLE_PRECISION,destRank,0,thisMatrix%comm,sendRequests(idxTmp),mpierr) ELSE ! Send several sparse vectors DO k=1,SIZE(thisMatrix%chunks(destRank+1)%bandIdx) ! Gather and send several sparse vectors sendCounter=sendCounter+1 idxTmp=MOD(sendCounter,MATVEC_SLOTS)+1 CALL pop_send(sendResult,sendIdx,idxTmp,sendRequests,sendIdxRequests) ! compute destination indices ! perform special-case multiplication from single band here cnt=SIZE(thisMatrix%chunks(destRank+1)%bands(k)%jIdx) sendIdx(1:cnt,idxTmp)=thisMatrix%chunks(destRank+1)%bands(k)%jIdx & -thisMatrix%chunks(destRank+1)%bandIdx(k) CALL MPI_ISend(sendIdx(1:cnt,idxTmp),cnt,MPI_INTEGER,destRank,0, & thisMatrix%comm,sendIdxRequests(idxTmp), mpierr) sendResult(1:cnt,idxTmp)=thisMatrix%chunks(destRank+1)%bands(k)%elem & CALL pop_send(sendResult,idxTmp,sendRequests) lowIdx=1 DO k=1,SIZE(thisMatrix%chunks(destRank+1)%bandIdx) highidx=lowIdx+SIZE(thisMatrix%chunks(destRank+1)%bands(k)%jIdx)-1 sendResult(lowIdx:highIdx,idxTmp)=thisMatrix%chunks(destRank+1)%bands(k)%elem & *x(thisMatrix%chunks(destRank+1)%bands(k)%jIdx) CALL MPI_ISend(sendResult(1:cnt,idxTmp),cnt,MPI_DOUBLE_PRECISION, & destRank,0,thisMatrix%comm,sendRequests(idxTmp), mpierr) lowIdx=highIdx+1 ENDDO CALL MPI_ISend(sendResult(1:highIdx,idxTmp),highIdx,MPI_DOUBLE_PRECISION, & destRank,0,thisMatrix%comm,sendRequests(idxTmp), mpierr) ENDIF ENDIF ! We might receive data from rank MOD(rank-i,nproc) srcRank = MODULO(rank-i,nProc) IF (ALLOCATED(thisMatrix%bandSizes(srcRank+1)%p)) THEN IF (thisMatrix%bandSizes(srcRank+1)%p(1) < 0) THEN IF (thisMatrix%incIdxStt(srcRank+1) == -1) THEN ! We are receiving a whole vector at once recvCounter=recvCounter+1 idxTmp=MOD(recvCounter,MATVEC_SLOTS)+1 ! If we've filled up the available storage, we need ! to wait for communication to finish up CALL pop_recv(tmpProduct,recvResult,recvIdx,ctRecv,idxTmp, & recvRequests, recvIdxRequests) CALL pop_recv(tmpProduct,recvResult,thisMatrix, & ctRecv,idxTmp,recvRequests) ctRecv(MOD(recvCounter,MATVEC_SLOTS)+1)=ctDefault CALL MPI_IRecv(recvResult(1:ctDefault,idxTmp), ctDefault, & MPI_DOUBLE_PRECISION,srcRank,0,thisMatrix%comm, & recvRequests(idxTmp),mpierr) ELSE ! We are receiving multiple sparse chunks DO k=1,SIZE(thisMatrix%bandSizes(srcRank+1)%p) ELSEIF (thisMatrix%incIdxStt(srcRank+1) > 0) THEN ! We are receiving multiple sparse chunks gathered together recvCounter=recvCounter+1 idxTmp=MOD(recvCounter,MATVEC_SLOTS)+1 ! If we've filled up the available storage, we need ! to wait for communication to finish up CALL pop_recv(tmpProduct,recvResult,recvIdx,ctRecv,idxTmp, & recvRequests, recvIdxRequests) ctRecv(idxTmp)=-thisMatrix%bandSizes(srcRank+1)%p(k) CALL MPI_IRecv(recvIdx(1:ctRecv(idxTmp),idxTmp),-ctRecv(idxTmp), & MPI_INTEGER,srcRank,0,thisMatrix%comm,recvIdxRequests(idxTmp), mpierr) CALL MPI_IRecv(recvResult(1:ctRecv(idxTmp),idxTmp),-ctRecv(idxTmp), & CALL pop_recv(tmpProduct,recvResult,thisMatrix, & ctRecv,idxTmp,recvRequests) ctRecv(idxTmp)= -srcRank lowIdx=thisMatrix%incIdxStt(srcRank+1) highIdx=thisMatrix%incIdxStp(srcRank+1) CALL MPI_IRecv(recvResult(1:(highIdx-lowIdx+1),idxTmp),highIdx-lowIdx+1, & MPI_DOUBLE_PRECISION,srcRank,0,thisMatrix%comm,recvRequests(idxTmp), mpierr) ENDDO ENDIF ENDIF ENDDO ! We've finished calling irecv. Wait for remaining ! requests to finish: DO idxTmp=1,MATVEC_SLOTS CALL pop_recv(tmpProduct, recvResult, recvIdx, ctRecv, idxTmp, & recvRequests, recvIdxRequests) CALL pop_recv(tmpProduct,recvResult,thisMatrix,ctRecv,idxTmp,recvRequests) ENDDO DO idxTmp=1,MATVEC_SLOTS CALL pop_send(sendResult,sendIdx,idxTmp,sendRequests,sendIdxRequests) CALL pop_send(sendResult,idxTmp,sendRequests) ENDDO #endif Loading Loading @@ -912,28 +892,29 @@ ENDSUBROUTINE matvec_DistrBandedMatrixType !> @param req The array of active requests for data !> @param idxReq The array of requests for indices !> SUBROUTINE pop_recv(acc,valBuf,idxBuf,ctBuf,idx,req,idxReq) SUBROUTINE pop_recv(acc,valBuf,thisMat,ctBuf,idx,req) REAL(SRK), INTENT(INOUT) :: acc(:) REAL(SRK), INTENT(INOUT) :: valBuf(:,:) CLASS(DistributedBandedMatrixType),INTENT(INOUT) :: thisMat INTEGER(SIK), INTENT(INOUT) :: ctBuf(MATVEC_SLOTS) INTEGER(SIK), INTENT(INOUT) :: idxBuf(:,:) INTEGER(SIK), INTENT(IN) :: idx INTEGER(SIK), INTENT(INOUT) :: req(MATVEC_SLOTS) INTEGER(SIK), INTENT(INOUT) :: idxReq(MATVEC_SLOTS) INTEGER(SIK) :: mpierr INTEGER(SIK) :: stt,stp,mpierr,i IF(req(idx) == 0 .AND. idxReq(idx) == 0) RETURN IF(req(idx) == 0) RETURN CALL MPI_Wait(req(idx),MPI_STATUS_IGNORE,mpierr) IF(ctBuf(idx) > 0) THEN acc(1:ctBuf(idx))=acc(1:ctBuf(idx))+valBuf(1:ctBuf(idx),idx) ELSE CALL MPI_Wait(idxReq(idx),MPI_STATUS_IGNORE,mpierr) idxReq(idx)=0 acc(idxBuf(1:-ctBuf(idx),idx))=acc(idxBuf(1:-ctBuf(idx),idx)) & +valBuf(1:-ctBuf(idx),idx) stt=thisMat%incIdxStt(-ctBuf(idx)+1) stp=thisMat%incIdxStp(-ctBuf(idx)+1) DO i=stt,stp acc(thisMat%incIdxMap(i))=acc(thisMat%incIdxMap(i))+valBuf(i,idx) ENDDO ENDIF ENDSUBROUTINE pop_recv ! !------------------------------------------------------------------------------- !> @brief Helper routine that keeps of data in the send buffer while Loading @@ -946,25 +927,16 @@ ENDSUBROUTINE pop_recv !> @param req The array of requests for data !> @param idxReq The array of requests for indices !> SUBROUTINE pop_send(valBuf, idxBuf, idx, req, idxReq) SUBROUTINE pop_send(valBuf, idx, req) REAL(SRK), INTENT(INOUT) :: valBuf(:,:) !> The send buffer for indices INTEGER(SIK), INTENT(INOUT) :: idxBuf(:,:) !> The buffer slot to pop INTEGER(SIK), INTENT(IN) :: idx !> The array of requests for data INTEGER(SIK), INTENT(INOUT) :: req(MATVEC_SLOTS) !> The array of requests for indices INTEGER(SIK), INTENT(INOUT) :: idxReq(MATVEC_SLOTS) INTEGER(SIK) :: mpierr IF(req(idx) == 0 .AND. idxReq(idx) == 0) RETURN IF(req(idx) == 0) RETURN CALL MPI_Wait(req(idx),MPI_STATUS_IGNORE,mpierr) IF(idxReq(idx) /= 0) THEN CALL MPI_Wait(idxReq(idx),MPI_STATUS_IGNORE,mpierr) idxReq(idx)=0 ENDIF ENDSUBROUTINE pop_send #endif ! Loading
src/MatrixTypes_Native.f90 +63 −39 Original line number Diff line number Diff line Loading @@ -132,11 +132,6 @@ TYPE Band REAL(SRK), ALLOCATABLE :: elem(:) ENDTYPE Band !> @brief Integer array used for 'ragged' storage of band contents TYPE IntPtr INTEGER(SIK),ALLOCATABLE :: p(:) ENDTYPE IntPtr !> @brief The basic banded matrix type TYPE,EXTENDS(MatrixType) :: BandedMatrixType !> Map of band indices stored (-m to n) Loading Loading @@ -200,7 +195,9 @@ TYPE,EXTENDS(DistributedMatrixType) :: DistributedBandedMatrixType !> Block size (smallest indivisble unit) INTEGER(SIK) :: blockSize !> Array of band sizes used to determine optimal communication TYPE(IntPtr), ALLOCATABLE :: bandSizes(:) INTEGER(SIK), ALLOCATABLE :: incIdxMap(:) INTEGER(SIK), ALLOCATABLE :: incIdxStt(:) INTEGER(SIK), ALLOCATABLE :: incIdxStp(:) !> Temporary containers used before (and deallocated after) assembly INTEGER(SIK), ALLOCATABLE :: iTmp(:),jTmp(:) REAL(SRK),ALLOCATABLE :: elemTmp(:) Loading Loading @@ -575,7 +572,6 @@ SUBROUTINE init_DistributedBandedMatrixParam(matrix,Params) matrix%nnz=nnz matrix%blockSize = blockSize matrix%comm=commID ALLOCATE(matrix%bandSizes(nProc)) ENDIF ELSE CALL eMatrixType%raiseError('Incorrect call to '// & Loading Loading @@ -764,14 +760,14 @@ SUBROUTINE assemble_DistributedBandedMatrixType(thisMatrix,ierr) INTEGER(SIK),INTENT(OUT),OPTIONAL :: ierr #ifdef HAVE_MPI TYPE(ParamType) :: bandedPList INTEGER(SIK) :: mpierr, rank, nproc, i, j, nBandLocal INTEGER(SIK),ALLOCATABLE :: nnzPerChunk(:),nband(:),bandSizeTmp(:) INTEGER(SIK) :: mpierr,rank,nproc,i,j,nTransmit,stt,stp INTEGER(SIK),ALLOCATABLE :: nnzPerChunk(:),nRecv(:),idxMapTmp(:) REQUIRE(thisMatrix%isInit) CALL MPI_Comm_rank(thisMatrix%comm,rank,mpierr) CALL MPI_Comm_size(thisMatrix%comm,nproc,mpierr) ALLOCATE(nnzPerChunk(nProc)) ALLOCATE(nBand(nProc)) ALLOCATE(nRecv(nProc)) nnzPerChunk=0 DO i=1,thisMatrix%nLocal DO j=1,nproc Loading Loading @@ -804,45 +800,76 @@ SUBROUTINE assemble_DistributedBandedMatrixType(thisMatrix,ierr) CALL thisMatrix%chunks(i)%assemble() CALL bandedPList%clear() ENDIF ! Set nbandlocal (to be gathered into nband ! Decide the sending method to use. 'Full' chunks will send their whole ! contents while 'sparse' chunks will send band products. ! Set nTransmit to the number of REAL's to be sent from that processor to ! this one. ! >0: This many ! 0: None (chunk empty) ! -1: All nTransmit=0 IF(thisMatrix%chunks(i)%isInit) THEN IF(2*thisMatrix%chunks(i)%nnz/3 < thisMatrix%chunks(i)%n) THEN nBandLocal=SIZE(thisMatrix%chunks(i)%bandIdx) DO j=1,SIZE(thisMatrix%chunks(i)%bandIdx) nTransmit=nTransmit+SIZE(thisMatrix%chunks(i)%bands(j)%jIdx) ENDDO ELSE nBandLocal=-1 nTransmit=-1 ENDIF ELSE nBandLocal=0 ENDIF ! This info must be disseminated through all procs ! Gather to rank i-1 into nBand(:) CALL MPI_Gather(nBandLocal,1,MPI_INTEGER,nband(1),1,MPI_INTEGER,i-1, & ! Gather to rank i-1 into nRecv(:) CALL MPI_Gather(nTransmit,1,MPI_INTEGER,nRecv,1,MPI_INTEGER,i-1, & thisMatrix%comm,mpierr) ! Have rank i-1 allocate IF(rank == i-1) THEN nTransmit=0 DO j=1,SIZE(nRecv) IF (nRecv(j) > 0) THEN nTransmit=nTransmit+nRecv(j) ENDIF ENDDO ALLOCATE(thisMatrix%incIdxMap(nTransmit)) ALLOCATE(thisMatrix%incIdxStt(nProc)) ALLOCATE(thisMatrix%incIdxStp(nProc)) thisMatrix%incIdxStt=0 thisMatrix%incIdxStp=0 DO j=1,nProc IF(j /= i) THEN IF(nBand(j) > 0) THEN ALLOCATE(thisMatrix%bandSizes(j)%p(nBand(j))) CALL MPI_Recv(thisMatrix%bandSizes(j)%p(1),nBand(j),MPI_INTEGER, & j-1, 0, thisMatrix%comm,MPI_STATUS_IGNORE, mpierr) ! Use nTransmit as counter nTransmit=1 IF(j == i) THEN thisMatrix%incIdxStt(j)=-1 thisMatrix%incIdxStp(j)=-1 ELSE IF(nBand(j) /= 0) THEN ALLOCATE(thisMatrix%bandSizes(j)%p(1)) thisMatrix%bandSizes(j)%p(1)=-1 ENDIF IF(nRecv(j) > 0) THEN thisMatrix%incIdxStt(j)=nTransmit thisMatrix%incIdxStp(j)=nTransmit+nRecv(j)-1 ! We receive data (and don't want to bother sending idx's at mvmult ! time) CALL MPI_Recv(thisMatrix%incIdxMap(nTransmit),nRecv(j),MPI_INTEGER, & j-1,0,thisMatrix%comm,MPI_STATUS_IGNORE, mpierr) nTransmit=nTransmit+nRecv(j) ELSE IF (nRecv(j) == -1) THEN thisMatrix%incIdxStt(j)=-1 thisMatrix%incIdxStp(j)=-1 ENDIF ENDIF ENDDO ELSE IF(nBandLocal > 0) THEN ALLOCATE(bandSizeTmp(nBandLocal)) DO j=1,nBandLocal bandSizeTmp(j)=SIZE(thisMatrix%chunks(i)%bands(j)%elem) ! Build the index map IF(nTransmit > 0) THEN ALLOCATE(idxMapTmp(nTransmit)) stt=1 DO j=1,SIZE(thisMatrix%chunks(i)%bandIdx) stp=stt+SIZE(thisMatrix%chunks(i)%bands(j)%jIdx)-1 idxMapTmp(stt:stp)=thisMatrix%chunks(i)%bands(j)%jIdx-thisMatrix%chunks(i)%bandIdx(j) stt=stp+1 ENDDO CALL MPI_Send(bandSizeTmp(:),nBandLocal,MPI_INTEGER,i-1,0, & CALL MPI_Send(idxMapTmp,nTransmit,MPI_INTEGER,i-1,0, & thisMatrix%comm,mpierr) DEALLOCATE(bandSizeTmp) DEALLOCATE(idxMapTmp) ENDIF ENDIF ENDDO Loading Loading @@ -1039,12 +1066,9 @@ SUBROUTINE clear_DistributedBandedMatrixType(matrix) ENDDO DEALLOCATE(matrix%chunks) ENDIF IF(ALLOCATED(matrix%bandSizes)) THEN DO i=1,SIZE(matrix%bandSizes) IF(ALLOCATED(matrix%bandSizes(i)%p)) DEALLOCATE(matrix%bandSizes(i)%p) ENDDO DEALLOCATE(matrix%bandSizes) ENDIF IF(ALLOCATED(matrix%incIdxMap)) DEALLOCATE(matrix%incIdxMap) IF(ALLOCATED(matrix%incIdxStt)) DEALLOCATE(matrix%incIdxStt) IF(ALLOCATED(matrix%incIdxStp)) DEALLOCATE(matrix%incIdxStp) IF(ALLOCATED(matrix%iOffsets)) DEALLOCATE(matrix%iOffsets) IF(ALLOCATED(matrix%jOffsets)) DEALLOCATE(matrix%jOffsets) IF(ALLOCATED(matrix%iTmp)) DEALLOCATE(matrix%iTmp) Loading
src/VectorTypes.f90 +6 −0 Original line number Diff line number Diff line Loading @@ -420,6 +420,9 @@ SUBROUTINE VectorResembleAllocScal(dest, source, inparams) ENDIF #ifdef HAVE_MPI TYPE IS(NativeDistributedVectorType) IF(.NOT. params%has("VectorType->nlocal")) THEN CALL params%add("VectorType->nlocal",SIZE(source%b)) ENDIF IF(.NOT. params%has("VectorType->chunkSize")) THEN CALL params%add("VectorType->chunkSize",source%chunkSize) ENDIF Loading Loading @@ -497,6 +500,9 @@ SUBROUTINE VectorResembleAllocArray(dest, source, nvec, inparams) ENDIF #ifdef HAVE_MPI TYPE IS(NativeDistributedVectorType) IF(.NOT. params%has("VectorType->nlocal")) THEN CALL params%add("VectorType->nlocal",SIZE(source%b)) ENDIF IF(.NOT. params%has("VectorType->chunkSize")) THEN CALL params%add("VectorType->chunkSize",source%chunkSize) ENDIF Loading