Commit 2eed2386 authored by Kabelitz, Matthew Edward, committed by Sooyoung Choi

Native matrices now determine indices for data transfer at assemble-time
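
Index maps for incoming sparse data (incIdxMap, with per-rank incIdxStt/incIdxStp offsets) are now built and exchanged once inside assemble(); matvec then sends only band products, so the per-multiply transfer of index arrays (and the ragged bandSizes/IntPtr storage) goes away.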

parent 88d3370c
+50 −78
@@ -699,15 +699,13 @@ SUBROUTINE matvec_DistrBandedMatrixType(thisMatrix,x,y,t,ul,d,incx,a,b)
  CHARACTER(LEN=1),INTENT(IN) :: d
  INTEGER(SIK),INTENT(IN) :: incx
  INTEGER(SIK) :: rank,k
  INTEGER(SIK),ALLOCATABLE :: recvIdx(:,:), sendIdx(:,:)
  REAL(SRK),ALLOCATABLE :: recvResult(:,:),sendResult(:,:),tmpProduct(:)
  INTEGER(SIK) :: sendRequests(MATVEC_SLOTS),sendIdxRequests(MATVEC_SLOTS)
  INTEGER(SIK) :: recvRequests(MATVEC_SLOTS),recvIdxRequests(MATVEC_SLOTS)
  INTEGER(SIK) :: sendRequests(MATVEC_SLOTS),recvRequests(MATVEC_SLOTS)
  INTEGER(SIK) :: lowIdx,highIdx,sendCounter,recvCounter
#ifdef HAVE_MPI
  ! Get rank
  INTEGER(SIK) :: ctRecv(MATVEC_SLOTS),srcRank,destRank
  INTEGER(SIK) :: i,idxTmp,cnt,ctDefault,mpierr,nproc
  INTEGER(SIK) :: i,idxTmp,ctDefault,mpierr,nproc
  CALL MPI_Comm_rank(thisMatrix%comm,rank,mpierr)
  CALL MPI_Comm_Size(thisMatrix%comm,nProc,mpierr)
#else
@@ -722,16 +720,10 @@ SUBROUTINE matvec_DistrBandedMatrixType(thisMatrix,x,y,t,ul,d,incx,a,b)
  sendCounter=0
  recvCounter=0
  sendRequests=0
  sendIdxRequests=0
  recvRequests=0
  recvIdxRequests=0
  ! The recv/sendResult array will sometimes hold its full length,
  ! and sometimes less.
  ! The variable "cnt" will be used to determine this at
  ! each iteration
  ALLOCATE(recvIdx(thisMatrix%iOffsets(2),MATVEC_SLOTS))
  ALLOCATE(recvResult(thisMatrix%iOffsets(2),MATVEC_SLOTS))
  ALLOCATE(sendIdx(thisMatrix%iOffsets(2),MATVEC_SLOTS))
  ALLOCATE(sendResult(thisMatrix%iOffsets(2),MATVEC_SLOTS))
  ALLOCATE(tmpProduct(thisMatrix%iOffsets(rank+2)- &
      thisMatrix%iOffsets(rank+1)))
@@ -784,73 +776,61 @@ SUBROUTINE matvec_DistrBandedMatrixType(thisMatrix,x,y,t,ul,d,incx,a,b)
        sendCounter=sendCounter+1
        idxTmp=MOD(sendCounter,MATVEC_SLOTS)+1
        ! Check if we can safely write to sendRequests
        CALL pop_send(sendResult,sendIdx,idxTmp,sendRequests,sendIdxRequests)
        CALL pop_send(sendResult,idxTmp,sendRequests)
        CALL BLAS_matvec(THISMATRIX=thisMatrix%chunks(destRank+1),X=x,&
            y=sendResult(1:ctDefault,idxTmp),ALPHA=1.0_SRK,BETA=0.0_SRK)
        CALL MPI_ISend(sendResult(1:ctDefault,idxTmp),ctDefault, &
            MPI_DOUBLE_PRECISION,destRank,0,thisMatrix%comm,sendRequests(idxTmp),mpierr)
      ELSE
        ! Send several sparse vectors
        DO k=1,SIZE(thisMatrix%chunks(destRank+1)%bandIdx)
        ! Gather and send several sparse vectors
        sendCounter=sendCounter+1
        idxTmp=MOD(sendCounter,MATVEC_SLOTS)+1
          CALL pop_send(sendResult,sendIdx,idxTmp,sendRequests,sendIdxRequests)

          ! compute destination indices
          ! perform special-case multiplication from single band here
          cnt=SIZE(thisMatrix%chunks(destRank+1)%bands(k)%jIdx)
          sendIdx(1:cnt,idxTmp)=thisMatrix%chunks(destRank+1)%bands(k)%jIdx &
              -thisMatrix%chunks(destRank+1)%bandIdx(k)
          CALL MPI_ISend(sendIdx(1:cnt,idxTmp),cnt,MPI_INTEGER,destRank,0, &
              thisMatrix%comm,sendIdxRequests(idxTmp), mpierr)
          sendResult(1:cnt,idxTmp)=thisMatrix%chunks(destRank+1)%bands(k)%elem &
        CALL pop_send(sendResult,idxTmp,sendRequests)
        lowIdx=1
        DO k=1,SIZE(thisMatrix%chunks(destRank+1)%bandIdx)
          highIdx=lowIdx+SIZE(thisMatrix%chunks(destRank+1)%bands(k)%jIdx)-1
          sendResult(lowIdx:highIdx,idxTmp)=thisMatrix%chunks(destRank+1)%bands(k)%elem &
              *x(thisMatrix%chunks(destRank+1)%bands(k)%jIdx)
          CALL MPI_ISend(sendResult(1:cnt,idxTmp),cnt,MPI_DOUBLE_PRECISION, &
              destRank,0,thisMatrix%comm,sendRequests(idxTmp), mpierr)
          lowIdx=highIdx+1
        ENDDO
        CALL MPI_ISend(sendResult(1:highIdx,idxTmp),highIdx,MPI_DOUBLE_PRECISION, &
            destRank,0,thisMatrix%comm,sendRequests(idxTmp), mpierr)
      ENDIF
    ENDIF
    ! We might receive data from rank MODULO(rank-i,nproc)
    srcRank = MODULO(rank-i,nProc)
    IF (ALLOCATED(thisMatrix%bandSizes(srcRank+1)%p)) THEN
      IF (thisMatrix%bandSizes(srcRank+1)%p(1) < 0) THEN
    IF (thisMatrix%incIdxStt(srcRank+1) == -1) THEN
      ! We are receiving a whole vector at once
      recvCounter=recvCounter+1
      idxTmp=MOD(recvCounter,MATVEC_SLOTS)+1
      ! If we've filled up the available storage, we need
      ! to wait for communication to finish up
        CALL pop_recv(tmpProduct,recvResult,recvIdx,ctRecv,idxTmp, &
            recvRequests, recvIdxRequests)
      CALL pop_recv(tmpProduct,recvResult,thisMatrix, &
          ctRecv,idxTmp,recvRequests)
      ctRecv(idxTmp)=ctDefault
      CALL MPI_IRecv(recvResult(1:ctDefault,idxTmp), ctDefault, &
          MPI_DOUBLE_PRECISION,srcRank,0,thisMatrix%comm, &
          recvRequests(idxTmp),mpierr)
      ELSE
        ! We are receiving multiple sparse chunks
        DO k=1,SIZE(thisMatrix%bandSizes(srcRank+1)%p)
    ELSEIF (thisMatrix%incIdxStt(srcRank+1) > 0) THEN
      ! We are receiving multiple sparse chunks gathered together
      recvCounter=recvCounter+1
      idxTmp=MOD(recvCounter,MATVEC_SLOTS)+1
          ! If we've filled up the available storage, we need
          ! to wait for communication to finish up
          CALL pop_recv(tmpProduct,recvResult,recvIdx,ctRecv,idxTmp, &
              recvRequests, recvIdxRequests)
          ctRecv(idxTmp)=-thisMatrix%bandSizes(srcRank+1)%p(k)
          CALL MPI_IRecv(recvIdx(1:ctRecv(idxTmp),idxTmp),-ctRecv(idxTmp), &
              MPI_INTEGER,srcRank,0,thisMatrix%comm,recvIdxRequests(idxTmp), mpierr)
          CALL MPI_IRecv(recvResult(1:ctRecv(idxTmp),idxTmp),-ctRecv(idxTmp), &
      CALL pop_recv(tmpProduct,recvResult,thisMatrix, &
          ctRecv,idxTmp,recvRequests)
      ctRecv(idxTmp)= -srcRank
      lowIdx=thisMatrix%incIdxStt(srcRank+1)
      highIdx=thisMatrix%incIdxStp(srcRank+1)
      CALL MPI_IRecv(recvResult(1:(highIdx-lowIdx+1),idxTmp),highIdx-lowIdx+1, &
          MPI_DOUBLE_PRECISION,srcRank,0,thisMatrix%comm,recvRequests(idxTmp), mpierr)
        ENDDO
      ENDIF
    ENDIF
  ENDDO
  ! We've finished calling irecv. Wait for remaining
  ! requests to finish:
  DO idxTmp=1,MATVEC_SLOTS
    CALL pop_recv(tmpProduct, recvResult, recvIdx, ctRecv, idxTmp, &
        recvRequests, recvIdxRequests)
    CALL pop_recv(tmpProduct,recvResult,thisMatrix,ctRecv,idxTmp,recvRequests)
  ENDDO
  DO idxTmp=1,MATVEC_SLOTS
    CALL pop_send(sendResult,sendIdx,idxTmp,sendRequests,sendIdxRequests)
    CALL pop_send(sendResult,idxTmp,sendRequests)
  ENDDO
#endif
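
The send/recv paths above cycle through a fixed pool of MATVEC_SLOTS nonblocking requests, waiting on a slot (pop_send/pop_recv) before its buffer is reused. Below is a minimal standalone sketch of that ring-buffer pattern, assuming only an MPI installation; the names (slot_ring_sketch, NSLOTS, BUFLEN) are invented, and MPI_REQUEST_NULL stands in for the 0 sentinel used above:

PROGRAM slot_ring_sketch
  USE mpi
  IMPLICIT NONE
  INTEGER,PARAMETER :: NSLOTS=4,NMSG=16,BUFLEN=8
  DOUBLE PRECISION :: sendBuf(BUFLEN,NSLOTS),recvBuf(BUFLEN)
  INTEGER :: req(NSLOTS),ierr,rank,nproc,msg,slot,dest,src
  CALL MPI_Init(ierr)
  CALL MPI_Comm_rank(MPI_COMM_WORLD,rank,ierr)
  CALL MPI_Comm_size(MPI_COMM_WORLD,nproc,ierr)
  dest=MODULO(rank+1,nproc)
  src=MODULO(rank-1,nproc)
  req=MPI_REQUEST_NULL
  DO msg=1,NMSG
    slot=MOD(msg,NSLOTS)+1
    ! "pop" the slot: wait for any send still using this buffer
    IF(req(slot) /= MPI_REQUEST_NULL) &
        CALL MPI_Wait(req(slot),MPI_STATUS_IGNORE,ierr)
    sendBuf(:,slot)=REAL(msg,KIND(0.0D0))
    CALL MPI_Isend(sendBuf(:,slot),BUFLEN,MPI_DOUBLE_PRECISION,dest,0, &
        MPI_COMM_WORLD,req(slot),ierr)
    CALL MPI_Recv(recvBuf,BUFLEN,MPI_DOUBLE_PRECISION,src,0, &
        MPI_COMM_WORLD,MPI_STATUS_IGNORE,ierr)
  ENDDO
  ! Drain outstanding sends, mirroring the cleanup loops above
  DO slot=1,NSLOTS
    IF(req(slot) /= MPI_REQUEST_NULL) &
        CALL MPI_Wait(req(slot),MPI_STATUS_IGNORE,ierr)
  ENDDO
  CALL MPI_Finalize(ierr)
END PROGRAM slot_ring_sketch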

@@ -912,28 +892,29 @@ ENDSUBROUTINE matvec_DistrBandedMatrixType
!> @param req The array of active requests for data
!> @param idxReq The array of requests for indices
!>
SUBROUTINE pop_recv(acc,valBuf,idxBuf,ctBuf,idx,req,idxReq)
SUBROUTINE pop_recv(acc,valBuf,thisMat,ctBuf,idx,req)
  REAL(SRK), INTENT(INOUT) :: acc(:)
  REAL(SRK), INTENT(INOUT) :: valBuf(:,:)
  CLASS(DistributedBandedMatrixType),INTENT(INOUT) :: thisMat
  INTEGER(SIK), INTENT(INOUT) :: ctBuf(MATVEC_SLOTS)
  INTEGER(SIK), INTENT(INOUT) :: idxBuf(:,:)
  INTEGER(SIK), INTENT(IN) :: idx
  INTEGER(SIK), INTENT(INOUT) :: req(MATVEC_SLOTS)
  INTEGER(SIK), INTENT(INOUT) :: idxReq(MATVEC_SLOTS)
  INTEGER(SIK) :: mpierr
  INTEGER(SIK) :: stt,stp,mpierr,i

  IF(req(idx) == 0 .AND. idxReq(idx) == 0) RETURN
  IF(req(idx) == 0) RETURN

  CALL MPI_Wait(req(idx),MPI_STATUS_IGNORE,mpierr)
  IF(ctBuf(idx) > 0) THEN
    acc(1:ctBuf(idx))=acc(1:ctBuf(idx))+valBuf(1:ctBuf(idx),idx)
  ELSE
    CALL MPI_Wait(idxReq(idx),MPI_STATUS_IGNORE,mpierr)
    idxReq(idx)=0
    acc(idxBuf(1:-ctBuf(idx),idx))=acc(idxBuf(1:-ctBuf(idx),idx)) &
        +valBuf(1:-ctBuf(idx),idx)
    stt=thisMat%incIdxStt(-ctBuf(idx)+1)
    stp=thisMat%incIdxStp(-ctBuf(idx)+1)
    DO i=stt,stp
      acc(thisMat%incIdxMap(i))=acc(thisMat%incIdxMap(i))+valBuf(i-stt+1,idx)
    ENDDO
  ENDIF
ENDSUBROUTINE pop_recv
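
A serial sketch of pop_recv's two accumulation modes, with invented array contents: a positive count adds a dense contiguous block, while a negative count names the (0-based) source rank whose incIdxStt/incIdxStp range selects destination rows out of incIdxMap. valBuf holds data as if just received into positions 1:count.

PROGRAM pop_recv_sketch
  IMPLICIT NONE
  DOUBLE PRECISION :: acc(6),valBuf(4)
  INTEGER :: incIdxMap(5),incIdxStt(3),incIdxStp(3)
  INTEGER :: ct,i,stt,stp
  acc=0.0D0
  ! Dense mode: ct > 0 entries add directly onto acc(1:ct)
  ct=4
  valBuf=(/1.0D0,2.0D0,3.0D0,4.0D0/)
  acc(1:ct)=acc(1:ct)+valBuf(1:ct)
  ! Mapped mode: ct = -srcRank; the map was exchanged at assembly
  incIdxMap=(/4,6,1,2,5/)   ! destination rows, concatenated by source rank
  incIdxStt=(/1,0,3/)       ! range start per source (0 = nothing incoming)
  incIdxStp=(/2,0,5/)       ! range stop per source
  ct=-2                     ! this chunk arrived from rank 2
  stt=incIdxStt(-ct+1)
  stp=incIdxStp(-ct+1)
  valBuf(1:stp-stt+1)=(/0.5D0,0.5D0,0.5D0/)
  DO i=stt,stp
    acc(incIdxMap(i))=acc(incIdxMap(i))+valBuf(i-stt+1)
  ENDDO
  WRITE(*,*) acc   ! 1.5 2.5 3.0 4.5 0.5 0.0
END PROGRAM pop_recv_sketch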

!
!-------------------------------------------------------------------------------
!> @brief Helper routine that keeps track of data in the send buffer while
@@ -946,25 +927,16 @@ ENDSUBROUTINE pop_recv
!> @param req The array of requests for data
!> @param idxReq The array of requests for indices
!>
SUBROUTINE pop_send(valBuf, idxBuf, idx, req, idxReq)
SUBROUTINE pop_send(valBuf, idx, req)
  REAL(SRK), INTENT(INOUT) :: valBuf(:,:)
  !> The send buffer for indices
  INTEGER(SIK), INTENT(INOUT) :: idxBuf(:,:)
  !> The buffer slot to pop
  INTEGER(SIK), INTENT(IN) :: idx
  !> The array of requests for data
  INTEGER(SIK), INTENT(INOUT) :: req(MATVEC_SLOTS)
  !> The array of requests for indices
  INTEGER(SIK), INTENT(INOUT) :: idxReq(MATVEC_SLOTS)
  INTEGER(SIK) :: mpierr

  IF(req(idx) == 0 .AND. idxReq(idx) == 0) RETURN

  IF(req(idx) == 0) RETURN
  CALL MPI_Wait(req(idx),MPI_STATUS_IGNORE,mpierr)
  IF(idxReq(idx) /= 0) THEN
    CALL MPI_Wait(idxReq(idx),MPI_STATUS_IGNORE,mpierr)
    idxReq(idx)=0
  ENDIF
ENDSUBROUTINE pop_send
#endif
!
+63 −39
@@ -132,11 +132,6 @@ TYPE Band
  REAL(SRK), ALLOCATABLE :: elem(:)
ENDTYPE Band

!> @brief Integer array used for 'ragged' storage of band contents
TYPE IntPtr
  INTEGER(SIK),ALLOCATABLE :: p(:)
ENDTYPE IntPtr

!> @brief The basic banded matrix type
TYPE,EXTENDS(MatrixType) :: BandedMatrixType
  !> Map of band indices stored (-m to n)
@@ -200,7 +195,9 @@ TYPE,EXTENDS(DistributedMatrixType) :: DistributedBandedMatrixType
  !> Block size (smallest indivisible unit)
  INTEGER(SIK) :: blockSize
  !> Index map and per-rank start/stop offsets for incoming matvec data (determined at assembly)
  TYPE(IntPtr), ALLOCATABLE :: bandSizes(:)
  INTEGER(SIK), ALLOCATABLE :: incIdxMap(:)
  INTEGER(SIK), ALLOCATABLE :: incIdxStt(:)
  INTEGER(SIK), ALLOCATABLE :: incIdxStp(:)
  !> Temporary containers used before (and deallocated after) assembly
  INTEGER(SIK), ALLOCATABLE :: iTmp(:),jTmp(:)
  REAL(SRK),ALLOCATABLE :: elemTmp(:)
@@ -575,7 +572,6 @@ SUBROUTINE init_DistributedBandedMatrixParam(matrix,Params)
      matrix%nnz=nnz
      matrix%blockSize = blockSize
      matrix%comm=commID
      ALLOCATE(matrix%bandSizes(nProc))
    ENDIF
  ELSE
    CALL eMatrixType%raiseError('Incorrect call to '// &
@@ -764,14 +760,14 @@ SUBROUTINE assemble_DistributedBandedMatrixType(thisMatrix,ierr)
  INTEGER(SIK),INTENT(OUT),OPTIONAL :: ierr
#ifdef HAVE_MPI
  TYPE(ParamType) :: bandedPList
  INTEGER(SIK) :: mpierr, rank, nproc, i, j, nBandLocal
  INTEGER(SIK),ALLOCATABLE :: nnzPerChunk(:),nband(:),bandSizeTmp(:)
  INTEGER(SIK) :: mpierr,rank,nproc,i,j,nTransmit,stt,stp
  INTEGER(SIK),ALLOCATABLE :: nnzPerChunk(:),nRecv(:),idxMapTmp(:)

  REQUIRE(thisMatrix%isInit)
  CALL MPI_Comm_rank(thisMatrix%comm,rank,mpierr)
  CALL MPI_Comm_size(thisMatrix%comm,nproc,mpierr)
  ALLOCATE(nnzPerChunk(nProc))
  ALLOCATE(nBand(nProc))
  ALLOCATE(nRecv(nProc))
  nnzPerChunk=0
  DO i=1,thisMatrix%nLocal
    DO j=1,nproc
@@ -804,45 +800,76 @@ SUBROUTINE assemble_DistributedBandedMatrixType(thisMatrix,ierr)
      CALL thisMatrix%chunks(i)%assemble()
      CALL bandedPList%clear()
    ENDIF
    ! Set nBandLocal (to be gathered into nBand)

    ! Decide the sending method to use. 'Full' chunks will send their whole
    ! contents while 'sparse' chunks will send band products.
    ! Set nTransmit to the number of REALs this processor will send to
    ! rank i-1:
    !  >0: This many
    !   0: None (chunk empty)
    !  -1: All
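    !  e.g. with nproc=3, the gather below might leave rank 0 holding
    !  nRecv=(/0,-1,4/): rank 1 sends its whole dense chunk product, and
    !  rank 2 sends 4 gathered band products whose destination rows come
    !  from incIdxMap (numbers invented for illustration)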
    nTransmit=0
    IF(thisMatrix%chunks(i)%isInit) THEN
      IF(2*thisMatrix%chunks(i)%nnz/3 < thisMatrix%chunks(i)%n) THEN
        nBandLocal=SIZE(thisMatrix%chunks(i)%bandIdx)
        DO j=1,SIZE(thisMatrix%chunks(i)%bandIdx)
          nTransmit=nTransmit+SIZE(thisMatrix%chunks(i)%bands(j)%jIdx)
        ENDDO
      ELSE
        nBandLocal=-1
        nTransmit=-1
      ENDIF
    ELSE
      nBandLocal=0
    ENDIF
    ! This info must be disseminated through all procs

    ! Gather to rank i-1 into nBand(:)
    CALL MPI_Gather(nBandLocal,1,MPI_INTEGER,nband(1),1,MPI_INTEGER,i-1, &
    ! Gather to rank i-1 into nRecv(:)
    CALL MPI_Gather(nTransmit,1,MPI_INTEGER,nRecv,1,MPI_INTEGER,i-1, &
        thisMatrix%comm,mpierr)
    ! Have rank i-1 allocate
    IF(rank == i-1) THEN
      nTransmit=0
      DO j=1,SIZE(nRecv)
        IF (nRecv(j) > 0) THEN
          nTransmit=nTransmit+nRecv(j)
        ENDIF
      ENDDO
      ALLOCATE(thisMatrix%incIdxMap(nTransmit))
      ALLOCATE(thisMatrix%incIdxStt(nProc))
      ALLOCATE(thisMatrix%incIdxStp(nProc))
      thisMatrix%incIdxStt=0
      thisMatrix%incIdxStp=0
      DO j=1,nProc
        IF(j /= i) THEN
          IF(nBand(j) > 0) THEN
            ALLOCATE(thisMatrix%bandSizes(j)%p(nBand(j)))
            CALL MPI_Recv(thisMatrix%bandSizes(j)%p(1),nBand(j),MPI_INTEGER, &
                j-1, 0, thisMatrix%comm,MPI_STATUS_IGNORE, mpierr)
        ! Use nTransmit as counter
        nTransmit=1
        IF(j == i) THEN
          thisMatrix%incIdxStt(j)=-1
          thisMatrix%incIdxStp(j)=-1
        ELSE
            IF(nBand(j) /= 0) THEN
              ALLOCATE(thisMatrix%bandSizes(j)%p(1))
              thisMatrix%bandSizes(j)%p(1)=-1
            ENDIF
          IF(nRecv(j) > 0) THEN
            thisMatrix%incIdxStt(j)=nTransmit
            thisMatrix%incIdxStp(j)=nTransmit+nRecv(j)-1
            ! Receive the index map now, so no indices need to be sent
            ! at matvec time
            CALL MPI_Recv(thisMatrix%incIdxMap(nTransmit),nRecv(j),MPI_INTEGER, &
                j-1,0,thisMatrix%comm,MPI_STATUS_IGNORE, mpierr)
            nTransmit=nTransmit+nRecv(j)
          ELSE IF (nRecv(j) == -1) THEN
            thisMatrix%incIdxStt(j)=-1
            thisMatrix%incIdxStp(j)=-1
          ENDIF
        ENDIF
      ENDDO
    ELSE
      IF(nBandLocal > 0) THEN
        ALLOCATE(bandSizeTmp(nBandLocal))
        DO j=1,nBandLocal
          bandSizeTmp(j)=SIZE(thisMatrix%chunks(i)%bands(j)%elem)
      ! Build the index map
      IF(nTransmit > 0) THEN
        ALLOCATE(idxMapTmp(nTransmit))
        stt=1
        DO j=1,SIZE(thisMatrix%chunks(i)%bandIdx)
          stp=stt+SIZE(thisMatrix%chunks(i)%bands(j)%jIdx)-1
          idxMapTmp(stt:stp)=thisMatrix%chunks(i)%bands(j)%jIdx-thisMatrix%chunks(i)%bandIdx(j)
          stt=stp+1
        ENDDO
        CALL MPI_Send(bandSizeTmp(:),nBandLocal,MPI_INTEGER,i-1,0, &
        CALL MPI_Send(idxMapTmp,nTransmit,MPI_INTEGER,i-1,0, &
            thisMatrix%comm,mpierr)
        DEALLOCATE(bandSizeTmp)
        DEALLOCATE(idxMapTmp)
      ENDIF
    ENDIF
  ENDDO
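
The handshake above, reduced to a standalone sketch for a single destination rank (rank 0 stands in for rank i-1): counts are gathered, the root filters out dense (-1) and empty (0) senders, then receives each positive sender's index map into a contiguous slice. Counts and index contents are invented; only an MPI installation is assumed.

PROGRAM assemble_exchange_sketch
  USE mpi
  IMPLICIT NONE
  INTEGER :: rank,nproc,ierr,j,off,nTot,nTransmit
  INTEGER,ALLOCATABLE :: nRecv(:),idxMapTmp(:),incIdxMap(:)
  INTEGER,ALLOCATABLE :: incIdxStt(:),incIdxStp(:)
  CALL MPI_Init(ierr)
  CALL MPI_Comm_rank(MPI_COMM_WORLD,rank,ierr)
  CALL MPI_Comm_size(MPI_COMM_WORLD,nproc,ierr)
  ! Invented workload: rank r pretends to hold r sparse band products
  ! destined for rank 0 (a stand-in for the jIdx-bandIdx offsets above)
  nTransmit=rank
  ALLOCATE(nRecv(nproc))
  nRecv=0
  CALL MPI_Gather(nTransmit,1,MPI_INTEGER,nRecv,1,MPI_INTEGER,0, &
      MPI_COMM_WORLD,ierr)
  IF(rank == 0) THEN
    nTot=0
    DO j=1,nproc
      IF(nRecv(j) > 0) nTot=nTot+nRecv(j)
    ENDDO
    ALLOCATE(incIdxMap(nTot),incIdxStt(nproc),incIdxStp(nproc))
    incIdxStt=0
    incIdxStp=0
    off=1
    DO j=2,nproc   ! j=1 is this rank itself; nothing to receive
      IF(nRecv(j) > 0) THEN
        incIdxStt(j)=off
        incIdxStp(j)=off+nRecv(j)-1
        CALL MPI_Recv(incIdxMap(off),nRecv(j),MPI_INTEGER,j-1,0, &
            MPI_COMM_WORLD,MPI_STATUS_IGNORE,ierr)
        off=off+nRecv(j)
      ENDIF
    ENDDO
    WRITE(*,*) 'incIdxMap:',incIdxMap
  ELSEIF(nTransmit > 0) THEN
    ALLOCATE(idxMapTmp(nTransmit))
    DO j=1,nTransmit
      idxMapTmp(j)=j
    ENDDO
    CALL MPI_Send(idxMapTmp,nTransmit,MPI_INTEGER,0,0,MPI_COMM_WORLD,ierr)
  ENDIF
  CALL MPI_Finalize(ierr)
END PROGRAM assemble_exchange_sketch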
@@ -1039,12 +1066,9 @@ SUBROUTINE clear_DistributedBandedMatrixType(matrix)
    ENDDO
    DEALLOCATE(matrix%chunks)
  ENDIF
  IF(ALLOCATED(matrix%bandSizes)) THEN
    DO i=1,SIZE(matrix%bandSizes)
      IF(ALLOCATED(matrix%bandSizes(i)%p)) DEALLOCATE(matrix%bandSizes(i)%p)
    ENDDO
    DEALLOCATE(matrix%bandSizes)
  ENDIF
  IF(ALLOCATED(matrix%incIdxMap)) DEALLOCATE(matrix%incIdxMap)
  IF(ALLOCATED(matrix%incIdxStt)) DEALLOCATE(matrix%incIdxStt)
  IF(ALLOCATED(matrix%incIdxStp)) DEALLOCATE(matrix%incIdxStp)
  IF(ALLOCATED(matrix%iOffsets)) DEALLOCATE(matrix%iOffsets)
  IF(ALLOCATED(matrix%jOffsets)) DEALLOCATE(matrix%jOffsets)
  IF(ALLOCATED(matrix%iTmp)) DEALLOCATE(matrix%iTmp)
+6 −0
@@ -420,6 +420,9 @@ SUBROUTINE VectorResembleAllocScal(dest, source, inparams)
    ENDIF
#ifdef HAVE_MPI
  TYPE IS(NativeDistributedVectorType)
    IF(.NOT. params%has("VectorType->nlocal")) THEN
      CALL params%add("VectorType->nlocal",SIZE(source%b))
    ENDIF
    IF(.NOT. params%has("VectorType->chunkSize")) THEN
      CALL params%add("VectorType->chunkSize",source%chunkSize)
    ENDIF
@@ -497,6 +500,9 @@ SUBROUTINE VectorResembleAllocArray(dest, source, nvec, inparams)
    ENDIF
#ifdef HAVE_MPI
  TYPE IS(NativeDistributedVectorType)
    IF(.NOT. params%has("VectorType->nlocal")) THEN
      CALL params%add("VectorType->nlocal",SIZE(source%b))
    ENDIF
    IF(.NOT. params%has("VectorType->chunkSize")) THEN
      CALL params%add("VectorType->chunkSize",source%chunkSize)
    ENDIF