Unverified Commit ef0deee4 authored by Kabelitz, Matthew Edward's avatar Kabelitz, Matthew Edward Committed by GitHub
Browse files

Native SpMV Kernel Update and nnz changes (#316)

* Native matrices now determine indices for data transfer at assemble-time

* Changed to Waitany for nonblocking

* Moved local computation to end

* Updated NNZ to be compatible with PETSc o/dnnz parameters

* Resolved CI python version mismatch

* Fixed a bug where too much memory was being allocated by Native matrix init
parent 81fdbc2c
Loading
Loading
Loading
Loading
+136 −140
Original line number Diff line number Diff line
@@ -111,7 +111,7 @@ PUBLIC :: BandedMatrixType
PUBLIC :: DistributedBandedMatrixType
PUBLIC :: DistributedBlockBandedMatrixType
PUBLIC :: SparseMatrixType
INTEGER(SIK),PARAMETER,PUBLIC :: MATVEC_SLOTS=4
INTEGER(SIK),PARAMETER,PUBLIC :: MATVEC_SLOTS=10
! PETSc implementations
#ifdef FUTILITY_HAVE_PETSC
PUBLIC :: PETScMatrixType
@@ -698,18 +698,17 @@ SUBROUTINE matvec_DistrBandedMatrixType(thisMatrix,x,y,t,ul,d,incx,a,b)
  CHARACTER(LEN=1),INTENT(IN) :: ul
  CHARACTER(LEN=1),INTENT(IN) :: d
  INTEGER(SIK),INTENT(IN) :: incx
  INTEGER(SIK) :: rank,k
  INTEGER(SIK),ALLOCATABLE :: recvIdx(:,:), sendIdx(:,:)
  INTEGER(SIK) :: rank,k,lowIdx,highIdx
  REAL(SRK),ALLOCATABLE :: recvResult(:,:),sendResult(:,:),tmpProduct(:)
  INTEGER(SIK) :: sendRequests(MATVEC_SLOTS),sendIdxRequests(MATVEC_SLOTS)
  INTEGER(SIK) :: recvRequests(MATVEC_SLOTS),recvIdxRequests(MATVEC_SLOTS)
  INTEGER(SIK) :: lowIdx,highIdx,sendCounter,recvCounter
#ifdef HAVE_MPI
  ! Get rank
  INTEGER(SIK) :: sendRequests(MATVEC_SLOTS),recvRequests(MATVEC_SLOTS)
  INTEGER(SIK) :: ctRecv(MATVEC_SLOTS),srcRank,destRank
  INTEGER(SIK) :: i,idxTmp,cnt,ctDefault,mpierr,nproc
  INTEGER(SIK) :: i,idxTmp,ctDefault,mpierr,nproc
  CALL MPI_Comm_rank(thisMatrix%comm,rank,mpierr)
  CALL MPI_Comm_Size(thisMatrix%comm,nProc,mpierr)
  sendRequests=MPI_REQUEST_NULL
  recvRequests=MPI_REQUEST_NULL
#else
  rank=0
#endif
@@ -719,55 +718,13 @@ SUBROUTINE matvec_DistrBandedMatrixType(thisMatrix,x,y,t,ul,d,incx,a,b)
  ! This allows one to be used for computation and the rest for
  ! communication.
  ! Which one to write to will be determined by *Counter MOD MATVEC_SLOTS
  sendCounter=0
  recvCounter=0
  sendRequests=0
  sendIdxRequests=0
  recvRequests=0
  recvIdxRequests=0
  ! The recv/sendResult array will sometimes hold its full length,
  ! and sometimes less.
  ! The variable "cnt" will be used to determine this at
  ! each iteration
  ALLOCATE(recvIdx(thisMatrix%iOffsets(2),MATVEC_SLOTS))
  ALLOCATE(recvResult(thisMatrix%iOffsets(2),MATVEC_SLOTS))
  ALLOCATE(sendIdx(thisMatrix%iOffsets(2),MATVEC_SLOTS))
  ALLOCATE(sendResult(thisMatrix%iOffsets(2),MATVEC_SLOTS))
  ALLOCATE(tmpProduct(thisMatrix%iOffsets(rank+2)- &
      thisMatrix%iOffsets(rank+1)))

  ! First, take care of locally held data.
  SELECT TYPE(thisMatrix)
  TYPE IS(DistributedBlockBandedMatrixType)
    IF(.NOT. thisMatrix%blockMask) THEN
      DO k=1,thisMatrix%nlocalBlocks
        lowIdx=(k-1)*thisMatrix%blockSize+1
        highIdx=lowIdx-1+thisMatrix%blockSize
        CALL matvec_MatrixType(THISMATRIX=thisMatrix%blocks(k),X=x(lowIdx:highIdx), &
            Y=tmpProduct(lowIdx:highIdx),ALPHA=1.0_SRK,BETA=0.0_SRK)
      ENDDO
      IF(thisMatrix%chunks(rank+1)%isInit) THEN
        CALL BLAS_matvec(THISMATRIX=thisMatrix%chunks(rank+1),X=x, &
            y=tmpProduct,ALPHA=1.0_SRK,BETA=1.0_SRK)
      ENDIF
    ELSE
      IF(thisMatrix%chunks(rank+1)%isInit) THEN
        CALL BLAS_matvec(THISMATRIX=thisMatrix%chunks(rank+1),X=x, &
            y=tmpProduct,ALPHA=1.0_SRK,BETA=0.0_SRK)
      ELSE
  tmpProduct=0.0_SRK
      ENDIF
    ENDIF
  TYPE IS(DistributedBandedMatrixType)
    IF(thisMatrix%chunks(rank+1)%isInit) THEN
      CALL BLAS_matvec(THISMATRIX=thisMatrix%chunks(rank+1),X=x, &
          y=tmpProduct,ALPHA=1.0_SRK,BETA=0.0_SRK)
    ELSE
      tmpProduct=0.0_SRK
    ENDIF
  CLASS DEFAULT
    tmpProduct = 0.0_SRK
  ENDSELECT

#ifdef HAVE_MPI
  ! On each rank, loop over the chunks held (on diagonal moving down)
@@ -778,79 +735,82 @@ SUBROUTINE matvec_DistrBandedMatrixType(thisMatrix,x,y,t,ul,d,incx,a,b)

    ! First do local computation and send
    IF (thismatrix%chunks(destRank+1)%isInit) THEN

      ! Decide whether to send whole vector or multiple sparse
      IF (2*thisMatrix%chunks(destRank+1)%nnz/3 >= thisMatrix%chunks(destRank+1)%n) THEN
        sendCounter=sendCounter+1
        idxTmp=MOD(sendCounter,MATVEC_SLOTS)+1
      IF (thisMatrix%chunks(destRank+1)%nnz >= thisMatrix%chunks(destRank+1)%n) THEN
        ! Check if we can safely write to sendRequests
        CALL pop_send(sendResult,sendIdx,idxTmp,sendRequests,sendIdxRequests)
        CALL pop_send(sendResult,idxTmp,sendRequests)
        CALL BLAS_matvec(THISMATRIX=thisMatrix%chunks(destRank+1),X=x,&
            y=sendResult(1:ctDefault,idxTmp),ALPHA=1.0_SRK,BETA=0.0_SRK)
        CALL MPI_ISend(sendResult(1:ctDefault,idxTmp),ctDefault, &
            MPI_DOUBLE_PRECISION,destRank,0,thisMatrix%comm,sendRequests(idxTmp),mpierr)
      ELSE
        ! Send several sparse vectors
        ! Gather and send several sparse vectors
        CALL pop_send(sendResult,idxTmp,sendRequests)
        lowIdx=1
        DO k=1,SIZE(thisMatrix%chunks(destRank+1)%bandIdx)
          sendCounter=sendCounter+1
          idxTmp=MOD(sendCounter,MATVEC_SLOTS)+1
          CALL pop_send(sendResult,sendIdx,idxTmp,sendRequests,sendIdxRequests)

          ! compute destination indices
          ! perform special-case multiplication from single band here
          cnt=SIZE(thisMatrix%chunks(destRank+1)%bands(k)%jIdx)
          sendIdx(1:cnt,idxTmp)=thisMatrix%chunks(destRank+1)%bands(k)%jIdx &
              -thisMatrix%chunks(destRank+1)%bandIdx(k)
          CALL MPI_ISend(sendIdx(1:cnt,idxTmp),cnt,MPI_INTEGER,destRank,0, &
              thisMatrix%comm,sendIdxRequests(idxTmp), mpierr)
          sendResult(1:cnt,idxTmp)=thisMatrix%chunks(destRank+1)%bands(k)%elem &
          highidx=lowIdx+SIZE(thisMatrix%chunks(destRank+1)%bands(k)%jIdx)-1
          sendResult(lowIdx:highIdx,idxTmp)=thisMatrix%chunks(destRank+1)%bands(k)%elem &
              *x(thisMatrix%chunks(destRank+1)%bands(k)%jIdx)
          CALL MPI_ISend(sendResult(1:cnt,idxTmp),cnt,MPI_DOUBLE_PRECISION, &
              destRank,0,thisMatrix%comm,sendRequests(idxTmp), mpierr)
          lowIdx=highIdx+1
        ENDDO
        CALL MPI_ISend(sendResult(1:highIdx,idxTmp),highIdx,MPI_DOUBLE_PRECISION, &
            destRank,0,thisMatrix%comm,sendRequests(idxTmp), mpierr)
      ENDIF
    ENDIF
    ! We might receive data from rank MOD(rank-i,nproc)
    srcRank = MODULO(rank-i,nProc)
    IF (ALLOCATED(thisMatrix%bandSizes(srcRank+1)%p)) THEN
      IF (thisMatrix%bandSizes(srcRank+1)%p(1) < 0) THEN
    IF (thisMatrix%incIdxStt(srcRank+1) == -1) THEN
      ! We are receiving a whole vector at once
        recvCounter=recvCounter+1
        idxTmp=MOD(recvCounter,MATVEC_SLOTS)+1
      ! If we've filled up the available storage, we need
      ! to wait for communication to finish up
        CALL pop_recv(tmpProduct,recvResult,recvIdx,ctRecv,idxTmp, &
            recvRequests, recvIdxRequests)
        ctRecv(MOD(recvCounter,MATVEC_SLOTS)+1)=ctDefault
        CALL MPI_IRecv(recvResult(1:ctDefault,idxTmp), ctDefault, &
      CALL pop_recv(tmpProduct,recvResult,thisMatrix, &
          ctRecv,idxTmp,recvRequests)
      ctRecv(idxTmp)=SIZE(tmpProduct)
      CALL MPI_IRecv(recvResult(1:ctRecv(idxTmp),idxTmp), ctRecv(idxTmp), &
          MPI_DOUBLE_PRECISION,srcRank,0,thisMatrix%comm, &
          recvRequests(idxTmp),mpierr)
      ELSE
        ! We are receiving multiple sparse chunks
        DO k=1,SIZE(thisMatrix%bandSizes(srcRank+1)%p)
          recvCounter=recvCounter+1
          idxTmp=MOD(recvCounter,MATVEC_SLOTS)+1
          ! If we've filled up the available storage, we need
          ! to wait for communication to finish up
          CALL pop_recv(tmpProduct,recvResult,recvIdx,ctRecv,idxTmp, &
              recvRequests, recvIdxRequests)
          ctRecv(idxTmp)=-thisMatrix%bandSizes(srcRank+1)%p(k)
          CALL MPI_IRecv(recvIdx(1:ctRecv(idxTmp),idxTmp),-ctRecv(idxTmp), &
              MPI_INTEGER,srcRank,0,thisMatrix%comm,recvIdxRequests(idxTmp), mpierr)
          CALL MPI_IRecv(recvResult(1:ctRecv(idxTmp),idxTmp),-ctRecv(idxTmp), &
    ELSEIF (thisMatrix%incIdxStt(srcRank+1) > 0) THEN
      ! We are receiving multiple sparse chunks gathered together
      CALL pop_recv(tmpProduct,recvResult,thisMatrix, &
          ctRecv,idxTmp,recvRequests)
      ctRecv(idxTmp)= -srcRank-1
      lowIdx=thisMatrix%incIdxStt(srcRank+1)
      highIdx=thisMatrix%incIdxStp(srcRank+1)
      CALL MPI_IRecv(recvResult(1:(highIdx-lowIdx+1),idxTmp),highIdx-lowIdx+1, &
          MPI_DOUBLE_PRECISION,srcRank,0,thisMatrix%comm,recvRequests(idxTmp), mpierr)
        ENDDO
    ENDIF
  ENDDO
#endif
  ! Now, take care of locally held data.
  SELECT TYPE(thisMatrix)
  TYPE IS(DistributedBlockBandedMatrixType)
    IF(thisMatrix%chunks(rank+1)%isInit) THEN
      CALL BLAS_matvec(THISMATRIX=thisMatrix%chunks(rank+1),X=x, &
          y=tmpProduct,ALPHA=1.0_SRK,BETA=1.0_SRK)
    ENDIF
    IF(.NOT. thisMatrix%blockMask) THEN
      DO k=1,thisMatrix%nlocalBlocks
        lowIdx=(k-1)*thisMatrix%blockSize+1
        highIdx=lowIdx-1+thisMatrix%blockSize
        CALL matvec_MatrixType(THISMATRIX=thisMatrix%blocks(k),X=x(lowIdx:highIdx), &
            Y=tmpProduct(lowIdx:highIdx),ALPHA=1.0_SRK,BETA=1.0_SRK)
      ENDDO
  ! We've finished calling irecv. Wait for remaining
  ! requests to finish:
  DO idxTmp=1,MATVEC_SLOTS
    CALL pop_recv(tmpProduct, recvResult, recvIdx, ctRecv, idxTmp, &
        recvRequests, recvIdxRequests)
    ENDIF
  TYPE IS(DistributedBandedMatrixType)
    IF(thisMatrix%chunks(rank+1)%isInit) THEN
      CALL BLAS_matvec(THISMATRIX=thisMatrix%chunks(rank+1),X=x, &
          y=tmpProduct,ALPHA=1.0_SRK,BETA=1.0_SRK)
    ENDIF
  ENDSELECT
#if HAVE_MPI
  ! Wait for remaining requests to finish:
  DO k=1,MATVEC_SLOTS
    idxTmp=k
    CALL pop_recv(tmpProduct,recvResult,thisMatrix,ctRecv,idxTmp,recvRequests,.TRUE.)
  ENDDO
  DO idxTmp=1,MATVEC_SLOTS
    CALL pop_send(sendResult,sendIdx,idxTmp,sendRequests,sendIdxRequests)
  DO k=1,MATVEC_SLOTS
    idxTmp=k
    CALL pop_send(sendResult,idxTmp,sendRequests,.TRUE.)
  ENDDO
#endif

@@ -904,66 +864,102 @@ ENDSUBROUTINE matvec_DistrBandedMatrixType
!> @brief Helper routine that handles the movement of data out of the
!>        receiving buffer. This buffer has multiple "slots" that are either
!>        being worked on directly or receiving data from MPI in background.
!>        Empty slots are denoted with MPI_REQUEST_NULL
!> @param acc The accumulating vector
!> @param valBuf The recv buffer for values
!> @param thisMat The matrixtype used in multiplication
!> @param ctBuf Count of elements in each buffer slot
!> @param idxBuf The recv buffer for indices
!> @param idx The buffer slot to pop
!> @param idx The empty buffer slot that was popped
!> @param req The array of active requests for data
!> @param idxReq The array of requests for indices
!> @param f Optional bool to flush all requests
!>
SUBROUTINE pop_recv(acc,valBuf,idxBuf,ctBuf,idx,req,idxReq)
SUBROUTINE pop_recv(acc,valBuf,thisMat,ctBuf,idx,req,f)
  !> Local vector data
  REAL(SRK), INTENT(INOUT) :: acc(:)
  !> List of buffers
  REAL(SRK), INTENT(INOUT) :: valBuf(:,:)
  !> Matrix used in SpMV
  CLASS(DistributedBandedMatrixType),INTENT(INOUT) :: thisMat
  !> Array of buffer sizes
  INTEGER(SIK), INTENT(INOUT) :: ctBuf(MATVEC_SLOTS)
  INTEGER(SIK), INTENT(INOUT) :: idxBuf(:,:)
  INTEGER(SIK), INTENT(IN) :: idx
  !> The buffer slot popped
  INTEGER(SIK), INTENT(OUT) :: idx
  !> List of MPI Requests
  INTEGER(SIK), INTENT(INOUT) :: req(MATVEC_SLOTS)
  INTEGER(SIK), INTENT(INOUT) :: idxReq(MATVEC_SLOTS)
  INTEGER(SIK) :: mpierr

  IF(req(idx) == 0 .AND. idxReq(idx) == 0) RETURN
  !> Optional argument to flush all buffers
  LOGICAL(SBK),OPTIONAL,INTENT(IN) :: f
  INTEGER(SIK) :: stt,stp,mpierr,i,rank
  LOGICAL(SBK) :: force
  CALL MPI_Comm_Rank(thisMat%comm,rank,mpierr)

  force=.FALSE.
  IF (PRESENT(f)) force=f
  IF (force) THEN
    IF (req(idx) == MPI_REQUEST_NULL) RETURN
    CALL MPI_Wait(req(idx),MPI_STATUS_IGNORE,mpierr)
  ELSE
    IF (ANY(req == MPI_REQUEST_NULL)) THEN
      DO i=1,SIZE(req)
        IF (req(i) == MPI_REQUEST_NULL) THEN
          idx=i
          RETURN
        ENDIF
      ENDDO
    ELSE
      CALL MPI_WaitAny(SIZE(req),req,idx,MPI_STATUS_IGNORE,mpierr)
    ENDIF
  ENDIF

  IF(ctBuf(idx) > 0) THEN
    acc(1:ctBuf(idx))=acc(1:ctBuf(idx))+valBuf(1:ctBuf(idx),idx)
  ELSE
    CALL MPI_Wait(idxReq(idx),MPI_STATUS_IGNORE,mpierr)
    idxReq(idx)=0
    acc(idxBuf(1:-ctBuf(idx),idx))=acc(idxBuf(1:-ctBuf(idx),idx)) &
        +valBuf(1:-ctBuf(idx),idx)
    stt=thisMat%incIdxStt(-ctBuf(idx))
    stp=thisMat%incIdxStp(-ctBuf(idx))
    DO i=stt,stp
      acc(thisMat%incIdxMap(i))=acc(thisMat%incIdxMap(i))+valBuf(i-stt+1,idx)
    ENDDO
  ENDIF
ENDSUBROUTINE pop_recv

!
!-------------------------------------------------------------------------------
!> @brief Helper routine that keeps of data in the send buffer while
!> @brief Helper routine that keeps track of data in the send buffer while
!>        MPI is still working. This buffer has multiple "slots" that are
!>        either being worked on directly or contain data being sent by
!>        MPI in background.
!>        MPI in background. Empty slots are denoted with MPI_REQUEST_NULL
!> @param valBuf The send buffer for values
!> @param idxBuf The send buffer for indices
!> @param idx The buffer slot to pop
!> @param idx The buffer slot popped
!> @param req The array of requests for data
!> @param idxReq The array of requests for indices
!> @param f Optional bool to flush all requests
!>
SUBROUTINE pop_send(valBuf, idxBuf, idx, req, idxReq)
SUBROUTINE pop_send(valBuf,idx,req,f)
  !> Set of data buffers
  REAL(SRK), INTENT(INOUT) :: valBuf(:,:)
  !> The send buffer for indices
  INTEGER(SIK), INTENT(INOUT) :: idxBuf(:,:)
  !> The buffer slot to pop
  INTEGER(SIK), INTENT(IN) :: idx
  !> The array of requests for data
  !> The buffer slot popped
  INTEGER(SIK), INTENT(OUT) :: idx
  !> The array of MPI requests
  INTEGER(SIK), INTENT(INOUT) :: req(MATVEC_SLOTS)
  !> The array of requests for indices
  INTEGER(SIK), INTENT(INOUT) :: idxReq(MATVEC_SLOTS)
  INTEGER(SIK) :: mpierr

  IF(req(idx) == 0 .AND. idxReq(idx) == 0) RETURN
  !> Optional argument to flush all buffers
  LOGICAL(SBK),OPTIONAL,INTENT(IN) :: f
  INTEGER(SIK) :: mpierr,i
  LOGICAL(SBK) :: force

  force=.FALSE.
  IF (PRESENT(f)) force=f
  IF (force) THEN
    IF (req(idx) == MPI_REQUEST_NULL) RETURN
    CALL MPI_Wait(req(idx),MPI_STATUS_IGNORE,mpierr)
  IF(idxReq(idx) /= 0) THEN
    CALL MPI_Wait(idxReq(idx),MPI_STATUS_IGNORE,mpierr)
    idxReq(idx)=0
  ELSE
    IF (ANY(req == MPI_REQUEST_NULL)) THEN
      DO i=1,SIZE(req)
        IF (req(i) == MPI_REQUEST_NULL) THEN
          idx=i
          RETURN
        ENDIF
      ENDDO
    ELSE
      CALL MPI_WaitAny(SIZE(req),req,idx,MPI_STATUS_IGNORE,mpierr)
    ENDIF
  ENDIF
ENDSUBROUTINE pop_send
#endif
+88 −48
Original line number Diff line number Diff line
@@ -132,11 +132,6 @@ TYPE Band
  REAL(SRK), ALLOCATABLE :: elem(:)
ENDTYPE Band

!> @brief Integer array used for 'ragged' storage of band contents
TYPE IntPtr
  INTEGER(SIK),ALLOCATABLE :: p(:)
ENDTYPE IntPtr

!> @brief The basic banded matrix type
TYPE,EXTENDS(MatrixType) :: BandedMatrixType
  !> Map of band indices stored (-m to n)
@@ -194,13 +189,15 @@ TYPE,EXTENDS(DistributedMatrixType) :: DistributedBandedMatrixType
  !> The column of banded matrix 'chunks' stored locally
  TYPE(BandedMatrixType),ALLOCATABLE :: chunks(:)
  !> Number of nonzero elements
  INTEGER(SIK) :: nnz
  INTEGER(SLK) :: nnz
  !> Number of columns
  INTEGER(SIK) :: m
  !> Block size (smallest indivisble unit)
  INTEGER(SIK) :: blockSize
  !> Array of band sizes used to determine optimal communication
  TYPE(IntPtr), ALLOCATABLE :: bandSizes(:)
  INTEGER(SIK), ALLOCATABLE :: incIdxMap(:)
  INTEGER(SIK), ALLOCATABLE :: incIdxStt(:)
  INTEGER(SIK), ALLOCATABLE :: incIdxStp(:)
  !> Temporary containers used before (and deallocated after) assembly
  INTEGER(SIK), ALLOCATABLE :: iTmp(:),jTmp(:)
  REAL(SRK),ALLOCATABLE :: elemTmp(:)
@@ -482,7 +479,9 @@ SUBROUTINE init_DistributedBandedMatrixParam(matrix,Params)
#ifdef HAVE_MPI
  CHARACTER(LEN=*),PARAMETER :: myName='init_DistributedBandedMatrixParam'
  TYPE(ParamType) :: validParams
  INTEGER(SIK) :: n,m,nnz,commID,rank,mpierr,nproc,i,blocksize,nlocal
  INTEGER(SIK),ALLOCATABLE :: onnz(:),dnnz(:)
  INTEGER(SIK) :: n,m,commID,rank,mpierr,nproc,i,blocksize,nlocal,nnz_int
  INTEGER(SLK) :: nnz

  !Check to set up required and optional param lists.
  IF(.NOT.MatrixType_Paramsflag) CALL MatrixTypes_Declare_ValidParams()
@@ -496,10 +495,22 @@ SUBROUTINE init_DistributedBandedMatrixParam(matrix,Params)
  CALL validParams%get('MatrixType->n',n)
  CALL validParams%get('MatrixType->m',m)
  CALL validParams%get('MatrixType->MPI_Comm_ID',commID)
  CALL validParams%get('MatrixType->nnz',nnz)
  CALL validParams%get('MatrixType->nnz',nnz_int)
  CALL validParams%get('MatrixType->blockSize',blockSize)
  CALL validParams%get('MatrixType->nlocal',nlocal)
  CALL validParams%clear()
  nnz=INT(nnz_int,KIND=8)
  IF(nnz_int <= 1) THEN
    IF (validParams%has("MatrixType->dnnz") .AND. validParams%has("MatrixType->onnz")) THEN
      nnz=0_SLK
      CALL validParams%get("MatrixType->onnz",onnz)
      CALL validParams%get("MatrixType->dnnz",dnnz)
      DO i=1,SIZE(onnz)
        nnz=nnz+INT(onnz(i),KIND=SLK)
        nnz=nnz+INT(dnnz(i),KIND=SLK)
      ENDDO
      CALL MPI_AllReduce(nnz,MPI_IN_PLACE,1,MPI_LONG,MPI_SUM,commID,mpierr)
    ENDIF
  ENDIF

  IF(.NOT. matrix%isInit) THEN
    IF(n <= 1) THEN
@@ -575,12 +586,12 @@ SUBROUTINE init_DistributedBandedMatrixParam(matrix,Params)
      matrix%nnz=nnz
      matrix%blockSize = blockSize
      matrix%comm=commID
      ALLOCATE(matrix%bandSizes(nProc))
    ENDIF
  ELSE
    CALL eMatrixType%raiseError('Incorrect call to '// &
        modName//'::'//myName//' - MatrixType already initialized')
  ENDIF
  CALL validParams%clear()
#endif
ENDSUBROUTINE init_DistributedBandedMatrixParam

@@ -690,10 +701,10 @@ SUBROUTINE assemble_BandedMatrixType(thisMatrix)
  ALLOCATE(idxOrig(thisMatrix%counter))
  IF (thisMatrix%isAssembled) RETURN
  DO i=1,thisMatrix%counter
    iLong=INT(thisMatrix%iTmp(i),kind=SLK)
    jLong=INT(thisMatrix%jTmp(i),kind=SLK)
    nLong=INT(thisMatrix%n,kind=SLK)
    mLong=INT(thisMatrix%m,kind=SLK)
    iLong=INT(thisMatrix%iTmp(i),KIND=SLK)
    jLong=INT(thisMatrix%jTmp(i),KIND=SLK)
    nLong=INT(thisMatrix%n,KIND=SLK)
    mLong=INT(thisMatrix%m,KIND=SLK)
    idxOrig(i)=i
    ! The diagonal rank counts upwards as one moves southeast in the matrix,
    ! starting at a large negative number in the bottom left, and reaching
@@ -764,14 +775,14 @@ SUBROUTINE assemble_DistributedBandedMatrixType(thisMatrix,ierr)
  INTEGER(SIK),INTENT(OUT),OPTIONAL :: ierr
#ifdef HAVE_MPI
  TYPE(ParamType) :: bandedPList
  INTEGER(SIK) :: mpierr, rank, nproc, i, j, nBandLocal
  INTEGER(SIK),ALLOCATABLE :: nnzPerChunk(:),nband(:),bandSizeTmp(:)
  INTEGER(SIK) :: mpierr,rank,nproc,i,j,nTransmit,stt,stp
  INTEGER(SIK),ALLOCATABLE :: nnzPerChunk(:),nRecv(:),idxMapTmp(:)

  REQUIRE(thisMatrix%isInit)
  CALL MPI_Comm_rank(thisMatrix%comm,rank,mpierr)
  CALL MPI_Comm_size(thisMatrix%comm,nproc,mpierr)
  ALLOCATE(nnzPerChunk(nProc))
  ALLOCATE(nBand(nProc))
  ALLOCATE(nRecv(nProc))
  nnzPerChunk=0
  DO i=1,thisMatrix%nLocal
    DO j=1,nproc
@@ -804,45 +815,77 @@ SUBROUTINE assemble_DistributedBandedMatrixType(thisMatrix,ierr)
      CALL thisMatrix%chunks(i)%assemble()
      CALL bandedPList%clear()
    ENDIF
    ! Set nbandlocal (to be gathered into nband

    ! Decide the sending method to use. 'Full' chunks will send their whole
    ! contents while 'sparse' chunks will send band products.
    ! Set nTransmit to the number of REAL's to be sent from that processor to
    ! this one.
    !  >0: This many
    !   0: None (chunk empty)
    !  -1: All
    nTransmit=0
    IF(thisMatrix%chunks(i)%isInit) THEN
      IF(2*thisMatrix%chunks(i)%nnz/3 < thisMatrix%chunks(i)%n) THEN
        nBandLocal=SIZE(thisMatrix%chunks(i)%bandIdx)
      IF(thisMatrix%chunks(i)%nnz < thisMatrix%chunks(i)%n) THEN
        DO j=1,SIZE(thisMatrix%chunks(i)%bandIdx)
          nTransmit=nTransmit+SIZE(thisMatrix%chunks(i)%bands(j)%jIdx)
        ENDDO
      ELSE
        nBandLocal=-1
        nTransmit=-1
      ENDIF
    ELSE
      nBandLocal=0
    ENDIF
    ! This info must be disseminated through all procs

    ! Gather to rank i-1 into nBand(:)
    CALL MPI_Gather(nBandLocal,1,MPI_INTEGER,nband(1),1,MPI_INTEGER,i-1, &
    ! Gather to rank i-1 into nRecv(:)
    CALL MPI_Gather(nTransmit,1,MPI_INTEGER,nRecv,1,MPI_INTEGER,i-1, &
        thisMatrix%comm,mpierr)
    ! Have rank i-1 allocate
    IF(rank == i-1) THEN
      nRecv(i)=0
      nTransmit=0
      DO j=1,SIZE(nRecv)
        IF (nRecv(j) > 0) THEN
          nTransmit=nTransmit+nRecv(j)
        ENDIF
      ENDDO
      ALLOCATE(thisMatrix%incIdxMap(nTransmit))
      ALLOCATE(thisMatrix%incIdxStt(nProc))
      ALLOCATE(thisMatrix%incIdxStp(nProc))
      thisMatrix%incIdxStt=0
      thisMatrix%incIdxStp=0
      ! Use nTransmit as counter
      nTransmit=1
      DO j=1,nProc
        IF(j /= i) THEN
          IF(nBand(j) > 0) THEN
            ALLOCATE(thisMatrix%bandSizes(j)%p(nBand(j)))
            CALL MPI_Recv(thisMatrix%bandSizes(j)%p(1),nBand(j),MPI_INTEGER, &
                j-1, 0, thisMatrix%comm,MPI_STATUS_IGNORE, mpierr)
        IF(j == i) THEN
          thisMatrix%incIdxStt(j)=0
          thisMatrix%incIdxStp(j)=0
        ELSE
            IF(nBand(j) /= 0) THEN
              ALLOCATE(thisMatrix%bandSizes(j)%p(1))
              thisMatrix%bandSizes(j)%p(1)=-1
            ENDIF
          IF(nRecv(j) > 0) THEN
            thisMatrix%incIdxStt(j)=nTransmit
            thisMatrix%incIdxStp(j)=nTransmit+nRecv(j)-1
            ! We receive data (and don't want to bother sending idx's at mvmult
            ! time)
            CALL MPI_Recv(thisMatrix%incIdxMap(nTransmit:nTransmit+nRecv(j)-1), &
                nRecv(j),MPI_INTEGER,j-1,0,thisMatrix%comm,MPI_STATUS_IGNORE, mpierr)
            nTransmit=nTransmit+nRecv(j)
          ELSE IF (nRecv(j) == -1) THEN
            thisMatrix%incIdxStt(j)=-1
            thisMatrix%incIdxStp(j)=-1
          ENDIF
        ENDIF
      ENDDO
    ELSE
      IF(nBandLocal > 0) THEN
        ALLOCATE(bandSizeTmp(nBandLocal))
        DO j=1,nBandLocal
          bandSizeTmp(j)=SIZE(thisMatrix%chunks(i)%bands(j)%elem)
      ! Build the index map
      IF(nTransmit > 0) THEN
        ALLOCATE(idxMapTmp(nTransmit))
        stt=1
        DO j=1,SIZE(thisMatrix%chunks(i)%bandIdx)
          stp=stt+SIZE(thisMatrix%chunks(i)%bands(j)%jIdx)-1
          idxMapTmp(stt:stp)=thisMatrix%chunks(i)%bands(j)%jIdx-thisMatrix%chunks(i)%bandIdx(j)
          stt=stp+1
        ENDDO
        CALL MPI_Send(bandSizeTmp(:),nBandLocal,MPI_INTEGER,i-1,0, &
        CALL MPI_Send(idxMapTmp,nTransmit,MPI_INTEGER,i-1,0, &
            thisMatrix%comm,mpierr)
        DEALLOCATE(bandSizeTmp)
        DEALLOCATE(idxMapTmp)
      ENDIF
    ENDIF
  ENDDO
@@ -1039,12 +1082,9 @@ SUBROUTINE clear_DistributedBandedMatrixType(matrix)
    ENDDO
    DEALLOCATE(matrix%chunks)
  ENDIF
  IF(ALLOCATED(matrix%bandSizes)) THEN
    DO i=1,SIZE(matrix%bandSizes)
      IF(ALLOCATED(matrix%bandSizes(i)%p)) DEALLOCATE(matrix%bandSizes(i)%p)
    ENDDO
    DEALLOCATE(matrix%bandSizes)
  ENDIF
  IF(ALLOCATED(matrix%incIdxMap)) DEALLOCATE(matrix%incIdxMap)
  IF(ALLOCATED(matrix%incIdxStt)) DEALLOCATE(matrix%incIdxStt)
  IF(ALLOCATED(matrix%incIdxStp)) DEALLOCATE(matrix%incIdxStp)
  IF(ALLOCATED(matrix%iOffsets)) DEALLOCATE(matrix%iOffsets)
  IF(ALLOCATED(matrix%jOffsets)) DEALLOCATE(matrix%jOffsets)
  IF(ALLOCATED(matrix%iTmp)) DEALLOCATE(matrix%iTmp)
+1 −1
Original line number Diff line number Diff line
@@ -758,7 +758,7 @@ SUBROUTINE init_DistributedSOR_PreCondtype(thisPC,A,params)

  thisPC%A => A
  !get omega from the parameter list
  thisPC%omega = 0.0_SRK
  thisPC%omega = 1.0_SRK
  IF (params%has('PreCondType->omega')) &
    CALL params%get('PreCondType->omega',thisPC%omega)

+6 −0
Original line number Diff line number Diff line
@@ -420,6 +420,9 @@ SUBROUTINE VectorResembleAllocScal(dest, source, inparams)
    ENDIF
#ifdef HAVE_MPI
  TYPE IS(NativeDistributedVectorType)
    IF(.NOT. params%has("VectorType->nlocal")) THEN
      CALL params%add("VectorType->nlocal",SIZE(source%b))
    ENDIF
    IF(.NOT. params%has("VectorType->chunkSize")) THEN
      CALL params%add("VectorType->chunkSize",source%chunkSize)
    ENDIF
@@ -497,6 +500,9 @@ SUBROUTINE VectorResembleAllocArray(dest, source, nvec, inparams)
    ENDIF
#ifdef HAVE_MPI
  TYPE IS(NativeDistributedVectorType)
    IF(.NOT. params%has("VectorType->nlocal")) THEN
      CALL params%add("VectorType->nlocal",SIZE(source%b))
    ENDIF
    IF(.NOT. params%has("VectorType->chunkSize")) THEN
      CALL params%add("VectorType->chunkSize",source%chunkSize)
    ENDIF
+34 −34

File changed.

Preview size limit exceeded, changes collapsed.