Commit 4d930470 authored by Nichols, Stephen's avatar Nichols, Stephen
Browse files

On branch gcc_openmp

 Changes to be committed:
	new file:   Makefile
	modified:   Makefile_summit
	modified:   README.md
	new file:   RUNTIME_FILES/batch_spock.sh
	deleted:    RUNTIME_FILES/matrix_ops.f90
	modified:   RUNTIME_FILES/summit_cpu.lsf
	new file:   SCRIPTS/build_spock.sh
	modified:   SCRIPTS/build_summit.sh
	new file:   SCRIPTS/setUpModules_spock.sh
	modified:   SCRIPTS/setUpModules_summit.sh
	modified:   custom_linsolve.f90
	modified:   my_hipinterfaces.f90
	modified:   nvtx.f90
	modified:   timing_functions.f90

Changes to build with GGC/v11 that supports OpenMP.
Changes for more thorough timings.
parent baf456c9
Loading
Loading
Loading
Loading

Makefile

0 → 100644
+129 −0
Original line number Diff line number Diff line
SHELL=/bin/bash

#MACHINE = Summit
### the Spock compiler flags also work on Crusher (with exception that no GCC with OpenMP Offloading on Crusher yet)
MACHINE = Spock

#COMPILER = Native
COMPILER = GCC

#CPU_or_GPU = CPU
CPU_or_GPU = GPU

#### Summit
ifeq ($(MACHINE),Summit)
   ifeq ($(COMPILER),Native)
      ifeq ($(CPU_or_GPU),CPU)
         ## CPU-Only 
         ## enable OpenMP without defining USE_OPENMP_OFFLOAD to use the omp_get_wtime 
         ## function and this makes me use the xlf_r compiler instead of xlf
         FC      = xlf_r
         FFLAGS = -O3 -qsmp=omp -qfree -qpreprocess -qtune=pwr9 -qarch=pwr9 -W@,"-v"
         LDFLAGS = -qsmp=omp
      else
         ## GPUs with OpenMP Offloading
         FC      = xlcuf
         FFLAGS = -O3 -qfree -qpreprocess -qsmp=omp -qoffload -qcuda -qtune=pwr9 -qarch=pwr9 -qtgtarch=sm_70 -W@,"-v"
         FFLAGS += -DUSE_OPENMP_OFFLOAD -DUSE_CUDA
         LDFLAGS = -qsmp=omp -qoffload -L${OLCF_CUDA_ROOT}/lib64 -lnvToolsExt -lcudart
      endif
   endif

   ifeq ($(COMPILER),GCC)
      ifeq ($(CPU_or_GPU),GPU)
         FC      = gfortran
         FFLAGS = -O3 -ffree-form -cpp -fopenmp -foffload="-lm -latomic"
         FFLAGS += -DUSE_OPENMP_OFFLOAD -DUSE_CUDA
         LDFLAGS = -fopenmp -foffload="-lm -latomic" -L${OLCF_CUDA_ROOT}/lib64 -lnvToolsExt -lcudart
      else
         FC      = gfortran
         FFLAGS = -O3 -ffree-form -cpp
         LDFLAGS =
      endif
   endif
endif

#### Spock and Crusher (GCC option not available on Crusher yet)
ifeq ($(MACHINE),Spock)
   ROCM_LIB_PATH=${OLCF_ROCM_ROOT}/lib
   ROCM_INC_PATH=${OLCF_ROCM_ROOT}/include

   ifeq ($(COMPILER),Native)
      FC = ftn

      ifeq ($(CPU_or_GPU),GPU)
         ### GPU
         FFLAGS = -N1023 -O3 -homp -em -ef -eT -ffree -DUSE_OPENMP_OFFLOAD -DUSE_AMD 
         ##FFLAGS += -I$(ROCM_INC_PATH)
         LDFLAGS = -homp -L$(ROCM_LIB_PATH) -lamdhip64 -lhsa-runtime64
      else
         ### CPU-only without OpenMP
         #FFLAGS = -N1023 -O3 -hnoomp -em -ef -eT -ffree
         #LDFLAGS = -hnoomp

         ### CPU-only with OpenMP flag ... no cpu-based openmp is in the code, but I 
         ### want to use omp_get_wtime function ... don't define USE_OPENMP_OFFLOAD
         FFLAGS = -N1023 -O3 -homp -em -ef -eT -ffree
         LDFLAGS = -homp
      endif
   endif

   ifeq ($(COMPILER),GCC)
      ifeq ($(CPU_or_GPU),GPU)
         FC = gfortran
         FFLAGS = -Ofast -ffree-form -cpp -fopenmp -foffload="-march=gfx908 -lm"
         FFLAGS += -DUSE_OPENMP_OFFLOAD -DUSE_AMD
         LDFLAGS = -fopenmp -foffload="-march=gfx908 -lm"
         #LDFLAGS += -L$(ROCM_LIB_PATH) -lamdhip64 -lhsa-runtime64
         LDFLAGS += -L$(ROCM_LIB_PATH) -lamdhip64
      else
         ### CPU-only without OpenMP ... uses gfortran timer
         #FC = gfortran
         #FFLAGS = -O3 -ffree-form -ffree-line-length-none -cpp
         #LDFLAGS =

         ### CPU-only with OpenMP flag ... no cpu-based openmp is in the code, but I 
         ### want to use omp_get_wtime function ... don't define USE_OPENMP_OFFLOAD
         FC = gfortran
         FFLAGS = -O3 -fopenmp -ffree-form -ffree-line-length-none -cpp
         LDFLAGS = -fopenmp
      endif
   endif
endif

F_SRC = custom_linsolve.f90

MODS =

ifneq (,$(findstring USE_CUDA,$(FFLAGS)))
   MODS += my_cudainterfaces.f90
   MODS += nvtx.f90
endif
ifneq (,$(findstring USE_AMD,$(FFLAGS)))
   MODS += my_hipinterfaces.f90
endif

MODS += global_parameters.f90 timing_functions.f90 matrix_ops.f90

OBJECTS = $(MODS:.f90=.o) $(F_SRC:.f90=.o)
EXE = custom_linsolve.x

# rules for makefile
#$(FC) -o $@ $^ $(LDFLAGS)
$(EXE): $(OBJECTS)
	$(FC) $(LDFLAGS) -o $@ $^

%.o:%.f90
	$(FC) $(FFLAGS) -c -o $@ $<

default: all

all: $(EXE)

clean:
	rm -f *.o *.mod *.x *.s

clobber:
	rm -f *.o *.mod *.x *.s *~

+82 −29
Original line number Diff line number Diff line
SHELL=/bin/bash

MACHINE = Summit
### the Spock compiler flags also work on Crusher
#MACHINE = Spock
#MACHINE = Summit
### the Spock compiler flags also work on Crusher (with exception that no GCC with OpenMP Offloading on Crusher yet)
MACHINE = Spock

#COMPILER = Native
COMPILER = GCC

#CPU_or_GPU = CPU
CPU_or_GPU = GPU

#### Summit
ifeq ($(MACHINE),Summit)
   ifeq ($(COMPILER),Native)
      ifeq ($(CPU_or_GPU),CPU)
         ## CPU-Only 
   #FC      = xlf
   #FFLAGS = -O3 -qfree -qpreprocess -qtune=pwr9 -qarch=pwr9 -qtgtarch=sm_70 -W@,"-v"
   #LDFLAGS =

         ## enable OpenMP without defining USE_OPENMP_OFFLOAD to use the omp_get_wtime 
         ## function and this makes me use the xlf_r compiler instead of xlf
         FC      = xlf_r
         FFLAGS = -O3 -qsmp=omp -qfree -qpreprocess -qtune=pwr9 -qarch=pwr9 -W@,"-v"
         LDFLAGS = -qsmp=omp
      else
         ## GPUs with OpenMP Offloading
         FC      = xlcuf
         FFLAGS = -O3 -qfree -qpreprocess -qsmp=omp -qoffload -qcuda -qtune=pwr9 -qarch=pwr9 -qtgtarch=sm_70 -W@,"-v"
         FFLAGS += -DUSE_OPENMP_OFFLOAD -DUSE_CUDA
         LDFLAGS = -qsmp=omp -qoffload -L${OLCF_CUDA_ROOT}/lib64 -lnvToolsExt -lcudart
      endif
   endif

   ifeq ($(COMPILER),GCC)
      ifeq ($(CPU_or_GPU),GPU)
         FC      = gfortran
         FFLAGS = -O3 -ffree-form -cpp -fopenmp -foffload="-lm -latomic"
         FFLAGS += -DUSE_OPENMP_OFFLOAD -DUSE_CUDA
         LDFLAGS = -fopenmp -foffload="-lm -latomic" -L${OLCF_CUDA_ROOT}/lib64 -lnvToolsExt -lcudart
      else
         FC      = gfortran
         FFLAGS = -O3 -ffree-form -cpp
         LDFLAGS =
      endif
   endif
endif

#### Spock and Crusher
#### Spock and Crusher (GCC option not available on Crusher yet)
ifeq ($(MACHINE),Spock)
   ROCM_LIB_PATH=${OLCF_ROCM_ROOT}/lib
   ROCM_INC_PATH=${OLCF_ROCM_ROOT}/include

   ifeq ($(COMPILER),Native)
      FC = ftn

      ifeq ($(CPU_or_GPU),GPU)
         ### GPU
         FFLAGS = -N1023 -O3 -homp -em -ef -eT -ffree -DUSE_OPENMP_OFFLOAD -DUSE_AMD 
         ##FFLAGS += -I$(ROCM_INC_PATH)
         LDFLAGS = -homp -L$(ROCM_LIB_PATH) -lamdhip64 -lhsa-runtime64

      else
         ### CPU-only without OpenMP
         #FFLAGS = -N1023 -O3 -hnoomp -em -ef -eT -ffree
         #LDFLAGS = -hnoomp

   ### CPU-only with OpenMP flag ... no cpu-based openmp is in the code
   #FFLAGS = -N1023 -O3 -homp -em -ef -eT -ffree
   #LDFLAGS = -homp
         ### CPU-only with OpenMP flag ... no cpu-based openmp is in the code, but I 
         ### want to use omp_get_wtime function ... don't define USE_OPENMP_OFFLOAD
         FFLAGS = -N1023 -O3 -homp -em -ef -eT -ffree
         LDFLAGS = -homp
      endif
   endif

   ifeq ($(COMPILER),GCC)
      ifeq ($(CPU_or_GPU),GPU)
         FC = gfortran
         FFLAGS = -O3 -ffree-form -cpp -fopenmp -foffload="-march=gfx908 -lm"
         FFLAGS += -DUSE_OPENMP_OFFLOAD -DUSE_AMD
         LDFLAGS = -fopenmp -foffload="-march=gfx908 -lm"
         #LDFLAGS += -L$(ROCM_LIB_PATH) -lamdhip64 -lhsa-runtime64
         LDFLAGS += -L$(ROCM_LIB_PATH) -lamdhip64
      else
         ### CPU-only without OpenMP ... uses gfortran timer
         #FC = gfortran
         #FFLAGS = -O3 -ffree-form -ffree-line-length-none -cpp
         #LDFLAGS =

         ### CPU-only with OpenMP flag ... no cpu-based openmp is in the code, but I 
         ### want to use omp_get_wtime function ... don't define USE_OPENMP_OFFLOAD
         FC = gfortran
         FFLAGS = -O3 -fopenmp -ffree-form -ffree-line-length-none -cpp
         LDFLAGS = -fopenmp
      endif
   endif
endif

F_SRC = custom_linsolve.f90
@@ -57,8 +109,9 @@ OBJECTS = $(MODS:.f90=.o) $(F_SRC:.f90=.o)
EXE = custom_linsolve.x

# rules for makefile
#$(FC) -o $@ $^ $(LDFLAGS)
$(EXE): $(OBJECTS)
	$(FC) -o $@ $^ $(LDFLAGS)
	$(FC) $(LDFLAGS) -o $@ $^

%.o:%.f90
	$(FC) $(FFLAGS) -c -o $@ $<
+5 −2
Original line number Diff line number Diff line
@@ -5,8 +5,11 @@ Demonstration code the uses OpenMP to offload linear solver routines to Nvidia a

SCRIPTS/setUpModules_summit.sh : loads necessary modules
SCRIPTS/build_summit.sh : sources setUpModules_summit.sh and builds the code
SCRIPTS/setUpModules_spock.sh : loads necessary modules
SCRIPTS/build_spock.sh : sources setUpModules_spock.sh and builds the code

RUNTIME_FILES contains submission scripts for Summit, an input file, and 6x6 matrix.
RUNTIME_FILES contains submission scripts for Summit and Spock, an input file, and 6x6 matrix.

All files were pulled from existing codes for testing except for nvtx.f90 which comes from https://github.com/maxcuda/NVTX_example.git
All files were pulled from existing codes for testing except for nvtx.f90 which comes from 
https://github.com/maxcuda/NVTX_example.git. The nvtx.f90 was modified slightly for GNU/v11.
+24 −0
Original line number Diff line number Diff line
#!/bin/bash
#SBATCH -A CSC434_SPOCK
##SBATCH -A STF006_CRUSHER
#SBATCH -J linsolve_test
#SBATCH -o gpu_run.log
#SBATCH -e stderr
#SBATCH -t 00:05:00
#SBATCH -p batch
#SBATCH -N 1


source setUpModules_spock.sh
module list

export LD_LIBRARY_PATH=${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH}

export OMP_NUM_THREADS=1
export HIP_LAUNCH_BLOCKING=1

exec=./custom_linsolve.x


srun -N1 -n1 -c16 --ntasks-per-gpu=1 --gpu-bind=closest --export=ALL $exec

RUNTIME_FILES/matrix_ops.f90

deleted100644 → 0
+0 −431
Original line number Diff line number Diff line
      module matrix_operations
      use omp_lib
      use global_parameters
      implicit none

      contains
! =========================================================================
        subroutine matrix_vect_mult(neqn,aa,bb,cc)
        implicit none

#ifdef USE_OPENMP_OFFLOAD
        !$omp declare target
#endif

        integer, intent(in) :: neqn
        real*8, intent(in) :: aa(neqn,neqn),bb(neqn)
        real*8, intent(out) :: cc(neqn)

        integer :: i,j,k
        real*8 :: dummy

        do i = 1,neqn
            dummy = aa(i,1)*bb(1)
            do k = 2,neqn
                dummy = dummy + aa(i,k)*bb(k)
            enddo
            cc(i) = dummy
        enddo

        return
        end subroutine matrix_vect_mult
! =========================================================================
        subroutine matrix_matrix_mult(neqn,aa,bb,cc)
        implicit none

#ifdef USE_OPENMP_OFFLOAD
        !$omp declare target
#endif

        integer, intent(in) :: neqn
        real*8, intent(in) :: aa(neqn,neqn),bb(neqn,neqn)
        real*8, intent(out) :: cc(neqn,neqn)

        integer :: i,j,k
        real*8 :: dummy

        do i = 1,neqn
            do j = 1,neqn
                dummy = aa(i,1)*bb(1,j)
                do k = 2,neqn
                    dummy = dummy + aa(i,k)*bb(k,j)
                enddo
                cc(i,j) = dummy
            enddo
        enddo

        return
        end subroutine matrix_matrix_mult
! =========================================================================
        subroutine small_value(neqn,aa)
        implicit none

#ifdef USE_OPENMP_OFFLOAD
        !$omp declare target
#endif

        integer, intent(in) :: neqn
        real*8, intent(inout) :: aa(neqn,neqn)

        integer :: i,j

        do j = 1,neqn
            do i = 1,neqn
                if (dabs(aa(i,j)) .lt. 1.0e-15) then
                    aa(i,j) = 0.d0
                endif
            enddo
        enddo

        return
        end subroutine small_value
! =========================================================================
      subroutine ludecomp(neqn,ppivot,aa,matrix_info)
      implicit none

#ifdef USE_OPENMP_OFFLOAD
      !$omp declare target
#endif

      ! it uses gaussian elimination with row pivoting to reduce
      ! a (neqn x neqn) matrix to an upper triangular form.
      ! NOTE: also known as LU decompositon

      integer, intent(in) :: neqn
      integer, intent(inout) :: ppivot(neqn)
      real *8, intent(inout) :: aa(neqn,neqn)
      integer, intent(inout) :: matrix_info
     
      real*8 :: mult,apm
      integer :: l,m,row,col,pivot_loc,xtra
      integer :: pl, pm
      real*8 :: diag,maxdiag,absdiag

      ! initialize values
      do m = 1,neqn
          ppivot(m) = m
      end do

      do m = 1,neqn-1
          
          row = m
          maxdiag = dabs(aa(ppivot(row),m))

          do l = m+1,neqn
              if (dabs(aa(ppivot(l),m)) .gt. maxdiag) then
                  row = l
                  maxdiag = dabs(aa(ppivot(row),m))
              endif
          end do

          pivot_loc = ppivot(row)

          ! pivot the rows in-place (ie. re-map the rows instead
          !                          of physically exchanging rows)
          if (pivot_loc .ne. ppivot(m)) then
              xtra = ppivot(m)
              ppivot(m) = pivot_loc
              ppivot(row) = xtra
          endif

          pm = ppivot(m)
          absdiag = dabs(aa(pm,m))

          ! Check for singularity ...
          ! Instead of returning a error if absdiag <= 1.0e-16 like I would on the CPU,
          ! set diag so that everything will work and let the error check identify the 
          ! problem after the lu decomp is completed.
          ! NOTE: OpenMP has difficulty building GPU code with logic to return early
          if (absdiag .gt. 1.d-16) then
             diag = aa(pm,m)
          else
             diag = 1.d+16
          endif

          ! find and store multipliers for the column
          do row = m+1,neqn
              pl = ppivot(row)
              if (aa(pl,m) .ne. 0.d0) then   !! no need to mult by 0.0
                  mult = aa(pl,m)/diag  
                  aa(pl,m) = mult
                  !aa(pl,m) = aa(pl,m)/diag
              endif
          end do

          ! reduce the rows in a column-oriented fashion
          do col = m+1,neqn
              if (aa(pm,col) .ne. 0.d0) then !! no need to add 0.0
                  apm = aa(pm,col)
                  do row = m+1,neqn
                      pl = ppivot(row)
                      if (aa(pl,m) .ne. 0.d0) then !! no need to mult by 0.0
                          mult = aa(pl,m)
                          aa(pl,col) = aa(pl,col) - mult*apm
                      endif
                  end do
              endif
          end do

      end do !end of the lu decomp loop

      ! check the matrix for problems
      call check_matrix(neqn,ppivot,aa,matrix_info)

      return
      end subroutine ludecomp

! =========================================================================o

      subroutine check_matrix(neqn,ppivot,aa,matrix_info)
      implicit none

#ifdef USE_OPENMP_OFFLOAD
      !$omp declare target
#endif

      integer, intent(in) ::  neqn
      integer, intent(inout) :: ppivot(neqn)
      real *8, intent(inout) :: aa(neqn,neqn)
      integer, intent(inout) :: matrix_info

      integer :: l
      real *8 :: ratio
      real *8 :: det
      integer :: pm
      real *8 :: diag,absdiag,min_val,max_val

      det = 1.d0
      max_val = 0.d0
      min_val = 1.d+16

      do l = 1,neqn

         pm = ppivot(l)
         diag = aa(pm,l)
         absdiag = dabs(diag)
         det = det*(absdiag)
         max_val = max(absdiag,max_val)
         min_val = min(absdiag,min_val)

         ! check for singularity at every location
         if (absdiag .le. 1.d-16) then
              !write(*,*) 'c matrix is not invertible on target'
              !write(*,*) 'diagonal entry <= 1.0e-16'
              !write(*,*) 'halting program'
              !write(*,*) ' '
              matrix_info = 1
         endif

      enddo

      if (matrix_info .eq. 0) then
         ratio = min_val/max_val

         if ((max_val .le. 1.d-16) .and. (min_val .le. 1.d-16)) then
             ! all diagonal values <= 1.d-16
             ! NOTE: matrix needs better scaling
             !write(*,*) 'all diagonal values <= 1.0e-16 on target'
             !write(*,*) 'abs(det) = ',det
             !write(*,*) 'needs better scaling'
             !write(*,*) 'halting program'
             !write(*,*) ' '
             matrix_info = 3
         elseif (ratio .le. 1.d-15) then
             ! check for poor scaling between the diagonal values
             ! NOTE: leads to poor conditioning
             !write(*,*) 'c matrix is ill-conditioned on target'
             !write(*,*) 'ratio = abs((min_diag)/(max_diag)) = ',ratio
             !write(*,*) 'abs(det) = ',det
             !write(*,*) 'halting program'
             !write(*,*) ' '
             matrix_info = 2
         endif

      endif

      return
      end subroutine check_matrix

! =========================================================================
      subroutine fwd_bk_sub(neqn,nn,i,j,k)
      implicit none

#ifdef USE_OPENMP_OFFLOAD
      !$omp declare target
#endif

      integer, intent(in) :: neqn,nn,i,j,k
      real*8 :: zpl,zpm
      integer :: l,m
      integer :: pl,pm

      ! perform the same eliminations on Z that were
      ! completed on A (this is the quasi-forward substitution pass)
      do m = nn,neqn-1
          pm = pivot(m,i,j,k)
          if (z(pm,i,j,k) .ne. 0.d0) then    !! no need to mult by 0.0
              zpm = z(pm,i,j,k)
              do l = m+1,neqn
                  pl = pivot(l,i,j,k)
                  if (a(pl,m,i,j,k) .ne. 0.d0) then !! no need to add 0.0
                      z(pl,i,j,k) = z(pl,i,j,k) - a(pl,m,i,j,k)*zpm
                  endif
              end do
          endif
      end do

      ! solves the upper triangular system using column
      ! oriented back substitution
      do l = neqn,1,-1
          pl = pivot(l,i,j,k)
          if (z(pl,i,j,k) .ne. 0.d0) then     !! no need to mult by 0.0
              z(pl,i,j,k) = z(pl,i,j,k)/a(pl,l,i,j,k)
              zpl = z(pl,i,j,k)
              do m = l-1,1,-1
                  pm = pivot(m,i,j,k)
                  if (a(pm,l,i,j,k) .ne. 0.d0) then !! no need to add 0.0
                      z(pm,i,j,k) = z(pm,i,j,k) - a(pm,l,i,j,k)*zpl
                  endif
              end do
          endif
      end do

      return
      end subroutine fwd_bk_sub

! =========================================================================
      subroutine find_inverse(neqn,i,j,k)
      implicit none

#ifdef USE_OPENMP_OFFLOAD
      !$omp declare target
#endif

      integer, intent(in) :: neqn,i,j,k
      integer :: l,m
      integer :: pm

      ! using the LU decomp in matrix a, find and return the inverse of a
      ! NOTE: the a matrix is pivoted during the LU decomp. the pivoting is
      !       is used during the fwd/back substitution. once the inverse
      !       is found, the pivoting is reversed before returning to the  
      !       calling function

      ! NOTE: Finds the inverse of A directly

      do l = 1,neqn
          ! initialize z with the appropriate column of the Identity matrix
          do m = 1, neqn
              z(m,i,j,k) = 0.d0
          enddo
          z(pivot(l,i,j,k),i,j,k) = 1.d0

          ! solve for the appropriate column of the inverse of a
          ! NOTE: uses the pivoted a matrix
          ! NOTE2: can send in "l" iff we're initializing z with 
          !        a column of the Identity matrix ... otherwise
          !        we must send in ONE : "1" instead of "l"
          call fwd_bk_sub(neqn,l,i,j,k)
          
          ! store the column in a
          ! NOTE: pivoting is reversed so that the matrix returned in a
          !       is truly the inverse of the original a
          ! NOTE2: also need to pivot the column with this algorithm
          do m = 1,neqn
              pm = pivot(m,i,j,k)
              ao(m,pivot(l,i,j,k),i,j,k) = z(pm,i,j,k)
          enddo
      enddo

      do m = 1,neqn
         do l = 1,neqn
            a(l,m,i,j,k) = ao(l,m,i,j,k)
         enddo
      enddo

      return
      end subroutine find_inverse

! =========================================================================

      subroutine matrix_invert(neqn,i,j,k,matrix_info)
      implicit none

#ifdef USE_OPENMP_OFFLOAD
      !$omp declare target
#endif

      integer, intent(in) ::  i,j,k,neqn
      integer, intent(inout) :: matrix_info

      ! routines for inverting the C matrix using custom library

      call ludecomp(neqn,pivot(1,i,j,k),a(1,1,i,j,k),matrix_info)

      if (matrix_info .gt. 0) then
         !! bail if there is problem with the matrix
         return
      else
         !! find the inverse if the matrix passes the checks
         call find_inverse(neqn,i,j,k)
      endif

      return
      end subroutine matrix_invert

! ====================================================================

  subroutine solve(neqn,i,j,k,matrix_info)
  implicit none

#ifdef USE_OPENMP_OFFLOAD
  !$omp declare target
#endif

  integer, intent(in) ::  neqn,i,j,k
  integer, intent(inout) :: matrix_info

  real*8 :: z_tmp
  integer :: l,m,l_tmp

  ! routines for inverting the C matrix using custom library

  ! lu decomp of matrix c
  call ludecomp(neqn,pivot(1,i,j,k),a(1,1,i,j,k),matrix_info)

  if (matrix_info .gt. 0) then
     !! bail if there is problem with the matrix
     return
  else
     call fwd_bk_sub(neqn,1,i,j,k)

     ! reverse the pivoting without an additional array
     do l = 1,neqn-1
        if (l .ne. pivot(l,i,j,k)) then

           ! find the row to switch
           l_tmp = 0
           do m = l+1, neqn
              if (pivot(m,i,j,k) .eq. l) l_tmp = m
           end do

           ! switch rows in solution vector
           z_tmp = z(pivot(l,i,j,k),i,j,k)
           z(pivot(l,i,j,k),i,j,k) = z(pivot(l_tmp,i,j,k),i,j,k)
           z(pivot(l_tmp,i,j,k),i,j,k) = z_tmp

           ! update pivot vector
           pivot(l_tmp,i,j,k) = pivot(l,i,j,k)
           pivot(l,i,j,k) = l
        end if
     end do

  endif

  return
  end subroutine solve

! ====================================================================

      end module Matrix_Operations
Loading