On branch gcc_openmp (4d930470) · Commits · OpenMP-Offloading / Linear Solver With OpenMP And NVTX

Makefile

0 → 100644

+129 −0

Original line number	Diff line number	Diff line
		SHELL=/bin/bash

		#MACHINE = Summit
		### the Spock compiler flags also work on Crusher (with exception that no GCC with OpenMP Offloading on Crusher yet)
		MACHINE = Spock

		#COMPILER = Native
		COMPILER = GCC

		#CPU_or_GPU = CPU
		CPU_or_GPU = GPU

		#### Summit
		ifeq ($(MACHINE),Summit)
		ifeq ($(COMPILER),Native)
		ifeq ($(CPU_or_GPU),CPU)
		## CPU-Only
		## enable OpenMP without defining USE_OPENMP_OFFLOAD to use the omp_get_wtime
		## function and this makes me use the xlf_r compiler instead of xlf
		FC = xlf_r
		FFLAGS = -O3 -qsmp=omp -qfree -qpreprocess -qtune=pwr9 -qarch=pwr9 -W@,"-v"
		LDFLAGS = -qsmp=omp
		else
		## GPUs with OpenMP Offloading
		FC = xlcuf
		FFLAGS = -O3 -qfree -qpreprocess -qsmp=omp -qoffload -qcuda -qtune=pwr9 -qarch=pwr9 -qtgtarch=sm_70 -W@,"-v"
		FFLAGS += -DUSE_OPENMP_OFFLOAD -DUSE_CUDA
		LDFLAGS = -qsmp=omp -qoffload -L${OLCF_CUDA_ROOT}/lib64 -lnvToolsExt -lcudart
		endif
		endif

		ifeq ($(COMPILER),GCC)
		ifeq ($(CPU_or_GPU),GPU)
		FC = gfortran
		FFLAGS = -O3 -ffree-form -cpp -fopenmp -foffload="-lm -latomic"
		FFLAGS += -DUSE_OPENMP_OFFLOAD -DUSE_CUDA
		LDFLAGS = -fopenmp -foffload="-lm -latomic" -L${OLCF_CUDA_ROOT}/lib64 -lnvToolsExt -lcudart
		else
		FC = gfortran
		FFLAGS = -O3 -ffree-form -cpp
		LDFLAGS =
		endif
		endif
		endif

		#### Spock and Crusher (GCC option not available on Crusher yet)
		ifeq ($(MACHINE),Spock)
		ROCM_LIB_PATH=${OLCF_ROCM_ROOT}/lib
		ROCM_INC_PATH=${OLCF_ROCM_ROOT}/include

		ifeq ($(COMPILER),Native)
		FC = ftn

		ifeq ($(CPU_or_GPU),GPU)
		### GPU
		FFLAGS = -N1023 -O3 -homp -em -ef -eT -ffree -DUSE_OPENMP_OFFLOAD -DUSE_AMD
		##FFLAGS += -I$(ROCM_INC_PATH)
		LDFLAGS = -homp -L$(ROCM_LIB_PATH) -lamdhip64 -lhsa-runtime64
		else
		### CPU-only without OpenMP
		#FFLAGS = -N1023 -O3 -hnoomp -em -ef -eT -ffree
		#LDFLAGS = -hnoomp

		### CPU-only with OpenMP flag ... no cpu-based openmp is in the code, but I
		### want to use omp_get_wtime function ... don't define USE_OPENMP_OFFLOAD
		FFLAGS = -N1023 -O3 -homp -em -ef -eT -ffree
		LDFLAGS = -homp
		endif
		endif

		ifeq ($(COMPILER),GCC)
		ifeq ($(CPU_or_GPU),GPU)
		FC = gfortran
		FFLAGS = -Ofast -ffree-form -cpp -fopenmp -foffload="-march=gfx908 -lm"
		FFLAGS += -DUSE_OPENMP_OFFLOAD -DUSE_AMD
		LDFLAGS = -fopenmp -foffload="-march=gfx908 -lm"
		#LDFLAGS += -L$(ROCM_LIB_PATH) -lamdhip64 -lhsa-runtime64
		LDFLAGS += -L$(ROCM_LIB_PATH) -lamdhip64
		else
		### CPU-only without OpenMP ... uses gfortran timer
		#FC = gfortran
		#FFLAGS = -O3 -ffree-form -ffree-line-length-none -cpp
		#LDFLAGS =

		### CPU-only with OpenMP flag ... no cpu-based openmp is in the code, but I
		### want to use omp_get_wtime function ... don't define USE_OPENMP_OFFLOAD
		FC = gfortran
		FFLAGS = -O3 -fopenmp -ffree-form -ffree-line-length-none -cpp
		LDFLAGS = -fopenmp
		endif
		endif
		endif

		F_SRC = custom_linsolve.f90

		MODS =

		ifneq (,$(findstring USE_CUDA,$(FFLAGS)))
		MODS += my_cudainterfaces.f90
		MODS += nvtx.f90
		endif
		ifneq (,$(findstring USE_AMD,$(FFLAGS)))
		MODS += my_hipinterfaces.f90
		endif

		MODS += global_parameters.f90 timing_functions.f90 matrix_ops.f90

		OBJECTS = $(MODS:.f90=.o) $(F_SRC:.f90=.o)
		EXE = custom_linsolve.x

		# rules for makefile
		#$(FC) -o $@ $^ $(LDFLAGS)
		$(EXE): $(OBJECTS)
		$(FC) $(LDFLAGS) -o $@ $^

		%.o:%.f90
		$(FC) $(FFLAGS) -c -o $@ $<

		default: all

		all: $(EXE)

		clean:
		rm -f .o .mod .x .s

		clobber:
		rm -f .o .mod .x .s *~

Makefile_summit

+82 −29

Original line number	Diff line number	Diff line
		SHELL=/bin/bash

		MACHINE = Summit
		### the Spock compiler flags also work on Crusher
		#MACHINE = Spock
		#MACHINE = Summit
		### the Spock compiler flags also work on Crusher (with exception that no GCC with OpenMP Offloading on Crusher yet)
		MACHINE = Spock

		#COMPILER = Native
		COMPILER = GCC

		#CPU_or_GPU = CPU
		CPU_or_GPU = GPU

		#### Summit
		ifeq ($(MACHINE),Summit)
		ifeq ($(COMPILER),Native)
		ifeq ($(CPU_or_GPU),CPU)
		## CPU-Only
		#FC = xlf
		#FFLAGS = -O3 -qfree -qpreprocess -qtune=pwr9 -qarch=pwr9 -qtgtarch=sm_70 -W@,"-v"
		#LDFLAGS =

		## enable OpenMP without defining USE_OPENMP_OFFLOAD to use the omp_get_wtime
		## function and this makes me use the xlf_r compiler instead of xlf
		FC = xlf_r
		FFLAGS = -O3 -qsmp=omp -qfree -qpreprocess -qtune=pwr9 -qarch=pwr9 -W@,"-v"
		LDFLAGS = -qsmp=omp
		else
		## GPUs with OpenMP Offloading
		FC = xlcuf
		FFLAGS = -O3 -qfree -qpreprocess -qsmp=omp -qoffload -qcuda -qtune=pwr9 -qarch=pwr9 -qtgtarch=sm_70 -W@,"-v"
		FFLAGS += -DUSE_OPENMP_OFFLOAD -DUSE_CUDA
		LDFLAGS = -qsmp=omp -qoffload -L${OLCF_CUDA_ROOT}/lib64 -lnvToolsExt -lcudart
		endif
		endif

		ifeq ($(COMPILER),GCC)
		ifeq ($(CPU_or_GPU),GPU)
		FC = gfortran
		FFLAGS = -O3 -ffree-form -cpp -fopenmp -foffload="-lm -latomic"
		FFLAGS += -DUSE_OPENMP_OFFLOAD -DUSE_CUDA
		LDFLAGS = -fopenmp -foffload="-lm -latomic" -L${OLCF_CUDA_ROOT}/lib64 -lnvToolsExt -lcudart
		else
		FC = gfortran
		FFLAGS = -O3 -ffree-form -cpp
		LDFLAGS =
		endif
		endif
		endif

		#### Spock and Crusher
		#### Spock and Crusher (GCC option not available on Crusher yet)
		ifeq ($(MACHINE),Spock)
		ROCM_LIB_PATH=${OLCF_ROCM_ROOT}/lib
		ROCM_INC_PATH=${OLCF_ROCM_ROOT}/include

		ifeq ($(COMPILER),Native)
		FC = ftn

		ifeq ($(CPU_or_GPU),GPU)
		### GPU
		FFLAGS = -N1023 -O3 -homp -em -ef -eT -ffree -DUSE_OPENMP_OFFLOAD -DUSE_AMD
		##FFLAGS += -I$(ROCM_INC_PATH)
		LDFLAGS = -homp -L$(ROCM_LIB_PATH) -lamdhip64 -lhsa-runtime64

		else
		### CPU-only without OpenMP
		#FFLAGS = -N1023 -O3 -hnoomp -em -ef -eT -ffree
		#LDFLAGS = -hnoomp

		### CPU-only with OpenMP flag ... no cpu-based openmp is in the code
		#FFLAGS = -N1023 -O3 -homp -em -ef -eT -ffree
		#LDFLAGS = -homp
		### CPU-only with OpenMP flag ... no cpu-based openmp is in the code, but I
		### want to use omp_get_wtime function ... don't define USE_OPENMP_OFFLOAD
		FFLAGS = -N1023 -O3 -homp -em -ef -eT -ffree
		LDFLAGS = -homp
		endif
		endif

		ifeq ($(COMPILER),GCC)
		ifeq ($(CPU_or_GPU),GPU)
		FC = gfortran
		FFLAGS = -O3 -ffree-form -cpp -fopenmp -foffload="-march=gfx908 -lm"
		FFLAGS += -DUSE_OPENMP_OFFLOAD -DUSE_AMD
		LDFLAGS = -fopenmp -foffload="-march=gfx908 -lm"
		#LDFLAGS += -L$(ROCM_LIB_PATH) -lamdhip64 -lhsa-runtime64
		LDFLAGS += -L$(ROCM_LIB_PATH) -lamdhip64
		else
		### CPU-only without OpenMP ... uses gfortran timer
		#FC = gfortran
		#FFLAGS = -O3 -ffree-form -ffree-line-length-none -cpp
		#LDFLAGS =

		### CPU-only with OpenMP flag ... no cpu-based openmp is in the code, but I
		### want to use omp_get_wtime function ... don't define USE_OPENMP_OFFLOAD
		FC = gfortran
		FFLAGS = -O3 -fopenmp -ffree-form -ffree-line-length-none -cpp
		LDFLAGS = -fopenmp
		endif
		endif
		endif

		F_SRC = custom_linsolve.f90
		@@ -57,8 +109,9 @@ OBJECTS = $(MODS:.f90=.o) $(F_SRC:.f90=.o)
		EXE = custom_linsolve.x

		# rules for makefile
		#$(FC) -o $@ $^ $(LDFLAGS)
		$(EXE): $(OBJECTS)
		$(FC) -o $@ $^ $(LDFLAGS)
		$(FC) $(LDFLAGS) -o $@ $^

		%.o:%.f90
		$(FC) $(FFLAGS) -c -o $@ $<

README.md

+5 −2

Original line number	Diff line number	Diff line
		@@ -5,8 +5,11 @@ Demonstration code the uses OpenMP to offload linear solver routines to Nvidia a

		SCRIPTS/setUpModules_summit.sh : loads necessary modules
		SCRIPTS/build_summit.sh : sources setUpModules_summit.sh and builds the code
		SCRIPTS/setUpModules_spock.sh : loads necessary modules
		SCRIPTS/build_spock.sh : sources setUpModules_spock.sh and builds the code

		RUNTIME_FILES contains submission scripts for Summit, an input file, and 6x6 matrix.
		RUNTIME_FILES contains submission scripts for Summit and Spock, an input file, and 6x6 matrix.

		All files were pulled from existing codes for testing except for nvtx.f90 which comes from https://github.com/maxcuda/NVTX_example.git
		All files were pulled from existing codes for testing except for nvtx.f90 which comes from
		https://github.com/maxcuda/NVTX_example.git. The nvtx.f90 was modified slightly for GNU/v11.

RUNTIME_FILES/batch_spock.sh

0 → 100644

+24 −0

Original line number	Diff line number	Diff line
		#!/bin/bash
		#SBATCH -A CSC434_SPOCK
		##SBATCH -A STF006_CRUSHER
		#SBATCH -J linsolve_test
		#SBATCH -o gpu_run.log
		#SBATCH -e stderr
		#SBATCH -t 00:05:00
		#SBATCH -p batch
		#SBATCH -N 1


		source setUpModules_spock.sh
		module list

		export LD_LIBRARY_PATH=${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH}

		export OMP_NUM_THREADS=1
		export HIP_LAUNCH_BLOCKING=1

		exec=./custom_linsolve.x


		srun -N1 -n1 -c16 --ntasks-per-gpu=1 --gpu-bind=closest --export=ALL $exec

RUNTIME_FILES/matrix_ops.f90

deleted100644 → 0

+0 −431

Original line number	Diff line number	Diff line
		module matrix_operations
		use omp_lib
		use global_parameters
		implicit none

		contains
		! =========================================================================
		subroutine matrix_vect_mult(neqn,aa,bb,cc)
		implicit none

		#ifdef USE_OPENMP_OFFLOAD
		!$omp declare target
		#endif

		integer, intent(in) :: neqn
		real*8, intent(in) :: aa(neqn,neqn),bb(neqn)
		real*8, intent(out) :: cc(neqn)

		integer :: i,j,k
		real*8 :: dummy

		do i = 1,neqn
		dummy = aa(i,1)*bb(1)
		do k = 2,neqn
		dummy = dummy + aa(i,k)*bb(k)
		enddo
		cc(i) = dummy
		enddo

		return
		end subroutine matrix_vect_mult
		! =========================================================================
		subroutine matrix_matrix_mult(neqn,aa,bb,cc)
		implicit none

		#ifdef USE_OPENMP_OFFLOAD
		!$omp declare target
		#endif

		integer, intent(in) :: neqn
		real*8, intent(in) :: aa(neqn,neqn),bb(neqn,neqn)
		real*8, intent(out) :: cc(neqn,neqn)

		integer :: i,j,k
		real*8 :: dummy

		do i = 1,neqn
		do j = 1,neqn
		dummy = aa(i,1)*bb(1,j)
		do k = 2,neqn
		dummy = dummy + aa(i,k)*bb(k,j)
		enddo
		cc(i,j) = dummy
		enddo
		enddo

		return
		end subroutine matrix_matrix_mult
		! =========================================================================
		subroutine small_value(neqn,aa)
		implicit none

		#ifdef USE_OPENMP_OFFLOAD
		!$omp declare target
		#endif

		integer, intent(in) :: neqn
		real*8, intent(inout) :: aa(neqn,neqn)

		integer :: i,j

		do j = 1,neqn
		do i = 1,neqn
		if (dabs(aa(i,j)) .lt. 1.0e-15) then
		aa(i,j) = 0.d0
		endif
		enddo
		enddo

		return
		end subroutine small_value
		! =========================================================================
		subroutine ludecomp(neqn,ppivot,aa,matrix_info)
		implicit none

		#ifdef USE_OPENMP_OFFLOAD
		!$omp declare target
		#endif

		! it uses gaussian elimination with row pivoting to reduce
		! a (neqn x neqn) matrix to an upper triangular form.
		! NOTE: also known as LU decompositon

		integer, intent(in) :: neqn
		integer, intent(inout) :: ppivot(neqn)
		real *8, intent(inout) :: aa(neqn,neqn)
		integer, intent(inout) :: matrix_info

		real*8 :: mult,apm
		integer :: l,m,row,col,pivot_loc,xtra
		integer :: pl, pm
		real*8 :: diag,maxdiag,absdiag

		! initialize values
		do m = 1,neqn
		ppivot(m) = m
		end do

		do m = 1,neqn-1

		row = m
		maxdiag = dabs(aa(ppivot(row),m))

		do l = m+1,neqn
		if (dabs(aa(ppivot(l),m)) .gt. maxdiag) then
		row = l
		maxdiag = dabs(aa(ppivot(row),m))
		endif
		end do

		pivot_loc = ppivot(row)

		! pivot the rows in-place (ie. re-map the rows instead
		! of physically exchanging rows)
		if (pivot_loc .ne. ppivot(m)) then
		xtra = ppivot(m)
		ppivot(m) = pivot_loc
		ppivot(row) = xtra
		endif

		pm = ppivot(m)
		absdiag = dabs(aa(pm,m))

		! Check for singularity ...
		! Instead of returning a error if absdiag <= 1.0e-16 like I would on the CPU,
		! set diag so that everything will work and let the error check identify the
		! problem after the lu decomp is completed.
		! NOTE: OpenMP has difficulty building GPU code with logic to return early
		if (absdiag .gt. 1.d-16) then
		diag = aa(pm,m)
		else
		diag = 1.d+16
		endif

		! find and store multipliers for the column
		do row = m+1,neqn
		pl = ppivot(row)
		if (aa(pl,m) .ne. 0.d0) then !! no need to mult by 0.0
		mult = aa(pl,m)/diag
		aa(pl,m) = mult
		!aa(pl,m) = aa(pl,m)/diag
		endif
		end do

		! reduce the rows in a column-oriented fashion
		do col = m+1,neqn
		if (aa(pm,col) .ne. 0.d0) then !! no need to add 0.0
		apm = aa(pm,col)
		do row = m+1,neqn
		pl = ppivot(row)
		if (aa(pl,m) .ne. 0.d0) then !! no need to mult by 0.0
		mult = aa(pl,m)
		aa(pl,col) = aa(pl,col) - mult*apm
		endif
		end do
		endif
		end do

		end do !end of the lu decomp loop

		! check the matrix for problems
		call check_matrix(neqn,ppivot,aa,matrix_info)

		return
		end subroutine ludecomp

		! =========================================================================o

		subroutine check_matrix(neqn,ppivot,aa,matrix_info)
		implicit none

		#ifdef USE_OPENMP_OFFLOAD
		!$omp declare target
		#endif

		integer, intent(in) :: neqn
		integer, intent(inout) :: ppivot(neqn)
		real *8, intent(inout) :: aa(neqn,neqn)
		integer, intent(inout) :: matrix_info

		integer :: l
		real *8 :: ratio
		real *8 :: det
		integer :: pm
		real *8 :: diag,absdiag,min_val,max_val

		det = 1.d0
		max_val = 0.d0
		min_val = 1.d+16

		do l = 1,neqn

		pm = ppivot(l)
		diag = aa(pm,l)
		absdiag = dabs(diag)
		det = det*(absdiag)
		max_val = max(absdiag,max_val)
		min_val = min(absdiag,min_val)

		! check for singularity at every location
		if (absdiag .le. 1.d-16) then
		!write(,) 'c matrix is not invertible on target'
		!write(,) 'diagonal entry <= 1.0e-16'
		!write(,) 'halting program'
		!write(,) ' '
		matrix_info = 1
		endif

		enddo

		if (matrix_info .eq. 0) then
		ratio = min_val/max_val

		if ((max_val .le. 1.d-16) .and. (min_val .le. 1.d-16)) then
		! all diagonal values <= 1.d-16
		! NOTE: matrix needs better scaling
		!write(,) 'all diagonal values <= 1.0e-16 on target'
		!write(,) 'abs(det) = ',det
		!write(,) 'needs better scaling'
		!write(,) 'halting program'
		!write(,) ' '
		matrix_info = 3
		elseif (ratio .le. 1.d-15) then
		! check for poor scaling between the diagonal values
		! NOTE: leads to poor conditioning
		!write(,) 'c matrix is ill-conditioned on target'
		!write(,) 'ratio = abs((min_diag)/(max_diag)) = ',ratio
		!write(,) 'abs(det) = ',det
		!write(,) 'halting program'
		!write(,) ' '
		matrix_info = 2
		endif

		endif

		return
		end subroutine check_matrix

		! =========================================================================
		subroutine fwd_bk_sub(neqn,nn,i,j,k)
		implicit none

		#ifdef USE_OPENMP_OFFLOAD
		!$omp declare target
		#endif

		integer, intent(in) :: neqn,nn,i,j,k
		real*8 :: zpl,zpm
		integer :: l,m
		integer :: pl,pm

		! perform the same eliminations on Z that were
		! completed on A (this is the quasi-forward substitution pass)
		do m = nn,neqn-1
		pm = pivot(m,i,j,k)
		if (z(pm,i,j,k) .ne. 0.d0) then !! no need to mult by 0.0
		zpm = z(pm,i,j,k)
		do l = m+1,neqn
		pl = pivot(l,i,j,k)
		if (a(pl,m,i,j,k) .ne. 0.d0) then !! no need to add 0.0
		z(pl,i,j,k) = z(pl,i,j,k) - a(pl,m,i,j,k)*zpm
		endif
		end do
		endif
		end do

		! solves the upper triangular system using column
		! oriented back substitution
		do l = neqn,1,-1
		pl = pivot(l,i,j,k)
		if (z(pl,i,j,k) .ne. 0.d0) then !! no need to mult by 0.0
		z(pl,i,j,k) = z(pl,i,j,k)/a(pl,l,i,j,k)
		zpl = z(pl,i,j,k)
		do m = l-1,1,-1
		pm = pivot(m,i,j,k)
		if (a(pm,l,i,j,k) .ne. 0.d0) then !! no need to add 0.0
		z(pm,i,j,k) = z(pm,i,j,k) - a(pm,l,i,j,k)*zpl
		endif
		end do
		endif
		end do

		return
		end subroutine fwd_bk_sub

		! =========================================================================
		subroutine find_inverse(neqn,i,j,k)
		implicit none

		#ifdef USE_OPENMP_OFFLOAD
		!$omp declare target
		#endif

		integer, intent(in) :: neqn,i,j,k
		integer :: l,m
		integer :: pm

		! using the LU decomp in matrix a, find and return the inverse of a
		! NOTE: the a matrix is pivoted during the LU decomp. the pivoting is
		! is used during the fwd/back substitution. once the inverse
		! is found, the pivoting is reversed before returning to the
		! calling function

		! NOTE: Finds the inverse of A directly

		do l = 1,neqn
		! initialize z with the appropriate column of the Identity matrix
		do m = 1, neqn
		z(m,i,j,k) = 0.d0
		enddo
		z(pivot(l,i,j,k),i,j,k) = 1.d0

		! solve for the appropriate column of the inverse of a
		! NOTE: uses the pivoted a matrix
		! NOTE2: can send in "l" iff we're initializing z with
		! a column of the Identity matrix ... otherwise
		! we must send in ONE : "1" instead of "l"
		call fwd_bk_sub(neqn,l,i,j,k)

		! store the column in a
		! NOTE: pivoting is reversed so that the matrix returned in a
		! is truly the inverse of the original a
		! NOTE2: also need to pivot the column with this algorithm
		do m = 1,neqn
		pm = pivot(m,i,j,k)
		ao(m,pivot(l,i,j,k),i,j,k) = z(pm,i,j,k)
		enddo
		enddo

		do m = 1,neqn
		do l = 1,neqn
		a(l,m,i,j,k) = ao(l,m,i,j,k)
		enddo
		enddo

		return
		end subroutine find_inverse

		! =========================================================================

		subroutine matrix_invert(neqn,i,j,k,matrix_info)
		implicit none

		#ifdef USE_OPENMP_OFFLOAD
		!$omp declare target
		#endif

		integer, intent(in) :: i,j,k,neqn
		integer, intent(inout) :: matrix_info

		! routines for inverting the C matrix using custom library

		call ludecomp(neqn,pivot(1,i,j,k),a(1,1,i,j,k),matrix_info)

		if (matrix_info .gt. 0) then
		!! bail if there is problem with the matrix
		return
		else
		!! find the inverse if the matrix passes the checks
		call find_inverse(neqn,i,j,k)
		endif

		return
		end subroutine matrix_invert

		! ====================================================================

		subroutine solve(neqn,i,j,k,matrix_info)
		implicit none

		#ifdef USE_OPENMP_OFFLOAD
		!$omp declare target
		#endif

		integer, intent(in) :: neqn,i,j,k
		integer, intent(inout) :: matrix_info

		real*8 :: z_tmp
		integer :: l,m,l_tmp

		! routines for inverting the C matrix using custom library

		! lu decomp of matrix c
		call ludecomp(neqn,pivot(1,i,j,k),a(1,1,i,j,k),matrix_info)

		if (matrix_info .gt. 0) then
		!! bail if there is problem with the matrix
		return
		else
		call fwd_bk_sub(neqn,1,i,j,k)

		! reverse the pivoting without an additional array
		do l = 1,neqn-1
		if (l .ne. pivot(l,i,j,k)) then

		! find the row to switch
		l_tmp = 0
		do m = l+1, neqn
		if (pivot(m,i,j,k) .eq. l) l_tmp = m
		end do

		! switch rows in solution vector
		z_tmp = z(pivot(l,i,j,k),i,j,k)
		z(pivot(l,i,j,k),i,j,k) = z(pivot(l_tmp,i,j,k),i,j,k)
		z(pivot(l_tmp,i,j,k),i,j,k) = z_tmp

		! update pivot vector
		pivot(l_tmp,i,j,k) = pivot(l,i,j,k)
		pivot(l,i,j,k) = l
		end if
		end do

		endif

		return
		end subroutine solve

		! ====================================================================

		end module Matrix_Operations