Commit 908afd35 authored by Nichols, Stephen's avatar Nichols, Stephen
Browse files

Changes to be committed:

	new file:   Makefile_summit
	new file:   RUNTIME_FILES/input
	new file:   RUNTIME_FILES/matrix
	new file:   RUNTIME_FILES/matrix6
	new file:   RUNTIME_FILES/matrix_meumapps6
	new file:   RUNTIME_FILES/matrix_ops.f90
	new file:   RUNTIME_FILES/summit_cpu.lsf
	new file:   RUNTIME_FILES/summit_gpu.lsf
	new file:   SCRIPTS/build_summit.sh
	new file:   SCRIPTS/setUpModules_summit.sh
	new file:   custom_linsolve.f90
	new file:   global_parameters.f90
	new file:   my_cudainterfaces.f90
	new file:   my_hipinterfaces.f90
	new file:   nvtx.f90
	new file:   timing_functions.f90

Forgot to add the files ....
parent bec581ba
SHELL=/bin/bash
MACHINE = Summit
### the Spock compiler flags also work on Crusher
#MACHINE = Spock
#### Summit
ifeq ($(MACHINE),Summit)
## CPU-Only
#FC = xlf
#FFLAGS = -O3 -qfree -qpreprocess -qtune=pwr9 -qarch=pwr9 -qtgtarch=sm_70 -W@,"-v"
#LDFLAGS =
## GPUs with OpenMP Offloading
FC = xlcuf
FFLAGS = -O3 -qfree -qpreprocess -qsmp=omp -qoffload -qcuda -qtune=pwr9 -qarch=pwr9 -qtgtarch=sm_70 -W@,"-v"
FFLAGS += -DUSE_OPENMP_OFFLOAD -DUSE_CUDA
LDFLAGS = -qsmp=omp -qoffload -L${OLCF_CUDA_ROOT}/lib64 -lnvToolsExt -lcudart
endif
#### Spock and Crusher
ifeq ($(MACHINE),Spock)
ROCM_LIB_PATH=${OLCF_ROCM_ROOT}/lib
ROCM_INC_PATH=${OLCF_ROCM_ROOT}/include
FC = ftn
### GPU
FFLAGS = -N1023 -O3 -homp -em -ef -eT -ffree -DUSE_OPENMP_OFFLOAD -DUSE_AMD
##FFLAGS += -I$(ROCM_INC_PATH)
LDFLAGS = -homp -L$(ROCM_LIB_PATH) -lamdhip64 -lhsa-runtime64
### CPU-only without OpenMP
#FFLAGS = -N1023 -O3 -hnoomp -em -ef -eT -ffree
#LDFLAGS = -hnoomp
### CPU-only with OpenMP flag ... no cpu-based openmp is in the code
#FFLAGS = -N1023 -O3 -homp -em -ef -eT -ffree
#LDFLAGS = -homp
endif
F_SRC = custom_linsolve.f90
MODS =
ifneq (,$(findstring USE_CUDA,$(FFLAGS)))
MODS += my_cudainterfaces.f90
MODS += nvtx.f90
endif
ifneq (,$(findstring USE_AMD,$(FFLAGS)))
MODS += my_hipinterfaces.f90
endif
MODS += global_parameters.f90 timing_functions.f90 matrix_ops.f90
OBJECTS = $(MODS:.f90=.o) $(F_SRC:.f90=.o)
EXE = custom_linsolve.x
# rules for makefile
$(EXE): $(OBJECTS)
$(FC) -o $@ $^ $(LDFLAGS)
%.o:%.f90
$(FC) $(FFLAGS) -c -o $@ $<
default: all
all: $(EXE)
clean:
rm -f *.o *.mod *.x *.s
clobber:
rm -f *.o *.mod *.x *.s *~
2 2 2
16 128
first line is nx, ny, nz
second line is nteams, nthreads
6
3.0 1.0 2.0 0.3 -1.8 0.0
-6.0 0.4 3.0 2.6 -1.8 -5.0
2.0 -5.0 8.0 -10.3 2.2 4.0
-2.0 1.0 -8.0 0.3 4.8 -1.0
6.0 1.1 5.4 -5.3 1.8 1.0
2.0 -5.0 2.0 -0.3 -0.8 -9.0
6
3.0 1.0 2.0 0.3 -1.8 0.0
-6.0 0.4 3.0 2.6 -1.8 -5.0
2.0 -5.0 8.0 -10.3 2.2 4.0
-2.0 1.0 -8.0 0.3 4.8 -1.0
6.0 1.1 5.4 -5.3 1.8 1.0
2.0 -5.0 2.0 -0.3 -0.8 -9.0
6
15970710450.3098679 -22759620211.5585022 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00
0.000000000000000000E+00 22759620211.5585022 -7191753351.78947639 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00
0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 29153743254.0771141 -4034123109.03958082 0.000000000000000000E+00
0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 4034123109.03958082 -14800757876.6520786
0.000000000000000000E+00 1.00000000000000000 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00
0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 1.00000000000000000 0.000000000000000000E+00
module matrix_operations
use omp_lib
use global_parameters
implicit none
contains
! =========================================================================
subroutine matrix_vect_mult(neqn,aa,bb,cc)
implicit none
#ifdef USE_OPENMP_OFFLOAD
!$omp declare target
#endif
integer, intent(in) :: neqn
real*8, intent(in) :: aa(neqn,neqn),bb(neqn)
real*8, intent(out) :: cc(neqn)
integer :: i,j,k
real*8 :: dummy
do i = 1,neqn
dummy = aa(i,1)*bb(1)
do k = 2,neqn
dummy = dummy + aa(i,k)*bb(k)
enddo
cc(i) = dummy
enddo
return
end subroutine matrix_vect_mult
! =========================================================================
subroutine matrix_matrix_mult(neqn,aa,bb,cc)
implicit none
#ifdef USE_OPENMP_OFFLOAD
!$omp declare target
#endif
integer, intent(in) :: neqn
real*8, intent(in) :: aa(neqn,neqn),bb(neqn,neqn)
real*8, intent(out) :: cc(neqn,neqn)
integer :: i,j,k
real*8 :: dummy
do i = 1,neqn
do j = 1,neqn
dummy = aa(i,1)*bb(1,j)
do k = 2,neqn
dummy = dummy + aa(i,k)*bb(k,j)
enddo
cc(i,j) = dummy
enddo
enddo
return
end subroutine matrix_matrix_mult
! =========================================================================
subroutine small_value(neqn,aa)
implicit none
#ifdef USE_OPENMP_OFFLOAD
!$omp declare target
#endif
integer, intent(in) :: neqn
real*8, intent(inout) :: aa(neqn,neqn)
integer :: i,j
do j = 1,neqn
do i = 1,neqn
if (dabs(aa(i,j)) .lt. 1.0e-15) then
aa(i,j) = 0.d0
endif
enddo
enddo
return
end subroutine small_value
! =========================================================================
subroutine ludecomp(neqn,ppivot,aa,matrix_info)
implicit none
#ifdef USE_OPENMP_OFFLOAD
!$omp declare target
#endif
! it uses gaussian elimination with row pivoting to reduce
! a (neqn x neqn) matrix to an upper triangular form.
! NOTE: also known as LU decompositon
integer, intent(in) :: neqn
integer, intent(inout) :: ppivot(neqn)
real *8, intent(inout) :: aa(neqn,neqn)
integer, intent(inout) :: matrix_info
real*8 :: mult,apm
integer :: l,m,row,col,pivot_loc,xtra
integer :: pl, pm
real*8 :: diag,maxdiag,absdiag
! initialize values
do m = 1,neqn
ppivot(m) = m
end do
do m = 1,neqn-1
row = m
maxdiag = dabs(aa(ppivot(row),m))
do l = m+1,neqn
if (dabs(aa(ppivot(l),m)) .gt. maxdiag) then
row = l
maxdiag = dabs(aa(ppivot(row),m))
endif
end do
pivot_loc = ppivot(row)
! pivot the rows in-place (ie. re-map the rows instead
! of physically exchanging rows)
if (pivot_loc .ne. ppivot(m)) then
xtra = ppivot(m)
ppivot(m) = pivot_loc
ppivot(row) = xtra
endif
pm = ppivot(m)
absdiag = dabs(aa(pm,m))
! Check for singularity ...
! Instead of returning a error if absdiag <= 1.0e-16 like I would on the CPU,
! set diag so that everything will work and let the error check identify the
! problem after the lu decomp is completed.
! NOTE: OpenMP has difficulty building GPU code with logic to return early
if (absdiag .gt. 1.d-16) then
diag = aa(pm,m)
else
diag = 1.d+16
endif
! find and store multipliers for the column
do row = m+1,neqn
pl = ppivot(row)
if (aa(pl,m) .ne. 0.d0) then !! no need to mult by 0.0
mult = aa(pl,m)/diag
aa(pl,m) = mult
!aa(pl,m) = aa(pl,m)/diag
endif
end do
! reduce the rows in a column-oriented fashion
do col = m+1,neqn
if (aa(pm,col) .ne. 0.d0) then !! no need to add 0.0
apm = aa(pm,col)
do row = m+1,neqn
pl = ppivot(row)
if (aa(pl,m) .ne. 0.d0) then !! no need to mult by 0.0
mult = aa(pl,m)
aa(pl,col) = aa(pl,col) - mult*apm
endif
end do
endif
end do
end do !end of the lu decomp loop
! check the matrix for problems
call check_matrix(neqn,ppivot,aa,matrix_info)
return
end subroutine ludecomp
! =========================================================================o
subroutine check_matrix(neqn,ppivot,aa,matrix_info)
implicit none
#ifdef USE_OPENMP_OFFLOAD
!$omp declare target
#endif
integer, intent(in) :: neqn
integer, intent(inout) :: ppivot(neqn)
real *8, intent(inout) :: aa(neqn,neqn)
integer, intent(inout) :: matrix_info
integer :: l
real *8 :: ratio
real *8 :: det
integer :: pm
real *8 :: diag,absdiag,min_val,max_val
det = 1.d0
max_val = 0.d0
min_val = 1.d+16
do l = 1,neqn
pm = ppivot(l)
diag = aa(pm,l)
absdiag = dabs(diag)
det = det*(absdiag)
max_val = max(absdiag,max_val)
min_val = min(absdiag,min_val)
! check for singularity at every location
if (absdiag .le. 1.d-16) then
!write(*,*) 'c matrix is not invertible on target'
!write(*,*) 'diagonal entry <= 1.0e-16'
!write(*,*) 'halting program'
!write(*,*) ' '
matrix_info = 1
endif
enddo
if (matrix_info .eq. 0) then
ratio = min_val/max_val
if ((max_val .le. 1.d-16) .and. (min_val .le. 1.d-16)) then
! all diagonal values <= 1.d-16
! NOTE: matrix needs better scaling
!write(*,*) 'all diagonal values <= 1.0e-16 on target'
!write(*,*) 'abs(det) = ',det
!write(*,*) 'needs better scaling'
!write(*,*) 'halting program'
!write(*,*) ' '
matrix_info = 3
elseif (ratio .le. 1.d-15) then
! check for poor scaling between the diagonal values
! NOTE: leads to poor conditioning
!write(*,*) 'c matrix is ill-conditioned on target'
!write(*,*) 'ratio = abs((min_diag)/(max_diag)) = ',ratio
!write(*,*) 'abs(det) = ',det
!write(*,*) 'halting program'
!write(*,*) ' '
matrix_info = 2
endif
endif
return
end subroutine check_matrix
! =========================================================================
subroutine fwd_bk_sub(neqn,nn,i,j,k)
implicit none
#ifdef USE_OPENMP_OFFLOAD
!$omp declare target
#endif
integer, intent(in) :: neqn,nn,i,j,k
real*8 :: zpl,zpm
integer :: l,m
integer :: pl,pm
! perform the same eliminations on Z that were
! completed on A (this is the quasi-forward substitution pass)
do m = nn,neqn-1
pm = pivot(m,i,j,k)
if (z(pm,i,j,k) .ne. 0.d0) then !! no need to mult by 0.0
zpm = z(pm,i,j,k)
do l = m+1,neqn
pl = pivot(l,i,j,k)
if (a(pl,m,i,j,k) .ne. 0.d0) then !! no need to add 0.0
z(pl,i,j,k) = z(pl,i,j,k) - a(pl,m,i,j,k)*zpm
endif
end do
endif
end do
! solves the upper triangular system using column
! oriented back substitution
do l = neqn,1,-1
pl = pivot(l,i,j,k)
if (z(pl,i,j,k) .ne. 0.d0) then !! no need to mult by 0.0
z(pl,i,j,k) = z(pl,i,j,k)/a(pl,l,i,j,k)
zpl = z(pl,i,j,k)
do m = l-1,1,-1
pm = pivot(m,i,j,k)
if (a(pm,l,i,j,k) .ne. 0.d0) then !! no need to add 0.0
z(pm,i,j,k) = z(pm,i,j,k) - a(pm,l,i,j,k)*zpl
endif
end do
endif
end do
return
end subroutine fwd_bk_sub
! =========================================================================
subroutine find_inverse(neqn,i,j,k)
implicit none
#ifdef USE_OPENMP_OFFLOAD
!$omp declare target
#endif
integer, intent(in) :: neqn,i,j,k
integer :: l,m
integer :: pm
! using the LU decomp in matrix a, find and return the inverse of a
! NOTE: the a matrix is pivoted during the LU decomp. the pivoting is
! is used during the fwd/back substitution. once the inverse
! is found, the pivoting is reversed before returning to the
! calling function
! NOTE: Finds the inverse of A directly
do l = 1,neqn
! initialize z with the appropriate column of the Identity matrix
do m = 1, neqn
z(m,i,j,k) = 0.d0
enddo
z(pivot(l,i,j,k),i,j,k) = 1.d0
! solve for the appropriate column of the inverse of a
! NOTE: uses the pivoted a matrix
! NOTE2: can send in "l" iff we're initializing z with
! a column of the Identity matrix ... otherwise
! we must send in ONE : "1" instead of "l"
call fwd_bk_sub(neqn,l,i,j,k)
! store the column in a
! NOTE: pivoting is reversed so that the matrix returned in a
! is truly the inverse of the original a
! NOTE2: also need to pivot the column with this algorithm
do m = 1,neqn
pm = pivot(m,i,j,k)
ao(m,pivot(l,i,j,k),i,j,k) = z(pm,i,j,k)
enddo
enddo
do m = 1,neqn
do l = 1,neqn
a(l,m,i,j,k) = ao(l,m,i,j,k)
enddo
enddo
return
end subroutine find_inverse
! =========================================================================
subroutine matrix_invert(neqn,i,j,k,matrix_info)
implicit none
#ifdef USE_OPENMP_OFFLOAD
!$omp declare target
#endif
integer, intent(in) :: i,j,k,neqn
integer, intent(inout) :: matrix_info
! routines for inverting the C matrix using custom library
call ludecomp(neqn,pivot(1,i,j,k),a(1,1,i,j,k),matrix_info)
if (matrix_info .gt. 0) then
!! bail if there is problem with the matrix
return
else
!! find the inverse if the matrix passes the checks
call find_inverse(neqn,i,j,k)
endif
return
end subroutine matrix_invert
! ====================================================================
subroutine solve(neqn,i,j,k,matrix_info)
implicit none
#ifdef USE_OPENMP_OFFLOAD
!$omp declare target
#endif
integer, intent(in) :: neqn,i,j,k
integer, intent(inout) :: matrix_info
real*8 :: z_tmp
integer :: l,m,l_tmp
! routines for inverting the C matrix using custom library
! lu decomp of matrix c
call ludecomp(neqn,pivot(1,i,j,k),a(1,1,i,j,k),matrix_info)
if (matrix_info .gt. 0) then
!! bail if there is problem with the matrix
return
else
call fwd_bk_sub(neqn,1,i,j,k)
! reverse the pivoting without an additional array
do l = 1,neqn-1
if (l .ne. pivot(l,i,j,k)) then
! find the row to switch
l_tmp = 0
do m = l+1, neqn
if (pivot(m,i,j,k) .eq. l) l_tmp = m
end do
! switch rows in solution vector
z_tmp = z(pivot(l,i,j,k),i,j,k)
z(pivot(l,i,j,k),i,j,k) = z(pivot(l_tmp,i,j,k),i,j,k)
z(pivot(l_tmp,i,j,k),i,j,k) = z_tmp
! update pivot vector
pivot(l_tmp,i,j,k) = pivot(l,i,j,k)
pivot(l,i,j,k) = l
end if
end do
endif
return
end subroutine solve
! ====================================================================
end module Matrix_Operations
#!/bin/bash
# Begin LSF Directives
#BSUB -P STF006
#BSUB -W 0:05
#BSUB -nnodes 1
#BSUB -alloc_flags "smt1"
#BSUB -J linsolve_test
#BSUB -o linsolve_test.%J
#BSUB -e linsolve_test.%J
date
#source setUpModules_summit.sh
module list
#exit
#export CUDA_LAUNCH_BLOCKING=1
export export XLFRTEOPTS=cpu_time_type=total_alltime
export OMP_NUM_THREADS=1
echo "start time"
ti='date +%s.%N'
$ti
jsrun -n 1 -a 1 -c 1 -g 0 ./custom_linsolve.x > cpu_run.log
echo "end time"
tf='date +%s.%N'
$tf
#!/bin/bash
# Begin LSF Directives
#BSUB -P STF006
#BSUB -W 0:05
#BSUB -nnodes 1
#BSUB -alloc_flags "smt1"
#BSUB -J linsolve_test
#BSUB -o linsolve_test.%J
#BSUB -e linsolve_test.%J
date
source setUpModules_summit.sh
module list
#exit
export CUDA_LAUNCH_BLOCKING=1
export export XLFRTEOPTS=cpu_time_type=total_alltime
export OMP_NUM_THREADS=1
echo "start time"
ti='date +%s.%N'
$ti
#jsrun -n 1 -a 1 -c 1 -g 1 ./custom_linsolve.x > gpu_run.log
jsrun -n 1 -a 1 -g 1 -c 1 -E HOME=${PWD} nsys profile -o GPU_Linsolve_Profile -f true --stats=true ./custom_linsolve.x > gpu_run.log