inlining of device functions, loop unrolling, and bug fixes to MSAMPI (07c7262d) · Commits · disMultiABM / namsa

namsa/cuda_kernels.py

+1 −1

Original line number	Diff line number	Diff line
		from pycuda.tools import dtype_to_ctype
		from pycuda.compiler import SourceModule
		from pycuda.compiler import SourceModule, DynamicSourceModule
		from jinja2 import Template
		import numpy as np
		import namsa

namsa/potential_kernels.cu

+5 −3

Original line number	Diff line number	Diff line
		@@ -10,16 +10,18 @@
		{
		const int pot_size_y = {{pot_shape_y}}, pot_size_x = {{pot_shape_x}};
		const int slice_size_y = {{y_sampling}}, slice_size_x = {{x_sampling}};
		const int num_slices = {{num_slices}}, sites_size = {{sites_size}};
		int row_idx = blockDim.y * blockIdx.y + threadIdx.y;
		int col_idx = blockDim.x * blockIdx.x + threadIdx.x;
		int stk_idx = blockDim.z * blockIdx.z + threadIdx.z;
		// if (stk_idx == 0 && row_idx == 0 && col_idx == 0)
		// {
		for (int slice_num=0; slice_num<{{num_slices}}; slice_num++)
		// #pragma unroll
		for (int slice_num=0; slice_num<num_slices; slice_num++)
		{
		if (stk_idx == slice_num)
		{
		for(int my_site=0;my_site<{{sites_size}}/3;my_site++)
		for(int my_site=0; my_site<sites_size/3; my_site++)
		{
		const int Z = sites[stk_idx][3 * my_site];
		const int y_cen = sites[stk_idx][3 * my_site + 1];
		@@ -40,7 +42,7 @@
		}
		__syncthreads();

		if (col_idx < slice_size_x && row_idx < slice_size_y && stk_idx < {{num_slices}})
		if (col_idx < slice_size_x && row_idx < slice_size_y && stk_idx < num_slices)
		{
		slice[stk_idx][row_idx][col_idx] = pycuda::complex<float>(cosf(slice[stk_idx][row_idx][col_idx]._M_re * sigma),
		sinf(slice[stk_idx][row_idx][col_idx]._M_re * sigma));

namsa/probe_kernels.cu

+10 −10

Original line number	Diff line number	Diff line
		@@ -8,14 +8,14 @@ __inline__ __device__ int warpReduceSumSync(int val, int mask){
		return val;
		}

		__device__ double calc_krad(float k_max, int size_x, int size_y, int col_idx, int row_idx){
		__inline__ __device__ double calc_krad(float k_max, int size_x, int size_y, int col_idx, int row_idx){
		double kx = double(col_idx) * double(k_max)/double(size_x - 1) - double(k_max)/2. ;
		double ky = double(row_idx) * double(k_max)/double(size_y - 1) - double(k_max)/2. ;
		double k_rad = sqrt(kx * kx + ky * ky);
		return k_rad;
		}

		__device__ float phase_shift(float k_max, int size_x, int size_y, int col_idx, int row_idx, int stk_idx,
		__inline__ __device__ float phase_shift(float k_max, int size_x, int size_y, int col_idx, int row_idx, int stk_idx,
		int grid_step, float grid_range){
		const double pi = acos(-1.0);
		float kx = float(col_idx) * k_max/float(size_x - 1) - k_max/2.;
		@@ -42,10 +42,10 @@ __global__ void norm_const_stack(pycuda::complex<float> arr[][{{x_sampling}} * {
		if (stk_idx < size_z)
		{
		int idx = blockIdx.x * blockDim.x + threadIdx.x;
		for(idx; idx < {{x_sampling}} * {{y_sampling}} ; idx += blockDim.x * gridDim.x)
		{
		sum += pycuda::norm(arr[stk_idx][idx]);
		}
		// for(idx; idx < {{x_sampling}} * {{y_sampling}} ; idx += blockDim.x * gridDim.x)
		// {
		// sum += pycuda::norm(arr[stk_idx][idx]);
		// }
		int mask = __ballot_sync(FULL_MASK, idx < {{x_sampling}} * {{y_sampling}});
		sum = warpReduceSumSync(sum, mask);
		if ((threadIdx.x & (warpSize - 1)) == 0)
		@@ -58,10 +58,10 @@ __global__ void norm_const_stack(pycuda::complex<float> arr[][{{x_sampling}} * {
		__global__ void norm_const(pycuda::complex<float> arr[][{{x_sampling}} * {{y_sampling}}], float* norm) {
		float sum = 0.f;
		int idx = blockIdx.x * blockDim.x + threadIdx.x;
		for(idx; idx < {{x_sampling}} * {{y_sampling}} ; idx += blockDim.x * gridDim.x)
		{
		sum += pycuda::norm(arr[0][idx]);
		}
		// for(idx; idx < {{x_sampling}} * {{y_sampling}} ; idx += blockDim.x * gridDim.x)
		// {
		// sum += pycuda::norm(arr[0][idx]);
		// }
		int mask = __ballot_sync(FULL_MASK, idx < {{x_sampling}} * {{y_sampling}});
		sum = warpReduceSumSync(sum, mask);
		if ((threadIdx.x & (warpSize - 1)) == 0)

Admin message