Loading namsa/cuda_kernels.py +1 −1 Original line number Diff line number Diff line from pycuda.tools import dtype_to_ctype from pycuda.compiler import SourceModule from pycuda.compiler import SourceModule, DynamicSourceModule from jinja2 import Template import numpy as np import namsa Loading namsa/potential_kernels.cu +5 −3 Original line number Diff line number Diff line Loading @@ -10,16 +10,18 @@ { const int pot_size_y = {{pot_shape_y}}, pot_size_x = {{pot_shape_x}}; const int slice_size_y = {{y_sampling}}, slice_size_x = {{x_sampling}}; const int num_slices = {{num_slices}}, sites_size = {{sites_size}}; int row_idx = blockDim.y * blockIdx.y + threadIdx.y; int col_idx = blockDim.x * blockIdx.x + threadIdx.x; int stk_idx = blockDim.z * blockIdx.z + threadIdx.z; // if (stk_idx == 0 && row_idx == 0 && col_idx == 0) // { for (int slice_num=0; slice_num<{{num_slices}}; slice_num++) // #pragma unroll for (int slice_num=0; slice_num<num_slices; slice_num++) { if (stk_idx == slice_num) { for(int my_site=0;my_site<{{sites_size}}/3;my_site++) for(int my_site=0; my_site<sites_size/3; my_site++) { const int Z = sites[stk_idx][3 * my_site]; const int y_cen = sites[stk_idx][3 * my_site + 1]; Loading @@ -40,7 +42,7 @@ } __syncthreads(); if (col_idx < slice_size_x && row_idx < slice_size_y && stk_idx < {{num_slices}}) if (col_idx < slice_size_x && row_idx < slice_size_y && stk_idx < num_slices) { slice[stk_idx][row_idx][col_idx] = pycuda::complex<float>(cosf(slice[stk_idx][row_idx][col_idx]._M_re * sigma), sinf(slice[stk_idx][row_idx][col_idx]._M_re * sigma)); Loading namsa/probe_kernels.cu +10 −10 Original line number Diff line number Diff line Loading @@ -8,14 +8,14 @@ __inline__ __device__ int warpReduceSumSync(int val, int mask){ return val; } __device__ double calc_krad(float k_max, int size_x, int size_y, int col_idx, int row_idx){ __inline__ __device__ double calc_krad(float k_max, int size_x, int size_y, int col_idx, int row_idx){ double kx = double(col_idx) * double(k_max)/double(size_x - 1) - double(k_max)/2. ; double ky = double(row_idx) * double(k_max)/double(size_y - 1) - double(k_max)/2. ; double k_rad = sqrt(kx * kx + ky * ky); return k_rad; } __device__ float phase_shift(float k_max, int size_x, int size_y, int col_idx, int row_idx, int stk_idx, __inline__ __device__ float phase_shift(float k_max, int size_x, int size_y, int col_idx, int row_idx, int stk_idx, int *grid_step, float *grid_range){ const double pi = acos(-1.0); float kx = float(col_idx) * k_max/float(size_x - 1) - k_max/2.; Loading @@ -42,10 +42,10 @@ __global__ void norm_const_stack(pycuda::complex<float> arr[][{{x_sampling}} * { if (stk_idx < size_z) { int idx = blockIdx.x * blockDim.x + threadIdx.x; for(idx; idx < {{x_sampling}} * {{y_sampling}} ; idx += blockDim.x * gridDim.x) { sum += pycuda::norm(arr[stk_idx][idx]); } // for(idx; idx < {{x_sampling}} * {{y_sampling}} ; idx += blockDim.x * gridDim.x) // { // sum += pycuda::norm(arr[stk_idx][idx]); // } int mask = __ballot_sync(FULL_MASK, idx < {{x_sampling}} * {{y_sampling}}); sum = warpReduceSumSync(sum, mask); if ((threadIdx.x & (warpSize - 1)) == 0) Loading @@ -58,10 +58,10 @@ __global__ void norm_const_stack(pycuda::complex<float> arr[][{{x_sampling}} * { __global__ void norm_const(pycuda::complex<float> arr[][{{x_sampling}} * {{y_sampling}}], float* norm) { float sum = 0.f; int idx = blockIdx.x * blockDim.x + threadIdx.x; for(idx; idx < {{x_sampling}} * {{y_sampling}} ; idx += blockDim.x * gridDim.x) { sum += pycuda::norm(arr[0][idx]); } // for(idx; idx < {{x_sampling}} * {{y_sampling}} ; idx += blockDim.x * gridDim.x) // { // sum += pycuda::norm(arr[0][idx]); // } int mask = __ballot_sync(FULL_MASK, idx < {{x_sampling}} * {{y_sampling}}); sum = warpReduceSumSync(sum, mask); if ((threadIdx.x & (warpSize - 1)) == 0) Loading Loading
namsa/cuda_kernels.py +1 −1 Original line number Diff line number Diff line from pycuda.tools import dtype_to_ctype from pycuda.compiler import SourceModule from pycuda.compiler import SourceModule, DynamicSourceModule from jinja2 import Template import numpy as np import namsa Loading
namsa/potential_kernels.cu +5 −3 Original line number Diff line number Diff line Loading @@ -10,16 +10,18 @@ { const int pot_size_y = {{pot_shape_y}}, pot_size_x = {{pot_shape_x}}; const int slice_size_y = {{y_sampling}}, slice_size_x = {{x_sampling}}; const int num_slices = {{num_slices}}, sites_size = {{sites_size}}; int row_idx = blockDim.y * blockIdx.y + threadIdx.y; int col_idx = blockDim.x * blockIdx.x + threadIdx.x; int stk_idx = blockDim.z * blockIdx.z + threadIdx.z; // if (stk_idx == 0 && row_idx == 0 && col_idx == 0) // { for (int slice_num=0; slice_num<{{num_slices}}; slice_num++) // #pragma unroll for (int slice_num=0; slice_num<num_slices; slice_num++) { if (stk_idx == slice_num) { for(int my_site=0;my_site<{{sites_size}}/3;my_site++) for(int my_site=0; my_site<sites_size/3; my_site++) { const int Z = sites[stk_idx][3 * my_site]; const int y_cen = sites[stk_idx][3 * my_site + 1]; Loading @@ -40,7 +42,7 @@ } __syncthreads(); if (col_idx < slice_size_x && row_idx < slice_size_y && stk_idx < {{num_slices}}) if (col_idx < slice_size_x && row_idx < slice_size_y && stk_idx < num_slices) { slice[stk_idx][row_idx][col_idx] = pycuda::complex<float>(cosf(slice[stk_idx][row_idx][col_idx]._M_re * sigma), sinf(slice[stk_idx][row_idx][col_idx]._M_re * sigma)); Loading
namsa/probe_kernels.cu +10 −10 Original line number Diff line number Diff line Loading @@ -8,14 +8,14 @@ __inline__ __device__ int warpReduceSumSync(int val, int mask){ return val; } __device__ double calc_krad(float k_max, int size_x, int size_y, int col_idx, int row_idx){ __inline__ __device__ double calc_krad(float k_max, int size_x, int size_y, int col_idx, int row_idx){ double kx = double(col_idx) * double(k_max)/double(size_x - 1) - double(k_max)/2. ; double ky = double(row_idx) * double(k_max)/double(size_y - 1) - double(k_max)/2. ; double k_rad = sqrt(kx * kx + ky * ky); return k_rad; } __device__ float phase_shift(float k_max, int size_x, int size_y, int col_idx, int row_idx, int stk_idx, __inline__ __device__ float phase_shift(float k_max, int size_x, int size_y, int col_idx, int row_idx, int stk_idx, int *grid_step, float *grid_range){ const double pi = acos(-1.0); float kx = float(col_idx) * k_max/float(size_x - 1) - k_max/2.; Loading @@ -42,10 +42,10 @@ __global__ void norm_const_stack(pycuda::complex<float> arr[][{{x_sampling}} * { if (stk_idx < size_z) { int idx = blockIdx.x * blockDim.x + threadIdx.x; for(idx; idx < {{x_sampling}} * {{y_sampling}} ; idx += blockDim.x * gridDim.x) { sum += pycuda::norm(arr[stk_idx][idx]); } // for(idx; idx < {{x_sampling}} * {{y_sampling}} ; idx += blockDim.x * gridDim.x) // { // sum += pycuda::norm(arr[stk_idx][idx]); // } int mask = __ballot_sync(FULL_MASK, idx < {{x_sampling}} * {{y_sampling}}); sum = warpReduceSumSync(sum, mask); if ((threadIdx.x & (warpSize - 1)) == 0) Loading @@ -58,10 +58,10 @@ __global__ void norm_const_stack(pycuda::complex<float> arr[][{{x_sampling}} * { __global__ void norm_const(pycuda::complex<float> arr[][{{x_sampling}} * {{y_sampling}}], float* norm) { float sum = 0.f; int idx = blockIdx.x * blockDim.x + threadIdx.x; for(idx; idx < {{x_sampling}} * {{y_sampling}} ; idx += blockDim.x * gridDim.x) { sum += pycuda::norm(arr[0][idx]); } // for(idx; idx < {{x_sampling}} * {{y_sampling}} ; idx += blockDim.x * gridDim.x) // { // sum += pycuda::norm(arr[0][idx]); // } int mask = __ballot_sync(FULL_MASK, idx < {{x_sampling}} * {{y_sampling}}); sum = warpReduceSumSync(sum, mask); if ((threadIdx.x & (warpSize - 1)) == 0) Loading