Commit 07c7262d authored by Laanait, Nouamane's avatar Laanait, Nouamane
Browse files

inlining of device functions, loop unrolling, and bug fixes to MSAMPI

parent 81f055fd
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
from pycuda.tools import dtype_to_ctype
from pycuda.compiler import SourceModule
from pycuda.compiler import SourceModule, DynamicSourceModule
from jinja2 import Template
import numpy as np
import namsa
+5 −3
Original line number Diff line number Diff line
@@ -10,16 +10,18 @@
     {
         const int pot_size_y = {{pot_shape_y}}, pot_size_x = {{pot_shape_x}};
         const int slice_size_y = {{y_sampling}}, slice_size_x = {{x_sampling}};
         const int num_slices = {{num_slices}}, sites_size = {{sites_size}};
         int row_idx = blockDim.y * blockIdx.y + threadIdx.y;
         int col_idx = blockDim.x * blockIdx.x + threadIdx.x;
         int stk_idx = blockDim.z * blockIdx.z + threadIdx.z;
        // if (stk_idx == 0  && row_idx == 0  && col_idx == 0)
        // {
        for (int slice_num=0; slice_num<{{num_slices}}; slice_num++)
        // #pragma unroll
        for (int slice_num=0; slice_num<num_slices; slice_num++)
        {
             if (stk_idx == slice_num)
             {
                 for(int my_site=0;my_site<{{sites_size}}/3;my_site++)
                 for(int my_site=0; my_site<sites_size/3; my_site++)
                 {
                     const int Z = sites[stk_idx][3 * my_site];
                     const int y_cen = sites[stk_idx][3 * my_site + 1];
@@ -40,7 +42,7 @@
         }
         __syncthreads();

        if (col_idx < slice_size_x && row_idx < slice_size_y && stk_idx < {{num_slices}})
        if (col_idx < slice_size_x && row_idx < slice_size_y && stk_idx < num_slices)
        {
             slice[stk_idx][row_idx][col_idx] = pycuda::complex<float>(cosf(slice[stk_idx][row_idx][col_idx]._M_re * sigma),
                                                                       sinf(slice[stk_idx][row_idx][col_idx]._M_re * sigma));
+10 −10
Original line number Diff line number Diff line
@@ -8,14 +8,14 @@ __inline__ __device__ int warpReduceSumSync(int val, int mask){
  return val;
}

__device__ double calc_krad(float k_max, int size_x, int size_y, int col_idx, int row_idx){
 __inline__ __device__ double calc_krad(float k_max, int size_x, int size_y, int col_idx, int row_idx){
    double kx = double(col_idx) * double(k_max)/double(size_x - 1) - double(k_max)/2. ;
    double ky = double(row_idx) * double(k_max)/double(size_y - 1) - double(k_max)/2. ;
    double k_rad = sqrt(kx * kx + ky * ky);
    return k_rad;
}

__device__ float phase_shift(float k_max, int size_x, int size_y, int col_idx, int row_idx, int stk_idx,
 __inline__ __device__ float phase_shift(float k_max, int size_x, int size_y, int col_idx, int row_idx, int stk_idx,
    int *grid_step, float *grid_range){
    const double pi = acos(-1.0);
    float kx = float(col_idx) * k_max/float(size_x - 1) - k_max/2.;
@@ -42,10 +42,10 @@ __global__ void norm_const_stack(pycuda::complex<float> arr[][{{x_sampling}} * {
  if (stk_idx < size_z)
  {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    for(idx;  idx < {{x_sampling}} * {{y_sampling}} ; idx += blockDim.x * gridDim.x)
      {
         sum += pycuda::norm(arr[stk_idx][idx]);
      }
    // for(idx;  idx < {{x_sampling}} * {{y_sampling}} ; idx += blockDim.x * gridDim.x)
    // {
    //      sum += pycuda::norm(arr[stk_idx][idx]);
    // }
    int mask = __ballot_sync(FULL_MASK, idx < {{x_sampling}} * {{y_sampling}});
    sum = warpReduceSumSync(sum, mask);
    if ((threadIdx.x & (warpSize - 1)) == 0)
@@ -58,10 +58,10 @@ __global__ void norm_const_stack(pycuda::complex<float> arr[][{{x_sampling}} * {
__global__ void norm_const(pycuda::complex<float> arr[][{{x_sampling}} * {{y_sampling}}], float* norm) {
  float sum = 0.f;
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  for(idx;  idx < {{x_sampling}} * {{y_sampling}} ; idx += blockDim.x * gridDim.x)
    {
       sum += pycuda::norm(arr[0][idx]);
    }
  // for(idx;  idx < {{x_sampling}} * {{y_sampling}} ; idx += blockDim.x * gridDim.x)
  //   {
  //      sum += pycuda::norm(arr[0][idx]);
  //   }
  int mask = __ballot_sync(FULL_MASK, idx < {{x_sampling}} * {{y_sampling}});
  sum = warpReduceSumSync(sum, mask);
  if ((threadIdx.x & (warpSize - 1)) == 0)