Commit 3e6e9340 authored by Laanait, Nouamane's avatar Laanait, Nouamane
Browse files

change to potential building kernel

parent b8df5eb3
Loading
Loading
Loading
Loading
+7 −5
Original line number Diff line number Diff line
@@ -490,7 +490,7 @@ class MSAGPU(MSAHybrid):
        # allocate memory
        atom_pot_stack_d = cuda.to_device(atom_pot_stack)
        sites_d = cuda.to_device(Zxy_input)
        self.potential_slices = cuda.aligned_empty((int(self.num_slices),
        self.potential_slices = cuda.aligned_zeros((int(self.num_slices),
                    int(self.sampling[0]), int(self.sampling[1])), np.complex64)
        self.vars = []
        self.vars.append(self.potential_slices.base)
@@ -504,7 +504,9 @@ class MSAGPU(MSAHybrid):
        block, grid = self._get_blockgrid([self.sampling[1], self.sampling[0], self.num_slices],
                    mode='3D')
        build_potential = self.pot_kernels['build_potential']

        print("block:%s, grid:%s" %(format(block), format(grid)))
        print("max, row idx:%d, col idx:%d, stk idx:%d" %(block[0]*grid[0], block[1]*grid[1], block[2]*grid[2]))
        print("sites: %s" %format(Zxy_input.shape))
        # build potential
        build_potential(potential_slices_d, atom_pot_stack_d, sites_d,
                        np.float32(self.sigma), block=block, grid=grid)
@@ -674,11 +676,11 @@ class MSAGPU(MSAHybrid):

        # allocate memory
        # self.probes = np.empty((self.num_probes, shape_y, shape_x), dtype=np.complex64)
        self.propag = cuda.aligned_empty((int(self.sampling[0]), int(self.sampling[1])), np.complex64)
        self.propag = cuda.aligned_zeros((int(self.sampling[0]), int(self.sampling[1])), np.complex64)
        self.vars.append(self.propag.base)
        self.propag = cuda.register_host_memory(self.propag)
        propag_d = cuda.to_device(self.propag)
        self.mask = cuda.aligned_empty((int(self.sampling[0]), int(self.sampling[1])), np.float32)
        self.mask = cuda.aligned_zeros((int(self.sampling[0]), int(self.sampling[1])), np.float32)
        self.vars.append(self.mask.base)
        self.mask = cuda.register_host_memory(self.mask)
        mask_d = cuda.to_device(self.mask)
@@ -690,7 +692,7 @@ class MSAGPU(MSAHybrid):
                                        dtype=np.complex64, mem_flags=cuda.mem_attach_flags.GLOBAL)
        else:
            # pinned memory is default
            self.probes = cuda.aligned_empty((int(self.num_probes), int(self.sampling[0]), int(self.sampling[1])), np.complex64)
            self.probes = cuda.aligned_zeros((int(self.num_probes), int(self.sampling[0]), int(self.sampling[1])), np.complex64)
            self.vars.append(self.probes.base)
            self.probes = cuda.register_host_memory(self.probes)
        ones = cuda.aligned_zeros((int(self.sampling[0]), int(self.sampling[1])), np.complex64) + 1
+53 −1
Original line number Diff line number Diff line
@@ -2,6 +2,55 @@
#define FULL_MASK 0xffffffff
#include <stdio.h>

    //  __global__ void BuildScatteringPotential(pycuda::complex<float> slice[][{{y_sampling}}][{{x_sampling}}],
    //                               float atom_pot_stack[][{{pot_shape_y}}][{{pot_shape_x}}],
    //                               int sites[][{{sites_size}}],
    //                               float sigma)

    //  {
    //      const int pot_size_y = {{pot_shape_y}}, pot_size_x = {{pot_shape_x}};
    //      const int slice_size_y = {{y_sampling}}, slice_size_x = {{x_sampling}};
    //      const int num_slices = {{num_slices}}, sites_size = {{sites_size}};
    //      int row_idx = blockDim.y * blockIdx.y + threadIdx.y;
    //      int col_idx = blockDim.x * blockIdx.x + threadIdx.x;
    //      int stk_idx = blockDim.z * blockIdx.z + threadIdx.z;
    //     // if (stk_idx == 0  && row_idx == 0  && col_idx == 0)
    //     // {
    //     // #pragma unroll
    //     for (int slice_num=0; slice_num<num_slices; slice_num++)
    //     {
    //          if (stk_idx == slice_num)
    //          {
    //              for(int my_site=0; my_site<sites_size/3; my_site++)
    //              {
    //                  const int Z = sites[stk_idx][3 * my_site];
    //                  const int y_cen = sites[stk_idx][3 * my_site + 1];
    //                  const int x_cen = sites[stk_idx][3 * my_site + 2];
    //                  const int y_start = rintf(y_cen - pot_size_y * 1.f/2);
    //                  const int y_end =  rintf(y_cen + pot_size_y * 1.f /2);
    //                  const int x_start = rintf(x_cen - pot_size_x * 1.f/2);
    //                  const int x_end = rintf(x_cen + pot_size_x * 1.f/2);
    //                  //printf(" slice_num: %d, Z: %d, y_cen:%d, x_cen: %d\\n ", slice_num, Z, y_cen, x_cen);
    //                  if (row_idx >=0 && row_idx < y_end && row_idx < slice_size_y && row_idx >= y_start
    //                      && col_idx >=0 && col_idx < x_end && col_idx < slice_size_x && col_idx >= x_start)
    //                  {
    //                      const int pot_i = row_idx-y_start, pot_j = col_idx-x_start;
    //                      atomicAdd(&slice[stk_idx][row_idx][col_idx]._M_re, atom_pot_stack[Z][pot_i][pot_j]);
    //                  }
    //              }
    //          }
    //      }
    //      __syncthreads();

    //     if (col_idx < slice_size_x && row_idx < slice_size_y && stk_idx < num_slices)
    //     {
    //          slice[stk_idx][row_idx][col_idx] = pycuda::complex<float>(cosf(slice[stk_idx][row_idx][col_idx]._M_re * sigma),
    //                                                                    sinf(slice[stk_idx][row_idx][col_idx]._M_re * sigma));
    //     }

    //  }

     
     __global__ void BuildScatteringPotential(pycuda::complex<float> slice[][{{y_sampling}}][{{x_sampling}}],
                                  float atom_pot_stack[][{{pot_shape_y}}][{{pot_shape_x}}],
                                  int sites[][{{sites_size}}],
@@ -42,10 +91,13 @@
         }
         __syncthreads();

        if (col_idx < slice_size_x && row_idx < slice_size_y && stk_idx < num_slices)
        if (col_idx < slice_size_x && row_idx < slice_size_y && stk_idx <= num_slices)
        {
             slice[stk_idx][row_idx][col_idx] = pycuda::complex<float>(cosf(slice[stk_idx][row_idx][col_idx]._M_re * sigma),
                                                                       sinf(slice[stk_idx][row_idx][col_idx]._M_re * sigma));
             if (isnan(slice[stk_idx][row_idx][col_idx]._M_re) || isnan(slice[stk_idx][row_idx][col_idx]._M_im)){
                slice[stk_idx][row_idx][col_idx] = pycuda::complex<float>(1.f, 1.f); 
             }
        }

     }