Loading namsa/msa.py +7 −5 Original line number Diff line number Diff line Loading @@ -490,7 +490,7 @@ class MSAGPU(MSAHybrid): # allocate memory atom_pot_stack_d = cuda.to_device(atom_pot_stack) sites_d = cuda.to_device(Zxy_input) self.potential_slices = cuda.aligned_empty((int(self.num_slices), self.potential_slices = cuda.aligned_zeros((int(self.num_slices), int(self.sampling[0]), int(self.sampling[1])), np.complex64) self.vars = [] self.vars.append(self.potential_slices.base) Loading @@ -504,7 +504,9 @@ class MSAGPU(MSAHybrid): block, grid = self._get_blockgrid([self.sampling[1], self.sampling[0], self.num_slices], mode='3D') build_potential = self.pot_kernels['build_potential'] print("block:%s, grid:%s" %(format(block), format(grid))) print("max, row idx:%d, col idx:%d, stk idx:%d" %(block[0]*grid[0], block[1]*grid[1], block[2]*grid[2])) print("sites: %s" %format(Zxy_input.shape)) # build potential build_potential(potential_slices_d, atom_pot_stack_d, sites_d, np.float32(self.sigma), block=block, grid=grid) Loading Loading @@ -674,11 +676,11 @@ class MSAGPU(MSAHybrid): # allocate memory # self.probes = np.empty((self.num_probes, shape_y, shape_x), dtype=np.complex64) self.propag = cuda.aligned_empty((int(self.sampling[0]), int(self.sampling[1])), np.complex64) self.propag = cuda.aligned_zeros((int(self.sampling[0]), int(self.sampling[1])), np.complex64) self.vars.append(self.propag.base) self.propag = cuda.register_host_memory(self.propag) propag_d = cuda.to_device(self.propag) self.mask = cuda.aligned_empty((int(self.sampling[0]), int(self.sampling[1])), np.float32) self.mask = cuda.aligned_zeros((int(self.sampling[0]), int(self.sampling[1])), np.float32) self.vars.append(self.mask.base) self.mask = cuda.register_host_memory(self.mask) mask_d = cuda.to_device(self.mask) Loading @@ -690,7 +692,7 @@ class MSAGPU(MSAHybrid): dtype=np.complex64, mem_flags=cuda.mem_attach_flags.GLOBAL) else: # pinned memory is default self.probes = cuda.aligned_empty((int(self.num_probes), int(self.sampling[0]), int(self.sampling[1])), np.complex64) self.probes = cuda.aligned_zeros((int(self.num_probes), int(self.sampling[0]), int(self.sampling[1])), np.complex64) self.vars.append(self.probes.base) self.probes = cuda.register_host_memory(self.probes) ones = cuda.aligned_zeros((int(self.sampling[0]), int(self.sampling[1])), np.complex64) + 1 Loading namsa/potential_kernels.cu +53 −1 Original line number Diff line number Diff line Loading @@ -2,6 +2,55 @@ #define FULL_MASK 0xffffffff #include <stdio.h> // __global__ void BuildScatteringPotential(pycuda::complex<float> slice[][{{y_sampling}}][{{x_sampling}}], // float atom_pot_stack[][{{pot_shape_y}}][{{pot_shape_x}}], // int sites[][{{sites_size}}], // float sigma) // { // const int pot_size_y = {{pot_shape_y}}, pot_size_x = {{pot_shape_x}}; // const int slice_size_y = {{y_sampling}}, slice_size_x = {{x_sampling}}; // const int num_slices = {{num_slices}}, sites_size = {{sites_size}}; // int row_idx = blockDim.y * blockIdx.y + threadIdx.y; // int col_idx = blockDim.x * blockIdx.x + threadIdx.x; // int stk_idx = blockDim.z * blockIdx.z + threadIdx.z; // // if (stk_idx == 0 && row_idx == 0 && col_idx == 0) // // { // // #pragma unroll // for (int slice_num=0; slice_num<num_slices; slice_num++) // { // if (stk_idx == slice_num) // { // for(int my_site=0; my_site<sites_size/3; my_site++) // { // const int Z = sites[stk_idx][3 * my_site]; // const int y_cen = sites[stk_idx][3 * my_site + 1]; // const int x_cen = sites[stk_idx][3 * my_site + 2]; // const int y_start = rintf(y_cen - pot_size_y * 1.f/2); // const int y_end = rintf(y_cen + pot_size_y * 1.f /2); // const int x_start = rintf(x_cen - pot_size_x * 1.f/2); // const int x_end = rintf(x_cen + pot_size_x * 1.f/2); // //printf(" slice_num: %d, Z: %d, y_cen:%d, x_cen: %d\\n ", slice_num, Z, y_cen, x_cen); // if (row_idx >=0 && row_idx < y_end && row_idx < slice_size_y && row_idx >= y_start // && col_idx >=0 && col_idx < x_end && col_idx < slice_size_x && col_idx >= x_start) // { // const int pot_i = row_idx-y_start, pot_j = col_idx-x_start; // atomicAdd(&slice[stk_idx][row_idx][col_idx]._M_re, atom_pot_stack[Z][pot_i][pot_j]); // } // } // } // } // __syncthreads(); // if (col_idx < slice_size_x && row_idx < slice_size_y && stk_idx < num_slices) // { // slice[stk_idx][row_idx][col_idx] = pycuda::complex<float>(cosf(slice[stk_idx][row_idx][col_idx]._M_re * sigma), // sinf(slice[stk_idx][row_idx][col_idx]._M_re * sigma)); // } // } __global__ void BuildScatteringPotential(pycuda::complex<float> slice[][{{y_sampling}}][{{x_sampling}}], float atom_pot_stack[][{{pot_shape_y}}][{{pot_shape_x}}], int sites[][{{sites_size}}], Loading Loading @@ -42,10 +91,13 @@ } __syncthreads(); if (col_idx < slice_size_x && row_idx < slice_size_y && stk_idx < num_slices) if (col_idx < slice_size_x && row_idx < slice_size_y && stk_idx <= num_slices) { slice[stk_idx][row_idx][col_idx] = pycuda::complex<float>(cosf(slice[stk_idx][row_idx][col_idx]._M_re * sigma), sinf(slice[stk_idx][row_idx][col_idx]._M_re * sigma)); if (isnan(slice[stk_idx][row_idx][col_idx]._M_re) || isnan(slice[stk_idx][row_idx][col_idx]._M_im)){ slice[stk_idx][row_idx][col_idx] = pycuda::complex<float>(1.f, 1.f); } } } Loading
namsa/msa.py +7 −5 Original line number Diff line number Diff line Loading @@ -490,7 +490,7 @@ class MSAGPU(MSAHybrid): # allocate memory atom_pot_stack_d = cuda.to_device(atom_pot_stack) sites_d = cuda.to_device(Zxy_input) self.potential_slices = cuda.aligned_empty((int(self.num_slices), self.potential_slices = cuda.aligned_zeros((int(self.num_slices), int(self.sampling[0]), int(self.sampling[1])), np.complex64) self.vars = [] self.vars.append(self.potential_slices.base) Loading @@ -504,7 +504,9 @@ class MSAGPU(MSAHybrid): block, grid = self._get_blockgrid([self.sampling[1], self.sampling[0], self.num_slices], mode='3D') build_potential = self.pot_kernels['build_potential'] print("block:%s, grid:%s" %(format(block), format(grid))) print("max, row idx:%d, col idx:%d, stk idx:%d" %(block[0]*grid[0], block[1]*grid[1], block[2]*grid[2])) print("sites: %s" %format(Zxy_input.shape)) # build potential build_potential(potential_slices_d, atom_pot_stack_d, sites_d, np.float32(self.sigma), block=block, grid=grid) Loading Loading @@ -674,11 +676,11 @@ class MSAGPU(MSAHybrid): # allocate memory # self.probes = np.empty((self.num_probes, shape_y, shape_x), dtype=np.complex64) self.propag = cuda.aligned_empty((int(self.sampling[0]), int(self.sampling[1])), np.complex64) self.propag = cuda.aligned_zeros((int(self.sampling[0]), int(self.sampling[1])), np.complex64) self.vars.append(self.propag.base) self.propag = cuda.register_host_memory(self.propag) propag_d = cuda.to_device(self.propag) self.mask = cuda.aligned_empty((int(self.sampling[0]), int(self.sampling[1])), np.float32) self.mask = cuda.aligned_zeros((int(self.sampling[0]), int(self.sampling[1])), np.float32) self.vars.append(self.mask.base) self.mask = cuda.register_host_memory(self.mask) mask_d = cuda.to_device(self.mask) Loading @@ -690,7 +692,7 @@ class MSAGPU(MSAHybrid): dtype=np.complex64, mem_flags=cuda.mem_attach_flags.GLOBAL) else: # pinned memory is default self.probes = cuda.aligned_empty((int(self.num_probes), int(self.sampling[0]), int(self.sampling[1])), np.complex64) self.probes = cuda.aligned_zeros((int(self.num_probes), int(self.sampling[0]), int(self.sampling[1])), np.complex64) self.vars.append(self.probes.base) self.probes = cuda.register_host_memory(self.probes) ones = cuda.aligned_zeros((int(self.sampling[0]), int(self.sampling[1])), np.complex64) + 1 Loading
namsa/potential_kernels.cu +53 −1 Original line number Diff line number Diff line Loading @@ -2,6 +2,55 @@ #define FULL_MASK 0xffffffff #include <stdio.h> // __global__ void BuildScatteringPotential(pycuda::complex<float> slice[][{{y_sampling}}][{{x_sampling}}], // float atom_pot_stack[][{{pot_shape_y}}][{{pot_shape_x}}], // int sites[][{{sites_size}}], // float sigma) // { // const int pot_size_y = {{pot_shape_y}}, pot_size_x = {{pot_shape_x}}; // const int slice_size_y = {{y_sampling}}, slice_size_x = {{x_sampling}}; // const int num_slices = {{num_slices}}, sites_size = {{sites_size}}; // int row_idx = blockDim.y * blockIdx.y + threadIdx.y; // int col_idx = blockDim.x * blockIdx.x + threadIdx.x; // int stk_idx = blockDim.z * blockIdx.z + threadIdx.z; // // if (stk_idx == 0 && row_idx == 0 && col_idx == 0) // // { // // #pragma unroll // for (int slice_num=0; slice_num<num_slices; slice_num++) // { // if (stk_idx == slice_num) // { // for(int my_site=0; my_site<sites_size/3; my_site++) // { // const int Z = sites[stk_idx][3 * my_site]; // const int y_cen = sites[stk_idx][3 * my_site + 1]; // const int x_cen = sites[stk_idx][3 * my_site + 2]; // const int y_start = rintf(y_cen - pot_size_y * 1.f/2); // const int y_end = rintf(y_cen + pot_size_y * 1.f /2); // const int x_start = rintf(x_cen - pot_size_x * 1.f/2); // const int x_end = rintf(x_cen + pot_size_x * 1.f/2); // //printf(" slice_num: %d, Z: %d, y_cen:%d, x_cen: %d\\n ", slice_num, Z, y_cen, x_cen); // if (row_idx >=0 && row_idx < y_end && row_idx < slice_size_y && row_idx >= y_start // && col_idx >=0 && col_idx < x_end && col_idx < slice_size_x && col_idx >= x_start) // { // const int pot_i = row_idx-y_start, pot_j = col_idx-x_start; // atomicAdd(&slice[stk_idx][row_idx][col_idx]._M_re, atom_pot_stack[Z][pot_i][pot_j]); // } // } // } // } // __syncthreads(); // if (col_idx < slice_size_x && row_idx < slice_size_y && stk_idx < num_slices) // { // slice[stk_idx][row_idx][col_idx] = pycuda::complex<float>(cosf(slice[stk_idx][row_idx][col_idx]._M_re * sigma), // sinf(slice[stk_idx][row_idx][col_idx]._M_re * sigma)); // } // } __global__ void BuildScatteringPotential(pycuda::complex<float> slice[][{{y_sampling}}][{{x_sampling}}], float atom_pot_stack[][{{pot_shape_y}}][{{pot_shape_x}}], int sites[][{{sites_size}}], Loading Loading @@ -42,10 +91,13 @@ } __syncthreads(); if (col_idx < slice_size_x && row_idx < slice_size_y && stk_idx < num_slices) if (col_idx < slice_size_x && row_idx < slice_size_y && stk_idx <= num_slices) { slice[stk_idx][row_idx][col_idx] = pycuda::complex<float>(cosf(slice[stk_idx][row_idx][col_idx]._M_re * sigma), sinf(slice[stk_idx][row_idx][col_idx]._M_re * sigma)); if (isnan(slice[stk_idx][row_idx][col_idx]._M_re) || isnan(slice[stk_idx][row_idx][col_idx]._M_im)){ slice[stk_idx][row_idx][col_idx] = pycuda::complex<float>(1.f, 1.f); } } }