Loading namsa/msa.py +6 −5 Original line number Diff line number Diff line Loading @@ -467,10 +467,11 @@ class MSAGPU(MSAHybrid): for Z, yx in zip(Z_arr, yx_arr)]) # pad sites array to have equal num of element per slice shapes = np.array([itm.shape for itm in Zxy_input]) pads = [(0, shapes.max() - itm.shape) for itm in Zxy_input] padded = [np.pad(itm,pad,'constant',constant_values=-1) for pad, itm in zip(pads, Zxy_input)] max_shape = np.max(np.array([itm.shape for itm in Zxy_input])) pads = [(0, max_shape - itm.shape[0]) for itm in Zxy_input] # padded = [np.pad(itm,pad,'constant',constant_values=-1) for pad, itm in zip(pads, Zxy_input)] Zxy_input = np.vstack([np.pad(itm, pad, 'constant', constant_values=-1) for pad, itm in zip(pads, Zxy_input)]) Zxy_input = Zxy_input.astype(np.float32) # stack atomic potentials of unique elements atom_pot_stack = np.array([self.cached_pots[uq_Z] for uq_Z in unique_Z]).astype(np.float32) Loading namsa/potential_kernels.cu +7 −54 Original line number Diff line number Diff line Loading @@ -2,58 +2,9 @@ #define FULL_MASK 0xffffffff #include <stdio.h> // __global__ void BuildScatteringPotential(pycuda::complex<float> slice[][{{y_sampling}}][{{x_sampling}}], // float atom_pot_stack[][{{pot_shape_y}}][{{pot_shape_x}}], // int sites[][{{sites_size}}], // float sigma) // { // const int pot_size_y = {{pot_shape_y}}, pot_size_x = {{pot_shape_x}}; // const int slice_size_y = {{y_sampling}}, slice_size_x = {{x_sampling}}; // const int num_slices = {{num_slices}}, sites_size = {{sites_size}}; // int row_idx = blockDim.y * blockIdx.y + threadIdx.y; // int col_idx = blockDim.x * blockIdx.x + threadIdx.x; // int stk_idx = blockDim.z * blockIdx.z + threadIdx.z; // // if (stk_idx == 0 && row_idx == 0 && col_idx == 0) // // { // // #pragma unroll // for (int slice_num=0; slice_num<num_slices; slice_num++) // { // if (stk_idx == slice_num) // { // for(int my_site=0; my_site<sites_size/3; my_site++) // { // const int Z = sites[stk_idx][3 * my_site]; // const int y_cen = sites[stk_idx][3 * my_site + 1]; // const int x_cen = sites[stk_idx][3 * my_site + 2]; // const int y_start = rintf(y_cen - pot_size_y * 1.f/2); // const int y_end = rintf(y_cen + pot_size_y * 1.f /2); // const int x_start = rintf(x_cen - pot_size_x * 1.f/2); // const int x_end = rintf(x_cen + pot_size_x * 1.f/2); // //printf(" slice_num: %d, Z: %d, y_cen:%d, x_cen: %d\\n ", slice_num, Z, y_cen, x_cen); // if (row_idx >=0 && row_idx < y_end && row_idx < slice_size_y && row_idx >= y_start // && col_idx >=0 && col_idx < x_end && col_idx < slice_size_x && col_idx >= x_start) // { // const int pot_i = row_idx-y_start, pot_j = col_idx-x_start; // atomicAdd(&slice[stk_idx][row_idx][col_idx]._M_re, atom_pot_stack[Z][pot_i][pot_j]); // } // } // } // } // __syncthreads(); // if (col_idx < slice_size_x && row_idx < slice_size_y && stk_idx < num_slices) // { // slice[stk_idx][row_idx][col_idx] = pycuda::complex<float>(cosf(slice[stk_idx][row_idx][col_idx]._M_re * sigma), // sinf(slice[stk_idx][row_idx][col_idx]._M_re * sigma)); // } // } __global__ void BuildScatteringPotential(pycuda::complex<float> slice[][{{y_sampling}}][{{x_sampling}}], float atom_pot_stack[][{{pot_shape_y}}][{{pot_shape_x}}], int sites[][{{sites_size}}], float sites[][{{sites_size}}], float sigma) { Loading @@ -72,9 +23,9 @@ { for(int my_site=0; my_site<sites_size/3; my_site++) { const int Z = sites[stk_idx][3 * my_site]; const int y_cen = sites[stk_idx][3 * my_site + 1]; const int x_cen = sites[stk_idx][3 * my_site + 2]; const int Z = (int) sites[stk_idx][3 * my_site]; const int y_cen = (int) sites[stk_idx][3 * my_site + 1]; const int x_cen = (int) sites[stk_idx][3 * my_site + 2]; const int y_start = rintf(y_cen - pot_size_y * 1.f/2); const int y_end = rintf(y_cen + pot_size_y * 1.f /2); const int x_start = rintf(x_cen - pot_size_x * 1.f/2); Loading @@ -84,11 +35,13 @@ && col_idx >=0 && col_idx < x_end && col_idx < slice_size_x && col_idx >= x_start) { const int pot_i = row_idx-y_start, pot_j = col_idx-x_start; if (pot_i < pot_size_y && pot_j < pot_size_x && pot_i >= 0 && pot_j >= 0){ atomicAdd(&slice[stk_idx][row_idx][col_idx]._M_re, atom_pot_stack[Z][pot_i][pot_j]); } } } } } __syncthreads(); if (col_idx < slice_size_x && row_idx < slice_size_y && stk_idx < num_slices) Loading setup.py +1 −1 Original line number Diff line number Diff line Loading @@ -9,7 +9,7 @@ setup( author='Numan Laanait', author_email='laanaitn@ornl.gov', description='', install_requires=['scipy', 'pymatgen', 'numpy', 'pycuda', 'scikit-cuda'], install_requires=['scipy', 'pymatgen', 'numpy', 'pycuda==2019.1', 'scikit-cuda', 'mpi4py'], #install_requires=['numpy', 'scipy', 'pymatgen', 'pybtex'], test_suite='tests', python_requires='>=3.6', Loading Loading
namsa/msa.py +6 −5 Original line number Diff line number Diff line Loading @@ -467,10 +467,11 @@ class MSAGPU(MSAHybrid): for Z, yx in zip(Z_arr, yx_arr)]) # pad sites array to have equal num of element per slice shapes = np.array([itm.shape for itm in Zxy_input]) pads = [(0, shapes.max() - itm.shape) for itm in Zxy_input] padded = [np.pad(itm,pad,'constant',constant_values=-1) for pad, itm in zip(pads, Zxy_input)] max_shape = np.max(np.array([itm.shape for itm in Zxy_input])) pads = [(0, max_shape - itm.shape[0]) for itm in Zxy_input] # padded = [np.pad(itm,pad,'constant',constant_values=-1) for pad, itm in zip(pads, Zxy_input)] Zxy_input = np.vstack([np.pad(itm, pad, 'constant', constant_values=-1) for pad, itm in zip(pads, Zxy_input)]) Zxy_input = Zxy_input.astype(np.float32) # stack atomic potentials of unique elements atom_pot_stack = np.array([self.cached_pots[uq_Z] for uq_Z in unique_Z]).astype(np.float32) Loading
namsa/potential_kernels.cu +7 −54 Original line number Diff line number Diff line Loading @@ -2,58 +2,9 @@ #define FULL_MASK 0xffffffff #include <stdio.h> // __global__ void BuildScatteringPotential(pycuda::complex<float> slice[][{{y_sampling}}][{{x_sampling}}], // float atom_pot_stack[][{{pot_shape_y}}][{{pot_shape_x}}], // int sites[][{{sites_size}}], // float sigma) // { // const int pot_size_y = {{pot_shape_y}}, pot_size_x = {{pot_shape_x}}; // const int slice_size_y = {{y_sampling}}, slice_size_x = {{x_sampling}}; // const int num_slices = {{num_slices}}, sites_size = {{sites_size}}; // int row_idx = blockDim.y * blockIdx.y + threadIdx.y; // int col_idx = blockDim.x * blockIdx.x + threadIdx.x; // int stk_idx = blockDim.z * blockIdx.z + threadIdx.z; // // if (stk_idx == 0 && row_idx == 0 && col_idx == 0) // // { // // #pragma unroll // for (int slice_num=0; slice_num<num_slices; slice_num++) // { // if (stk_idx == slice_num) // { // for(int my_site=0; my_site<sites_size/3; my_site++) // { // const int Z = sites[stk_idx][3 * my_site]; // const int y_cen = sites[stk_idx][3 * my_site + 1]; // const int x_cen = sites[stk_idx][3 * my_site + 2]; // const int y_start = rintf(y_cen - pot_size_y * 1.f/2); // const int y_end = rintf(y_cen + pot_size_y * 1.f /2); // const int x_start = rintf(x_cen - pot_size_x * 1.f/2); // const int x_end = rintf(x_cen + pot_size_x * 1.f/2); // //printf(" slice_num: %d, Z: %d, y_cen:%d, x_cen: %d\\n ", slice_num, Z, y_cen, x_cen); // if (row_idx >=0 && row_idx < y_end && row_idx < slice_size_y && row_idx >= y_start // && col_idx >=0 && col_idx < x_end && col_idx < slice_size_x && col_idx >= x_start) // { // const int pot_i = row_idx-y_start, pot_j = col_idx-x_start; // atomicAdd(&slice[stk_idx][row_idx][col_idx]._M_re, atom_pot_stack[Z][pot_i][pot_j]); // } // } // } // } // __syncthreads(); // if (col_idx < slice_size_x && row_idx < slice_size_y && stk_idx < num_slices) // { // slice[stk_idx][row_idx][col_idx] = pycuda::complex<float>(cosf(slice[stk_idx][row_idx][col_idx]._M_re * sigma), // sinf(slice[stk_idx][row_idx][col_idx]._M_re * sigma)); // } // } __global__ void BuildScatteringPotential(pycuda::complex<float> slice[][{{y_sampling}}][{{x_sampling}}], float atom_pot_stack[][{{pot_shape_y}}][{{pot_shape_x}}], int sites[][{{sites_size}}], float sites[][{{sites_size}}], float sigma) { Loading @@ -72,9 +23,9 @@ { for(int my_site=0; my_site<sites_size/3; my_site++) { const int Z = sites[stk_idx][3 * my_site]; const int y_cen = sites[stk_idx][3 * my_site + 1]; const int x_cen = sites[stk_idx][3 * my_site + 2]; const int Z = (int) sites[stk_idx][3 * my_site]; const int y_cen = (int) sites[stk_idx][3 * my_site + 1]; const int x_cen = (int) sites[stk_idx][3 * my_site + 2]; const int y_start = rintf(y_cen - pot_size_y * 1.f/2); const int y_end = rintf(y_cen + pot_size_y * 1.f /2); const int x_start = rintf(x_cen - pot_size_x * 1.f/2); Loading @@ -84,11 +35,13 @@ && col_idx >=0 && col_idx < x_end && col_idx < slice_size_x && col_idx >= x_start) { const int pot_i = row_idx-y_start, pot_j = col_idx-x_start; if (pot_i < pot_size_y && pot_j < pot_size_x && pot_i >= 0 && pot_j >= 0){ atomicAdd(&slice[stk_idx][row_idx][col_idx]._M_re, atom_pot_stack[Z][pot_i][pot_j]); } } } } } __syncthreads(); if (col_idx < slice_size_x && row_idx < slice_size_y && stk_idx < num_slices) Loading
setup.py +1 −1 Original line number Diff line number Diff line Loading @@ -9,7 +9,7 @@ setup( author='Numan Laanait', author_email='laanaitn@ornl.gov', description='', install_requires=['scipy', 'pymatgen', 'numpy', 'pycuda', 'scikit-cuda'], install_requires=['scipy', 'pymatgen', 'numpy', 'pycuda==2019.1', 'scikit-cuda', 'mpi4py'], #install_requires=['numpy', 'scipy', 'pymatgen', 'pybtex'], test_suite='tests', python_requires='>=3.6', Loading