Loading namsa/msa.py +43 −13 Original line number Diff line number Diff line Loading @@ -335,13 +335,13 @@ class MSAHybrid(MSA): atexit.register(_clean_up) return ctx # return context in case of manual clean-up @staticmethod def clean_up(ctx): ctx.pop() ctx.detach() ctx = None from pycuda.tools import clear_context_caches clear_context_caches() # @staticmethod # def clean_up(ctx): # ctx.pop() # ctx.detach() # ctx = None # from pycuda.tools import clear_context_caches # clear_context_caches() def plan_simulation(self, num_probes=None): if num_probes is None: Loading Loading @@ -429,6 +429,21 @@ class MSAHybrid(MSA): class MSAGPU(MSAHybrid): @staticmethod def clean_up(ctx=None, vars=None): if vars is not None: for var in vars: if var is not None: var.free() if ctx is not None: ctx.pop() ctx.detach() ctx = None from pycuda.tools import clear_context_caches clear_context_caches() def build_potential_slices(self, slice_thickness): # find number of slices and atomic sites per slice self.slice_t = slice_thickness Loading Loading @@ -465,6 +480,8 @@ class MSAGPU(MSAHybrid): sites_d = cuda.to_device(Zxy_input) self.potential_slices = cuda.aligned_empty((int(self.num_slices), int(self.sampling[0]), int(self.sampling[1])), np.complex64) self.vars = [] self.vars.append(self.potential_slices.base) self.potential_slices = cuda.register_host_memory(self.potential_slices) potential_slices_d = cuda.to_device(self.potential_slices) Loading @@ -487,6 +504,9 @@ class MSAGPU(MSAHybrid): atom_pot_stack_d.free() self.print_verbose('Built %d potential slices with shape:%s pixels' % (self.potential_slices.shape[0], format(self.potential_slices.shape[1:]))) # unregister host memory #self.potential_slices.base.unregister() #self.potential_slices.base.free() def _load_kernels(self): try: Loading Loading @@ -540,12 +560,12 @@ class MSAGPU(MSAHybrid): # allocate memory self.apert = np.empty(self.sampling, dtype=np.float32) self.apert = cuda.register_host_memory(self.apert) #self.apert = cuda.register_host_memory(self.apert) apert_d = cuda.mem_alloc(self.apert.nbytes) self.psi_k = np.empty(self.sampling, dtype=np.complex64) self.psi_k = cuda.register_host_memory(self.psi_k) #self.psi_k = cuda.register_host_memory(self.psi_k) self.psi = np.empty_like(self.psi_k) self.psi = cuda.register_host_memory(self.psi) #self.psi = cuda.register_host_memory(self.psi) psi_k_d = cuda.mem_alloc(self.psi_k.nbytes) psi_x_d = cuda.mem_alloc(self.psi_k.nbytes) Loading Loading @@ -576,6 +596,11 @@ class MSAGPU(MSAHybrid): apert_d.free() cufft.cufftDestroy(fft_plan.handle) # unregister host memory #self.apert.unregister() #self.psi_k.unregister() #self.psi.unregister() def generate_probe_positions(self, probe_step=np.array([0.1, 0.1]), probe_range=np.array([[0., 1.0], [0., 1.0]])): grid_steps_x, grid_steps_y = np.floor(np.diff(probe_range).flatten() * self.dims[:2] / probe_step).astype(np.int) grid_range_x, grid_range_y = [(probe_range[i] - np.ones((2,)) * 0.5) * self.dims[i] Loading Loading @@ -652,10 +677,12 @@ class MSAGPU(MSAHybrid): # allocate memory self.probes = np.empty((num_probes, shape_y, shape_x), dtype=np.complex64) self.propag = np.empty(self.sampling, dtype=np.complex64) self.propag = cuda.aligned_empty((int(self.sampling[0]), int(self.sampling[1])), np.complex64) self.vars.append(self.propag.base) self.propag = cuda.register_host_memory(self.propag) propag_d = cuda.to_device(self.propag) self.mask = np.empty(self.sampling, dtype=np.float32) self.mask = cuda.aligned_empty((int(self.sampling[0]), int(self.sampling[1])), np.float32) self.vars.append(self.mask.base) self.mask = cuda.register_host_memory(self.mask) mask_d = cuda.to_device(self.mask) grid_steps_d = cuda.to_device(self.grid_steps.astype(np.int32)) Loading @@ -667,8 +694,11 @@ class MSAGPU(MSAHybrid): else: # pinned memory is default self.probes = cuda.aligned_empty((int(self.num_probes), int(self.sampling[0]), int(self.sampling[1])), np.complex64) self.vars.append(self.probes.base) self.probes = cuda.register_host_memory(self.probes) ones = cuda.register_host_memory(np.ones(self.sampling, dtype=np.complex64)) ones = cuda.aligned_zeros((int(self.sampling[0]), int(self.sampling[1])), np.complex64) + 1 self.vars.append(ones.base) ones = cuda.register_host_memory(ones) ones_d = cuda.mem_alloc(ones.nbytes) cuda.memcpy_htod_async(ones_d, ones, cuda.Stream()) Loading Loading
namsa/msa.py +43 −13 Original line number Diff line number Diff line Loading @@ -335,13 +335,13 @@ class MSAHybrid(MSA): atexit.register(_clean_up) return ctx # return context in case of manual clean-up @staticmethod def clean_up(ctx): ctx.pop() ctx.detach() ctx = None from pycuda.tools import clear_context_caches clear_context_caches() # @staticmethod # def clean_up(ctx): # ctx.pop() # ctx.detach() # ctx = None # from pycuda.tools import clear_context_caches # clear_context_caches() def plan_simulation(self, num_probes=None): if num_probes is None: Loading Loading @@ -429,6 +429,21 @@ class MSAHybrid(MSA): class MSAGPU(MSAHybrid): @staticmethod def clean_up(ctx=None, vars=None): if vars is not None: for var in vars: if var is not None: var.free() if ctx is not None: ctx.pop() ctx.detach() ctx = None from pycuda.tools import clear_context_caches clear_context_caches() def build_potential_slices(self, slice_thickness): # find number of slices and atomic sites per slice self.slice_t = slice_thickness Loading Loading @@ -465,6 +480,8 @@ class MSAGPU(MSAHybrid): sites_d = cuda.to_device(Zxy_input) self.potential_slices = cuda.aligned_empty((int(self.num_slices), int(self.sampling[0]), int(self.sampling[1])), np.complex64) self.vars = [] self.vars.append(self.potential_slices.base) self.potential_slices = cuda.register_host_memory(self.potential_slices) potential_slices_d = cuda.to_device(self.potential_slices) Loading @@ -487,6 +504,9 @@ class MSAGPU(MSAHybrid): atom_pot_stack_d.free() self.print_verbose('Built %d potential slices with shape:%s pixels' % (self.potential_slices.shape[0], format(self.potential_slices.shape[1:]))) # unregister host memory #self.potential_slices.base.unregister() #self.potential_slices.base.free() def _load_kernels(self): try: Loading Loading @@ -540,12 +560,12 @@ class MSAGPU(MSAHybrid): # allocate memory self.apert = np.empty(self.sampling, dtype=np.float32) self.apert = cuda.register_host_memory(self.apert) #self.apert = cuda.register_host_memory(self.apert) apert_d = cuda.mem_alloc(self.apert.nbytes) self.psi_k = np.empty(self.sampling, dtype=np.complex64) self.psi_k = cuda.register_host_memory(self.psi_k) #self.psi_k = cuda.register_host_memory(self.psi_k) self.psi = np.empty_like(self.psi_k) self.psi = cuda.register_host_memory(self.psi) #self.psi = cuda.register_host_memory(self.psi) psi_k_d = cuda.mem_alloc(self.psi_k.nbytes) psi_x_d = cuda.mem_alloc(self.psi_k.nbytes) Loading Loading @@ -576,6 +596,11 @@ class MSAGPU(MSAHybrid): apert_d.free() cufft.cufftDestroy(fft_plan.handle) # unregister host memory #self.apert.unregister() #self.psi_k.unregister() #self.psi.unregister() def generate_probe_positions(self, probe_step=np.array([0.1, 0.1]), probe_range=np.array([[0., 1.0], [0., 1.0]])): grid_steps_x, grid_steps_y = np.floor(np.diff(probe_range).flatten() * self.dims[:2] / probe_step).astype(np.int) grid_range_x, grid_range_y = [(probe_range[i] - np.ones((2,)) * 0.5) * self.dims[i] Loading Loading @@ -652,10 +677,12 @@ class MSAGPU(MSAHybrid): # allocate memory self.probes = np.empty((num_probes, shape_y, shape_x), dtype=np.complex64) self.propag = np.empty(self.sampling, dtype=np.complex64) self.propag = cuda.aligned_empty((int(self.sampling[0]), int(self.sampling[1])), np.complex64) self.vars.append(self.propag.base) self.propag = cuda.register_host_memory(self.propag) propag_d = cuda.to_device(self.propag) self.mask = np.empty(self.sampling, dtype=np.float32) self.mask = cuda.aligned_empty((int(self.sampling[0]), int(self.sampling[1])), np.float32) self.vars.append(self.mask.base) self.mask = cuda.register_host_memory(self.mask) mask_d = cuda.to_device(self.mask) grid_steps_d = cuda.to_device(self.grid_steps.astype(np.int32)) Loading @@ -667,8 +694,11 @@ class MSAGPU(MSAHybrid): else: # pinned memory is default self.probes = cuda.aligned_empty((int(self.num_probes), int(self.sampling[0]), int(self.sampling[1])), np.complex64) self.vars.append(self.probes.base) self.probes = cuda.register_host_memory(self.probes) ones = cuda.register_host_memory(np.ones(self.sampling, dtype=np.complex64)) ones = cuda.aligned_zeros((int(self.sampling[0]), int(self.sampling[1])), np.complex64) + 1 self.vars.append(ones.base) ones = cuda.register_host_memory(ones) ones_d = cuda.mem_alloc(ones.nbytes) cuda.memcpy_htod_async(ones_d, ones, cuda.Stream()) Loading