From 71a673696a238aa9b277b4860f0ab99c6c189576 Mon Sep 17 00:00:00 2001
From: Chris Smith <csmith55@utk.edu>
Date: Thu, 29 Sep 2016 09:30:55 -0400
Subject: [PATCH] Repack improvement

After calling repack, the code checks the repacked file's last
modification time against the current time and only proceeds to delete
the old file and move the repacked file into place once more than 1s has
passed since the repacked file was last modified.
---
 pycroscopy/io/io_hdf5.py | 102 ++++++++++++++++++++-------------------
 1 file changed, 53 insertions(+), 49 deletions(-)

diff --git a/pycroscopy/io/io_hdf5.py b/pycroscopy/io/io_hdf5.py
index 35ca1ee1..5e8548e4 100644
--- a/pycroscopy/io/io_hdf5.py
+++ b/pycroscopy/io/io_hdf5.py
@@ -8,7 +8,7 @@ Main Class in charge of writing/reading to/from hdf5 file.
 import os
 import subprocess
 import sys
-from time import sleep
+from time import time, sleep
 from warnings import warn
 
 import h5py
@@ -57,7 +57,7 @@ class ioHDF5(object):
             return
         self.file = file_handle.file
         self.path = file_handle.filename
-        
+
     def clear(self):
         '''
         Clear h5.file of all contents
@@ -70,7 +70,7 @@ class ioHDF5(object):
 
         self.file.clear()
         self.repack()
-        
+
     def repack(self):
         '''
         Uses the h5repack command to recover cleared space in an hdf5 file.
@@ -84,14 +84,18 @@
         Repack the opened hdf5 file into a temporary file
         '''
         try:
-            repack_line = 'h5repack '+self.path+' '+tmpfile
+            repack_line = ' '.join(['h5repack',self.path,tmpfile])
             subprocess.check_output(repack_line,
                                     stderr=subprocess.STDOUT,
                                     shell=True)
-            sleep(2)
+            # Check that the file is done being modified
+            while time()-os.stat(tmpfile).st_mtime <= 1:
+                sleep(0.5)
         except subprocess.CalledProcessError as err:
             print('Could not repack hdf5 file')
             raise Exception(err.output)
+        except:
+            raise
 
         '''
         Delete the original file and move the temporary file to the originals path
@@ -114,17 +118,17 @@ class ioHDF5(object):
     def close(self):
         '''Close h5.file'''
         self.file.close()
-        
+
     def delete(self):
         ''' Delete h5.file'''
         self.close()
         os.remove(self.path)
-    
+
     def flush(self):
         '''Flush data from memory and commit to file.
         Use this after manually inserting data into the hdf dataset'''
         self.file.flush()
-    
+
     def writeData(self, data, print_log=False):
         '''
         Writes data into the hdf5 file and assigns data attributes such as region references.
@@ -140,18 +144,18 @@ class ioHDF5(object):
         refList : List of HDF5dataset or HDF5Datagroup references
             References to the objects written
         '''
-        
+
         f = self.file
-        
+
         f.attrs['PySPM version']=version
-        
+
         # Checking if the data is an MicroDataGroup object
-        if not isinstance(data, MicroDataGroup): 
+        if not isinstance(data, MicroDataGroup):
             warn('Input of type: {} \n'.format(type(data)))
             sys.exit("Input not of type MicroDataGroup.\n We're done here! \n")
\n") - + # Figuring out if the first item in AFMData tree is file or group - if data.name is '' and data.parent is '/': + if data.name is '' and data.parent is '/': # For file we just write the attributes for key in data.attrs.iterkeys(): f.attrs[key] = data.attrs[key] @@ -184,12 +188,12 @@ class ioHDF5(object): g.attrs[key] = data.attrs[key] if print_log: print('Wrote attributes to group: {} \n'.format(data.name)) root = g.name - + # Populating the tree structure recursively refList = [] # Recursive function def __populate(child, parent): - + if isinstance(child, MicroDataGroup): if child.indexed: previous = np.where([child.name in key for key in f[parent].keys()])[0] @@ -198,7 +202,7 @@ class ioHDF5(object): else: last = f[parent].keys()[previous[-1]] index = int(last.split('_')[-1])+1 - child.name+='{:03d}'.format(index) + child.name+='{:03d}'.format(index) try: itm = f[parent].create_group(child.name) if print_log: print('Created Group {}'.format(itm.name)) @@ -219,10 +223,10 @@ class ioHDF5(object): # finite sized dataset and maxshape is not provided # Typically for small / ancilliary datasets try: - itm = f[parent].create_dataset(child.name, - data = child.data, + itm = f[parent].create_dataset(child.name, + data = child.data, compression = child.compression, - dtype = child.data.dtype, + dtype = child.data.dtype, chunks= child.chunking) except RuntimeError: itm = f[parent][child.name] @@ -234,23 +238,23 @@ class ioHDF5(object): # Here, we only allocate the space. The provided data is ignored # print child.name try: - itm = f[parent].create_dataset(child.name, child.maxshape, + itm = f[parent].create_dataset(child.name, child.maxshape, compression = child.compression, - dtype = child.dtype, + dtype = child.dtype, chunks= child.chunking) except RuntimeError: itm = f[parent][child.name] warn('Found Dataset already exists {}'.format(itm.name)) except: - raise - else: + raise + else: # Resizable but the written files are significantly larger max_shape = tuple([ None for i in range(len(child.data.shape))]) try: - itm = f[parent].create_dataset(child.name, - data = child.data, + itm = f[parent].create_dataset(child.name, + data = child.data, compression = child.compression, - dtype = child.data.dtype, + dtype = child.data.dtype, chunks= child.chunking, maxshape = max_shape) except RuntimeError: @@ -258,54 +262,54 @@ class ioHDF5(object): warn('Found Dataset already exists {}'.format(itm.name)) except: raise - + if print_log: print('Created Dataset {}'.format(itm.name)) for key in child.attrs.iterkeys(): # print('Found some region references') # writing region reference if key is 'labels': # print('Found some region references') - labels = child.attrs[key]# labels here is a dictionary + labels = child.attrs[key]# labels here is a dictionary self.regionRefs(itm, labels, print_log=print_log) ''' Now make an attribute called 'labels' that is a list of strings First ascertain the dimension of the slicing: - ''' + ''' found_dim = False - for dimen, slobj in enumerate(labels[labels.keys()[0]]): + for dimen, slobj in enumerate(labels[labels.keys()[0]]): # We make the assumption that checking the start is sufficient - if slobj.start != None: - found_dim = True + if slobj.start != None: + found_dim = True break - if found_dim: - headers = [None]*len(labels) # The list that will hold all the names - for col_name in labels.keys(): - headers[labels[col_name][dimen].start] = col_name - # Now write the list of col / row names as an attribute: - itm.attrs[key] = headers - else: - warn('Unable to write 
+                        if found_dim:
+                            headers = [None]*len(labels) # The list that will hold all the names
+                            for col_name in labels.keys():
+                                headers[labels[col_name][dimen].start] = col_name
+                            # Now write the list of col / row names as an attribute:
+                            itm.attrs[key] = headers
+                        else:
+                            warn('Unable to write region labels for %s' %(itm.name.split('/')[-1]))
                         if print_log: print('Wrote Region References of Dataset %s' %(itm.name.split('/')[-1]))
-                    else:        
+                    else:
                         itm.attrs[key] = child.attrs[key]
                         if print_log: print('Wrote Attributes of Dataset %s \n' %(itm.name.split('/')[-1]))
                         # Make a dictionary of references
 
             refList.append(itm)
             return refList
-            
+
         # Recursive function is called at each stage beginning at the root
         for child in data.children:
             __populate(child, root)
-        
-        if print_log: 
+
+        if print_log:
             print('Finished writing to h5 file.\n'+
                   'Right now you got yourself a fancy folder structure. \n'+
                   'Make sure you do some reference linking to take advantage of the full power of HDF5.')
         return refList
-    
-     
-        
+
+
+
     def regionRefs(self, dataset, slices, print_log=False):
         '''
         Creates attributes of a h5.Dataset that refer to regions in the arrays
@@ -321,4 +325,4 @@ class ioHDF5(object):
         for sl in slices.iterkeys():
             if print_log: print('Wrote Region Reference:%s to Dataset %s' %(sl, dataset.name))
             dataset.attrs[sl] = dataset.regionref[slices[sl]]
-            
+
-- 
GitLab
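
Note: the change to repack() above boils down to "do not delete the original
file or move the repacked file until its mtime has stopped advancing". A
minimal standalone sketch of that pattern follows; the helper name
wait_until_stable and the quiet_period/poll/timeout parameters are
illustrative and not part of this patch:

    import os
    from time import time, sleep

    def wait_until_stable(path, quiet_period=1.0, poll=0.5, timeout=30.0):
        """Block until `path` has gone `quiet_period` seconds without modification.

        Same idea as the mtime check added to ioHDF5.repack(): poll the file's
        modification time and only continue once it has stopped changing. The
        timeout guards against waiting forever (e.g. on slow network filesystems).
        """
        deadline = time() + timeout
        while time() - os.stat(path).st_mtime <= quiet_period:
            if time() > deadline:
                raise RuntimeError('%s still being modified after %.1f s' % (path, timeout))
            sleep(poll)

    # Hypothetical usage, mirroring the repack flow:
    # wait_until_stable(tmpfile)
    # os.remove(original_path)
    # os.rename(tmpfile, original_path)

Since subprocess.check_output() already blocks until h5repack exits, the wait
mainly protects against the repacked file still being flushed to disk when the
delete/move happens.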