Commit 71a67369 authored by Chris Smith

Repack improvement

After calling h5repack, the code compares the repacked file's last modification time with the current time, and only proceeds to delete the old file and move the repacked file into place once at least 1 s has passed since the last modification.
parent d0f6fbf8
1 merge request: !13 Pyspm port suhas
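The idea of the change, as a rough sketch (the helper name wait_until_stable, its parameters, and the timeout guard are illustrative assumptions, not part of the commit): poll os.stat(...).st_mtime and only continue once the repacked temporary file has gone at least one second without being modified.

import os
from time import time, sleep

def wait_until_stable(path, quiet_period=1.0, poll=0.5, timeout=30.0):
    '''Block until `path` has not been modified for `quiet_period` seconds.

    Hypothetical helper mirroring the commit's wait loop; the timeout guard
    is an extra safety net that the original code does not have.
    '''
    deadline = time() + timeout
    # Keep polling while the file was touched within the last quiet_period seconds
    while time() - os.stat(path).st_mtime <= quiet_period:
        if time() > deadline:
            raise RuntimeError('{} is still being modified after {} s'.format(path, timeout))
        sleep(poll)

Only after such a wait does repack() delete the original file and move the repacked temporary file to the original path.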
@@ -8,7 +8,7 @@ Main Class in charge of writing/reading to/from hdf5 file.
import os
import subprocess
import sys
-from time import sleep
+from time import time, sleep
from warnings import warn
import h5py
@@ -57,7 +57,7 @@ class ioHDF5(object):
            return
        self.file = file_handle.file
        self.path = file_handle.filename

    def clear(self):
        '''
        Clear h5.file of all contents
@@ -70,7 +70,7 @@ class ioHDF5(object):
        self.file.clear()
        self.repack()

    def repack(self):
        '''
        Uses the h5repack command to recover cleared space in an hdf5 file.
@@ -84,14 +84,18 @@ class ioHDF5(object):
        Repack the opened hdf5 file into a temporary file
        '''
        try:
-            repack_line = 'h5repack '+self.path+' '+tmpfile
+            repack_line = ' '.join(['h5repack',self.path,tmpfile])
            subprocess.check_output(repack_line,
                                    stderr=subprocess.STDOUT,
                                    shell=True)
-            sleep(2)
+            # Check that the file is done being modified
+            while time()-os.stat(tmpfile).st_mtime <= 1:
+                sleep(0.5)
        except subprocess.CalledProcessError as err:
            print('Could not repack hdf5 file')
            raise Exception(err.output)
+        except:
+            raise

        '''
        Delete the original file and move the temporary file to the originals path
@@ -114,17 +118,17 @@ class ioHDF5(object):
    def close(self):
        '''Close h5.file'''
        self.file.close()

    def delete(self):
        ''' Delete h5.file'''
        self.close()
        os.remove(self.path)

    def flush(self):
        '''Flush data from memory and commit to file.
        Use this after manually inserting data into the hdf dataset'''
        self.file.flush()

    def writeData(self, data, print_log=False):
        '''
        Writes data into the hdf5 file and assigns data attributes such as region references.
@@ -140,18 +144,18 @@ class ioHDF5(object):
        refList : List of HDF5dataset or HDF5Datagroup references
            References to the objects written
        '''
        f = self.file
        f.attrs['PySPM version']=version

        # Checking if the data is an MicroDataGroup object
        if not isinstance(data, MicroDataGroup):
            warn('Input of type: {} \n'.format(type(data)))
            sys.exit("Input not of type MicroDataGroup.\n We're done here! \n")

        # Figuring out if the first item in AFMData tree is file or group
        if data.name is '' and data.parent is '/':
            # For file we just write the attributes
            for key in data.attrs.iterkeys():
                f.attrs[key] = data.attrs[key]
@@ -184,12 +188,12 @@ class ioHDF5(object):
                g.attrs[key] = data.attrs[key]
            if print_log: print('Wrote attributes to group: {} \n'.format(data.name))
            root = g.name

        # Populating the tree structure recursively
        refList = []

        # Recursive function
        def __populate(child, parent):
            if isinstance(child, MicroDataGroup):
                if child.indexed:
                    previous = np.where([child.name in key for key in f[parent].keys()])[0]
@@ -198,7 +202,7 @@ class ioHDF5(object):
                    else:
                        last = f[parent].keys()[previous[-1]]
                        index = int(last.split('_')[-1])+1
                    child.name+='{:03d}'.format(index)
                try:
                    itm = f[parent].create_group(child.name)
                    if print_log: print('Created Group {}'.format(itm.name))
@@ -219,10 +223,10 @@ class ioHDF5(object):
                    # finite sized dataset and maxshape is not provided
                    # Typically for small / ancilliary datasets
                    try:
                        itm = f[parent].create_dataset(child.name,
                                                       data = child.data,
                                                       compression = child.compression,
                                                       dtype = child.data.dtype,
                                                       chunks= child.chunking)
                    except RuntimeError:
                        itm = f[parent][child.name]
@@ -234,23 +238,23 @@ class ioHDF5(object):
                    # Here, we only allocate the space. The provided data is ignored
                    # print child.name
                    try:
                        itm = f[parent].create_dataset(child.name, child.maxshape,
                                                       compression = child.compression,
                                                       dtype = child.dtype,
                                                       chunks= child.chunking)
                    except RuntimeError:
                        itm = f[parent][child.name]
                        warn('Found Dataset already exists {}'.format(itm.name))
                    except:
                        raise
                else:
                    # Resizable but the written files are significantly larger
                    max_shape = tuple([ None for i in range(len(child.data.shape))])
                    try:
                        itm = f[parent].create_dataset(child.name,
                                                       data = child.data,
                                                       compression = child.compression,
                                                       dtype = child.data.dtype,
                                                       chunks= child.chunking,
                                                       maxshape = max_shape)
                    except RuntimeError:
@@ -258,54 +262,54 @@ class ioHDF5(object):
                        warn('Found Dataset already exists {}'.format(itm.name))
                    except:
                        raise

                if print_log: print('Created Dataset {}'.format(itm.name))

                for key in child.attrs.iterkeys():
                    # print('Found some region references')
                    # writing region reference
                    if key is 'labels':
                        # print('Found some region references')
                        labels = child.attrs[key]# labels here is a dictionary
                        self.regionRefs(itm, labels, print_log=print_log)
                        '''
                        Now make an attribute called 'labels' that is a list of strings
                        First ascertain the dimension of the slicing:
                        '''
                        found_dim = False
                        for dimen, slobj in enumerate(labels[labels.keys()[0]]):
                            # We make the assumption that checking the start is sufficient
                            if slobj.start != None:
                                found_dim = True
                                break
                        if found_dim:
                            headers = [None]*len(labels) # The list that will hold all the names
                            for col_name in labels.keys():
                                headers[labels[col_name][dimen].start] = col_name
                            # Now write the list of col / row names as an attribute:
                            itm.attrs[key] = headers
                        else:
                            warn('Unable to write region labels for %s' %(itm.name.split('/')[-1]))

                        if print_log: print('Wrote Region References of Dataset %s' %(itm.name.split('/')[-1]))
                    else:
                        itm.attrs[key] = child.attrs[key]
                        if print_log: print('Wrote Attributes of Dataset %s \n' %(itm.name.split('/')[-1]))

                # Make a dictionary of references
                refList.append(itm)

                return refList

        # Recursive function is called at each stage beginning at the root
        for child in data.children:
            __populate(child, root)

        if print_log:
            print('Finished writing to h5 file.\n'+
                  'Right now you got yourself a fancy folder structure. \n'+
                  'Make sure you do some reference linking to take advantage of the full power of HDF5.')

        return refList

    def regionRefs(self, dataset, slices, print_log=False):
        '''
        Creates attributes of a h5.Dataset that refer to regions in the arrays
@@ -321,4 +325,4 @@ class ioHDF5(object):
        for sl in slices.iterkeys():
            if print_log: print('Wrote Region Reference:%s to Dataset %s' %(sl, dataset.name))
            dataset.attrs[sl] = dataset.regionref[slices[sl]]