Commit 71a67369 authored by Chris Smith

Repack improvement

After calling repack, the code checks the temporary file's last modification time against the current time and only proceeds to delete the old file and move the repacked file into place once at least 1 second has passed.
parent d0f6fbf8
1 merge request: !13 Pyspm port suhas
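In practice the change amounts to polling the repacked file's modification time and only swapping it into place once the file has gone untouched for a second. A minimal, self-contained sketch of that wait-and-replace pattern (the wait_until_stable helper and the commented usage below are illustrative assumptions, not the exact code in this commit):

import os
from time import time, sleep

def wait_until_stable(path, quiet_period=1.0, poll=0.5):
    """Block until `path` has not been modified for `quiet_period` seconds."""
    while time() - os.stat(path).st_mtime <= quiet_period:
        sleep(poll)

# Illustrative usage (hypothetical paths), mirroring the diff below:
# repack into a temporary file, wait for h5repack to finish writing it,
# then delete the original and move the repacked file into its place.
#
#   subprocess.check_output(' '.join(['h5repack', h5_path, tmp_path]),
#                           stderr=subprocess.STDOUT, shell=True)
#   wait_until_stable(tmp_path)
#   os.remove(h5_path)
#   os.rename(tmp_path, h5_path)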
@@ -8,7 +8,7 @@ Main Class in charge of writing/reading to/from hdf5 file.
import os
import subprocess
import sys
-from time import sleep
+from time import time, sleep
from warnings import warn
import h5py
@@ -57,7 +57,7 @@ class ioHDF5(object):
return
self.file = file_handle.file
self.path = file_handle.filename
def clear(self):
'''
Clear h5.file of all contents
@@ -70,7 +70,7 @@ class ioHDF5(object):
self.file.clear()
self.repack()
def repack(self):
'''
Uses the h5repack command to recover cleared space in an hdf5 file.
@@ -84,14 +84,18 @@ class ioHDF5(object):
Repack the opened hdf5 file into a temporary file
'''
try:
-repack_line = 'h5repack '+self.path+' '+tmpfile
+repack_line = ' '.join(['h5repack',self.path,tmpfile])
subprocess.check_output(repack_line,
stderr=subprocess.STDOUT,
shell=True)
-sleep(2)
+# Check that the file is done being modified
+while time()-os.stat(tmpfile).st_mtime <= 1:
+    sleep(0.5)
except subprocess.CalledProcessError as err:
print('Could not repack hdf5 file')
raise Exception(err.output)
except:
raise
'''
Delete the original file and move the temporary file to the originals path
@@ -114,17 +118,17 @@ class ioHDF5(object):
def close(self):
'''Close h5.file'''
self.file.close()
def delete(self):
''' Delete h5.file'''
self.close()
os.remove(self.path)
def flush(self):
'''Flush data from memory and commit to file.
Use this after manually inserting data into the hdf dataset'''
self.file.flush()
def writeData(self, data, print_log=False):
'''
Writes data into the hdf5 file and assigns data attributes such as region references.
@@ -140,18 +144,18 @@ class ioHDF5(object):
refList : List of HDF5dataset or HDF5Datagroup references
References to the objects written
'''
f = self.file
f.attrs['PySPM version']=version
# Checking if the data is an MicroDataGroup object
if not isinstance(data, MicroDataGroup):
warn('Input of type: {} \n'.format(type(data)))
sys.exit("Input not of type MicroDataGroup.\n We're done here! \n")
# Figuring out if the first item in AFMData tree is file or group
if data.name is '' and data.parent is '/':
# For file we just write the attributes
for key in data.attrs.iterkeys():
f.attrs[key] = data.attrs[key]
@@ -184,12 +188,12 @@ class ioHDF5(object):
g.attrs[key] = data.attrs[key]
if print_log: print('Wrote attributes to group: {} \n'.format(data.name))
root = g.name
# Populating the tree structure recursively
refList = []
# Recursive function
def __populate(child, parent):
if isinstance(child, MicroDataGroup):
if child.indexed:
previous = np.where([child.name in key for key in f[parent].keys()])[0]
@@ -198,7 +202,7 @@ class ioHDF5(object):
else:
last = f[parent].keys()[previous[-1]]
index = int(last.split('_')[-1])+1
child.name+='{:03d}'.format(index)
try:
itm = f[parent].create_group(child.name)
if print_log: print('Created Group {}'.format(itm.name))
@@ -219,10 +223,10 @@ class ioHDF5(object):
# finite sized dataset and maxshape is not provided
# Typically for small / ancilliary datasets
try:
itm = f[parent].create_dataset(child.name,
data = child.data,
compression = child.compression,
dtype = child.data.dtype,
chunks= child.chunking)
except RuntimeError:
itm = f[parent][child.name]
@@ -234,23 +238,23 @@ class ioHDF5(object):
# Here, we only allocate the space. The provided data is ignored
# print child.name
try:
itm = f[parent].create_dataset(child.name, child.maxshape,
compression = child.compression,
dtype = child.dtype,
chunks= child.chunking)
except RuntimeError:
itm = f[parent][child.name]
warn('Found Dataset already exists {}'.format(itm.name))
except:
raise
else:
# Resizable but the written files are significantly larger
max_shape = tuple([ None for i in range(len(child.data.shape))])
try:
itm = f[parent].create_dataset(child.name,
data = child.data,
compression = child.compression,
dtype = child.data.dtype,
chunks= child.chunking,
maxshape = max_shape)
except RuntimeError:
@@ -258,54 +262,54 @@ class ioHDF5(object):
warn('Found Dataset already exists {}'.format(itm.name))
except:
raise
if print_log: print('Created Dataset {}'.format(itm.name))
for key in child.attrs.iterkeys():
# print('Found some region references')
# writing region reference
if key is 'labels':
# print('Found some region references')
labels = child.attrs[key]# labels here is a dictionary
self.regionRefs(itm, labels, print_log=print_log)
'''
Now make an attribute called 'labels' that is a list of strings
First ascertain the dimension of the slicing:
'''
found_dim = False
for dimen, slobj in enumerate(labels[labels.keys()[0]]):
# We make the assumption that checking the start is sufficient
if slobj.start != None:
found_dim = True
break
if found_dim:
headers = [None]*len(labels) # The list that will hold all the names
for col_name in labels.keys():
headers[labels[col_name][dimen].start] = col_name
# Now write the list of col / row names as an attribute:
itm.attrs[key] = headers
else:
warn('Unable to write region labels for %s' %(itm.name.split('/')[-1]))
if print_log: print('Wrote Region References of Dataset %s' %(itm.name.split('/')[-1]))
else:
itm.attrs[key] = child.attrs[key]
if print_log: print('Wrote Attributes of Dataset %s \n' %(itm.name.split('/')[-1]))
# Make a dictionary of references
refList.append(itm)
return refList
# Recursive function is called at each stage beginning at the root
for child in data.children:
__populate(child, root)
if print_log:
print('Finished writing to h5 file.\n'+
'Right now you got yourself a fancy folder structure. \n'+
'Make sure you do some reference linking to take advantage of the full power of HDF5.')
return refList
def regionRefs(self, dataset, slices, print_log=False):
'''
Creates attributes of a h5.Dataset that refer to regions in the arrays
@@ -321,4 +325,4 @@ class ioHDF5(object):
for sl in slices.iterkeys():
if print_log: print('Wrote Region Reference:%s to Dataset %s' %(sl, dataset.name))
dataset.attrs[sl] = dataset.regionref[slices[sl]]