# -*- coding: utf-8 -*-
"""
Created on Wed Oct 21 12:29:33 2015
Main class in charge of writing/reading to/from an HDF5 file.
@author: Numan Laanait, Suhas Somnath, Chris Smith
"""

from __future__ import print_function

import os
import subprocess
from time import time, sleep
from warnings import warn

import h5py
import numpy as np

from .microdata import MicroDataGroup
from ..__version__ import version


class ioHDF5(object):

    def __init__(self, file_handle, cachemult=1):
        """
        Handles:
            + I/O operations on an HDF5 file.
            + Utilities to get data and associated auxiliary datasets.

        Parameters
        ----------
        file_handle : String / Unicode or h5py.File object
            Absolute path to the h5 file or an open hdf5 file handle
        cachemult : unsigned int (Optional. default = 1)
            Multiplier for the size of the HDF5 raw data chunk cache
        """
        if isinstance(file_handle, (str, unicode)):
            # file handle is actually a file path
            propfaid = h5py.h5p.create(h5py.h5p.FILE_ACCESS)
            if cachemult != 1:
                settings = list(propfaid.get_cache())
                settings[2] *= cachemult
                propfaid.set_cache(*settings)
            try:
                fid = h5py.h5f.open(file_handle, fapl=propfaid)
                self.file = h5py.File(fid, mode='r+')
            except IOError:
                # Unable to open the file; make a new one instead
                fid = h5py.h5f.create(file_handle, fapl=propfaid)
                self.file = h5py.File(fid, mode='w')
            self.path = file_handle
        elif isinstance(file_handle, h5py.File):
            # file handle is actually an open hdf file
            if file_handle.mode == 'r':
                warn('ioHDF5 cannot work with open HDF5 files in read mode. Change to r+ or w')
                return
            self.file = file_handle.file
            self.path = file_handle.filename
        else:
            raise TypeError('file_handle should be a path to an HDF5 file or an open h5py.File object')
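
    # A minimal usage sketch (hedged: 'measurement.h5' is a hypothetical path):
    #
    #     io = ioHDF5('measurement.h5', cachemult=4)  # opens the file, creating it if absent
    #     ...read / write operations...
    #     io.close()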

    def clear(self):
        """
        Clear the h5 file of all contents.

        file.clear() only removes the contents; it does not free up previously allocated space.
        To do so, it is necessary to run the h5repack command after clearing.
        Because the file must be closed and reopened, it is best to call this
        function immediately after the creation of the ioHDF5 object.
        """
        self.file.clear()
        self.repack()
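
    # A minimal usage sketch (hedged: 'old_data.h5' is a hypothetical path):
    #
    #     io = ioHDF5('old_data.h5')
    #     io.clear()  # wipes the contents and repacks the file to reclaim the space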

    def repack(self):
        """
        Uses the h5repack command to recover cleared space in an hdf5 file.
        h5repack can also be used to change chunking and compression, but these options have
        not yet been implemented here.
        """
        self.close()
        tmpfile = self.path+'.tmp'

        '''
        Repack the opened hdf5 file into a temporary file
        '''
        try:
            repack_line = ' '.join(['h5repack', '"'+self.path+'"', '"'+tmpfile+'"'])
            subprocess.check_output(repack_line,
                                    stderr=subprocess.STDOUT,
                                    shell=True)
            # Check that the file is done being modified
            sleep(0.5)
            while time()-os.stat(tmpfile).st_mtime <= 1:
                sleep(0.5)
        except subprocess.CalledProcessError as err:
            print('Could not repack hdf5 file')
            raise Exception(err.output)

        '''
        Delete the original file and move the temporary file to the original's path
        '''
# TODO Find way to get the real OS error that works in and out of Spyder
        try:
            os.remove(self.path)
            os.rename(tmpfile, self.path)
        except:
            print('Could not move the repacked file to the original path.')
            print('The original file is located {}'.format(self.path))
            print('The repacked file is located {}'.format(tmpfile))
            raise

        '''
        Open the repacked file
        '''
        self.file = h5py.File(self.path, mode='r+')

    def close(self):
        '''Close h5.file'''
        self.file.close()

    def delete(self):
        ''' Delete h5.file'''
        self.close()
        os.remove(self.path)

    def flush(self):
        '''Flush data from memory and commit to file. 
        Use this after manually inserting data into the hdf dataset'''
        self.file.flush()
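
    # A minimal usage sketch (hedged: h5_dset and new_row are hypothetical):
    #
    #     h5_dset[0, :] = new_row  # manually write into an existing dataset
    #     io.flush()               # commit the change to the file on disk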

    def writeData(self, data, print_log=False):
        '''
        Writes data into the hdf5 file and assigns data attributes such as region references.
        The tree structure is inferred from the MicroDataGroup object.

        Parameters
        ----------
        data : Instance of MicroDataGroup
            Tree structure describing the organization of the data

        Returns
        -------
        refList : List of h5py.Dataset and h5py.Group references
            References to the objects written
        '''

        f = self.file

        f.attrs['PySPM version'] = version

        # Checking if the data is a MicroDataGroup object
        if not isinstance(data, MicroDataGroup):
            warn('Input of type: {} \n'.format(type(data)))
            raise TypeError("Input not of type MicroDataGroup. We're done here!")

        # Figuring out if the first item in the MicroData tree is a file or a group
        if data.name == '' and data.parent == '/':
            # For file we just write the attributes
            for key, val in data.attrs.iteritems():
                f.attrs[key] = val
            if print_log: print('Wrote attributes of file {} \n'.format(f.name))
            root = f.name
        else:
            # For a group we write it and its attributes
            if data.indexed:
                ''' If the name of the requested group ends in a '_', the user expects
                the suffix index to be appended automatically. Here, we check to
                ensure that the chosen index is new.
                '''
                previous = np.where([data.name in key for key in f[data.parent].keys()])[0]
                if len(previous)==0:
                    index = 0
                else:
                    # assuming that the last element of previous contains the highest index
                    last = f[data.parent].keys()[previous[-1]]
                    index = int(last.split('_')[-1])+1
                data.name += '{:03d}'.format(index)
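                # e.g. a requested name 'Cycle_' becomes 'Cycle_000' on the first
                # write and 'Cycle_001' on the next (the group name is hypothetical)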
            try:
                g = f[data.parent].create_group(data.name)
                if print_log: print('Created group {}'.format(g.name))
            except ValueError:
                g = f[data.parent][data.name]
                if print_log: print('Group already exists: {}'.format(g.name))
            except:
                f.flush()
                f.close()
                raise
            for key, val in data.attrs.iteritems():
                if val is None:
                    continue
                g.attrs[key] = val
            if print_log: print('Wrote attributes to group: {} \n'.format(data.name))
            root = g.name

        # Populating the tree structure recursively
        refList = []
        # Recursive function
        def __populate(child, parent):

            if isinstance(child, MicroDataGroup):
                if child.indexed:
                    previous = np.where([child.name in key for key in f[parent].keys()])[0]
                    if len(previous)==0:
                        index = 0
                    else:
                        last = f[parent].keys()[previous[-1]]
                        index = int(last.split('_')[-1])+1
                    child.name += '{:03d}'.format(index)
                try:
                    itm = f[parent].create_group(child.name)
                    if print_log: print('Created Group {}'.format(itm.name))
                except ValueError:
                    itm = f[parent][child.name]
                    warn('Found Group already exists {}'.format(itm.name))
                except:
                    f.flush()
                    f.close()
                    raise
                for key, val in child.attrs.iteritems():
                    if val is None:
                        continue
                    itm.attrs[key] = val
                if print_log: print('Wrote attributes to group {}\n'.format(itm.name))
                # here we do the recursive function call
                for ch in child.children:
                    __populate(ch, parent+'/'+child.name)
            else:
                if not child.resizable:
                    if not bool(child.maxshape):
                        # finite sized dataset and maxshape is not provided
                        # Typically for small / ancillary datasets
                        try:
                            itm = f[parent].create_dataset(child.name,
                                                           data=child.data,
                                                           compression=child.compression,
                                                           dtype=child.data.dtype,
                                                           chunks=child.chunking)
                        except RuntimeError:
                            itm = f[parent][child.name]
                            warn('Found Dataset already exists {}'.format(itm.name))
                        except:
                            f.flush()
                            f.close()
                            raise
                    else:
                        # In many cases, we DON'T need resizable datasets but we know the max-size
                        # Here, we only allocate the space. The provided data is ignored
                        try:
                            itm = f[parent].create_dataset(child.name, child.maxshape,
                                                           compression=child.compression,
                                                           dtype=child.dtype,
                                                           chunks=child.chunking)
                        except RuntimeError:
                            itm = f[parent][child.name]
                            warn('Found Dataset already exists {}'.format(itm.name))
                        except:
                            f.flush()
                            f.close()
                            raise
                else:
                    # Resizable, but the written files are significantly larger
                    max_shape = tuple([None for i in range(len(child.data.shape))])
                    try:
                        itm = f[parent].create_dataset(child.name,
                                                       data=child.data,
                                                       compression=child.compression,
                                                       dtype=child.data.dtype,
                                                       chunks=child.chunking,
                                                       maxshape=max_shape)
                    except RuntimeError:
                        itm = f[parent][child.name]
                        warn('Found Dataset already exists {}'.format(itm.name))
                    except:
                        f.flush()
                        f.close()
                        raise

                if print_log: print('Created Dataset {}'.format(itm.name))
                for key in child.attrs.iterkeys():
                    # writing region references
                    if key == 'labels':
                        labels = child.attrs[key]  # labels here is a dictionary
                        self.write_region_references(itm, labels, print_log=print_log)
                        '''
                        Now make an attribute called 'labels' that is a list of strings.
                        First ascertain the dimension of the slicing:
                        '''
                        found_dim = False
                        for dimen, slobj in enumerate(labels[labels.keys()[0]]):
                            # We make the assumption that checking the start is sufficient 
                            if slobj.start is not None:
                                found_dim = True
                                break
                        if found_dim:
                            headers = [None]*len(labels) # The list that will hold all the names
                            for col_name in labels.keys():
                                headers[labels[col_name][dimen].start] = col_name
                            # Now write the list of col / row names as an attribute:
                            itm.attrs[key] = headers
                        else:
                            warn('Unable to write region labels for %s' %(itm.name.split('/')[-1]))

                        if print_log: print('Wrote Region References of Dataset %s' %(itm.name.split('/')[-1]))
                    else:
                        itm.attrs[key] = child.attrs[key]
                        if print_log: print('Wrote Attributes of Dataset %s \n' %(itm.name.split('/')[-1]))
            # Keep a reference to the object that was just written
            refList.append(itm)
            return refList

        # Recursive function is called at each stage beginning at the root
        for child in data.children:
            __populate(child, root)

        if print_log:
            print('Finished writing to h5 file.\n'+
                  'Right now you got yourself a fancy folder structure. \n'+
                  'Make sure you do some reference linking to take advantage of the full power of HDF5.')
        return refList
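
    # A minimal usage sketch (hedged: assumes a MicroDataGroup(name, parent)
    # constructor and the attrs attribute used above; the file and group
    # names are hypothetical):
    #
    #     grp = MicroDataGroup('Measurement_', parent='/')  # trailing '_' requests auto-indexing
    #     grp.attrs['temperature_K'] = 300
    #     io = ioHDF5('example.h5')
    #     refs = io.writeData(grp, print_log=True)  # references to the objects written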

    @staticmethod
    def write_region_references(dataset, slices, print_log=False):
        '''
        Creates attributes of an h5.Dataset that refer to regions in the arrays
        
        Parameters
        ----------
        dataset : h5.Dataset instance
            Dataset to which region references will be added as attributes
        slices : dictionary
            The slicing information must be formatted using tuples of slice objects. 
            For example {'region_1':(slice(None, None), slice (0,1))}
        print_log : Boolean (Optional. Default = False)
            Whether or not to print status messages
        '''
        if print_log: print('Starting to write Region References to Dataset', dataset.name, 'of shape:', dataset.shape)
        for sl in slices.iterkeys():
            if print_log: print('About to write region reference:', sl, ':', slices[sl])
            if len(slices[sl]) == len(dataset.shape):
                dataset.attrs[sl] = dataset.regionref[slices[sl]]
                if print_log: print('Wrote Region Reference:%s' % sl)
            else:
                warn('Region reference %s could not be written since the object size was not equal to the dimensions of'
                     ' the dataset' % sl)
                raise ValueError('Region reference %s does not match the dimensions of the dataset' % sl)
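
    # A minimal usage sketch (hedged: h5_dset is a hypothetical 2D h5py.Dataset):
    #
    #     slices = {'even_rows': (slice(None, None, 2), slice(None)),
    #               'first_col': (slice(None), slice(0, 1))}
    #     ioHDF5.write_region_references(h5_dset, slices, print_log=True)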