Commit 99556b27 authored by syz

SVD reorganized as a process

parent 2b925789
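This commit replaces the standalone doSVD() function with an SVD class built on the new Process base class: the constructor validates the main dataset, determines the dtype transform and the number of components, and checks for duplicate results, while compute() runs the randomized SVD and writes U, S, and V to the file. A minimal usage sketch follows; the import path, file name, and dataset path are illustrative assumptions and are not part of this diff.

# Minimal usage sketch of the reorganized SVD process.
# The import path and the HDF5 file / dataset names below are assumptions
# for illustration only; the constructor and compute() signatures come from this diff.
import h5py
from pycroscopy.processing.svd_utils import SVD  # hypothetical import path

h5_file = h5py.File('example_data.h5', mode='r+')            # assumed file name
h5_main = h5_file['Measurement_000/Channel_000/Raw_Data']    # assumed main dataset path

svd_proc = SVD(h5_main, num_components=64)  # checks dtype, caps num_components, checks for duplicates
h5_svd_group = svd_proc.compute()           # runs randomized_svd and writes results to the file

print(h5_svd_group['U'], h5_svd_group['S'], h5_svd_group['V'])
h5_file.close()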
@@ -13,159 +13,169 @@ import numpy as np
from sklearn.utils import gen_batches
from sklearn.utils.extmath import randomized_svd
from ..io.hdf_utils import getH5DsetRefs, checkAndLinkAncillary, findH5group, create_empty_dataset, \
from .process import Process
from ..io.hdf_utils import getH5DsetRefs, checkAndLinkAncillary, findH5group, \
getH5RegRefIndices, createRefFromIndices, checkIfMain, calc_chunks, copy_main_attributes, copyAttributes
from ..io.io_hdf5 import ioHDF5
from ..io.io_utils import check_dtype, transformToTargetType, getAvailableMem
from ..io.microdata import MicroDataset, MicroDataGroup
def doSVD(h5_main, num_comps=None):
"""
Performs SVD on the provided dataset and writes the results. The file is not closed.
Parameters
----------
h5_main : h5py.Dataset reference
Reference to the dataset on which SVD will be performed
num_comps : Unsigned integer (Optional)
Number of principal components of interest
Returns
-------
h5_pca : h5py.Datagroup reference
Reference to the group containing the PCA results
"""
if not checkIfMain(h5_main):
warn('Dataset does not meet requirements for performing PCA.')
return
dset_name = h5_main.name.split('/')[-1]
t1 = time.time()
class SVD(Process):
'''
Calculate the size of the main data in memory and compare to max_mem
We use the minimum of the actual dtype's itemsize and float32 since we
don't want to read it in yet and do the proper type conversions.
'''
func, is_complex, is_compound, n_features, n_samples, type_mult = check_dtype(h5_main)
if num_comps is None:
num_comps = min(n_samples, n_features)
else:
num_comps = min(n_samples, n_features, num_comps)
def __init__(self, h5_main, num_components=None):
'''
Check if a number of components has been set and ensure that the number is less than
the minimum axis length of the data. If both conditions are met, use fsvd. If not,
use the regular svd.
C.Smith -- We might need to put a lower limit on num_comps in the future. I don't
know enough about svd to be sure.
'''
print('Performing SVD decomposition')
super(SVD, self).__init__(h5_main)
self.process_name = 'SVD'
U, S, V = randomized_svd(func(h5_main), num_comps, n_iter=3)
svd_type = 'sklearn-randomized'
print('SVD took {} seconds. Writing results to file.'.format(round(time.time() - t1, 2)))
'''
Create datasets for V and S, deleting original arrays afterward to save
memory.
'''
ds_S = MicroDataset('S', data=np.float32(S))
ds_S.attrs['labels'] = {'Principal Component': [slice(0, None)]}
ds_S.attrs['units'] = ''
ds_inds = MicroDataset('Component_Indices', data=np.uint32(np.arange(len(S))))
ds_inds.attrs['labels'] = {'Principal Component': [slice(0, None)]}
ds_inds.attrs['units'] = ''
del S
u_chunks = calc_chunks(U.shape, np.float32(0).itemsize)
ds_U = MicroDataset('U', data=np.float32(U), chunking=u_chunks)
del U
V = transformToTargetType(V, h5_main.dtype)
v_chunks = calc_chunks(V.shape, h5_main.dtype.itemsize)
ds_V = MicroDataset('V', data=V, chunking=v_chunks)
del V
'''
Create the Group to hold the results and add the existing datasets as
children
'''
grp_name = dset_name + '-SVD_'
svd_grp = MicroDataGroup(grp_name, h5_main.parent.name[1:])
svd_grp.addChildren([ds_V, ds_S, ds_U, ds_inds])
'''
Write the attributes to the group
'''
svd_grp.attrs['num_components'] = num_comps
svd_grp.attrs['svd_method'] = svd_type
'''
Write the data and retrieve the HDF5 objects then delete the Microdatasets
'''
hdf = ioHDF5(h5_main.file)
h5_svd_refs = hdf.writeData(svd_grp)
h5_U = getH5DsetRefs(['U'], h5_svd_refs)[0]
h5_S = getH5DsetRefs(['S'], h5_svd_refs)[0]
h5_V = getH5DsetRefs(['V'], h5_svd_refs)[0]
h5_svd_inds = getH5DsetRefs(['Component_Indices'], h5_svd_refs)[0]
h5_svd_grp = h5_S.parent
# copy attributes
copy_main_attributes(h5_main, h5_V)
h5_V.attrs['units'] = np.array(['a. u.'], dtype='S')
del ds_S, ds_V, ds_U, svd_grp
# Will attempt to see if there is anything linked to this dataset.
# Since I was meticulous about the translators that I wrote, I know I will find something here
checkAndLinkAncillary(h5_U,
['Position_Indices', 'Position_Values'],
h5_main=h5_main)
checkAndLinkAncillary(h5_V,
['Position_Indices', 'Position_Values'],
anc_refs=[h5_svd_inds, h5_S])
checkAndLinkAncillary(h5_U,
['Spectroscopic_Indices', 'Spectroscopic_Values'],
anc_refs=[h5_svd_inds, h5_S])
checkAndLinkAncillary(h5_V,
['Spectroscopic_Indices', 'Spectroscopic_Values'],
h5_main=h5_main)
'''
Check h5_main for plot group references.
Copy them into V if they exist
'''
for key in h5_main.attrs.keys():
if '_Plot_Group' not in key:
continue
ref_inds = getH5RegRefIndices(h5_main.attrs[key], h5_main, return_method='corners')
ref_inds = ref_inds.reshape([-1, 2, 2])
ref_inds[:, 1, 0] = h5_V.shape[0] - 1
svd_ref = createRefFromIndices(h5_V, ref_inds)
h5_V.attrs[key] = svd_ref
return h5_svd_grp
'''
Calculate the size of the main data in memory and compare to max_mem
We use the minimum of the actual dtype's itemsize and float32 since we
don't want to read it in yet and do the proper type conversions.
'''
self.data_transform_func, is_complex, is_compound, n_features, n_samples, type_mult = check_dtype(h5_main)
if num_components is None:
num_components = min(n_samples, n_features)
else:
num_components = min(n_samples, n_features, num_components)
self.num_components = num_components
self.parms_dict = {'num_components': num_components}
self.duplicate_h5_groups = self._check_for_duplicates()
def compute(self):
"""
Computes SVD and writes results to file
Returns
-------
h5_results_grp : h5py.Datagroup object
Datagroup containing all the results
"""
'''
Check if a number of components has been set and ensure that the number is less than
the minimum axis length of the data. If both conditions are met, use fsvd. If not,
use the regular svd.
C.Smith -- We might need to put a lower limit on num_comps in the future. I don't
know enough about svd to be sure.
'''
print('Performing SVD decomposition')
t1 = time.time()
U, S, V = randomized_svd(self.data_transform_func(self.h5_main), self.num_components, n_iter=3)
print('SVD took {} seconds. Writing results to file.'.format(round(time.time() - t1, 2)))
self._write_results_chunk(U, S, V)
del U, S, V
return self.h5_results_grp
def _write_results_chunk(self, U, S, V):
"""
Writes the provided SVD results to file
Parameters
----------
U : array-like
Abundance matrix
S : array-like
Variance vector
V : array-like
Eigenvector matrix
"""
ds_S = MicroDataset('S', data=np.float32(S))
ds_S.attrs['labels'] = {'Principal Component': [slice(0, None)]}
ds_S.attrs['units'] = ''
ds_inds = MicroDataset('Component_Indices', data=np.uint32(np.arange(len(S))))
ds_inds.attrs['labels'] = {'Principal Component': [slice(0, None)]}
ds_inds.attrs['units'] = ''
del S
u_chunks = calc_chunks(U.shape, np.float32(0).itemsize)
ds_U = MicroDataset('U', data=np.float32(U), chunking=u_chunks)
del U
V = transformToTargetType(V, self.h5_main.dtype)
v_chunks = calc_chunks(V.shape, self.h5_main.dtype.itemsize)
ds_V = MicroDataset('V', data=V, chunking=v_chunks)
del V
'''
Create the Group to hold the results and add the existing datasets as
children
'''
grp_name = self.h5_main.name.split('/')[-1] + '-' + self.process_name + '_'
svd_grp = MicroDataGroup(grp_name, self.h5_main.parent.name[1:])
svd_grp.addChildren([ds_V, ds_S, ds_U, ds_inds])
'''
Write the attributes to the group
'''
svd_grp.attrs = self.parms_dict
svd_grp.attrs.update({'svd_method': 'sklearn-randomized', 'last_pixel': self.h5_main.shape[0] - 1})
'''
Write the data and retrieve the HDF5 objects then delete the Microdatasets
'''
hdf = ioHDF5(self.h5_main.file)
h5_svd_refs = hdf.writeData(svd_grp)
h5_U = getH5DsetRefs(['U'], h5_svd_refs)[0]
h5_S = getH5DsetRefs(['S'], h5_svd_refs)[0]
h5_V = getH5DsetRefs(['V'], h5_svd_refs)[0]
h5_svd_inds = getH5DsetRefs(['Component_Indices'], h5_svd_refs)[0]
self.h5_results_grp = h5_S.parent
# copy attributes
copy_main_attributes(self.h5_main, h5_V)
h5_V.attrs['units'] = np.array(['a. u.'], dtype='S')
del ds_S, ds_V, ds_U, svd_grp
# Will attempt to see if there is anything linked to this dataset.
# Since I was meticulous about the translators that I wrote, I know I will find something here
checkAndLinkAncillary(h5_U,
['Position_Indices', 'Position_Values'],
h5_main=self.h5_main)
checkAndLinkAncillary(h5_V,
['Position_Indices', 'Position_Values'],
anc_refs=[h5_svd_inds, h5_S])
checkAndLinkAncillary(h5_U,
['Spectroscopic_Indices', 'Spectroscopic_Values'],
anc_refs=[h5_svd_inds, h5_S])
checkAndLinkAncillary(h5_V,
['Spectroscopic_Indices', 'Spectroscopic_Values'],
h5_main=self.h5_main)
'''
Check h5_main for plot group references.
Copy them into V if they exist
'''
for key in self.h5_main.attrs.keys():
if '_Plot_Group' not in key:
continue
ref_inds = getH5RegRefIndices(self.h5_main.attrs[key], self.h5_main, return_method='corners')
ref_inds = ref_inds.reshape([-1, 2, 2])
ref_inds[:, 1, 0] = h5_V.shape[0] - 1
svd_ref = createRefFromIndices(h5_V, ref_inds)
h5_V.attrs[key] = svd_ref
###############################################################################
def simplifiedKPCA(kpca, source_data):
def simplified_kpca(kpca, source_data):
"""
Performs kernel PCA on the provided dataset and returns the familiar
eigenvector, eigenvalue, and scree matrices.
......