Commit 7ecc40ba authored by Unknown

Merge remote-tracking branch 'origin/cades_dev' into cades_dev

parents c85bb335 ac711b42
@@ -1611,18 +1611,20 @@ def create_spec_inds_from_vals(ds_spec_val_mat):
return ds_spec_inds_mat
def get_unit_values(h5_spec_ind, h5_spec_val, dim_names=None):
def get_unit_values(h5_inds, h5_vals, is_spec=True, dim_names=None):
"""
Gets the unit arrays of values that describe the spectroscopic dimensions
Parameters
----------
h5_spec_ind : h5py.Dataset
Spectroscopic Indices dataset
h5_spec_val : h5py.Dataset
Spectroscopic Values dataset
h5_inds : h5py.Dataset
Spectroscopic or Position Indices dataset
h5_vals : h5py.Dataset
Spectroscopic or Position Values dataset
is_spec : bool, recommended
Whether the provided datasets are Spectroscopic (True) or Position (False) datasets. Default = True
dim_names : str, or list of str, Optional
Names of the dimensions of interest
Names of the dimensions of interest. Default = all
Note - this function can be extended / modified for ancillary position dimensions as well
@@ -1632,33 +1634,41 @@ def get_unit_values(h5_spec_ind, h5_spec_val, dim_names=None):
Dictionary containing the unit array for each dimension. The names of the dimensions are the keys.
"""
# First load to memory
inds_mat = h5_inds[()]
vals_mat = h5_vals[()]
if not is_spec:
# Convert to spectral shape
inds_mat = np.transpose(inds_mat)
vals_mat = np.transpose(vals_mat)
# For all dimensions, find where the index = 0
# basically, we are indexing all dimensions to 0
first_indices = []
for dim_ind in range(h5_spec_ind.shape[0]):
first_indices.append(h5_spec_ind[dim_ind] == 0)
for dim_ind in range(inds_mat.shape[0]):
first_indices.append(inds_mat[dim_ind] == 0)
first_indices = np.vstack(first_indices)
spec_dim_names = get_attr(h5_spec_ind, 'labels')
full_dim_names = get_attr(h5_inds, 'labels')
if dim_names is None:
dim_names = spec_dim_names
dim_names = full_dim_names
elif not isinstance(dim_names, list):
dim_names = [dim_names]
unit_values = dict()
for dim_name in dim_names:
# Find the row in the spectroscopic indices that corresponds to the dimensions we want to slice:
desired_row_ind = np.where(spec_dim_names == dim_name)[0][0]
desired_row_ind = np.where(full_dim_names == dim_name)[0][0]
# Find indices of all other dimensions
remaining_dims = list(range(h5_spec_ind.shape[0]))
remaining_dims = list(range(inds_mat.shape[0]))
remaining_dims.remove(desired_row_ind)
# The intersection of all these indices should give the desired index for the desired row
intersections = np.all(first_indices[remaining_dims, :], axis=0)
# apply this slicing to the values dataset:
unit_values[dim_name] = h5_spec_val[desired_row_ind, intersections]
unit_values[dim_name] = vals_mat[desired_row_ind, intersections]
return unit_values
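As an illustration of the new signature, a minimal usage sketch follows; the file path, group path, and the 'X' dimension name are placeholders for this example and are not part of the commit.
import h5py
from pycroscopy.io.hdf_utils import get_unit_values  # assumed import path for this function
h5_file = h5py.File('example.h5', mode='r')  # placeholder file
h5_grp = h5_file['Measurement_000/Channel_000']  # placeholder group holding the ancillary datasets
# Spectroscopic dimensions: the default behavior, equivalent to the old signature
spec_unit_vals = get_unit_values(h5_grp['Spectroscopic_Indices'], h5_grp['Spectroscopic_Values'])
# Position dimensions: now supported by transposing internally when is_spec=False
pos_unit_vals = get_unit_values(h5_grp['Position_Indices'], h5_grp['Position_Values'], is_spec=False, dim_names='X')
h5_file.close()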
@@ -13,6 +13,7 @@ import numpy as np
from .hdf_utils import checkIfMain, get_attr, get_data_descriptor, get_formatted_labels, \
get_dimensionality, get_sort_order, get_unit_values, reshape_to_Ndims
from .io_utils import transformToReal
from ..viz.jupyter_utils import simple_ndim_visualizer
class PycroDataset(h5py.Dataset):
@@ -339,3 +340,26 @@ class PycroDataset(h5py.Dataset):
return transformToReal(data_slice), success
else:
return data_slice, success
def visualize(self, slice_dict=None, **kwargs):
"""
Interactive visualization of this dataset. Only available within Jupyter notebooks
Parameters
----------
slice_dict : dictionary, optional
Slicing instructions
"""
# TODO: Robust implementation that allows slicing
if len(self.pos_dim_labels + self.spec_dim_labels) > 4:
raise NotImplementedError('Unable to support visualization of more than 4 dimensions. Try slicing')
data_mat = self.get_n_dim_form()
pos_dim_names = self.pos_dim_labels[::-1]
spec_dim_names = self.spec_dim_labels
pos_dim_units_old = get_attr(self.h5_pos_inds, 'units')
spec_dim_units_old = get_attr(self.h5_spec_inds, 'units')
pos_ref_vals = get_unit_values(self.h5_pos_inds, self.h5_pos_vals, is_spec=False)
spec_ref_vals = get_unit_values(self.h5_spec_inds, self.h5_spec_vals, is_spec=True)
simple_ndim_visualizer(data_mat, pos_dim_names, pos_dim_units_old, spec_dim_names, spec_dim_units_old,
pos_ref_vals=pos_ref_vals, spec_ref_vals=spec_ref_vals, **kwargs)
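A rough notebook-cell sketch of the new visualize() method; the import path, file name, and dataset path below are assumptions for illustration only, not part of this commit.
import h5py
from pycroscopy.io.pycro_data import PycroDataset  # assumed module path for PycroDataset
h5_file = h5py.File('example.h5', mode='r')  # placeholder file
pycro_main = PycroDataset(h5_file['Measurement_000/Channel_000/Raw_Data'])  # placeholder main dataset
pycro_main.visualize()  # builds the ipywidgets-based viewer via simple_ndim_visualizer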
@@ -26,7 +26,7 @@ from . import fft
from . import gmode_utils
from . import proc_utils
from . import svd_utils
from .svd_utils import doSVD, rebuild_svd
from .svd_utils import SVD, rebuild_svd
from . import decomposition
from .decomposition import Decomposition
from . import cluster
@@ -58,5 +58,5 @@ else:
FeatureExtractor = FeatureExtractorParallel
geoTransformer = geoTransformerParallel
__all__ = ['Cluster', 'Decomposition', 'ImageWindow', 'doSVD', 'fft', 'gmode_utils', 'proc_utils', 'svd_utils',
__all__ = ['Cluster', 'Decomposition', 'ImageWindow', 'SVD', 'fft', 'gmode_utils', 'proc_utils', 'svd_utils',
'giv_utils', 'rebuild_svd', 'Process', 'parallel_compute', 'GIVBayesian', 'SignalFilter']
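Since doSVD is no longer exported from pycroscopy.processing, a downstream import would change roughly as sketched below (assuming the caller previously used the removed function).
# from pycroscopy.processing import doSVD   # old, removed by this commit
from pycroscopy.processing import SVD        # new class-based entry point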
@@ -13,159 +13,169 @@ import numpy as np
from sklearn.utils import gen_batches
from sklearn.utils.extmath import randomized_svd
from ..io.hdf_utils import getH5DsetRefs, checkAndLinkAncillary, findH5group, create_empty_dataset, \
from .process import Process
from ..io.hdf_utils import getH5DsetRefs, checkAndLinkAncillary, findH5group, \
getH5RegRefIndices, createRefFromIndices, checkIfMain, calc_chunks, copy_main_attributes, copyAttributes
from ..io.io_hdf5 import ioHDF5
from ..io.io_utils import check_dtype, transformToTargetType, getAvailableMem
from ..io.microdata import MicroDataset, MicroDataGroup
def doSVD(h5_main, num_comps=None):
"""
Does SVD on the provided dataset and writes the result. File is not closed
Parameters
----------
h5_main : h5py.Dataset reference
Reference to the dataset on which SVD will be performed
num_comps : Unsigned integer (Optional)
Number of principal components of interest
Returns
-------
h5_pca : h5py.Datagroup reference
Reference to the group containing the PCA results
"""
if not checkIfMain(h5_main):
warn('Dataset does not meet requirements for performing PCA.')
return
dset_name = h5_main.name.split('/')[-1]
t1 = time.time()
class SVD(Process):
'''
Calculate the size of the main data in memory and compare to max_mem
We use the minimum of the actual dtype's itemsize and float32 since we
don't want to read it in yet and do the proper type conversions.
'''
func, is_complex, is_compound, n_features, n_samples, type_mult = check_dtype(h5_main)
if num_comps is None:
num_comps = min(n_samples, n_features)
else:
num_comps = min(n_samples, n_features, num_comps)
def __init__(self, h5_main, num_components=None):
'''
Check if a number of components has been set and ensure that the number is less than
the minimum axis length of the data. If both conditions are met, use fsvd. If not
use the regular svd.
C.Smith -- We might need to put a lower limit on num_comps in the future. I don't
know enough about svd to be sure.
'''
print('Performing SVD decomposition')
super(SVD, self).__init__(h5_main)
self.process_name = 'SVD'
U, S, V = randomized_svd(func(h5_main), num_comps, n_iter=3)
svd_type = 'sklearn-randomized'
print('SVD took {} seconds. Writing results to file.'.format(round(time.time() - t1, 2)))
'''
Create datasets for V and S, deleting original arrays afterward to save
memory.
'''
ds_S = MicroDataset('S', data=np.float32(S))
ds_S.attrs['labels'] = {'Principal Component': [slice(0, None)]}
ds_S.attrs['units'] = ''
ds_inds = MicroDataset('Component_Indices', data=np.uint32(np.arange(len(S))))
ds_inds.attrs['labels'] = {'Principal Component': [slice(0, None)]}
ds_inds.attrs['units'] = ''
del S
u_chunks = calc_chunks(U.shape, np.float32(0).itemsize)
ds_U = MicroDataset('U', data=np.float32(U), chunking=u_chunks)
del U
V = transformToTargetType(V, h5_main.dtype)
v_chunks = calc_chunks(V.shape, h5_main.dtype.itemsize)
ds_V = MicroDataset('V', data=V, chunking=v_chunks)
del V
'''
Create the Group to hold the results and add the existing datasets as
children
'''
grp_name = dset_name + '-SVD_'
svd_grp = MicroDataGroup(grp_name, h5_main.parent.name[1:])
svd_grp.addChildren([ds_V, ds_S, ds_U, ds_inds])
'''
Write the attributes to the group
'''
svd_grp.attrs['num_components'] = num_comps
svd_grp.attrs['svd_method'] = svd_type
'''
Write the data and retrieve the HDF5 objects then delete the Microdatasets
'''
hdf = ioHDF5(h5_main.file)
h5_svd_refs = hdf.writeData(svd_grp)
h5_U = getH5DsetRefs(['U'], h5_svd_refs)[0]
h5_S = getH5DsetRefs(['S'], h5_svd_refs)[0]
h5_V = getH5DsetRefs(['V'], h5_svd_refs)[0]
h5_svd_inds = getH5DsetRefs(['Component_Indices'], h5_svd_refs)[0]
h5_svd_grp = h5_S.parent
# copy attributes
copy_main_attributes(h5_main, h5_V)
h5_V.attrs['units'] = np.array(['a. u.'], dtype='S')
del ds_S, ds_V, ds_U, svd_grp
# Will attempt to see if there is anything linked to this dataset.
# Since I was meticulous about the translators that I wrote, I know I will find something here
checkAndLinkAncillary(h5_U,
['Position_Indices', 'Position_Values'],
h5_main=h5_main)
checkAndLinkAncillary(h5_V,
['Position_Indices', 'Position_Values'],
anc_refs=[h5_svd_inds, h5_S])
checkAndLinkAncillary(h5_U,
['Spectroscopic_Indices', 'Spectroscopic_Values'],
anc_refs=[h5_svd_inds, h5_S])
checkAndLinkAncillary(h5_V,
['Spectroscopic_Indices', 'Spectroscopic_Values'],
h5_main=h5_main)
'''
Check h5_main for plot group references.
Copy them into V if they exist
'''
for key in h5_main.attrs.keys():
if '_Plot_Group' not in key:
continue
ref_inds = getH5RegRefIndices(h5_main.attrs[key], h5_main, return_method='corners')
ref_inds = ref_inds.reshape([-1, 2, 2])
ref_inds[:, 1, 0] = h5_V.shape[0] - 1
svd_ref = createRefFromIndices(h5_V, ref_inds)
h5_V.attrs[key] = svd_ref
return h5_svd_grp
'''
Calculate the size of the main data in memory and compare to max_mem
We use the minimum of the actual dtype's itemsize and float32 since we
don't want to read it in yet and do the proper type conversions.
'''
self.data_transform_func, is_complex, is_compound, n_features, n_samples, type_mult = check_dtype(h5_main)
if num_components is None:
num_components = min(n_samples, n_features)
else:
num_components = min(n_samples, n_features, num_components)
self.num_components = num_components
self.parms_dict = {'num_components': num_components}
self.duplicate_h5_groups = self._check_for_duplicates()
def compute(self):
"""
Computes SVD and writes results to file
Returns
-------
h5_results_grp : h5py.Group object
Group containing all the results
"""
'''
Check if a number of components has been set and ensure that the number is less than
the minimum axis length of the data. If both conditions are met, use fsvd. If not
use the regular svd.
C.Smith -- We might need to put a lower limit on num_comps in the future. I don't
know enough about svd to be sure.
'''
print('Performing SVD decomposition')
t1 = time.time()
U, S, V = randomized_svd(self.data_transform_func(self.h5_main), self.num_components, n_iter=3)
print('SVD took {} seconds. Writing results to file.'.format(round(time.time() - t1, 2)))
self._write_results_chunk(U, S, V)
del U, S, V
return self.h5_results_grp
def _write_results_chunk(self, U, S, V):
"""
Writes the provided SVD results to file
Parameters
----------
U : array-like
Abundance matrix
S : array-like
Variance vector
V : array-like
Eigenvector matrix
"""
ds_S = MicroDataset('S', data=np.float32(S))
ds_S.attrs['labels'] = {'Principal Component': [slice(0, None)]}
ds_S.attrs['units'] = ''
ds_inds = MicroDataset('Component_Indices', data=np.uint32(np.arange(len(S))))
ds_inds.attrs['labels'] = {'Principal Component': [slice(0, None)]}
ds_inds.attrs['units'] = ''
del S
u_chunks = calc_chunks(U.shape, np.float32(0).itemsize)
ds_U = MicroDataset('U', data=np.float32(U), chunking=u_chunks)
del U
V = transformToTargetType(V, self.h5_main.dtype)
v_chunks = calc_chunks(V.shape, self.h5_main.dtype.itemsize)
ds_V = MicroDataset('V', data=V, chunking=v_chunks)
del V
'''
Create the Group to hold the results and add the existing datasets as
children
'''
grp_name = self.h5_main.name.split('/')[-1] + '-' + self.process_name + '_'
svd_grp = MicroDataGroup(grp_name, self.h5_main.parent.name[1:])
svd_grp.addChildren([ds_V, ds_S, ds_U, ds_inds])
'''
Write the attributes to the group
'''
svd_grp.attrs = self.parms_dict
svd_grp.attrs.update({'svd_method': 'sklearn-randomized', 'last_pixel': self.h5_main.shape[0] - 1})
'''
Write the data and retrieve the HDF5 objects then delete the Microdatasets
'''
hdf = ioHDF5(self.h5_main.file)
h5_svd_refs = hdf.writeData(svd_grp)
h5_U = getH5DsetRefs(['U'], h5_svd_refs)[0]
h5_S = getH5DsetRefs(['S'], h5_svd_refs)[0]
h5_V = getH5DsetRefs(['V'], h5_svd_refs)[0]
h5_svd_inds = getH5DsetRefs(['Component_Indices'], h5_svd_refs)[0]
self.h5_results_grp = h5_S.parent
# copy attributes
copy_main_attributes(self.h5_main, h5_V)
h5_V.attrs['units'] = np.array(['a. u.'], dtype='S')
del ds_S, ds_V, ds_U, svd_grp
# Will attempt to see if there is anything linked to this dataset.
# Since I was meticulous about the translators that I wrote, I know I will find something here
checkAndLinkAncillary(h5_U,
['Position_Indices', 'Position_Values'],
h5_main=self.h5_main)
checkAndLinkAncillary(h5_V,
['Position_Indices', 'Position_Values'],
anc_refs=[h5_svd_inds, h5_S])
checkAndLinkAncillary(h5_U,
['Spectroscopic_Indices', 'Spectroscopic_Values'],
anc_refs=[h5_svd_inds, h5_S])
checkAndLinkAncillary(h5_V,
['Spectroscopic_Indices', 'Spectroscopic_Values'],
h5_main=self.h5_main)
'''
Check h5_main for plot group references.
Copy them into V if they exist
'''
for key in self.h5_main.attrs.keys():
if '_Plot_Group' not in key:
continue
ref_inds = getH5RegRefIndices(self.h5_main.attrs[key], self.h5_main, return_method='corners')
ref_inds = ref_inds.reshape([-1, 2, 2])
ref_inds[:, 1, 0] = h5_V.shape[0] - 1
svd_ref = createRefFromIndices(h5_V, ref_inds)
h5_V.attrs[key] = svd_ref
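Taken together, the refactor turns the one-shot doSVD() call into a Process-style workflow. A hedged sketch follows, assuming h5_main is an already-opened pycroscopy main dataset and that duplicate_h5_groups lists previously written result groups as in the Process base class; the component count is arbitrary.
svd_proc = SVD(h5_main, num_components=64)  # h5_main and the component count are placeholders
print(svd_proc.duplicate_h5_groups)  # any earlier '-SVD_' groups with matching parameters could be reused instead of recomputing
h5_svd_grp = svd_proc.compute()  # runs randomized_svd, then _write_results_chunk writes U, S, V
h5_U, h5_S, h5_V = h5_svd_grp['U'], h5_svd_grp['S'], h5_svd_grp['V']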
###############################################################################
def simplifiedKPCA(kpca, source_data):
def simplified_kpca(kpca, source_data):
"""
Performs kernel PCA on the provided dataset and returns the familiar
eigenvector, eigenvalue, and scree matrices.
@@ -9,10 +9,11 @@ Submodules
be_viz_utils
plot_utils
jupyter_utils
"""
from . import tests
from . import plot_utils
from . import be_viz_utils
__all__ = ['plot_utils', 'be_viz_utils']
__all__ = ['plot_utils', 'be_viz_utils', 'jupyter_utils']
"""
Created on 11/11/16 10:08 AM
@author: Suhas Somnath, Chris Smith
"""
import matplotlib.pyplot as plt
from IPython.display import display
import ipywidgets as widgets
import numpy as np
from .plot_utils import single_img_cbar_plot
def simple_ndim_visualizer(data_mat, pos_dim_names, pos_dim_units_old, spec_dim_names, spec_dim_units_old,
pos_ref_vals=None, spec_ref_vals=None, pos_plot_2d=True, spec_plot_2d=True, spec_xdim=None,
pos_xdim=None):
"""
Generates a simple interactive visualizer for datasets with up to 4 dimensions. The visualizer will ONLY work
within the context of a Jupyter notebook!
The visualizer consists of two panels - a spatial map and spectrograms. Slider widgets will be generated to slice
dimensions. The data matrix can be real, complex, or compound valued
Parameters
----------
data_mat : numpy.array object
Data to be visualized
pos_dim_names : list of strings
Names of the position dimensions
pos_dim_units_old : list of strings
Units for the position dimensions
spec_dim_names : list of strings
Names of the spectroscopic dimensions
spec_dim_units_old : list of strings
Units for the spectroscopic dimensions
pos_ref_vals : dictionary, optional
Dictionary of names and reference values for each of the position dimensions.
Default - linear distribution for each dimension
spec_ref_vals : dictionary, optional
Dictionary of names and reference values for each of the spectroscopic dimensions.
Default - linear distribution for each dimension
pos_plot_2d : bool, optional
Whether or not to plot spatial data as 2D images. Default = True
spec_plot_2d : bool, optional
Whether or not to plot spectral data as 2D images. Default = True
spec_xdim : str, optional
Name of dimension with respect to which the spectral data will be plotted for 1D plots
pos_xdim : str, optional
Name of dimension with respect to which the position data will be plotted for 1D plots
"""
def check_data_type(data_mat):
if data_mat.dtype.names is not None:
return 2, list(data_mat.dtype.names), None
if data_mat.dtype in [np.complex64, np.complex128, np.complex]:
return 1, ['Real', 'Imaginary', 'Amplitude', 'Phase'], [np.real, np.imag, np.abs, np.angle]
else:
return 0, None, None
def get_clims(data, stdev=2):
avg = np.mean(data)
std = np.std(data)
return (avg - stdev * std, avg + stdev * std)
def get_slice_string(slice_dict, dim_names, values_dict, units_dict):
slice_str = ''
for cur_name in dim_names:
if cur_name in dim_names:
slice_str += '{} = {} {}\n'.format(cur_name,
values_dict[cur_name][slice_dict[cur_name]],
units_dict[cur_name])
slice_str = slice_str[:-1]
return slice_str
def get_slicing_tuple(slice_dict):
slice_list = []
for dim_name in pos_dim_names + spec_dim_names:
cur_slice = slice(None)
if slice_dict[dim_name] is not None:
cur_slice = slice(slice_dict[dim_name], slice_dict[dim_name]+1)
slice_list.append(cur_slice)
return tuple(slice_list)
def naive_slice(data_mat, slice_dict):
return np.squeeze(data_mat[get_slicing_tuple(slice_dict)])
def get_spatmap_slice_dict(slice_dict={}):
spatmap_slicing = {}
for name in pos_dim_names:
spatmap_slicing[name] = None
for ind, name in enumerate(spec_dim_names):
spatmap_slicing[name] = slice_dict.get(name, data_mat.shape[ind + len(pos_dim_names)] // 2)
return spatmap_slicing
def get_spgram_slice_dict(slice_dict={}):
spgram_slicing = {}
for ind, name in enumerate(pos_dim_names):
spgram_slicing[name] = slice_dict.get(name, data_mat.shape[ind] // 2)
for name in spec_dim_names:
spgram_slicing[name] = None
return spgram_slicing
def update_image(img_handle, data_mat, slice_dict, twoD=True):
if twoD:
img_handle.set_data(naive_slice(data_mat, slice_dict))
else:
y_mat = naive_slice(data_mat, slice_dict)
if y_mat.ndim > 1:
if y_mat.shape[0] != len(img_handle):
y_mat = y_mat.T
for line_handle, y_vec in zip(img_handle, y_mat):
line_handle.set_ydata(y_vec)
img_handle[0].get_axes().set_ylim([np.min(y_mat), np.max(y_mat)])
# ###########################################################################
pos_plot_2d = pos_plot_2d and len(pos_dim_names) > 1
spec_plot_2d = spec_plot_2d and len(spec_dim_names) > 1
if not spec_plot_2d and spec_xdim is None:
# Take the largest dimension you can find:
spec_xdim = spec_dim_names[np.argmax(data_mat.shape[len(pos_dim_names):])]
if not pos_plot_2d and pos_xdim is None:
# Take the largest dimension you can find:
pos_xdim = pos_dim_names[np.argmax(data_mat.shape[:len(pos_dim_names)])]
if pos_ref_vals is None:
# Default: a linear distribution for each position dimension
pos_ref_vals = {}
for ind, name in enumerate(pos_dim_names):
pos_ref_vals[name] = np.arange(data_mat.shape[ind])
if spec_ref_vals is None:
# Default: a linear distribution for each spectroscopic dimension
spec_ref_vals = {}
for ind, name in enumerate(spec_dim_names):
spec_ref_vals[name] = np.arange(data_mat.shape[ind + len(pos_dim_names)])
pos_dim_units = {}
spec_dim_units = {}
for name, unit in zip(pos_dim_names, pos_dim_units_old):
pos_dim_units[name] = unit
for name, unit in zip(spec_dim_names, spec_dim_units_old):
spec_dim_units[name] = unit
data_type, data_names, data_funcs = check_data_type(data_mat)
sub_data = data_mat
component_name = 'Real'
if data_type == 1:
sub_data = data_funcs[0](data_mat)
component_name = data_names[0]