Commit 7ecc40ba authored by Unknown

Merge remote-tracking branch 'origin/cades_dev' into cades_dev

parents c85bb335 ac711b42
pycroscopy/io/hdf_utils.py:
@@ -1611,18 +1611,20 @@ def create_spec_inds_from_vals(ds_spec_val_mat):
return ds_spec_inds_mat
-def get_unit_values(h5_spec_ind, h5_spec_val, dim_names=None):
+def get_unit_values(h5_inds, h5_vals, is_spec=True, dim_names=None):
"""
Gets the unit arrays of values that describe the spectroscopic dimensions
Parameters
----------
-    h5_spec_ind : h5py.Dataset
-        Spectroscopic Indices dataset
-    h5_spec_val : h5py.Dataset
-        Spectroscopic Values dataset
+    h5_inds : h5py.Dataset
+        Spectroscopic or Position Indices dataset
+    h5_vals : h5py.Dataset
+        Spectroscopic or Position Values dataset
+    is_spec : bool, optional
+        Whether the provided datasets are spectroscopic. Default = True
     dim_names : str, or list of str, optional
-        Names of the dimensions of interest
+        Names of the dimensions of interest. Default = all
Note - this function can be extended / modified for ancillary position dimensions as well
@@ -1632,33 +1634,41 @@ def get_unit_values(h5_spec_ind, h5_spec_val, dim_names=None):
Dictionary containing the unit array for each dimension. The names of the dimensions are the keys.
"""
+    # First load to memory
+    inds_mat = h5_inds[()]
+    vals_mat = h5_vals[()]
+    if not is_spec:
+        # Convert to spectral shape
+        inds_mat = np.transpose(inds_mat)
+        vals_mat = np.transpose(vals_mat)
# For all dimensions, find where the index = 0
# basically, we are indexing all dimensions to 0
first_indices = []
-    for dim_ind in range(h5_spec_ind.shape[0]):
-        first_indices.append(h5_spec_ind[dim_ind] == 0)
+    for dim_ind in range(inds_mat.shape[0]):
+        first_indices.append(inds_mat[dim_ind] == 0)
first_indices = np.vstack(first_indices)
-    spec_dim_names = get_attr(h5_spec_ind, 'labels')
+    full_dim_names = get_attr(h5_inds, 'labels')
if dim_names is None:
-        dim_names = spec_dim_names
+        dim_names = full_dim_names
elif not isinstance(dim_names, list):
dim_names = [dim_names]
unit_values = dict()
for dim_name in dim_names:
# Find the row in the spectroscopic indices that corresponds to the dimensions we want to slice:
-        desired_row_ind = np.where(spec_dim_names == dim_name)[0][0]
+        desired_row_ind = np.where(full_dim_names == dim_name)[0][0]
# Find indices of all other dimensions
-        remaining_dims = list(range(h5_spec_ind.shape[0]))
+        remaining_dims = list(range(inds_mat.shape[0]))
remaining_dims.remove(desired_row_ind)
# The intersection of all these indices should give the desired index for the desired row
intersections = np.all(first_indices[remaining_dims, :], axis=0)
# apply this slicing to the values dataset:
-        unit_values[dim_name] = h5_spec_val[desired_row_ind, intersections]
+        unit_values[dim_name] = vals_mat[desired_row_ind, intersections]
return unit_values
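# A minimal usage sketch for the updated get_unit_values() (not part of this
# commit). The file name and group path are hypothetical placeholders for any
# pycroscopy-formatted file; the module path follows this branch's layout.
import h5py
from pycroscopy.io.hdf_utils import get_unit_values

with h5py.File('example.h5', mode='r') as h5_f:
    h5_grp = h5_f['Measurement_000/Channel_000']
    # Spectroscopic ancillary pair: is_spec defaults to True
    spec_unit_vals = get_unit_values(h5_grp['Spectroscopic_Indices'],
                                     h5_grp['Spectroscopic_Values'])
    # Position ancillary pair: pass is_spec=False so the matrices are
    # transposed into the spectroscopic (dimensions-along-rows) layout
    pos_unit_vals = get_unit_values(h5_grp['Position_Indices'],
                                    h5_grp['Position_Values'],
                                    is_spec=False)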
pycroscopy/io/pycro_data.py:
@@ -13,6 +13,7 @@ import numpy as np
from .hdf_utils import checkIfMain, get_attr, get_data_descriptor, get_formatted_labels, \
get_dimensionality, get_sort_order, get_unit_values, reshape_to_Ndims
from .io_utils import transformToReal
+from ..viz.jupyter_utils import simple_ndim_visualizer
class PycroDataset(h5py.Dataset):
@@ -339,3 +340,26 @@ class PycroDataset(h5py.Dataset):
return transformToReal(data_slice), success
else:
return data_slice, success
+    def visualize(self, slice_dict=None, **kwargs):
+        """
+        Interactive visualization of this dataset. Only available within a Jupyter notebook
+
+        Parameters
+        ----------
+        slice_dict : dictionary, optional
+            Slicing instructions
+        """
+        # TODO: Robust implementation that allows slicing
+        if len(self.pos_dim_labels + self.spec_dim_labels) > 4:
+            raise NotImplementedError('Unable to support visualization of more than 4 dimensions. Try slicing')
+        data_mat = self.get_n_dim_form()
+        pos_dim_names = self.pos_dim_labels[::-1]
+        spec_dim_names = self.spec_dim_labels
+        pos_dim_units_old = get_attr(self.h5_pos_inds, 'units')
+        spec_dim_units_old = get_attr(self.h5_spec_inds, 'units')
+        pos_ref_vals = get_unit_values(self.h5_pos_inds, self.h5_pos_vals, is_spec=False)
+        spec_ref_vals = get_unit_values(self.h5_spec_inds, self.h5_spec_vals, is_spec=True)
+        simple_ndim_visualizer(data_mat, pos_dim_names, pos_dim_units_old, spec_dim_names, spec_dim_units_old,
+                               pos_ref_vals=pos_ref_vals, spec_ref_vals=spec_ref_vals, **kwargs)
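# A minimal notebook sketch of the new visualize() method (file and dataset
# paths are hypothetical placeholders; the import path is assumed from this
# branch's layout; requires ipywidgets inside a running Jupyter notebook):
import h5py
from pycroscopy.io.pycro_data import PycroDataset

h5_f = h5py.File('example.h5', mode='r')
pycro_main = PycroDataset(h5_f['Measurement_000/Channel_000/Raw_Data'])
pycro_main.visualize()  # spatial map + spectrogram panels with slice sliders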
pycroscopy/processing/__init__.py:
@@ -26,7 +26,7 @@ from . import fft
from . import gmode_utils
from . import proc_utils
from . import svd_utils
-from .svd_utils import doSVD, rebuild_svd
+from .svd_utils import SVD, rebuild_svd
from . import decomposition
from .decomposition import Decomposition
from . import cluster
@@ -58,5 +58,5 @@ else:
FeatureExtractor = FeatureExtractorParallel
geoTransformer = geoTransformerParallel
-__all__ = ['Cluster', 'Decomposition', 'ImageWindow', 'doSVD', 'fft', 'gmode_utils', 'proc_utils', 'svd_utils',
+__all__ = ['Cluster', 'Decomposition', 'ImageWindow', 'SVD', 'fft', 'gmode_utils', 'proc_utils', 'svd_utils',
            'giv_utils', 'rebuild_svd', 'Process', 'parallel_compute', 'GIVBayesian', 'SignalFilter']
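# Call-site migration implied by this import change, as a sketch (the file
# and dataset paths are hypothetical placeholders):
import h5py
from pycroscopy.processing import SVD

h5_f = h5py.File('example.h5', mode='r+')
h5_main = h5_f['Measurement_000/Channel_000/Raw_Data']
svd_proc = SVD(h5_main, num_components=64)  # was: doSVD(h5_main, num_comps=64)
h5_svd_grp = svd_proc.compute()             # same results group as before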
pycroscopy/processing/svd_utils.py:
@@ -13,49 +13,46 @@ import numpy as np
from sklearn.utils import gen_batches
from sklearn.utils.extmath import randomized_svd
-from ..io.hdf_utils import getH5DsetRefs, checkAndLinkAncillary, findH5group, create_empty_dataset, \
+from .process import Process
+from ..io.hdf_utils import getH5DsetRefs, checkAndLinkAncillary, findH5group, \
getH5RegRefIndices, createRefFromIndices, checkIfMain, calc_chunks, copy_main_attributes, copyAttributes
from ..io.io_hdf5 import ioHDF5
from ..io.io_utils import check_dtype, transformToTargetType, getAvailableMem
from ..io.microdata import MicroDataset, MicroDataGroup
-def doSVD(h5_main, num_comps=None):
-    """
-    Does SVD on the provided dataset and writes the result. File is not closed
+class SVD(Process):
-    Parameters
-    ----------
-    h5_main : h5py.Dataset reference
-        Reference to the dataset on which SVD will be performed
-    num_comps : Unsigned integer (Optional)
-        Number of principal components of interest
+    def __init__(self, h5_main, num_components=None):
-    Returns
-    -------
-    h5_pca : h5py.Datagroup reference
-        Reference to the group containing the PCA results
-    """
-    if not checkIfMain(h5_main):
-        warn('Dataset does not meet requirements for performing PCA.')
-        return
-    dset_name = h5_main.name.split('/')[-1]
-    t1 = time.time()
+        super(SVD, self).__init__(h5_main)
+        self.process_name = 'SVD'
'''
Calculate the size of the main data in memory and compare to max_mem
We use the minimum of the actual dtype's itemsize and float32 since we
don't want to read it in yet and do the proper type conversions.
'''
-    func, is_complex, is_compound, n_features, n_samples, type_mult = check_dtype(h5_main)
+        self.data_transform_func, is_complex, is_compound, n_features, n_samples, type_mult = check_dtype(h5_main)
-    if num_comps is None:
-        num_comps = min(n_samples, n_features)
+        if num_components is None:
+            num_components = min(n_samples, n_features)
    else:
-        num_comps = min(n_samples, n_features, num_comps)
+            num_components = min(n_samples, n_features, num_components)
+        self.num_components = num_components
+        self.parms_dict = {'num_components': num_components}
+        self.duplicate_h5_groups = self._check_for_duplicates()

+    def compute(self):
+        """
+        Computes SVD and writes results to file
+
+        Returns
+        -------
+        h5_results_grp : h5py.Datagroup object
+            Datagroup containing all the results
+        """
        '''
        Check if a number of components has been set and ensure that the number is less than
@@ -67,16 +64,31 @@ def doSVD(h5_main, num_comps=None):
'''
print('Performing SVD decomposition')
-    U, S, V = randomized_svd(func(h5_main), num_comps, n_iter=3)
-    svd_type = 'sklearn-randomized'
+        t1 = time.time()
+        U, S, V = randomized_svd(self.data_transform_func(self.h5_main), self.num_components, n_iter=3)
print('SVD took {} seconds. Writing results to file.'.format(round(time.time() - t1, 2)))
'''
Create datasets for V and S, deleting original arrays afterward to save
memory.
'''
+        self._write_results_chunk(U, S, V)
+        del U, S, V
+        return self.h5_results_grp

+    def _write_results_chunk(self, U, S, V):
+        """
+        Writes the provided SVD results to file
+
+        Parameters
+        ----------
+        U : array-like
+            Abundance matrix
+        S : array-like
+            Variance vector
+        V : array-like
+            Eigenvector matrix
+        """
ds_S = MicroDataset('S', data=np.float32(S))
ds_S.attrs['labels'] = {'Principal Component': [slice(0, None)]}
ds_S.attrs['units'] = ''
@@ -89,8 +101,8 @@ def doSVD(h5_main, num_comps=None):
ds_U = MicroDataset('U', data=np.float32(U), chunking=u_chunks)
del U
-    V = transformToTargetType(V, h5_main.dtype)
-    v_chunks = calc_chunks(V.shape, h5_main.dtype.itemsize)
+        V = transformToTargetType(V, self.h5_main.dtype)
+        v_chunks = calc_chunks(V.shape, self.h5_main.dtype.itemsize)
ds_V = MicroDataset('V', data=V, chunking=v_chunks)
del V
@@ -98,30 +110,30 @@ def doSVD(h5_main, num_comps=None):
Create the Group to hold the results and add the existing datasets as
children
'''
-    grp_name = dset_name + '-SVD_'
-    svd_grp = MicroDataGroup(grp_name, h5_main.parent.name[1:])
+        grp_name = self.h5_main.name.split('/')[-1] + '-' + self.process_name + '_'
+        svd_grp = MicroDataGroup(grp_name, self.h5_main.parent.name[1:])
svd_grp.addChildren([ds_V, ds_S, ds_U, ds_inds])
'''
Write the attributes to the group
'''
-    svd_grp.attrs['num_components'] = num_comps
-    svd_grp.attrs['svd_method'] = svd_type
+        svd_grp.attrs = self.parms_dict
+        svd_grp.attrs.update({'svd_method': 'sklearn-randomized', 'last_pixel': self.h5_main.shape[0] - 1})
'''
Write the data and retrieve the HDF5 objects then delete the Microdatasets
'''
-    hdf = ioHDF5(h5_main.file)
+        hdf = ioHDF5(self.h5_main.file)
h5_svd_refs = hdf.writeData(svd_grp)
h5_U = getH5DsetRefs(['U'], h5_svd_refs)[0]
h5_S = getH5DsetRefs(['S'], h5_svd_refs)[0]
h5_V = getH5DsetRefs(['V'], h5_svd_refs)[0]
h5_svd_inds = getH5DsetRefs(['Component_Indices'], h5_svd_refs)[0]
-    h5_svd_grp = h5_S.parent
+        self.h5_results_grp = h5_S.parent
# copy attributes
-    copy_main_attributes(h5_main, h5_V)
+        copy_main_attributes(self.h5_main, h5_V)
h5_V.attrs['units'] = np.array(['a. u.'], dtype='S')
del ds_S, ds_V, ds_U, svd_grp
@@ -130,7 +142,7 @@ def doSVD(h5_main, num_comps=None):
# Since I was meticulous about the translators that I wrote, I know I will find something here
checkAndLinkAncillary(h5_U,
['Position_Indices', 'Position_Values'],
-                           h5_main=h5_main)
+                               h5_main=self.h5_main)
checkAndLinkAncillary(h5_V,
['Position_Indices', 'Position_Values'],
@@ -142,17 +154,17 @@ def doSVD(h5_main, num_comps=None):
checkAndLinkAncillary(h5_V,
['Spectroscopic_Indices', 'Spectroscopic_Values'],
-                           h5_main=h5_main)
+                               h5_main=self.h5_main)
'''
Check h5_main for plot group references.
Copy them into V if they exist
'''
-    for key in h5_main.attrs.keys():
+        for key in self.h5_main.attrs.keys():
if '_Plot_Group' not in key:
continue
-        ref_inds = getH5RegRefIndices(h5_main.attrs[key], h5_main, return_method='corners')
+            ref_inds = getH5RegRefIndices(self.h5_main.attrs[key], self.h5_main, return_method='corners')
ref_inds = ref_inds.reshape([-1, 2, 2])
ref_inds[:, 1, 0] = h5_V.shape[0] - 1
@@ -160,12 +172,10 @@ def doSVD(h5_main, num_comps=None):
h5_V.attrs[key] = svd_ref
-    return h5_svd_grp
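# Continuing the migration sketch above: the group returned by compute()
# holds the datasets written in _write_results_chunk() (names per this diff):
h5_u = h5_svd_grp['U']  # abundance maps, float32
h5_s = h5_svd_grp['S']  # variance (singular value) vector, float32
h5_v = h5_svd_grp['V']  # eigenvectors, cast back to the source dtype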
###############################################################################
-def simplifiedKPCA(kpca, source_data):
+def simplified_kpca(kpca, source_data):
"""
Performs kernel PCA on the provided dataset and returns the familiar
eigenvector, eigenvalue, and scree matrices.
pycroscopy/viz/__init__.py:
@@ -9,10 +9,11 @@ Submodules
be_viz_utils
plot_utils
+    jupyter_utils
"""
from . import tests
from . import plot_utils
from . import be_viz_utils
-__all__ = ['plot_utils', 'be_viz_utils']
+__all__ = ['plot_utils', 'be_viz_utils', 'jupyter_utils']
"""
Created on 11/11/16 10:08 AM
@author: Suhas Somnath, Chris Smith
"""
import matplotlib.pyplot as plt
from IPython.display import display
import ipywidgets as widgets
import numpy as np
from .plot_utils import single_img_cbar_plot
def simple_ndim_visualizer(data_mat, pos_dim_names, pos_dim_units_old, spec_dim_names, spec_dim_units_old,
pos_ref_vals=None, spec_ref_vals=None, pos_plot_2d=True, spec_plot_2d=True, spec_xdim=None,
pos_xdim=None):
"""
    Generates a simple interactive visualizer for datasets with up to four dimensions. The visualizer will ONLY
    work within the context of a Jupyter notebook!
    The visualizer consists of two panels: a spatial map and a spectrogram. Slider widgets will be generated to
    slice the remaining dimensions. The data matrix can be real, complex, or compound valued.
Parameters
----------
data_mat : numpy.ndarray
Data to be visualized
pos_dim_names : list of strings
Names of the position dimensions
pos_dim_units_old : list of strings
Units for the position dimensions
spec_dim_names : list of strings
Names of the spectroscopic dimensions
spec_dim_units_old : list of strings
Units for the spectroscopic dimensions
pos_ref_vals : dictionary, optional
Dictionary of names and reference values for each of the position dimensions.
Default - linear distribution for each dimension
spec_ref_vals : dictionary, optional
Dictionary of names and reference values for each of the spectroscopic dimensions.
Default - linear distribution for each dimension
pos_plot_2d : bool, optional
Whether or not to plot spatial data as 2D images. Default = True
spec_plot_2d : bool, optional
Whether or not to plot spectral data as 2D images. Default = True
spec_xdim : str, optional
Name of dimension with respect to which the spectral data will be plotted for 1D plots
pos_xdim : str, optional
Name of dimension with respect to which the position data will be plotted for 1D plots
"""
    def check_data_type(data_mat):
        # Compound (structured) datasets: plot one named field at a time
        if data_mat.dtype.names is not None:
            return 2, list(data_mat.dtype.names), None
        # Complex datasets: offer the four standard components
        # (np.complex64 / np.complex128 cover the deprecated np.complex alias)
        if data_mat.dtype in [np.complex64, np.complex128]:
            return 1, ['Real', 'Imaginary', 'Amplitude', 'Phase'], [np.real, np.imag, np.abs, np.angle]
        else:
            return 0, None, None

    def get_clims(data, stdev=2):
        # Color limits: mean +/- stdev standard deviations
        avg = np.mean(data)
        std = np.std(data)
        return avg - stdev * std, avg + stdev * std
    def get_slice_string(slice_dict, dim_names, values_dict, units_dict):
        # Build a human-readable summary of where the listed dimensions are sliced
        slice_str = ''
        for cur_name in dim_names:
            slice_str += '{} = {} {}\n'.format(cur_name,
                                               values_dict[cur_name][slice_dict[cur_name]],
                                               units_dict[cur_name])
        slice_str = slice_str[:-1]
        return slice_str

    def get_slicing_tuple(slice_dict):
        # None -> keep the whole dimension; an integer -> single-element slice
        slice_list = []
        for dim_name in pos_dim_names + spec_dim_names:
            cur_slice = slice(None)
            if slice_dict[dim_name] is not None:
                cur_slice = slice(slice_dict[dim_name], slice_dict[dim_name] + 1)
            slice_list.append(cur_slice)
        return tuple(slice_list)
    def naive_slice(data_mat, slice_dict):
        # Apply the slicing tuple and drop the singleton dimensions
        return np.squeeze(data_mat[get_slicing_tuple(slice_dict)])

    def get_spatmap_slice_dict(slice_dict=None):
        # Keep position dimensions whole; slice spectroscopic dimensions at the
        # requested (or middle) index
        slice_dict = {} if slice_dict is None else slice_dict
        spatmap_slicing = {}
        for name in pos_dim_names:
            spatmap_slicing[name] = None
        for ind, name in enumerate(spec_dim_names):
            spatmap_slicing[name] = slice_dict.get(name, data_mat.shape[ind + len(pos_dim_names)] // 2)
        return spatmap_slicing

    def get_spgram_slice_dict(slice_dict=None):
        # Keep spectroscopic dimensions whole; slice position dimensions
        slice_dict = {} if slice_dict is None else slice_dict
        spgram_slicing = {}
        for ind, name in enumerate(pos_dim_names):
            spgram_slicing[name] = slice_dict.get(name, data_mat.shape[ind] // 2)
        for name in spec_dim_names:
            spgram_slicing[name] = None
        return spgram_slicing
    def update_image(img_handle, data_mat, slice_dict, twoD=True):
        if twoD:
            img_handle.set_data(naive_slice(data_mat, slice_dict))
        else:
            # Force 2D so a single curve still zips correctly with one line handle
            y_mat = np.atleast_2d(naive_slice(data_mat, slice_dict))
            if y_mat.shape[0] != len(img_handle):
                y_mat = y_mat.T
            for line_handle, y_vec in zip(img_handle, y_mat):
                line_handle.set_ydata(y_vec)
            img_handle[0].axes.set_ylim([np.min(y_mat), np.max(y_mat)])
# ###########################################################################
pos_plot_2d = pos_plot_2d and len(pos_dim_names) > 1
spec_plot_2d = spec_plot_2d and len(spec_dim_names) > 1
if not spec_plot_2d and spec_xdim is None:
# Take the largest dimension you can find:
spec_xdim = spec_dim_names[np.argmax(data_mat.shape[len(pos_dim_names):])]
if not pos_plot_2d and pos_xdim is None:
# Take the largest dimension you can find:
pos_xdim = pos_dim_names[np.argmax(data_mat.shape[:len(pos_dim_names)])]
    if pos_ref_vals is None:
        # Default: linear index distribution for each position dimension
        pos_ref_vals = {}
        for ind, name in enumerate(pos_dim_names):
            pos_ref_vals[name] = np.arange(data_mat.shape[ind])
    if spec_ref_vals is None:
        # Default: linear index distribution for each spectroscopic dimension
        spec_ref_vals = {}
        for ind, name in enumerate(spec_dim_names):
            spec_ref_vals[name] = np.arange(data_mat.shape[ind + len(pos_dim_names)])
pos_dim_units = {}
spec_dim_units = {}
for name, unit in zip(pos_dim_names, pos_dim_units_old):
pos_dim_units[name] = unit
for name, unit in zip(spec_dim_names, spec_dim_units_old):
spec_dim_units[name] = unit
data_type, data_names, data_funcs = check_data_type(data_mat)
sub_data = data_mat
component_name = 'Real'
if data_type == 1:
sub_data = data_funcs[0](data_mat)
component_name = data_names[0]
elif data_type == 2:
component_name = data_names[0]
sub_data = data_mat[component_name]
component_title = 'Component: ' + component_name
clims = get_clims(sub_data)
spatmap_slicing = get_spatmap_slice_dict()
current_spatmap = naive_slice(sub_data, spatmap_slicing)
spgram_slicing = get_spgram_slice_dict()
current_spgram = naive_slice(sub_data, spgram_slicing)
# print(current_spatmap.shape, current_spgram.shape)
fig, axes = plt.subplots(ncols=2, figsize=(14,7))
# axes[0].hold(True)
spec_titles = get_slice_string(spatmap_slicing, spec_dim_names, spec_ref_vals, spec_dim_units)
axes[0].set_title('Spatial Map for\n' + component_title + '\n' + spec_titles)
if pos_plot_2d:
img_spat, cbar_spat = single_img_cbar_plot(axes[0], current_spatmap,
x_size=data_mat.shape[1], y_size=data_mat.shape[0],
clim=clims)
axes[0].set_xlabel(pos_dim_names[1] + ' (' + pos_dim_units_old[1] + ')')
axes[0].set_ylabel(pos_dim_names[0] + ' (' + pos_dim_units_old[0] + ')')
main_vert_line = axes[0].axvline(x=spgram_slicing[pos_dim_names[1]], color='k')
main_hor_line = axes[0].axhline(y=spgram_slicing[pos_dim_names[0]], color='k')
else:
axes[0].set_xlabel(pos_xdim + ' (' + pos_dim_units[pos_xdim] + ')')
if current_spatmap.shape[0] != pos_ref_vals[pos_xdim].size:
current_spatmap = current_spatmap.T
img_spat = axes[0].plot(pos_ref_vals[pos_xdim], current_spatmap)
if current_spatmap.ndim > 1:
other_pos_dim = pos_dim_names.copy()
other_pos_dim.remove(pos_xdim)
other_pos_dim = other_pos_dim[0]
axes[0].legend(pos_ref_vals[other_pos_dim])
pos_titles = get_slice_string(spgram_slicing, pos_dim_names, pos_ref_vals, pos_dim_units)
axes[1].set_title('Spectrogram for\n' + component_title + '\n' + pos_titles)
if spec_plot_2d:
axes[1].set_xlabel(spec_dim_names[1] + ' (' + spec_dim_units_old[1] + ')')
axes[1].set_ylabel(spec_dim_names[0] + ' (' + spec_dim_units_old[0] + ')')
img_spec, cbar_spec = single_img_cbar_plot(axes[1], current_spgram,
x_size=data_mat.shape[len(pos_dim_names) + 1],
y_size=data_mat.shape[len(pos_dim_names)],
cbar_label=component_name, clim=clims)
else:
axes[1].set_xlabel(spec_xdim + ' (' + spec_dim_units[spec_xdim] + ')')
if current_spgram.shape[0] != spec_ref_vals[spec_xdim].size:
current_spgram = current_spgram.T
img_spec = axes[1].plot(spec_ref_vals[spec_xdim], current_spgram)
if current_spgram.ndim > 1:
other_spec_dim = spec_dim_names.copy()
other_spec_dim.remove(spec_xdim)
other_spec_dim = other_spec_dim[0]
axes[1].legend(spec_ref_vals[other_spec_dim])
fig.tight_layout()
slice_dict = {}
    for dim_ind, dim_name in enumerate(pos_dim_names):
        slice_dict[dim_name] = (0, sub_data.shape[dim_ind] - 1, 1)
    for dim_ind, dim_name in enumerate(spec_dim_names):
        slice_dict[dim_name] = (0, sub_data.shape[dim_ind + len(pos_dim_names)] - 1, 1)
if data_type > 0:
slice_dict['component'] = data_names
    # hacky, but simple: share mutable state with the widget callback
    global_vars = {'sub_data': sub_data, 'component_title': component_title,
                   'component_name': component_name}

    def update_plots(**kwargs):
        # Restore the shared state; kwargs only carries the widget values
        sub_data = global_vars['sub_data']
        component_title = global_vars['component_title']
        component_name = kwargs.get('component', None)
        if component_name is not None:
            if component_name != global_vars['component_name']:
                # update the data and title:
                if data_type == 1:
                    func_ind = data_names.index(component_name)
                    sub_data = data_funcs[func_ind](data_mat)
                elif data_type == 2:
                    sub_data = data_mat[component_name]
                component_title = 'Component: ' + component_name
                # sub_data and component_title are local here - update the global vars!
                global_vars.update({'sub_data': sub_data, 'component_title': component_title,
                                    'component_name': component_name})
clims = get_clims(sub_data)
update_image(img_spat, sub_data, spatmap_slicing, twoD=pos_plot_2d)
if pos_plot_2d:
img_spat.set_clim(clims)
update_image(img_spec, sub_data, spgram_slicing, twoD=spec_plot_2d)
if spec_plot_2d:
img_spec.set_clim(clims)
spec_titles = get_slice_string(spatmap_slicing, spec_dim_names, spec_ref_vals, spec_dim_units)
axes[0].set_title('Spatial Map for\n' + component_title + '\n' + spec_titles)
pos_titles = get_slice_string(spgram_slicing, pos_dim_names, pos_ref_vals, pos_dim_units)
axes[1].set_title('Spectrogram for\n' + component_title + '\n' + pos_titles)
# print('Updated component!')
# Check to see if spectrogram needs to be updated:
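
# A synthetic-data sketch of driving simple_ndim_visualizer directly, under
# the assumption of 2 position and 2 spectroscopic dimensions (all names,
# units, and sizes below are illustrative):
if __name__ == '__main__':
    demo_data = np.random.rand(32, 32, 64, 2)  # (Y, X, Bias, Cycle)
    simple_ndim_visualizer(demo_data,
                           ['Y', 'X'], ['um', 'um'],
                           ['Bias', 'Cycle'], ['V', ''],
                           spec_xdim='Bias')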