Commit ab06ad52 authored by CompPhysChris's avatar CompPhysChris Committed by GitHub

Merge pull request #116 from pycroscopy/cades_dev

Cades dev
parents 4ba8f9fb fffc90fb
"""
========================================================================================================
Tutorials for Developing Scientific Workflows in Pycroscopy - Part 3: Handling Multidimensional datasets
Tutorial 3: Handling Multidimensional datasets
========================================================================================================
**Suhas Somnath**
8/8/2017
This set of notebooks will serve as examples for developing end-to-end workflows for and using pycroscopy.
This set of tutorials will serve as examples for developing end-to-end workflows for, and using, pycroscopy.
**In this example, we will learn how to slice multidimensional datasets.**
......@@ -19,6 +19,8 @@ matrices, namely the spectroscopic indices and values matrix as well as the posi
will be essential for reshaping the data back to its original N dimensional form and for slicing multidimensional
datasets
We highly recommend reading about the pycroscopy data format and HDF5 files before proceeding with this example.
"""
# Ensure python 3 compatibility:
......@@ -57,7 +59,7 @@ import pycroscopy as px
# download the raw data file from Github:
h5_path = 'temp_3.h5'
url = 'https://raw.githubusercontent.com/pycroscopy/pycroscopy/master/data/FORC_BEPS.h5'
url = 'https://raw.githubusercontent.com/pycroscopy/pycroscopy/cades_dev/data/BEPS_small.h5'
if os.path.exists(h5_path):
os.remove(h5_path)
_ = wget.download(url, h5_path, bar=None)
......@@ -202,3 +204,146 @@ pos_dim_sizes = get_dim_sizes(h5_pos_ind, is_position=True)
spec_dim_sizes = get_dim_sizes(h5_spec_ind)
print('Positions:', pos_dim_sizes, '\nSpectroscopic:', spec_dim_sizes)
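#########################################################################
# Note: the body of the `get_dim_sizes` helper used above is elided from this
# diff. Below is a minimal sketch of what such a helper might look like -- an
# assumption for illustration, not the tutorial's actual implementation. The
# size of each dimension is simply the number of unique index values it takes.

def get_dim_sizes_sketch(ind_dset, is_position=False):
    inds = np.atleast_2d(ind_dset[()])
    if is_position:
        # Position indices are (positions x dims); transpose to (dims x positions)
        inds = inds.T
    # Each row now holds the indices of a single dimension
    return [len(np.unique(row)) for row in inds]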
#########################################################################
# Slicing the Main dataset
# ========================
#
# Let's assume that we are interested in visualizing the spectrogram for the first field of the second cycle at
# the position row 3, column 2. There are two ways of accessing the data:
# 1. The easier method - reshape the data to N dimensions and slice the dataset
#    * This approach, while trivial, may not be suitable for large datasets that do not fit in memory
# 2. The harder method - find the spectroscopic and position indices of interest and slice the 2D dataset
#
# Approach 1 - N-dimensional form
# -------------------------------
# We will use a convenient pycroscopy function that safely reshapes the data to its N dimensional form with a single
# line. Note that while this approach appears simple on the surface, a fair number of lines of code make up this
# function.
ds_nd, success, labels = px.hdf_utils.reshape_to_Ndims(h5_main, get_labels=True)
print('Shape of the N-dimensional dataset:', ds_nd.shape)
print(labels)
#########################################################################
# Now that we have the data in its original N dimensional form, we can easily slice the dataset:
spectrogram = ds_nd[2,3, :, 1, :, 0]
# Now the spectrogram is of order (frequency x DC_Offset).
spectrogram = spectrogram.T
# Now the spectrogram is of order (DC_Offset x frequency)
fig, axis = plt.subplots()
axis.imshow(np.abs(spectrogram), origin='lower')
axis.set_xlabel('Frequency Index')
axis.set_ylabel('DC Offset Index')
axis.set_title('Spectrogram Amplitude');
#########################################################################
# Approach 2 - slicing the 2D matrix
# ----------------------------------
#
# This approach is more hands-on and requires that we be very careful with the indexing and slicing. Nonetheless,
# the process is actually fairly intuitive. We rely entirely upon the spectroscopic and position ancillary datasets
# to find the indices for slicing the dataset. Unlike the main dataset, the ancillary datasets are very small and
# can be stored easily in memory. Once the slicing indices are calculated, we __only read the desired portion of
# the `main` data into memory__. Thus the amount of data loaded into memory is only the amount that we absolutely need.
# __This is the only approach that can be applied to slice very large datasets without overwhelming memory overheads__.
# The comments on each line explain the entire process comprehensively.
# Get only the spectroscopic dimension names:
spec_dim_names = px.hdf_utils.get_attr(h5_spec_ind, 'labels')
# Find the row in the spectroscopic indices that corresponds to the dimensions we want to slice:
cycle_row_ind = np.where(spec_dim_names == 'Cycle')[0][0]
# Find the row corresponding to Field in the same way:
field_row_ind = np.where(spec_dim_names == 'Field')[0][0]
# Find all the spectral indices corresponding to the second cycle:
desired_cycle = h5_spec_ind[cycle_row_ind] == 1
# Do the same to find the spectral indices for the first field:
desired_field = h5_spec_ind[field_row_ind] == 0
# Now find the indices where the cycle = 1 and the field = 0 using a logical AND statement:
spec_slice = np.logical_and(desired_cycle, desired_field)
# We will use the same approach to find the position indices
# corresponding to the row index of 3 and column index of 2:
pos_dim_names = px.hdf_utils.get_attr(h5_pos_ind,'labels')
x_col_ind = np.where(pos_dim_names == 'X')[0][0]
y_col_ind = np.where(pos_dim_names == 'Y')[0][0]
desired_x = h5_pos_ind[:, x_col_ind] == 2
desired_y = h5_pos_ind[:, y_col_ind] == 3
pos_slice = np.logical_and(desired_x, desired_y)
# Now use the spectroscopic and position slice arrays to slice the 2D dataset:
data_vec = h5_main[pos_slice, :][:, spec_slice]
print('Sliced data is of shape:', data_vec.shape)
#########################################################################
# Note that the sliced data is effectively one dimensional since the spectroscopic dimensions were flattened to a
# single dimension.
#
# Now that we have the data we are interested in, all we need to do is reshape the vector to the expected 2D
# spectrogram shape. We still have to be careful about the order of the axes when reshaping the vector to the
# 2D matrix. Note that NumPy's default (C-ordered) reshape expects the slower axis before the faster axis.
# Reshape this dataset to the 2D spectrogram that we desire:
# For this we need to find the size of the data in the DC_Offset and Frequency dimensions:
dc_dim_ind = np.where(spec_dim_names == 'DC_Offset')[0][0]
# Find the row corresponding to Frequency in the same way:
freq_dim_ind = np.where(spec_dim_names == 'Frequency')[0][0]
dc_dim_size = spec_dim_sizes[dc_dim_ind]
freq_dim_size = spec_dim_sizes[freq_dim_ind]
# Since we know that the DC offset varies slower than the frequency, we reshape
# the data vector by (dc_dim_size, freq_dim_size)
print('We need to reshape the vector by the tuple:', (dc_dim_size, freq_dim_size))
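#########################################################################
# A tiny toy illustration of this C-order reshape convention (not part of the
# original tutorial): with 2 slow steps and 3 fast steps, the slow axis is
# listed first and consecutive (fast) values fill each row.
toy_vec = np.arange(6)
print(toy_vec.reshape(2, 3))
# [[0 1 2]
#  [3 4 5]]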
#########################################################################
# The dimensions in the ancillary datasets may or may not be arranged from fastest to slowest, even though that is
# part of the requirements. We can still account for this. If we don't know the order in which to reshape the data
# vector because we don't know which dimension varies faster than the other(s), we need to sort the dimensions by
# how fast their indices change. Fortunately, pycroscopy has a function called `px.hdf_utils.get_sort_order` that
# does just this. Knowing the sort order, we can easily reshape correctly in an automated manner.
# We will do this below
# Sort the spectroscopic dimensions by how fast their indices change (fastest --> slowest)
spec_sort_order = px.hdf_utils.get_sort_order(h5_spec_ind)
print('Spectroscopic dimensions arranged as is:\n',
spec_dim_names)
print('Dimension indices arranged from fastest to slowest:',
spec_sort_order)
print('Dimension names now arranged from fastest to slowest:\n',
spec_dim_names[spec_sort_order])
# Convert the sort order (dimension indices listed fastest --> slowest) into per-dimension ranks:
spec_rank = np.argsort(spec_sort_order)  # rank 0 = fastest
if spec_rank[dc_dim_ind] > spec_rank[freq_dim_ind]:
spectrogram_shape = (dc_dim_size, freq_dim_size)
else:
spectrogram_shape = (freq_dim_size, dc_dim_size)
print('We need to reshape the vector by the tuple:', spectrogram_shape)
# Reshaping from 1D to 2D:
spectrogram2 = np.reshape(np.squeeze(data_vec), spectrogram_shape)
#########################################################################
# Now that the spectrogram is indeed two dimensional, we can visualize it
# Now the spectrogram is of order (DC_Offset x frequency)
fig, axis = plt.subplots()
axis.imshow(np.abs(spectrogram2), origin='lower')
axis.set_xlabel('Frequency Index')
axis.set_ylabel('DC Offset Index')
axis.set_title('Spectrogram Amplitude');
#########################################################################
# Close and delete the h5_file
h5_file.close()
os.remove(h5_path)
"""
=================================================================
Basic data analysis - Spectral Unmixing, KMeans, PCA, NFINDR etc.
Spectral Unmixing
=================================================================
R. K. Vasudevan\ :sup:`1,2`\ , S. Somnath\ :sup:`3`
......
"""
======================================================================================
Tutorials for Developing Scientific Workflows in Pycroscopy - Part 1: Data Translation
Tutorial 1: Data Translation
======================================================================================
**Suhas Somnath**
8/8/2017
This set of notebooks will serve as examples for developing and end-to-end workflows for and using pycroscopy.
This set of tutorials will serve as examples for developing end-to-end workflows for, and using, pycroscopy.
**In this example, we extract data and parameters from a Scanning Tunnelling Spectroscopy (STS) raw data file, as
obtained from an Omicron STM, and write these to a pycroscopy compatible data file.**
......
"""
====================================================================================================
Tutorials for Developing Scientific Workflows in Pycroscopy - Part 2: Writing to pycroscopy H5 files
Tutorial 2: Writing to pycroscopy H5 files
====================================================================================================
**Suhas Somnath**
8/8/2017
This set of notebooks will serve as examples for developing end-to-end workflows for and using pycroscopy.
This set of tutorials will serve as examples for developing end-to-end workflows for, and using, pycroscopy.
While pycroscopy contains many popular data processing functions, it may not have the function you need. Since
pycroscopy is data-centric, it is preferable to write processing results back to the same file as well.
......@@ -15,7 +15,7 @@ is data-centric, it is preferable to write processing results back to the same f
back to the file.**
K-Means clustering is a quick and simple method to determine the types of spectral responses present in the data and
their spatial occurrance.
their spatial occurrence.
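A generic scikit-learn sketch of the idea (illustrative only -- an assumption, not the pycroscopy API used in
this tutorial):

    from sklearn.cluster import KMeans
    # main_data: (n_positions, n_spectral_points) -- the pycroscopy `main` dataset layout
    km = KMeans(n_clusters=4).fit(main_data)
    label_map = km.labels_.reshape(n_rows, n_cols)  # spatial occurrence of each response type
    cluster_centers = km.cluster_centers_           # the representative spectral responses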
Introduction:
=============
......@@ -23,40 +23,8 @@ Introduction:
Data structuring and file format:
=================================
**Before proceeding with this example, we highly recommend you read about the data formatting in pycroscopy as well as
reading and writing to HDF5 files.** We will summarize some key points below:
* pycroscopy uses **Hierarchical Data Format (HDF5)** files to store data
* These HDF5 or H5 files contain datasets and datagroups
* pycroscopy data files have two kinds of datasets:
* **`main`** datasets: These must be of the form: `[instance, features]`.
* All imaging or measurement data fall in this category, where positions form the instances and the spectral
points form the features. Thus, even standard 2D images or a single spectrum satisfy this condition.
* A collection of `k` chosen spectra would still satisfy this condition. Some examples include:
* the cluster centers obtained from a clustering algorithm like `k-Means clustering`.
* The abundance maps obtained from decomposition algorithms like `Singular Value Decomposition (SVD)` or
`Non-negative matrix factorization (NMF)`
* **`ancillary`** datasets: All other datasets fall into this category. These include the frequency vector or bias
vector as a function of which the main dataset was collected.
* pycroscopy stores all data in two dimensional matrices with all position dimensions collapsed to the first dimension
and all other (spectroscopic) dimensions collapsed to the second dimension.
* All these **`main`** datasets are always accompanied by four ancillary datasets:
* Position Indices
* Position Values
* Spectroscopic Indices
* Spectroscopic Values
* These ancillary datasets are always two dimensional.
* The Position datasets are NxM where N is the total number of positions and M is the number of position dimensions.
* The Spectroscopic datasets are MxN where M is the number of spectroscopic dimensions and N is the total number of spectroscopic steps.
* All **`main`** datasets always have two attributes that describe the measurement itself:
* `quantity`: The physical quantity contained in each cell of the dataset - such as voltage, current, force etc.
* `units`: The units for the physical quantity such as `V` for volts, `nA` for nano amperes, `pN` for pico newtons
etc.
* All **`main`** datasets additionally have 4 attributes that provide references or links to the 4 aforementioned
ancillary datasets
* Storing just the references allows us to re-use the same position / spectroscopic datasets without having to
remake them
* For more information see the data format documentation
**Before proceeding with this example, we highly recommend you read about data formatting in pycroscopy as well as
reading and writing to HDF5 files.**
This bookkeeping is necessary for helping the code understand the dimensionality and structure of the data. While
these rules may seem tedious, several functions and a few classes make these tasks much easier.
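As a concrete (hypothetical) illustration of these shapes, a 64 x 64 grid measured over a single 100-point bias
vector would be stored as:

    import numpy as np
    n_rows, n_cols, n_steps = 64, 64, 100
    main_data = np.random.rand(n_rows * n_cols, n_steps)            # (4096, 100): [instance, features]
    x_inds, y_inds = np.meshgrid(np.arange(n_cols), np.arange(n_rows))
    pos_inds = np.column_stack((x_inds.ravel(), y_inds.ravel()))    # (4096, 2): N positions x M dims
    spec_inds = np.arange(n_steps).reshape(1, -1)                   # (1, 100): M dims x N steps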
......
......@@ -585,7 +585,7 @@ def get_formatted_labels(h5_dset):
return None
def reshape_to_Ndims(h5_main, h5_pos=None, h5_spec=None, get_labels=False):
def reshape_to_Ndims(h5_main, h5_pos=None, h5_spec=None, get_labels=False, verbose=False, sort_dims=False):
"""
Reshape the input 2D matrix to be N-dimensions based on the
position and spectroscopic datasets.
......@@ -598,8 +598,13 @@ def reshape_to_Ndims(h5_main, h5_pos=None, h5_spec=None, get_labels=False):
Position indices corresponding to rows in `h5_main`
h5_spec : HDF5 Dataset, optional
Spectroscopic indices corresponding to columns in `h5_main`
get_labels : bool
Should the labels be returned. Default False
get_labels : bool, optional
Whether or not to return the dimension labels. Default False
verbose : bool, optional
Whether or not to print debugging statements
sort_dims : bool, optional
If True, the data is sorted so that the dimensions are in order from fastest to slowest.
If `get_labels` is also True, the labels are sorted as well.
Returns
-------
......@@ -689,12 +694,27 @@ def reshape_to_Ndims(h5_main, h5_pos=None, h5_spec=None, get_labels=False):
pos_sort = get_sort_order(np.transpose(ds_pos))
spec_sort = get_sort_order(ds_spec)
if verbose:
print('Position dimensions:', get_attr(h5_pos, 'labels'))
print('Position sort order:', pos_sort)
print('Spectroscopic Dimensions:', get_attr(h5_spec, 'labels'))
print('Spectroscopic sort order:', spec_sort)
'''
Get the size of each dimension in the sorted order
'''
pos_dims = get_dimensionality(np.transpose(ds_pos), pos_sort)
spec_dims = get_dimensionality(ds_spec, spec_sort)
if verbose:
print('\nPosition dimensions (sort applied):', get_attr(h5_pos, 'labels')[pos_sort])
print('Position dimensionality (sort applied):', pos_dims)
print('Spectroscopic dimensions (sort applied):', get_attr(h5_spec, 'labels')[spec_sort])
print('Spectroscopic dimensionality (sort applied):', spec_dims)
all_labels = np.hstack((get_attr(h5_pos, 'labels')[pos_sort][::-1],
get_attr(h5_spec, 'labels')[spec_sort][::-1]))
ds_main = h5_main[()]
"""
......@@ -718,14 +738,30 @@ def reshape_to_Ndims(h5_main, h5_pos=None, h5_spec=None, get_labels=False):
except:
raise
if verbose:
print('\nAfter first reshape, labels are', all_labels)
print('Data shape is', ds_Nd.shape)
"""
Now we transpose the axes for both the position and spectroscopic dimensions
so that they are in the same order as in the index array
"""
swap_axes = np.append(pos_sort.size - 1 - np.argsort(pos_sort),
spec_sort.size - spec_sort - 1 + len(pos_dims))
if not sort_dims:
swap_axes = np.append(pos_sort.size - 1 - pos_sort,
spec_sort.size - spec_sort - 1 + len(pos_dims))
else:
swap_axes = np.append(pos_sort[::-1], spec_sort[::-1] + len(pos_dims))
ds_Nd2 = np.transpose(ds_Nd, swap_axes)
if verbose:
print('\nAxes will be permuted in this order:', swap_axes)
print('New labels ordering:', all_labels[swap_axes])
ds_Nd = np.transpose(ds_Nd, swap_axes)
results = [ds_Nd, True]
if verbose:
print('Dataset now of shape:', ds_Nd.shape)
if get_labels:
'''
......@@ -736,15 +772,15 @@ def reshape_to_Ndims(h5_main, h5_pos=None, h5_spec=None, get_labels=False):
else:
pos_labs = np.array(['' for _ in pos_dims])
if isinstance(h5_spec, h5py.Dataset):
spec_labs = get_attr(h5_spec, 'labels')[spec_sort]
spec_labs = get_attr(h5_spec, 'labels')
else:
spec_labs = np.array(['' for _ in spec_dims])
ds_labels = np.hstack([pos_labs, spec_labs])
ds_labels = np.hstack([pos_labs[pos_sort[::-1]], spec_labs[spec_sort[::-1]]])
return ds_Nd2, True, ds_labels
else:
return ds_Nd2, True
results.append(ds_labels[swap_axes])
return results
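# A minimal usage sketch for the updated signature (the h5 objects are assumed
# to exist; illustrative only, not part of this diff):
#
#     ds_nd, success, labels = reshape_to_Ndims(h5_main, get_labels=True,
#                                               verbose=True, sort_dims=True)
#     # with sort_dims=True, the axes (and labels) run fastest --> slowest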
def reshape_from_Ndims(ds_Nd, h5_pos=None, h5_spec=None):
......@@ -1513,3 +1549,55 @@ def create_spec_inds_from_vals(ds_spec_val_mat):
ds_spec_inds_mat[change_sort, jcol] = indices
return ds_spec_inds_mat
def get_unit_values(h5_spec_ind, h5_spec_val, dim_names=None):
"""
Gets the unit arrays of values that describe the spectroscopic dimensions
Parameters
----------
h5_spec_ind : h5py.Dataset
Spectroscopic Indices dataset
h5_spec_val : h5py.Dataset
Spectroscopic Values dataset
dim_names : str, or list of str, Optional
Names of the dimensions of interest
Note - this function can be extended / modified for ancillary position dimensions as well
Returns
-------
unit_values : dict
Dictionary containing the unit array for each dimension. The name of the dimensions are the keys.
"""
# For all dimensions, find where the index = 0
# basically, we are indexing all dimensions to 0
first_indices = []
for dim_ind in range(h5_spec_ind.shape[0]):
first_indices.append(h5_spec_ind[dim_ind] == 0)
first_indices = np.vstack(first_indices)
spec_dim_names = get_attr(h5_spec_ind, 'labels')
if dim_names is None:
dim_names = spec_dim_names
elif not isinstance(dim_names, list):
dim_names = [dim_names]
unit_values = dict()
for dim_name in dim_names:
# Find the row in the spectroscopic indices that corresponds to the dimensions we want to slice:
desired_row_ind = np.where(spec_dim_names == dim_name)[0][0]
# Find indices of all other dimensions
remaining_dims = list(range(h5_spec_ind.shape[0]))
remaining_dims.remove(desired_row_ind)
# The intersection of all these indices should give the desired index for the desired row
intersections = np.all(first_indices[remaining_dims, :], axis=0)
# apply this slicing to the values dataset:
unit_values[dim_name] = h5_spec_val[desired_row_ind, intersections]
return unit_values
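# Usage sketch (hypothetical datasets; the dimension name is assumed for
# illustration):
#
#     unit_vals = get_unit_values(h5_spec_ind, h5_spec_val, dim_names='DC_Offset')
#     dc_vec = unit_vals['DC_Offset']  # one full bias vector, all other dims held at index 0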
......@@ -36,11 +36,11 @@ from .ptychography import PtychographyTranslator
from .sporc import SporcTranslator
from .time_series import MovieTranslator
from .translator import Translator
from .beps_data_generator import FakeDataGenerator
from .beps_data_generator import FakeBEPSGenerator
from .labview_h5_patcher import LabViewH5Patcher
__all__ = ['Translator', 'BEodfTranslator', 'BEPSndfTranslator', 'BEodfRelaxationTranslator',
'GIVTranslator', 'GLineTranslator', 'GTuneTranslator', 'GDMTranslator', 'PtychographyTranslator',
'SporcTranslator', 'MovieTranslator', 'IgorIBWTranslator', 'NumpyTranslator',
'OneViewTranslator', 'ImageTranslator', 'NDataTranslator', 'FakeDataGenerator',
'OneViewTranslator', 'ImageTranslator', 'NDataTranslator', 'FakeBEPSGenerator',
'LabViewH5Patcher']
......@@ -22,7 +22,7 @@ from .df_utils.beps_gen_utils import get_noise_vec, beps_image_folder
from .df_utils.io_image import read_image, no_bin
class FakeDataGenerator(Translator):
class FakeBEPSGenerator(Translator):
"""
"""
......@@ -34,7 +34,7 @@ class FakeDataGenerator(Translator):
"""
"""
super(FakeDataGenerator, self).__init__(*args, **kwargs)
super(FakeBEPSGenerator, self).__init__(*args, **kwargs)
self.N_x = None
self.N_y = None
self.n_steps = None
......@@ -125,7 +125,7 @@ class FakeDataGenerator(Translator):
return new_file_list
def translate(self, h5_path, n_steps, n_bins, start_freq, end_freq,
def translate(self, h5_path, n_steps=32, n_bins=37, start_freq=300E+3, end_freq=350E+3,
data_type='BEPSData', mode='DC modulation mode', field_mode='in and out-of-field',
n_cycles=1, FORC_cycles=1, FORC_repeats=1, loop_a=1, loop_b=4,
cycle_frac='full', image_folder=beps_image_folder, bin_factor=None,
......@@ -136,14 +136,18 @@ class FakeDataGenerator(Translator):
----------
h5_path : str
Desired path to write the new HDF5 file
n_steps : uint
n_steps : uint, optional
Number of voltage steps
n_bins : n_bins
Default - 32
n_bins : uint, optional
Number of frequency bins
start_freq : float
Default - 37
start_freq : float, optional
Starting frequency in Hz
end_freq : float
Default - 300E+3
end_freq : float, optional
Final frequency in Hz
Default - 350E+3
data_type : str, optional
Type of data to generate
Options - 'BEPSData', 'BELineData'
......
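# Usage sketch for the new defaults (assumed call pattern; not shown in this diff):
#
#     FakeBEPSGenerator().translate('fake_beps.h5')  # n_steps=32, n_bins=37, 300-350 kHz sweep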
......@@ -163,7 +163,7 @@ class Decomposition(object):
h5_decomp_refs = hdf.writeData(decomp_grp)
h5_components = getH5DsetRefs(['Components'], h5_decomp_refs)[0]
h5_projections = getH5DsetRefs(['Mean_Response'], h5_decomp_refs)[0]
h5_projections = getH5DsetRefs(['Projection'], h5_decomp_refs)[0]
h5_decomp_inds = getH5DsetRefs(['Decomposition_Indices'], h5_decomp_refs)[0]
h5_decomp_vals = getH5DsetRefs(['Decomposition_Values'], h5_decomp_refs)[0]
......