Commit 929f9cdd authored by Unknown's avatar Unknown
Browse files

Update the multidimensional data tutorial

parent fa42748e
......@@ -58,8 +58,8 @@ import pycroscopy as px
# dimensions in addition to the spectra itself. Hence, this dataet becomes a 2+4 = 6 dimensional dataset
# download the raw data file from Github:
h5_path = 'temp.h5'
url = 'https://raw.githubusercontent.com/pycroscopy/pycroscopy/master/data/FORC_BEPS.h5'
h5_path = 'temp_3.h5'
url = 'https://raw.githubusercontent.com/pycroscopy/pycroscopy/cades_dev/data/BEPS_small.h5'
if os.path.exists(h5_path):
os.remove(h5_path)
_ = wget.download(url, h5_path, bar=None)
......@@ -201,3 +201,146 @@ pos_dim_sizes = get_dim_sizes(h5_pos_ind, is_position=True)
spec_dim_sizes = get_dim_sizes(h5_spec_ind)
print('Positions:', pos_dim_sizes, '\nSpectroscopic:', spec_dim_sizes)
#########################################################################
# Slicing the Main dataset
# ========================
#
# Let's assume that we are interested in visualizing the spectrograms at the first field of the second cycle at
# position - row:3 and column 2. There are two ways of accessing the data:
# 1. The easier method - reshape the data to N dimensions and slice the dataset
# * This approach, while trivial, may not be suitable for large datasets which may or may not fit in memory
# 2. The harder method - find the spectroscopic and position indices of interest and slice the 2D dataset
#
# Approach 1 - N-dimensional form
# -------------------------------
# We will use convenient pycroscopy function that safely reshapes the data to its N dimensional form with a single
# line. Note that while this approach appears simple on the surface, there are a fair number of lines of code that
# make up this function.
ds_nd, success, labels = px.hdf_utils.reshape_to_Ndims(h5_main, get_labels=True)
print('Shape of the N-dimensional dataset:', ds_nd.shape)
print(labels)
#########################################################################
# Now that we have the data in its original N dimensional form, we can easily slice the dataset:
spectrogram = ds_nd[2,3, :, 1, :, 0]
# Now the spectrogram is of order (frequency x DC_Offset).
spectrogram = spectrogram.T
# Now the spectrogram is of order (DC_Offset x frequency)
fig, axis = plt. subplots()
axis.imshow(np.abs(spectrogram), origin='lower')
axis.set_xlabel('Frequency Index')
axis.set_ylabel('DC Offset Index')
axis.set_title('Spectrogram Amplitude');
#########################################################################
# Approach 2 - slicing the 2D matrix
# ----------------------------------
#
# This approach is more hands-on and requires that we be very careful with the indexing and slicing. Nonetheless,
# the process is actually fairly intuitive. We rely entirely upon the spectroscopic and position ancillary datasets
# to find the indices for slicing the dataset. Unlike the main dataset, the ancillary datasets are very small and
# can be stored easily in memory. Once the slicing indices are calculated, we __only read the desired portion of
# `main` data to memory__. Thus the amount of data loaded into memory is only the amount that we absolutely need.
# __This is the only approach that can be applied to slice very large datasets without ovwhelming memory overheads__.
# The comments for each line explain the entire process comprehensively
# Get only the spectroscopic dimension names:
spec_dim_names = px.hdf_utils.get_attr(h5_spec_ind, 'labels')
# Find the row in the spectroscopic indices that corresponds to the dimensions we want to slice:
cycle_row_ind = np.where(spec_dim_names == 'Cycle')[0][0]
# Find the row correspoding to field in the same way:
field_row_ind = np.where(spec_dim_names == 'Field')[0][0]
# Find all the spectral indices corresponding to the second cycle:
desired_cycle = h5_spec_ind[cycle_row_ind] == 1
# Do the same to find the spectral indicies for the first field:
desired_field = h5_spec_ind[field_row_ind] == 0
# Now find the indices where the cycle = 1 and the field = 0 using a logical AND statement:
spec_slice = np.logical_and(desired_cycle, desired_field)
# We will use the same approach to find the position indices
# corresponding to the row index of 3 and colum index of 2:
pos_dim_names = px.hdf_utils.get_attr(h5_pos_ind,'labels')
x_col_ind = np.where(pos_dim_names == 'X')[0][0]
y_col_ind = np.where(pos_dim_names == 'Y')[0][0]
desired_x = h5_pos_ind[:, x_col_ind] == 2
desired_y = h5_pos_ind[:, y_col_ind] == 3
pos_slice = np.logical_and(desired_x, desired_y)
# Now use the spectroscopic and position slice arrays to slice the 2D dataset:
data_vec = h5_main[pos_slice, :][:, spec_slice]
print('Sliced data is of shape:', data_vec.shape)
#########################################################################
# Note that the sliced data is effectively one dimensional since the spectroscopic dimensions were flattened to a
# single dimension.
#
# Now that we have the data we are interested in, all we need to do is reshape the vector to the expected 2D
# spectrogram shape. We still have to be careful about the order of the indices for reshaping the vector to the
# 2D matrix. Note that in python, we specify the slower axis before the faster axis in the reshape command.
# Reshape this dataset to the 2D spectrogram that we desire:
# For this we need to find the size of the data in the DC_offset and Frequency dimensions:
dc_dim_ind = np.where(spec_dim_names == 'DC_Offset')[0][0]
# Find the row correspoding to field in the same way:
freq_dim_ind = np.where(spec_dim_names == 'Frequency')[0][0]
dc_dim_size = spec_dim_sizes[dc_dim_ind]
freq_dim_size = spec_dim_sizes[freq_dim_ind]
# Since we know that the DC offset varies slower than the frequency, we reshape the
# the data vector by (dc_dim_size, freq_dim_size)
print('We need to reshape the vector by the tuple:', (dc_dim_size, freq_dim_size))
#########################################################################
# The dimensions in the ancillary datasets may or may not be arranged from fastest to slowest even though that is
# part of the requirements. We can still account for this. In the event that we don't know the order in which to
# reshape the data vector because we don't know which dimension varies faster than the other(s), we would need to
# sort the dimensions by how fast their indices change. Fortuantely, pycroscopy has a function called `px.hdf_utils.
# get_sort_order` that does just this. Knowing the sort order, we can easily reshape correctly in an automated manner.
# We will do this below
# Sort the spectroscopic dimensions by how fast their indices changes (fastest --> slowest)
spec_sort_order = px.hdf_utils.get_sort_order(h5_spec_ind)
print('Spectroscopic dimensions arranged as is:\n',
spec_dim_names)
print('Dimension indices arranged from fastest to slowest:',
spec_sort_order)
print('Dimension namess now arranged from fastest to slowest:\n',
spec_dim_names[spec_sort_order])
if spec_sort_order[dc_dim_ind] > spec_sort_order[freq_dim_ind]:
spectrogram_shape = (dc_dim_size, freq_dim_size)
else:
spectrogram_shape = (freq_dim_size, dc_dim_size)
print('We need to reshape the vector by the tuple:', spectrogram_shape)
# Reshaping from 1D to 2D:
spectrogram2 = np.reshape(np.squeeze(data_vec), spectrogram_shape)
#########################################################################
# Now that the spectrogram is indeed two dimensional, we can visualize it
# Now the spectrogram is of order (DC_Offset x frequency)
fig, axis = plt. subplots()
axis.imshow(np.abs(spectrogram2), origin='lower')
axis.set_xlabel('Frequency Index')
axis.set_ylabel('DC Offset Index')
axis.set_title('Spectrogram Amplitude');
#########################################################################
# Close and delete the h5_file
h5_file.close()
os.remove(h5_path)
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment