plot_spectral_unmixing.py 12.6 KB
Newer Older
Unknown's avatar
Unknown committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
"""
=================================================================
Basic data analysis - Spectral Unmixing, KMeans, PCA, NFINDR etc.
=================================================================

R. K. Vasudevan\ :sup:`1,2`\ , S. Somnath\ :sup:`3`

* :sup:`1` Center for Nanophase Materials Sciences
* :sup:`2` Institute for Functional Imaging of Materials
* :sup:`3` Advanced Data and Workflows Group

Oak Ridge National Laboratory, Oak Ridge TN 37831, USA

In this notebook we load some spectral data, and perform basic data analysis, including:
========================================================================================
* KMeans Clustering
* Non-negative Matrix Factorization
* Principal Component Analysis
* NFINDR

Software Prerequisites:
=======================
* Standard distribution of **Anaconda** (includes numpy, scipy, matplotlib and sci-kit learn)
* **pysptools** (will automatically be installed in the next step)
* **cvxopt** for fully constrained least squares fitting
    * install in a terminal via **`conda install -c https://conda.anaconda.org/omnia cvxopt`**
* **pycroscopy** : Though pycroscopy is mainly used here for plotting purposes only, it's true capabilities
are realized through the ability to seamlessly perform these analyses on any imaging dataset (regardless
of origin, size, complexity) and storing the results back into the same dataset among other things

"""

#Import packages

# Ensure that this code works on both python 2 and python 3
from __future__ import division, print_function, absolute_import, unicode_literals

# basic numeric computation:
import numpy as np

# The package used for creating and manipulating HDF5 files:
import h5py

# Plotting and visualization:
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable

# for downloading files:
import wget
import os

# multivariate analysis:
from sklearn.cluster import KMeans
from sklearn.decomposition import NMF
from pysptools import eea
import pysptools.abundance_maps as amp
from pysptools.eea import nfindr

# finally import pycroscopy:
import pycroscopy as px

#####################################################################################
# The Data
# ========
#
# In this example, we will work on **Infrared (IR) Spectra** data obtained from an Anasys Instruments Nano IR
# as one of the simplest examples of data. This dataset contains a single IR spectra collected at each
# position on a single line of spatial points. Thus, this two dimensional dataset has one position dimension
# (lets say X) and one spectral dimension (wavelength).
#
# In the event that the spectra were collected on a 2D grid of spatial locations (two spatial dimensions -
# X, Y), the resultant three dimensional dataset (X, Y, wavelength) would need to be reshaped to a two
# dimensional dataset of (position, wavelength) since this is the standard format that is accepted by
# all statistical analysis, machine learning, spectral unmixing algorithms. The same reshaping of data would
# need to be performed if there are more than two spectroscopic dimensions.
#
# Working with the specific Nano IR dataset:
# ==========================================
# We will begin by downloading the data file from Github, followed by reshaping and decimation of the dataset

81
# Download the example dataset from GitHub and open it with pycroscopy.
data_file_path = 'temp.h5'
url = 'https://raw.githubusercontent.com/pycroscopy/pycroscopy/master/data/BELine_0004.h5'

# Remove any stale copy left over from a previous run so that wget does not
# fail or silently reuse a partial/outdated download.
if os.path.exists(data_file_path):
    os.remove(data_file_path)

# download the data file from Github:
_ = wget.download(url, data_file_path, bar=None)

# Open the file through pycroscopy's HDF5 translator wrapper and keep a
# handle to the underlying h5py File object for direct access.
hdf = px.ioHDF5(data_file_path)
h5_file = hdf.file
Unknown's avatar
Unknown committed
88

89
90
91
92
# Print the hierarchical layout of the HDF5 file so the reader can see
# exactly which groups / datasets the script will pull from.
print('Contents of data file:')
print('----------------------')
px.hdf_utils.print_tree(h5_file)
print('----------------------')

h5_meas_grp = h5_file['Measurement_000']

# Pull the spatial grid dimensions stored as attributes on the measurement group:
num_rows = px.hdf_utils.get_attr(h5_meas_grp, 'grid_num_rows')
num_cols = px.hdf_utils.get_attr(h5_meas_grp, 'grid_num_cols')

# Reference to the main (position x spectral) dataset:
h5_main = h5_meas_grp['Channel_000/Raw_Data']

# X axis for all plots below: the spectroscopic frequency vector, scaled to kHz.
h5_spec_vals = px.hdf_utils.getAuxData(h5_main, 'Spectroscopic_Values')[-1]
freq_vec = np.squeeze(h5_spec_vals.value) * 1E-3

print('Data currently of shape:', h5_main.shape)

# Axis labels shared by every plot in this example:
x_label = 'Frequency (kHz)'
y_label = 'Amplitude (a.u.)'
Unknown's avatar
Unknown committed
111
112

#####################################################################################
# Visualize the Amplitude Data
# ============================
#
# No tick labels or axis labels are hard-coded here: everything needed to
# annotate the plot is read from the H5 file itself.

px.viz.be_viz_utils.jupyter_visualize_be_spectrograms(h5_main)
Unknown's avatar
Unknown committed
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134

#####################################################################################
# 1. Singular Value Decomposition (SVD)
# =====================================
#
# SVD is an eigenvector decomposition that is defined statistically, and therefore typically produces
# non-physical eigenvectors. Consequently, the interpretation of eigenvectors and abundance maps from
# SVD requires care and caution in interpretation. Nonetheless, it is a good method for quickly
# visualizing the major trends in the dataset since the resultant eigenvectors are sorted in descending
# order of variance or importance. Furthermore, SVD is also very well suited for data cleaning through
# the reconstruction of the dataset using only the first N (most significant) components.
#
# SVD results in three matrices:
# * V - Eigenvectors sorted by variance in descending order
# * U - corresponding abundance maps
# * S - Variance or importance of each of these components
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
#
# Advantage of pycroscopy:
# ------------------------
# Notice that we are working with a complex valued dataset. Passing the complex values as is to SVD would result in
# complex valued eigenvectors / endmembers as well as abundance maps. Complex valued abundance maps are not physical.
# Thus, one would need to restructure the data such that it is real-valued only.
#
# One solution is to stack the real value followed by the magnitude of the imaginary component before passing to SVD.
# After SVD, the real-valued eigenvectors would need to be treated as the concatenation of the real and imaginary
# components. So, the eigenvectors would need to be restructured to get back the complex valued eigenvectors.
#
# **Pycroscopy handles all these data transformations (both for the source dataset and the eigenvectors)
# automatically.**  In general, pycroscopy handles compound / complex valued datasets everywhere possible
#
# Furthermore, while it is not discussed in this example, pycroscopy also writes back the results from SVD back to
# the same source h5 file including all relevant links to the source dataset and other ancillary datasets

# Run SVD on the main dataset, keeping the 256 most significant components.
# Results are written back into the same h5 file; a group handle is returned.
h5_svd_group = px.doSVD(h5_main, num_comps=256)

h5_u = h5_svd_group['U']  # abundance maps
h5_v = h5_svd_group['V']  # eigenvectors
h5_s = h5_svd_group['S']  # variance per component

# The two spatial dimensions (x, y) were collapsed into one position axis,
# so restore the (rows, cols) grid before plotting the abundance maps:
abun_maps = np.reshape(h5_u[:, :25], (num_rows, num_cols, -1))

# Scree plot — variance / statistical importance of each component:
px.plot_utils.plotScree(h5_s, title='Note the exponential drop of variance with number of components')

# Amplitude and phase of the nine most significant eigenvectors:
first_evecs = h5_v[:9, :]
px.plot_utils.plot_loops(freq_vec, np.abs(first_evecs), x_label=x_label, y_label=y_label, plots_on_side=3,
                         subtitles='Component', title='SVD Eigenvectors (Amplitude)', evenly_spaced=False)
px.plot_utils.plot_loops(freq_vec, np.angle(first_evecs), x_label=x_label, y_label='Phase (rad)', plots_on_side=3,
                         subtitles='Component', title='SVD Eigenvectors (Phase)', evenly_spaced=False)

# Spatial abundance maps for the first nine components:
px.plot_utils.plot_map_stack(abun_maps, num_comps=9, heading='SVD Abundance Maps',
                             color_bar_mode='single', cmap='inferno')
Unknown's avatar
Unknown committed
175
176
177
178
179
180
181
182
183
184
185
186
187

#####################################################################################
# 2. KMeans Clustering
# ====================
#
# KMeans clustering is a quick and easy method to determine the types of spectral responses present in the
# data. It is not a decomposition method, but a basic clustering method. The user inputs the number of
# clusters (sets) to partition the data into. The algorithm proceeds to find the optimal labeling
# (ie., assignment of each spectra as belonging to the k<sup>th</sup> set) such that the within-cluster
# sum of squares is minimized.
#
# Set the number of clusters below

188
num_clusters = 4

# Configure and run KMeans on the main dataset; pycroscopy writes the
# results (labels + mean cluster responses) back into the h5 file.
estimators = px.Cluster(h5_main, 'KMeans', n_clusters=num_clusters)
h5_kmeans_grp = estimators.do_cluster(h5_main)

# Handles to the two result datasets:
h5_kmeans_labels = h5_kmeans_grp['Labels']
h5_kmeans_mean_resp = h5_kmeans_grp['Mean_Response']

# Standard visualization for a clustering result group:
px.plot_utils.plot_cluster_h5_group(h5_kmeans_grp)
Unknown's avatar
Unknown committed
196
197
198
199
200
201
202
203
204
205
206
207
208

#####################################################################################
# 3. Non-negative Matrix Factorization (NMF)
# ===========================================
#
# NMF, or non-negative matrix factorization, is a method that is useful towards unmixing of spectral
# data. It only works on data with positive real values. It operates by approximate determination of
# factors (matrices) W and H, given a matrix V, as shown below
#
# .. image:: https://upload.wikimedia.org/wikipedia/commons/f/f9/NMF.png

num_comps = 4

# NMF only accepts non-negative real values, so take the magnitude of the
# complex-valued dataset:
data_mat = np.abs(h5_main)

# Factorize with a fixed random seed for reproducibility:
model = NMF(n_components=num_comps, init='random', random_state=0)
model.fit(data_mat)

# Plot the learned components against the frequency axis:
fig, axis = plt.subplots(figsize=(5.5, 5))
px.plot_utils.plot_line_family(axis, freq_vec, model.components_, label_prefix='NMF Component #')
axis.set_xlabel(x_label, fontsize=12)
axis.set_ylabel(y_label, fontsize=12)
axis.set_title('NMF Components', fontsize=14)
axis.legend(bbox_to_anchor=[1.0, 1.0], fontsize=12)
Unknown's avatar
Unknown committed
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252

#####################################################################################
# 4. NFINDR
# =========
#
# NFINDR is a geometric decomposition technique that can aid in determination of constituent spectra in data.
# The basic idea is as follows. Assume that at any point *x*, the spectra measured *A(w,x)* is a
# linear superposition of *k* 'pure' spectra, i.e.
#
# *A(w,x)* = c\ :sub:`0`\ (x)a\ :sub:`0` + c\ :sub:`1`\ (x)a\ :sub:`1` + ... + c\ :sub:`k`\ (x)a\ :sub:`k`
#
# In this case, our task consists of first determining the pure spectra {a\ :sub:`0`\ ,...,a\ :sub:`k`\ },
# and then determining the coefficients {c\ :sub:`0`\ ,...,c\ :sub:`k`\ }. NFINDR determines the 'pure'
# spectra by first projecting the data into a low-dimensional sub-space (typically using PCA), and then
# taking the convex hull of the points in this space. Then, points are picked at random along the convex
# hull and the volume of the simplex that the points form is determined. If (k+1) pure spectra are needed,
# the data is reduced to (k) dimensions for this purpose. The points that maximize the volume of the
# simplex are taken as the most representative pure spectra available in the dataset. One way to think of
# this is that any spectra that lie within the given volume can be represented as a superposition of these
# constituent spectra; thus maximizing this volume allows the purest spectra to be determined.
#
# The second task is to determine the coefficients. This is done using the fully constrained least squares
# optimization, and involves the sum-to-one constraint, to allow quantitative comparisons to be made.
# More information can be found in the paper below:
#
# `Winter, Michael E. "N-FINDR: An algorithm for fast autonomous spectral end-member determination in
# hyperspectral data." SPIE's International Symposium on Optical Science, Engineering, and Instrumentation.
# International Society for Optics and Photonics, 1999.
# <http://proceedings.spiedigitallibrary.org/proceeding.aspx?articleid=994814>`_)

num_comps = 4

# NFINDR works on real-valued spectra, so use the amplitude of the complex data:
data_mat = np.abs(h5_main)

# Determine the end-members (pure spectra). NFINDR returns a tuple whose
# first element is the end-member matrix.
nfindr_results = eea.nfindr.NFINDR(data_mat, num_comps)
end_members = nfindr_results[0]

# Plot the extracted end-members:
fig, axis = plt.subplots(figsize=(5.5, 5))
px.plot_utils.plot_line_family(axis, freq_vec, end_members, label_prefix='NFINDR endmember #')
axis.set_title('NFINDR Endmembers', fontsize=14)
axis.set_xlabel(x_label, fontsize=12)
axis.set_ylabel(y_label, fontsize=12)
axis.legend(bbox_to_anchor=[1.0, 1.0], fontsize=12)

# Solve for the abundances with fully constrained least squares (includes
# the sum-to-one constraint, enabling quantitative comparison):
fcls = amp.FCLS()
amap = fcls.map(data_mat[np.newaxis, :, :], end_members)

# Reshape the abundances back onto the (rows, cols) spatial grid and plot:
amap = np.reshape(np.squeeze(amap), (num_rows, num_cols, -1))
px.plot_utils.plot_map_stack(amap, heading='NFINDR Abundance maps', cmap=plt.cm.inferno,
                             color_bar_mode='single')
Unknown's avatar
Unknown committed
276
277
278
279
280

#####################################################################################

# Release the HDF5 handle and remove the downloaded temporary file.
h5_file.close()
os.remove(data_file_path)