Newer
Older
// Includes
#include "MantidKernel/Statistics.h"
Peterson, Peter
committed
#include <algorithm>
#include <functional>
#include <limits>
Peterson, Peter
committed
#include <numeric>
#include <string>
Peterson, Peter
committed
namespace Mantid
{
Peterson, Peter
committed
{
using std::string;
using std::vector;
/**
* Generate a Statistics object where all of the values are NaN. This is a good initial default.
*/
Statistics getNanStatistics()
{
double nan = std::numeric_limits<double>::quiet_NaN();
Statistics stats;
stats.minimum = nan;
stats.maximum = nan;
stats.mean = nan;
stats.median = nan;
stats.standard_deviation = nan;
return stats;
}
/**
* There are enough special cases in determining the median where it useful to
* put it in a single function.
*/
template<typename TYPE>
double getMedian(const vector<TYPE>& data, const size_t num_data, const bool sorted)
{
if (num_data == 1)
return static_cast<double> (*(data.begin()));
bool is_even = ((num_data % 2) == 0);
if (is_even)
{
double left = 0.0;
double right = 0.0;
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
if (sorted)
{
// Just get the centre two elements.
left = static_cast<double> (*(data.begin() + num_data / 2 - 1));
right = static_cast<double> (*(data.begin() + num_data / 2));
}
else
{
// If the data is not sorted, make a copy we can mess with
vector<TYPE> temp(data.begin(), data.end());
// Get what the centre two elements should be...
std::nth_element(temp.begin(), temp.begin() + num_data / 2 - 1, temp.end());
left = static_cast<double> (*(temp.begin() + num_data / 2 - 1));
std::nth_element(temp.begin(), temp.begin() + num_data / 2, temp.end());
right = static_cast<double> (*(temp.begin() + num_data / 2));
}
// return the average
return (left + right) / 2.;
}
else
// Odd number
{
if (sorted)
{
// If sorted and odd, just return the centre value
return static_cast<double> (*(data.begin() + num_data / 2));
}
else
{
// If the data is not sorted, make a copy we can mess with
vector<TYPE> temp(data.begin(), data.end());
// Make sure the centre value is in the correct position
std::nth_element(temp.begin(), temp.begin() + num_data / 2, temp.end());
// Now return the centre value
return static_cast<double> (*(temp.begin() + num_data / 2));
}
}
}
/**
* There are enough special cases in determining the Z score where it useful to
* put it in a single function.
*/
template<typename TYPE>
std::vector<double> getZscore(const vector<TYPE>& data, const bool sorted)
{
if (data.size() < 3)
{
std::vector<double>Zscore(data.size(),0.);
return Zscore;
}
std::vector<double> Zscore;
Statistics stats = getStatistics(data, sorted);
if(stats.standard_deviation == 0.)
{
std::vector<double>Zscore(data.size(),0.);
return Zscore;
}
typename vector<TYPE>::const_iterator it = data.begin();
for (; it != data.end(); ++it)
{
double tmp = static_cast<double> (*it);
Zscore.push_back(fabs((tmp - stats.mean) / stats.standard_deviation));
}
return Zscore;
}
/**
* There are enough special cases in determining the modified Z score where it useful to
* put it in a single function.
*/
template<typename TYPE>
std::vector<double> getModifiedZscore(const vector<TYPE>& data, const bool sorted)
{
if (data.size() < 3)
{
std::vector<double>Zscore(data.size(),0.);
return Zscore;
}
std::vector<double>MADvec;
double tmp;
size_t num_data = data.size(); // cache since it is frequently used
double median = getMedian(data, num_data, sorted);
typename vector<TYPE>::const_iterator it = data.begin();
for (; it != data.end(); ++it)
{
tmp = static_cast<double> (*it);
MADvec.push_back(fabs(tmp - median));
}
double MAD = getMedian(MADvec, num_data, sorted);
std::vector<double>Zscore(data.size(),0.);
return Zscore;
std::vector<double> Zscore;
it = data.begin();
for (; it != data.end(); ++it)
{
tmp = static_cast<double> (*it);
Zscore.push_back(0.6745*fabs((tmp - median) / MAD));
}
return Zscore;
}
/**
* Determine the statistics for a vector of data. If it is sorted then let the
* function know so it won't make a copy of the data for determining the median.
*/
template<typename TYPE>
Statistics getStatistics(const vector<TYPE>& data, const bool sorted)
{
Statistics stats = getNanStatistics();
size_t num_data = data.size(); // cache since it is frequently used
if (num_data == 0)
{ // don't do anything
return stats;
}
// calculate the mean
const TYPE sum = std::accumulate(data.begin(), data.end(), static_cast<TYPE>(0), std::plus<TYPE>());
stats.mean = static_cast<double>(sum)/(static_cast<double>(num_data));
// calculate the standard deviation, min, max
stats.minimum = stats.mean;
stats.maximum = stats.mean;
double stddev = 0.;
typename vector<TYPE>::const_iterator it = data.begin();
for (; it != data.end(); ++it)
{
double temp = static_cast<double> (*it);
stddev += ((temp - stats.mean) * (temp - stats.mean));
if (temp > stats.maximum)
stats.maximum = temp;
if (temp < stats.minimum)
stats.minimum = temp;
}
stats.standard_deviation = sqrt(stddev / (static_cast<double> (num_data)));
// calculate the median
stats.median = getMedian(data, num_data, sorted);
return stats;
}
/// Getting statistics of a string array should just give a bunch of NaNs
template<>
DLLExport Statistics getStatistics<string> (const vector<string>& data, const bool sorted)
{
Gigg, Martyn Anthony
committed
UNUSED_ARG(sorted);
UNUSED_ARG(data);
/// Getting statistics of a boolean array should just give a bunch of NaNs
template<>
DLLExport Statistics getStatistics<bool> (const vector<bool>& data, const bool sorted)
{
UNUSED_ARG(sorted);
UNUSED_ARG(data);
return getNanStatistics();
}
/** Return the Rwp of a diffraction pattern data
* @param obsI :: array of observed intensity values
* @param calI :: array of calculated intensity values;
* @param obsE :: array of error of the observed data;
* @return :: RFactor including Rp and Rwp
Rfactor getRFactor(const std::vector<double>& obsI, const std::vector<double>& calI, const std::vector<double>& obsE)
{
// 1. Check
if (obsI.size() != calI.size() || obsI.size() != obsE.size())
{
std::stringstream errss;
errss << "GetRFactor() Input Error! Observed Intensity (" << obsI.size()
<< "), Calculated Intensity (" << calI.size() << ") and Observed Error ("
<< obsE.size() << ") have different number of elements.";
throw std::runtime_error(errss.str());
}
if (obsI.size() == 0)
{
throw std::runtime_error("getRFactor(): the input arrays are empty.");
}
double sumnom = 0;
double sumdenom = 0;
double sumrpnom = 0;
double sumrpdenom = 0;
size_t numpts = obsI.size();
for (size_t i = 0; i < numpts; ++i)
{
double cal_i = calI[i];
double obs_i = obsI[i];
double sigma = obsE[i];
double weight = 1.0/(sigma*sigma);
double diff = obs_i - cal_i;
if (weight == weight && weight <= DBL_MAX)
{
// If weight is not NaN.
sumrpnom += fabs(diff);
sumrpdenom += fabs(obs_i);
double tempnom = weight*diff*diff;
double tempden = weight*obs_i*obs_i;
sumnom += tempnom;
sumdenom += tempden;
if (tempnom != tempnom || tempden != tempden)
{
std::cout << "***** Error! ****** Data indexed " << i << " is NaN. "
<< "i = " << i << ": cal = " << calI[i] << ", obs = " << obs_i
<< ", weight = " << weight << ". \n";
}
}
rfactor.Rwp = std::sqrt(sumnom/sumdenom);
if (rfactor.Rwp != rfactor.Rwp)
std::cout << "Rwp is NaN. Denominator = " << sumnom << "; Nominator = " << sumdenom << ". \n";
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
/**
* This will calculate the first n-moments (inclusive) about the origin. For example
* if maxMoment=2 then this will return 3 values: 0th (total weight), 1st (mean), 2nd (deviation).
*
* @param x The independent values
* @param y The dependent values
* @param maxMoment The number of moments to calculate
* @returns The first n-moments.
*/
template<typename TYPE>
std::vector<double> getMomentsAboutOrigin(const std::vector<TYPE>& x, const std::vector<TYPE>& y, const int maxMoment)
{
// densities have the same number of x and y
bool isDensity(x.size() == y.size());
// if it isn't a density then check for histogram
if ((!isDensity) && (x.size() != y.size()+1))
{
std::stringstream msg;
msg << "length of x (" << x.size() << ") and y (" << y.size() << ")do not match";
throw std::out_of_range(msg.str());
}
// initialize a result vector with all zeros
std::vector<double> result(maxMoment+1, 0.);
// cache the maximum index
size_t numPoints = y.size();
if (isDensity)
numPoints = x.size()-1;
// densities are calculated using Newton's method for numerical integration
// as backwards as it sounds, the outer loop should be the points rather than the moments
for (size_t j = 0; j < numPoints; ++j)
// reduce item lookup - and central x for histogram
const double xVal = .5*static_cast<double>(x[j]+x[j+1]);
// this variable will be (x^n)*y
double temp = static_cast<double>(y[j]); // correct for histogram
if (isDensity)
{
const double xDelta = static_cast<double>(x[j+1]-x[j]);
temp = .5*(temp + static_cast<double>(y[j+1]))*xDelta;
// accumulate the moments
result[0] += temp;
for (size_t i = 1; i < result.size(); ++i)
temp *= xVal;
result[i] += temp;
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
}
}
return result;
}
/**
* This will calculate the first n-moments (inclusive) about the mean (1st moment). For example
* if maxMoment=2 then this will return 3 values: 0th (total weight), 1st (mean), 2nd (deviation).
*
* @param x The independent values
* @param y The dependent values
* @param maxMoment The number of moments to calculate
* @returns The first n-moments.
*/
template<typename TYPE>
std::vector<double> getMomentsAboutMean(const std::vector<TYPE>& x, const std::vector<TYPE>& y, const int maxMoment)
{
// get the zeroth (integrated value) and first moment (mean)
std::vector<double> momentsAboutOrigin = getMomentsAboutOrigin(x, y, 1);
const double mean = momentsAboutOrigin[1];
// initialize a result vector with all zeros
std::vector<double> result(maxMoment+1, 0.);
result[0] = momentsAboutOrigin[0];
// escape early if we need to
if (maxMoment == 0)
return result;
// densities have the same number of x and y
bool isDensity(x.size() == y.size());
// cache the maximum index
size_t numPoints = y.size();
if (isDensity)
numPoints = x.size()-1;
// densities are calculated using Newton's method for numerical integration
// as backwards as it sounds, the outer loop should be the points rather than the moments
for (size_t j = 0; j < numPoints; ++j)
// central x in histogram with a change of variables - and just change for density
const double xVal = .5*static_cast<double>(x[j]+x[j+1]) - mean; // change of variables
// this variable will be (x^n)*y
double temp;
if (isDensity)
{
const double xDelta = static_cast<double>(x[j+1]-x[j]);
temp = xVal * .5*static_cast<double>(y[j]+y[j+1])*xDelta;
else
{
temp = xVal * static_cast<double>(y[j]);
}
// accumulate the moment
result[1] += temp;
for (size_t i = 2; i < result.size(); ++i)
temp *= xVal;
result[i] += temp;
}
}
return result;
}
// -------------------------- Macro to instantiation concrete types --------------------------------
#define INSTANTIATE(TYPE) \
template MANTID_KERNEL_DLL Statistics getStatistics<TYPE> (const vector<TYPE> &, const bool); \
template MANTID_KERNEL_DLL std::vector<double> getZscore<TYPE> (const vector<TYPE> &, const bool); \
template MANTID_KERNEL_DLL std::vector<double> getModifiedZscore<TYPE> (const vector<TYPE> &, const bool); \
template MANTID_KERNEL_DLL std::vector<double> getMomentsAboutOrigin<TYPE> (const std::vector<TYPE>& x, const std::vector<TYPE>& y, const int maxMoment); \
template MANTID_KERNEL_DLL std::vector<double> getMomentsAboutMean<TYPE> (const std::vector<TYPE>& x, const std::vector<TYPE>& y, const int maxMoment);
// --------------------------- Concrete instantiations ---------------------------------------------
INSTANTIATE(float);
INSTANTIATE(double);
INSTANTIATE(int);
INSTANTIATE(long);
INSTANTIATE(long long);
INSTANTIATE(unsigned int);
INSTANTIATE(unsigned long);
INSTANTIATE(unsigned long long);
Peterson, Peter
committed
} // namespace Mantid