Commit c52ab49c authored by Martyn Gigg's avatar Martyn Gigg
Browse files

Create a more flexible interface for getStatistics.

It now defines allows a user to select the statistic type and only
calculate it if reqested.
Refs #13366
parent a561826c
......@@ -103,8 +103,8 @@ void CalculateZscore::exec() {
MantidVec &vecY = outWS->dataY(i);
MantidVec &vecE = outWS->dataE(i);
vector<double> yzscores = getZscore(inpY, false);
vector<double> ezscores = getZscore(inpE, false);
vector<double> yzscores = getZscore(inpY);
vector<double> ezscores = getZscore(inpE);
vecX = inpX;
vecY = yzscores;
......
#ifndef MANTID_KERNEL_STATISTICS_H_
#define MANTID_KERNEL_STATISTICS_H_
/**
Copyright &copy; 2010-2012 ISIS Rutherford Appleton Laboratory, NScD Oak
Ridge National Laboratory & European Spallation Source
This file is part of Mantid.
Mantid is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or
(at your option) any later version.
Mantid is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
File change history is stored at: <https://github.com/mantidproject/mantid>.
Code Documentation is available at: <http://doxygen.mantidproject.org>
*/
#include "MantidKernel/DllConfig.h"
#include <vector>
......@@ -24,26 +45,6 @@ enum StatisticType {
/**
Simple struct to store statistics.
Copyright &copy; 2010-2012 ISIS Rutherford Appleton Laboratory, NScD Oak
Ridge National Laboratory & European Spallation Source
This file is part of Mantid.
Mantid is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or
(at your option) any later version.
Mantid is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
File change history is stored at: <https://github.com/mantidproject/mantid>.
Code Documentation is available at: <http://doxygen.mantidproject.org>
*/
struct Statistics {
/// Minimum value
......@@ -58,6 +59,18 @@ struct Statistics {
double standard_deviation;
};
/// Controls the computation of statisical data
struct StatOptions {
enum Flag {
SortedData = 1, // is the data sorted?
Mean = 2, // calculate the mean
UncorrectedStdDev = 4, // calculate the s.d. using N dofs
CorrectedStdDev = 8, // calculate the s.d. using N-1 dofs
Median = 16, // calculate the median
AllStats = (Mean | UncorrectedStdDev | Median)
};
};
/** R factor for powder data analysis
*/
struct Rfactor {
......@@ -76,11 +89,10 @@ struct Rfactor {
/// Return a statistics object for the given data set
template <typename TYPE>
Statistics getStatistics(const std::vector<TYPE> &data,
const bool sorted = false);
const unsigned int flags = StatOptions::AllStats);
/// Return the Z score values for a dataset
template <typename TYPE>
std::vector<double> getZscore(const std::vector<TYPE> &data,
const bool sorted = false);
std::vector<double> getZscore(const std::vector<TYPE> &data);
/// Return the modified Z score values for a dataset
template <typename TYPE>
std::vector<double> getModifiedZscore(const std::vector<TYPE> &data,
......
......@@ -11,7 +11,6 @@
#include <stdexcept>
#include <functional>
namespace Mantid {
namespace Kernel {
......@@ -87,14 +86,13 @@ double getMedian(const vector<TYPE> &data, const size_t num_data,
* put it in a single function.
*/
template <typename TYPE>
std::vector<double> getZscore(const vector<TYPE> &data, const bool sorted) {
std::vector<double> getZscore(const vector<TYPE> &data) {
if (data.size() < 3) {
std::vector<double> Zscore(data.size(), 0.);
return Zscore;
}
std::vector<double> Zscore;
Statistics stats = getStatistics(data, sorted);
Statistics stats = getStatistics(data);
if (stats.standard_deviation == 0.) {
std::vector<double> Zscore(data.size(), 0.);
return Zscore;
......@@ -145,47 +143,55 @@ std::vector<double> getModifiedZscore(const vector<TYPE> &data,
/**
* Determine the statistics for a vector of data. If it is sorted then let the
* function know so it won't make a copy of the data for determining the median.
* @param data Data points whose statistics are to be evaluated
* @param flags A set of flags to control the computation of the stats
*/
template <typename TYPE>
Statistics getStatistics(const vector<TYPE> &data, const bool sorted) {
Statistics getStatistics(const vector<TYPE> &data, const unsigned int flags) {
Statistics stats = getNanStatistics();
size_t num_data = data.size(); // cache since it is frequently used
if (num_data == 0) { // don't do anything
if (num_data == 0) { // don't do anything
return stats;
}
// calculate the mean
const TYPE sum = std::accumulate(data.begin(), data.end(),
static_cast<TYPE>(0), std::plus<TYPE>());
stats.mean = static_cast<double>(sum) / (static_cast<double>(num_data));
// calculate the standard deviation, min, max
stats.minimum = stats.mean;
stats.maximum = stats.mean;
double stddev = 0.;
typename vector<TYPE>::const_iterator it = data.begin();
for (; it != data.end(); ++it) {
double temp = static_cast<double>(*it);
stddev += ((temp - stats.mean) * (temp - stats.mean));
if (temp > stats.maximum)
stats.maximum = temp;
if (temp < stats.minimum)
stats.minimum = temp;
// calculate the mean if this or the stddev is requested
const bool stddev = ((flags & StatOptions::UncorrectedStdDev) ||
(flags & StatOptions::CorrectedStdDev));
if ((flags & StatOptions::Mean) || stddev) {
const TYPE sum = std::accumulate(data.begin(), data.end(),
static_cast<TYPE>(0), std::plus<TYPE>());
stats.mean = static_cast<double>(sum) / (static_cast<double>(num_data));
if (stddev) {
// calculate the standard deviation, min, max
stats.minimum = stats.mean;
stats.maximum = stats.mean;
double stddev = 0.;
typename vector<TYPE>::const_iterator it = data.begin();
for (; it != data.end(); ++it) {
double temp = static_cast<double>(*it);
stddev += ((temp - stats.mean) * (temp - stats.mean));
if (temp > stats.maximum)
stats.maximum = temp;
if (temp < stats.minimum)
stats.minimum = temp;
}
size_t ndofs =
(flags & StatOptions::CorrectedStdDev) ? num_data - 1 : num_data;
stats.standard_deviation = sqrt(stddev / (static_cast<double>(ndofs)));
}
}
// calculate the median if requested
if (flags & StatOptions::Median) {
stats.median = getMedian(data, num_data, flags & StatOptions::SortedData);
}
stats.standard_deviation = sqrt(stddev / (static_cast<double>(num_data)));
// calculate the median
stats.median = getMedian(data, num_data, sorted);
return stats;
}
/// Getting statistics of a string array should just give a bunch of NaNs
template <>
DLLExport Statistics
getStatistics<string>(const vector<string> &data, const bool sorted) {
UNUSED_ARG(sorted);
getStatistics<string>(const vector<string> &data, const unsigned int flags) {
UNUSED_ARG(flags);
UNUSED_ARG(data);
return getNanStatistics();
}
......@@ -193,8 +199,8 @@ DLLExport Statistics
/// Getting statistics of a boolean array should just give a bunch of NaNs
template <>
DLLExport Statistics
getStatistics<bool>(const vector<bool> &data, const bool sorted) {
UNUSED_ARG(sorted);
getStatistics<bool>(const vector<bool> &data, const unsigned int flags) {
UNUSED_ARG(flags);
UNUSED_ARG(data);
return getNanStatistics();
}
......@@ -300,7 +306,8 @@ std::vector<double> getMomentsAboutOrigin(const std::vector<TYPE> &x,
numPoints = x.size() - 1;
// densities are calculated using Newton's method for numerical integration
// as backwards as it sounds, the outer loop should be the points rather than
// as backwards as it sounds, the outer loop should be the points rather
// than
// the moments
for (size_t j = 0; j < numPoints; ++j) {
// reduce item lookup - and central x for histogram
......@@ -359,7 +366,8 @@ std::vector<double> getMomentsAboutMean(const std::vector<TYPE> &x,
numPoints = x.size() - 1;
// densities are calculated using Newton's method for numerical integration
// as backwards as it sounds, the outer loop should be the points rather than
// as backwards as it sounds, the outer loop should be the points rather
// than
// the moments
for (size_t j = 0; j < numPoints; ++j) {
// central x in histogram with a change of variables - and just change for
......@@ -391,9 +399,9 @@ std::vector<double> getMomentsAboutMean(const std::vector<TYPE> &x,
// --------------------------------
#define INSTANTIATE(TYPE) \
template MANTID_KERNEL_DLL Statistics \
getStatistics<TYPE>(const vector<TYPE> &, const bool); \
getStatistics<TYPE>(const vector<TYPE> &, const unsigned int); \
template MANTID_KERNEL_DLL std::vector<double> getZscore<TYPE>( \
const vector<TYPE> &, const bool); \
const vector<TYPE> &); \
template MANTID_KERNEL_DLL std::vector<double> getModifiedZscore<TYPE>( \
const vector<TYPE> &, const bool); \
template MANTID_KERNEL_DLL std::vector<double> getMomentsAboutOrigin<TYPE>( \
......
#ifndef STATISTICSTEST_H_
#define STATISTICSTEST_H_
#include "MantidKernel/Statistics.h"
#include <boost/math/special_functions/fpclassify.hpp>
#include <cxxtest/TestSuite.h>
#include <algorithm>
#include <cmath>
#include <vector>
#include <string>
#include "MantidKernel/Statistics.h"
using namespace Mantid::Kernel;
using std::string;
......@@ -14,7 +16,7 @@ using std::vector;
class StatisticsTest : public CxxTest::TestSuite
{
public:
void testDoubleOdd()
void test_Doubles_And_Default_Flags_Calculates_All_Stats()
{
vector<double> data;
data.push_back(17.2);
......@@ -30,8 +32,80 @@ public:
TS_ASSERT_EQUALS(stats.minimum, 12.6);
TS_ASSERT_EQUALS(stats.maximum, 18.3);
TS_ASSERT_EQUALS(stats.median, 17.2);
}
void test_Doubles_With_Sorted_Data()
{
vector<double> data;
data.push_back(17.2);
data.push_back(18.1);
data.push_back(16.5);
data.push_back(18.3);
data.push_back(12.6);
sort(data.begin(), data.end());
Statistics stats = getStatistics(data, (StatOptions::Median|StatOptions::SortedData));
TS_ASSERT(boost::math::isnan(stats.mean));
TS_ASSERT(boost::math::isnan(stats.standard_deviation));
TS_ASSERT(boost::math::isnan(stats.minimum));
TS_ASSERT(boost::math::isnan(stats.maximum));
TS_ASSERT_EQUALS(stats.median, 17.2);
}
void test_Unsorted_Data_With_Sorted_Flag_Gives_Expected_Incorrect_Result_For_Median()
{
vector<double> data;
data.push_back(17.2);
data.push_back(18.1);
data.push_back(16.5);
data.push_back(18.3);
data.push_back(12.6);
Statistics stats = getStatistics(data, (StatOptions::Median|StatOptions::SortedData));
TS_ASSERT(boost::math::isnan(stats.mean));
TS_ASSERT(boost::math::isnan(stats.standard_deviation));
TS_ASSERT(boost::math::isnan(stats.minimum));
TS_ASSERT(boost::math::isnan(stats.maximum));
TS_ASSERT_EQUALS(stats.median, 16.5);
}
void test_Doubles_With_Corrected_StdDev_Calculates_Mean()
{
vector<double> data;
data.push_back(17.2);
data.push_back(18.1);
data.push_back(16.5);
data.push_back(18.3);
data.push_back(12.6);
sort(data.begin(), data.end());
Statistics stats = getStatistics(data, StatOptions::CorrectedStdDev);
TS_ASSERT_EQUALS(stats.mean, 16.54);
TS_ASSERT_DELTA(stats.standard_deviation, 2.3179, 0.0001);
TS_ASSERT_EQUALS(stats.minimum, 12.6);
TS_ASSERT_EQUALS(stats.maximum, 18.3);
TS_ASSERT(boost::math::isnan(stats.median));
}
void test_Types_Can_Be_Disabled_With_Flags() {
vector<double> data;
data.push_back(17.2);
data.push_back(18.1);
data.push_back(16.5);
data.push_back(18.3);
data.push_back(12.6);
Statistics justMean = getStatistics(data, StatOptions::Mean);
TS_ASSERT_EQUALS(justMean.mean, 16.54);
TS_ASSERT(boost::math::isnan(justMean.standard_deviation));
TS_ASSERT(boost::math::isnan(justMean.minimum));
TS_ASSERT(boost::math::isnan(justMean.maximum));
TS_ASSERT(boost::math::isnan(justMean.median));
}
void testZscores()
{
vector<double> data;
......@@ -62,7 +136,6 @@ public:
std::vector<double> ZModscore = getModifiedZscore(data);
TS_ASSERT_DELTA(ZModscore[4], 1.2365, 0.0001);
TS_ASSERT_DELTA(ZModscore[6], 0.3372, 0.0001);
}
void testDoubleSingle()
......
......@@ -74,15 +74,21 @@ public:
/**
* Proxy for @see Mantid::Kernel::getStatistics so that it can accept numpy
* arrays,
* arrays
* @param data Input data
* @param sorted A boolean indicating whether the data is sorted
*/
Statistics getStatisticsNumpy(const numeric::array &data,
const bool sorted = false) {
using Mantid::Kernel::getStatistics;
using Mantid::Kernel::StatOptions;
using Converters::NDArrayToVector;
if (isFloatArray(data.ptr())) {
return getStatistics(NDArrayToVector<double>(data)(), sorted);
unsigned int flags = StatOptions::AllStats;
if (sorted)
flags |= StatOptions::SortedData;
return getStatistics(NDArrayToVector<double>(data)(), flags);
} else {
throw UnknownDataType();
}
......@@ -93,27 +99,18 @@ BOOST_PYTHON_FUNCTION_OVERLOADS(getStatisticsOverloads, getStatisticsNumpy, 1,
//============================ Z score
//============================================
// Function pointer to real implementation of Zscore functions
typedef std::vector<double>(*ZScoreFunction)(const std::vector<double> &data,
const bool sorted);
/**
* The implementation for getMomentsAboutOrigin & getMomentsAboutOriginMean for
* using
* numpy arrays are identical. This encapsulates that behaviour an additional
* parameter for
* specifying the actual function called along.
* @param momentsFunc A function pointer to the required moments function
* The implementation for getZscoreNumpy for using numpy arrays
* @param zscoreFunc A function pointer to the required moments function
* @param data Numpy array of data
* @param sorted True if the input data is already sorted
*/
std::vector<double> getZScoreNumpyImpl(ZScoreFunction zscoreFunc,
const numeric::array &data,
const bool sorted) {
std::vector<double> getZscoreNumpy(const numeric::array &data) {
using Converters::NDArrayToVector;
using Mantid::Kernel::getZscore;
if (isFloatArray(data.ptr())) {
return zscoreFunc(NDArrayToVector<double>(data)(), sorted);
return getZscore(NDArrayToVector<double>(data)());
} else {
throw UnknownDataType();
}
......@@ -121,14 +118,16 @@ std::vector<double> getZScoreNumpyImpl(ZScoreFunction zscoreFunc,
/**
* Proxy for @see Mantid::Kernel::getZscore so that it can accept numpy arrays,
* @param data The input array
* @param sorted True if the data is sorted (deprecated)
*/
std::vector<double> getZscoreNumpy(const numeric::array &data,
const bool sorted = false) {
using Mantid::Kernel::getZscore;
return getZScoreNumpyImpl(&getZscore, data, sorted);
std::vector<double> getZscoreNumpyDeprecated(const numeric::array &data,
const bool sorted) {
UNUSED_ARG(sorted);
PyErr_Warn(PyExc_DeprecationWarning,
"getZScore no longer requires the second sorted argument.");
return getZscoreNumpy(data);
}
// Define an overload to handle the default argument
BOOST_PYTHON_FUNCTION_OVERLOADS(getZscoreOverloads, getZscoreNumpy, 1, 2)
/**
* Proxy for @see Mantid::Kernel::getModifiedZscore so that it can accept numpy
......@@ -137,8 +136,15 @@ BOOST_PYTHON_FUNCTION_OVERLOADS(getZscoreOverloads, getZscoreNumpy, 1, 2)
std::vector<double> getModifiedZscoreNumpy(const numeric::array &data,
const bool sorted = false) {
using Mantid::Kernel::getModifiedZscore;
return getZScoreNumpyImpl(&getModifiedZscore, data, sorted);
using Converters::NDArrayToVector;
if (isFloatArray(data.ptr())) {
return getModifiedZscore(NDArrayToVector<double>(data)(), sorted);
} else {
throw UnknownDataType();
}
}
// Define an overload to handle the default argument
BOOST_PYTHON_FUNCTION_OVERLOADS(getModifiedZscoreOverloads,
getModifiedZscoreNumpy, 1, 2)
......@@ -231,9 +237,11 @@ void export_Statistics() {
"Determine the statistics for an array of data"))
.staticmethod("getStatistics")
.def("getZscore", &getZscoreNumpy,
getZscoreOverloads(args("data", "sorted"),
"Determine the Z score for an array of data"))
.def("getZscore", &getZscoreNumpy, args("data"),
"Determine the Z score for an array of data")
.def("getZscore", &getZscoreNumpyDeprecated, args("data", "sorted"),
"Determine the Z score for an array of "
"data (deprecated sorted argument)")
.staticmethod("getZscore")
.def("getModifiedZscore", &getModifiedZscoreNumpy,
......@@ -256,8 +264,8 @@ void export_Statistics() {
[ReturnNumpyArray()])
.staticmethod("getMomentsAboutMean");
//------------------------------ Statistics values
//-----------------------------------------------------
//------------------------------ Statistics
//values--------------------------------
// Want this in the same scope as above so must be here
class_<Statistics>("Statistics")
.add_property("minimum", &Statistics::minimum,
......
......@@ -48,6 +48,12 @@ class StatisticsTest(unittest.TestCase):
self.assertAlmostEqual(1.23658, modZ[4], places = 4)
self.assertAlmostEqual(0.33725, modZ[6], places = 4)
# Test the sorted argument still works. Remove this when the function is removed
# sorted=True only ever affected the order
zscore = Stats.getZscore(arr, sorted=True)
self.assertAlmostEqual(1.63977, zscore[4], places = 4)
self.assertAlmostEqual(0.32235, zscore[6], places = 4)
def test_getMoments(self):
mean = 5.
sigma = 4.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment