Statistics.cpp 14.1 KB
Newer Older
1
2
3
// Mantid Repository : https://github.com/mantidproject/mantid
//
// Copyright © 2018 ISIS Rutherford Appleton Laboratory UKRI,
4
5
//   NScD Oak Ridge National Laboratory, European Spallation Source,
//   Institut Laue - Langevin & CSNS, Institute of High Energy Physics, CAS
6
// SPDX - License - Identifier: GPL - 3.0 +
7
8
// Includes
#include "MantidKernel/Statistics.h"
9
#include "MantidKernel/Logger.h"
10

Hahn, Steven's avatar
Hahn, Steven committed
11
12
#include <boost/accumulators/accumulators.hpp>
#include <boost/accumulators/statistics/max.hpp>
LamarMoore's avatar
LamarMoore committed
13
14
#include <boost/accumulators/statistics/min.hpp>
#include <boost/accumulators/statistics/stats.hpp>
Hahn, Steven's avatar
Hahn, Steven committed
15
16
#include <boost/accumulators/statistics/variance.hpp>

17
18
19
20
21
#include <algorithm>
#include <cfloat>
#include <cmath>
#include <sstream>

22
namespace Mantid::Kernel {
23
namespace {
Sam Jenkins's avatar
Sam Jenkins committed
24
Logger logger("Statistics");
25
}
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50

using std::string;
using std::vector;

/**
 * Generate a Statistics object where all of the values are NaN. This is a good
 * initial default.
 */
Statistics getNanStatistics() {
  double nan = std::numeric_limits<double>::quiet_NaN();

  Statistics stats;
  stats.minimum = nan;
  stats.maximum = nan;
  stats.mean = nan;
  stats.median = nan;
  stats.standard_deviation = nan;

  return stats;
}

/**
 * There are enough special cases in determining the median where it useful to
 * put it in a single function.
 */
51
52
53
54
55
56
57
58
59
60
61
62
63
template <typename TYPE> double getMedian(const vector<TYPE> &data) {
  const size_t size = data.size();
  if (size == 1)
    return static_cast<double>(data[0]);

  const bool isSorted = std::is_sorted(data.cbegin(), data.cend());
  std::vector<TYPE> tmpSortedData;
  auto sortedDataRef = std::ref(data);
  if (!isSorted) {
    tmpSortedData = data;
    std::sort(tmpSortedData.begin(), tmpSortedData.end());
    sortedDataRef = std::ref(std::as_const(tmpSortedData));
  }
64

65
66
  const bool is_even = (size % 2 == 0);
  double retVal = 0;
67
  if (is_even) {
68
69
70
71
72
    const auto left = static_cast<double>(sortedDataRef.get()[size / 2 - 1]);
    const auto right = static_cast<double>(sortedDataRef.get()[size / 2]);
    retVal = (left + right) / 2;
  } else {
    retVal = static_cast<double>(sortedDataRef.get()[size / 2]);
73
  }
74
  return retVal;
Hahn, Steven's avatar
Hahn, Steven committed
75
}
Hahn, Steven's avatar
Hahn, Steven committed
76

77
78
79
80
/**
 * There are enough special cases in determining the Z score where it useful to
 * put it in a single function.
 */
Samuel Jones's avatar
Samuel Jones committed
81
template <typename TYPE> std::vector<double> getZscore(const vector<TYPE> &data) {
Gagik Vardanyan's avatar
Gagik Vardanyan committed
82
  std::vector<double> Zscore;
83
  if (data.size() < 3) {
Gagik Vardanyan's avatar
Gagik Vardanyan committed
84
    Zscore.resize(data.size(), 0.);
85
86
    return Zscore;
  }
87
  Statistics stats = getStatistics(data);
88
  if (stats.standard_deviation == 0.) {
Gagik Vardanyan's avatar
Gagik Vardanyan committed
89
    Zscore.resize(data.size(), 0.);
90
91
    return Zscore;
  }
Hahn, Steven's avatar
Hahn, Steven committed
92
  for (auto it = data.cbegin(); it != data.cend(); ++it) {
93
    auto tmp = static_cast<double>(*it);
94
    Zscore.emplace_back(fabs((stats.mean - tmp) / stats.standard_deviation));
95
96
97
  }
  return Zscore;
}
Lynch, Vickie's avatar
Lynch, Vickie committed
98
99
100
101
/**
 * There are enough special cases in determining the Z score where it useful to
 * put it in a single function.
 */
Samuel Jones's avatar
Samuel Jones committed
102
template <typename TYPE> std::vector<double> getWeightedZscore(const vector<TYPE> &data, const vector<TYPE> &weights) {
Gagik Vardanyan's avatar
Gagik Vardanyan committed
103
  std::vector<double> Zscore;
Lynch, Vickie's avatar
Lynch, Vickie committed
104
  if (data.size() < 3) {
Gagik Vardanyan's avatar
Gagik Vardanyan committed
105
    Zscore.resize(data.size(), 0.);
Lynch, Vickie's avatar
Lynch, Vickie committed
106
107
108
109
    return Zscore;
  }
  Statistics stats = getStatistics(data);
  if (stats.standard_deviation == 0.) {
Gagik Vardanyan's avatar
Gagik Vardanyan committed
110
    Zscore.resize(data.size(), 0.);
Lynch, Vickie's avatar
Lynch, Vickie committed
111
112
113
114
115
116
117
118
119
    return Zscore;
  }
  double sumWeights = 0.0;
  double sumWeightedData = 0.0;
  double weightedVariance = 0.0;
  for (size_t it = 0; it != data.size(); ++it) {
    sumWeights += static_cast<double>(weights[it]);
    sumWeightedData += static_cast<double>(weights[it] * data[it]);
  }
Lynch, Vickie's avatar
Lynch, Vickie committed
120
  double weightedMean = sumWeightedData / sumWeights;
Lynch, Vickie's avatar
Lynch, Vickie committed
121
  for (size_t it = 0; it != data.size(); ++it) {
Samuel Jones's avatar
Samuel Jones committed
122
123
    weightedVariance += std::pow(static_cast<double>(data[it]) - weightedMean, 2) *
                        std::pow(static_cast<double>(weights[it]) / sumWeights, 2);
Lynch, Vickie's avatar
Lynch, Vickie committed
124
125
  }
  for (auto it = data.cbegin(); it != data.cend(); ++it) {
Samuel Jones's avatar
Samuel Jones committed
126
    Zscore.emplace_back(fabs((static_cast<double>(*it) - weightedMean) / std::sqrt(weightedVariance)));
Lynch, Vickie's avatar
Lynch, Vickie committed
127
128
129
  }
  return Zscore;
}
130
131
132
133
134
/**
 * There are enough special cases in determining the modified Z score where it
 * useful to
 * put it in a single function.
 */
135
template <typename TYPE> std::vector<double> getModifiedZscore(const vector<TYPE> &data) {
136
137
138
139
140
141
  if (data.size() < 3) {
    std::vector<double> Zscore(data.size(), 0.);
    return Zscore;
  }
  std::vector<double> MADvec;
  double tmp;
142
  double median = getMedian(data);
Hahn, Steven's avatar
Hahn, Steven committed
143
  for (auto it = data.cbegin(); it != data.cend(); ++it) {
144
    tmp = static_cast<double>(*it);
145
    MADvec.emplace_back(fabs(tmp - median));
146
  }
147
  double MAD = getMedian(MADvec);
148
149
150
151
152
153
  if (MAD == 0.) {
    std::vector<double> Zscore(data.size(), 0.);
    return Zscore;
  }
  MADvec.clear();
  std::vector<double> Zscore;
Hahn, Steven's avatar
Hahn, Steven committed
154
  for (auto it = data.begin(); it != data.end(); ++it) {
155
    tmp = static_cast<double>(*it);
156
    Zscore.emplace_back(0.6745 * fabs((tmp - median) / MAD));
157
158
159
160
161
162
163
  }
  return Zscore;
}

/**
 * Determine the statistics for a vector of data. If it is sorted then let the
 * function know so it won't make a copy of the data for determining the median.
164
165
 * @param data Data points whose statistics are to be evaluated
 * @param flags A set of flags to control the computation of the stats
166
 */
Samuel Jones's avatar
Samuel Jones committed
167
template <typename TYPE> Statistics getStatistics(const vector<TYPE> &data, const unsigned int flags) {
Hahn, Steven's avatar
Hahn, Steven committed
168
  Statistics statistics = getNanStatistics();
169
  if (data.empty()) { // don't do anything
Hahn, Steven's avatar
Hahn, Steven committed
170
    return statistics;
171
  }
172
  // calculate the mean if this or the stddev is requested
Samuel Jones's avatar
Samuel Jones committed
173
  const bool stddev = ((flags & StatOptions::UncorrectedStdDev) || (flags & StatOptions::CorrectedStdDev));
Hahn, Steven's avatar
Hahn, Steven committed
174
175
  if (stddev) {
    using namespace boost::accumulators;
Lynch, Vickie's avatar
Lynch, Vickie committed
176
    accumulator_set<double, stats<tag::min, tag::max, tag::variance>> acc;
Hahn, Steven's avatar
Hahn, Steven committed
177
    for (auto &value : data) {
Hahn, Steven's avatar
Hahn, Steven committed
178
      acc(static_cast<double>(value));
179
    }
Hahn, Steven's avatar
Hahn, Steven committed
180
181
182
    statistics.minimum = min(acc);
    statistics.maximum = max(acc);
    statistics.mean = mean(acc);
Hahn, Steven's avatar
Hahn, Steven committed
183
184
185
    double var = variance(acc);

    if (flags & StatOptions::CorrectedStdDev) {
186
      auto ndofs = static_cast<double>(data.size());
Hahn, Steven's avatar
Hahn, Steven committed
187
188
      var *= ndofs / (ndofs - 1.0);
    }
Hahn, Steven's avatar
Hahn, Steven committed
189
    statistics.standard_deviation = std::sqrt(var);
Hahn, Steven's avatar
Hahn, Steven committed
190
191
192

  } else if (flags & StatOptions::Mean) {
    using namespace boost::accumulators;
Lynch, Vickie's avatar
Lynch, Vickie committed
193
    accumulator_set<double, stats<tag::mean>> acc;
Hahn, Steven's avatar
Hahn, Steven committed
194
    for (auto &value : data) {
Hahn, Steven's avatar
Hahn, Steven committed
195
      acc(static_cast<double>(value));
Hahn, Steven's avatar
Hahn, Steven committed
196
    }
Hahn, Steven's avatar
Hahn, Steven committed
197
    statistics.mean = mean(acc);
198
  }
Hahn, Steven's avatar
Hahn, Steven committed
199

200
201
  // calculate the median if requested
  if (flags & StatOptions::Median) {
202
    statistics.median = getMedian(data);
203
  }
Hahn, Steven's avatar
Hahn, Steven committed
204

Hahn, Steven's avatar
Hahn, Steven committed
205
  return statistics;
206
207
208
}

/// Getting statistics of a string array should just give a bunch of NaNs
Samuel Jones's avatar
Samuel Jones committed
209
template <> DLLExport Statistics getStatistics<string>(const vector<string> &data, const unsigned int flags) {
210
  UNUSED_ARG(flags);
211
212
213
214
215
  UNUSED_ARG(data);
  return getNanStatistics();
}

/// Getting statistics of a boolean array should just give a bunch of NaNs
Samuel Jones's avatar
Samuel Jones committed
216
template <> DLLExport Statistics getStatistics<bool>(const vector<bool> &data, const unsigned int flags) {
217
  UNUSED_ARG(flags);
218
219
220
221
222
  UNUSED_ARG(data);
  return getNanStatistics();
}

/** Return the Rwp of a diffraction pattern data
LamarMoore's avatar
LamarMoore committed
223
224
225
226
227
228
 * @param obsI :: array of observed intensity values
 * @param calI :: array of calculated intensity values;
 * @param obsE :: array of error of the observed data;
 * @return :: RFactor including Rp and Rwp
 *
 */
Samuel Jones's avatar
Samuel Jones committed
229
Rfactor getRFactor(const std::vector<double> &obsI, const std::vector<double> &calI, const std::vector<double> &obsE) {
230
231
232
  // 1. Check
  if (obsI.size() != calI.size() || obsI.size() != obsE.size()) {
    std::stringstream errss;
Samuel Jones's avatar
Samuel Jones committed
233
234
    errss << "GetRFactor() Input Error!  Observed Intensity (" << obsI.size() << "), Calculated Intensity ("
          << calI.size() << ") and Observed Error (" << obsE.size() << ") have different number of elements.";
235
236
    throw std::runtime_error(errss.str());
  }
237
  if (obsI.empty()) {
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
    throw std::runtime_error("getRFactor(): the input arrays are empty.");
  }

  double sumnom = 0;
  double sumdenom = 0;
  double sumrpnom = 0;
  double sumrpdenom = 0;

  size_t numpts = obsI.size();
  for (size_t i = 0; i < numpts; ++i) {
    double cal_i = calI[i];
    double obs_i = obsI[i];
    double sigma = obsE[i];
    double weight = 1.0 / (sigma * sigma);
    double diff = obs_i - cal_i;

    if (weight == weight && weight <= DBL_MAX) {
      // If weight is not NaN.
      sumrpnom += fabs(diff);
      sumrpdenom += fabs(obs_i);

      double tempnom = weight * diff * diff;
      double tempden = weight * obs_i * obs_i;

      sumnom += tempnom;
      sumdenom += tempden;

      if (tempnom != tempnom || tempden != tempden) {
Samuel Jones's avatar
Samuel Jones committed
266
267
        logger.error() << "***** Error! ****** Data indexed " << i << " is NaN. "
                       << "i = " << i << ": cal = " << calI[i] << ", obs = " << obs_i << ", weight = " << weight
Sam Jenkins's avatar
Sam Jenkins committed
268
                       << ". \n";
Campbell, Stuart's avatar
Campbell, Stuart committed
269
270
      }
    }
271
272
273
274
275
276
277
  }

  Rfactor rfactor(0., 0.);
  rfactor.Rp = (sumrpnom / sumrpdenom);
  rfactor.Rwp = std::sqrt(sumnom / sumdenom);

  if (rfactor.Rwp != rfactor.Rwp)
Samuel Jones's avatar
Samuel Jones committed
278
    logger.debug() << "Rwp is NaN.  Denominator = " << sumnom << "; Nominator = " << sumdenom << ". \n";
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294

  return rfactor;
}

/**
 * This will calculate the first n-moments (inclusive) about the origin. For
 *example
 * if maxMoment=2 then this will return 3 values: 0th (total weight), 1st
 *(mean), 2nd (deviation).
 *
 * @param x The independent values
 * @param y The dependent values
 * @param maxMoment The number of moments to calculate
 * @returns The first n-moments.
 */
template <typename TYPE>
Samuel Jones's avatar
Samuel Jones committed
295
std::vector<double> getMomentsAboutOrigin(const std::vector<TYPE> &x, const std::vector<TYPE> &y, const int maxMoment) {
296
297
298
299
300
301
  // densities have the same number of x and y
  bool isDensity(x.size() == y.size());

  // if it isn't a density then check for histogram
  if ((!isDensity) && (x.size() != y.size() + 1)) {
    std::stringstream msg;
Samuel Jones's avatar
Samuel Jones committed
302
    msg << "length of x (" << x.size() << ") and y (" << y.size() << ")do not match";
303
304
305
306
307
308
309
310
311
312
313
314
    throw std::out_of_range(msg.str());
  }

  // initialize a result vector with all zeros
  std::vector<double> result(maxMoment + 1, 0.);

  // cache the maximum index
  size_t numPoints = y.size();
  if (isDensity)
    numPoints = x.size() - 1;

  // densities are calculated using Newton's method for numerical integration
315
316
  // as backwards as it sounds, the outer loop should be the points rather
  // than
317
318
319
320
321
  // the moments
  for (size_t j = 0; j < numPoints; ++j) {
    // reduce item lookup - and central x for histogram
    const double xVal = .5 * static_cast<double>(x[j] + x[j + 1]);
    // this variable will be (x^n)*y
322
    auto temp = static_cast<double>(y[j]); // correct for histogram
323
    if (isDensity) {
324
      const auto xDelta = static_cast<double>(x[j + 1] - x[j]);
325
      temp = .5 * (temp + static_cast<double>(y[j + 1])) * xDelta;
Campbell, Stuart's avatar
Campbell, Stuart committed
326
327
    }

328
329
330
331
332
    // accumulate the moments
    result[0] += temp;
    for (size_t i = 1; i < result.size(); ++i) {
      temp *= xVal;
      result[i] += temp;
333
    }
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
  }

  return result;
}

/**
 * This will calculate the first n-moments (inclusive) about the mean (1st
 *moment). For example
 * if maxMoment=2 then this will return 3 values: 0th (total weight), 1st
 *(mean), 2nd (deviation).
 *
 * @param x The independent values
 * @param y The dependent values
 * @param maxMoment The number of moments to calculate
 * @returns The first n-moments.
 */
template <typename TYPE>
Samuel Jones's avatar
Samuel Jones committed
351
std::vector<double> getMomentsAboutMean(const std::vector<TYPE> &x, const std::vector<TYPE> &y, const int maxMoment) {
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
  // get the zeroth (integrated value) and first moment (mean)
  std::vector<double> momentsAboutOrigin = getMomentsAboutOrigin(x, y, 1);
  const double mean = momentsAboutOrigin[1];

  // initialize a result vector with all zeros
  std::vector<double> result(maxMoment + 1, 0.);
  result[0] = momentsAboutOrigin[0];

  // escape early if we need to
  if (maxMoment == 0)
    return result;

  // densities have the same number of x and y
  bool isDensity(x.size() == y.size());

  // cache the maximum index
  size_t numPoints = y.size();
  if (isDensity)
    numPoints = x.size() - 1;

  // densities are calculated using Newton's method for numerical integration
373
374
  // as backwards as it sounds, the outer loop should be the points rather
  // than
375
376
377
378
  // the moments
  for (size_t j = 0; j < numPoints; ++j) {
    // central x in histogram with a change of variables - and just change for
    // density
Samuel Jones's avatar
Samuel Jones committed
379
    const double xVal = .5 * static_cast<double>(x[j] + x[j + 1]) - mean; // change of variables
380
381
382
383

    // this variable will be (x^n)*y
    double temp;
    if (isDensity) {
384
      const auto xDelta = static_cast<double>(x[j + 1] - x[j]);
385
386
387
      temp = xVal * .5 * static_cast<double>(y[j] + y[j + 1]) * xDelta;
    } else {
      temp = xVal * static_cast<double>(y[j]);
388
389
    }

390
391
392
393
394
    // accumulate the moment
    result[1] += temp;
    for (size_t i = 2; i < result.size(); ++i) {
      temp *= xVal;
      result[i] += temp;
395
    }
396
397
398
399
400
401
402
  }

  return result;
}

// -------------------------- Macro to instantiation concrete types
// --------------------------------
Samuel Jones's avatar
Samuel Jones committed
403
404
405
406
#define INSTANTIATE(TYPE)                                                                                              \
  template MANTID_KERNEL_DLL Statistics getStatistics<TYPE>(const vector<TYPE> &, const unsigned int);                 \
  template MANTID_KERNEL_DLL std::vector<double> getZscore<TYPE>(const vector<TYPE> &);                                \
  template MANTID_KERNEL_DLL std::vector<double> getWeightedZscore<TYPE>(const vector<TYPE> &, const vector<TYPE> &);  \
407
  template MANTID_KERNEL_DLL std::vector<double> getModifiedZscore<TYPE>(const vector<TYPE> &);                        \
Samuel Jones's avatar
Samuel Jones committed
408
409
410
411
  template MANTID_KERNEL_DLL std::vector<double> getMomentsAboutOrigin<TYPE>(                                          \
      const std::vector<TYPE> &x, const std::vector<TYPE> &y, const int maxMoment);                                    \
  template MANTID_KERNEL_DLL std::vector<double> getMomentsAboutMean<TYPE>(                                            \
      const std::vector<TYPE> &x, const std::vector<TYPE> &y, const int maxMoment);
412
413
414

// --------------------------- Concrete instantiations
// ---------------------------------------------
415
416
417
418
419
420
421
422
INSTANTIATE(float)
INSTANTIATE(double)
INSTANTIATE(int)
INSTANTIATE(long)
INSTANTIATE(long long)
INSTANTIATE(unsigned int)
INSTANTIATE(unsigned long)
INSTANTIATE(unsigned long long)
423

424
} // namespace Mantid::Kernel