Statistics.cpp 15 KB
Newer Older
1
2
3
// Mantid Repository : https://github.com/mantidproject/mantid
//
// Copyright © 2018 ISIS Rutherford Appleton Laboratory UKRI,
4
5
//   NScD Oak Ridge National Laboratory, European Spallation Source,
//   Institut Laue - Langevin & CSNS, Institute of High Energy Physics, CAS
6
// SPDX - License - Identifier: GPL - 3.0 +
7
8
// Includes
#include "MantidKernel/Statistics.h"
9
#include "MantidKernel/Logger.h"
10

Hahn, Steven's avatar
Hahn, Steven committed
11
12
#include <boost/accumulators/accumulators.hpp>
#include <boost/accumulators/statistics/max.hpp>
LamarMoore's avatar
LamarMoore committed
13
14
#include <boost/accumulators/statistics/min.hpp>
#include <boost/accumulators/statistics/stats.hpp>
Hahn, Steven's avatar
Hahn, Steven committed
15
16
#include <boost/accumulators/statistics/variance.hpp>

17
18
19
20
21
#include <algorithm>
#include <cfloat>
#include <cmath>
#include <sstream>

22
namespace Mantid::Kernel {
23
namespace {
Sam Jenkins's avatar
Sam Jenkins committed
24
Logger logger("Statistics");
25
}
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50

using std::string;
using std::vector;

/**
 * Generate a Statistics object where all of the values are NaN. This is a good
 * initial default.
 */
Statistics getNanStatistics() {
  double nan = std::numeric_limits<double>::quiet_NaN();

  Statistics stats;
  stats.minimum = nan;
  stats.maximum = nan;
  stats.mean = nan;
  stats.median = nan;
  stats.standard_deviation = nan;

  return stats;
}

/**
 * There are enough special cases in determining the median where it useful to
 * put it in a single function.
 */
Samuel Jones's avatar
Samuel Jones committed
51
template <typename TYPE> double getMedian(const vector<TYPE> &data, const size_t num_data, const bool sorted) {
52
53
54
  if (num_data == 1)
    return static_cast<double>(*(data.begin()));

Hahn, Steven's avatar
Hahn, Steven committed
55
  bool is_even = ((num_data & 1) == 0);
56
57
58
59
60
61
62
63
64
65
66
67
  if (is_even) {
    double left = 0.0;
    double right = 0.0;

    if (sorted) {
      // Just get the centre two elements.
      left = static_cast<double>(*(data.begin() + num_data / 2 - 1));
      right = static_cast<double>(*(data.begin() + num_data / 2));
    } else {
      // If the data is not sorted, make a copy we can mess with
      vector<TYPE> temp(data.begin(), data.end());
      // Get what the centre two elements should be...
Samuel Jones's avatar
Samuel Jones committed
68
      std::nth_element(temp.begin(), temp.begin() + num_data / 2 - 1, temp.end());
69
70
71
      left = static_cast<double>(*(temp.begin() + num_data / 2 - 1));
      std::nth_element(temp.begin(), temp.begin() + num_data / 2, temp.end());
      right = static_cast<double>(*(temp.begin() + num_data / 2));
Campbell, Stuart's avatar
Campbell, Stuart committed
72
    }
73
74
75
    // return the average
    return (left + right) / 2.;
  } else
Lynch, Vickie's avatar
Lynch, Vickie committed
76
  // Odd number
77
78
79
80
81
82
83
84
85
86
87
  {
    if (sorted) {
      // If sorted and odd, just return the centre value
      return static_cast<double>(*(data.begin() + num_data / 2));
    } else {
      // If the data is not sorted, make a copy we can mess with
      vector<TYPE> temp(data.begin(), data.end());
      // Make sure the centre value is in the correct position
      std::nth_element(temp.begin(), temp.begin() + num_data / 2, temp.end());
      // Now return the centre value
      return static_cast<double>(*(temp.begin() + num_data / 2));
88
    }
89
  }
Hahn, Steven's avatar
Hahn, Steven committed
90
}
Hahn, Steven's avatar
Hahn, Steven committed
91

92
93
94
95
/**
 * There are enough special cases in determining the Z score where it useful to
 * put it in a single function.
 */
Samuel Jones's avatar
Samuel Jones committed
96
template <typename TYPE> std::vector<double> getZscore(const vector<TYPE> &data) {
Gagik Vardanyan's avatar
Gagik Vardanyan committed
97
  std::vector<double> Zscore;
98
  if (data.size() < 3) {
Gagik Vardanyan's avatar
Gagik Vardanyan committed
99
    Zscore.resize(data.size(), 0.);
100
101
    return Zscore;
  }
102
  Statistics stats = getStatistics(data);
103
  if (stats.standard_deviation == 0.) {
Gagik Vardanyan's avatar
Gagik Vardanyan committed
104
    Zscore.resize(data.size(), 0.);
105
106
    return Zscore;
  }
Hahn, Steven's avatar
Hahn, Steven committed
107
  for (auto it = data.cbegin(); it != data.cend(); ++it) {
108
    auto tmp = static_cast<double>(*it);
109
    Zscore.emplace_back(fabs((stats.mean - tmp) / stats.standard_deviation));
110
111
112
  }
  return Zscore;
}
Lynch, Vickie's avatar
Lynch, Vickie committed
113
114
115
116
/**
 * There are enough special cases in determining the Z score where it useful to
 * put it in a single function.
 */
Samuel Jones's avatar
Samuel Jones committed
117
template <typename TYPE> std::vector<double> getWeightedZscore(const vector<TYPE> &data, const vector<TYPE> &weights) {
Gagik Vardanyan's avatar
Gagik Vardanyan committed
118
  std::vector<double> Zscore;
Lynch, Vickie's avatar
Lynch, Vickie committed
119
  if (data.size() < 3) {
Gagik Vardanyan's avatar
Gagik Vardanyan committed
120
    Zscore.resize(data.size(), 0.);
Lynch, Vickie's avatar
Lynch, Vickie committed
121
122
123
124
    return Zscore;
  }
  Statistics stats = getStatistics(data);
  if (stats.standard_deviation == 0.) {
Gagik Vardanyan's avatar
Gagik Vardanyan committed
125
    Zscore.resize(data.size(), 0.);
Lynch, Vickie's avatar
Lynch, Vickie committed
126
127
128
129
130
131
132
133
134
    return Zscore;
  }
  double sumWeights = 0.0;
  double sumWeightedData = 0.0;
  double weightedVariance = 0.0;
  for (size_t it = 0; it != data.size(); ++it) {
    sumWeights += static_cast<double>(weights[it]);
    sumWeightedData += static_cast<double>(weights[it] * data[it]);
  }
Lynch, Vickie's avatar
Lynch, Vickie committed
135
  double weightedMean = sumWeightedData / sumWeights;
Lynch, Vickie's avatar
Lynch, Vickie committed
136
  for (size_t it = 0; it != data.size(); ++it) {
Samuel Jones's avatar
Samuel Jones committed
137
138
    weightedVariance += std::pow(static_cast<double>(data[it]) - weightedMean, 2) *
                        std::pow(static_cast<double>(weights[it]) / sumWeights, 2);
Lynch, Vickie's avatar
Lynch, Vickie committed
139
140
  }
  for (auto it = data.cbegin(); it != data.cend(); ++it) {
Samuel Jones's avatar
Samuel Jones committed
141
    Zscore.emplace_back(fabs((static_cast<double>(*it) - weightedMean) / std::sqrt(weightedVariance)));
Lynch, Vickie's avatar
Lynch, Vickie committed
142
143
144
  }
  return Zscore;
}
145
146
147
148
149
/**
 * There are enough special cases in determining the modified Z score where it
 * useful to
 * put it in a single function.
 */
Samuel Jones's avatar
Samuel Jones committed
150
template <typename TYPE> std::vector<double> getModifiedZscore(const vector<TYPE> &data, const bool sorted) {
151
152
153
154
155
156
157
158
  if (data.size() < 3) {
    std::vector<double> Zscore(data.size(), 0.);
    return Zscore;
  }
  std::vector<double> MADvec;
  double tmp;
  size_t num_data = data.size(); // cache since it is frequently used
  double median = getMedian(data, num_data, sorted);
Hahn, Steven's avatar
Hahn, Steven committed
159
  for (auto it = data.cbegin(); it != data.cend(); ++it) {
160
    tmp = static_cast<double>(*it);
161
    MADvec.emplace_back(fabs(tmp - median));
162
163
164
165
166
167
168
169
  }
  double MAD = getMedian(MADvec, num_data, sorted);
  if (MAD == 0.) {
    std::vector<double> Zscore(data.size(), 0.);
    return Zscore;
  }
  MADvec.clear();
  std::vector<double> Zscore;
Hahn, Steven's avatar
Hahn, Steven committed
170
  for (auto it = data.begin(); it != data.end(); ++it) {
171
    tmp = static_cast<double>(*it);
172
    Zscore.emplace_back(0.6745 * fabs((tmp - median) / MAD));
173
174
175
176
177
178
179
  }
  return Zscore;
}

/**
 * Determine the statistics for a vector of data. If it is sorted then let the
 * function know so it won't make a copy of the data for determining the median.
180
181
 * @param data Data points whose statistics are to be evaluated
 * @param flags A set of flags to control the computation of the stats
182
 */
Samuel Jones's avatar
Samuel Jones committed
183
template <typename TYPE> Statistics getStatistics(const vector<TYPE> &data, const unsigned int flags) {
Hahn, Steven's avatar
Hahn, Steven committed
184
  Statistics statistics = getNanStatistics();
185
  size_t num_data = data.size(); // cache since it is frequently used
186
  if (num_data == 0) {           // don't do anything
Hahn, Steven's avatar
Hahn, Steven committed
187
    return statistics;
188
  }
189
  // calculate the mean if this or the stddev is requested
Samuel Jones's avatar
Samuel Jones committed
190
  const bool stddev = ((flags & StatOptions::UncorrectedStdDev) || (flags & StatOptions::CorrectedStdDev));
Hahn, Steven's avatar
Hahn, Steven committed
191
192
  if (stddev) {
    using namespace boost::accumulators;
Lynch, Vickie's avatar
Lynch, Vickie committed
193
    accumulator_set<double, stats<tag::min, tag::max, tag::variance>> acc;
Hahn, Steven's avatar
Hahn, Steven committed
194
    for (auto &value : data) {
Hahn, Steven's avatar
Hahn, Steven committed
195
      acc(static_cast<double>(value));
196
    }
Hahn, Steven's avatar
Hahn, Steven committed
197
198
199
    statistics.minimum = min(acc);
    statistics.maximum = max(acc);
    statistics.mean = mean(acc);
Hahn, Steven's avatar
Hahn, Steven committed
200
201
202
    double var = variance(acc);

    if (flags & StatOptions::CorrectedStdDev) {
203
      auto ndofs = static_cast<double>(data.size());
Hahn, Steven's avatar
Hahn, Steven committed
204
205
      var *= ndofs / (ndofs - 1.0);
    }
Hahn, Steven's avatar
Hahn, Steven committed
206
    statistics.standard_deviation = std::sqrt(var);
Hahn, Steven's avatar
Hahn, Steven committed
207
208
209

  } else if (flags & StatOptions::Mean) {
    using namespace boost::accumulators;
Lynch, Vickie's avatar
Lynch, Vickie committed
210
    accumulator_set<double, stats<tag::mean>> acc;
Hahn, Steven's avatar
Hahn, Steven committed
211
    for (auto &value : data) {
Hahn, Steven's avatar
Hahn, Steven committed
212
      acc(static_cast<double>(value));
Hahn, Steven's avatar
Hahn, Steven committed
213
    }
Hahn, Steven's avatar
Hahn, Steven committed
214
    statistics.mean = mean(acc);
215
  }
Hahn, Steven's avatar
Hahn, Steven committed
216

217
218
  // calculate the median if requested
  if (flags & StatOptions::Median) {
Samuel Jones's avatar
Samuel Jones committed
219
    statistics.median = getMedian(data, num_data, flags & StatOptions::SortedData);
220
  }
Hahn, Steven's avatar
Hahn, Steven committed
221

Hahn, Steven's avatar
Hahn, Steven committed
222
  return statistics;
223
224
225
}

/// Getting statistics of a string array should just give a bunch of NaNs
Samuel Jones's avatar
Samuel Jones committed
226
template <> DLLExport Statistics getStatistics<string>(const vector<string> &data, const unsigned int flags) {
227
  UNUSED_ARG(flags);
228
229
230
231
232
  UNUSED_ARG(data);
  return getNanStatistics();
}

/// Getting statistics of a boolean array should just give a bunch of NaNs
Samuel Jones's avatar
Samuel Jones committed
233
template <> DLLExport Statistics getStatistics<bool>(const vector<bool> &data, const unsigned int flags) {
234
  UNUSED_ARG(flags);
235
236
237
238
239
  UNUSED_ARG(data);
  return getNanStatistics();
}

/** Return the Rwp of a diffraction pattern data
LamarMoore's avatar
LamarMoore committed
240
241
242
243
244
245
 * @param obsI :: array of observed intensity values
 * @param calI :: array of calculated intensity values;
 * @param obsE :: array of error of the observed data;
 * @return :: RFactor including Rp and Rwp
 *
 */
Samuel Jones's avatar
Samuel Jones committed
246
Rfactor getRFactor(const std::vector<double> &obsI, const std::vector<double> &calI, const std::vector<double> &obsE) {
247
248
249
  // 1. Check
  if (obsI.size() != calI.size() || obsI.size() != obsE.size()) {
    std::stringstream errss;
Samuel Jones's avatar
Samuel Jones committed
250
251
    errss << "GetRFactor() Input Error!  Observed Intensity (" << obsI.size() << "), Calculated Intensity ("
          << calI.size() << ") and Observed Error (" << obsE.size() << ") have different number of elements.";
252
253
    throw std::runtime_error(errss.str());
  }
254
  if (obsI.empty()) {
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
    throw std::runtime_error("getRFactor(): the input arrays are empty.");
  }

  double sumnom = 0;
  double sumdenom = 0;
  double sumrpnom = 0;
  double sumrpdenom = 0;

  size_t numpts = obsI.size();
  for (size_t i = 0; i < numpts; ++i) {
    double cal_i = calI[i];
    double obs_i = obsI[i];
    double sigma = obsE[i];
    double weight = 1.0 / (sigma * sigma);
    double diff = obs_i - cal_i;

    if (weight == weight && weight <= DBL_MAX) {
      // If weight is not NaN.
      sumrpnom += fabs(diff);
      sumrpdenom += fabs(obs_i);

      double tempnom = weight * diff * diff;
      double tempden = weight * obs_i * obs_i;

      sumnom += tempnom;
      sumdenom += tempden;

      if (tempnom != tempnom || tempden != tempden) {
Samuel Jones's avatar
Samuel Jones committed
283
284
        logger.error() << "***** Error! ****** Data indexed " << i << " is NaN. "
                       << "i = " << i << ": cal = " << calI[i] << ", obs = " << obs_i << ", weight = " << weight
Sam Jenkins's avatar
Sam Jenkins committed
285
                       << ". \n";
Campbell, Stuart's avatar
Campbell, Stuart committed
286
287
      }
    }
288
289
290
291
292
293
294
  }

  Rfactor rfactor(0., 0.);
  rfactor.Rp = (sumrpnom / sumrpdenom);
  rfactor.Rwp = std::sqrt(sumnom / sumdenom);

  if (rfactor.Rwp != rfactor.Rwp)
Samuel Jones's avatar
Samuel Jones committed
295
    logger.debug() << "Rwp is NaN.  Denominator = " << sumnom << "; Nominator = " << sumdenom << ". \n";
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311

  return rfactor;
}

/**
 * This will calculate the first n-moments (inclusive) about the origin. For
 *example
 * if maxMoment=2 then this will return 3 values: 0th (total weight), 1st
 *(mean), 2nd (deviation).
 *
 * @param x The independent values
 * @param y The dependent values
 * @param maxMoment The number of moments to calculate
 * @returns The first n-moments.
 */
template <typename TYPE>
Samuel Jones's avatar
Samuel Jones committed
312
std::vector<double> getMomentsAboutOrigin(const std::vector<TYPE> &x, const std::vector<TYPE> &y, const int maxMoment) {
313
314
315
316
317
318
  // densities have the same number of x and y
  bool isDensity(x.size() == y.size());

  // if it isn't a density then check for histogram
  if ((!isDensity) && (x.size() != y.size() + 1)) {
    std::stringstream msg;
Samuel Jones's avatar
Samuel Jones committed
319
    msg << "length of x (" << x.size() << ") and y (" << y.size() << ")do not match";
320
321
322
323
324
325
326
327
328
329
330
331
    throw std::out_of_range(msg.str());
  }

  // initialize a result vector with all zeros
  std::vector<double> result(maxMoment + 1, 0.);

  // cache the maximum index
  size_t numPoints = y.size();
  if (isDensity)
    numPoints = x.size() - 1;

  // densities are calculated using Newton's method for numerical integration
332
333
  // as backwards as it sounds, the outer loop should be the points rather
  // than
334
335
336
337
338
  // the moments
  for (size_t j = 0; j < numPoints; ++j) {
    // reduce item lookup - and central x for histogram
    const double xVal = .5 * static_cast<double>(x[j] + x[j + 1]);
    // this variable will be (x^n)*y
339
    auto temp = static_cast<double>(y[j]); // correct for histogram
340
    if (isDensity) {
341
      const auto xDelta = static_cast<double>(x[j + 1] - x[j]);
342
      temp = .5 * (temp + static_cast<double>(y[j + 1])) * xDelta;
Campbell, Stuart's avatar
Campbell, Stuart committed
343
344
    }

345
346
347
348
349
    // accumulate the moments
    result[0] += temp;
    for (size_t i = 1; i < result.size(); ++i) {
      temp *= xVal;
      result[i] += temp;
350
    }
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
  }

  return result;
}

/**
 * This will calculate the first n-moments (inclusive) about the mean (1st
 *moment). For example
 * if maxMoment=2 then this will return 3 values: 0th (total weight), 1st
 *(mean), 2nd (deviation).
 *
 * @param x The independent values
 * @param y The dependent values
 * @param maxMoment The number of moments to calculate
 * @returns The first n-moments.
 */
template <typename TYPE>
Samuel Jones's avatar
Samuel Jones committed
368
std::vector<double> getMomentsAboutMean(const std::vector<TYPE> &x, const std::vector<TYPE> &y, const int maxMoment) {
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
  // get the zeroth (integrated value) and first moment (mean)
  std::vector<double> momentsAboutOrigin = getMomentsAboutOrigin(x, y, 1);
  const double mean = momentsAboutOrigin[1];

  // initialize a result vector with all zeros
  std::vector<double> result(maxMoment + 1, 0.);
  result[0] = momentsAboutOrigin[0];

  // escape early if we need to
  if (maxMoment == 0)
    return result;

  // densities have the same number of x and y
  bool isDensity(x.size() == y.size());

  // cache the maximum index
  size_t numPoints = y.size();
  if (isDensity)
    numPoints = x.size() - 1;

  // densities are calculated using Newton's method for numerical integration
390
391
  // as backwards as it sounds, the outer loop should be the points rather
  // than
392
393
394
395
  // the moments
  for (size_t j = 0; j < numPoints; ++j) {
    // central x in histogram with a change of variables - and just change for
    // density
Samuel Jones's avatar
Samuel Jones committed
396
    const double xVal = .5 * static_cast<double>(x[j] + x[j + 1]) - mean; // change of variables
397
398
399
400

    // this variable will be (x^n)*y
    double temp;
    if (isDensity) {
401
      const auto xDelta = static_cast<double>(x[j + 1] - x[j]);
402
403
404
      temp = xVal * .5 * static_cast<double>(y[j] + y[j + 1]) * xDelta;
    } else {
      temp = xVal * static_cast<double>(y[j]);
405
406
    }

407
408
409
410
411
    // accumulate the moment
    result[1] += temp;
    for (size_t i = 2; i < result.size(); ++i) {
      temp *= xVal;
      result[i] += temp;
412
    }
413
414
415
416
417
418
419
  }

  return result;
}

// -------------------------- Macro to instantiation concrete types
// --------------------------------
Samuel Jones's avatar
Samuel Jones committed
420
421
422
423
424
425
426
427
428
#define INSTANTIATE(TYPE)                                                                                              \
  template MANTID_KERNEL_DLL Statistics getStatistics<TYPE>(const vector<TYPE> &, const unsigned int);                 \
  template MANTID_KERNEL_DLL std::vector<double> getZscore<TYPE>(const vector<TYPE> &);                                \
  template MANTID_KERNEL_DLL std::vector<double> getWeightedZscore<TYPE>(const vector<TYPE> &, const vector<TYPE> &);  \
  template MANTID_KERNEL_DLL std::vector<double> getModifiedZscore<TYPE>(const vector<TYPE> &, const bool);            \
  template MANTID_KERNEL_DLL std::vector<double> getMomentsAboutOrigin<TYPE>(                                          \
      const std::vector<TYPE> &x, const std::vector<TYPE> &y, const int maxMoment);                                    \
  template MANTID_KERNEL_DLL std::vector<double> getMomentsAboutMean<TYPE>(                                            \
      const std::vector<TYPE> &x, const std::vector<TYPE> &y, const int maxMoment);
429
430
431

// --------------------------- Concrete instantiations
// ---------------------------------------------
432
433
434
435
436
437
438
439
INSTANTIATE(float)
INSTANTIATE(double)
INSTANTIATE(int)
INSTANTIATE(long)
INSTANTIATE(long long)
INSTANTIATE(unsigned int)
INSTANTIATE(unsigned long)
INSTANTIATE(unsigned long long)
440

441
} // namespace Mantid::Kernel