Statistics.cpp 15.7 KB
Newer Older
1
2
3
4
5
6
// Mantid Repository : https://github.com/mantidproject/mantid
//
// Copyright © 2018 ISIS Rutherford Appleton Laboratory UKRI,
//     NScD Oak Ridge National Laboratory, European Spallation Source
//     & Institut Laue - Langevin
// SPDX - License - Identifier: GPL - 3.0 +
7
8
// Includes
#include "MantidKernel/Statistics.h"
9
#include "MantidKernel/Logger.h"
10

Hahn, Steven's avatar
Hahn, Steven committed
11
12
#include <boost/accumulators/accumulators.hpp>
#include <boost/accumulators/statistics/max.hpp>
LamarMoore's avatar
LamarMoore committed
13
14
#include <boost/accumulators/statistics/min.hpp>
#include <boost/accumulators/statistics/stats.hpp>
Hahn, Steven's avatar
Hahn, Steven committed
15
16
#include <boost/accumulators/statistics/variance.hpp>

17
18
19
20
21
#include <algorithm>
#include <cfloat>
#include <cmath>
#include <sstream>

22
23
namespace Mantid {
namespace Kernel {
24
25
26
/// File-local logger registered under the name "Statistics"; getRFactor()
/// writes its NaN diagnostics to it.
namespace {
  Logger logger("Statistics");
}
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57

using std::string;
using std::vector;

/**
 * Build a Statistics object whose minimum, maximum, mean, median and
 * standard deviation are all quiet NaN. It serves as the "nothing computed
 * yet" starting point that the other functions in this file fill in.
 *
 * @return Statistics with the five fields above set to NaN
 */
Statistics getNanStatistics() {
  const double notANumber = std::numeric_limits<double>::quiet_NaN();

  Statistics allNan;
  allNan.standard_deviation = notANumber;
  allNan.median = notANumber;
  allNan.mean = notANumber;
  allNan.maximum = notANumber;
  allNan.minimum = notANumber;
  return allNan;
}

/**
 * Compute the median of a vector of values.
 *
 * For an odd number of values this is the centre value; for an even number it
 * is the average of the two centre values. When the input is not sorted a
 * private copy is partially ordered, so the caller's data is never modified.
 *
 * @param data The values. Every caller in this file passes a vector holding
 *             exactly num_data elements.
 * @param num_data Number of values in data (data.size(), cached by callers)
 * @param sorted True if data is already in ascending order; the centre
 *               elements are then read directly without copying
 * @return The median as a double, or NaN when num_data is zero
 */
template <typename TYPE>
double getMedian(const std::vector<TYPE> &data, const size_t num_data,
                 const bool sorted) {
  // An empty sample has no median. Report NaN instead of reading in front of
  // data.begin(), which is what the even-count branch would otherwise do.
  if (num_data == 0)
    return std::nan("");
  if (num_data == 1)
    return static_cast<double>(data.front());

  const size_t mid = num_data / 2;
  const bool is_even = ((num_data & 1) == 0);

  if (sorted) {
    // Already ordered: the centre element(s) can be read in place.
    const double upper = static_cast<double>(data[mid]);
    if (!is_even)
      return upper;
    return (static_cast<double>(data[mid - 1]) + upper) / 2.;
  }

  // If the data is not sorted, make a copy we can mess with
  std::vector<TYPE> temp(data.begin(), data.end());
  const auto centre = temp.begin() + static_cast<std::ptrdiff_t>(mid);
  // Put the upper centre value into its sorted position
  std::nth_element(temp.begin(), centre, temp.end());
  const double upper = static_cast<double>(*centre);
  if (!is_even)
    return upper;

  // nth_element leaves nothing greater than *centre in front of it, so the
  // lower centre value is simply the largest element of that leading part;
  // no second selection pass is required.
  const double lower =
      static_cast<double>(*std::max_element(temp.begin(), centre));
  // return the average
  return (lower + upper) / 2.;
}
Hahn, Steven's avatar
Hahn, Steven committed
95

96
97
98
99
100
/**
 * There are enough special cases in determining the Z score where it useful to
 * put it in a single function.
 */
template <typename TYPE>
101
std::vector<double> getZscore(const vector<TYPE> &data) {
102
103
104
105
106
  if (data.size() < 3) {
    std::vector<double> Zscore(data.size(), 0.);
    return Zscore;
  }
  std::vector<double> Zscore;
107
  Statistics stats = getStatistics(data);
108
109
110
111
  if (stats.standard_deviation == 0.) {
    std::vector<double> Zscore(data.size(), 0.);
    return Zscore;
  }
Hahn, Steven's avatar
Hahn, Steven committed
112
  for (auto it = data.cbegin(); it != data.cend(); ++it) {
113
    double tmp = static_cast<double>(*it);
Lynch, Vickie's avatar
Lynch, Vickie committed
114
    Zscore.push_back(fabs((stats.mean - tmp) / stats.standard_deviation));
115
116
117
  }
  return Zscore;
}
Lynch, Vickie's avatar
Lynch, Vickie committed
118
119
120
121
122
/**
 * There are enough special cases in determining the Z score where it useful to
 * put it in a single function.
 */
template <typename TYPE>
Lynch, Vickie's avatar
Lynch, Vickie committed
123
124
std::vector<double> getWeightedZscore(const vector<TYPE> &data,
                                      const vector<TYPE> &weights) {
Lynch, Vickie's avatar
Lynch, Vickie committed
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
  if (data.size() < 3) {
    std::vector<double> Zscore(data.size(), 0.);
    return Zscore;
  }
  std::vector<double> Zscore;
  Statistics stats = getStatistics(data);
  if (stats.standard_deviation == 0.) {
    std::vector<double> Zscore(data.size(), 0.);
    return Zscore;
  }
  double sumWeights = 0.0;
  double sumWeightedData = 0.0;
  double weightedVariance = 0.0;
  for (size_t it = 0; it != data.size(); ++it) {
    sumWeights += static_cast<double>(weights[it]);
    sumWeightedData += static_cast<double>(weights[it] * data[it]);
  }
Lynch, Vickie's avatar
Lynch, Vickie committed
142
  double weightedMean = sumWeightedData / sumWeights;
Lynch, Vickie's avatar
Lynch, Vickie committed
143
  for (size_t it = 0; it != data.size(); ++it) {
Lynch, Vickie's avatar
Lynch, Vickie committed
144
145
146
    weightedVariance +=
        std::pow(static_cast<double>(data[it]) - weightedMean, 2) *
        std::pow(static_cast<double>(weights[it]) / sumWeights, 2);
Lynch, Vickie's avatar
Lynch, Vickie committed
147
148
  }
  for (auto it = data.cbegin(); it != data.cend(); ++it) {
Lynch, Vickie's avatar
Lynch, Vickie committed
149
150
    Zscore.push_back(fabs((static_cast<double>(*it) - weightedMean) /
                          std::sqrt(weightedVariance)));
Lynch, Vickie's avatar
Lynch, Vickie committed
151
152
153
  }
  return Zscore;
}
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
/**
 * Compute the modified Z score of every element of data:
 * 0.6745 * |value - median| / MAD, where MAD is the median of the absolute
 * deviations from the median. 0.6745 is the conventional scaling constant of
 * the modified Z score.
 *
 * @param data The values to score
 * @param sorted True if data is already in ascending order. This only
 *               affects how the median of data itself is obtained.
 * @return One score per element, in input order. All scores are zero when
 *         there are fewer than three values or when the MAD is exactly zero.
 */
template <typename TYPE>
std::vector<double> getModifiedZscore(const std::vector<TYPE> &data,
                                      const bool sorted) {
  const size_t num_data = data.size(); // cache since it is frequently used
  if (num_data < 3) {
    return std::vector<double>(num_data, 0.);
  }

  const double median = getMedian(data, num_data, sorted);

  std::vector<double> absDeviations;
  absDeviations.reserve(num_data);
  for (const auto &value : data) {
    absDeviations.push_back(fabs(static_cast<double>(value) - median));
  }

  // The absolute deviations are NOT in ascending order even when data is
  // (for sorted data they fall towards the median and then rise again), so
  // their median must always be found by selection, never by direct lookup.
  const double MAD = getMedian(absDeviations, num_data, false);
  if (MAD == 0.) {
    return std::vector<double>(num_data, 0.);
  }

  std::vector<double> Zscore;
  Zscore.reserve(num_data);
  // MAD is a non-zero median of non-negative values here, so dividing the
  // stored absolute deviation is the same as |(value - median) / MAD|.
  for (const double deviation : absDeviations) {
    Zscore.push_back(0.6745 * (deviation / MAD));
  }
  return Zscore;
}

/**
 * Determine the statistics for a vector of data. If it is sorted then let the
 * function know so it won't make a copy of the data for determining the median.
191
192
 * @param data Data points whose statistics are to be evaluated
 * @param flags A set of flags to control the computation of the stats
193
194
 */
template <typename TYPE>
195
Statistics getStatistics(const vector<TYPE> &data, const unsigned int flags) {
Hahn, Steven's avatar
Hahn, Steven committed
196
  Statistics statistics = getNanStatistics();
197
  size_t num_data = data.size(); // cache since it is frequently used
198
  if (num_data == 0) {           // don't do anything
Hahn, Steven's avatar
Hahn, Steven committed
199
    return statistics;
200
  }
201
202
203
  // calculate the mean if this or the stddev is requested
  const bool stddev = ((flags & StatOptions::UncorrectedStdDev) ||
                       (flags & StatOptions::CorrectedStdDev));
Hahn, Steven's avatar
Hahn, Steven committed
204
205
  if (stddev) {
    using namespace boost::accumulators;
Lynch, Vickie's avatar
Lynch, Vickie committed
206
    accumulator_set<double, stats<tag::min, tag::max, tag::variance>> acc;
Hahn, Steven's avatar
Hahn, Steven committed
207
    for (auto &value : data) {
Hahn, Steven's avatar
Hahn, Steven committed
208
      acc(static_cast<double>(value));
209
    }
Hahn, Steven's avatar
Hahn, Steven committed
210
211
212
    statistics.minimum = min(acc);
    statistics.maximum = max(acc);
    statistics.mean = mean(acc);
Hahn, Steven's avatar
Hahn, Steven committed
213
214
215
216
217
218
    double var = variance(acc);

    if (flags & StatOptions::CorrectedStdDev) {
      double ndofs = static_cast<double>(data.size());
      var *= ndofs / (ndofs - 1.0);
    }
Hahn, Steven's avatar
Hahn, Steven committed
219
    statistics.standard_deviation = std::sqrt(var);
Hahn, Steven's avatar
Hahn, Steven committed
220
221
222

  } else if (flags & StatOptions::Mean) {
    using namespace boost::accumulators;
Lynch, Vickie's avatar
Lynch, Vickie committed
223
    accumulator_set<double, stats<tag::mean>> acc;
Hahn, Steven's avatar
Hahn, Steven committed
224
    for (auto &value : data) {
Hahn, Steven's avatar
Hahn, Steven committed
225
      acc(static_cast<double>(value));
Hahn, Steven's avatar
Hahn, Steven committed
226
    }
Hahn, Steven's avatar
Hahn, Steven committed
227
    statistics.mean = mean(acc);
228
  }
Hahn, Steven's avatar
Hahn, Steven committed
229

230
231
  // calculate the median if requested
  if (flags & StatOptions::Median) {
Hahn, Steven's avatar
Hahn, Steven committed
232
233
    statistics.median =
        getMedian(data, num_data, flags & StatOptions::SortedData);
234
  }
Hahn, Steven's avatar
Hahn, Steven committed
235

Hahn, Steven's avatar
Hahn, Steven committed
236
  return statistics;
237
238
239
240
}

/// Getting statistics of a string array should just give a bunch of NaNs
template <>
LamarMoore's avatar
LamarMoore committed
241
242
DLLExport Statistics getStatistics<string>(const vector<string> &data,
                                           const unsigned int flags) {
243
  UNUSED_ARG(flags);
244
245
246
247
248
249
  UNUSED_ARG(data);
  return getNanStatistics();
}

/// Getting statistics of a boolean array should just give a bunch of NaNs
template <>
LamarMoore's avatar
LamarMoore committed
250
251
DLLExport Statistics getStatistics<bool>(const vector<bool> &data,
                                         const unsigned int flags) {
252
  UNUSED_ARG(flags);
253
254
255
256
257
  UNUSED_ARG(data);
  return getNanStatistics();
}

/** Return the Rwp of a diffraction pattern data
LamarMoore's avatar
LamarMoore committed
258
259
260
261
262
263
 * @param obsI :: array of observed intensity values
 * @param calI :: array of calculated intensity values;
 * @param obsE :: array of error of the observed data;
 * @return :: RFactor including Rp and Rwp
 *
 */
264
265
266
267
268
269
270
271
272
273
274
275
Rfactor getRFactor(const std::vector<double> &obsI,
                   const std::vector<double> &calI,
                   const std::vector<double> &obsE) {
  // 1. Check
  if (obsI.size() != calI.size() || obsI.size() != obsE.size()) {
    std::stringstream errss;
    errss << "GetRFactor() Input Error!  Observed Intensity (" << obsI.size()
          << "), Calculated Intensity (" << calI.size()
          << ") and Observed Error (" << obsE.size()
          << ") have different number of elements.";
    throw std::runtime_error(errss.str());
  }
276
  if (obsI.empty()) {
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
    throw std::runtime_error("getRFactor(): the input arrays are empty.");
  }

  double sumnom = 0;
  double sumdenom = 0;
  double sumrpnom = 0;
  double sumrpdenom = 0;

  size_t numpts = obsI.size();
  for (size_t i = 0; i < numpts; ++i) {
    double cal_i = calI[i];
    double obs_i = obsI[i];
    double sigma = obsE[i];
    double weight = 1.0 / (sigma * sigma);
    double diff = obs_i - cal_i;

    if (weight == weight && weight <= DBL_MAX) {
      // If weight is not NaN.
      sumrpnom += fabs(diff);
      sumrpdenom += fabs(obs_i);

      double tempnom = weight * diff * diff;
      double tempden = weight * obs_i * obs_i;

      sumnom += tempnom;
      sumdenom += tempden;

      if (tempnom != tempnom || tempden != tempden) {
305
        logger.error() << "***** Error! ****** Data indexed " << i << " is NaN. "
306
307
                  << "i = " << i << ": cal = " << calI[i] << ", obs = " << obs_i
                  << ", weight = " << weight << ". \n";
Campbell, Stuart's avatar
Campbell, Stuart committed
308
309
      }
    }
310
311
312
313
314
315
316
  }

  Rfactor rfactor(0., 0.);
  rfactor.Rp = (sumrpnom / sumrpdenom);
  rfactor.Rwp = std::sqrt(sumnom / sumdenom);

  if (rfactor.Rwp != rfactor.Rwp)
317
    logger.debug() << "Rwp is NaN.  Denominator = " << sumnom
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
              << "; Nominator = " << sumdenom << ". \n";

  return rfactor;
}

/**
 * This will calculate the first n-moments (inclusive) about the origin. For
 * example if maxMoment=2 then this will return 3 values: the 0th (total
 * weight), 1st and 2nd raw moments, i.e. the sums of weight * x^n.
 *
 * Two input layouts are accepted:
 *  - density: x and y have the same length. Each pair of neighbouring points
 *    contributes the trapezoid area .5 * (y[j] + y[j+1]) * (x[j+1] - x[j]) as
 *    its weight.
 *  - histogram: x has one more entry than y (bin edges). Each bin contributes
 *    y[j] as its weight.
 * In both cases the weight is placed at the centre of the interval,
 * .5 * (x[j] + x[j+1]).
 *
 * @param x The independent values
 * @param y The dependent values
 * @param maxMoment The highest moment to calculate; must not be negative
 * @returns The moments 0..maxMoment. All are zero when there is no interval
 *          to sum over (e.g. empty input).
 * @throws std::out_of_range if the lengths of x and y match neither layout,
 *         or if maxMoment is negative
 */
template <typename TYPE>
std::vector<double> getMomentsAboutOrigin(const std::vector<TYPE> &x,
                                          const std::vector<TYPE> &y,
                                          const int maxMoment) {
  // densities have the same number of x and y
  const bool isDensity(x.size() == y.size());

  // if it isn't a density then check for histogram
  if ((!isDensity) && (x.size() != y.size() + 1)) {
    std::stringstream msg;
    msg << "length of x (" << x.size() << ") and y (" << y.size()
        << ")do not match";
    throw std::out_of_range(msg.str());
  }

  // a negative count would leave result without even a 0th entry
  if (maxMoment < 0) {
    std::stringstream msg;
    msg << "maxMoment (" << maxMoment << ") must not be negative";
    throw std::out_of_range(msg.str());
  }

  // initialize a result vector with all zeros
  std::vector<double> result(static_cast<size_t>(maxMoment) + 1, 0.);

  // number of intervals to sum over; guard the empty density case so that
  // x.size() - 1 cannot wrap around
  size_t numPoints = y.size();
  if (isDensity)
    numPoints = x.empty() ? 0 : x.size() - 1;

  // as backwards as it sounds, the outer loop should be the points rather
  // than the moments
  for (size_t j = 0; j < numPoints; ++j) {
    // Convert to double before doing arithmetic so that integer TYPEs can
    // neither overflow in the sum nor wrap around in the difference.
    const double xLeft = static_cast<double>(x[j]);
    const double xRight = static_cast<double>(x[j + 1]);
    // centre of the interval
    const double xVal = .5 * (xLeft + xRight);
    // this variable will be (x^n)*y
    double temp = static_cast<double>(y[j]); // correct for histogram
    if (isDensity) {
      // trapezoid rule over the interval
      temp = .5 * (temp + static_cast<double>(y[j + 1])) * (xRight - xLeft);
    }

    // accumulate the moments
    result[0] += temp;
    for (size_t i = 1; i < result.size(); ++i) {
      temp *= xVal;
      result[i] += temp;
    }
  }

  return result;
}

/**
 * This will calculate the first n-moments (inclusive) about the 1st moment
 * returned by getMomentsAboutOrigin(). For example if maxMoment=2 then this
 * will return 3 values: the 0th (total weight, copied from the moments about
 * the origin), and the 1st and 2nd moments about that centre.
 *
 * The same density / histogram layouts as getMomentsAboutOrigin() are
 * accepted, and the same interval weights and centres are used.
 *
 * NOTE(review): the centre is the un-normalised 1st raw moment (it is not
 * divided by the 0th moment), so it is the mean only when the total weight is
 * one - confirm that callers pass normalised data.
 *
 * @param x The independent values
 * @param y The dependent values
 * @param maxMoment The highest moment to calculate; must not be negative
 * @returns The moments 0..maxMoment.
 * @throws std::out_of_range if the lengths of x and y match neither layout
 *         (raised by getMomentsAboutOrigin()), or if maxMoment is negative
 */
template <typename TYPE>
std::vector<double> getMomentsAboutMean(const std::vector<TYPE> &x,
                                        const std::vector<TYPE> &y,
                                        const int maxMoment) {
  // get the zeroth (integrated value) and first moment (mean); this also
  // validates the lengths of x and y
  const std::vector<double> momentsAboutOrigin = getMomentsAboutOrigin(x, y, 1);
  const double mean = momentsAboutOrigin[1];

  // a negative count would leave result without even a 0th entry
  if (maxMoment < 0) {
    std::stringstream msg;
    msg << "maxMoment (" << maxMoment << ") must not be negative";
    throw std::out_of_range(msg.str());
  }

  // initialize a result vector with all zeros
  std::vector<double> result(static_cast<size_t>(maxMoment) + 1, 0.);
  result[0] = momentsAboutOrigin[0];

  // escape early if we need to
  if (maxMoment == 0)
    return result;

  // densities have the same number of x and y
  const bool isDensity(x.size() == y.size());

  // number of intervals to sum over; guard the empty density case so that
  // x.size() - 1 cannot wrap around
  size_t numPoints = y.size();
  if (isDensity)
    numPoints = x.empty() ? 0 : x.size() - 1;

  // as backwards as it sounds, the outer loop should be the points rather
  // than the moments
  for (size_t j = 0; j < numPoints; ++j) {
    // Convert to double before doing arithmetic so that integer TYPEs can
    // neither overflow in the sum nor wrap around in the difference.
    const double xLeft = static_cast<double>(x[j]);
    const double xRight = static_cast<double>(x[j + 1]);
    // centre of the interval, shifted so the mean is the new origin
    const double xVal = .5 * (xLeft + xRight) - mean; // change of variables

    // this variable will be (x^n)*y
    double temp;
    if (isDensity) {
      // trapezoid rule over the interval
      const double ySum =
          static_cast<double>(y[j]) + static_cast<double>(y[j + 1]);
      temp = xVal * .5 * ySum * (xRight - xLeft);
    } else {
      temp = xVal * static_cast<double>(y[j]);
    }

    // accumulate the moment
    result[1] += temp;
    for (size_t i = 2; i < result.size(); ++i) {
      temp *= xVal;
      result[i] += temp;
    }
  }

  return result;
}

// -------------------------- Macro to instantiation concrete types
// --------------------------------
#define INSTANTIATE(TYPE)                                                      \
LamarMoore's avatar
LamarMoore committed
450
451
  template MANTID_KERNEL_DLL Statistics getStatistics<TYPE>(                   \
      const vector<TYPE> &, const unsigned int);                               \
452
  template MANTID_KERNEL_DLL std::vector<double> getZscore<TYPE>(              \
453
      const vector<TYPE> &);                                                   \
Lynch, Vickie's avatar
Lynch, Vickie committed
454
455
  template MANTID_KERNEL_DLL std::vector<double> getWeightedZscore<TYPE>(      \
      const vector<TYPE> &, const vector<TYPE> &);                             \
456
457
458
459
460
461
462
463
464
465
466
  template MANTID_KERNEL_DLL std::vector<double> getModifiedZscore<TYPE>(      \
      const vector<TYPE> &, const bool);                                       \
  template MANTID_KERNEL_DLL std::vector<double> getMomentsAboutOrigin<TYPE>(  \
      const std::vector<TYPE> &x, const std::vector<TYPE> &y,                  \
      const int maxMoment);                                                    \
  template MANTID_KERNEL_DLL std::vector<double> getMomentsAboutMean<TYPE>(    \
      const std::vector<TYPE> &x, const std::vector<TYPE> &y,                  \
      const int maxMoment);

// --------------------------- Concrete instantiations
// ---------------------------------------------
467
468
469
470
471
472
473
474
INSTANTIATE(float)
INSTANTIATE(double)
INSTANTIATE(int)
INSTANTIATE(long)
INSTANTIATE(long long)
INSTANTIATE(unsigned int)
INSTANTIATE(unsigned long)
INSTANTIATE(unsigned long long)
475
476

} // namespace Kernel
477
} // namespace Mantid