Statistics.cpp 13.8 KB
Newer Older
1
2
3
// Includes
#include "MantidKernel/Statistics.h"

4
#include <algorithm>
Campbell, Stuart's avatar
Campbell, Stuart committed
5
6
#include <cfloat>
#include <cmath>
7
#include <iostream>
Campbell, Stuart's avatar
Campbell, Stuart committed
8
9
#include <limits>
#include <numeric>
10
#include <sstream>
Campbell, Stuart's avatar
Campbell, Stuart committed
11
#include <stdexcept>
12
#include <functional>
Campbell, Stuart's avatar
Campbell, Stuart committed
13

14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
namespace Mantid {
namespace Kernel {

using std::string;
using std::vector;

/**
 * Build a Statistics object whose minimum, maximum, mean, median and standard
 * deviation are all quiet NaN. Used as the starting point before any quantity
 * has actually been computed.
 */
Statistics getNanStatistics() {
  const double notANumber = std::numeric_limits<double>::quiet_NaN();

  Statistics allNan;
  allNan.standard_deviation = notANumber;
  allNan.median = notANumber;
  allNan.mean = notANumber;
  allNan.maximum = notANumber;
  allNan.minimum = notANumber;
  return allNan;
}

/**
 * Compute the median of a data set. There are enough special cases in
 * determining the median that it is useful to keep them in a single function.
 *
 * @param data The values to examine. It is never modified; when it is not
 * sorted a private copy is partially ordered instead.
 * @param num_data Number of values. The unsorted branches always work on the
 * whole of @p data, so this is expected to equal data.size().
 * @param sorted True if @p data is already in ascending order, which lets the
 * centre element(s) be read directly without copying.
 * @return The centre value for an odd count, the average of the two centre
 * values for an even count, or NaN when there are no values at all.
 */
template <typename TYPE>
double getMedian(const std::vector<TYPE> &data, const size_t num_data,
                 const bool sorted) {
  // The median of nothing is undefined: report NaN rather than indexing
  // in front of the first element.
  if (num_data == 0)
    return std::numeric_limits<double>::quiet_NaN();
  if (num_data == 1)
    return static_cast<double>(data.front());

  const size_t upper = num_data / 2; // index of the (upper) centre element
  const bool is_even = ((num_data % 2) == 0);

  if (sorted) {
    // The centre element(s) are already in place.
    const double right = static_cast<double>(data[upper]);
    if (!is_even)
      return right;
    const double left = static_cast<double>(data[upper - 1]);
    return (left + right) / 2.;
  }

  // If the data is not sorted, make a copy we can mess with
  std::vector<TYPE> temp(data.begin(), data.end());

  if (!is_even) {
    // Make sure the centre value is in the correct position and return it
    std::nth_element(temp.begin(), temp.begin() + upper, temp.end());
    return static_cast<double>(temp[upper]);
  }

  // Even count: put the lower centre value in place...
  const auto lowerCentre = temp.begin() + (upper - 1);
  std::nth_element(temp.begin(), lowerCentre, temp.end());
  const double left = static_cast<double>(*lowerCentre);
  // ...after which nothing behind it is smaller than it, so the upper centre
  // value is simply the smallest element of that tail. No second selection
  // pass over the whole range is needed.
  const double right =
      static_cast<double>(*std::min_element(lowerCentre + 1, temp.end()));

  // return the average
  return (left + right) / 2.;
}
/**
 * There are enough special cases in determining the Z score where it useful to
 * put it in a single function.
 */
template <typename TYPE>
89
std::vector<double> getZscore(const vector<TYPE> &data) {
90
91
92
93
94
  if (data.size() < 3) {
    std::vector<double> Zscore(data.size(), 0.);
    return Zscore;
  }
  std::vector<double> Zscore;
95
  Statistics stats = getStatistics(data);
96
97
98
99
  if (stats.standard_deviation == 0.) {
    std::vector<double> Zscore(data.size(), 0.);
    return Zscore;
  }
Hahn, Steven's avatar
Hahn, Steven committed
100
  for (auto it = data.cbegin(); it != data.cend(); ++it) {
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
    double tmp = static_cast<double>(*it);
    Zscore.push_back(fabs((tmp - stats.mean) / stats.standard_deviation));
  }
  return Zscore;
}
/**
 * Compute the modified Z score, 0.6745 * |value - median| / MAD, of every
 * element, where MAD is the median of the absolute deviations from the median.
 * There are enough special cases that it is useful to keep them in a single
 * function.
 *
 * @param data The values to score
 * @param sorted True if @p data is already in ascending order; only used to
 * speed up finding the median of @p data itself.
 * @return One score per input value, in input order. Every score is zero when
 * there are fewer than three values or when MAD is zero.
 */
template <typename TYPE>
std::vector<double> getModifiedZscore(const vector<TYPE> &data,
                                      const bool sorted) {
  const size_t num_data = data.size(); // cache since it is frequently used
  if (num_data < 3)
    return std::vector<double>(num_data, 0.);

  const double median = getMedian(data, num_data, sorted);

  // absolute deviation of every point from the median
  std::vector<double> deviations;
  deviations.reserve(num_data);
  for (const auto &value : data)
    deviations.push_back(fabs(static_cast<double>(value) - median));

  // The deviations fall towards the median and rise again, so they are not in
  // ascending order even when the input is: never treat them as sorted.
  const double MAD = getMedian(deviations, num_data, false);
  if (MAD == 0.)
    return std::vector<double>(num_data, 0.);

  // Turn each deviation into its score in place; the entries are already
  // |value - median|.
  for (auto &deviation : deviations)
    deviation = 0.6745 * fabs(deviation / MAD);
  return deviations;
}

/**
 * Determine the statistics for a vector of data. If it is sorted then let the
 * function know so it won't make a copy of the data for determining the median.
143
144
 * @param data Data points whose statistics are to be evaluated
 * @param flags A set of flags to control the computation of the stats
145
146
 */
template <typename TYPE>
147
Statistics getStatistics(const vector<TYPE> &data, const unsigned int flags) {
148
149
  Statistics stats = getNanStatistics();
  size_t num_data = data.size(); // cache since it is frequently used
150
  if (num_data == 0) {           // don't do anything
151
152
153
    return stats;
  }

154
155
156
157
158
159
160
161
162
163
164
165
  // calculate the mean if this or the stddev is requested
  const bool stddev = ((flags & StatOptions::UncorrectedStdDev) ||
                       (flags & StatOptions::CorrectedStdDev));
  if ((flags & StatOptions::Mean) || stddev) {
    const TYPE sum = std::accumulate(data.begin(), data.end(),
                                     static_cast<TYPE>(0), std::plus<TYPE>());
    stats.mean = static_cast<double>(sum) / (static_cast<double>(num_data));
    if (stddev) {
      // calculate the standard deviation, min, max
      stats.minimum = stats.mean;
      stats.maximum = stats.mean;
      double stddev = 0.;
Hahn, Steven's avatar
Hahn, Steven committed
166
      for (auto it = data.cbegin(); it != data.cend(); ++it) {
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
        double temp = static_cast<double>(*it);
        stddev += ((temp - stats.mean) * (temp - stats.mean));
        if (temp > stats.maximum)
          stats.maximum = temp;
        if (temp < stats.minimum)
          stats.minimum = temp;
      }
      size_t ndofs =
          (flags & StatOptions::CorrectedStdDev) ? num_data - 1 : num_data;
      stats.standard_deviation = sqrt(stddev / (static_cast<double>(ndofs)));
    }
  }
  // calculate the median if requested
  if (flags & StatOptions::Median) {
    stats.median = getMedian(data, num_data, flags & StatOptions::SortedData);
182
183
184
185
186
187
188
  }
  return stats;
}

/// Statistics are meaningless for strings: both arguments are ignored and
/// every field of the result is NaN.
template <>
DLLExport Statistics
getStatistics<string>(const vector<string> &data, const unsigned int flags) {
  // neither argument takes part in the result
  UNUSED_ARG(data);
  UNUSED_ARG(flags);
  return getNanStatistics();
}

/// Statistics are meaningless for booleans: both arguments are ignored and
/// every field of the result is NaN.
template <>
DLLExport Statistics
getStatistics<bool>(const vector<bool> &data, const unsigned int flags) {
  // neither argument takes part in the result
  UNUSED_ARG(data);
  UNUSED_ARG(flags);
  return getNanStatistics();
}

/** Return the R-factors (Rp and Rwp) of a diffraction pattern.
  *
  * Each point is weighted by 1/obsE^2. Points whose weight is not a finite
  * number (for example a zero or NaN error) are left out of all four sums.
  *
  *   Rp  = sum |obs - cal| / sum |obs|
  *   Rwp = sqrt( sum w (obs - cal)^2 / sum w obs^2 )
  *
  * @param obsI :: array of observed intensity values
  * @param calI :: array of calculated intensity values;
  * @param obsE :: array of error of the observed data;
  * @return :: RFactor including Rp and Rwp
  * @throws std::runtime_error if the arrays differ in length or are empty
  */
Rfactor getRFactor(const std::vector<double> &obsI,
                   const std::vector<double> &calI,
                   const std::vector<double> &obsE) {
  // 1. Check
  if (obsI.size() != calI.size() || obsI.size() != obsE.size()) {
    std::stringstream errss;
    errss << "GetRFactor() Input Error!  Observed Intensity (" << obsI.size()
          << "), Calculated Intensity (" << calI.size()
          << ") and Observed Error (" << obsE.size()
          << ") have different number of elements.";
    throw std::runtime_error(errss.str());
  }
  if (obsI.empty()) {
    throw std::runtime_error("getRFactor(): the input arrays are empty.");
  }

  // 2. Accumulate numerators and denominators of Rwp and Rp
  double sumnom = 0;
  double sumdenom = 0;
  double sumrpnom = 0;
  double sumrpdenom = 0;

  const size_t numpts = obsI.size();
  for (size_t i = 0; i < numpts; ++i) {
    const double cal_i = calI[i];
    const double obs_i = obsI[i];
    const double sigma = obsE[i];
    const double weight = 1.0 / (sigma * sigma);
    const double diff = obs_i - cal_i;

    // Skip points whose weight is NaN or infinite.
    if (!std::isfinite(weight))
      continue;

    sumrpnom += fabs(diff);
    sumrpdenom += fabs(obs_i);

    const double tempnom = weight * diff * diff;
    const double tempden = weight * obs_i * obs_i;

    sumnom += tempnom;
    sumdenom += tempden;

    if (std::isnan(tempnom) || std::isnan(tempden)) {
      std::cout << "***** Error! ****** Data indexed " << i << " is NaN. "
                << "i = " << i << ": cal = " << calI[i] << ", obs = " << obs_i
                << ", weight = " << weight << ". \n";
    }
  }

  // 3. Form the ratios
  Rfactor rfactor(0., 0.);
  rfactor.Rp = (sumrpnom / sumrpdenom);
  rfactor.Rwp = std::sqrt(sumnom / sumdenom);

  // Each label is printed with the sum it actually names.
  if (std::isnan(rfactor.Rwp))
    std::cout << "Rwp is NaN.  Denominator = " << sumdenom
              << "; Nominator = " << sumnom << ". \n";

  return rfactor;
}

/**
 * This will calculate the first n-moments (inclusive) about the origin. For
 * example if maxMoment=2 then this will return 3 values: 0th (total weight),
 * 1st, 2nd.
 *
 * The data is a histogram when x has one more entry than y (x holds the bin
 * boundaries) and a density when x and y have the same length. Either way the
 * moments are accumulated interval by interval, using the centre of the
 * interval as its x value. A histogram contributes y[j]; a density
 * contributes the trapezoid area between neighbouring points.
 *
 * The moments are not divided by the zeroth moment.
 *
 * @param x The independent values
 * @param y The dependent values
 * @param maxMoment The number of moments to calculate
 * @returns The first n-moments; all zero when there is no interval to
 * integrate over (including completely empty input).
 * @throws std::invalid_argument if maxMoment is negative
 * @throws std::out_of_range if the lengths of x and y describe neither a
 * histogram nor a density
 */
template <typename TYPE>
std::vector<double> getMomentsAboutOrigin(const std::vector<TYPE> &x,
                                          const std::vector<TYPE> &y,
                                          const int maxMoment) {
  // a negative count would leave no slot for even the zeroth moment
  if (maxMoment < 0) {
    std::stringstream msg;
    msg << "maxMoment (" << maxMoment << ") must not be negative";
    throw std::invalid_argument(msg.str());
  }

  // densities have the same number of x and y
  const bool isDensity(x.size() == y.size());

  // if it isn't a density then check for histogram
  if ((!isDensity) && (x.size() != y.size() + 1)) {
    std::stringstream msg;
    msg << "length of x (" << x.size() << ") and y (" << y.size()
        << ")do not match";
    throw std::out_of_range(msg.str());
  }

  // initialize a result vector with all zeros
  std::vector<double> result(static_cast<size_t>(maxMoment) + 1, 0.);

  // Number of intervals. For both layouts this is one less than the number
  // of x values; guard the subtraction so empty input gives zero intervals
  // instead of wrapping around.
  const size_t numPoints = x.empty() ? 0 : x.size() - 1;

  // densities are integrated with the trapezoid rule

  // as backwards as it sounds, the outer loop should be the points rather
  // than the moments
  for (size_t j = 0; j < numPoints; ++j) {
    // reduce item lookup - and central x for histogram
    const double xVal = .5 * static_cast<double>(x[j] + x[j + 1]);
    // this variable will be (x^n)*y
    double temp = static_cast<double>(y[j]); // correct for histogram
    if (isDensity) {
      const double xDelta = static_cast<double>(x[j + 1] - x[j]);
      temp = .5 * (temp + static_cast<double>(y[j + 1])) * xDelta;
    }

    // accumulate the moments
    result[0] += temp;
    for (size_t i = 1; i < result.size(); ++i) {
      temp *= xVal;
      result[i] += temp;
    }
  }

  return result;
}

/**
 * This will calculate the first n-moments (inclusive) about the first moment
 * about the origin. For example if maxMoment=2 then this will return 3
 * values: the 0th moment (total weight, taken from getMomentsAboutOrigin)
 * followed by the 1st and 2nd moments of (x - mean).
 *
 * The data is a histogram when x has one more entry than y and a density when
 * they have the same length; a density is integrated with the trapezoid rule.
 *
 * NOTE(review): "mean" here is the first moment about the origin exactly as
 * getMomentsAboutOrigin returns it, without dividing by the zeroth moment.
 * That presumably assumes y integrates to one - confirm against the callers.
 *
 * @param x The independent values
 * @param y The dependent values
 * @param maxMoment The number of moments to calculate
 * @returns The first n-moments; all zero for completely empty input.
 * @throws std::invalid_argument if maxMoment is negative
 */
template <typename TYPE>
std::vector<double> getMomentsAboutMean(const std::vector<TYPE> &x,
                                        const std::vector<TYPE> &y,
                                        const int maxMoment) {
  // a negative count would leave no slot for even the zeroth moment
  if (maxMoment < 0) {
    std::stringstream msg;
    msg << "maxMoment (" << maxMoment << ") must not be negative";
    throw std::invalid_argument(msg.str());
  }

  // Nothing to integrate: answer before any element would be indexed.
  if (x.empty() && y.empty())
    return std::vector<double>(static_cast<size_t>(maxMoment) + 1, 0.);

  // get the zeroth (integrated value) and first moment (mean)
  const std::vector<double> momentsAboutOrigin =
      getMomentsAboutOrigin(x, y, 1);
  const double mean = momentsAboutOrigin[1];

  // initialize a result vector with all zeros
  std::vector<double> result(static_cast<size_t>(maxMoment) + 1, 0.);
  result[0] = momentsAboutOrigin[0];

  // escape early if we need to
  if (maxMoment == 0)
    return result;

  // densities have the same number of x and y
  const bool isDensity(x.size() == y.size());

  // Number of intervals; guarded so it cannot wrap around for empty x.
  size_t numPoints = y.size();
  if (isDensity)
    numPoints = x.empty() ? 0 : x.size() - 1;

  // as backwards as it sounds, the outer loop should be the points rather
  // than the moments
  for (size_t j = 0; j < numPoints; ++j) {
    // central x in histogram with a change of variables - and just change for
    // density
    const double xVal =
        .5 * static_cast<double>(x[j] + x[j + 1]) - mean; // change of variables

    // this variable will be (x^n)*y
    double temp;
    if (isDensity) {
      const double xDelta = static_cast<double>(x[j + 1] - x[j]);
      temp = xVal * .5 * static_cast<double>(y[j] + y[j + 1]) * xDelta;
    } else {
      temp = xVal * static_cast<double>(y[j]);
    }

    // accumulate the moment
    result[1] += temp;
    for (size_t i = 2; i < result.size(); ++i) {
      temp *= xVal;
      result[i] += temp;
    }
  }

  return result;
}

// -------------------------- Macro to instantiation concrete types
// --------------------------------
#define INSTANTIATE(TYPE)                                                      \
  template MANTID_KERNEL_DLL Statistics                                        \
398
  getStatistics<TYPE>(const vector<TYPE> &, const unsigned int);               \
399
  template MANTID_KERNEL_DLL std::vector<double> getZscore<TYPE>(              \
400
      const vector<TYPE> &);                                                   \
401
402
403
404
405
406
407
408
409
410
411
  template MANTID_KERNEL_DLL std::vector<double> getModifiedZscore<TYPE>(      \
      const vector<TYPE> &, const bool);                                       \
  template MANTID_KERNEL_DLL std::vector<double> getMomentsAboutOrigin<TYPE>(  \
      const std::vector<TYPE> &x, const std::vector<TYPE> &y,                  \
      const int maxMoment);                                                    \
  template MANTID_KERNEL_DLL std::vector<double> getMomentsAboutMean<TYPE>(    \
      const std::vector<TYPE> &x, const std::vector<TYPE> &y,                  \
      const int maxMoment);

// --------------------------- Concrete instantiations
// ---------------------------------------------
412
413
414
415
416
417
418
419
INSTANTIATE(float)
INSTANTIATE(double)
INSTANTIATE(int)
INSTANTIATE(long)
INSTANTIATE(long long)
INSTANTIATE(unsigned int)
INSTANTIATE(unsigned long)
INSTANTIATE(unsigned long long)
420
421

} // namespace Kernel
422
} // namespace Mantid