Loading source/adios2/helper/adiosMath.inl +316 −64 Original line number Diff line number Diff line Loading @@ -4,7 +4,6 @@ * SPDX-License-Identifier: Apache-2.0 */ #ifndef ADIOS2_HELPER_ADIOSMATH_INL_ #define ADIOS2_HELPER_ADIOSMATH_INL_ #ifndef ADIOS2_HELPER_ADIOSMATH_H_ Loading @@ -16,6 +15,12 @@ #include <limits> //std::numeri_limits #include <thread> #ifdef __aarch64__ #include <arm_neon.h> #elif defined(__x86_64__) || defined(_M_X64) #include <immintrin.h> #endif #include "adios2/common/ADIOSMacros.h" #include "adiosLog.h" Loading @@ -25,13 +30,11 @@ namespace helper { template <class T> void GetMinMaxSelection(const T *values, const Dims &shape, const Dims &start, const Dims &count, const bool isRowMajor, T &min, T &max, const MemorySpace memSpace) noexcept void GetMinMaxSelection(const T *values, const Dims &shape, const Dims &start, const Dims &count, const bool isRowMajor, T &min, T &max, const MemorySpace memSpace) noexcept { auto lf_MinMaxRowMajor = [](const T *values, const Dims &shape, const Dims &start, const Dims &count, T &min, T &max, const MemorySpace memSpace) { auto lf_MinMaxRowMajor = [](const T *values, const Dims &shape, const Dims &start, const Dims &count, T &min, T &max, const MemorySpace memSpace) { // loop through selection box contiguous part const size_t dimensions = shape.size(); const size_t stride = count.back(); Loading @@ -44,12 +47,11 @@ void GetMinMaxSelection(const T *values, const Dims &shape, const Dims &start, while (run) { // here copy current linear memory between currentPoint and end const size_t startOffset = helper::LinearIndex( Dims(shape.size(), 0), shape, currentPoint, true); const size_t startOffset = helper::LinearIndex(Dims(shape.size(), 0), shape, currentPoint, true); T minStride, maxStride; GetMinMax(values + startOffset, stride, minStride, maxStride, memSpace); GetMinMax(values + startOffset, stride, minStride, maxStride, memSpace); if (firstStep) { Loading Loading @@ -95,9 +97,8 @@ void GetMinMaxSelection(const T *values, const Dims &shape, const Dims &start, } // end while stride loop }; auto lf_MinMaxColumnMajor = [](const T *values, const Dims &shape, const Dims &start, const Dims &count, T &min, T &max, const MemorySpace memSpace) { auto lf_MinMaxColumnMajor = [](const T *values, const Dims &shape, const Dims &start, const Dims &count, T &min, T &max, const MemorySpace memSpace) { // loop through selection box contiguous part const size_t dimensions = shape.size(); const size_t stride = count.front(); Loading @@ -110,12 +111,11 @@ void GetMinMaxSelection(const T *values, const Dims &shape, const Dims &start, while (run) { // here copy current linear memory between currentPoint and end const size_t startOffset = helper::LinearIndex( Dims(shape.size(), 0), shape, currentPoint, false); const size_t startOffset = helper::LinearIndex(Dims(shape.size(), 0), shape, currentPoint, false); T minStride, maxStride; GetMinMax(values + startOffset, stride, minStride, maxStride, memSpace); GetMinMax(values + startOffset, stride, minStride, maxStride, memSpace); if (firstStep) { Loading Loading @@ -165,8 +165,7 @@ void GetMinMaxSelection(const T *values, const Dims &shape, const Dims &start, if (shape.size() == 1) { const size_t startOffset = helper::LinearIndex(Dims(1, 0), shape, start, isRowMajor); const size_t startOffset = helper::LinearIndex(Dims(1, 0), shape, start, isRowMajor); const size_t totalSize = helper::GetTotalSize(count); GetMinMax(values + startOffset, totalSize, min, max, memSpace); return; Loading Loading @@ -198,6 +197,266 @@ inline void GetMinMax(const T *values, const size_t size, T &min, T &max, max = *bounds.second; } // SIMD specializations for min/max. std::minmax_element cannot be vectorized because it // tracks element positions. These value-only reductions use platform SIMD intrinsics and // run at memory bandwidth. When porting to a new architecture (e.g. Power/VSX, RISC-V V), // add a specialization here with the appropriate intrinsics; the #else scalar fallback // will be used otherwise. // // Covered types: float, double, int8_t, uint8_t, int16_t, uint16_t, int32_t, uint32_t. // Not covered (use scalar fallback): int64_t, uint64_t (no SIMD min/max before AVX-512), // long double (80-bit, no SIMD), complex (ordering by std::norm). // Helper: scalar min/max loop used by SIMD tail and scalar-only fallback. template <class T> inline void GetMinMaxScalar(const T *values, size_t start, const size_t size, T &min, T &max) noexcept { for (size_t i = start; i < size; ++i) { if (values[i] < min) min = values[i]; if (values[i] > max) max = values[i]; } } #ifdef __aarch64__ // clang-format off // NEON macro: generates a GetMinMax specialization for an integer type. // VTYPE: NEON vector type, LOAD: load intrinsic, MIN/MAX: min/max intrinsics, // DUP: broadcast intrinsic, REDUCE_MIN/MAX: horizontal reduce intrinsics, // COUNT: elements per vector pair iteration. #define ADIOS2_NEON_MINMAX_INT(CTYPE, VTYPE, LOAD, MIN, MAX, DUP, REDUCE_MIN, REDUCE_MAX, COUNT) \ template <> \ inline void GetMinMax(const CTYPE *values, const size_t size, CTYPE &min, CTYPE &max, \ const MemorySpace) noexcept \ { \ if (size == 0) return; \ size_t i = 0; \ VTYPE vmin = DUP(values[0]); \ VTYPE vmax = DUP(values[0]); \ VTYPE vmin2 = vmin, vmax2 = vmax; \ const size_t endN = size - (size % (COUNT)); \ for (; i < endN; i += (COUNT)) \ { \ VTYPE a = LOAD(values + i); \ VTYPE b = LOAD(values + i + (COUNT) / 2); \ vmin = MIN(vmin, a); vmax = MAX(vmax, a); \ vmin2 = MIN(vmin2, b); vmax2 = MAX(vmax2, b); \ } \ vmin = MIN(vmin, vmin2); vmax = MAX(vmax, vmax2); \ min = REDUCE_MIN(vmin); max = REDUCE_MAX(vmax); \ GetMinMaxScalar(values, i, size, min, max); \ } ADIOS2_NEON_MINMAX_INT(int8_t, int8x16_t, vld1q_s8, vminq_s8, vmaxq_s8, vdupq_n_s8, vminvq_s8, vmaxvq_s8, 32) ADIOS2_NEON_MINMAX_INT(uint8_t, uint8x16_t, vld1q_u8, vminq_u8, vmaxq_u8, vdupq_n_u8, vminvq_u8, vmaxvq_u8, 32) ADIOS2_NEON_MINMAX_INT(int16_t, int16x8_t, vld1q_s16, vminq_s16, vmaxq_s16, vdupq_n_s16, vminvq_s16, vmaxvq_s16, 16) ADIOS2_NEON_MINMAX_INT(uint16_t, uint16x8_t, vld1q_u16, vminq_u16, vmaxq_u16, vdupq_n_u16, vminvq_u16, vmaxvq_u16, 16) ADIOS2_NEON_MINMAX_INT(int32_t, int32x4_t, vld1q_s32, vminq_s32, vmaxq_s32, vdupq_n_s32, vminvq_s32, vmaxvq_s32, 8) ADIOS2_NEON_MINMAX_INT(uint32_t, uint32x4_t, vld1q_u32, vminq_u32, vmaxq_u32, vdupq_n_u32, vminvq_u32, vmaxvq_u32, 8) #undef ADIOS2_NEON_MINMAX_INT // clang-format on #elif defined(__x86_64__) || defined(_M_X64) // SSE2 integer min/max via compare+select (blendv requires SSE4.1, so we use and/andnot/or). // mask = cmpgt(a,b) selects: min = (b & mask) | (a & ~mask), max = (a & mask) | (b & ~mask) #define ADIOS2_SSE2_MINMAX_INT(CTYPE, VTYPE, LOAD, STORE, SET1, CMPGT, AND, ANDNOT, OR, COUNT) \ template <> \ inline void GetMinMax(const CTYPE *values, const size_t size, CTYPE &min, CTYPE &max, \ const MemorySpace) noexcept \ { \ if (size == 0) \ return; \ size_t i = 0; \ VTYPE vmin = SET1(values[0]); \ VTYPE vmax = SET1(values[0]); \ const size_t endN = size - (size % (COUNT)); \ for (; i < endN; i += (COUNT)) \ { \ VTYPE v = LOAD(reinterpret_cast<const __m128i *>(values + i)); \ VTYPE gt = CMPGT(v, vmax); \ vmax = OR(AND(v, gt), ANDNOT(gt, vmax)); \ VTYPE lt = CMPGT(vmin, v); \ vmin = OR(AND(v, lt), ANDNOT(lt, vmin)); \ } \ CTYPE tmin[16 / sizeof(CTYPE)], tmax[16 / sizeof(CTYPE)]; \ STORE(reinterpret_cast<__m128i *>(tmin), vmin); \ STORE(reinterpret_cast<__m128i *>(tmax), vmax); \ min = tmin[0]; \ max = tmax[0]; \ for (int j = 1; j < (int)(16 / sizeof(CTYPE)); ++j) \ { \ if (tmin[j] < min) \ min = tmin[j]; \ if (tmax[j] > max) \ max = tmax[j]; \ } \ GetMinMaxScalar(values, i, size, min, max); \ } // Signed types use _mm_cmpgt_epiN directly. ADIOS2_SSE2_MINMAX_INT(int8_t, __m128i, _mm_loadu_si128, _mm_storeu_si128, _mm_set1_epi8, _mm_cmpgt_epi8, _mm_and_si128, _mm_andnot_si128, _mm_or_si128, 16) ADIOS2_SSE2_MINMAX_INT(int16_t, __m128i, _mm_loadu_si128, _mm_storeu_si128, _mm_set1_epi16, _mm_cmpgt_epi16, _mm_and_si128, _mm_andnot_si128, _mm_or_si128, 8) ADIOS2_SSE2_MINMAX_INT(int32_t, __m128i, _mm_loadu_si128, _mm_storeu_si128, _mm_set1_epi32, _mm_cmpgt_epi32, _mm_and_si128, _mm_andnot_si128, _mm_or_si128, 4) // Unsigned types: SSE2 only has signed cmpgt. XOR with sign bit to convert unsigned // comparison to signed, then use signed cmpgt. namespace detail { inline __m128i unsigned_cmpgt_epi8(__m128i a, __m128i b) { const __m128i bias = _mm_set1_epi8(static_cast<char>(0x80)); return _mm_cmpgt_epi8(_mm_xor_si128(a, bias), _mm_xor_si128(b, bias)); } inline __m128i unsigned_cmpgt_epi16(__m128i a, __m128i b) { const __m128i bias = _mm_set1_epi16(static_cast<short>(0x8000)); return _mm_cmpgt_epi16(_mm_xor_si128(a, bias), _mm_xor_si128(b, bias)); } inline __m128i unsigned_cmpgt_epi32(__m128i a, __m128i b) { const __m128i bias = _mm_set1_epi32(static_cast<int>(0x80000000)); return _mm_cmpgt_epi32(_mm_xor_si128(a, bias), _mm_xor_si128(b, bias)); } } // namespace detail ADIOS2_SSE2_MINMAX_INT(uint8_t, __m128i, _mm_loadu_si128, _mm_storeu_si128, _mm_set1_epi8, detail::unsigned_cmpgt_epi8, _mm_and_si128, _mm_andnot_si128, _mm_or_si128, 16) ADIOS2_SSE2_MINMAX_INT(uint16_t, __m128i, _mm_loadu_si128, _mm_storeu_si128, _mm_set1_epi16, detail::unsigned_cmpgt_epi16, _mm_and_si128, _mm_andnot_si128, _mm_or_si128, 8) ADIOS2_SSE2_MINMAX_INT(uint32_t, __m128i, _mm_loadu_si128, _mm_storeu_si128, _mm_set1_epi32, detail::unsigned_cmpgt_epi32, _mm_and_si128, _mm_andnot_si128, _mm_or_si128, 4) #undef ADIOS2_SSE2_MINMAX_INT #endif template <> inline void GetMinMax(const double *values, const size_t size, double &min, double &max, const MemorySpace memSpace) noexcept { if (size == 0) return; #ifdef ADIOS2_HAVE_GPU_SUPPORT if (memSpace == MemorySpace::GPU) { GetGPUMinMax(values, size, min, max); return; } #endif size_t i = 0; #ifdef __aarch64__ float64x2_t vmin = vdupq_n_f64(values[0]); float64x2_t vmax = vdupq_n_f64(values[0]); float64x2_t vmin2 = vmin, vmax2 = vmax; const size_t end4 = size - (size % 4); for (; i < end4; i += 4) { float64x2_t a = vld1q_f64(values + i); float64x2_t b = vld1q_f64(values + i + 2); vmin = vminq_f64(vmin, a); vmax = vmaxq_f64(vmax, a); vmin2 = vminq_f64(vmin2, b); vmax2 = vmaxq_f64(vmax2, b); } vmin = vminq_f64(vmin, vmin2); vmax = vmaxq_f64(vmax, vmax2); min = vminvq_f64(vmin); max = vmaxvq_f64(vmax); #elif defined(__x86_64__) || defined(_M_X64) __m128d vmin = _mm_set1_pd(values[0]); __m128d vmax = _mm_set1_pd(values[0]); __m128d vmin2 = vmin, vmax2 = vmax; const size_t end4 = size - (size % 4); for (; i < end4; i += 4) { __m128d a = _mm_loadu_pd(values + i); __m128d b = _mm_loadu_pd(values + i + 2); vmin = _mm_min_pd(vmin, a); vmax = _mm_max_pd(vmax, a); vmin2 = _mm_min_pd(vmin2, b); vmax2 = _mm_max_pd(vmax2, b); } vmin = _mm_min_pd(vmin, vmin2); vmax = _mm_max_pd(vmax, vmax2); double tmin[2], tmax[2]; _mm_storeu_pd(tmin, vmin); _mm_storeu_pd(tmax, vmax); min = (std::min)(tmin[0], tmin[1]); max = (std::max)(tmax[0], tmax[1]); #else min = values[0]; max = values[0]; #endif GetMinMaxScalar(values, i, size, min, max); } template <> inline void GetMinMax(const float *values, const size_t size, float &min, float &max, const MemorySpace memSpace) noexcept { if (size == 0) return; #ifdef ADIOS2_HAVE_GPU_SUPPORT if (memSpace == MemorySpace::GPU) { GetGPUMinMax(values, size, min, max); return; } #endif size_t i = 0; #ifdef __aarch64__ float32x4_t vmin = vdupq_n_f32(values[0]); float32x4_t vmax = vdupq_n_f32(values[0]); float32x4_t vmin2 = vmin, vmax2 = vmax; const size_t end8 = size - (size % 8); for (; i < end8; i += 8) { float32x4_t a = vld1q_f32(values + i); float32x4_t b = vld1q_f32(values + i + 4); vmin = vminq_f32(vmin, a); vmax = vmaxq_f32(vmax, a); vmin2 = vminq_f32(vmin2, b); vmax2 = vmaxq_f32(vmax2, b); } vmin = vminq_f32(vmin, vmin2); vmax = vmaxq_f32(vmax, vmax2); min = vminvq_f32(vmin); max = vmaxvq_f32(vmax); #elif defined(__x86_64__) || defined(_M_X64) __m128 vmin = _mm_set1_ps(values[0]); __m128 vmax = _mm_set1_ps(values[0]); __m128 vmin2 = vmin, vmax2 = vmax; const size_t end8 = size - (size % 8); for (; i < end8; i += 8) { __m128 a = _mm_loadu_ps(values + i); __m128 b = _mm_loadu_ps(values + i + 4); vmin = _mm_min_ps(vmin, a); vmax = _mm_max_ps(vmax, a); vmin2 = _mm_min_ps(vmin2, b); vmax2 = _mm_max_ps(vmax2, b); } vmin = _mm_min_ps(vmin, vmin2); vmax = _mm_max_ps(vmax, vmax2); float tmin[4], tmax[4]; _mm_storeu_ps(tmin, vmin); _mm_storeu_ps(tmax, vmax); min = (std::min)({tmin[0], tmin[1], tmin[2], tmin[3]}); max = (std::max)({tmax[0], tmax[1], tmax[2], tmax[3]}); #else min = values[0]; max = values[0]; #endif GetMinMaxScalar(values, i, size, min, max); } template <> inline void GetMinMax(const std::complex<float> *values, const size_t size, std::complex<float> &min, std::complex<float> &max, Loading Loading @@ -233,8 +492,8 @@ inline void GetMinMax(const std::complex<double> *values, const size_t size, } template <class T> void GetMinMaxComplex(const std::complex<T> *values, const size_t size, std::complex<T> &min, std::complex<T> &max) noexcept void GetMinMaxComplex(const std::complex<T> *values, const size_t size, std::complex<T> &min, std::complex<T> &max) noexcept { min = values[0]; max = values[0]; Loading Loading @@ -263,8 +522,7 @@ void GetMinMaxComplex(const std::complex<T> *values, const size_t size, template <class T> void GetMinMaxThreads(const T *values, const size_t size, T &min, T &max, const unsigned int threads, const MemorySpace memSpace) noexcept const unsigned int threads, const MemorySpace memSpace) noexcept { if (size == 0) { Loading Loading @@ -293,14 +551,12 @@ void GetMinMaxThreads(const T *values, const size_t size, T &min, T &max, if (t == threads - 1) { getMinMaxThreads.push_back( std::thread(GetMinMax<T>, &values[position], last, getMinMaxThreads.push_back(std::thread(GetMinMax<T>, &values[position], last, std::ref(mins[t]), std::ref(maxs[t]), memSpace)); } else { getMinMaxThreads.push_back( std::thread(GetMinMax<T>, &values[position], stride, getMinMaxThreads.push_back(std::thread(GetMinMax<T>, &values[position], stride, std::ref(mins[t]), std::ref(maxs[t]), memSpace)); } } Loading @@ -318,9 +574,9 @@ void GetMinMaxThreads(const T *values, const size_t size, T &min, T &max, } template <class T> void GetMinMaxThreads(const std::complex<T> *values, const size_t size, std::complex<T> &min, std::complex<T> &max, const unsigned int threads, MemorySpace memSpace) noexcept void GetMinMaxThreads(const std::complex<T> *values, const size_t size, std::complex<T> &min, std::complex<T> &max, const unsigned int threads, MemorySpace memSpace) noexcept { #ifdef ADIOS2_HAVE_GPU_SUPPORT if (memSpace == MemorySpace::GPU) Loading Loading @@ -359,14 +615,12 @@ void GetMinMaxThreads(const std::complex<T> *values, const size_t size, if (t == threads - 1) { getMinMaxThreads.push_back( std::thread(GetMinMaxComplex<T>, &values[position], last, getMinMaxThreads.push_back(std::thread(GetMinMaxComplex<T>, &values[position], last, std::ref(mins[t]), std::ref(maxs[t]))); } else { getMinMaxThreads.push_back( std::thread(GetMinMaxComplex<T>, &values[position], stride, getMinMaxThreads.push_back(std::thread(GetMinMaxComplex<T>, &values[position], stride, std::ref(mins[t]), std::ref(maxs[t]))); } } Loading @@ -384,9 +638,8 @@ void GetMinMaxThreads(const std::complex<T> *values, const size_t size, } template <class T> void GetMinMaxSubblocks(const T *values, const Dims &count, const BlockDivisionInfo &info, std::vector<T> &MinMaxs, T &bmin, T &bmax, const unsigned int threads, void GetMinMaxSubblocks(const T *values, const Dims &count, const BlockDivisionInfo &info, std::vector<T> &MinMaxs, T &bmin, T &bmax, const unsigned int threads, const MemorySpace memSpace) noexcept { const int ndim = static_cast<int>(count.size()); Loading Loading @@ -560,8 +813,8 @@ void GetMinMaxSubblocks(const T *values, const Dims &count, #define declare_template_instantiation(T) \ template <> \ inline bool LessThan<std::complex<T>>( \ const std::complex<T> input1, const std::complex<T> input2) noexcept \ inline bool LessThan<std::complex<T>>(const std::complex<T> input1, \ const std::complex<T> input2) noexcept \ { \ if (std::norm(input1) < std::norm(input2)) \ { \ Loading @@ -585,8 +838,8 @@ constexpr bool LessThan(const T input1, const T input2) noexcept #define declare_template_instantiation(T) \ template <> \ inline bool GreaterThan<std::complex<T>>( \ const std::complex<T> input1, const std::complex<T> input2) noexcept \ inline bool GreaterThan<std::complex<T>>(const std::complex<T> input1, \ const std::complex<T> input2) noexcept \ { \ if (std::norm(input1) > std::norm(input2)) \ { \ Loading Loading @@ -633,8 +886,7 @@ std::vector<T> VectorsOp(BinaryOperation op, const std::vector<T> &vector1, const std::vector<T> &vector2) noexcept { std::vector<T> result(vector1.size()); std::transform(vector1.begin(), vector1.end(), vector2.begin(), result.begin(), op); std::transform(vector1.begin(), vector1.end(), vector2.begin(), result.begin(), op); return result; } Loading Loading
source/adios2/helper/adiosMath.inl +316 −64 Original line number Diff line number Diff line Loading @@ -4,7 +4,6 @@ * SPDX-License-Identifier: Apache-2.0 */ #ifndef ADIOS2_HELPER_ADIOSMATH_INL_ #define ADIOS2_HELPER_ADIOSMATH_INL_ #ifndef ADIOS2_HELPER_ADIOSMATH_H_ Loading @@ -16,6 +15,12 @@ #include <limits> //std::numeri_limits #include <thread> #ifdef __aarch64__ #include <arm_neon.h> #elif defined(__x86_64__) || defined(_M_X64) #include <immintrin.h> #endif #include "adios2/common/ADIOSMacros.h" #include "adiosLog.h" Loading @@ -25,13 +30,11 @@ namespace helper { template <class T> void GetMinMaxSelection(const T *values, const Dims &shape, const Dims &start, const Dims &count, const bool isRowMajor, T &min, T &max, const MemorySpace memSpace) noexcept void GetMinMaxSelection(const T *values, const Dims &shape, const Dims &start, const Dims &count, const bool isRowMajor, T &min, T &max, const MemorySpace memSpace) noexcept { auto lf_MinMaxRowMajor = [](const T *values, const Dims &shape, const Dims &start, const Dims &count, T &min, T &max, const MemorySpace memSpace) { auto lf_MinMaxRowMajor = [](const T *values, const Dims &shape, const Dims &start, const Dims &count, T &min, T &max, const MemorySpace memSpace) { // loop through selection box contiguous part const size_t dimensions = shape.size(); const size_t stride = count.back(); Loading @@ -44,12 +47,11 @@ void GetMinMaxSelection(const T *values, const Dims &shape, const Dims &start, while (run) { // here copy current linear memory between currentPoint and end const size_t startOffset = helper::LinearIndex( Dims(shape.size(), 0), shape, currentPoint, true); const size_t startOffset = helper::LinearIndex(Dims(shape.size(), 0), shape, currentPoint, true); T minStride, maxStride; GetMinMax(values + startOffset, stride, minStride, maxStride, memSpace); GetMinMax(values + startOffset, stride, minStride, maxStride, memSpace); if (firstStep) { Loading Loading @@ -95,9 +97,8 @@ void GetMinMaxSelection(const T *values, const Dims &shape, const Dims &start, } // end while stride loop }; auto lf_MinMaxColumnMajor = [](const T *values, const Dims &shape, const Dims &start, const Dims &count, T &min, T &max, const MemorySpace memSpace) { auto lf_MinMaxColumnMajor = [](const T *values, const Dims &shape, const Dims &start, const Dims &count, T &min, T &max, const MemorySpace memSpace) { // loop through selection box contiguous part const size_t dimensions = shape.size(); const size_t stride = count.front(); Loading @@ -110,12 +111,11 @@ void GetMinMaxSelection(const T *values, const Dims &shape, const Dims &start, while (run) { // here copy current linear memory between currentPoint and end const size_t startOffset = helper::LinearIndex( Dims(shape.size(), 0), shape, currentPoint, false); const size_t startOffset = helper::LinearIndex(Dims(shape.size(), 0), shape, currentPoint, false); T minStride, maxStride; GetMinMax(values + startOffset, stride, minStride, maxStride, memSpace); GetMinMax(values + startOffset, stride, minStride, maxStride, memSpace); if (firstStep) { Loading Loading @@ -165,8 +165,7 @@ void GetMinMaxSelection(const T *values, const Dims &shape, const Dims &start, if (shape.size() == 1) { const size_t startOffset = helper::LinearIndex(Dims(1, 0), shape, start, isRowMajor); const size_t startOffset = helper::LinearIndex(Dims(1, 0), shape, start, isRowMajor); const size_t totalSize = helper::GetTotalSize(count); GetMinMax(values + startOffset, totalSize, min, max, memSpace); return; Loading Loading @@ -198,6 +197,266 @@ inline void GetMinMax(const T *values, const size_t size, T &min, T &max, max = *bounds.second; } // SIMD specializations for min/max. std::minmax_element cannot be vectorized because it // tracks element positions. These value-only reductions use platform SIMD intrinsics and // run at memory bandwidth. When porting to a new architecture (e.g. Power/VSX, RISC-V V), // add a specialization here with the appropriate intrinsics; the #else scalar fallback // will be used otherwise. // // Covered types: float, double, int8_t, uint8_t, int16_t, uint16_t, int32_t, uint32_t. // Not covered (use scalar fallback): int64_t, uint64_t (no SIMD min/max before AVX-512), // long double (80-bit, no SIMD), complex (ordering by std::norm). // Helper: scalar min/max loop used by SIMD tail and scalar-only fallback. template <class T> inline void GetMinMaxScalar(const T *values, size_t start, const size_t size, T &min, T &max) noexcept { for (size_t i = start; i < size; ++i) { if (values[i] < min) min = values[i]; if (values[i] > max) max = values[i]; } } #ifdef __aarch64__ // clang-format off // NEON macro: generates a GetMinMax specialization for an integer type. // VTYPE: NEON vector type, LOAD: load intrinsic, MIN/MAX: min/max intrinsics, // DUP: broadcast intrinsic, REDUCE_MIN/MAX: horizontal reduce intrinsics, // COUNT: elements per vector pair iteration. #define ADIOS2_NEON_MINMAX_INT(CTYPE, VTYPE, LOAD, MIN, MAX, DUP, REDUCE_MIN, REDUCE_MAX, COUNT) \ template <> \ inline void GetMinMax(const CTYPE *values, const size_t size, CTYPE &min, CTYPE &max, \ const MemorySpace) noexcept \ { \ if (size == 0) return; \ size_t i = 0; \ VTYPE vmin = DUP(values[0]); \ VTYPE vmax = DUP(values[0]); \ VTYPE vmin2 = vmin, vmax2 = vmax; \ const size_t endN = size - (size % (COUNT)); \ for (; i < endN; i += (COUNT)) \ { \ VTYPE a = LOAD(values + i); \ VTYPE b = LOAD(values + i + (COUNT) / 2); \ vmin = MIN(vmin, a); vmax = MAX(vmax, a); \ vmin2 = MIN(vmin2, b); vmax2 = MAX(vmax2, b); \ } \ vmin = MIN(vmin, vmin2); vmax = MAX(vmax, vmax2); \ min = REDUCE_MIN(vmin); max = REDUCE_MAX(vmax); \ GetMinMaxScalar(values, i, size, min, max); \ } ADIOS2_NEON_MINMAX_INT(int8_t, int8x16_t, vld1q_s8, vminq_s8, vmaxq_s8, vdupq_n_s8, vminvq_s8, vmaxvq_s8, 32) ADIOS2_NEON_MINMAX_INT(uint8_t, uint8x16_t, vld1q_u8, vminq_u8, vmaxq_u8, vdupq_n_u8, vminvq_u8, vmaxvq_u8, 32) ADIOS2_NEON_MINMAX_INT(int16_t, int16x8_t, vld1q_s16, vminq_s16, vmaxq_s16, vdupq_n_s16, vminvq_s16, vmaxvq_s16, 16) ADIOS2_NEON_MINMAX_INT(uint16_t, uint16x8_t, vld1q_u16, vminq_u16, vmaxq_u16, vdupq_n_u16, vminvq_u16, vmaxvq_u16, 16) ADIOS2_NEON_MINMAX_INT(int32_t, int32x4_t, vld1q_s32, vminq_s32, vmaxq_s32, vdupq_n_s32, vminvq_s32, vmaxvq_s32, 8) ADIOS2_NEON_MINMAX_INT(uint32_t, uint32x4_t, vld1q_u32, vminq_u32, vmaxq_u32, vdupq_n_u32, vminvq_u32, vmaxvq_u32, 8) #undef ADIOS2_NEON_MINMAX_INT // clang-format on #elif defined(__x86_64__) || defined(_M_X64) // SSE2 integer min/max via compare+select (blendv requires SSE4.1, so we use and/andnot/or). // mask = cmpgt(a,b) selects: min = (b & mask) | (a & ~mask), max = (a & mask) | (b & ~mask) #define ADIOS2_SSE2_MINMAX_INT(CTYPE, VTYPE, LOAD, STORE, SET1, CMPGT, AND, ANDNOT, OR, COUNT) \ template <> \ inline void GetMinMax(const CTYPE *values, const size_t size, CTYPE &min, CTYPE &max, \ const MemorySpace) noexcept \ { \ if (size == 0) \ return; \ size_t i = 0; \ VTYPE vmin = SET1(values[0]); \ VTYPE vmax = SET1(values[0]); \ const size_t endN = size - (size % (COUNT)); \ for (; i < endN; i += (COUNT)) \ { \ VTYPE v = LOAD(reinterpret_cast<const __m128i *>(values + i)); \ VTYPE gt = CMPGT(v, vmax); \ vmax = OR(AND(v, gt), ANDNOT(gt, vmax)); \ VTYPE lt = CMPGT(vmin, v); \ vmin = OR(AND(v, lt), ANDNOT(lt, vmin)); \ } \ CTYPE tmin[16 / sizeof(CTYPE)], tmax[16 / sizeof(CTYPE)]; \ STORE(reinterpret_cast<__m128i *>(tmin), vmin); \ STORE(reinterpret_cast<__m128i *>(tmax), vmax); \ min = tmin[0]; \ max = tmax[0]; \ for (int j = 1; j < (int)(16 / sizeof(CTYPE)); ++j) \ { \ if (tmin[j] < min) \ min = tmin[j]; \ if (tmax[j] > max) \ max = tmax[j]; \ } \ GetMinMaxScalar(values, i, size, min, max); \ } // Signed types use _mm_cmpgt_epiN directly. ADIOS2_SSE2_MINMAX_INT(int8_t, __m128i, _mm_loadu_si128, _mm_storeu_si128, _mm_set1_epi8, _mm_cmpgt_epi8, _mm_and_si128, _mm_andnot_si128, _mm_or_si128, 16) ADIOS2_SSE2_MINMAX_INT(int16_t, __m128i, _mm_loadu_si128, _mm_storeu_si128, _mm_set1_epi16, _mm_cmpgt_epi16, _mm_and_si128, _mm_andnot_si128, _mm_or_si128, 8) ADIOS2_SSE2_MINMAX_INT(int32_t, __m128i, _mm_loadu_si128, _mm_storeu_si128, _mm_set1_epi32, _mm_cmpgt_epi32, _mm_and_si128, _mm_andnot_si128, _mm_or_si128, 4) // Unsigned types: SSE2 only has signed cmpgt. XOR with sign bit to convert unsigned // comparison to signed, then use signed cmpgt. namespace detail { inline __m128i unsigned_cmpgt_epi8(__m128i a, __m128i b) { const __m128i bias = _mm_set1_epi8(static_cast<char>(0x80)); return _mm_cmpgt_epi8(_mm_xor_si128(a, bias), _mm_xor_si128(b, bias)); } inline __m128i unsigned_cmpgt_epi16(__m128i a, __m128i b) { const __m128i bias = _mm_set1_epi16(static_cast<short>(0x8000)); return _mm_cmpgt_epi16(_mm_xor_si128(a, bias), _mm_xor_si128(b, bias)); } inline __m128i unsigned_cmpgt_epi32(__m128i a, __m128i b) { const __m128i bias = _mm_set1_epi32(static_cast<int>(0x80000000)); return _mm_cmpgt_epi32(_mm_xor_si128(a, bias), _mm_xor_si128(b, bias)); } } // namespace detail ADIOS2_SSE2_MINMAX_INT(uint8_t, __m128i, _mm_loadu_si128, _mm_storeu_si128, _mm_set1_epi8, detail::unsigned_cmpgt_epi8, _mm_and_si128, _mm_andnot_si128, _mm_or_si128, 16) ADIOS2_SSE2_MINMAX_INT(uint16_t, __m128i, _mm_loadu_si128, _mm_storeu_si128, _mm_set1_epi16, detail::unsigned_cmpgt_epi16, _mm_and_si128, _mm_andnot_si128, _mm_or_si128, 8) ADIOS2_SSE2_MINMAX_INT(uint32_t, __m128i, _mm_loadu_si128, _mm_storeu_si128, _mm_set1_epi32, detail::unsigned_cmpgt_epi32, _mm_and_si128, _mm_andnot_si128, _mm_or_si128, 4) #undef ADIOS2_SSE2_MINMAX_INT #endif template <> inline void GetMinMax(const double *values, const size_t size, double &min, double &max, const MemorySpace memSpace) noexcept { if (size == 0) return; #ifdef ADIOS2_HAVE_GPU_SUPPORT if (memSpace == MemorySpace::GPU) { GetGPUMinMax(values, size, min, max); return; } #endif size_t i = 0; #ifdef __aarch64__ float64x2_t vmin = vdupq_n_f64(values[0]); float64x2_t vmax = vdupq_n_f64(values[0]); float64x2_t vmin2 = vmin, vmax2 = vmax; const size_t end4 = size - (size % 4); for (; i < end4; i += 4) { float64x2_t a = vld1q_f64(values + i); float64x2_t b = vld1q_f64(values + i + 2); vmin = vminq_f64(vmin, a); vmax = vmaxq_f64(vmax, a); vmin2 = vminq_f64(vmin2, b); vmax2 = vmaxq_f64(vmax2, b); } vmin = vminq_f64(vmin, vmin2); vmax = vmaxq_f64(vmax, vmax2); min = vminvq_f64(vmin); max = vmaxvq_f64(vmax); #elif defined(__x86_64__) || defined(_M_X64) __m128d vmin = _mm_set1_pd(values[0]); __m128d vmax = _mm_set1_pd(values[0]); __m128d vmin2 = vmin, vmax2 = vmax; const size_t end4 = size - (size % 4); for (; i < end4; i += 4) { __m128d a = _mm_loadu_pd(values + i); __m128d b = _mm_loadu_pd(values + i + 2); vmin = _mm_min_pd(vmin, a); vmax = _mm_max_pd(vmax, a); vmin2 = _mm_min_pd(vmin2, b); vmax2 = _mm_max_pd(vmax2, b); } vmin = _mm_min_pd(vmin, vmin2); vmax = _mm_max_pd(vmax, vmax2); double tmin[2], tmax[2]; _mm_storeu_pd(tmin, vmin); _mm_storeu_pd(tmax, vmax); min = (std::min)(tmin[0], tmin[1]); max = (std::max)(tmax[0], tmax[1]); #else min = values[0]; max = values[0]; #endif GetMinMaxScalar(values, i, size, min, max); } template <> inline void GetMinMax(const float *values, const size_t size, float &min, float &max, const MemorySpace memSpace) noexcept { if (size == 0) return; #ifdef ADIOS2_HAVE_GPU_SUPPORT if (memSpace == MemorySpace::GPU) { GetGPUMinMax(values, size, min, max); return; } #endif size_t i = 0; #ifdef __aarch64__ float32x4_t vmin = vdupq_n_f32(values[0]); float32x4_t vmax = vdupq_n_f32(values[0]); float32x4_t vmin2 = vmin, vmax2 = vmax; const size_t end8 = size - (size % 8); for (; i < end8; i += 8) { float32x4_t a = vld1q_f32(values + i); float32x4_t b = vld1q_f32(values + i + 4); vmin = vminq_f32(vmin, a); vmax = vmaxq_f32(vmax, a); vmin2 = vminq_f32(vmin2, b); vmax2 = vmaxq_f32(vmax2, b); } vmin = vminq_f32(vmin, vmin2); vmax = vmaxq_f32(vmax, vmax2); min = vminvq_f32(vmin); max = vmaxvq_f32(vmax); #elif defined(__x86_64__) || defined(_M_X64) __m128 vmin = _mm_set1_ps(values[0]); __m128 vmax = _mm_set1_ps(values[0]); __m128 vmin2 = vmin, vmax2 = vmax; const size_t end8 = size - (size % 8); for (; i < end8; i += 8) { __m128 a = _mm_loadu_ps(values + i); __m128 b = _mm_loadu_ps(values + i + 4); vmin = _mm_min_ps(vmin, a); vmax = _mm_max_ps(vmax, a); vmin2 = _mm_min_ps(vmin2, b); vmax2 = _mm_max_ps(vmax2, b); } vmin = _mm_min_ps(vmin, vmin2); vmax = _mm_max_ps(vmax, vmax2); float tmin[4], tmax[4]; _mm_storeu_ps(tmin, vmin); _mm_storeu_ps(tmax, vmax); min = (std::min)({tmin[0], tmin[1], tmin[2], tmin[3]}); max = (std::max)({tmax[0], tmax[1], tmax[2], tmax[3]}); #else min = values[0]; max = values[0]; #endif GetMinMaxScalar(values, i, size, min, max); } template <> inline void GetMinMax(const std::complex<float> *values, const size_t size, std::complex<float> &min, std::complex<float> &max, Loading Loading @@ -233,8 +492,8 @@ inline void GetMinMax(const std::complex<double> *values, const size_t size, } template <class T> void GetMinMaxComplex(const std::complex<T> *values, const size_t size, std::complex<T> &min, std::complex<T> &max) noexcept void GetMinMaxComplex(const std::complex<T> *values, const size_t size, std::complex<T> &min, std::complex<T> &max) noexcept { min = values[0]; max = values[0]; Loading Loading @@ -263,8 +522,7 @@ void GetMinMaxComplex(const std::complex<T> *values, const size_t size, template <class T> void GetMinMaxThreads(const T *values, const size_t size, T &min, T &max, const unsigned int threads, const MemorySpace memSpace) noexcept const unsigned int threads, const MemorySpace memSpace) noexcept { if (size == 0) { Loading Loading @@ -293,14 +551,12 @@ void GetMinMaxThreads(const T *values, const size_t size, T &min, T &max, if (t == threads - 1) { getMinMaxThreads.push_back( std::thread(GetMinMax<T>, &values[position], last, getMinMaxThreads.push_back(std::thread(GetMinMax<T>, &values[position], last, std::ref(mins[t]), std::ref(maxs[t]), memSpace)); } else { getMinMaxThreads.push_back( std::thread(GetMinMax<T>, &values[position], stride, getMinMaxThreads.push_back(std::thread(GetMinMax<T>, &values[position], stride, std::ref(mins[t]), std::ref(maxs[t]), memSpace)); } } Loading @@ -318,9 +574,9 @@ void GetMinMaxThreads(const T *values, const size_t size, T &min, T &max, } template <class T> void GetMinMaxThreads(const std::complex<T> *values, const size_t size, std::complex<T> &min, std::complex<T> &max, const unsigned int threads, MemorySpace memSpace) noexcept void GetMinMaxThreads(const std::complex<T> *values, const size_t size, std::complex<T> &min, std::complex<T> &max, const unsigned int threads, MemorySpace memSpace) noexcept { #ifdef ADIOS2_HAVE_GPU_SUPPORT if (memSpace == MemorySpace::GPU) Loading Loading @@ -359,14 +615,12 @@ void GetMinMaxThreads(const std::complex<T> *values, const size_t size, if (t == threads - 1) { getMinMaxThreads.push_back( std::thread(GetMinMaxComplex<T>, &values[position], last, getMinMaxThreads.push_back(std::thread(GetMinMaxComplex<T>, &values[position], last, std::ref(mins[t]), std::ref(maxs[t]))); } else { getMinMaxThreads.push_back( std::thread(GetMinMaxComplex<T>, &values[position], stride, getMinMaxThreads.push_back(std::thread(GetMinMaxComplex<T>, &values[position], stride, std::ref(mins[t]), std::ref(maxs[t]))); } } Loading @@ -384,9 +638,8 @@ void GetMinMaxThreads(const std::complex<T> *values, const size_t size, } template <class T> void GetMinMaxSubblocks(const T *values, const Dims &count, const BlockDivisionInfo &info, std::vector<T> &MinMaxs, T &bmin, T &bmax, const unsigned int threads, void GetMinMaxSubblocks(const T *values, const Dims &count, const BlockDivisionInfo &info, std::vector<T> &MinMaxs, T &bmin, T &bmax, const unsigned int threads, const MemorySpace memSpace) noexcept { const int ndim = static_cast<int>(count.size()); Loading Loading @@ -560,8 +813,8 @@ void GetMinMaxSubblocks(const T *values, const Dims &count, #define declare_template_instantiation(T) \ template <> \ inline bool LessThan<std::complex<T>>( \ const std::complex<T> input1, const std::complex<T> input2) noexcept \ inline bool LessThan<std::complex<T>>(const std::complex<T> input1, \ const std::complex<T> input2) noexcept \ { \ if (std::norm(input1) < std::norm(input2)) \ { \ Loading @@ -585,8 +838,8 @@ constexpr bool LessThan(const T input1, const T input2) noexcept #define declare_template_instantiation(T) \ template <> \ inline bool GreaterThan<std::complex<T>>( \ const std::complex<T> input1, const std::complex<T> input2) noexcept \ inline bool GreaterThan<std::complex<T>>(const std::complex<T> input1, \ const std::complex<T> input2) noexcept \ { \ if (std::norm(input1) > std::norm(input2)) \ { \ Loading Loading @@ -633,8 +886,7 @@ std::vector<T> VectorsOp(BinaryOperation op, const std::vector<T> &vector1, const std::vector<T> &vector2) noexcept { std::vector<T> result(vector1.size()); std::transform(vector1.begin(), vector1.end(), vector2.begin(), result.begin(), op); std::transform(vector1.begin(), vector1.end(), vector2.begin(), result.begin(), op); return result; } Loading