Merge pull request #5016 from ornladios-robot/backport-to-release_212-4985 (fef7a3c2) · Commits · ecpcitest / ADIOS2

source/adios2/helper/adiosMath.inl

+316 −64

Original line number	Diff line number	Diff line
		@@ -4,7 +4,6 @@
		* SPDX-License-Identifier: Apache-2.0
		*/


		#ifndef ADIOS2_HELPER_ADIOSMATH_INL_
		#define ADIOS2_HELPER_ADIOSMATH_INL_
		#ifndef ADIOS2_HELPER_ADIOSMATH_H_
		@@ -16,6 +15,12 @@
		#include <limits> //std::numeri_limits
		#include <thread>

		#ifdef __aarch64__
		#include <arm_neon.h>
		#elif defined(__x86_64__) \|\| defined(_M_X64)
		#include <immintrin.h>
		#endif

		#include "adios2/common/ADIOSMacros.h"
		#include "adiosLog.h"

		@@ -25,13 +30,11 @@ namespace helper
		{

		template <class T>
		void GetMinMaxSelection(const T *values, const Dims &shape, const Dims &start,
		const Dims &count, const bool isRowMajor, T &min,
		T &max, const MemorySpace memSpace) noexcept
		void GetMinMaxSelection(const T *values, const Dims &shape, const Dims &start, const Dims &count,
		const bool isRowMajor, T &min, T &max, const MemorySpace memSpace) noexcept
		{
		auto lf_MinMaxRowMajor = [](const T *values, const Dims &shape,
		const Dims &start, const Dims &count, T &min,
		T &max, const MemorySpace memSpace) {
		auto lf_MinMaxRowMajor = [](const T *values, const Dims &shape, const Dims &start,
		const Dims &count, T &min, T &max, const MemorySpace memSpace) {
		// loop through selection box contiguous part
		const size_t dimensions = shape.size();
		const size_t stride = count.back();
		@@ -44,12 +47,11 @@ void GetMinMaxSelection(const T *values, const Dims &shape, const Dims &start,
		while (run)
		{
		// here copy current linear memory between currentPoint and end
		const size_t startOffset = helper::LinearIndex(
		Dims(shape.size(), 0), shape, currentPoint, true);
		const size_t startOffset =
		helper::LinearIndex(Dims(shape.size(), 0), shape, currentPoint, true);

		T minStride, maxStride;
		GetMinMax(values + startOffset, stride, minStride, maxStride,
		memSpace);
		GetMinMax(values + startOffset, stride, minStride, maxStride, memSpace);

		if (firstStep)
		{
		@@ -95,9 +97,8 @@ void GetMinMaxSelection(const T *values, const Dims &shape, const Dims &start,
		} // end while stride loop
		};

		auto lf_MinMaxColumnMajor = [](const T *values, const Dims &shape,
		const Dims &start, const Dims &count, T &min,
		T &max, const MemorySpace memSpace) {
		auto lf_MinMaxColumnMajor = [](const T *values, const Dims &shape, const Dims &start,
		const Dims &count, T &min, T &max, const MemorySpace memSpace) {
		// loop through selection box contiguous part
		const size_t dimensions = shape.size();
		const size_t stride = count.front();
		@@ -110,12 +111,11 @@ void GetMinMaxSelection(const T *values, const Dims &shape, const Dims &start,
		while (run)
		{
		// here copy current linear memory between currentPoint and end
		const size_t startOffset = helper::LinearIndex(
		Dims(shape.size(), 0), shape, currentPoint, false);
		const size_t startOffset =
		helper::LinearIndex(Dims(shape.size(), 0), shape, currentPoint, false);

		T minStride, maxStride;
		GetMinMax(values + startOffset, stride, minStride, maxStride,
		memSpace);
		GetMinMax(values + startOffset, stride, minStride, maxStride, memSpace);

		if (firstStep)
		{
		@@ -165,8 +165,7 @@ void GetMinMaxSelection(const T *values, const Dims &shape, const Dims &start,

		if (shape.size() == 1)
		{
		const size_t startOffset =
		helper::LinearIndex(Dims(1, 0), shape, start, isRowMajor);
		const size_t startOffset = helper::LinearIndex(Dims(1, 0), shape, start, isRowMajor);
		const size_t totalSize = helper::GetTotalSize(count);
		GetMinMax(values + startOffset, totalSize, min, max, memSpace);
		return;
		@@ -198,6 +197,266 @@ inline void GetMinMax(const T *values, const size_t size, T &min, T &max,
		max = *bounds.second;
		}

		// SIMD specializations for min/max. std::minmax_element cannot be vectorized because it
		// tracks element positions. These value-only reductions use platform SIMD intrinsics and
		// run at memory bandwidth. When porting to a new architecture (e.g. Power/VSX, RISC-V V),
		// add a specialization here with the appropriate intrinsics; the #else scalar fallback
		// will be used otherwise.
		//
		// Covered types: float, double, int8_t, uint8_t, int16_t, uint16_t, int32_t, uint32_t.
		// Not covered (use scalar fallback): int64_t, uint64_t (no SIMD min/max before AVX-512),
		// long double (80-bit, no SIMD), complex (ordering by std::norm).

		// Helper: scalar min/max loop used by SIMD tail and scalar-only fallback.
		template <class T>
		inline void GetMinMaxScalar(const T *values, size_t start, const size_t size, T &min,
		T &max) noexcept
		{
		for (size_t i = start; i < size; ++i)
		{
		if (values[i] < min)
		min = values[i];
		if (values[i] > max)
		max = values[i];
		}
		}

		#ifdef __aarch64__
		// clang-format off
		// NEON macro: generates a GetMinMax specialization for an integer type.
		// VTYPE: NEON vector type, LOAD: load intrinsic, MIN/MAX: min/max intrinsics,
		// DUP: broadcast intrinsic, REDUCE_MIN/MAX: horizontal reduce intrinsics,
		// COUNT: elements per vector pair iteration.
		#define ADIOS2_NEON_MINMAX_INT(CTYPE, VTYPE, LOAD, MIN, MAX, DUP, REDUCE_MIN, REDUCE_MAX, COUNT) \
		template <> \
		inline void GetMinMax(const CTYPE *values, const size_t size, CTYPE &min, CTYPE &max, \
		const MemorySpace) noexcept \
		{ \
		if (size == 0) return; \
		size_t i = 0; \
		VTYPE vmin = DUP(values[0]); \
		VTYPE vmax = DUP(values[0]); \
		VTYPE vmin2 = vmin, vmax2 = vmax; \
		const size_t endN = size - (size % (COUNT)); \
		for (; i < endN; i += (COUNT)) \
		{ \
		VTYPE a = LOAD(values + i); \
		VTYPE b = LOAD(values + i + (COUNT) / 2); \
		vmin = MIN(vmin, a); vmax = MAX(vmax, a); \
		vmin2 = MIN(vmin2, b); vmax2 = MAX(vmax2, b); \
		} \
		vmin = MIN(vmin, vmin2); vmax = MAX(vmax, vmax2); \
		min = REDUCE_MIN(vmin); max = REDUCE_MAX(vmax); \
		GetMinMaxScalar(values, i, size, min, max); \
		}

		ADIOS2_NEON_MINMAX_INT(int8_t, int8x16_t, vld1q_s8, vminq_s8, vmaxq_s8, vdupq_n_s8, vminvq_s8, vmaxvq_s8, 32)
		ADIOS2_NEON_MINMAX_INT(uint8_t, uint8x16_t, vld1q_u8, vminq_u8, vmaxq_u8, vdupq_n_u8, vminvq_u8, vmaxvq_u8, 32)
		ADIOS2_NEON_MINMAX_INT(int16_t, int16x8_t, vld1q_s16, vminq_s16, vmaxq_s16, vdupq_n_s16, vminvq_s16, vmaxvq_s16, 16)
		ADIOS2_NEON_MINMAX_INT(uint16_t, uint16x8_t, vld1q_u16, vminq_u16, vmaxq_u16, vdupq_n_u16, vminvq_u16, vmaxvq_u16, 16)
		ADIOS2_NEON_MINMAX_INT(int32_t, int32x4_t, vld1q_s32, vminq_s32, vmaxq_s32, vdupq_n_s32, vminvq_s32, vmaxvq_s32, 8)
		ADIOS2_NEON_MINMAX_INT(uint32_t, uint32x4_t, vld1q_u32, vminq_u32, vmaxq_u32, vdupq_n_u32, vminvq_u32, vmaxvq_u32, 8)
		#undef ADIOS2_NEON_MINMAX_INT
		// clang-format on

		#elif defined(__x86_64__) \|\| defined(_M_X64)
		// SSE2 integer min/max via compare+select (blendv requires SSE4.1, so we use and/andnot/or).
		// mask = cmpgt(a,b) selects: min = (b & mask) \| (a & ~mask), max = (a & mask) \| (b & ~mask)
		#define ADIOS2_SSE2_MINMAX_INT(CTYPE, VTYPE, LOAD, STORE, SET1, CMPGT, AND, ANDNOT, OR, COUNT) \
		template <> \
		inline void GetMinMax(const CTYPE *values, const size_t size, CTYPE &min, CTYPE &max, \
		const MemorySpace) noexcept \
		{ \
		if (size == 0) \
		return; \
		size_t i = 0; \
		VTYPE vmin = SET1(values[0]); \
		VTYPE vmax = SET1(values[0]); \
		const size_t endN = size - (size % (COUNT)); \
		for (; i < endN; i += (COUNT)) \
		{ \
		VTYPE v = LOAD(reinterpret_cast<const __m128i *>(values + i)); \
		VTYPE gt = CMPGT(v, vmax); \
		vmax = OR(AND(v, gt), ANDNOT(gt, vmax)); \
		VTYPE lt = CMPGT(vmin, v); \
		vmin = OR(AND(v, lt), ANDNOT(lt, vmin)); \
		} \
		CTYPE tmin[16 / sizeof(CTYPE)], tmax[16 / sizeof(CTYPE)]; \
		STORE(reinterpret_cast<__m128i *>(tmin), vmin); \
		STORE(reinterpret_cast<__m128i *>(tmax), vmax); \
		min = tmin[0]; \
		max = tmax[0]; \
		for (int j = 1; j < (int)(16 / sizeof(CTYPE)); ++j) \
		{ \
		if (tmin[j] < min) \
		min = tmin[j]; \
		if (tmax[j] > max) \
		max = tmax[j]; \
		} \
		GetMinMaxScalar(values, i, size, min, max); \
		}

		// Signed types use _mm_cmpgt_epiN directly.
		ADIOS2_SSE2_MINMAX_INT(int8_t, __m128i, _mm_loadu_si128, _mm_storeu_si128, _mm_set1_epi8,
		_mm_cmpgt_epi8, _mm_and_si128, _mm_andnot_si128, _mm_or_si128, 16)
		ADIOS2_SSE2_MINMAX_INT(int16_t, __m128i, _mm_loadu_si128, _mm_storeu_si128, _mm_set1_epi16,
		_mm_cmpgt_epi16, _mm_and_si128, _mm_andnot_si128, _mm_or_si128, 8)
		ADIOS2_SSE2_MINMAX_INT(int32_t, __m128i, _mm_loadu_si128, _mm_storeu_si128, _mm_set1_epi32,
		_mm_cmpgt_epi32, _mm_and_si128, _mm_andnot_si128, _mm_or_si128, 4)

		// Unsigned types: SSE2 only has signed cmpgt. XOR with sign bit to convert unsigned
		// comparison to signed, then use signed cmpgt.
		namespace detail
		{
		inline __m128i unsigned_cmpgt_epi8(__m128i a, __m128i b)
		{
		const __m128i bias = _mm_set1_epi8(static_cast<char>(0x80));
		return _mm_cmpgt_epi8(_mm_xor_si128(a, bias), _mm_xor_si128(b, bias));
		}
		inline __m128i unsigned_cmpgt_epi16(__m128i a, __m128i b)
		{
		const __m128i bias = _mm_set1_epi16(static_cast<short>(0x8000));
		return _mm_cmpgt_epi16(_mm_xor_si128(a, bias), _mm_xor_si128(b, bias));
		}
		inline __m128i unsigned_cmpgt_epi32(__m128i a, __m128i b)
		{
		const __m128i bias = _mm_set1_epi32(static_cast<int>(0x80000000));
		return _mm_cmpgt_epi32(_mm_xor_si128(a, bias), _mm_xor_si128(b, bias));
		}
		} // namespace detail

		ADIOS2_SSE2_MINMAX_INT(uint8_t, __m128i, _mm_loadu_si128, _mm_storeu_si128, _mm_set1_epi8,
		detail::unsigned_cmpgt_epi8, _mm_and_si128, _mm_andnot_si128, _mm_or_si128,
		16)
		ADIOS2_SSE2_MINMAX_INT(uint16_t, __m128i, _mm_loadu_si128, _mm_storeu_si128, _mm_set1_epi16,
		detail::unsigned_cmpgt_epi16, _mm_and_si128, _mm_andnot_si128, _mm_or_si128,
		8)
		ADIOS2_SSE2_MINMAX_INT(uint32_t, __m128i, _mm_loadu_si128, _mm_storeu_si128, _mm_set1_epi32,
		detail::unsigned_cmpgt_epi32, _mm_and_si128, _mm_andnot_si128, _mm_or_si128,
		4)

		#undef ADIOS2_SSE2_MINMAX_INT
		#endif
		template <>
		inline void GetMinMax(const double *values, const size_t size, double &min, double &max,
		const MemorySpace memSpace) noexcept
		{
		if (size == 0)
		return;
		#ifdef ADIOS2_HAVE_GPU_SUPPORT
		if (memSpace == MemorySpace::GPU)
		{
		GetGPUMinMax(values, size, min, max);
		return;
		}
		#endif
		size_t i = 0;
		#ifdef __aarch64__
		float64x2_t vmin = vdupq_n_f64(values[0]);
		float64x2_t vmax = vdupq_n_f64(values[0]);
		float64x2_t vmin2 = vmin, vmax2 = vmax;
		const size_t end4 = size - (size % 4);
		for (; i < end4; i += 4)
		{
		float64x2_t a = vld1q_f64(values + i);
		float64x2_t b = vld1q_f64(values + i + 2);
		vmin = vminq_f64(vmin, a);
		vmax = vmaxq_f64(vmax, a);
		vmin2 = vminq_f64(vmin2, b);
		vmax2 = vmaxq_f64(vmax2, b);
		}
		vmin = vminq_f64(vmin, vmin2);
		vmax = vmaxq_f64(vmax, vmax2);
		min = vminvq_f64(vmin);
		max = vmaxvq_f64(vmax);
		#elif defined(__x86_64__) \|\| defined(_M_X64)
		__m128d vmin = _mm_set1_pd(values[0]);
		__m128d vmax = _mm_set1_pd(values[0]);
		__m128d vmin2 = vmin, vmax2 = vmax;
		const size_t end4 = size - (size % 4);
		for (; i < end4; i += 4)
		{
		__m128d a = _mm_loadu_pd(values + i);
		__m128d b = _mm_loadu_pd(values + i + 2);
		vmin = _mm_min_pd(vmin, a);
		vmax = _mm_max_pd(vmax, a);
		vmin2 = _mm_min_pd(vmin2, b);
		vmax2 = _mm_max_pd(vmax2, b);
		}
		vmin = _mm_min_pd(vmin, vmin2);
		vmax = _mm_max_pd(vmax, vmax2);
		double tmin[2], tmax[2];
		_mm_storeu_pd(tmin, vmin);
		_mm_storeu_pd(tmax, vmax);
		min = (std::min)(tmin[0], tmin[1]);
		max = (std::max)(tmax[0], tmax[1]);
		#else
		min = values[0];
		max = values[0];
		#endif
		GetMinMaxScalar(values, i, size, min, max);
		}

		template <>
		inline void GetMinMax(const float *values, const size_t size, float &min, float &max,
		const MemorySpace memSpace) noexcept
		{
		if (size == 0)
		return;
		#ifdef ADIOS2_HAVE_GPU_SUPPORT
		if (memSpace == MemorySpace::GPU)
		{
		GetGPUMinMax(values, size, min, max);
		return;
		}
		#endif
		size_t i = 0;
		#ifdef __aarch64__
		float32x4_t vmin = vdupq_n_f32(values[0]);
		float32x4_t vmax = vdupq_n_f32(values[0]);
		float32x4_t vmin2 = vmin, vmax2 = vmax;
		const size_t end8 = size - (size % 8);
		for (; i < end8; i += 8)
		{
		float32x4_t a = vld1q_f32(values + i);
		float32x4_t b = vld1q_f32(values + i + 4);
		vmin = vminq_f32(vmin, a);
		vmax = vmaxq_f32(vmax, a);
		vmin2 = vminq_f32(vmin2, b);
		vmax2 = vmaxq_f32(vmax2, b);
		}
		vmin = vminq_f32(vmin, vmin2);
		vmax = vmaxq_f32(vmax, vmax2);
		min = vminvq_f32(vmin);
		max = vmaxvq_f32(vmax);
		#elif defined(__x86_64__) \|\| defined(_M_X64)
		__m128 vmin = _mm_set1_ps(values[0]);
		__m128 vmax = _mm_set1_ps(values[0]);
		__m128 vmin2 = vmin, vmax2 = vmax;
		const size_t end8 = size - (size % 8);
		for (; i < end8; i += 8)
		{
		__m128 a = _mm_loadu_ps(values + i);
		__m128 b = _mm_loadu_ps(values + i + 4);
		vmin = _mm_min_ps(vmin, a);
		vmax = _mm_max_ps(vmax, a);
		vmin2 = _mm_min_ps(vmin2, b);
		vmax2 = _mm_max_ps(vmax2, b);
		}
		vmin = _mm_min_ps(vmin, vmin2);
		vmax = _mm_max_ps(vmax, vmax2);
		float tmin[4], tmax[4];
		_mm_storeu_ps(tmin, vmin);
		_mm_storeu_ps(tmax, vmax);
		min = (std::min)({tmin[0], tmin[1], tmin[2], tmin[3]});
		max = (std::max)({tmax[0], tmax[1], tmax[2], tmax[3]});
		#else
		min = values[0];
		max = values[0];
		#endif
		GetMinMaxScalar(values, i, size, min, max);
		}

		template <>
		inline void GetMinMax(const std::complex<float> *values, const size_t size,
		std::complex<float> &min, std::complex<float> &max,
		@@ -233,8 +492,8 @@ inline void GetMinMax(const std::complex<double> *values, const size_t size,
		}

		template <class T>
		void GetMinMaxComplex(const std::complex<T> *values, const size_t size,
		std::complex<T> &min, std::complex<T> &max) noexcept
		void GetMinMaxComplex(const std::complex<T> *values, const size_t size, std::complex<T> &min,
		std::complex<T> &max) noexcept
		{
		min = values[0];
		max = values[0];
		@@ -263,8 +522,7 @@ void GetMinMaxComplex(const std::complex<T> *values, const size_t size,

		template <class T>
		void GetMinMaxThreads(const T *values, const size_t size, T &min, T &max,
		const unsigned int threads,
		const MemorySpace memSpace) noexcept
		const unsigned int threads, const MemorySpace memSpace) noexcept
		{
		if (size == 0)
		{
		@@ -293,14 +551,12 @@ void GetMinMaxThreads(const T *values, const size_t size, T &min, T &max,

		if (t == threads - 1)
		{
		getMinMaxThreads.push_back(
		std::thread(GetMinMax<T>, &values[position], last,
		getMinMaxThreads.push_back(std::thread(GetMinMax<T>, &values[position], last,
		std::ref(mins[t]), std::ref(maxs[t]), memSpace));
		}
		else
		{
		getMinMaxThreads.push_back(
		std::thread(GetMinMax<T>, &values[position], stride,
		getMinMaxThreads.push_back(std::thread(GetMinMax<T>, &values[position], stride,
		std::ref(mins[t]), std::ref(maxs[t]), memSpace));
		}
		}
		@@ -318,9 +574,9 @@ void GetMinMaxThreads(const T *values, const size_t size, T &min, T &max,
		}

		template <class T>
		void GetMinMaxThreads(const std::complex<T> *values, const size_t size,
		std::complex<T> &min, std::complex<T> &max,
		const unsigned int threads, MemorySpace memSpace) noexcept
		void GetMinMaxThreads(const std::complex<T> *values, const size_t size, std::complex<T> &min,
		std::complex<T> &max, const unsigned int threads,
		MemorySpace memSpace) noexcept
		{
		#ifdef ADIOS2_HAVE_GPU_SUPPORT
		if (memSpace == MemorySpace::GPU)
		@@ -359,14 +615,12 @@ void GetMinMaxThreads(const std::complex<T> *values, const size_t size,

		if (t == threads - 1)
		{
		getMinMaxThreads.push_back(
		std::thread(GetMinMaxComplex<T>, &values[position], last,
		getMinMaxThreads.push_back(std::thread(GetMinMaxComplex<T>, &values[position], last,
		std::ref(mins[t]), std::ref(maxs[t])));
		}
		else
		{
		getMinMaxThreads.push_back(
		std::thread(GetMinMaxComplex<T>, &values[position], stride,
		getMinMaxThreads.push_back(std::thread(GetMinMaxComplex<T>, &values[position], stride,
		std::ref(mins[t]), std::ref(maxs[t])));
		}
		}
		@@ -384,9 +638,8 @@ void GetMinMaxThreads(const std::complex<T> *values, const size_t size,
		}

		template <class T>
		void GetMinMaxSubblocks(const T *values, const Dims &count,
		const BlockDivisionInfo &info, std::vector<T> &MinMaxs,
		T &bmin, T &bmax, const unsigned int threads,
		void GetMinMaxSubblocks(const T *values, const Dims &count, const BlockDivisionInfo &info,
		std::vector<T> &MinMaxs, T &bmin, T &bmax, const unsigned int threads,
		const MemorySpace memSpace) noexcept
		{
		const int ndim = static_cast<int>(count.size());
		@@ -560,8 +813,8 @@ void GetMinMaxSubblocks(const T *values, const Dims &count,

		#define declare_template_instantiation(T) \
		template <> \
		inline bool LessThan<std::complex<T>>( \
		const std::complex<T> input1, const std::complex<T> input2) noexcept \
		inline bool LessThan<std::complex<T>>(const std::complex<T> input1, \
		const std::complex<T> input2) noexcept \
		{ \
		if (std::norm(input1) < std::norm(input2)) \
		{ \
		@@ -585,8 +838,8 @@ constexpr bool LessThan(const T input1, const T input2) noexcept

		#define declare_template_instantiation(T) \
		template <> \
		inline bool GreaterThan<std::complex<T>>( \
		const std::complex<T> input1, const std::complex<T> input2) noexcept \
		inline bool GreaterThan<std::complex<T>>(const std::complex<T> input1, \
		const std::complex<T> input2) noexcept \
		{ \
		if (std::norm(input1) > std::norm(input2)) \
		{ \
		@@ -633,8 +886,7 @@ std::vector<T> VectorsOp(BinaryOperation op, const std::vector<T> &vector1,
		const std::vector<T> &vector2) noexcept
		{
		std::vector<T> result(vector1.size());
		std::transform(vector1.begin(), vector1.end(), vector2.begin(),
		result.begin(), op);
		std::transform(vector1.begin(), vector1.end(), vector2.begin(), result.begin(), op);
		return result;
		}

Admin message