Unverified commit d2752775, authored by Dmitry Vyukov, committed by GitHub
Browse files

[libc] Optimize memcpy size thresholds (#70049)

Adjust the boundary conditions for sizes 16/32/64, and the 128-byte threshold
in the >= 64 paths. See the comment added in inline_memcpy_x86 for the explanation.

Results on a machine with AVX2, so sizes 64 and 128 are affected:
```
                │   baseline   │               adjusted               │
                │    sec/op    │   sec/op     vs base                 │
memcpy/Google_A   5.701n ±  0%   5.551n ± 1%   -2.63% (n=100)
memcpy/Google_B   3.817n ±  0%   3.776n ± 0%   -1.07% (p=0.000 n=100)
memcpy/Google_D   11.35n ±  1%   11.32n ± 0%        ~ (p=0.066 n=100)
memcpy/Google_U   3.874n ± 1%    3.821n ± 1%   -1.37% (p=0.001 n=100)
memcpy/64         3.843n ±  0%   3.105n ± 3%  -19.22% (n=50)
memcpy/128        4.842n ±  0%   3.818n ± 0%  -21.15% (p=0.000 n=50)
```
parent a0cd6265
Loading
Loading
Loading
Loading
+25 −7
Original line number Diff line number Diff line
@@ -55,7 +55,7 @@ LIBC_INLINE_VAR constexpr size_t kRepMovsbThreshold =
[[maybe_unused]] LIBC_INLINE void
inline_memcpy_x86_sse2_ge64(Ptr __restrict dst, CPtr __restrict src,
                            size_t count) {
  if (count < 128)
  if (count <= 128)
    return builtin::Memcpy<64>::head_tail(dst, src, count);
  builtin::Memcpy<32>::block(dst, src);
  align_to_next_boundary<32, Arg::Dst>(dst, src, count);
@@ -65,7 +65,7 @@ inline_memcpy_x86_sse2_ge64(Ptr __restrict dst, CPtr __restrict src,
[[maybe_unused]] LIBC_INLINE void
inline_memcpy_x86_avx_ge64(Ptr __restrict dst, CPtr __restrict src,
                           size_t count) {
  if (count < 128)
  if (count <= 128)
    return builtin::Memcpy<64>::head_tail(dst, src, count);
  if (count < 256)
    return builtin::Memcpy<128>::head_tail(dst, src, count);
@@ -79,7 +79,7 @@ inline_memcpy_x86_sse2_ge64_sw_prefetching(Ptr __restrict dst,
                                           CPtr __restrict src, size_t count) {
  using namespace LIBC_NAMESPACE::x86;
  prefetch_to_local_cache(src + kOneCacheline);
  if (count < 128)
  if (count <= 128)
    return builtin::Memcpy<64>::head_tail(dst, src, count);
  prefetch_to_local_cache(src + kTwoCachelines);
  // Aligning 'dst' on a 32B boundary.
@@ -120,7 +120,7 @@ inline_memcpy_x86_avx_ge64_sw_prefetching(Ptr __restrict dst,
                                          CPtr __restrict src, size_t count) {
  using namespace LIBC_NAMESPACE::x86;
  prefetch_to_local_cache(src + kOneCacheline);
  if (count < 128)
  if (count <= 128)
    return builtin::Memcpy<64>::head_tail(dst, src, count);
  prefetch_to_local_cache(src + kTwoCachelines);
  prefetch_to_local_cache(src + kThreeCachelines);
@@ -149,6 +149,15 @@ inline_memcpy_x86_avx_ge64_sw_prefetching(Ptr __restrict dst,

[[maybe_unused]] LIBC_INLINE void
inline_memcpy_x86(Ptr __restrict dst, CPtr __restrict src, size_t count) {
#if defined(__AVX512F__)
  constexpr size_t vector_size = 64;
#elif defined(__AVX__)
  constexpr size_t vector_size = 32;
#elif defined(__SSE2__)
  constexpr size_t vector_size = 16;
#else
  constexpr size_t vector_size = 8;
#endif
  if (count == 0)
    return;
  if (count == 1)
@@ -161,11 +170,20 @@ inline_memcpy_x86(Ptr __restrict dst, CPtr __restrict src, size_t count) {
    return builtin::Memcpy<4>::block(dst, src);
  if (count < 8)
    return builtin::Memcpy<4>::head_tail(dst, src, count);
  if (count < 16)
  // If count is equal to a power of 2, we can handle it as head-tail
  // of both smaller size and larger size (head-tail are either
  // non-overlapping for smaller size, or completely collapsed
  // for larger size). It seems to be more profitable to do the copy
  // with the larger size, if it's natively supported (e.g. doing
  // 2 collapsed 32-byte moves for count=64 if AVX2 is supported).
  // But it's not profitable to use larger size if it's not natively
  // supported: we will both use more instructions and handle fewer
  // sizes in earlier branches.
  if (vector_size >= 16 ? count < 16 : count <= 16)
    return builtin::Memcpy<8>::head_tail(dst, src, count);
  if (count < 32)
  if (vector_size >= 32 ? count < 32 : count <= 32)
    return builtin::Memcpy<16>::head_tail(dst, src, count);
  if (count < 64)
  if (vector_size >= 64 ? count < 64 : count <= 64)
    return builtin::Memcpy<32>::head_tail(dst, src, count);
  if constexpr (x86::kAvx) {
    if constexpr (x86::kUseSoftwarePrefetching) {