From dcdad0e5a229ab73cc5d310a67971604c12d2031 Mon Sep 17 00:00:00 2001 From: Nicolas De Carli Date: Wed, 23 Apr 2025 13:27:03 -0700 Subject: [PATCH] Add writeVarintSve for aarch64 - retry (#9603) Summary: Implemented an explicit SVE version of writeVarint. Throughput for 64-bit types shows a ~15% improvement. 16-bit and 32-bit cases seem to show a small improvement as well. All three functions are branch-free, their disassembly can be seen here: https://godbolt.org/z/jG5d8Wfe8 before: bench_write(u16_any_branch_free) 110.66% 2.00us 500.10K bench_write(u32_any_branch_free) 126.90% 2.00us 499.37K bench_write(u64_any_branch_free) 193.56% 2.33us 429.37K bench_write(u16_1b_branch_free) 99.562% 1.91us 522.97K bench_write(u16_2b_branch_free) 114.92% 2.00us 500.59K bench_write(u16_3b_branch_free) 111.66% 2.00us 500.99K bench_write(u32_1b_branch_free) 97.918% 1.93us 518.38K bench_write(u32_2b_branch_free) 113.76% 1.99us 502.29K bench_write(u32_3b_branch_free) 111.14% 1.99us 503.03K bench_write(u32_4b_branch_free) 115.72% 1.97us 507.52K bench_write(u32_5b_branch_free) 122.05% 2.00us 498.82K bench_write(u64_1b_branch_free) 99.089% 1.95us 511.71K bench_write(u64_2b_branch_free) 90.484% 2.53us 396.00K bench_write(u64_3b_branch_free) 93.335% 2.38us 419.63K bench_write(u64_4b_branch_free) 100.61% 2.24us 446.86K bench_write(u64_5b_branch_free) 123.18% 2.37us 421.24K bench_write(u64_6b_branch_free) 120.10% 2.33us 429.84K bench_write(u64_7b_branch_free) 144.69% 2.36us 423.79K bench_write(u64_8b_branch_free) 149.44% 2.25us 443.92K bench_write(u64_9b_branch_free) 174.37% 2.31us 433.60K bench_write(u64_10b_branch_free) 176.81% 2.28us 438.61K bench_write(exponential_1b_branch_free) 108.05% 1.91us 522.52K bench_write(exponential_2b_branch_free) 118.34% 1.98us 504.37K bench_write(exponential_3b_branch_free) 114.22% 1.99us 501.87K after: bench_write(u16_any_branch_free) 115.30% 1.97us 507.43K bench_write(u32_any_branch_free) 130.06% 1.97us 508.40K bench_write(u64_any_branch_free) 226.45% 
1.96us 509.18K bench_write(u16_1b_branch_free) 101.37% 1.84us 543.01K bench_write(u16_2b_branch_free) 116.65% 1.97us 508.51K bench_write(u16_3b_branch_free) 111.17% 1.96us 510.12K bench_write(u32_1b_branch_free) 99.679% 1.93us 519.42K bench_write(u32_2b_branch_free) 115.98% 1.98us 506.04K bench_write(u32_3b_branch_free) 111.45% 1.98us 503.85K bench_write(u32_4b_branch_free) 116.04% 1.95us 513.18K bench_write(u32_5b_branch_free) 124.59% 1.97us 508.35K bench_write(u64_1b_branch_free) 99.669% 1.91us 522.26K bench_write(u64_2b_branch_free) 117.53% 1.93us 518.86K bench_write(u64_3b_branch_free) 111.95% 1.95us 511.77K bench_write(u64_4b_branch_free) 111.29% 1.98us 504.98K bench_write(u64_5b_branch_free) 124.53% 1.96us 510.52K bench_write(u64_6b_branch_free) 145.48% 1.90us 526.18K bench_write(u64_7b_branch_free) 172.51% 1.97us 506.83K bench_write(u64_8b_branch_free) 174.92% 1.95us 514.13K bench_write(u64_9b_branch_free) 202.27% 1.97us 508.08K bench_write(u64_10b_branch_free) 205.43% 1.96us 510.44K bench_write(exponential_1b_branch_free) 105.67% 1.91us 523.63K bench_write(exponential_2b_branch_free) 116.10% 1.95us 512.64K bench_write(exponential_3b_branch_free) 119.08% 1.95us 513.34K Reviewed By: embg Differential Revision: D73513003 --- .../src/thrift/lib/cpp/util/VarintUtils-inl.h | 114 ++++++++++++++++-- .../lib/cpp/util/test/VarintUtilsBench.cpp | 4 +- 2 files changed, 106 insertions(+), 12 deletions(-) diff --git a/third-party/thrift/src/thrift/lib/cpp/util/VarintUtils-inl.h b/third-party/thrift/src/thrift/lib/cpp/util/VarintUtils-inl.h index 7072c39b414b6..ce593067af33c 100644 --- a/third-party/thrift/src/thrift/lib/cpp/util/VarintUtils-inl.h +++ b/third-party/thrift/src/thrift/lib/cpp/util/VarintUtils-inl.h @@ -50,13 +50,14 @@ // apple silicon can run most x86-64 instructions, but not necessarily all #define THRIFT_UTIL_VARINTUTILS_BRANCH_FREE_ENCODER 1 #elif defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_SVE2_BITPERM) && \ - __has_include() + __has_include() 
&& !FOLLY_MOBILE #define THRIFT_UTIL_VARINTUTILS_BRANCH_FREE_ENCODER 1 #else #define THRIFT_UTIL_VARINTUTILS_BRANCH_FREE_ENCODER 0 #endif #if THRIFT_UTIL_VARINTUTILS_BRANCH_FREE_ENCODER && FOLLY_AARCH64 +#include #include // @manual #include #endif @@ -430,20 +431,102 @@ uint8_t writeVarintUnrolled(Cursor& c, T value) { #if THRIFT_UTIL_VARINTUTILS_BRANCH_FREE_ENCODER +#if FOLLY_AARCH64 + /* writeVarintSve: AArch64 varint writer. Converts valueS with folly::to_unsigned, writes its encoding through cursor c and returns the number of bytes produced (1, 2, or the computed size passed to c.append). Values with no bits above the low 7 take the one-byte path; for one-byte T anything larger is emitted as one two-byte write of value | 0x100. Wider T goes through the svbdep_u64 deposit below, with if constexpr choosing the per-width steps. NOTE(review): template parameter lists and template arguments are missing from this copy of the patch, so Cursor and T are assumed to be the template parameters - confirm against the real header. */ +template +uint8_t writeVarintSve(Cursor& c, T valueS) { + auto value = folly::to_unsigned(valueS); + if (FOLLY_LIKELY((value & ~0x7f) == 0)) { + c.template write(static_cast(value)); + return 1; + } + + if constexpr (sizeof(T) == 1) { + c.template write(static_cast(value | 0x100)); + return 2; + } + /* maxSize = ceil(8 * sizeof(T) / 7): 3, 5 or 10 for 2-, 4- and 8-byte T, which matches the 3, 5 or 10 bytes the stores further down always write. c.ensure presumably guarantees that much writable space - confirm against the Cursor type. */ + enum { maxSize = (8 * sizeof(T) + 6) / 7 }; + c.ensure(maxSize); + /* bdepMask has 0x7f in every byte, so svbdep_u64 spreads the low bits of value into 7-bit groups, one per byte, leaving bit 7 of each byte clear. Only lane 0 of vec is assigned before the deposit. */ + svuint8_t bdepMask = svset_neonq_u8(svundef_u8(), vdupq_n_u8(0x7f)); + uint64x2_t clzMask = vreinterpretq_u64_u8(vdupq_n_u8(0xff)); + uint64x2_t vec; + vec[0] = value; + + vec = svget_neonq_u64(svbdep_u64( + svset_neonq_u64(svundef_u64(), vec), svreinterpret_u64_u8(bdepMask))); + /* Leading-zero count of the deposited data (per 32-bit lane for 2-byte T, per 64-bit lane otherwise); the all-ones clzMask shifted right by that count keeps only bits up to the highest set bit. */ + svuint64_t clzV; + uint64x2_t clzMaskV; + if constexpr (sizeof(T) == 2) { + clzV = svset_neonq_u64( + svundef_u64(), + vreinterpretq_u64_u32(vclzq_u32(vreinterpretq_u32_u64(vec)))); + clzMaskV = vreinterpretq_u64_u32(svget_neonq_u32(svlsr_u32_x( + svptrue_b32(), + svset_neonq_u32(svundef_u32(), vreinterpretq_u32_u64(clzMask)), + svreinterpret_u32_u64(clzV)))); + } else { + clzV = svclz_u64_x(svptrue_b64(), svset_neonq_u64(svundef_u64(), vec)); + clzMaskV = svget_neonq_u64(svlsr_u64_x( + svptrue_b64(), svset_neonq_u64(svundef_u64(), clzMask), clzV)); + } + /* Byte count = lane width in bytes (4 or 8) minus clz / 8; svsubr is the reversed subtract, immediate minus vector. */ + svuint64_t sizeSV = svlsr_n_u64_x(svptrue_b64(), clzV, 3); + + if constexpr (sizeof(T) == 2) { + sizeSV = svsubr_n_u64_x(svptrue_b64(), sizeSV, 4); + } else { + sizeSV = svsubr_n_u64_x(svptrue_b64(), sizeSV, 8); + } + /* Set bit 7 in every byte, then AND with clzMaskV: bytes above the top one are zeroed and the top byte loses bit 7, since the mask stops at its highest set bit. */ + vec = vreinterpretq_u64_u8(svget_neonq_u8(svorr_n_u8_x( + svptrue_b8(), + svset_neonq_u8(svundef_u8(), vreinterpretq_u8_u64(vec)), + 0x80))); + + vec = vandq_u64(vec, 
clzMaskV); + /* 8-byte T only: the deposit mask has 56 set bits, so for values of 2^56 or larger the lane holds just the low 56 bits; bit 7 is then forced on in all eight bytes and the rest is written through p[8] and p[9] below. */ + if constexpr (sizeof(T) == 8) { + uint8_t orMask = value < (1ull << 56) ? 0 : 0x80; + uint64x2_t orMaskV = vreinterpretq_u64_u8(vdupq_n_u8(orMask)); + vec = vorrq_u64(vec, orMaskV); + } + /* Fixed-width stores: 2+1, 4+1 or 8+1+1 bytes regardless of the encoded length. In the 8-byte case p[8] gets bits 56..63 narrowed to a byte (bit 63 lands in bit 7) and p[9] gets bit 63. */ + uint8_t* p = c.writableData(); + + if constexpr (sizeof(T) == sizeof(uint16_t)) { + vst1q_lane_u16( + reinterpret_cast(p), vreinterpretq_u16_u64(vec), 0); + vst1q_lane_u8(p + 2, vreinterpretq_u8_u64(vec), 2); + } else if constexpr (sizeof(T) == sizeof(uint32_t)) { + vst1q_lane_u32( + reinterpret_cast(p), vreinterpretq_u32_u64(vec), 0); + vst1q_lane_u8(p + 4, vreinterpretq_u8_u64(vec), 4); + } else { + vst1q_lane_u64(reinterpret_cast(p), vec, 0); + p[8] = value >> 56; + p[9] = value >> 63; + } + /* size is byte 0 of sizeSV; for 8-byte T with values of 2^56 or larger it is replaced by 9, or 10 when bit 63 is set. c.append(size) then receives that count. */ + uint8_t size = vreinterpretq_u8_u64(svget_neonq_u64(sizeSV))[0]; + if constexpr (sizeof(T) == 8) { + size = value < (1ull << 56) ? size : (value >> 63) + 9; + } + + c.append(size); + return size; +} + +#else + inline uint64_t compressBits(uint64_t value, uint64_t mask) { -#if FOLLY_X64 return _pdep_u64(value, mask); -#elif FOLLY_AARCH64 - // See https://godbolt.org/z/nhc443acd - const auto vec = svbdep_u64(svdup_n_u64(value), svdup_n_u64(mask)); - return vgetq_lane_u64(svget_neonq_u64(vec), 0); -#else - static_assert(0, "no pdep-equivalent instruction is available"); -#endif // __BMI2__, __ARM_FEATURE_SVE2_BITPERM } template -uint8_t writeVarintBranchFree(Cursor& c, T valueS) { +uint8_t writeVarintBranchFreeX86(Cursor& c, T valueS) { auto value = folly::to_unsigned(valueS); if (FOLLY_LIKELY((value & ~0x7f) == 0)) { c.template write(static_cast(value)); @@ -494,6 +577,17 @@ uint8_t writeVarintBranchFree(Cursor& c, T valueS) { return size; } +#endif + /* writeVarintBranchFree is reduced to a dispatcher: writeVarintSve when FOLLY_AARCH64 is set, writeVarintBranchFreeX86 otherwise. */ +template +uint8_t writeVarintBranchFree(Cursor& c, T valueS) { +#if FOLLY_AARCH64 + return writeVarintSve(c, valueS); +#else + return writeVarintBranchFreeX86(c, valueS); +#endif +} + template uint8_t writeVarint(Cursor& c, T value) { return writeVarintBranchFree(c, value); diff --git 
a/third-party/thrift/src/thrift/lib/cpp/util/test/VarintUtilsBench.cpp b/third-party/thrift/src/thrift/lib/cpp/util/test/VarintUtilsBench.cpp index a9085c43ec7e4..5eafaaf3e9058 100644 --- a/third-party/thrift/src/thrift/lib/cpp/util/test/VarintUtilsBench.cpp +++ b/third-party/thrift/src/thrift/lib/cpp/util/test/VarintUtilsBench.cpp @@ -235,8 +235,8 @@ BENCHMARK_NAMED_PARAM(bench_read, u64_9b, u64_9b()) BENCHMARK_NAMED_PARAM(bench_read, u64_10b, u64_10b()) BENCHMARK_NAMED_PARAM(bench_read, exponential_1b, exponential_1b()) -BENCHMARK_NAMED_PARAM(bench_read, exponential_2b, exponential_1b()) -BENCHMARK_NAMED_PARAM(bench_read, exponential_3b, exponential_1b()) +BENCHMARK_NAMED_PARAM(bench_read, exponential_2b, exponential_2b()) +BENCHMARK_NAMED_PARAM(bench_read, exponential_3b, exponential_3b()) int main(int argc, char** argv) { folly::Init init(&argc, &argv, true);