Diffstat (limited to 'src/google/protobuf/wire_format_lite.cc')
-rw-r--r-- | src/google/protobuf/wire_format_lite.cc | 189
1 file changed, 109 insertions(+), 80 deletions(-)
diff --git a/src/google/protobuf/wire_format_lite.cc b/src/google/protobuf/wire_format_lite.cc
index cd343b74..1d8cda5a 100644
--- a/src/google/protobuf/wire_format_lite.cc
+++ b/src/google/protobuf/wire_format_lite.cc
@@ -34,9 +34,6 @@
 #include <google/protobuf/wire_format_lite_inl.h>
 
-#ifdef __SSE_4_1__
-#include <immintrin.h>
-#endif
 #include <stack>
 #include <string>
 #include <vector>
 
@@ -635,12 +632,12 @@ bool WireFormatLite::VerifyUtf8String(const char* data,
   return true;
 }
 
-#ifdef __SSE_4_1__
-template<typename T, bool ZigZag, bool SignExtended>
-static size_t VarintSize(
-    const T* data, const int n,
-    const internal::enable_if<sizeof(T) == 4>::type* = NULL) {
+// This code is deliberately written such that clang compiles it into really
+// efficient SSE code.
+template<bool ZigZag, bool SignExtended, typename T>
+static size_t VarintSize(const T* data, const int n) {
 #if __cplusplus >= 201103L
+  static_assert(sizeof(T) == 4, "This routine only works for 32 bit integers");
   // is_unsigned<T> => !ZigZag
   static_assert((std::is_unsigned<T>::value ^ ZigZag) ||
                 std::is_signed<T>::value,
@@ -649,101 +646,83 @@ static size_t VarintSize(
   static_assert((std::is_unsigned<T>::value ^ SignExtended) ||
                 std::is_signed<T>::value,
                 "Cannot SignExtended unsigned types");
+  static_assert(!(SignExtended && ZigZag),
+                "Cannot SignExtend and ZigZag on the same type");
 #endif
-
-  union vus32 {
-    uint32 u[4];
-    int32 s[4];
-    __m128i v;
-  };
-
-  static const vus32 ones = {{1, 1, 1, 1}};
-
-  // CodedOutputStream::VarintSize32SignExtended returns 10 for negative
-  // numbers. We can apply the UInt32Size algorithm, and simultaneously logical
-  // shift the MSB into the LSB to determine if it is negative.
-  static const vus32 fives = {{5, 5, 5, 5}};
-
-  // sum is the vectorized-output of calling CodedOutputStream::VarintSize32 on
-  // the processed elements.
-  //
-  // msb_sum is the count of set most-significant bits. When computing the
-  // vectorized CodedOutputStream::VarintSize32SignExtended, negative values
-  // have the most significant bit set. VarintSize32SignExtended returns 10 and
-  // VarintSize32 returns 5. msb_sum allows us to compute:
-  //   VarintSize32SignExtended = msb_sum * 5 + VarintSize32
-  vus32 sum, v, msb_sum;
-  sum.v = _mm_setzero_si128();
-  msb_sum.v = _mm_setzero_si128();
-
-  int rounded = n & ~(3);
-  int i;
-  for (i = 0; i < rounded; i += 4) {
-    v.v = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&data[i]));
-
+  uint32 sum = n;
+  uint32 msb_sum = 0;
+  for (int i = 0; i < n; i++) {
+    uint32 x = data[i];
     if (ZigZag) {
-      // Note: the right-shift must be arithmetic
-      v.v = _mm_xor_si128(_mm_slli_epi32(v.v, 1), _mm_srai_epi32(v.v, 31));
-    }
-
-    sum.v = _mm_add_epi32(sum.v, ones.v);
-    if (SignExtended) {
-      msb_sum.v = _mm_add_epi32(msb_sum.v, _mm_srli_epi32(v.v, 31));
-    }
-
-    v.v = _mm_srli_epi32(v.v, 7);
-
-    for (int j = 0; j < 4; j++) {
-      __m128i min = _mm_min_epi32(v.v, ones.v);
-
-      sum.v = _mm_add_epi32(sum.v, min);
-      v.v = _mm_srli_epi32(v.v, 7);
+      x = WireFormatLite::ZigZagEncode32(x);
+    } else if (SignExtended) {
+      msb_sum += x >> 31;
     }
+    // Clang is smart enough to produce an optimal SSE sequence, unrolling
+    // the loop 8 ints at a time: each size-class test below compiles to
+    //   cmpres = cmpgt x, sizeclass  (-1 or 0)
+    //   sum = sum - cmpres
+    if (x > 0x7F) sum++;
+    if (x > 0x3FFF) sum++;
+    if (x > 0x1FFFFF) sum++;
+    if (x > 0xFFFFFFF) sum++;
   }
+  if (SignExtended) sum += msb_sum * 5;
+  return sum;
+}
 
-  if (SignExtended) {
-    vus32 extensions;
-    extensions.v = _mm_mullo_epi32(msb_sum.v, fives.v);
-
-    sum.v = _mm_add_epi32(sum.v, extensions.v);
-  }
-
-  // TODO(ckennelly): Can we avoid the sign conversion?
-  size_t out = _mm_cvtsi128_si32(
-      _mm_hadd_epi32(_mm_hadd_epi32(sum.v, ones.v), ones.v));
-
-  // Finish tail.
-  for (; i < n; i++) {
+template<bool ZigZag, typename T>
+static size_t VarintSize64(const T* data, const int n) {
+#if __cplusplus >= 201103L
+  static_assert(sizeof(T) == 8, "This routine only works for 64 bit integers");
+  // is_unsigned<T> => !ZigZag
+  static_assert(!ZigZag || !std::is_unsigned<T>::value,
+                "Cannot ZigZag encode unsigned types");
+#endif
+  uint64 sum = n;
+  for (int i = 0; i < n; i++) {
+    uint64 x = data[i];
     if (ZigZag) {
-      out += WireFormatLite::SInt32Size(data[i]);
-    } else if (SignExtended) {
-      out += WireFormatLite::Int32Size(data[i]);
-    } else {
-      out += WireFormatLite::UInt32Size(data[i]);
+      x = WireFormatLite::ZigZagEncode64(x);
     }
+    // The first step is a binary search: we can't branch in SSE, so we use
+    // the result of the compare to adjust sum and x appropriately. This code
+    // is written to make clang recognize the vectorization.
+    uint64 tmp = x >= (static_cast<uint64>(1) << 35) ? -1 : 0;
+    sum += 5 & tmp;
+    x >>= 35 & tmp;
+    if (x > 0x7F) sum++;
+    if (x > 0x3FFF) sum++;
+    if (x > 0x1FFFFF) sum++;
+    if (x > 0xFFFFFFF) sum++;
   }
-
-  return out;
+  return sum;
 }
 
+// GCC does not recognize the vectorization opportunity, and other
+// platforms are untested; in those cases calling the optimized varint
+// size routine on each element is faster. Hence we enable this only
+// for clang.
+#if defined(__SSE__) && defined(__clang__)
 size_t WireFormatLite::Int32Size(const RepeatedField<int32>& value) {
-  return VarintSize<int32, false, true>(value.data(), value.size());
+  return VarintSize<false, true>(value.data(), value.size());
 }
 
 size_t WireFormatLite::UInt32Size(const RepeatedField<uint32>& value) {
-  return VarintSize<uint32, false, false>(value.data(), value.size());
+  return VarintSize<false, false>(value.data(), value.size());
 }
 
 size_t WireFormatLite::SInt32Size(const RepeatedField<int32>& value) {
-  return VarintSize<int32, true, true>(value.data(), value.size());
+  return VarintSize<true, false>(value.data(), value.size());
 }
 
 size_t WireFormatLite::EnumSize(const RepeatedField<int>& value) {
   // On ILP64, sizeof(int) == 8, which would require a different template.
-  return VarintSize<int, false, true>(value.data(), value.size());
+  return VarintSize<false, true>(value.data(), value.size());
 }
 
-#else  // !__SSE_4_1__
+#else  // !(defined(__SSE__) && defined(__clang__))
+
 size_t WireFormatLite::Int32Size(const RepeatedField<int32>& value) {
   size_t out = 0;
   const int n = value.size();
@@ -779,6 +758,56 @@ size_t WireFormatLite::EnumSize(const RepeatedField<int>& value) {
   }
   return out;
 }
+
+#endif
+
+// Micro benchmarks show that the SSE-improved loop only starts beating
+// the normal loop on Haswell platforms, and then only for more than 32
+// ints, so we disable it for now. Some specialized users might find it
+// worthwhile to enable it.
+#define USE_SSE_FOR_64_BIT_INTEGER_ARRAYS 0
+#if USE_SSE_FOR_64_BIT_INTEGER_ARRAYS
+size_t WireFormatLite::Int64Size (const RepeatedField< int64>& value) {
+  return VarintSize64<false>(value.data(), value.size());
+}
+
+size_t WireFormatLite::UInt64Size(const RepeatedField<uint64>& value) {
+  return VarintSize64<false>(value.data(), value.size());
+}
+
+size_t WireFormatLite::SInt64Size(const RepeatedField< int64>& value) {
+  return VarintSize64<true>(value.data(), value.size());
+}
+
+#else
+
+size_t WireFormatLite::Int64Size (const RepeatedField< int64>& value) {
+  size_t out = 0;
+  const int n = value.size();
+  for (int i = 0; i < n; i++) {
+    out += Int64Size(value.Get(i));
+  }
+  return out;
+}
+
+size_t WireFormatLite::UInt64Size(const RepeatedField<uint64>& value) {
+  size_t out = 0;
+  const int n = value.size();
+  for (int i = 0; i < n; i++) {
+    out += UInt64Size(value.Get(i));
+  }
+  return out;
+}
+
+size_t WireFormatLite::SInt64Size(const RepeatedField< int64>& value) {
+  size_t out = 0;
+  const int n = value.size();
+  for (int i = 0; i < n; i++) {
+    out += SInt64Size(value.Get(i));
+  }
+  return out;
+}
+
 #endif
 
 }  // namespace internal
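For reference, the size-class trick used by the new VarintSize template can be checked in isolation. The following is a minimal standalone sketch, not part of the patch; VarintSize32Ref and VarintSize32Branchless are illustrative names, not protobuf API. It compares the branchless form (one compare per varint size class) against the classic shift loop:

// Standalone check of the branchless 32-bit varint size computation.
#include <cstdint>
#include <cstdio>

// Reference: one byte per started 7-bit group.
static size_t VarintSize32Ref(uint32_t x) {
  size_t n = 1;
  while (x > 0x7F) {
    x >>= 7;
    n++;
  }
  return n;
}

// The patch's form: start at 1 and add one byte for every size class the
// value exceeds. The four independent compares are what clang vectorizes.
static size_t VarintSize32Branchless(uint32_t x) {
  size_t n = 1;
  n += x > 0x7F;
  n += x > 0x3FFF;
  n += x > 0x1FFFFF;
  n += x > 0xFFFFFFF;
  return n;
}

int main() {
  // Sample both sides of every size-class boundary.
  const uint32_t samples[] = {0, 0x7F, 0x80, 0x3FFF, 0x4000, 0x1FFFFF,
                              0x200000, 0xFFFFFFF, 0x10000000, 0xFFFFFFFFu};
  for (uint32_t x : samples) {
    if (VarintSize32Ref(x) != VarintSize32Branchless(x)) {
      std::printf("mismatch at %u\n", static_cast<unsigned>(x));
      return 1;
    }
  }
  std::printf("all sizes agree\n");
  return 0;
}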
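The 64-bit routine adds one branch-free step before the same four compares: values of at least 2^35 pre-pay 5 bytes and shed 35 bits, folding the 6-to-10-byte cases onto the 1-to-5-byte path. A sketch under the same illustrative naming, probing both sides of every power of two to cover all size-class boundaries:

// Standalone check of the 64-bit variant's branch-free pre-step.
#include <cstdint>
#include <cstdio>

static size_t VarintSize64Ref(uint64_t x) {
  size_t n = 1;
  while (x > 0x7F) {
    x >>= 7;
    n++;
  }
  return n;
}

static size_t VarintSize64Branchless(uint64_t x) {
  size_t n = 1;
  // tmp is all-ones when x needs more than 5 bytes (x >= 2^35), else zero,
  // so the masked operations pre-pay 5 bytes and strip 35 bits without a
  // branch, exactly as in the patch.
  uint64_t tmp = x >= (uint64_t{1} << 35) ? ~uint64_t{0} : 0;
  n += 5 & tmp;
  x >>= 35 & tmp;
  n += x > 0x7F;
  n += x > 0x3FFF;
  n += x > 0x1FFFFF;
  n += x > 0xFFFFFFF;
  return n;
}

int main() {
  for (int bits = 1; bits < 64; bits++) {
    const uint64_t x = uint64_t{1} << bits;
    if (VarintSize64Ref(x) != VarintSize64Branchless(x) ||
        VarintSize64Ref(x - 1) != VarintSize64Branchless(x - 1)) {
      std::printf("mismatch near bit %d\n", bits);
      return 1;
    }
  }
  std::printf("all sizes agree\n");
  return 0;
}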