Diffstat (limited to 'src/google/protobuf/wire_format_lite.cc')
-rw-r--r--  src/google/protobuf/wire_format_lite.cc  189
1 files changed, 109 insertions, 80 deletions
diff --git a/src/google/protobuf/wire_format_lite.cc b/src/google/protobuf/wire_format_lite.cc
index cd343b74..1d8cda5a 100644
--- a/src/google/protobuf/wire_format_lite.cc
+++ b/src/google/protobuf/wire_format_lite.cc
@@ -34,9 +34,6 @@
#include <google/protobuf/wire_format_lite_inl.h>
-#ifdef __SSE_4_1__
-#include <immintrin.h>
-#endif
#include <stack>
#include <string>
#include <vector>
@@ -635,12 +632,12 @@ bool WireFormatLite::VerifyUtf8String(const char* data,
return true;
}
-#ifdef __SSE_4_1__
-template<typename T, bool ZigZag, bool SignExtended>
-static size_t VarintSize(
- const T* data, const int n,
- const internal::enable_if<sizeof(T) == 4>::type* = NULL) {
+// This code is deliberately written so that clang compiles it into really
+// efficient SSE code.
+template<bool ZigZag, bool SignExtended, typename T>
+static size_t VarintSize(const T* data, const int n) {
#if __cplusplus >= 201103L
+ static_assert(sizeof(T) == 4, "This routine only works for 32 bit integers");
// is_unsigned<T> => !ZigZag
static_assert((std::is_unsigned<T>::value ^ ZigZag) ||
std::is_signed<T>::value,
@@ -649,101 +646,83 @@ static size_t VarintSize(
static_assert((std::is_unsigned<T>::value ^ SignExtended) ||
std::is_signed<T>::value,
"Cannot SignExtended unsigned types");
+ static_assert(!(SignExtended && ZigZag),
+ "Cannot SignExtended and ZigZag on the same type");
#endif
-
- union vus32 {
- uint32 u[4];
- int32 s[4];
- __m128i v;
- };
-
- static const vus32 ones = {{1, 1, 1, 1}};
-
- // CodedOutputStream::VarintSize32SignExtended returns 10 for negative
- // numbers. We can apply the UInt32Size algorithm, and simultaneously logical
- // shift the MSB into the LSB to determine if it is negative.
- static const vus32 fives = {{5, 5, 5, 5}};
-
- // sum is the vectorized-output of calling CodedOutputStream::VarintSize32 on
- // the processed elements.
- //
- // msb_sum is the count of set most-significant bits. When computing the
- // vectorized CodedOutputStream::VarintSize32SignExtended, negative values
- // have the most significant bit set. VarintSize32SignExtended returns 10 and
- // VarintSize32 returns 5. msb_sum allows us to compute:
- // VarintSize32SignExtended = msb_sum * 5 + VarintSize32
- vus32 sum, v, msb_sum;
- sum.v = _mm_setzero_si128();
- msb_sum.v = _mm_setzero_si128();
-
- int rounded = n & ~(3);
- int i;
- for (i = 0; i < rounded; i += 4) {
- v.v = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&data[i]));
-
+ uint32 sum = n;
+ uint32 msb_sum = 0;
+ for (int i = 0; i < n; i++) {
+ uint32 x = data[i];
if (ZigZag) {
- // Note: the right-shift must be arithmetic
- v.v = _mm_xor_si128(_mm_slli_epi32(v.v, 1), _mm_srai_epi32(v.v, 31));
- }
-
- sum.v = _mm_add_epi32(sum.v, ones.v);
- if (SignExtended) {
- msb_sum.v = _mm_add_epi32(msb_sum.v, _mm_srli_epi32(v.v, 31));
- }
-
- v.v = _mm_srli_epi32(v.v, 7);
-
- for (int j = 0; j < 4; j++) {
- __m128i min = _mm_min_epi32(v.v, ones.v);
-
- sum.v = _mm_add_epi32(sum.v, min);
- v.v = _mm_srli_epi32(v.v, 7);
+ x = WireFormatLite::ZigZagEncode32(x);
+ } else if (SignExtended) {
+ msb_sum += x >> 31;
}
+    // clang is smart enough to produce an optimal SSE sequence, unrolling
+    // the loop 8 ints at a time and lowering each of the four size tests
+    // below to
+    //   cmpres = cmpgt x, sizeclass   (-1 or 0)
+    //   sum = sum - cmpres
+ if (x > 0x7F) sum++;
+ if (x > 0x3FFF) sum++;
+ if (x > 0x1FFFFF) sum++;
+ if (x > 0xFFFFFFF) sum++;
}
+ if (SignExtended) sum += msb_sum * 5;
+ return sum;
+}
- if (SignExtended) {
- vus32 extensions;
- extensions.v = _mm_mullo_epi32(msb_sum.v, fives.v);
-
- sum.v = _mm_add_epi32(sum.v, extensions.v);
- }
-
- // TODO(ckennelly): Can we avoid the sign conversion?
- size_t out = _mm_cvtsi128_si32(
- _mm_hadd_epi32(_mm_hadd_epi32(sum.v, ones.v), ones.v));
-
- // Finish tail.
- for (; i < n; i++) {
+template<bool ZigZag, typename T>
+static size_t VarintSize64(const T* data, const int n) {
+#if __cplusplus >= 201103L
+ static_assert(sizeof(T) == 8, "This routine only works for 64 bit integers");
+ // is_unsigned<T> => !ZigZag
+ static_assert(!ZigZag || !std::is_unsigned<T>::value,
+ "Cannot ZigZag encode unsigned types");
+#endif
+ uint64 sum = n;
+ for (int i = 0; i < n; i++) {
+ uint64 x = data[i];
if (ZigZag) {
- out += WireFormatLite::SInt32Size(data[i]);
- } else if (SignExtended) {
- out += WireFormatLite::Int32Size(data[i]);
- } else {
- out += WireFormatLite::UInt32Size(data[i]);
+ x = WireFormatLite::ZigZagEncode64(x);
}
+    // The first step is a binary search: we can't branch in SSE, so we use
+    // the result of the compare to adjust sum and x appropriately. This
+    // code is written to make clang recognize the vectorization.
+ uint64 tmp = x >= (static_cast<uint64>(1) << 35) ? -1 : 0;
+ sum += 5 & tmp;
+ x >>= 35 & tmp;
+ if (x > 0x7F) sum++;
+ if (x > 0x3FFF) sum++;
+ if (x > 0x1FFFFF) sum++;
+ if (x > 0xFFFFFFF) sum++;
}
-
- return out;
+ return sum;
}
+// GCC does not recognize the vectorization opportunity, and other platforms
+// are untested; in those cases calling the optimized varint size routine for
+// each element is faster. Hence we enable this path only for clang.
+#if defined(__SSE__) && defined(__clang__)
size_t WireFormatLite::Int32Size(const RepeatedField<int32>& value) {
- return VarintSize<int32, false, true>(value.data(), value.size());
+ return VarintSize<false, true>(value.data(), value.size());
}
size_t WireFormatLite::UInt32Size(const RepeatedField<uint32>& value) {
- return VarintSize<uint32, false, false>(value.data(), value.size());
+ return VarintSize<false, false>(value.data(), value.size());
}
size_t WireFormatLite::SInt32Size(const RepeatedField<int32>& value) {
- return VarintSize<int32, true, true>(value.data(), value.size());
+ return VarintSize<true, false>(value.data(), value.size());
}
size_t WireFormatLite::EnumSize(const RepeatedField<int>& value) {
// On ILP64, sizeof(int) == 8, which would require a different template.
- return VarintSize<int, false, true>(value.data(), value.size());
+ return VarintSize<false, true>(value.data(), value.size());
}
-#else // !__SSE_4_1__
+#else  // !(defined(__SSE__) && defined(__clang__))
+
size_t WireFormatLite::Int32Size(const RepeatedField<int32>& value) {
size_t out = 0;
const int n = value.size();
@@ -779,6 +758,56 @@ size_t WireFormatLite::EnumSize(const RepeatedField<int>& value) {
}
return out;
}
+
+#endif
+
+// Micro benchmarks show that the SSE-improved loop only starts beating the
+// normal loop on Haswell platforms, and then only for more than 32 ints. We
+// disable it for now; some specialized users might find it worthwhile to
+// enable it.
+#define USE_SSE_FOR_64_BIT_INTEGER_ARRAYS 0
+#if USE_SSE_FOR_64_BIT_INTEGER_ARRAYS
+size_t WireFormatLite::Int64Size (const RepeatedField< int64>& value) {
+ return VarintSize64<false>(value.data(), value.size());
+}
+
+size_t WireFormatLite::UInt64Size(const RepeatedField<uint64>& value) {
+ return VarintSize64<false>(value.data(), value.size());
+}
+
+size_t WireFormatLite::SInt64Size(const RepeatedField< int64>& value) {
+ return VarintSize64<true>(value.data(), value.size());
+}
+
+#else
+
+size_t WireFormatLite::Int64Size (const RepeatedField< int64>& value) {
+ size_t out = 0;
+ const int n = value.size();
+ for (int i = 0; i < n; i++) {
+ out += Int64Size(value.Get(i));
+ }
+ return out;
+}
+
+size_t WireFormatLite::UInt64Size(const RepeatedField<uint64>& value) {
+ size_t out = 0;
+ const int n = value.size();
+ for (int i = 0; i < n; i++) {
+ out += UInt64Size(value.Get(i));
+ }
+ return out;
+}
+
+size_t WireFormatLite::SInt64Size(const RepeatedField< int64>& value) {
+ size_t out = 0;
+ const int n = value.size();
+ for (int i = 0; i < n; i++) {
+ out += SInt64Size(value.Get(i));
+ }
+ return out;
+}
+
#endif
} // namespace internal
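
For readers following the 32-bit loop in the patch, here is a minimal standalone sketch of the per-element sizing it computes, including the msb_sum * 5 adjustment for sign-extended negative values. It is illustrative only and not part of the patch; the Sketch* helper names and the main() harness are made up.

#include <cstdint>
#include <cstdio>

// Size in bytes of one unsigned 32-bit value encoded as a varint:
// one byte per started group of 7 payload bits.
static size_t SketchVarintSize32(uint32_t x) {
  size_t size = 1;
  if (x > 0x7F) size++;       // more than 7 bits
  if (x > 0x3FFF) size++;     // more than 14 bits
  if (x > 0x1FFFFF) size++;   // more than 21 bits
  if (x > 0xFFFFFFF) size++;  // more than 28 bits
  return size;
}

// Sign-extended int32 fields serialize negative values as 10-byte varints.
// That is where "msb_sum * 5" comes from: the comparison chain already
// yields 5 for a value with its MSB set, and the extra 5 bytes of sign
// extension are added per set MSB once the loop finishes.
static size_t SketchVarintSize32SignExtended(int32_t v) {
  uint32_t x = static_cast<uint32_t>(v);
  return SketchVarintSize32(x) + (x >> 31) * 5;
}

int main() {
  printf("%zu\n", SketchVarintSize32(300));             // 2
  printf("%zu\n", SketchVarintSize32SignExtended(-1));  // 10
  return 0;
}

Writing the size test as four independent comparisons, rather than an early-out loop, is what lets clang lower each if to a packed compare plus subtract applied to several elements at once.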
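The ZigZag branch calls WireFormatLite::ZigZagEncode32/64. The mapping itself is standard protobuf wire format behavior; the sketch below is only for context, with illustrative names.

#include <cstdint>
#include <cstdio>

// ZigZag mapping used by sint32/sint64 fields: 0 -> 0, -1 -> 1, 1 -> 2,
// -2 -> 3, ... so small magnitudes of either sign encode as short varints.
static uint32_t SketchZigZagEncode32(int32_t n) {
  // The left shift is done on the unsigned representation; the right shift
  // must be arithmetic so the sign bit is broadcast into a mask.
  return (static_cast<uint32_t>(n) << 1) ^ static_cast<uint32_t>(n >> 31);
}

int main() {
  printf("%u %u %u\n", SketchZigZagEncode32(0),
         SketchZigZagEncode32(-1), SketchZigZagEncode32(1));  // 0 1 2
  return 0;
}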
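Finally, a sketch of the branchless first step in VarintSize64: if a value needs more than five bytes (35 payload bits), five bytes are accounted for with a mask instead of a branch and those bits are shifted away, after which the same four comparisons as in the 32-bit case finish the job. Again illustrative only; names are made up.

#include <cstdint>
#include <cstdio>

static size_t SketchVarintSize64(uint64_t x) {
  size_t size = 1;
  // Branchless "binary search": tmp is all-ones if x needs more than 5 bytes.
  uint64_t tmp = x >= (static_cast<uint64_t>(1) << 35) ? ~uint64_t{0} : 0;
  size += 5 & tmp;  // account for 5 bytes at once
  x >>= 35 & tmp;   // and drop the 35 payload bits they cover
  if (x > 0x7F) size++;
  if (x > 0x3FFF) size++;
  if (x > 0x1FFFFF) size++;
  if (x > 0xFFFFFFF) size++;
  return size;
}

int main() {
  printf("%zu\n", SketchVarintSize64(1));                               // 1
  printf("%zu\n", SketchVarintSize64(static_cast<uint64_t>(1) << 35));  // 6
  printf("%zu\n", SketchVarintSize64(~static_cast<uint64_t>(0)));       // 10
  return 0;
}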