diff options
Diffstat (limited to 'tensorflow/core/lib/strings')
-rw-r--r-- | tensorflow/core/lib/strings/numbers.cc | 260 | ||||
-rw-r--r-- | tensorflow/core/lib/strings/numbers.h | 92 | ||||
-rw-r--r-- | tensorflow/core/lib/strings/numbers_test.cc | 113 | ||||
-rw-r--r-- | tensorflow/core/lib/strings/ordered_code.cc | 515 | ||||
-rw-r--r-- | tensorflow/core/lib/strings/ordered_code.h | 77 | ||||
-rw-r--r-- | tensorflow/core/lib/strings/ordered_code_test.cc | 1183 | ||||
-rw-r--r-- | tensorflow/core/lib/strings/str_util.cc | 312 | ||||
-rw-r--r-- | tensorflow/core/lib/strings/str_util.h | 149 | ||||
-rw-r--r-- | tensorflow/core/lib/strings/str_util_test.cc | 258 | ||||
-rw-r--r-- | tensorflow/core/lib/strings/strcat.cc | 194 | ||||
-rw-r--r-- | tensorflow/core/lib/strings/strcat.h | 229 | ||||
-rw-r--r-- | tensorflow/core/lib/strings/strcat_test.cc | 324 | ||||
-rw-r--r-- | tensorflow/core/lib/strings/stringprintf.cc | 85 | ||||
-rw-r--r-- | tensorflow/core/lib/strings/stringprintf.h | 37 | ||||
-rw-r--r-- | tensorflow/core/lib/strings/stringprintf_test.cc | 113 |
15 files changed, 3941 insertions, 0 deletions
diff --git a/tensorflow/core/lib/strings/numbers.cc b/tensorflow/core/lib/strings/numbers.cc new file mode 100644 index 0000000000..d61129fb3f --- /dev/null +++ b/tensorflow/core/lib/strings/numbers.cc @@ -0,0 +1,260 @@ +#include "tensorflow/core/lib/strings/numbers.h" + +#include <float.h> +#include <stdio.h> +#include <stdlib.h> +#include <algorithm> +#include <cmath> + +#include "tensorflow/core/platform/port.h" +#include "tensorflow/core/platform/logging.h" + +namespace tensorflow { +namespace strings { + +char* FastInt32ToBufferLeft(int32 i, char* buffer) { + uint32 u = i; + if (i < 0) { + *buffer++ = '-'; + // We need to do the negation in modular (i.e., "unsigned") + // arithmetic; MSVC++ apprently warns for plain "-u", so + // we write the equivalent expression "0 - u" instead. + u = 0 - u; + } + return FastUInt32ToBufferLeft(u, buffer); +} + +char* FastUInt32ToBufferLeft(uint32 i, char* buffer) { + char* start = buffer; + do { + *buffer++ = ((i % 10) + '0'); + i /= 10; + } while (i > 0); + *buffer = 0; + std::reverse(start, buffer); + return buffer; +} + +char* FastInt64ToBufferLeft(int64 i, char* buffer) { + uint64 u = i; + if (i < 0) { + *buffer++ = '-'; + u = 0 - u; + } + return FastUInt64ToBufferLeft(u, buffer); +} + +char* FastUInt64ToBufferLeft(uint64 i, char* buffer) { + char* start = buffer; + do { + *buffer++ = ((i % 10) + '0'); + i /= 10; + } while (i > 0); + *buffer = 0; + std::reverse(start, buffer); + return buffer; +} + +static const double kDoublePrecisionCheckMax = DBL_MAX / 1.000000000000001; + +char* DoubleToBuffer(double value, char* buffer) { + // DBL_DIG is 15 for IEEE-754 doubles, which are used on almost all + // platforms these days. Just in case some system exists where DBL_DIG + // is significantly larger -- and risks overflowing our buffer -- we have + // this assert. + static_assert(DBL_DIG < 20, "DBL_DIG is too big"); + + bool full_precision_needed = true; + if (std::abs(value) <= kDoublePrecisionCheckMax) { + int snprintf_result = + snprintf(buffer, kFastToBufferSize, "%.*g", DBL_DIG, value); + + // The snprintf should never overflow because the buffer is significantly + // larger than the precision we asked for. + DCHECK(snprintf_result > 0 && snprintf_result < kFastToBufferSize); + + full_precision_needed = strtod(buffer, NULL) != value; + } + + if (full_precision_needed) { + int snprintf_result = + snprintf(buffer, kFastToBufferSize, "%.*g", DBL_DIG + 2, value); + + // Should never overflow; see above. + DCHECK(snprintf_result > 0 && snprintf_result < kFastToBufferSize); + } + return buffer; +} + +bool safe_strto64(const char* str, int64* value) { + if (!str) return false; + + // Skip leading space. + while (isspace(*str)) ++str; + + int64 vlimit = kint64max; + int sign = 1; + if (*str == '-') { + sign = -1; + ++str; + // Different limit for positive and negative integers. + vlimit = kint64min; + } + + if (!isdigit(*str)) return false; + + int64 result = 0; + if (sign == 1) { + do { + int digit = *str - '0'; + if ((vlimit - digit) / 10 < result) { + return false; + } + result = result * 10 + digit; + ++str; + } while (isdigit(*str)); + } else { + do { + int digit = *str - '0'; + if ((vlimit + digit) / 10 > result) { + return false; + } + result = result * 10 - digit; + ++str; + } while (isdigit(*str)); + } + + // Skip trailing space. + while (isspace(*str)) ++str; + + if (*str) return false; + + *value = result; + return true; +} + +bool safe_strto32(const char* str, int32* value) { + if (!str) return false; + + // Skip leading space. + while (isspace(*str)) ++str; + + int64 vmax = kint32max; + int sign = 1; + if (*str == '-') { + sign = -1; + ++str; + // Different max for positive and negative integers. + ++vmax; + } + + if (!isdigit(*str)) return false; + + int64 result = 0; + do { + result = result * 10 + *str - '0'; + if (result > vmax) { + return false; + } + ++str; + } while (isdigit(*str)); + + // Skip trailing space. + while (isspace(*str)) ++str; + + if (*str) return false; + + *value = result * sign; + return true; +} + +bool safe_strtof(const char* str, float* value) { + char* endptr; + *value = strtof(str, &endptr); + while (isspace(*endptr)) ++endptr; + // Ignore range errors from strtod/strtof. + // The values it returns on underflow and + // overflow are the right fallback in a + // robust setting. + return *str != '\0' && *endptr == '\0'; +} + +char* FloatToBuffer(float value, char* buffer) { + // FLT_DIG is 6 for IEEE-754 floats, which are used on almost all + // platforms these days. Just in case some system exists where FLT_DIG + // is significantly larger -- and risks overflowing our buffer -- we have + // this assert. + static_assert(FLT_DIG < 10, "FLT_DIG is too big"); + + int snprintf_result = + snprintf(buffer, kFastToBufferSize, "%.*g", FLT_DIG, value); + + // The snprintf should never overflow because the buffer is significantly + // larger than the precision we asked for. + DCHECK(snprintf_result > 0 && snprintf_result < kFastToBufferSize); + + float parsed_value; + if (!safe_strtof(buffer, &parsed_value) || parsed_value != value) { + snprintf_result = + snprintf(buffer, kFastToBufferSize, "%.*g", FLT_DIG + 2, value); + + // Should never overflow; see above. + DCHECK(snprintf_result > 0 && snprintf_result < kFastToBufferSize); + } + return buffer; +} + +string FpToString(Fprint fp) { + char buf[17]; + snprintf(buf, sizeof(buf), "%016llx", static_cast<uint64>(fp)); + return string(buf); +} + +bool StringToFp(const string& s, Fprint* fp) { + char junk; + uint64 result; + if (sscanf(s.c_str(), "%llx%c", &result, &junk) == 1) { + *fp = result; + return true; + } else { + return false; + } +} + +string HumanReadableNumBytes(int64 num_bytes) { + if (num_bytes == kint64min) { + // Special case for number with not representable negation. + return "-8E"; + } + + const char* neg_str = (num_bytes < 0) ? "-" : ""; + if (num_bytes < 0) { + num_bytes = -num_bytes; + } + + // Special case for bytes. + if (num_bytes < 1024) { + // No fractions for bytes. + char buf[8]; // Longest possible string is '-XXXXB' + snprintf(buf, sizeof(buf), "%s%lldB", neg_str, + static_cast<int64>(num_bytes)); + return string(buf); + } + + static const char units[] = "KMGTPE"; // int64 only goes up to E. + const char* unit = units; + while (num_bytes >= static_cast<int64>(1024) * 1024) { + num_bytes /= 1024; + ++unit; + CHECK(unit < units + TF_ARRAYSIZE(units)); + } + + // We use SI prefixes. + char buf[16]; + snprintf(buf, sizeof(buf), ((*unit == 'K') ? "%s%.1f%ciB" : "%s%.2f%ciB"), + neg_str, num_bytes / 1024.0, *unit); + return string(buf); +} + +} // namespace strings +} // namespace tensorflow diff --git a/tensorflow/core/lib/strings/numbers.h b/tensorflow/core/lib/strings/numbers.h new file mode 100644 index 0000000000..a30a862279 --- /dev/null +++ b/tensorflow/core/lib/strings/numbers.h @@ -0,0 +1,92 @@ +#ifndef TENSORFLOW_LIB_STRINGS_NUMBERS_H_ +#define TENSORFLOW_LIB_STRINGS_NUMBERS_H_ + +#include <string> + +#include "tensorflow/core/platform/port.h" + +namespace tensorflow { +namespace strings { + +// ---------------------------------------------------------------------- +// FastIntToBufferLeft() +// These are intended for speed. +// +// All functions take the output buffer as an arg. FastInt() uses +// at most 22 bytes, FastTime() uses exactly 30 bytes. They all +// return a pointer to the beginning of the output, which is the same as +// the beginning of the input buffer. +// +// NOTE: In 64-bit land, sizeof(time_t) is 8, so it is possible +// to pass to FastTimeToBuffer() a time whose year cannot be +// represented in 4 digits. In this case, the output buffer +// will contain the string "Invalid:<value>" +// ---------------------------------------------------------------------- + +// Previously documented minimums -- the buffers provided must be at least this +// long, though these numbers are subject to change: +// Int32, UInt32: 12 bytes +// Int64, UInt64, Int, Uint: 22 bytes +// Time: 30 bytes +// Use kFastToBufferSize rather than hardcoding constants. +static const int kFastToBufferSize = 32; + +// ---------------------------------------------------------------------- +// FastInt32ToBufferLeft() +// FastUInt32ToBufferLeft() +// FastInt64ToBufferLeft() +// FastUInt64ToBufferLeft() +// +// These functions convert their numeric argument to an ASCII +// representation of the numeric value in base 10, with the +// representation being left-aligned in the buffer. The caller is +// responsible for ensuring that the buffer has enough space to hold +// the output. The buffer should typically be at least kFastToBufferSize +// bytes. +// +// Returns a pointer to the end of the string (i.e. the null character +// terminating the string). +// ---------------------------------------------------------------------- + +char* FastInt32ToBufferLeft(int32 i, char* buffer); // at least 12 bytes +char* FastUInt32ToBufferLeft(uint32 i, char* buffer); // at least 12 bytes +char* FastInt64ToBufferLeft(int64 i, char* buffer); // at least 22 bytes +char* FastUInt64ToBufferLeft(uint64 i, char* buffer); // at least 22 bytes + +// Required buffer size for DoubleToBuffer is kFastToBufferSize. +// Required buffer size for FloatToBuffer is kFastToBufferSize. +char* DoubleToBuffer(double i, char* buffer); +char* FloatToBuffer(float i, char* buffer); + +// Convert a 64-bit fingerprint value to an ASCII representation. +string FpToString(Fprint fp); + +// Attempt to parse a fingerprint in the form encoded by FpToString. If +// successsful, stores the fingerprint in *fp and returns true. Otherwise, +// returns false. +bool StringToFp(const string& s, Fprint* fp); + +// Convert strings to 32bit integer values. +// Leading and trailing spaces are allowed. +// Return false with overflow or invalid input. +bool safe_strto32(const char* str, int32* value); + +// Convert strings to 64bit integer values. +// Leading and trailing spaces are allowed. +// Return false with overflow or invalid input. +bool safe_strto64(const char* str, int64* value); + +// Convert strings to floating point values. +// Leading and trailing spaces are allowed. +// Values may be rounded on over- and underflow. +bool safe_strtof(const char* str, float* value); + +// Converts from an int64 representing a number of bytes to a +// human readable string representing the same number. +// e.g. 12345678 -> "11.77MiB". +string HumanReadableNumBytes(int64 num_bytes); + +} // namespace strings +} // namespace tensorflow + +#endif // TENSORFLOW_LIB_STRINGS_NUMBERS_H_ diff --git a/tensorflow/core/lib/strings/numbers_test.cc b/tensorflow/core/lib/strings/numbers_test.cc new file mode 100644 index 0000000000..b178e6af53 --- /dev/null +++ b/tensorflow/core/lib/strings/numbers_test.cc @@ -0,0 +1,113 @@ +#include "tensorflow/core/lib/strings/numbers.h" + +#include <string> +#include <gtest/gtest.h> + +namespace tensorflow { +namespace strings { + +// NOTE: most of the routines in numbers.h are tested indirectly through +// strcat_test.cc in this directory. + +// Test StrCat of ints and longs of various sizes and signdedness. +TEST(FpToString, Ints) { + for (int s = 0; s < 64; s++) { + for (int delta = -1; delta <= 1; delta++) { + uint64 fp = (1ull << s) + delta; + string s = FpToString(fp); + uint64 fp2; + EXPECT_TRUE(StringToFp(s, &fp2)); + EXPECT_EQ(fp, fp2); + } + } + Fprint dummy; + EXPECT_FALSE(StringToFp("", &dummy)); + EXPECT_FALSE(StringToFp("xyz", &dummy)); + EXPECT_FALSE(StringToFp("0000000000000000xyz", &dummy)); +} + +TEST(HumanReadableNumBytes, Bytes) { + EXPECT_EQ("0B", HumanReadableNumBytes(0)); + EXPECT_EQ("4B", HumanReadableNumBytes(4)); + EXPECT_EQ("1023B", HumanReadableNumBytes(1023)); + + EXPECT_EQ("1.0KiB", HumanReadableNumBytes(1024)); + EXPECT_EQ("1.0KiB", HumanReadableNumBytes(1025)); + EXPECT_EQ("1.5KiB", HumanReadableNumBytes(1500)); + EXPECT_EQ("1.9KiB", HumanReadableNumBytes(1927)); + + EXPECT_EQ("2.0KiB", HumanReadableNumBytes(2048)); + EXPECT_EQ("1.00MiB", HumanReadableNumBytes(1 << 20)); + EXPECT_EQ("11.77MiB", HumanReadableNumBytes(12345678)); + EXPECT_EQ("1.00GiB", HumanReadableNumBytes(1 << 30)); + + EXPECT_EQ("1.00TiB", HumanReadableNumBytes(1LL << 40)); + EXPECT_EQ("1.00PiB", HumanReadableNumBytes(1LL << 50)); + EXPECT_EQ("1.00EiB", HumanReadableNumBytes(1LL << 60)); + + // Try a few negative numbers + EXPECT_EQ("-1B", HumanReadableNumBytes(-1)); + EXPECT_EQ("-4B", HumanReadableNumBytes(-4)); + EXPECT_EQ("-1000B", HumanReadableNumBytes(-1000)); + EXPECT_EQ("-11.77MiB", HumanReadableNumBytes(-12345678)); + EXPECT_EQ("-8E", HumanReadableNumBytes(kint64min)); +} + +TEST(safe_strto32, Int32s) { + int32 result; + + EXPECT_EQ(true, safe_strto32("1", &result)); + EXPECT_EQ(1, result); + EXPECT_EQ(true, safe_strto32("123", &result)); + EXPECT_EQ(123, result); + EXPECT_EQ(true, safe_strto32(" -123 ", &result)); + EXPECT_EQ(-123, result); + EXPECT_EQ(true, safe_strto32("2147483647", &result)); + EXPECT_EQ(2147483647, result); + EXPECT_EQ(true, safe_strto32("-2147483648", &result)); + EXPECT_EQ(-2147483648, result); + + // Invalid argument + EXPECT_EQ(false, safe_strto32(" 132as ", &result)); + EXPECT_EQ(false, safe_strto32(" 132.2 ", &result)); + EXPECT_EQ(false, safe_strto32(" -", &result)); + EXPECT_EQ(false, safe_strto32("", &result)); + EXPECT_EQ(false, safe_strto32(" ", &result)); + EXPECT_EQ(false, safe_strto32("123 a", &result)); + + // Overflow + EXPECT_EQ(false, safe_strto32("2147483648", &result)); + EXPECT_EQ(false, safe_strto32("-2147483649", &result)); +} + +TEST(safe_strto64, Int64s) { + int64 result; + + EXPECT_EQ(true, safe_strto64("1", &result)); + EXPECT_EQ(1, result); + EXPECT_EQ(true, safe_strto64("123", &result)); + EXPECT_EQ(123, result); + EXPECT_EQ(true, safe_strto64(" -123 ", &result)); + EXPECT_EQ(-123, result); + EXPECT_EQ(true, safe_strto64("9223372036854775807", &result)); + EXPECT_EQ(9223372036854775807, result); + EXPECT_EQ(true, safe_strto64("-9223372036854775808", &result)); + // kint64min == -9223372036854775808 + // Use -9223372036854775808 directly results in out of range error + EXPECT_EQ(kint64min, result); + + // Invalid argument + EXPECT_EQ(false, safe_strto64(" 132as ", &result)); + EXPECT_EQ(false, safe_strto64(" 132.2 ", &result)); + EXPECT_EQ(false, safe_strto64(" -", &result)); + EXPECT_EQ(false, safe_strto64("", &result)); + EXPECT_EQ(false, safe_strto64(" ", &result)); + EXPECT_EQ(false, safe_strto64("123 a", &result)); + + // Overflow + EXPECT_EQ(false, safe_strto64("9223372036854775808", &result)); + EXPECT_EQ(false, safe_strto64("-9223372036854775809", &result)); +} + +} // namespace strings +} // namespace tensorflow diff --git a/tensorflow/core/lib/strings/ordered_code.cc b/tensorflow/core/lib/strings/ordered_code.cc new file mode 100644 index 0000000000..ec67595ebb --- /dev/null +++ b/tensorflow/core/lib/strings/ordered_code.cc @@ -0,0 +1,515 @@ +#include "tensorflow/core/lib/strings/ordered_code.h" + +#include <assert.h> +#include <stddef.h> + +#include "tensorflow/core/lib/core/stringpiece.h" +#include "tensorflow/core/platform/logging.h" + +namespace tensorflow { +namespace strings { + +// We encode a string in different ways depending on whether the item +// should be in lexicographically increasing or decreasing order. +// +// +// Lexicographically increasing order +// +// We want a string-to-string mapping F(x) such that for any two strings +// +// x < y => F(x) < F(y) +// +// In addition to the normal characters '\x00' through '\xff', we want to +// encode a few extra symbols in strings: +// +// <sep> Separator between items +// <infinity> Infinite string +// +// Therefore we need an alphabet with at least 258 symbols. Each +// character '\1' through '\xfe' is mapped to itself. The other four are +// encoded into two-letter sequences starting with '\0' and '\xff': +// +// <sep> encoded as => \0\1 +// \0 encoded as => \0\xff +// \xff encoded as => \xff\x00 +// <infinity> encoded as => \xff\xff +// +// The remaining two-letter sequences starting with '\0' and '\xff' are +// currently unused. +// +// F(<infinity>) is defined above. For any finite string x, F(x) is the +// the encodings of x's characters followed by the encoding for <sep>. The +// ordering of two finite strings is the same as the ordering of the +// respective characters at the first position where they differ, which in +// turn is the same as the ordering of the encodings of those two +// characters. Moreover, for every finite string x, F(x) < F(<infinity>). +// +// +// Lexicographically decreasing order +// +// We want a string-to-string mapping G(x) such that for any two strings, +// whether finite or not, +// +// x < y => G(x) > G(y) +// +// To achieve this, define G(x) to be the inversion of F(x): I(F(x)). In +// other words, invert every bit in F(x) to get G(x). For example, +// +// x = \x00\x13\xff +// F(x) = \x00\xff\x13\xff\x00\x00\x01 escape \0, \xff, append F(<sep>) +// G(x) = \xff\x00\xec\x00\xff\xff\xfe invert every bit in F(x) +// +// x = <infinity> +// F(x) = \xff\xff +// G(x) = \x00\x00 +// +// Another example is +// +// x F(x) G(x) = I(F(x)) +// - ---- -------------- +// <infinity> \xff\xff \x00\x00 +// "foo" foo\0\1 \x99\x90\x90\xff\xfe +// "aaa" aaa\0\1 \x9e\x9e\x9e\xff\xfe +// "aa" aa\0\1 \x9e\x9e\xff\xfe +// "" \0\1 \xff\xfe +// +// More generally and rigorously, if for any two strings x and y +// +// F(x) < F(y) => I(F(x)) > I(F(y)) (1) +// +// it would follow that x < y => G(x) > G(y) because +// +// x < y => F(x) < F(y) => G(x) = I(F(x)) > I(F(y)) = G(y) +// +// We now show why (1) is true, in two parts. Notice that for any two +// strings x < y, F(x) is *not* a proper prefix of F(y). Suppose x is a +// proper prefix of y (say, x="abc" < y="abcd"). F(x) and F(y) diverge at +// the F(<sep>) in F(x) (v. F('d') in the example). Suppose x is not a +// proper prefix of y (say, x="abce" < y="abd"), F(x) and F(y) diverge at +// their respective encodings of the characters where x and y diverge +// (F('c') v. F('d')). Finally, if y=<infinity>, we can see that +// F(y)=\xff\xff is not the prefix of F(x) for any finite string x, simply +// by considering all the possible first characters of F(x). +// +// Given that F(x) is not a proper prefix F(y), the order of F(x) and F(y) +// is determined by the byte where F(x) and F(y) diverge. For example, the +// order of F(x)="eefh" and F(y)="eeg" is determined by their third +// characters. I(p) inverts each byte in p, which effectively subtracts +// each byte from 0xff. So, in this example, I('f') > I('g'), and thus +// I(F(x)) > I(F(y)). +// +// +// Implementation +// +// To implement G(x) efficiently, we use C++ template to instantiate two +// versions of the code to produce F(x), one for normal encoding (giving us +// F(x)) and one for inverted encoding (giving us G(x) = I(F(x))). + +static const char kEscape1 = '\000'; +static const char kNullCharacter = '\xff'; // Combined with kEscape1 +static const char kSeparator = '\001'; // Combined with kEscape1 + +static const char kEscape2 = '\xff'; +static const char kInfinity = '\xff'; // Combined with kEscape2 +static const char kFFCharacter = '\000'; // Combined with kEscape2 + +static const char kEscape1_Separator[2] = {kEscape1, kSeparator}; + +// Append to "*dest" the "len" bytes starting from "*src". +inline static void AppendBytes(string* dest, const char* src, int len) { + dest->append(src, len); +} + +inline bool IsSpecialByte(char c) { return ((unsigned char)(c + 1)) < 2; } + +// Return a pointer to the first byte in the range "[start..limit)" +// whose value is 0 or 255 (kEscape1 or kEscape2). If no such byte +// exists in the range, returns "limit". +inline const char* SkipToNextSpecialByte(const char* start, const char* limit) { + // If these constants were ever changed, this routine needs to change + DCHECK_EQ(kEscape1, 0); + DCHECK_EQ(kEscape2 & 0xffu, 255u); + const char* p = start; + while (p < limit && !IsSpecialByte(*p)) { + p++; + } + return p; +} + +// Expose SkipToNextSpecialByte for testing purposes +const char* OrderedCode::TEST_SkipToNextSpecialByte(const char* start, + const char* limit) { + return SkipToNextSpecialByte(start, limit); +} + +// Helper routine to encode "s" and append to "*dest", escaping special +// characters. +inline static void EncodeStringFragment(string* dest, StringPiece s) { + const char* p = s.data(); + const char* limit = p + s.size(); + const char* copy_start = p; + while (true) { + p = SkipToNextSpecialByte(p, limit); + if (p >= limit) break; // No more special characters that need escaping + char c = *(p++); + DCHECK(IsSpecialByte(c)); + if (c == kEscape1) { + AppendBytes(dest, copy_start, p - copy_start - 1); + dest->push_back(kEscape1); + dest->push_back(kNullCharacter); + copy_start = p; + } else { + assert(c == kEscape2); + AppendBytes(dest, copy_start, p - copy_start - 1); + dest->push_back(kEscape2); + dest->push_back(kFFCharacter); + copy_start = p; + } + } + if (p > copy_start) { + AppendBytes(dest, copy_start, p - copy_start); + } +} + +void OrderedCode::WriteString(string* dest, StringPiece s) { + EncodeStringFragment(dest, s); + AppendBytes(dest, kEscape1_Separator, 2); +} + +void OrderedCode::WriteNumIncreasing(string* dest, uint64 val) { + // Values are encoded with a single byte length prefix, followed + // by the actual value in big-endian format with leading 0 bytes + // dropped. + unsigned char buf[9]; // 8 bytes for value plus one byte for length + int len = 0; + while (val > 0) { + len++; + buf[9 - len] = (val & 0xff); + val >>= 8; + } + buf[9 - len - 1] = (unsigned char)len; + len++; + AppendBytes(dest, reinterpret_cast<const char*>(buf + 9 - len), len); +} + +// Parse the encoding of a previously encoded string. +// If parse succeeds, return true, consume encoding from +// "*src", and if result != NULL append the decoded string to "*result". +// Otherwise, return false and leave both undefined. +inline static bool ReadStringInternal(StringPiece* src, string* result) { + const char* start = src->data(); + const char* string_limit = src->data() + src->size(); + + // We only scan up to "limit-2" since a valid string must end with + // a two character terminator: 'kEscape1 kSeparator' + const char* limit = string_limit - 1; + const char* copy_start = start; + while (true) { + start = SkipToNextSpecialByte(start, limit); + if (start >= limit) break; // No terminator sequence found + const char c = *(start++); + // If inversion is required, instead of inverting 'c', we invert the + // character constants to which 'c' is compared. We get the same + // behavior but save the runtime cost of inverting 'c'. + DCHECK(IsSpecialByte(c)); + if (c == kEscape1) { + if (result) { + AppendBytes(result, copy_start, start - copy_start - 1); + } + // kEscape1 kSeparator ends component + // kEscape1 kNullCharacter represents '\0' + const char next = *(start++); + if (next == kSeparator) { + src->remove_prefix(start - src->data()); + return true; + } else if (next == kNullCharacter) { + if (result) { + *result += '\0'; + } + } else { + return false; + } + copy_start = start; + } else { + assert(c == kEscape2); + if (result) { + AppendBytes(result, copy_start, start - copy_start - 1); + } + // kEscape2 kFFCharacter represents '\xff' + // kEscape2 kInfinity is an error + const char next = *(start++); + if (next == kFFCharacter) { + if (result) { + *result += '\xff'; + } + } else { + return false; + } + copy_start = start; + } + } + return false; +} + +bool OrderedCode::ReadString(StringPiece* src, string* result) { + return ReadStringInternal(src, result); +} + +bool OrderedCode::ReadNumIncreasing(StringPiece* src, uint64* result) { + if (src->empty()) { + return false; // Not enough bytes + } + + // Decode length byte + const size_t len = static_cast<unsigned char>((*src)[0]); + + // If len > 0 and src is longer than 1, the first byte of "payload" + // must be non-zero (otherwise the encoding is not minimal). + // In opt mode, we don't enforce that encodings must be minimal. + DCHECK(0 == len || src->size() == 1 || (*src)[1] != '\0') + << "invalid encoding"; + + if (len + 1 > src->size() || len > 8) { + return false; // Not enough bytes or too many bytes + } + + if (result) { + uint64 tmp = 0; + for (size_t i = 0; i < len; i++) { + tmp <<= 8; + tmp |= static_cast<unsigned char>((*src)[1 + i]); + } + *result = tmp; + } + src->remove_prefix(len + 1); + return true; +} + +void OrderedCode::TEST_Corrupt(string* str, int k) { + int seen_seps = 0; + for (size_t i = 0; i + 1 < str->size(); i++) { + if ((*str)[i] == kEscape1 && (*str)[i + 1] == kSeparator) { + seen_seps++; + if (seen_seps == k) { + (*str)[i + 1] = kSeparator + 1; + return; + } + } + } +} + +// Signed number encoding/decoding ///////////////////////////////////// +// +// The format is as follows: +// +// The first bit (the most significant bit of the first byte) +// represents the sign, 0 if the number is negative and +// 1 if the number is >= 0. +// +// Any unbroken sequence of successive bits with the same value as the sign +// bit, up to 9 (the 8th and 9th are the most significant bits of the next +// byte), are size bits that count the number of bytes after the first byte. +// That is, the total length is between 1 and 10 bytes. +// +// The value occupies the bits after the sign bit and the "size bits" +// till the end of the string, in network byte order. If the number +// is negative, the bits are in 2-complement. +// +// +// Example 1: number 0x424242 -> 4 byte big-endian hex string 0xf0424242: +// +// +---------------+---------------+---------------+---------------+ +// 1 1 1 1 0 0 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 1 0 +// +---------------+---------------+---------------+---------------+ +// ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ +// | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | +// | | | | payload: the remaining bits after the sign and size bits +// | | | | and the delimiter bit, the value is 0x424242 +// | | | | +// | size bits: 3 successive bits with the same value as the sign bit +// | (followed by a delimiter bit with the opposite value) +// | mean that there are 3 bytes after the first byte, 4 total +// | +// sign bit: 1 means that the number is non-negative +// +// Example 2: negative number -0x800 -> 2 byte big-endian hex string 0x3800: +// +// +---------------+---------------+ +// 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 +// +---------------+---------------+ +// ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ +// | | | | | | | | | | | | | | | | | | | | | | | | | | | | | +// | | payload: the remaining bits after the sign and size bits and the +// | | delimiter bit, 2-complement because of the negative sign, +// | | value is ~0x7ff, represents the value -0x800 +// | | +// | size bits: 1 bit with the same value as the sign bit +// | (followed by a delimiter bit with the opposite value) +// | means that there is 1 byte after the first byte, 2 total +// | +// sign bit: 0 means that the number is negative +// +// +// Compared with the simpler unsigned format used for uint64 numbers, +// this format is more compact for small numbers, namely one byte encodes +// numbers in the range [-64,64), two bytes cover the range [-2^13,2^13), etc. +// In general, n bytes encode numbers in the range [-2^(n*7-1),2^(n*7-1)). +// (The cross-over point for compactness of representation is 8 bytes, +// where this format only covers the range [-2^55,2^55), +// whereas an encoding with sign bit and length in the first byte and +// payload in all following bytes would cover [-2^56,2^56).) + +static const int kMaxSigned64Length = 10; + +// This array maps encoding length to header bits in the first two bytes. +static const char kLengthToHeaderBits[1 + kMaxSigned64Length][2] = { + {0, 0}, {'\x80', 0}, {'\xc0', 0}, {'\xe0', 0}, + {'\xf0', 0}, {'\xf8', 0}, {'\xfc', 0}, {'\xfe', 0}, + {'\xff', 0}, {'\xff', '\x80'}, {'\xff', '\xc0'}}; + +// This array maps encoding lengths to the header bits that overlap with +// the payload and need fixing when reading. +static const uint64 kLengthToMask[1 + kMaxSigned64Length] = { + 0ULL, + 0x80ULL, + 0xc000ULL, + 0xe00000ULL, + 0xf0000000ULL, + 0xf800000000ULL, + 0xfc0000000000ULL, + 0xfe000000000000ULL, + 0xff00000000000000ULL, + 0x8000000000000000ULL, + 0ULL}; + +// This array maps the number of bits in a number to the encoding +// length produced by WriteSignedNumIncreasing. +// For positive numbers, the number of bits is 1 plus the most significant +// bit position (the highest bit position in a positive int64 is 63). +// For a negative number n, we count the bits in ~n. +// That is, length = kBitsToLength[Bits::Log2Floor64(n < 0 ? ~n : n) + 1]. +static const int8 kBitsToLength[1 + 63] = { + 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 4, + 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 7, 7, + 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 10}; + +#if defined(__GNUC__) +// Returns floor(lg(n)). Returns -1 if n == 0. +static int Log2Floor64(uint64 n) { + return n == 0 ? -1 : 63 ^ __builtin_clzll(n); +} +#else +// Portable slow version +static int Log2Floor32_Portable(uint32 n) { + if (n == 0) return -1; + int log = 0; + uint32 value = n; + for (int i = 4; i >= 0; --i) { + int shift = (1 << i); + uint32 x = value >> shift; + if (x != 0) { + value = x; + log += shift; + } + } + assert(value == 1); + return log; +} +// Returns floor(lg(n)). Returns -1 if n == 0. +static int Log2Floor64(uint64 n) { + const uint32 topbits = static_cast<uint32>(n >> 32); + if (topbits == 0) { + // Top bits are zero, so scan in bottom bits + return Log2Floor32_Portable(static_cast<uint32>(n)); + } else { + return 32 + Log2Floor32_Portable(topbits); + } +} +#endif + +// Calculates the encoding length in bytes of the signed number n. +static inline int SignedEncodingLength(int64 n) { + return kBitsToLength[Log2Floor64(n < 0 ? ~n : n) + 1]; +} + +static void StoreBigEndian64(char* dst, uint64 v) { + for (int i = 0; i < 8; i++) { + dst[i] = (v >> (56 - 8 * i)) & 0xff; + } +} + +static uint64 LoadBigEndian64(const char* src) { + uint64 result = 0; + for (int i = 0; i < 8; i++) { + unsigned char c = static_cast<unsigned char>(src[i]); + result |= static_cast<uint64>(c) << (56 - 8 * i); + } + return result; +} + +void OrderedCode::WriteSignedNumIncreasing(string* dest, int64 val) { + const uint64 x = val < 0 ? ~val : val; + if (x < 64) { // fast path for encoding length == 1 + *dest += kLengthToHeaderBits[1][0] ^ val; + return; + } + // buf = val in network byte order, sign extended to 10 bytes + const char sign_byte = val < 0 ? '\xff' : '\0'; + char buf[10] = { + sign_byte, sign_byte, + }; + StoreBigEndian64(buf + 2, val); + static_assert(sizeof(buf) == kMaxSigned64Length, "max length size mismatch"); + const int len = SignedEncodingLength(x); + DCHECK_GE(len, 2); + char* const begin = buf + sizeof(buf) - len; + begin[0] ^= kLengthToHeaderBits[len][0]; + begin[1] ^= kLengthToHeaderBits[len][1]; // ok because len >= 2 + dest->append(begin, len); +} + +bool OrderedCode::ReadSignedNumIncreasing(StringPiece* src, int64* result) { + if (src->empty()) return false; + const uint64 xor_mask = (!((*src)[0] & 0x80)) ? ~0ULL : 0ULL; + const unsigned char first_byte = (*src)[0] ^ (xor_mask & 0xff); + + // now calculate and test length, and set x to raw (unmasked) result + int len; + uint64 x; + if (first_byte != 0xff) { + len = 7 - Log2Floor64(first_byte ^ 0xff); + if (src->size() < static_cast<size_t>(len)) return false; + x = xor_mask; // sign extend using xor_mask + for (int i = 0; i < len; ++i) + x = (x << 8) | static_cast<unsigned char>((*src)[i]); + } else { + len = 8; + if (src->size() < static_cast<size_t>(len)) return false; + const unsigned char second_byte = (*src)[1] ^ (xor_mask & 0xff); + if (second_byte >= 0x80) { + if (second_byte < 0xc0) { + len = 9; + } else { + const unsigned char third_byte = (*src)[2] ^ (xor_mask & 0xff); + if (second_byte == 0xc0 && third_byte < 0x80) { + len = 10; + } else { + return false; // either len > 10 or len == 10 and #bits > 63 + } + } + if (src->size() < static_cast<size_t>(len)) return false; + } + x = LoadBigEndian64(src->data() + len - 8); + } + + x ^= kLengthToMask[len]; // remove spurious header bits + + DCHECK_EQ(len, SignedEncodingLength(x)) << "invalid encoding"; + + if (result) *result = x; + src->remove_prefix(len); + return true; +} + +} // namespace strings +} // namespace tensorflow diff --git a/tensorflow/core/lib/strings/ordered_code.h b/tensorflow/core/lib/strings/ordered_code.h new file mode 100644 index 0000000000..39f1df9a94 --- /dev/null +++ b/tensorflow/core/lib/strings/ordered_code.h @@ -0,0 +1,77 @@ +// This module provides routines for encoding a sequence of typed +// entities into a string. The resulting strings can be +// lexicographically compared to yield the same comparison value that +// would have been generated if the encoded items had been compared +// one by one according to their type. +// +// More precisely, suppose: +// 1. string A is generated by encoding the sequence of items [A_1..A_n] +// 2. string B is generated by encoding the sequence of items [B_1..B_n] +// 3. The types match; i.e., for all i: A_i was encoded using +// the same routine as B_i +// Then: +// Comparing A vs. B lexicographically is the same as comparing +// the vectors [A_1..A_n] and [B_1..B_n] lexicographically. +// +// Furthermore, if n < m, the encoding of [A_1..A_n] is a strict prefix of +// [A_1..A_m] (unless m = n+1 and A_m is the empty string encoded with +// WriteTrailingString, in which case the encodings are equal). +// +// This module is often useful when generating multi-part sstable +// keys that have to be ordered in a particular fashion. + +#ifndef TENSORFLOW_LIB_STRINGS_ORDERED_CODE_H__ +#define TENSORFLOW_LIB_STRINGS_ORDERED_CODE_H__ + +#include <string> +#include "tensorflow/core/platform/port.h" + +namespace tensorflow { +class StringPiece; + +namespace strings { + +class OrderedCode { + public: + // ------------------------------------------------------------------- + // Encoding routines: each one of the following routines append + // one item to "*dest" in an encoding where larger values are + // ordered lexicographically after smaller values. + static void WriteString(string* dest, StringPiece str); + static void WriteNumIncreasing(string* dest, uint64 num); + static void WriteSignedNumIncreasing(string* dest, int64 num); + + // ------------------------------------------------------------------- + // Decoding routines: these extract an item earlier encoded using + // the corresponding WriteXXX() routines above. The item is read + // from "*src"; "*src" is modified to point past the decoded item; + // and if "result" is non-NULL, "*result" is modified to contain the + // result. In case of string result, the decoded string is appended to + // "*result". Returns true if the next item was read successfully, false + // otherwise. + static bool ReadString(StringPiece* src, string* result); + static bool ReadNumIncreasing(StringPiece* src, uint64* result); + static bool ReadSignedNumIncreasing(StringPiece* src, int64* result); + + // Helper for testing: corrupt "*str" by changing the kth item separator + // in the string. + static void TEST_Corrupt(string* str, int k); + + // Helper for testing. + // SkipToNextSpecialByte is an internal routine defined in the .cc file + // with the following semantics. Return a pointer to the first byte + // in the range "[start..limit)" whose value is 0 or 255. If no such + // byte exists in the range, returns "limit". + static const char* TEST_SkipToNextSpecialByte(const char* start, + const char* limit); + + private: + // This has only static methods, so disallow construction entirely + OrderedCode(); + TF_DISALLOW_COPY_AND_ASSIGN(OrderedCode); +}; + +} // namespace strings +} // namespace tensorflow + +#endif // TENSORFLOW_LIB_STRINGS_ORDERED_CODE_H__ diff --git a/tensorflow/core/lib/strings/ordered_code_test.cc b/tensorflow/core/lib/strings/ordered_code_test.cc new file mode 100644 index 0000000000..d517d14f4a --- /dev/null +++ b/tensorflow/core/lib/strings/ordered_code_test.cc @@ -0,0 +1,1183 @@ +#include "tensorflow/core/lib/strings/ordered_code.h" + +#include <float.h> +#include <stddef.h> +#include <limits> +#include <vector> + +#include <gtest/gtest.h> +#include "tensorflow/core/lib/core/stringpiece.h" +#include "tensorflow/core/lib/random/simple_philox.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/port.h" +#include "tensorflow/core/platform/test_benchmark.h" + +namespace tensorflow { +namespace strings { + +static string RandomString(random::SimplePhilox* rnd, int len) { + string x; + for (int i = 0; i < len; i++) { + x += rnd->Uniform(256); + } + return x; +} + +// --------------------------------------------------------------------- +// Utility template functions (they help templatize the tests below) + +// Read/WriteIncreasing are defined for string, uint64, int64 below. +template <typename T> +static void OCWriteIncreasing(string* dest, const T& val); +template <typename T> +static bool OCReadIncreasing(StringPiece* src, T* result); + +// Read/WriteIncreasing<string> +template <> +void OCWriteIncreasing<string>(string* dest, const string& val) { + OrderedCode::WriteString(dest, val); +} +template <> +bool OCReadIncreasing<string>(StringPiece* src, string* result) { + return OrderedCode::ReadString(src, result); +} + +// Read/WriteIncreasing<uint64> +template <> +void OCWriteIncreasing<uint64>(string* dest, const uint64& val) { + OrderedCode::WriteNumIncreasing(dest, val); +} +template <> +bool OCReadIncreasing<uint64>(StringPiece* src, uint64* result) { + return OrderedCode::ReadNumIncreasing(src, result); +} + +// Read/WriteIncreasing<int64> +template <> +void OCWriteIncreasing<int64>(string* dest, const int64& val) { + OrderedCode::WriteSignedNumIncreasing(dest, val); +} +template <> +bool OCReadIncreasing<int64>(StringPiece* src, int64* result) { + return OrderedCode::ReadSignedNumIncreasing(src, result); +} + +template <typename T> +string OCWrite(T val) { + string result; + OCWriteIncreasing<T>(&result, val); + return result; +} + +template <typename T> +void OCWriteToString(string* result, T val) { + OCWriteIncreasing<T>(result, val); +} + +template <typename T> +bool OCRead(StringPiece* s, T* val) { + return OCReadIncreasing<T>(s, val); +} + +// --------------------------------------------------------------------- +// Numbers + +template <typename T> +static T TestRead(const string& a) { + // gracefully reject any proper prefix of an encoding + for (int i = 0; i < a.size() - 1; ++i) { + StringPiece s(a.data(), i); + CHECK(!OCRead<T>(&s, NULL)); + CHECK_EQ(s, a.substr(0, i)); + } + + StringPiece s(a); + T v; + CHECK(OCRead<T>(&s, &v)); + CHECK(s.empty()); + return v; +} + +template <typename T> +static void TestWriteRead(T expected) { + EXPECT_EQ(expected, TestRead<T>(OCWrite<T>(expected))); +} + +// Verifies that the second Write* call appends a non-empty string to its +// output. +template <typename T, typename U> +static void TestWriteAppends(T first, U second) { + string encoded; + OCWriteToString<T>(&encoded, first); + string encoded_first_only = encoded; + OCWriteToString<U>(&encoded, second); + EXPECT_NE(encoded, encoded_first_only); + EXPECT_TRUE(StringPiece(encoded).starts_with(encoded_first_only)); +} + +template <typename T> +static void TestNumbers(T multiplier) { + // first test powers of 2 (and nearby numbers) + for (T x = std::numeric_limits<T>().max(); x != 0; x /= 2) { + TestWriteRead(multiplier * (x - 1)); + TestWriteRead(multiplier * x); + if (x != std::numeric_limits<T>::max()) { + TestWriteRead(multiplier * (x + 1)); + } else if (multiplier < 0 && multiplier == -1) { + TestWriteRead(-x - 1); + } + } + + random::PhiloxRandom philox(301, 17); + random::SimplePhilox rnd(&philox); + for (int bits = 1; bits <= std::numeric_limits<T>().digits; ++bits) { + // test random non-negative numbers with given number of significant bits + const uint64 mask = (~0ULL) >> (64 - bits); + for (int i = 0; i < 1000; i++) { + T x = rnd.Rand64() & mask; + TestWriteRead(multiplier * x); + T y = rnd.Rand64() & mask; + TestWriteAppends(multiplier * x, multiplier * y); + } + } +} + +// Return true iff 'a' is "before" 'b' +static bool CompareStrings(const string& a, const string& b) { return (a < b); } + +template <typename T> +static void TestNumberOrdering() { + // first the negative numbers (if T is signed, otherwise no-op) + string laststr = OCWrite<T>(std::numeric_limits<T>().min()); + for (T num = std::numeric_limits<T>().min() / 2; num != 0; num /= 2) { + string strminus1 = OCWrite<T>(num - 1); + string str = OCWrite<T>(num); + string strplus1 = OCWrite<T>(num + 1); + + CHECK(CompareStrings(strminus1, str)); + CHECK(CompareStrings(str, strplus1)); + + // Compare 'str' with 'laststr'. When we approach 0, 'laststr' is + // not necessarily before 'strminus1'. + CHECK(CompareStrings(laststr, str)); + laststr = str; + } + + // then the positive numbers + laststr = OCWrite<T>(0); + T num = 1; + while (num < std::numeric_limits<T>().max() / 2) { + num *= 2; + string strminus1 = OCWrite<T>(num - 1); + string str = OCWrite<T>(num); + string strplus1 = OCWrite<T>(num + 1); + + CHECK(CompareStrings(strminus1, str)); + CHECK(CompareStrings(str, strplus1)); + + // Compare 'str' with 'laststr'. + CHECK(CompareStrings(laststr, str)); + laststr = str; + } +} + +// Helper routine for testing TEST_SkipToNextSpecialByte +static int FindSpecial(const string& x) { + const char* p = x.data(); + const char* limit = p + x.size(); + const char* result = OrderedCode::TEST_SkipToNextSpecialByte(p, limit); + return result - p; +} + +TEST(OrderedCode, SkipToNextSpecialByte) { + for (size_t len = 0; len < 256; len++) { + random::PhiloxRandom philox(301, 17); + random::SimplePhilox rnd(&philox); + string x; + while (x.size() < len) { + char c = 1 + rnd.Uniform(254); + ASSERT_NE(c, 0); + ASSERT_NE(c, 255); + x += c; // No 0 bytes, no 255 bytes + } + EXPECT_EQ(FindSpecial(x), x.size()); + for (size_t special_pos = 0; special_pos < len; special_pos++) { + for (size_t special_test = 0; special_test < 2; special_test++) { + const char special_byte = (special_test == 0) ? 0 : 255; + string y = x; + y[special_pos] = special_byte; + EXPECT_EQ(FindSpecial(y), special_pos); + if (special_pos < 16) { + // Add some special bytes after the one at special_pos to make sure + // we still return the earliest special byte in the string + for (size_t rest = special_pos + 1; rest < len; rest++) { + if (rnd.OneIn(3)) { + y[rest] = rnd.OneIn(2) ? 0 : 255; + EXPECT_EQ(FindSpecial(y), special_pos); + } + } + } + } + } + } +} + +TEST(OrderedCode, ExhaustiveFindSpecial) { + char buf[16]; + char* limit = buf + sizeof(buf); + int count = 0; + for (int start_offset = 0; start_offset <= 5; start_offset += 5) { + // We test exhaustively with all combinations of 3 bytes starting + // at offset 0 and offset 5 (so as to test with the bytes at both + // ends of a 64-bit word). + for (size_t i = 0; i < sizeof(buf); i++) { + buf[i] = 'a'; // Not a special byte + } + for (int b0 = 0; b0 < 256; b0++) { + for (int b1 = 0; b1 < 256; b1++) { + for (int b2 = 0; b2 < 256; b2++) { + buf[start_offset + 0] = b0; + buf[start_offset + 1] = b1; + buf[start_offset + 2] = b2; + char* expected; + if (b0 == 0 || b0 == 255) { + expected = &buf[start_offset]; + } else if (b1 == 0 || b1 == 255) { + expected = &buf[start_offset + 1]; + } else if (b2 == 0 || b2 == 255) { + expected = &buf[start_offset + 2]; + } else { + expected = limit; + } + count++; + EXPECT_EQ(expected, + OrderedCode::TEST_SkipToNextSpecialByte(buf, limit)); + } + } + } + } + EXPECT_EQ(count, 256 * 256 * 256 * 2); +} + +TEST(Uint64, EncodeDecode) { TestNumbers<uint64>(1); } + +TEST(Uint64, Ordering) { TestNumberOrdering<uint64>(); } + +TEST(Int64, EncodeDecode) { + TestNumbers<int64>(1); + TestNumbers<int64>(-1); +} + +TEST(Int64, Ordering) { TestNumberOrdering<int64>(); } + +// Returns the bitwise complement of s. +static inline string StrNot(const string& s) { + string result; + for (string::const_iterator it = s.begin(); it != s.end(); ++it) + result.push_back(~*it); + return result; +} + +template <typename T> +static void TestInvalidEncoding(const string& s) { + StringPiece p(s); + EXPECT_FALSE(OCRead<T>(&p, static_cast<T*>(NULL))); + EXPECT_EQ(s, p); +} + +TEST(OrderedCodeInvalidEncodingsTest, Overflow) { + // 1U << 64, increasing and decreasing + const string k2xx64U = "\x09\x01" + string(8, 0); + TestInvalidEncoding<uint64>(k2xx64U); + + // 1 << 63 and ~(1 << 63), increasing and decreasing + const string k2xx63 = "\xff\xc0\x80" + string(7, 0); + TestInvalidEncoding<int64>(k2xx63); + TestInvalidEncoding<int64>(StrNot(k2xx63)); +} + +TEST(OrderedCodeInvalidEncodingsDeathTest, NonCanonical) { + // Test "ambiguous"/"non-canonical" encodings. + // These are non-minimal (but otherwise "valid") encodings that + // differ from the minimal encoding chosen by OrderedCode::WriteXXX + // and thus should be avoided to not mess up the string ordering of + // encodings. + + random::PhiloxRandom philox(301, 17); + random::SimplePhilox rnd(&philox); + + for (int n = 2; n <= 9; ++n) { + // The zero in non_minimal[1] is "redundant". + string non_minimal = + string(1, n - 1) + string(1, 0) + RandomString(&rnd, n - 2); + EXPECT_EQ(n, non_minimal.length()); + + EXPECT_NE(OCWrite<uint64>(0), non_minimal); +#ifndef NDEBUG + StringPiece s(non_minimal); + EXPECT_DEATH(OrderedCode::ReadNumIncreasing(&s, NULL), "invalid encoding"); +#else + TestRead<uint64>(non_minimal); +#endif + } + + for (int n = 2; n <= 10; ++n) { + // Header with 1 sign bit and n-1 size bits. + string header = string(n / 8, 0xff) + string(1, 0xff << (8 - (n % 8))); + // There are more than 7 zero bits between header bits and "payload". + string non_minimal = header + + string(1, rnd.Uniform(256) & ~*header.rbegin()) + + RandomString(&rnd, n - header.length() - 1); + EXPECT_EQ(n, non_minimal.length()); + + EXPECT_NE(OCWrite<int64>(0), non_minimal); +#ifndef NDEBUG + StringPiece s(non_minimal); + EXPECT_DEATH(OrderedCode::ReadSignedNumIncreasing(&s, NULL), + "invalid encoding") + << n; +#else + TestRead<int64>(non_minimal); +#endif + } +} + +// Returns random number with specified number of bits, +// i.e., in the range [2^(bits-1),2^bits). +static uint64 NextBits(random::SimplePhilox* rnd, int bits) { + return (bits != 0) + ? (rnd->Rand64() % (1LL << (bits - 1))) + (1LL << (bits - 1)) + : 0; +} + +template <typename T> +static void BM_WriteNum(int n, T multiplier) { + static const int kValues = 64; + T values[kValues]; + random::PhiloxRandom philox(301, 17); + random::SimplePhilox rnd(&philox); + // Use enough distinct values to confuse the branch predictor + for (int i = 0; i < kValues; i++) { + values[i] = NextBits(&rnd, n % 64) * multiplier; + } + string result; + int index = 0; + while (n-- > 0) { + result.clear(); + OCWriteToString<T>(&result, values[index % kValues]); + index++; + } +} + +template <typename T> +static void BM_ReadNum(int n, T multiplier) { + string x; + random::PhiloxRandom philox(301, 17); + random::SimplePhilox rnd(&philox); + // Use enough distinct values to confuse the branch predictor + static const int kValues = 64; + string values[kValues]; + for (int i = 0; i < kValues; i++) { + T val = NextBits(&rnd, i % 64) * multiplier; + values[i] = OCWrite<T>(val); + } + uint32 index = 0; + while (n-- > 0) { + T val; + StringPiece s = values[index++ % kValues]; + OCRead<T>(&s, &val); + } +} + +#define BENCHMARK_NUM(name, T, multiplier) \ + static void BM_Write##name(int n) { BM_WriteNum<T>(n, multiplier); } \ + BENCHMARK(BM_Write##name); \ + static void BM_Read##name(int n) { BM_ReadNum<T>(n, multiplier); } \ + BENCHMARK(BM_Read##name) + +BENCHMARK_NUM(NumIncreasing, uint64, 1); +BENCHMARK_NUM(SignedNum, int64, 1); +BENCHMARK_NUM(SignedNumNegative, int64, -1); + +#undef BENCHMARK_NUM + +// --------------------------------------------------------------------- +// Strings + +TEST(String, EncodeDecode) { + random::PhiloxRandom philox(301, 17); + random::SimplePhilox rnd(&philox); + + for (int len = 0; len < 256; len++) { + const string a = RandomString(&rnd, len); + TestWriteRead(a); + for (int len2 = 0; len2 < 64; len2++) { + const string b = RandomString(&rnd, len2); + + TestWriteAppends(a, b); + + string out; + OCWriteToString<string>(&out, a); + OCWriteToString<string>(&out, b); + + string a2, b2, dummy; + StringPiece s = out; + StringPiece s2 = out; + CHECK(OCRead<string>(&s, &a2)); + CHECK(OCRead<string>(&s2, NULL)); + CHECK_EQ(s, s2); + + CHECK(OCRead<string>(&s, &b2)); + CHECK(OCRead<string>(&s2, NULL)); + CHECK_EQ(s, s2); + + CHECK(!OCRead<string>(&s, &dummy)); + CHECK(!OCRead<string>(&s2, NULL)); + CHECK_EQ(a, a2); + CHECK_EQ(b, b2); + CHECK(s.empty()); + CHECK(s2.empty()); + } + } +} + +// 'str' is a static C-style string that may contain '\0' +#define STATIC_STR(str) StringPiece((str), sizeof(str) - 1) + +static string EncodeStringIncreasing(StringPiece value) { + string encoded; + OrderedCode::WriteString(&encoded, value); + return encoded; +} + +TEST(String, Increasing) { + // Here are a series of strings in non-decreasing order, including + // consecutive strings such that the second one is equal to, a proper + // prefix of, or has the same length as the first one. Most also contain + // the special escaping characters '\x00' and '\xff'. + ASSERT_EQ(EncodeStringIncreasing(STATIC_STR("")), + EncodeStringIncreasing(STATIC_STR(""))); + + ASSERT_LT(EncodeStringIncreasing(STATIC_STR("")), + EncodeStringIncreasing(STATIC_STR("\x00"))); + + ASSERT_EQ(EncodeStringIncreasing(STATIC_STR("\x00")), + EncodeStringIncreasing(STATIC_STR("\x00"))); + + ASSERT_LT(EncodeStringIncreasing(STATIC_STR("\x00")), + EncodeStringIncreasing(STATIC_STR("\x01"))); + + ASSERT_LT(EncodeStringIncreasing(STATIC_STR("\x01")), + EncodeStringIncreasing(STATIC_STR("a"))); + + ASSERT_EQ(EncodeStringIncreasing(STATIC_STR("a")), + EncodeStringIncreasing(STATIC_STR("a"))); + + ASSERT_LT(EncodeStringIncreasing(STATIC_STR("a")), + EncodeStringIncreasing(STATIC_STR("aa"))); + + ASSERT_LT(EncodeStringIncreasing(STATIC_STR("aa")), + EncodeStringIncreasing(STATIC_STR("\xff"))); + + ASSERT_LT(EncodeStringIncreasing(STATIC_STR("\xff")), + EncodeStringIncreasing(STATIC_STR("\xff\x00"))); + + ASSERT_LT(EncodeStringIncreasing(STATIC_STR("\xff\x00")), + EncodeStringIncreasing(STATIC_STR("\xff\x01"))); +} + +TEST(EncodingIsExpected, String) { + std::vector<std::pair<string, string>> data = { + {"", string("\x00\x01", 2)}, + {"foo", string("foo\x00\x01", 5)}, + {"hello", string("hello\x00\x01", 7)}, + {string("\x00\x01\xff", 3), string("\x00\xff\x01\xff\x00\x00\x01", 7)}, + }; + for (const auto& t : data) { + string result; + OrderedCode::WriteString(&result, t.first); + EXPECT_EQ(t.second, result); + + StringPiece in = result; + string decoded; + EXPECT_TRUE(OrderedCode::ReadString(&in, &decoded)); + EXPECT_EQ(t.first, decoded); + EXPECT_EQ("", in); + } +} + +TEST(EncodingIsExpected, Unsigned) { + std::vector<std::pair<uint64, string>> data = { + {0x0ull, string("\000", 1)}, + {0x1ull, string("\001\001", 2)}, + {0x2ull, string("\001\002", 2)}, + {0x1ull, string("\001\001", 2)}, + {0x2ull, string("\001\002", 2)}, + {0x3ull, string("\001\003", 2)}, + {0x3ull, string("\001\003", 2)}, + {0x4ull, string("\001\004", 2)}, + {0x5ull, string("\001\005", 2)}, + {0x7ull, string("\001\007", 2)}, + {0x8ull, string("\001\010", 2)}, + {0x9ull, string("\001\t", 2)}, + {0xfull, string("\001\017", 2)}, + {0x10ull, string("\001\020", 2)}, + {0x11ull, string("\001\021", 2)}, + {0x1full, string("\001\037", 2)}, + {0x20ull, string("\001 ", 2)}, + {0x21ull, string("\001!", 2)}, + {0x3full, string("\001?", 2)}, + {0x40ull, string("\001@", 2)}, + {0x41ull, string("\001A", 2)}, + {0x7full, string("\001\177", 2)}, + {0x80ull, string("\001\200", 2)}, + {0x81ull, string("\001\201", 2)}, + {0xffull, string("\001\377", 2)}, + {0x100ull, string("\002\001\000", 3)}, + {0x101ull, string("\002\001\001", 3)}, + {0x1ffull, string("\002\001\377", 3)}, + {0x200ull, string("\002\002\000", 3)}, + {0x201ull, string("\002\002\001", 3)}, + {0x3ffull, string("\002\003\377", 3)}, + {0x400ull, string("\002\004\000", 3)}, + {0x401ull, string("\002\004\001", 3)}, + {0x7ffull, string("\002\007\377", 3)}, + {0x800ull, string("\002\010\000", 3)}, + {0x801ull, string("\002\010\001", 3)}, + {0xfffull, string("\002\017\377", 3)}, + {0x1000ull, string("\002\020\000", 3)}, + {0x1001ull, string("\002\020\001", 3)}, + {0x1fffull, string("\002\037\377", 3)}, + {0x2000ull, string("\002 \000", 3)}, + {0x2001ull, string("\002 \001", 3)}, + {0x3fffull, string("\002?\377", 3)}, + {0x4000ull, string("\002@\000", 3)}, + {0x4001ull, string("\002@\001", 3)}, + {0x7fffull, string("\002\177\377", 3)}, + {0x8000ull, string("\002\200\000", 3)}, + {0x8001ull, string("\002\200\001", 3)}, + {0xffffull, string("\002\377\377", 3)}, + {0x10000ull, string("\003\001\000\000", 4)}, + {0x10001ull, string("\003\001\000\001", 4)}, + {0x1ffffull, string("\003\001\377\377", 4)}, + {0x20000ull, string("\003\002\000\000", 4)}, + {0x20001ull, string("\003\002\000\001", 4)}, + {0x3ffffull, string("\003\003\377\377", 4)}, + {0x40000ull, string("\003\004\000\000", 4)}, + {0x40001ull, string("\003\004\000\001", 4)}, + {0x7ffffull, string("\003\007\377\377", 4)}, + {0x80000ull, string("\003\010\000\000", 4)}, + {0x80001ull, string("\003\010\000\001", 4)}, + {0xfffffull, string("\003\017\377\377", 4)}, + {0x100000ull, string("\003\020\000\000", 4)}, + {0x100001ull, string("\003\020\000\001", 4)}, + {0x1fffffull, string("\003\037\377\377", 4)}, + {0x200000ull, string("\003 \000\000", 4)}, + {0x200001ull, string("\003 \000\001", 4)}, + {0x3fffffull, string("\003?\377\377", 4)}, + {0x400000ull, string("\003@\000\000", 4)}, + {0x400001ull, string("\003@\000\001", 4)}, + {0x7fffffull, string("\003\177\377\377", 4)}, + {0x800000ull, string("\003\200\000\000", 4)}, + {0x800001ull, string("\003\200\000\001", 4)}, + {0xffffffull, string("\003\377\377\377", 4)}, + {0x1000000ull, string("\004\001\000\000\000", 5)}, + {0x1000001ull, string("\004\001\000\000\001", 5)}, + {0x1ffffffull, string("\004\001\377\377\377", 5)}, + {0x2000000ull, string("\004\002\000\000\000", 5)}, + {0x2000001ull, string("\004\002\000\000\001", 5)}, + {0x3ffffffull, string("\004\003\377\377\377", 5)}, + {0x4000000ull, string("\004\004\000\000\000", 5)}, + {0x4000001ull, string("\004\004\000\000\001", 5)}, + {0x7ffffffull, string("\004\007\377\377\377", 5)}, + {0x8000000ull, string("\004\010\000\000\000", 5)}, + {0x8000001ull, string("\004\010\000\000\001", 5)}, + {0xfffffffull, string("\004\017\377\377\377", 5)}, + {0x10000000ull, string("\004\020\000\000\000", 5)}, + {0x10000001ull, string("\004\020\000\000\001", 5)}, + {0x1fffffffull, string("\004\037\377\377\377", 5)}, + {0x20000000ull, string("\004 \000\000\000", 5)}, + {0x20000001ull, string("\004 \000\000\001", 5)}, + {0x3fffffffull, string("\004?\377\377\377", 5)}, + {0x40000000ull, string("\004@\000\000\000", 5)}, + {0x40000001ull, string("\004@\000\000\001", 5)}, + {0x7fffffffull, string("\004\177\377\377\377", 5)}, + {0x80000000ull, string("\004\200\000\000\000", 5)}, + {0x80000001ull, string("\004\200\000\000\001", 5)}, + {0xffffffffull, string("\004\377\377\377\377", 5)}, + {0x100000000ull, string("\005\001\000\000\000\000", 6)}, + {0x100000001ull, string("\005\001\000\000\000\001", 6)}, + {0x1ffffffffull, string("\005\001\377\377\377\377", 6)}, + {0x200000000ull, string("\005\002\000\000\000\000", 6)}, + {0x200000001ull, string("\005\002\000\000\000\001", 6)}, + {0x3ffffffffull, string("\005\003\377\377\377\377", 6)}, + {0x400000000ull, string("\005\004\000\000\000\000", 6)}, + {0x400000001ull, string("\005\004\000\000\000\001", 6)}, + {0x7ffffffffull, string("\005\007\377\377\377\377", 6)}, + {0x800000000ull, string("\005\010\000\000\000\000", 6)}, + {0x800000001ull, string("\005\010\000\000\000\001", 6)}, + {0xfffffffffull, string("\005\017\377\377\377\377", 6)}, + {0x1000000000ull, string("\005\020\000\000\000\000", 6)}, + {0x1000000001ull, string("\005\020\000\000\000\001", 6)}, + {0x1fffffffffull, string("\005\037\377\377\377\377", 6)}, + {0x2000000000ull, string("\005 \000\000\000\000", 6)}, + {0x2000000001ull, string("\005 \000\000\000\001", 6)}, + {0x3fffffffffull, string("\005?\377\377\377\377", 6)}, + {0x4000000000ull, string("\005@\000\000\000\000", 6)}, + {0x4000000001ull, string("\005@\000\000\000\001", 6)}, + {0x7fffffffffull, string("\005\177\377\377\377\377", 6)}, + {0x8000000000ull, string("\005\200\000\000\000\000", 6)}, + {0x8000000001ull, string("\005\200\000\000\000\001", 6)}, + {0xffffffffffull, string("\005\377\377\377\377\377", 6)}, + {0x10000000000ull, string("\006\001\000\000\000\000\000", 7)}, + {0x10000000001ull, string("\006\001\000\000\000\000\001", 7)}, + {0x1ffffffffffull, string("\006\001\377\377\377\377\377", 7)}, + {0x20000000000ull, string("\006\002\000\000\000\000\000", 7)}, + {0x20000000001ull, string("\006\002\000\000\000\000\001", 7)}, + {0x3ffffffffffull, string("\006\003\377\377\377\377\377", 7)}, + {0x40000000000ull, string("\006\004\000\000\000\000\000", 7)}, + {0x40000000001ull, string("\006\004\000\000\000\000\001", 7)}, + {0x7ffffffffffull, string("\006\007\377\377\377\377\377", 7)}, + {0x80000000000ull, string("\006\010\000\000\000\000\000", 7)}, + {0x80000000001ull, string("\006\010\000\000\000\000\001", 7)}, + {0xfffffffffffull, string("\006\017\377\377\377\377\377", 7)}, + {0x100000000000ull, string("\006\020\000\000\000\000\000", 7)}, + {0x100000000001ull, string("\006\020\000\000\000\000\001", 7)}, + {0x1fffffffffffull, string("\006\037\377\377\377\377\377", 7)}, + {0x200000000000ull, string("\006 \000\000\000\000\000", 7)}, + {0x200000000001ull, string("\006 \000\000\000\000\001", 7)}, + {0x3fffffffffffull, string("\006?\377\377\377\377\377", 7)}, + {0x400000000000ull, string("\006@\000\000\000\000\000", 7)}, + {0x400000000001ull, string("\006@\000\000\000\000\001", 7)}, + {0x7fffffffffffull, string("\006\177\377\377\377\377\377", 7)}, + {0x800000000000ull, string("\006\200\000\000\000\000\000", 7)}, + {0x800000000001ull, string("\006\200\000\000\000\000\001", 7)}, + {0xffffffffffffull, string("\006\377\377\377\377\377\377", 7)}, + {0x1000000000000ull, string("\007\001\000\000\000\000\000\000", 8)}, + {0x1000000000001ull, string("\007\001\000\000\000\000\000\001", 8)}, + {0x1ffffffffffffull, string("\007\001\377\377\377\377\377\377", 8)}, + {0x2000000000000ull, string("\007\002\000\000\000\000\000\000", 8)}, + {0x2000000000001ull, string("\007\002\000\000\000\000\000\001", 8)}, + {0x3ffffffffffffull, string("\007\003\377\377\377\377\377\377", 8)}, + {0x4000000000000ull, string("\007\004\000\000\000\000\000\000", 8)}, + {0x4000000000001ull, string("\007\004\000\000\000\000\000\001", 8)}, + {0x7ffffffffffffull, string("\007\007\377\377\377\377\377\377", 8)}, + {0x8000000000000ull, string("\007\010\000\000\000\000\000\000", 8)}, + {0x8000000000001ull, string("\007\010\000\000\000\000\000\001", 8)}, + {0xfffffffffffffull, string("\007\017\377\377\377\377\377\377", 8)}, + {0x10000000000000ull, string("\007\020\000\000\000\000\000\000", 8)}, + {0x10000000000001ull, string("\007\020\000\000\000\000\000\001", 8)}, + {0x1fffffffffffffull, string("\007\037\377\377\377\377\377\377", 8)}, + {0x20000000000000ull, string("\007 \000\000\000\000\000\000", 8)}, + {0x20000000000001ull, string("\007 \000\000\000\000\000\001", 8)}, + {0x3fffffffffffffull, string("\007?\377\377\377\377\377\377", 8)}, + {0x40000000000000ull, string("\007@\000\000\000\000\000\000", 8)}, + {0x40000000000001ull, string("\007@\000\000\000\000\000\001", 8)}, + {0x7fffffffffffffull, string("\007\177\377\377\377\377\377\377", 8)}, + {0x80000000000000ull, string("\007\200\000\000\000\000\000\000", 8)}, + {0x80000000000001ull, string("\007\200\000\000\000\000\000\001", 8)}, + {0xffffffffffffffull, string("\007\377\377\377\377\377\377\377", 8)}, + {0x100000000000000ull, string("\010\001\000\000\000\000\000\000\000", 9)}, + {0x100000000000001ull, string("\010\001\000\000\000\000\000\000\001", 9)}, + {0x1ffffffffffffffull, string("\010\001\377\377\377\377\377\377\377", 9)}, + {0x200000000000000ull, string("\010\002\000\000\000\000\000\000\000", 9)}, + {0x200000000000001ull, string("\010\002\000\000\000\000\000\000\001", 9)}, + {0x3ffffffffffffffull, string("\010\003\377\377\377\377\377\377\377", 9)}, + {0x400000000000000ull, string("\010\004\000\000\000\000\000\000\000", 9)}, + {0x400000000000001ull, string("\010\004\000\000\000\000\000\000\001", 9)}, + {0x7ffffffffffffffull, string("\010\007\377\377\377\377\377\377\377", 9)}, + {0x800000000000000ull, string("\010\010\000\000\000\000\000\000\000", 9)}, + {0x800000000000001ull, string("\010\010\000\000\000\000\000\000\001", 9)}, + {0xfffffffffffffffull, string("\010\017\377\377\377\377\377\377\377", 9)}, + {0x1000000000000000ull, + string("\010\020\000\000\000\000\000\000\000", 9)}, + {0x1000000000000001ull, + string("\010\020\000\000\000\000\000\000\001", 9)}, + {0x1fffffffffffffffull, + string("\010\037\377\377\377\377\377\377\377", 9)}, + {0x2000000000000000ull, string("\010 \000\000\000\000\000\000\000", 9)}, + {0x2000000000000001ull, string("\010 \000\000\000\000\000\000\001", 9)}, + {0x3fffffffffffffffull, string("\010?\377\377\377\377\377\377\377", 9)}, + {0x4000000000000000ull, string("\010@\000\000\000\000\000\000\000", 9)}, + {0x4000000000000001ull, string("\010@\000\000\000\000\000\000\001", 9)}, + {0x7fffffffffffffffull, + string("\010\177\377\377\377\377\377\377\377", 9)}, + {0x8000000000000000ull, + string("\010\200\000\000\000\000\000\000\000", 9)}, + {0x8000000000000001ull, + string("\010\200\000\000\000\000\000\000\001", 9)}, + }; + for (const auto& t : data) { + uint64 num = t.first; + string result; + OrderedCode::WriteNumIncreasing(&result, num); + EXPECT_EQ(t.second, result) << std::hex << num; + + StringPiece in = result; + uint64 decoded; + EXPECT_TRUE(OrderedCode::ReadNumIncreasing(&in, &decoded)); + EXPECT_EQ(num, decoded); + EXPECT_EQ("", in); + } +} + +TEST(EncodingIsExpected, Signed) { + std::vector<std::pair<int64, string>> data = { + {0ll, string("\200", 1)}, + {1ll, string("\201", 1)}, + {2ll, string("\202", 1)}, + {1ll, string("\201", 1)}, + {2ll, string("\202", 1)}, + {3ll, string("\203", 1)}, + {3ll, string("\203", 1)}, + {4ll, string("\204", 1)}, + {5ll, string("\205", 1)}, + {7ll, string("\207", 1)}, + {8ll, string("\210", 1)}, + {9ll, string("\211", 1)}, + {15ll, string("\217", 1)}, + {16ll, string("\220", 1)}, + {17ll, string("\221", 1)}, + {31ll, string("\237", 1)}, + {32ll, string("\240", 1)}, + {33ll, string("\241", 1)}, + {63ll, string("\277", 1)}, + {64ll, string("\300@", 2)}, + {65ll, string("\300A", 2)}, + {127ll, string("\300\177", 2)}, + {128ll, string("\300\200", 2)}, + {129ll, string("\300\201", 2)}, + {255ll, string("\300\377", 2)}, + {256ll, string("\301\000", 2)}, + {257ll, string("\301\001", 2)}, + {511ll, string("\301\377", 2)}, + {512ll, string("\302\000", 2)}, + {513ll, string("\302\001", 2)}, + {1023ll, string("\303\377", 2)}, + {1024ll, string("\304\000", 2)}, + {1025ll, string("\304\001", 2)}, + {2047ll, string("\307\377", 2)}, + {2048ll, string("\310\000", 2)}, + {2049ll, string("\310\001", 2)}, + {4095ll, string("\317\377", 2)}, + {4096ll, string("\320\000", 2)}, + {4097ll, string("\320\001", 2)}, + {8191ll, string("\337\377", 2)}, + {8192ll, string("\340 \000", 3)}, + {8193ll, string("\340 \001", 3)}, + {16383ll, string("\340?\377", 3)}, + {16384ll, string("\340@\000", 3)}, + {16385ll, string("\340@\001", 3)}, + {32767ll, string("\340\177\377", 3)}, + {32768ll, string("\340\200\000", 3)}, + {32769ll, string("\340\200\001", 3)}, + {65535ll, string("\340\377\377", 3)}, + {65536ll, string("\341\000\000", 3)}, + {65537ll, string("\341\000\001", 3)}, + {131071ll, string("\341\377\377", 3)}, + {131072ll, string("\342\000\000", 3)}, + {131073ll, string("\342\000\001", 3)}, + {262143ll, string("\343\377\377", 3)}, + {262144ll, string("\344\000\000", 3)}, + {262145ll, string("\344\000\001", 3)}, + {524287ll, string("\347\377\377", 3)}, + {524288ll, string("\350\000\000", 3)}, + {524289ll, string("\350\000\001", 3)}, + {1048575ll, string("\357\377\377", 3)}, + {1048576ll, string("\360\020\000\000", 4)}, + {1048577ll, string("\360\020\000\001", 4)}, + {2097151ll, string("\360\037\377\377", 4)}, + {2097152ll, string("\360 \000\000", 4)}, + {2097153ll, string("\360 \000\001", 4)}, + {4194303ll, string("\360?\377\377", 4)}, + {4194304ll, string("\360@\000\000", 4)}, + {4194305ll, string("\360@\000\001", 4)}, + {8388607ll, string("\360\177\377\377", 4)}, + {8388608ll, string("\360\200\000\000", 4)}, + {8388609ll, string("\360\200\000\001", 4)}, + {16777215ll, string("\360\377\377\377", 4)}, + {16777216ll, string("\361\000\000\000", 4)}, + {16777217ll, string("\361\000\000\001", 4)}, + {33554431ll, string("\361\377\377\377", 4)}, + {33554432ll, string("\362\000\000\000", 4)}, + {33554433ll, string("\362\000\000\001", 4)}, + {67108863ll, string("\363\377\377\377", 4)}, + {67108864ll, string("\364\000\000\000", 4)}, + {67108865ll, string("\364\000\000\001", 4)}, + {134217727ll, string("\367\377\377\377", 4)}, + {134217728ll, string("\370\010\000\000\000", 5)}, + {134217729ll, string("\370\010\000\000\001", 5)}, + {268435455ll, string("\370\017\377\377\377", 5)}, + {268435456ll, string("\370\020\000\000\000", 5)}, + {268435457ll, string("\370\020\000\000\001", 5)}, + {536870911ll, string("\370\037\377\377\377", 5)}, + {536870912ll, string("\370 \000\000\000", 5)}, + {536870913ll, string("\370 \000\000\001", 5)}, + {1073741823ll, string("\370?\377\377\377", 5)}, + {1073741824ll, string("\370@\000\000\000", 5)}, + {1073741825ll, string("\370@\000\000\001", 5)}, + {2147483647ll, string("\370\177\377\377\377", 5)}, + {2147483648ll, string("\370\200\000\000\000", 5)}, + {2147483649ll, string("\370\200\000\000\001", 5)}, + {4294967295ll, string("\370\377\377\377\377", 5)}, + {4294967296ll, string("\371\000\000\000\000", 5)}, + {4294967297ll, string("\371\000\000\000\001", 5)}, + {8589934591ll, string("\371\377\377\377\377", 5)}, + {8589934592ll, string("\372\000\000\000\000", 5)}, + {8589934593ll, string("\372\000\000\000\001", 5)}, + {17179869183ll, string("\373\377\377\377\377", 5)}, + {17179869184ll, string("\374\004\000\000\000\000", 6)}, + {17179869185ll, string("\374\004\000\000\000\001", 6)}, + {34359738367ll, string("\374\007\377\377\377\377", 6)}, + {34359738368ll, string("\374\010\000\000\000\000", 6)}, + {34359738369ll, string("\374\010\000\000\000\001", 6)}, + {68719476735ll, string("\374\017\377\377\377\377", 6)}, + {68719476736ll, string("\374\020\000\000\000\000", 6)}, + {68719476737ll, string("\374\020\000\000\000\001", 6)}, + {137438953471ll, string("\374\037\377\377\377\377", 6)}, + {137438953472ll, string("\374 \000\000\000\000", 6)}, + {137438953473ll, string("\374 \000\000\000\001", 6)}, + {274877906943ll, string("\374?\377\377\377\377", 6)}, + {274877906944ll, string("\374@\000\000\000\000", 6)}, + {274877906945ll, string("\374@\000\000\000\001", 6)}, + {549755813887ll, string("\374\177\377\377\377\377", 6)}, + {549755813888ll, string("\374\200\000\000\000\000", 6)}, + {549755813889ll, string("\374\200\000\000\000\001", 6)}, + {1099511627775ll, string("\374\377\377\377\377\377", 6)}, + {1099511627776ll, string("\375\000\000\000\000\000", 6)}, + {1099511627777ll, string("\375\000\000\000\000\001", 6)}, + {2199023255551ll, string("\375\377\377\377\377\377", 6)}, + {2199023255552ll, string("\376\002\000\000\000\000\000", 7)}, + {2199023255553ll, string("\376\002\000\000\000\000\001", 7)}, + {4398046511103ll, string("\376\003\377\377\377\377\377", 7)}, + {4398046511104ll, string("\376\004\000\000\000\000\000", 7)}, + {4398046511105ll, string("\376\004\000\000\000\000\001", 7)}, + {8796093022207ll, string("\376\007\377\377\377\377\377", 7)}, + {8796093022208ll, string("\376\010\000\000\000\000\000", 7)}, + {8796093022209ll, string("\376\010\000\000\000\000\001", 7)}, + {17592186044415ll, string("\376\017\377\377\377\377\377", 7)}, + {17592186044416ll, string("\376\020\000\000\000\000\000", 7)}, + {17592186044417ll, string("\376\020\000\000\000\000\001", 7)}, + {35184372088831ll, string("\376\037\377\377\377\377\377", 7)}, + {35184372088832ll, string("\376 \000\000\000\000\000", 7)}, + {35184372088833ll, string("\376 \000\000\000\000\001", 7)}, + {70368744177663ll, string("\376?\377\377\377\377\377", 7)}, + {70368744177664ll, string("\376@\000\000\000\000\000", 7)}, + {70368744177665ll, string("\376@\000\000\000\000\001", 7)}, + {140737488355327ll, string("\376\177\377\377\377\377\377", 7)}, + {140737488355328ll, string("\376\200\000\000\000\000\000", 7)}, + {140737488355329ll, string("\376\200\000\000\000\000\001", 7)}, + {281474976710655ll, string("\376\377\377\377\377\377\377", 7)}, + {281474976710656ll, string("\377\001\000\000\000\000\000\000", 8)}, + {281474976710657ll, string("\377\001\000\000\000\000\000\001", 8)}, + {562949953421311ll, string("\377\001\377\377\377\377\377\377", 8)}, + {562949953421312ll, string("\377\002\000\000\000\000\000\000", 8)}, + {562949953421313ll, string("\377\002\000\000\000\000\000\001", 8)}, + {1125899906842623ll, string("\377\003\377\377\377\377\377\377", 8)}, + {1125899906842624ll, string("\377\004\000\000\000\000\000\000", 8)}, + {1125899906842625ll, string("\377\004\000\000\000\000\000\001", 8)}, + {2251799813685247ll, string("\377\007\377\377\377\377\377\377", 8)}, + {2251799813685248ll, string("\377\010\000\000\000\000\000\000", 8)}, + {2251799813685249ll, string("\377\010\000\000\000\000\000\001", 8)}, + {4503599627370495ll, string("\377\017\377\377\377\377\377\377", 8)}, + {4503599627370496ll, string("\377\020\000\000\000\000\000\000", 8)}, + {4503599627370497ll, string("\377\020\000\000\000\000\000\001", 8)}, + {9007199254740991ll, string("\377\037\377\377\377\377\377\377", 8)}, + {9007199254740992ll, string("\377 \000\000\000\000\000\000", 8)}, + {9007199254740993ll, string("\377 \000\000\000\000\000\001", 8)}, + {18014398509481983ll, string("\377?\377\377\377\377\377\377", 8)}, + {18014398509481984ll, string("\377@\000\000\000\000\000\000", 8)}, + {18014398509481985ll, string("\377@\000\000\000\000\000\001", 8)}, + {36028797018963967ll, string("\377\177\377\377\377\377\377\377", 8)}, + {36028797018963968ll, string("\377\200\200\000\000\000\000\000\000", 9)}, + {36028797018963969ll, string("\377\200\200\000\000\000\000\000\001", 9)}, + {72057594037927935ll, string("\377\200\377\377\377\377\377\377\377", 9)}, + {72057594037927936ll, string("\377\201\000\000\000\000\000\000\000", 9)}, + {72057594037927937ll, string("\377\201\000\000\000\000\000\000\001", 9)}, + {144115188075855871ll, string("\377\201\377\377\377\377\377\377\377", 9)}, + {144115188075855872ll, string("\377\202\000\000\000\000\000\000\000", 9)}, + {144115188075855873ll, string("\377\202\000\000\000\000\000\000\001", 9)}, + {288230376151711743ll, string("\377\203\377\377\377\377\377\377\377", 9)}, + {288230376151711744ll, string("\377\204\000\000\000\000\000\000\000", 9)}, + {288230376151711745ll, string("\377\204\000\000\000\000\000\000\001", 9)}, + {576460752303423487ll, string("\377\207\377\377\377\377\377\377\377", 9)}, + {576460752303423488ll, string("\377\210\000\000\000\000\000\000\000", 9)}, + {576460752303423489ll, string("\377\210\000\000\000\000\000\000\001", 9)}, + {1152921504606846975ll, + string("\377\217\377\377\377\377\377\377\377", 9)}, + {1152921504606846976ll, + string("\377\220\000\000\000\000\000\000\000", 9)}, + {1152921504606846977ll, + string("\377\220\000\000\000\000\000\000\001", 9)}, + {2305843009213693951ll, + string("\377\237\377\377\377\377\377\377\377", 9)}, + {2305843009213693952ll, + string("\377\240\000\000\000\000\000\000\000", 9)}, + {2305843009213693953ll, + string("\377\240\000\000\000\000\000\000\001", 9)}, + {4611686018427387903ll, + string("\377\277\377\377\377\377\377\377\377", 9)}, + {4611686018427387904ll, + string("\377\300@\000\000\000\000\000\000\000", 10)}, + {4611686018427387905ll, + string("\377\300@\000\000\000\000\000\000\001", 10)}, + {9223372036854775807ll, + string("\377\300\177\377\377\377\377\377\377\377", 10)}, + {-9223372036854775807ll, + string("\000?\200\000\000\000\000\000\000\001", 10)}, + {0ll, string("\200", 1)}, + {-1ll, string("\177", 1)}, + {-2ll, string("~", 1)}, + {-1ll, string("\177", 1)}, + {-2ll, string("~", 1)}, + {-3ll, string("}", 1)}, + {-3ll, string("}", 1)}, + {-4ll, string("|", 1)}, + {-5ll, string("{", 1)}, + {-7ll, string("y", 1)}, + {-8ll, string("x", 1)}, + {-9ll, string("w", 1)}, + {-15ll, string("q", 1)}, + {-16ll, string("p", 1)}, + {-17ll, string("o", 1)}, + {-31ll, string("a", 1)}, + {-32ll, string("`", 1)}, + {-33ll, string("_", 1)}, + {-63ll, string("A", 1)}, + {-64ll, string("@", 1)}, + {-65ll, string("?\277", 2)}, + {-127ll, string("?\201", 2)}, + {-128ll, string("?\200", 2)}, + {-129ll, string("?\177", 2)}, + {-255ll, string("?\001", 2)}, + {-256ll, string("?\000", 2)}, + {-257ll, string(">\377", 2)}, + {-511ll, string(">\001", 2)}, + {-512ll, string(">\000", 2)}, + {-513ll, string("=\377", 2)}, + {-1023ll, string("<\001", 2)}, + {-1024ll, string("<\000", 2)}, + {-1025ll, string(";\377", 2)}, + {-2047ll, string("8\001", 2)}, + {-2048ll, string("8\000", 2)}, + {-2049ll, string("7\377", 2)}, + {-4095ll, string("0\001", 2)}, + {-4096ll, string("0\000", 2)}, + {-4097ll, string("/\377", 2)}, + {-8191ll, string(" \001", 2)}, + {-8192ll, string(" \000", 2)}, + {-8193ll, string("\037\337\377", 3)}, + {-16383ll, string("\037\300\001", 3)}, + {-16384ll, string("\037\300\000", 3)}, + {-16385ll, string("\037\277\377", 3)}, + {-32767ll, string("\037\200\001", 3)}, + {-32768ll, string("\037\200\000", 3)}, + {-32769ll, string("\037\177\377", 3)}, + {-65535ll, string("\037\000\001", 3)}, + {-65536ll, string("\037\000\000", 3)}, + {-65537ll, string("\036\377\377", 3)}, + {-131071ll, string("\036\000\001", 3)}, + {-131072ll, string("\036\000\000", 3)}, + {-131073ll, string("\035\377\377", 3)}, + {-262143ll, string("\034\000\001", 3)}, + {-262144ll, string("\034\000\000", 3)}, + {-262145ll, string("\033\377\377", 3)}, + {-524287ll, string("\030\000\001", 3)}, + {-524288ll, string("\030\000\000", 3)}, + {-524289ll, string("\027\377\377", 3)}, + {-1048575ll, string("\020\000\001", 3)}, + {-1048576ll, string("\020\000\000", 3)}, + {-1048577ll, string("\017\357\377\377", 4)}, + {-2097151ll, string("\017\340\000\001", 4)}, + {-2097152ll, string("\017\340\000\000", 4)}, + {-2097153ll, string("\017\337\377\377", 4)}, + {-4194303ll, string("\017\300\000\001", 4)}, + {-4194304ll, string("\017\300\000\000", 4)}, + {-4194305ll, string("\017\277\377\377", 4)}, + {-8388607ll, string("\017\200\000\001", 4)}, + {-8388608ll, string("\017\200\000\000", 4)}, + {-8388609ll, string("\017\177\377\377", 4)}, + {-16777215ll, string("\017\000\000\001", 4)}, + {-16777216ll, string("\017\000\000\000", 4)}, + {-16777217ll, string("\016\377\377\377", 4)}, + {-33554431ll, string("\016\000\000\001", 4)}, + {-33554432ll, string("\016\000\000\000", 4)}, + {-33554433ll, string("\r\377\377\377", 4)}, + {-67108863ll, string("\014\000\000\001", 4)}, + {-67108864ll, string("\014\000\000\000", 4)}, + {-67108865ll, string("\013\377\377\377", 4)}, + {-134217727ll, string("\010\000\000\001", 4)}, + {-134217728ll, string("\010\000\000\000", 4)}, + {-134217729ll, string("\007\367\377\377\377", 5)}, + {-268435455ll, string("\007\360\000\000\001", 5)}, + {-268435456ll, string("\007\360\000\000\000", 5)}, + {-268435457ll, string("\007\357\377\377\377", 5)}, + {-536870911ll, string("\007\340\000\000\001", 5)}, + {-536870912ll, string("\007\340\000\000\000", 5)}, + {-536870913ll, string("\007\337\377\377\377", 5)}, + {-1073741823ll, string("\007\300\000\000\001", 5)}, + {-1073741824ll, string("\007\300\000\000\000", 5)}, + {-1073741825ll, string("\007\277\377\377\377", 5)}, + {-2147483647ll, string("\007\200\000\000\001", 5)}, + {-2147483648ll, string("\007\200\000\000\000", 5)}, + {-2147483649ll, string("\007\177\377\377\377", 5)}, + {-4294967295ll, string("\007\000\000\000\001", 5)}, + {-4294967296ll, string("\007\000\000\000\000", 5)}, + {-4294967297ll, string("\006\377\377\377\377", 5)}, + {-8589934591ll, string("\006\000\000\000\001", 5)}, + {-8589934592ll, string("\006\000\000\000\000", 5)}, + {-8589934593ll, string("\005\377\377\377\377", 5)}, + {-17179869183ll, string("\004\000\000\000\001", 5)}, + {-17179869184ll, string("\004\000\000\000\000", 5)}, + {-17179869185ll, string("\003\373\377\377\377\377", 6)}, + {-34359738367ll, string("\003\370\000\000\000\001", 6)}, + {-34359738368ll, string("\003\370\000\000\000\000", 6)}, + {-34359738369ll, string("\003\367\377\377\377\377", 6)}, + {-68719476735ll, string("\003\360\000\000\000\001", 6)}, + {-68719476736ll, string("\003\360\000\000\000\000", 6)}, + {-68719476737ll, string("\003\357\377\377\377\377", 6)}, + {-137438953471ll, string("\003\340\000\000\000\001", 6)}, + {-137438953472ll, string("\003\340\000\000\000\000", 6)}, + {-137438953473ll, string("\003\337\377\377\377\377", 6)}, + {-274877906943ll, string("\003\300\000\000\000\001", 6)}, + {-274877906944ll, string("\003\300\000\000\000\000", 6)}, + {-274877906945ll, string("\003\277\377\377\377\377", 6)}, + {-549755813887ll, string("\003\200\000\000\000\001", 6)}, + {-549755813888ll, string("\003\200\000\000\000\000", 6)}, + {-549755813889ll, string("\003\177\377\377\377\377", 6)}, + {-1099511627775ll, string("\003\000\000\000\000\001", 6)}, + {-1099511627776ll, string("\003\000\000\000\000\000", 6)}, + {-1099511627777ll, string("\002\377\377\377\377\377", 6)}, + {-2199023255551ll, string("\002\000\000\000\000\001", 6)}, + {-2199023255552ll, string("\002\000\000\000\000\000", 6)}, + {-2199023255553ll, string("\001\375\377\377\377\377\377", 7)}, + {-4398046511103ll, string("\001\374\000\000\000\000\001", 7)}, + {-4398046511104ll, string("\001\374\000\000\000\000\000", 7)}, + {-4398046511105ll, string("\001\373\377\377\377\377\377", 7)}, + {-8796093022207ll, string("\001\370\000\000\000\000\001", 7)}, + {-8796093022208ll, string("\001\370\000\000\000\000\000", 7)}, + {-8796093022209ll, string("\001\367\377\377\377\377\377", 7)}, + {-17592186044415ll, string("\001\360\000\000\000\000\001", 7)}, + {-17592186044416ll, string("\001\360\000\000\000\000\000", 7)}, + {-17592186044417ll, string("\001\357\377\377\377\377\377", 7)}, + {-35184372088831ll, string("\001\340\000\000\000\000\001", 7)}, + {-35184372088832ll, string("\001\340\000\000\000\000\000", 7)}, + {-35184372088833ll, string("\001\337\377\377\377\377\377", 7)}, + {-70368744177663ll, string("\001\300\000\000\000\000\001", 7)}, + {-70368744177664ll, string("\001\300\000\000\000\000\000", 7)}, + {-70368744177665ll, string("\001\277\377\377\377\377\377", 7)}, + {-140737488355327ll, string("\001\200\000\000\000\000\001", 7)}, + {-140737488355328ll, string("\001\200\000\000\000\000\000", 7)}, + {-140737488355329ll, string("\001\177\377\377\377\377\377", 7)}, + {-281474976710655ll, string("\001\000\000\000\000\000\001", 7)}, + {-281474976710656ll, string("\001\000\000\000\000\000\000", 7)}, + {-281474976710657ll, string("\000\376\377\377\377\377\377\377", 8)}, + {-562949953421311ll, string("\000\376\000\000\000\000\000\001", 8)}, + {-562949953421312ll, string("\000\376\000\000\000\000\000\000", 8)}, + {-562949953421313ll, string("\000\375\377\377\377\377\377\377", 8)}, + {-1125899906842623ll, string("\000\374\000\000\000\000\000\001", 8)}, + {-1125899906842624ll, string("\000\374\000\000\000\000\000\000", 8)}, + {-1125899906842625ll, string("\000\373\377\377\377\377\377\377", 8)}, + {-2251799813685247ll, string("\000\370\000\000\000\000\000\001", 8)}, + {-2251799813685248ll, string("\000\370\000\000\000\000\000\000", 8)}, + {-2251799813685249ll, string("\000\367\377\377\377\377\377\377", 8)}, + {-4503599627370495ll, string("\000\360\000\000\000\000\000\001", 8)}, + {-4503599627370496ll, string("\000\360\000\000\000\000\000\000", 8)}, + {-4503599627370497ll, string("\000\357\377\377\377\377\377\377", 8)}, + {-9007199254740991ll, string("\000\340\000\000\000\000\000\001", 8)}, + {-9007199254740992ll, string("\000\340\000\000\000\000\000\000", 8)}, + {-9007199254740993ll, string("\000\337\377\377\377\377\377\377", 8)}, + {-18014398509481983ll, string("\000\300\000\000\000\000\000\001", 8)}, + {-18014398509481984ll, string("\000\300\000\000\000\000\000\000", 8)}, + {-18014398509481985ll, string("\000\277\377\377\377\377\377\377", 8)}, + {-36028797018963967ll, string("\000\200\000\000\000\000\000\001", 8)}, + {-36028797018963968ll, string("\000\200\000\000\000\000\000\000", 8)}, + {-36028797018963969ll, string("\000\177\177\377\377\377\377\377\377", 9)}, + {-72057594037927935ll, string("\000\177\000\000\000\000\000\000\001", 9)}, + {-72057594037927936ll, string("\000\177\000\000\000\000\000\000\000", 9)}, + {-72057594037927937ll, string("\000~\377\377\377\377\377\377\377", 9)}, + {-144115188075855871ll, string("\000~\000\000\000\000\000\000\001", 9)}, + {-144115188075855872ll, string("\000~\000\000\000\000\000\000\000", 9)}, + {-144115188075855873ll, string("\000}\377\377\377\377\377\377\377", 9)}, + {-288230376151711743ll, string("\000|\000\000\000\000\000\000\001", 9)}, + {-288230376151711744ll, string("\000|\000\000\000\000\000\000\000", 9)}, + {-288230376151711745ll, string("\000{\377\377\377\377\377\377\377", 9)}, + {-576460752303423487ll, string("\000x\000\000\000\000\000\000\001", 9)}, + {-576460752303423488ll, string("\000x\000\000\000\000\000\000\000", 9)}, + {-576460752303423489ll, string("\000w\377\377\377\377\377\377\377", 9)}, + {-1152921504606846975ll, string("\000p\000\000\000\000\000\000\001", 9)}, + {-1152921504606846976ll, string("\000p\000\000\000\000\000\000\000", 9)}, + {-1152921504606846977ll, string("\000o\377\377\377\377\377\377\377", 9)}, + {-2305843009213693951ll, string("\000`\000\000\000\000\000\000\001", 9)}, + {-2305843009213693952ll, string("\000`\000\000\000\000\000\000\000", 9)}, + {-2305843009213693953ll, string("\000_\377\377\377\377\377\377\377", 9)}, + {-4611686018427387903ll, string("\000@\000\000\000\000\000\000\001", 9)}, + {-4611686018427387904ll, string("\000@\000\000\000\000\000\000\000", 9)}, + {-4611686018427387905ll, + string("\000?\277\377\377\377\377\377\377\377", 10)}, + {-9223372036854775807ll, + string("\000?\200\000\000\000\000\000\000\001", 10)}, + {9223372036854775807ll, + string("\377\300\177\377\377\377\377\377\377\377", 10)}, + }; + for (const auto& t : data) { + int64 num = t.first; + string result; + OrderedCode::WriteSignedNumIncreasing(&result, num); + EXPECT_EQ(t.second, result) << std::hex << num; + + StringPiece in = result; + int64 decoded; + EXPECT_TRUE(OrderedCode::ReadSignedNumIncreasing(&in, &decoded)); + EXPECT_EQ(num, decoded); + EXPECT_EQ("", in); + } +} + +static void BM_WriteString(int n, int len) { + testing::StopTiming(); + random::PhiloxRandom philox(301, 17); + random::SimplePhilox rnd(&philox); + string x; + for (int i = 0; i < len; i++) { + x += rnd.Uniform(256); + } + string y; + + testing::BytesProcessed(n * len); + testing::StartTiming(); + while (n-- > 0) { + y.clear(); + OCWriteToString<string>(&y, x); + } +} + +static void BM_ReadString(int n, int len) { + testing::StopTiming(); + random::PhiloxRandom philox(301, 17); + random::SimplePhilox rnd(&philox); + string x; + for (int i = 0; i < len; i++) { + x += rnd.Uniform(256); + } + string data; + OCWriteToString<string>(&data, x); + string result; + + testing::BytesProcessed(n * len); + testing::StartTiming(); + while (n-- > 0) { + result.clear(); + StringPiece s = data; + OCRead<string>(&s, &result); + } +} + +static void BM_WriteStringIncreasing(int n, int len) { BM_WriteString(n, len); } +static void BM_ReadStringIncreasing(int n, int len) { BM_ReadString(n, len); } + +BENCHMARK(BM_WriteStringIncreasing)->Range(0, 1024); +BENCHMARK(BM_ReadStringIncreasing)->Range(0, 1024); + +} // namespace strings +} // namespace tensorflow diff --git a/tensorflow/core/lib/strings/str_util.cc b/tensorflow/core/lib/strings/str_util.cc new file mode 100644 index 0000000000..cccd50c7ff --- /dev/null +++ b/tensorflow/core/lib/strings/str_util.cc @@ -0,0 +1,312 @@ +#include "tensorflow/core/lib/strings/str_util.h" +#include <ctype.h> + +namespace tensorflow { +namespace str_util { + +static char hex_char[] = "0123456789abcdef"; + +string CEscape(const string& src) { + string dest; + + for (unsigned char c : src) { + switch (c) { + case '\n': + dest.append("\\n"); + break; + case '\r': + dest.append("\\r"); + break; + case '\t': + dest.append("\\t"); + break; + case '\"': + dest.append("\\\""); + break; + case '\'': + dest.append("\\'"); + break; + case '\\': + dest.append("\\\\"); + break; + default: + // Note that if we emit \xNN and the src character after that is a hex + // digit then that digit must be escaped too to prevent it being + // interpreted as part of the character code by C. + if ((c >= 0x80) || !isprint(c)) { + dest.append("\\"); + dest.push_back(hex_char[c / 64]); + dest.push_back(hex_char[(c % 64) / 8]); + dest.push_back(hex_char[c % 8]); + } else { + dest.push_back(c); + break; + } + } + } + + return dest; +} + +namespace { // Private helpers for CUnescape(). + +inline bool is_octal_digit(unsigned char c) { return c >= '0' && c <= '7'; } + +inline bool ascii_isxdigit(unsigned char c) { + return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || + (c >= 'A' && c <= 'F'); +} + +inline int hex_digit_to_int(char c) { + int x = static_cast<unsigned char>(c); + if (x > '9') { + x += 9; + } + return x & 0xf; +} + +bool CUnescapeInternal(StringPiece source, char* dest, int* dest_len, + string* error) { + char* d = dest; + const char* p = source.data(); + const char* end = source.end(); + const char* last_byte = end - 1; + + // Small optimization for case where source = dest and there's no escaping + while (p == d && p < end && *p != '\\') p++, d++; + + while (p < end) { + if (*p != '\\') { + *d++ = *p++; + } else { + if (++p > last_byte) { // skip past the '\\' + if (error) *error = "String cannot end with \\"; + return false; + } + switch (*p) { + case 'a': + *d++ = '\a'; + break; + case 'b': + *d++ = '\b'; + break; + case 'f': + *d++ = '\f'; + break; + case 'n': + *d++ = '\n'; + break; + case 'r': + *d++ = '\r'; + break; + case 't': + *d++ = '\t'; + break; + case 'v': + *d++ = '\v'; + break; + case '\\': + *d++ = '\\'; + break; + case '?': + *d++ = '\?'; + break; // \? Who knew? + case '\'': + *d++ = '\''; + break; + case '"': + *d++ = '\"'; + break; + case '0': + case '1': + case '2': + case '3': // octal digit: 1 to 3 digits + case '4': + case '5': + case '6': + case '7': { + const char* octal_start = p; + unsigned int ch = *p - '0'; + if (p < last_byte && is_octal_digit(p[1])) ch = ch * 8 + *++p - '0'; + if (p < last_byte && is_octal_digit(p[1])) + ch = ch * 8 + *++p - '0'; // now points at last digit + if (ch > 0xff) { + if (error) { + *error = "Value of \\" + + string(octal_start, p + 1 - octal_start) + + " exceeds 0xff"; + } + return false; + } + *d++ = ch; + break; + } + case 'x': + case 'X': { + if (p >= last_byte) { + if (error) *error = "String cannot end with \\x"; + return false; + } else if (!ascii_isxdigit(p[1])) { + if (error) *error = "\\x cannot be followed by a non-hex digit"; + return false; + } + unsigned int ch = 0; + const char* hex_start = p; + while (p < last_byte && ascii_isxdigit(p[1])) + // Arbitrarily many hex digits + ch = (ch << 4) + hex_digit_to_int(*++p); + if (ch > 0xFF) { + if (error) { + *error = "Value of \\" + string(hex_start, p + 1 - hex_start) + + " exceeds 0xff"; + } + return false; + } + *d++ = ch; + break; + } + default: { + if (error) *error = string("Unknown escape sequence: \\") + *p; + return false; + } + } + p++; // read past letter we escaped + } + } + *dest_len = d - dest; + return true; +} + +} // namespace + +bool CUnescape(StringPiece source, string* dest, string* error) { + dest->resize(source.size()); + int dest_size; + if (!CUnescapeInternal(source, const_cast<char*>(dest->data()), &dest_size, + error)) { + return false; + } + dest->erase(dest_size); + return true; +} + +bool NumericParse32(const string& text, int32* val) { + // Slow, but this code is not performance critical, and this + // doesn't bring in any new dependencies + char junk; + if (sscanf(text.c_str(), "%d%c", val, &junk) == 1) { + return true; + } else { + return false; + } +} + +void StripTrailingWhitespace(string* s) { + string::size_type i; + for (i = s->size(); i > 0 && isspace((*s)[i - 1]); --i) { + } + s->resize(i); +} + +// Return lower-cased version of s. +string Lowercase(StringPiece s) { + string result(s.data(), s.size()); + for (char& c : result) { + c = tolower(c); + } + return result; +} + +// Return upper-cased version of s. +string Uppercase(StringPiece s) { + string result(s.data(), s.size()); + for (char& c : result) { + c = toupper(c); + } + return result; +} + +void TitlecaseString(string* s, StringPiece delimiters) { + bool upper = true; + for (string::iterator ss = s->begin(); ss != s->end(); ++ss) { + if (upper) { + *ss = toupper(*ss); + } + upper = (delimiters.find(*ss) != StringPiece::npos); + } +} + +size_t RemoveLeadingWhitespace(StringPiece* text) { + size_t count = 0; + const char* ptr = text->data(); + while (count < text->size() && isspace(*ptr)) { + count++; + ptr++; + } + text->remove_prefix(count); + return count; +} + +size_t RemoveTrailingWhitespace(StringPiece* text) { + size_t count = 0; + const char* ptr = text->data() + text->size() - 1; + while (count < text->size() && isspace(*ptr)) { + ++count; + --ptr; + } + text->remove_suffix(count); + return count; +} + +size_t RemoveWhitespaceContext(StringPiece* text) { + // use RemoveLeadingWhitespace() and RemoveTrailingWhitespace() to do the job + return (RemoveLeadingWhitespace(text) + RemoveTrailingWhitespace(text)); +} + +bool ConsumePrefix(StringPiece* s, StringPiece expected) { + if (s->starts_with(expected)) { + s->remove_prefix(expected.size()); + return true; + } + return false; +} + +bool ConsumeLeadingDigits(StringPiece* s, uint64* val) { + const char* p = s->data(); + const char* limit = p + s->size(); + uint64 v = 0; + while (p < limit) { + const char c = *p; + if (c < '0' || c > '9') break; + uint64 new_v = (v * 10) + (c - '0'); + if (new_v < v) { + // Overflow occurred + return false; + } + v = new_v; + p++; + } + if (p > s->data()) { + // Consume some digits + s->remove_prefix(p - s->data()); + *val = v; + return true; + } else { + return false; + } +} + +bool SplitAndParseAsInts(StringPiece text, char delim, + std::vector<int32>* result) { + result->clear(); + std::vector<string> num_strings = Split(text, delim); + for (const auto& s : num_strings) { + int32 num; + if (!NumericParse32(s, &num)) return false; + result->push_back(num); + } + return true; +} + +} // namespace str_util +} // namespace tensorflow diff --git a/tensorflow/core/lib/strings/str_util.h b/tensorflow/core/lib/strings/str_util.h new file mode 100644 index 0000000000..34ea462b2d --- /dev/null +++ b/tensorflow/core/lib/strings/str_util.h @@ -0,0 +1,149 @@ +#ifndef TENSORFLOW_LIB_STRINGS_STR_UTIL_H_ +#define TENSORFLOW_LIB_STRINGS_STR_UTIL_H_ + +#include <string> +#include "tensorflow/core/platform/port.h" +#include "tensorflow/core/lib/core/stringpiece.h" +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/lib/strings/strcat.h" + +// Basic string utility routines +namespace tensorflow { +namespace str_util { + +// Returns a version of 'src' where unprintable characters have been +// escaped using C-style escape sequences. +string CEscape(const string& src); + +// Copies "source" to "dest", rewriting C-style escape sequences -- +// '\n', '\r', '\\', '\ooo', etc -- to their ASCII equivalents. +// +// Errors: Sets the description of the first encountered error in +// 'error'. To disable error reporting, set 'error' to NULL. +// +// NOTE: Does not support \u or \U! +bool CUnescape(StringPiece source, string* dest, string* error); + +// If "text" can be successfully parsed as the ASCII representation of +// an integer, sets "*val" to the value and returns true. Otherwise, +// returns false. +bool NumericParse32(const string& text, int32* val); + +// Removes any trailing whitespace from "*s". +void StripTrailingWhitespace(string* s); + +// Removes leading ascii_isspace() characters. +// Returns number of characters removed. +size_t RemoveLeadingWhitespace(StringPiece* text); + +// Removes trailing ascii_isspace() characters. +// Returns number of characters removed. +size_t RemoveTrailingWhitespace(StringPiece* text); + +// Removes leading and trailing ascii_isspace() chars. +// Returns number of chars removed. +size_t RemoveWhitespaceContext(StringPiece* text); + +// Consume a leading positive integer value. If any digits were +// found, store the value of the leading unsigned number in "*val", +// advance "*s" past the consumed number, and return true. If +// overflow occurred, returns false. Otherwise, returns false. +bool ConsumeLeadingDigits(StringPiece* s, uint64* val); + +// If "*s" starts with "expected", consume it and return true. +// Otherwise, return false. +bool ConsumePrefix(StringPiece* s, StringPiece expected); + +// Return lower-cased version of s. +string Lowercase(StringPiece s); + +// Return upper-cased version of s. +string Uppercase(StringPiece s); + +// Capitalize first character of each word in "*s". "delimiters" is a +// set of characters that can be used as word boundaries. +void TitlecaseString(string* s, StringPiece delimiters); + +// Join functionality +template <typename T> +string Join(const std::vector<T>& s, const char* sep); +template <typename T> +string Join(const gtl::ArraySlice<T>& s, const char* sep); + +struct AllowEmpty { + bool operator()(StringPiece sp) const { return true; } +}; +struct SkipEmpty { + bool operator()(StringPiece sp) const { return !sp.empty(); } +}; +struct SkipWhitespace { + bool operator()(StringPiece sp) const { + RemoveTrailingWhitespace(&sp); + return !sp.empty(); + } +}; + +std::vector<string> Split(StringPiece text, char delim); +template <typename Predicate> +std::vector<string> Split(StringPiece text, char delim, Predicate p); + +// Split "text" at "delim" characters, and parse each component as +// an integer. If successful, adds the individual numbers in order +// to "*result" and returns true. Otherwise returns false. +bool SplitAndParseAsInts(StringPiece text, char delim, + std::vector<int32>* result); + +// ------------------------------------------------------------------ +// Implementation details below +namespace internal { +template <typename T> +string JoinHelper(typename gtl::ArraySlice<T>::const_iterator begin, + typename gtl::ArraySlice<T>::const_iterator end, + const char* sep) { + string result; + bool first = true; + for (typename gtl::ArraySlice<T>::const_iterator it = begin; it != end; + ++it) { + tensorflow::strings::StrAppend(&result, (first ? "" : sep), *it); + first = false; + } + return result; +} +} // namespace internal + +template <typename T> +string Join(const std::vector<T>& s, const char* sep) { + return Join<T>(gtl::ArraySlice<T>(s), sep); +} + +template <typename T> +string Join(const gtl::ArraySlice<T>& s, const char* sep) { + return internal::JoinHelper<T>(s.begin(), s.end(), sep); +} + +inline std::vector<string> Split(StringPiece text, char delim) { + return Split(text, delim, AllowEmpty()); +} + +template <typename Predicate> +std::vector<string> Split(StringPiece text, char delim, Predicate p) { + std::vector<string> result; + int token_start = 0; + if (!text.empty()) { + for (int i = 0; i < text.size() + 1; i++) { + if ((i == text.size()) || (text[i] == delim)) { + StringPiece token(text.data() + token_start, i - token_start); + if (p(token)) { + result.push_back(token.ToString()); + } + token_start = i + 1; + } + } + } + return result; +} + +} // namespace str_util +} // namespace tensorflow + +#endif // TENSORFLOW_LIB_STRINGS_STR_UTIL_H_ diff --git a/tensorflow/core/lib/strings/str_util_test.cc b/tensorflow/core/lib/strings/str_util_test.cc new file mode 100644 index 0000000000..f71cc6c609 --- /dev/null +++ b/tensorflow/core/lib/strings/str_util_test.cc @@ -0,0 +1,258 @@ +#include "tensorflow/core/lib/strings/str_util.h" + +#include <gtest/gtest.h> + +namespace tensorflow { + +TEST(CEscape, Basic) { + EXPECT_EQ(str_util::CEscape("hello"), "hello"); + EXPECT_EQ(str_util::CEscape("hello\n"), "hello\\n"); + EXPECT_EQ(str_util::CEscape("hello\r"), "hello\\r"); + EXPECT_EQ(str_util::CEscape("\t\r\"'"), "\\t\\r\\\"\\'"); + EXPECT_EQ(str_util::CEscape("\320hi\200"), "\\320hi\\200"); +} + +string ExpectCUnescapeSuccess(StringPiece source) { + string dest; + string error; + EXPECT_TRUE(str_util::CUnescape(source, &dest, &error)) << error; + return dest; +} + +TEST(CUnescape, Basic) { + EXPECT_EQ("hello", ExpectCUnescapeSuccess("hello")); + EXPECT_EQ("hello\n", ExpectCUnescapeSuccess("hello\\n")); + EXPECT_EQ("hello\r", ExpectCUnescapeSuccess("hello\\r")); + EXPECT_EQ("\t\r\"'", ExpectCUnescapeSuccess("\\t\\r\\\"\\'")); + EXPECT_EQ("\320hi\200", ExpectCUnescapeSuccess("\\320hi\\200")); +} + +TEST(NumericParse32, Basic) { + int32 val = -1234; + EXPECT_TRUE(str_util::NumericParse32("0", &val) && val == 0); + EXPECT_TRUE(str_util::NumericParse32("123", &val) && val == 123); + EXPECT_TRUE(str_util::NumericParse32("-375", &val) && val == -375); + EXPECT_FALSE(str_util::NumericParse32("123hello", &val)); + EXPECT_FALSE(str_util::NumericParse32("hello123", &val)); +} + +TEST(StripTrailingWhitespace, Basic) { + string test; + test = "hello"; + str_util::StripTrailingWhitespace(&test); + EXPECT_EQ(test, "hello"); + + test = "foo "; + str_util::StripTrailingWhitespace(&test); + EXPECT_EQ(test, "foo"); + + test = " "; + str_util::StripTrailingWhitespace(&test); + EXPECT_EQ(test, ""); + + test = ""; + str_util::StripTrailingWhitespace(&test); + EXPECT_EQ(test, ""); + + test = " abc\t"; + str_util::StripTrailingWhitespace(&test); + EXPECT_EQ(test, " abc"); +} + +TEST(RemoveLeadingWhitespace, Basic) { + string text = " \t \n \r Quick\t"; + StringPiece data(text); + // check that all whitespace is removed + EXPECT_EQ(str_util::RemoveLeadingWhitespace(&data), 11); + EXPECT_EQ(data, StringPiece("Quick\t")); + // check that non-whitespace is not removed + EXPECT_EQ(str_util::RemoveLeadingWhitespace(&data), 0); + EXPECT_EQ(data, StringPiece("Quick\t")); +} + +TEST(RemoveLeadingWhitespace, TerminationHandling) { + // check termination handling + string text = "\t"; + StringPiece data(text); + EXPECT_EQ(str_util::RemoveLeadingWhitespace(&data), 1); + EXPECT_EQ(data, StringPiece("")); + + // check termination handling again + EXPECT_EQ(str_util::RemoveLeadingWhitespace(&data), 0); + EXPECT_EQ(data, StringPiece("")); +} + +TEST(RemoveTrailingWhitespace, Basic) { + string text = " \t \n \r Quick \t"; + StringPiece data(text); + // check that all whitespace is removed + EXPECT_EQ(str_util::RemoveTrailingWhitespace(&data), 2); + EXPECT_EQ(data, StringPiece(" \t \n \r Quick")); + // check that non-whitespace is not removed + EXPECT_EQ(str_util::RemoveTrailingWhitespace(&data), 0); + EXPECT_EQ(data, StringPiece(" \t \n \r Quick")); +} + +TEST(RemoveTrailingWhitespace, TerminationHandling) { + // check termination handling + string text = "\t"; + StringPiece data(text); + EXPECT_EQ(str_util::RemoveTrailingWhitespace(&data), 1); + EXPECT_EQ(data, StringPiece("")); + + // check termination handling again + EXPECT_EQ(str_util::RemoveTrailingWhitespace(&data), 0); + EXPECT_EQ(data, StringPiece("")); +} + +TEST(RemoveWhitespaceContext, Basic) { + string text = " \t \n \r Quick \t"; + StringPiece data(text); + // check that all whitespace is removed + EXPECT_EQ(str_util::RemoveWhitespaceContext(&data), 13); + EXPECT_EQ(data, StringPiece("Quick")); + // check that non-whitespace is not removed + EXPECT_EQ(str_util::RemoveWhitespaceContext(&data), 0); + EXPECT_EQ(data, StringPiece("Quick")); + + // Test empty string + text = ""; + data = text; + EXPECT_EQ(str_util::RemoveWhitespaceContext(&data), 0); + EXPECT_EQ(data, StringPiece("")); +} + +void TestConsumeLeadingDigits(StringPiece s, int64 expected, + StringPiece remaining) { + uint64 v; + StringPiece input(s); + if (str_util::ConsumeLeadingDigits(&input, &v)) { + EXPECT_EQ(v, static_cast<uint64>(expected)); + EXPECT_EQ(input, remaining); + } else { + EXPECT_LT(expected, 0); + EXPECT_EQ(input, remaining); + } +} + +TEST(ConsumeLeadingDigits, Basic) { + TestConsumeLeadingDigits("123", 123, ""); + TestConsumeLeadingDigits("a123", -1, "a123"); + TestConsumeLeadingDigits("9_", 9, "_"); + TestConsumeLeadingDigits("11111111111xyz", 11111111111ll, "xyz"); + + // Overflow case + TestConsumeLeadingDigits("1111111111111111111111111111111xyz", -1, + "1111111111111111111111111111111xyz"); + + // 2^64 + TestConsumeLeadingDigits("18446744073709551616xyz", -1, + "18446744073709551616xyz"); + // 2^64-1 + TestConsumeLeadingDigits("18446744073709551615xyz", 18446744073709551615ull, + "xyz"); +} + +TEST(ConsumePrefix, Basic) { + string s("abcdef"); + StringPiece input(s); + EXPECT_FALSE(str_util::ConsumePrefix(&input, "abcdefg")); + EXPECT_EQ(input, "abcdef"); + + EXPECT_FALSE(str_util::ConsumePrefix(&input, "abce")); + EXPECT_EQ(input, "abcdef"); + + EXPECT_TRUE(str_util::ConsumePrefix(&input, "")); + EXPECT_EQ(input, "abcdef"); + + EXPECT_FALSE(str_util::ConsumePrefix(&input, "abcdeg")); + EXPECT_EQ(input, "abcdef"); + + EXPECT_TRUE(str_util::ConsumePrefix(&input, "abcdef")); + EXPECT_EQ(input, ""); + + input = s; + EXPECT_TRUE(str_util::ConsumePrefix(&input, "abcde")); + EXPECT_EQ(input, "f"); +} + +TEST(JoinStrings, Basic) { + std::vector<string> s; + s = {"hi"}; + EXPECT_EQ(str_util::Join(s, " "), "hi"); + s = {"hi", "there", "strings"}; + EXPECT_EQ(str_util::Join(s, " "), "hi there strings"); + + std::vector<StringPiece> sp; + sp = {"hi"}; + EXPECT_EQ(str_util::Join(sp, ",,"), "hi"); + sp = {"hi", "there", "strings"}; + EXPECT_EQ(str_util::Join(sp, "--"), "hi--there--strings"); +} + +TEST(Split, Basic) { + EXPECT_TRUE(str_util::Split("", ',').empty()); + EXPECT_EQ(str_util::Join(str_util::Split("a", ','), "|"), "a"); + EXPECT_EQ(str_util::Join(str_util::Split(",", ','), "|"), "|"); + EXPECT_EQ(str_util::Join(str_util::Split("a,b,c", ','), "|"), "a|b|c"); + EXPECT_EQ(str_util::Join(str_util::Split("a,,,b,,c,", ','), "|"), + "a|||b||c|"); + EXPECT_EQ(str_util::Join( + str_util::Split("a,,,b,,c,", ',', str_util::SkipEmpty()), "|"), + "a|b|c"); + EXPECT_EQ( + str_util::Join( + str_util::Split("a, ,b,,c,", ',', str_util::SkipWhitespace()), "|"), + "a|b|c"); +} + +TEST(SplitAndParseAsInts, Basic) { + std::vector<int32> nums; + EXPECT_TRUE(str_util::SplitAndParseAsInts("", ',', &nums)); + EXPECT_EQ(nums.size(), 0); + + EXPECT_TRUE(str_util::SplitAndParseAsInts("134", ',', &nums)); + EXPECT_EQ(nums.size(), 1); + EXPECT_EQ(nums[0], 134); + + EXPECT_TRUE(str_util::SplitAndParseAsInts("134,2,13,-5", ',', &nums)); + EXPECT_EQ(nums.size(), 4); + EXPECT_EQ(nums[0], 134); + EXPECT_EQ(nums[1], 2); + EXPECT_EQ(nums[2], 13); + EXPECT_EQ(nums[3], -5); + + EXPECT_FALSE(str_util::SplitAndParseAsInts("abc", ',', &nums)); + + EXPECT_FALSE(str_util::SplitAndParseAsInts("-13,abc", ',', &nums)); + + EXPECT_FALSE(str_util::SplitAndParseAsInts("13,abc,5", ',', &nums)); +} + +TEST(Lowercase, Basic) { + EXPECT_EQ("", str_util::Lowercase("")); + EXPECT_EQ("hello", str_util::Lowercase("hello")); + EXPECT_EQ("hello world", str_util::Lowercase("Hello World")); +} + +TEST(Uppercase, Basic) { + EXPECT_EQ("", str_util::Uppercase("")); + EXPECT_EQ("HELLO", str_util::Uppercase("hello")); + EXPECT_EQ("HELLO WORLD", str_util::Uppercase("Hello World")); +} + +TEST(TitlecaseString, Basic) { + string s = "sparse_lookup"; + str_util::TitlecaseString(&s, "_"); + ASSERT_EQ(s, "Sparse_Lookup"); + + s = "sparse_lookup"; + str_util::TitlecaseString(&s, " "); + ASSERT_EQ(s, "Sparse_lookup"); + + s = "dense"; + str_util::TitlecaseString(&s, " "); + ASSERT_EQ(s, "Dense"); +} + +} // namespace tensorflow diff --git a/tensorflow/core/lib/strings/strcat.cc b/tensorflow/core/lib/strings/strcat.cc new file mode 100644 index 0000000000..e564b9eb73 --- /dev/null +++ b/tensorflow/core/lib/strings/strcat.cc @@ -0,0 +1,194 @@ +#include "tensorflow/core/lib/strings/strcat.h" + +#include <stdarg.h> +#include <stdint.h> +#include <stdio.h> +#include <string.h> + +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/lib/gtl/stl_util.h" + +namespace tensorflow { +namespace strings { + +AlphaNum gEmptyAlphaNum(""); + +AlphaNum::AlphaNum(Hex hex) { + char *const end = &digits_[kFastToBufferSize]; + char *writer = end; + uint64 value = hex.value; + uint64 width = hex.spec; + // We accomplish minimum width by OR'ing in 0x10000 to the user's value, + // where 0x10000 is the smallest hex number that is as wide as the user + // asked for. + uint64 mask = ((static_cast<uint64>(1) << (width - 1) * 4)) | value; + static const char hexdigits[] = "0123456789abcdef"; + do { + *--writer = hexdigits[value & 0xF]; + value >>= 4; + mask >>= 4; + } while (mask != 0); + piece_.set(writer, end - writer); +} + +// ---------------------------------------------------------------------- +// StrCat() +// This merges the given strings or integers, with no delimiter. This +// is designed to be the fastest possible way to construct a string out +// of a mix of raw C strings, StringPieces, strings, and integer values. +// ---------------------------------------------------------------------- + +// Append is merely a version of memcpy that returns the address of the byte +// after the area just overwritten. It comes in multiple flavors to minimize +// call overhead. +static char *Append1(char *out, const AlphaNum &x) { + memcpy(out, x.data(), x.size()); + return out + x.size(); +} + +static char *Append2(char *out, const AlphaNum &x1, const AlphaNum &x2) { + memcpy(out, x1.data(), x1.size()); + out += x1.size(); + + memcpy(out, x2.data(), x2.size()); + return out + x2.size(); +} + +static char *Append4(char *out, const AlphaNum &x1, const AlphaNum &x2, + const AlphaNum &x3, const AlphaNum &x4) { + memcpy(out, x1.data(), x1.size()); + out += x1.size(); + + memcpy(out, x2.data(), x2.size()); + out += x2.size(); + + memcpy(out, x3.data(), x3.size()); + out += x3.size(); + + memcpy(out, x4.data(), x4.size()); + return out + x4.size(); +} + +string StrCat(const AlphaNum &a, const AlphaNum &b) { + string result; + gtl::STLStringResizeUninitialized(&result, a.size() + b.size()); + char *const begin = &*result.begin(); + char *out = Append2(begin, a, b); + DCHECK_EQ(out, begin + result.size()); + return result; +} + +string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c) { + string result; + gtl::STLStringResizeUninitialized(&result, a.size() + b.size() + c.size()); + char *const begin = &*result.begin(); + char *out = Append2(begin, a, b); + out = Append1(out, c); + DCHECK_EQ(out, begin + result.size()); + return result; +} + +string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c, + const AlphaNum &d) { + string result; + gtl::STLStringResizeUninitialized(&result, + a.size() + b.size() + c.size() + d.size()); + char *const begin = &*result.begin(); + char *out = Append4(begin, a, b, c, d); + DCHECK_EQ(out, begin + result.size()); + return result; +} + +namespace internal { + +// Do not call directly - these are not part of the public API. +string CatPieces(std::initializer_list<StringPiece> pieces) { + string result; + size_t total_size = 0; + for (const StringPiece piece : pieces) total_size += piece.size(); + gtl::STLStringResizeUninitialized(&result, total_size); + + char *const begin = &*result.begin(); + char *out = begin; + for (const StringPiece piece : pieces) { + const size_t this_size = piece.size(); + memcpy(out, piece.data(), this_size); + out += this_size; + } + DCHECK_EQ(out, begin + result.size()); + return result; +} + +// It's possible to call StrAppend with a StringPiece that is itself a fragment +// of the string we're appending to. However the results of this are random. +// Therefore, check for this in debug mode. Use unsigned math so we only have +// to do one comparison. +#define DCHECK_NO_OVERLAP(dest, src) \ + DCHECK_GE(uintptr_t((src).data() - (dest).data()), uintptr_t((dest).size())) + +void AppendPieces(string *result, std::initializer_list<StringPiece> pieces) { + size_t old_size = result->size(); + size_t total_size = old_size; + for (const StringPiece piece : pieces) { + DCHECK_NO_OVERLAP(*result, piece); + total_size += piece.size(); + } + gtl::STLStringResizeUninitialized(result, total_size); + + char *const begin = &*result->begin(); + char *out = begin + old_size; + for (const StringPiece piece : pieces) { + const size_t this_size = piece.size(); + memcpy(out, piece.data(), this_size); + out += this_size; + } + DCHECK_EQ(out, begin + result->size()); +} + +} // namespace internal + +void StrAppend(string *result, const AlphaNum &a) { + DCHECK_NO_OVERLAP(*result, a); + result->append(a.data(), a.size()); +} + +void StrAppend(string *result, const AlphaNum &a, const AlphaNum &b) { + DCHECK_NO_OVERLAP(*result, a); + DCHECK_NO_OVERLAP(*result, b); + string::size_type old_size = result->size(); + gtl::STLStringResizeUninitialized(result, old_size + a.size() + b.size()); + char *const begin = &*result->begin(); + char *out = Append2(begin + old_size, a, b); + DCHECK_EQ(out, begin + result->size()); +} + +void StrAppend(string *result, const AlphaNum &a, const AlphaNum &b, + const AlphaNum &c) { + DCHECK_NO_OVERLAP(*result, a); + DCHECK_NO_OVERLAP(*result, b); + DCHECK_NO_OVERLAP(*result, c); + string::size_type old_size = result->size(); + gtl::STLStringResizeUninitialized(result, + old_size + a.size() + b.size() + c.size()); + char *const begin = &*result->begin(); + char *out = Append2(begin + old_size, a, b); + out = Append1(out, c); + DCHECK_EQ(out, begin + result->size()); +} + +void StrAppend(string *result, const AlphaNum &a, const AlphaNum &b, + const AlphaNum &c, const AlphaNum &d) { + DCHECK_NO_OVERLAP(*result, a); + DCHECK_NO_OVERLAP(*result, b); + DCHECK_NO_OVERLAP(*result, c); + DCHECK_NO_OVERLAP(*result, d); + string::size_type old_size = result->size(); + gtl::STLStringResizeUninitialized( + result, old_size + a.size() + b.size() + c.size() + d.size()); + char *const begin = &*result->begin(); + char *out = Append4(begin + old_size, a, b, c, d); + DCHECK_EQ(out, begin + result->size()); +} + +} // namespace strings +} // namespace tensorflow diff --git a/tensorflow/core/lib/strings/strcat.h b/tensorflow/core/lib/strings/strcat.h new file mode 100644 index 0000000000..763ad8368a --- /dev/null +++ b/tensorflow/core/lib/strings/strcat.h @@ -0,0 +1,229 @@ +// #status: RECOMMENDED +// #category: operations on strings +// #summary: Merges strings or numbers with no delimiter. +// +#ifndef TENSORFLOW_LIB_STRINGS_STRCAT_H_ +#define TENSORFLOW_LIB_STRINGS_STRCAT_H_ + +#include <string> + +#include "tensorflow/core/lib/core/stringpiece.h" +#include "tensorflow/core/lib/strings/numbers.h" +#include "tensorflow/core/platform/port.h" + +// The AlphaNum type was designed to be used as the parameter type for StrCat(). +// Any routine accepting either a string or a number may accept it. +// The basic idea is that by accepting a "const AlphaNum &" as an argument +// to your function, your callers will automagically convert bools, integers, +// and floating point values to strings for you. +// +// NOTE: Use of AlphaNum outside of the //strings package is unsupported except +// for the specific case of function parameters of type "AlphaNum" or "const +// AlphaNum &". In particular, instantiating AlphaNum directly as a stack +// variable is not supported. +// +// Conversion from 8-bit values is not accepted because if it were, then an +// attempt to pass ':' instead of ":" might result in a 58 ending up in your +// result. +// +// Bools convert to "0" or "1". +// +// Floating point values are converted to a string which, if passed to strtod(), +// would produce the exact same original double (except in case of NaN; all NaNs +// are considered the same value). We try to keep the string short but it's not +// guaranteed to be as short as possible. +// +// You can convert to Hexadecimal output rather than Decimal output using Hex. +// To do this, pass strings::Hex(my_int) as a parameter to StrCat. You may +// specify a minimum field width using a separate parameter, so the equivalent +// of Printf("%04x", my_int) is StrCat(Hex(my_int, strings::ZERO_PAD_4)) +// +// This class has implicit constructors. +namespace tensorflow { +namespace strings { + +enum PadSpec { + NO_PAD = 1, + ZERO_PAD_2, + ZERO_PAD_3, + ZERO_PAD_4, + ZERO_PAD_5, + ZERO_PAD_6, + ZERO_PAD_7, + ZERO_PAD_8, + ZERO_PAD_9, + ZERO_PAD_10, + ZERO_PAD_11, + ZERO_PAD_12, + ZERO_PAD_13, + ZERO_PAD_14, + ZERO_PAD_15, + ZERO_PAD_16, +}; + +struct Hex { + uint64 value; + enum PadSpec spec; + template <class Int> + explicit Hex(Int v, PadSpec s = NO_PAD) + : spec(s) { + // Prevent sign-extension by casting integers to + // their unsigned counterparts. + static_assert( + sizeof(v) == 1 || sizeof(v) == 2 || sizeof(v) == 4 || sizeof(v) == 8, + "Unknown integer type"); + value = sizeof(v) == 1 + ? static_cast<uint8>(v) + : sizeof(v) == 2 ? static_cast<uint16>(v) + : sizeof(v) == 4 ? static_cast<uint32>(v) + : static_cast<uint64>(v); + } +}; + +class AlphaNum { + public: + // No bool ctor -- bools convert to an integral type. + // A bool ctor would also convert incoming pointers (bletch). + + AlphaNum(int i32) // NOLINT(runtime/explicit) + : piece_(digits_, FastInt32ToBufferLeft(i32, digits_) - &digits_[0]) {} + AlphaNum(unsigned int u32) // NOLINT(runtime/explicit) + : piece_(digits_, FastUInt32ToBufferLeft(u32, digits_) - &digits_[0]) {} + AlphaNum(long x) // NOLINT(runtime/explicit) + : piece_(digits_, FastInt64ToBufferLeft(x, digits_) - &digits_[0]) {} + AlphaNum(unsigned long x) // NOLINT(runtime/explicit) + : piece_(digits_, FastUInt64ToBufferLeft(x, digits_) - &digits_[0]) {} + AlphaNum(long long int i64) // NOLINT(runtime/explicit) + : piece_(digits_, FastInt64ToBufferLeft(i64, digits_) - &digits_[0]) {} + AlphaNum(unsigned long long int u64) // NOLINT(runtime/explicit) + : piece_(digits_, FastUInt64ToBufferLeft(u64, digits_) - &digits_[0]) {} + + AlphaNum(float f) // NOLINT(runtime/explicit) + : piece_(digits_, strlen(FloatToBuffer(f, digits_))) {} + AlphaNum(double f) // NOLINT(runtime/explicit) + : piece_(digits_, strlen(DoubleToBuffer(f, digits_))) {} + + AlphaNum(Hex hex); // NOLINT(runtime/explicit) + + AlphaNum(const char *c_str) : piece_(c_str) {} // NOLINT(runtime/explicit) + AlphaNum(const StringPiece &pc) : piece_(pc) {} // NOLINT(runtime/explicit) + AlphaNum(const tensorflow::string &str) // NOLINT(runtime/explicit) + : piece_(str) {} + + StringPiece::size_type size() const { return piece_.size(); } + const char *data() const { return piece_.data(); } + StringPiece Piece() const { return piece_; } + + private: + StringPiece piece_; + char digits_[kFastToBufferSize]; + + // Use ":" not ':' + AlphaNum(char c); // NOLINT(runtime/explicit) + + TF_DISALLOW_COPY_AND_ASSIGN(AlphaNum); +}; + +extern AlphaNum gEmptyAlphaNum; + +using strings::AlphaNum; +using strings::gEmptyAlphaNum; + +// ---------------------------------------------------------------------- +// StrCat() +// This merges the given strings or numbers, with no delimiter. This +// is designed to be the fastest possible way to construct a string out +// of a mix of raw C strings, StringPieces, strings, bool values, +// and numeric values. +// +// Don't use this for user-visible strings. The localization process +// works poorly on strings built up out of fragments. +// +// For clarity and performance, don't use StrCat when appending to a +// string. In particular, avoid using any of these (anti-)patterns: +// str.append(StrCat(...)) +// str += StrCat(...) +// str = StrCat(str, ...) +// where the last is the worse, with the potential to change a loop +// from a linear time operation with O(1) dynamic allocations into a +// quadratic time operation with O(n) dynamic allocations. StrAppend +// is a better choice than any of the above, subject to the restriction +// of StrAppend(&str, a, b, c, ...) that none of the a, b, c, ... may +// be a reference into str. +// ---------------------------------------------------------------------- + +// For performance reasons, we have specializations for <= 4 args. +string StrCat(const AlphaNum &a) TF_MUST_USE_RESULT; +string StrCat(const AlphaNum &a, const AlphaNum &b) TF_MUST_USE_RESULT; +string StrCat(const AlphaNum &a, const AlphaNum &b, + const AlphaNum &c) TF_MUST_USE_RESULT; +string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c, + const AlphaNum &d) TF_MUST_USE_RESULT; + +// inline definitions must be duplicated due to TF_MUST_USE_RESULT +inline string StrCat(const AlphaNum &a) { return string(a.data(), a.size()); } + +namespace internal { + +// Do not call directly - this is not part of the public API. +string CatPieces(std::initializer_list<StringPiece> pieces); +void AppendPieces(string *dest, std::initializer_list<StringPiece> pieces); + +} // namespace internal + +// Support 5 or more arguments +template <typename... AV> +string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c, + const AlphaNum &d, const AlphaNum &e, + const AV &... args) TF_MUST_USE_RESULT; + +template <typename... AV> +inline string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c, + const AlphaNum &d, const AlphaNum &e, const AV &... args) { + return internal::CatPieces({a.Piece(), b.Piece(), c.Piece(), d.Piece(), + e.Piece(), + static_cast<const AlphaNum &>(args).Piece()...}); +} + +// ---------------------------------------------------------------------- +// StrAppend() +// Same as above, but adds the output to the given string. +// WARNING: For speed, StrAppend does not try to check each of its input +// arguments to be sure that they are not a subset of the string being +// appended to. That is, while this will work: +// +// string s = "foo"; +// s += s; +// +// This will not (necessarily) work: +// +// string s = "foo"; +// StrAppend(&s, s); +// +// Note: while StrCat supports appending up to 26 arguments, StrAppend +// is currently limited to 9. That's rarely an issue except when +// automatically transforming StrCat to StrAppend, and can easily be +// worked around as consecutive calls to StrAppend are quite efficient. +// ---------------------------------------------------------------------- + +void StrAppend(string *dest, const AlphaNum &a); +void StrAppend(string *dest, const AlphaNum &a, const AlphaNum &b); +void StrAppend(string *dest, const AlphaNum &a, const AlphaNum &b, + const AlphaNum &c); +void StrAppend(string *dest, const AlphaNum &a, const AlphaNum &b, + const AlphaNum &c, const AlphaNum &d); + +// Support 5 or more arguments +template <typename... AV> +inline void StrAppend(string *dest, const AlphaNum &a, const AlphaNum &b, + const AlphaNum &c, const AlphaNum &d, const AlphaNum &e, + const AV &... args) { + internal::AppendPieces(dest, + {a.Piece(), b.Piece(), c.Piece(), d.Piece(), e.Piece(), + static_cast<const AlphaNum &>(args).Piece()...}); +} + +} // namespace strings +} // namespace tensorflow + +#endif // TENSORFLOW_LIB_STRINGS_STRCAT_H_ diff --git a/tensorflow/core/lib/strings/strcat_test.cc b/tensorflow/core/lib/strings/strcat_test.cc new file mode 100644 index 0000000000..9ff7d81af9 --- /dev/null +++ b/tensorflow/core/lib/strings/strcat_test.cc @@ -0,0 +1,324 @@ +#include "tensorflow/core/lib/strings/strcat.h" + +#include <string> + +#include "tensorflow/core/lib/strings/stringprintf.h" +#include "tensorflow/core/platform/port.h" +#include <gtest/gtest.h> + +namespace tensorflow { +namespace strings { + +// Test StrCat of ints and longs of various sizes and signdedness. +TEST(StrCat, Ints) { + const int16 s = -1; + const uint16 us = 2; + const int i = -3; + const unsigned int ui = 4; + const int32 l = -5; + const uint32 ul = 6; + const int64 ll = -7; + const uint64 ull = 8; + const ptrdiff_t ptrdiff = -9; + const size_t size = 10; + const ssize_t ssize = -11; + const intptr_t intptr = -12; + const uintptr_t uintptr = 13; + string answer; + answer = StrCat(s, us); + EXPECT_EQ(answer, "-12"); + answer = StrCat(i, ui); + EXPECT_EQ(answer, "-34"); + answer = StrCat(l, ul); + EXPECT_EQ(answer, "-56"); + answer = StrCat(ll, ull); + EXPECT_EQ(answer, "-78"); + answer = StrCat(ptrdiff, size); + EXPECT_EQ(answer, "-910"); + answer = StrCat(ssize, intptr); + EXPECT_EQ(answer, "-11-12"); + answer = StrCat(uintptr, 0); + EXPECT_EQ(answer, "130"); +} + +TEST(StrCat, Basics) { + string result; + + string strs[] = {"Hello", "Cruel", "World"}; + + StringPiece pieces[] = {"Hello", "Cruel", "World"}; + + const char *c_strs[] = {"Hello", "Cruel", "World"}; + + int32 i32s[] = {'H', 'C', 'W'}; + uint64 ui64s[] = {12345678910LL, 10987654321LL}; + + result = StrCat(false, true, 2, 3); + EXPECT_EQ(result, "0123"); + + result = StrCat(-1); + EXPECT_EQ(result, "-1"); + + result = StrCat(0.5); + EXPECT_EQ(result, "0.5"); + + result = StrCat(strs[1], pieces[2]); + EXPECT_EQ(result, "CruelWorld"); + + result = StrCat(strs[0], ", ", pieces[2]); + EXPECT_EQ(result, "Hello, World"); + + result = StrCat(strs[0], ", ", strs[1], " ", strs[2], "!"); + EXPECT_EQ(result, "Hello, Cruel World!"); + + result = StrCat(pieces[0], ", ", pieces[1], " ", pieces[2]); + EXPECT_EQ(result, "Hello, Cruel World"); + + result = StrCat(c_strs[0], ", ", c_strs[1], " ", c_strs[2]); + EXPECT_EQ(result, "Hello, Cruel World"); + + result = StrCat("ASCII ", i32s[0], ", ", i32s[1], " ", i32s[2], "!"); + EXPECT_EQ(result, "ASCII 72, 67 87!"); + + result = StrCat(ui64s[0], ", ", ui64s[1], "!"); + EXPECT_EQ(result, "12345678910, 10987654321!"); + + string one = "1"; // Actually, it's the size of this string that we want; a + // 64-bit build distinguishes between size_t and uint64, + // even though they're both unsigned 64-bit values. + result = StrCat("And a ", one.size(), " and a ", &result[2] - &result[0], + " and a ", one, " 2 3 4", "!"); + EXPECT_EQ(result, "And a 1 and a 2 and a 1 2 3 4!"); + + // result = StrCat("Single chars won't compile", '!'); + // result = StrCat("Neither will NULLs", NULL); + result = StrCat("To output a char by ASCII/numeric value, use +: ", '!' + 0); + EXPECT_EQ(result, "To output a char by ASCII/numeric value, use +: 33"); + + float f = 100000.5; + result = StrCat("A hundred K and a half is ", f); + EXPECT_EQ(result, "A hundred K and a half is 100000.5"); + + double d = f; + d *= d; + result = StrCat("A hundred K and a half squared is ", d); + EXPECT_EQ(result, "A hundred K and a half squared is 10000100000.25"); + + result = StrCat(1, 2, 333, 4444, 55555, 666666, 7777777, 88888888, 999999999); + EXPECT_EQ(result, "12333444455555666666777777788888888999999999"); +} + +TEST(StrCat, MaxArgs) { + string result; + // Test 10 up to 26 arguments, the current maximum + result = StrCat(1, 2, 3, 4, 5, 6, 7, 8, 9, "a"); + EXPECT_EQ(result, "123456789a"); + result = StrCat(1, 2, 3, 4, 5, 6, 7, 8, 9, "a", "b"); + EXPECT_EQ(result, "123456789ab"); + result = StrCat(1, 2, 3, 4, 5, 6, 7, 8, 9, "a", "b", "c"); + EXPECT_EQ(result, "123456789abc"); + result = StrCat(1, 2, 3, 4, 5, 6, 7, 8, 9, "a", "b", "c", "d"); + EXPECT_EQ(result, "123456789abcd"); + result = StrCat(1, 2, 3, 4, 5, 6, 7, 8, 9, "a", "b", "c", "d", "e"); + EXPECT_EQ(result, "123456789abcde"); + result = StrCat(1, 2, 3, 4, 5, 6, 7, 8, 9, "a", "b", "c", "d", "e", "f"); + EXPECT_EQ(result, "123456789abcdef"); + result = StrCat(1, 2, 3, 4, 5, 6, 7, 8, 9, "a", "b", "c", "d", "e", "f", "g"); + EXPECT_EQ(result, "123456789abcdefg"); + result = + StrCat(1, 2, 3, 4, 5, 6, 7, 8, 9, "a", "b", "c", "d", "e", "f", "g", "h"); + EXPECT_EQ(result, "123456789abcdefgh"); + result = StrCat(1, 2, 3, 4, 5, 6, 7, 8, 9, "a", "b", "c", "d", "e", "f", "g", + "h", "i"); + EXPECT_EQ(result, "123456789abcdefghi"); + result = StrCat(1, 2, 3, 4, 5, 6, 7, 8, 9, "a", "b", "c", "d", "e", "f", "g", + "h", "i", "j"); + EXPECT_EQ(result, "123456789abcdefghij"); + result = StrCat(1, 2, 3, 4, 5, 6, 7, 8, 9, "a", "b", "c", "d", "e", "f", "g", + "h", "i", "j", "k"); + EXPECT_EQ(result, "123456789abcdefghijk"); + result = StrCat(1, 2, 3, 4, 5, 6, 7, 8, 9, "a", "b", "c", "d", "e", "f", "g", + "h", "i", "j", "k", "l"); + EXPECT_EQ(result, "123456789abcdefghijkl"); + result = StrCat(1, 2, 3, 4, 5, 6, 7, 8, 9, "a", "b", "c", "d", "e", "f", "g", + "h", "i", "j", "k", "l", "m"); + EXPECT_EQ(result, "123456789abcdefghijklm"); + result = StrCat(1, 2, 3, 4, 5, 6, 7, 8, 9, "a", "b", "c", "d", "e", "f", "g", + "h", "i", "j", "k", "l", "m", "n"); + EXPECT_EQ(result, "123456789abcdefghijklmn"); + result = StrCat(1, 2, 3, 4, 5, 6, 7, 8, 9, "a", "b", "c", "d", "e", "f", "g", + "h", "i", "j", "k", "l", "m", "n", "o"); + EXPECT_EQ(result, "123456789abcdefghijklmno"); + result = StrCat(1, 2, 3, 4, 5, 6, 7, 8, 9, "a", "b", "c", "d", "e", "f", "g", + "h", "i", "j", "k", "l", "m", "n", "o", "p"); + EXPECT_EQ(result, "123456789abcdefghijklmnop"); + result = StrCat(1, 2, 3, 4, 5, 6, 7, 8, 9, "a", "b", "c", "d", "e", "f", "g", + "h", "i", "j", "k", "l", "m", "n", "o", "p", "q"); + EXPECT_EQ(result, "123456789abcdefghijklmnopq"); + // No limit thanks to C++11's variadic templates + result = StrCat(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, "a", "b", "c", "d", "e", "f", + "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", + "s", "t", "u", "v", "w", "x", "y", "z", "A", "B", "C", "D", + "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", + "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z"); + EXPECT_EQ(result, + "12345678910abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"); +} + +TEST(StrAppend, Basics) { + string result = "existing text"; + + string strs[] = {"Hello", "Cruel", "World"}; + + StringPiece pieces[] = {"Hello", "Cruel", "World"}; + + const char *c_strs[] = {"Hello", "Cruel", "World"}; + + int32 i32s[] = {'H', 'C', 'W'}; + uint64 ui64s[] = {12345678910LL, 10987654321LL}; + + string::size_type old_size = result.size(); + StrAppend(&result, strs[0]); + EXPECT_EQ(result.substr(old_size), "Hello"); + + old_size = result.size(); + StrAppend(&result, strs[1], pieces[2]); + EXPECT_EQ(result.substr(old_size), "CruelWorld"); + + old_size = result.size(); + StrAppend(&result, strs[0], ", ", pieces[2]); + EXPECT_EQ(result.substr(old_size), "Hello, World"); + + old_size = result.size(); + StrAppend(&result, strs[0], ", ", strs[1], " ", strs[2], "!"); + EXPECT_EQ(result.substr(old_size), "Hello, Cruel World!"); + + old_size = result.size(); + StrAppend(&result, pieces[0], ", ", pieces[1], " ", pieces[2]); + EXPECT_EQ(result.substr(old_size), "Hello, Cruel World"); + + old_size = result.size(); + StrAppend(&result, c_strs[0], ", ", c_strs[1], " ", c_strs[2]); + EXPECT_EQ(result.substr(old_size), "Hello, Cruel World"); + + old_size = result.size(); + StrAppend(&result, "ASCII ", i32s[0], ", ", i32s[1], " ", i32s[2], "!"); + EXPECT_EQ(result.substr(old_size), "ASCII 72, 67 87!"); + + old_size = result.size(); + StrAppend(&result, ui64s[0], ", ", ui64s[1], "!"); + EXPECT_EQ(result.substr(old_size), "12345678910, 10987654321!"); + + string one = "1"; // Actually, it's the size of this string that we want; a + // 64-bit build distinguishes between size_t and uint64, + // even though they're both unsigned 64-bit values. + old_size = result.size(); + StrAppend(&result, "And a ", one.size(), " and a ", &result[2] - &result[0], + " and a ", one, " 2 3 4", "!"); + EXPECT_EQ(result.substr(old_size), "And a 1 and a 2 and a 1 2 3 4!"); + + // result = StrCat("Single chars won't compile", '!'); + // result = StrCat("Neither will NULLs", NULL); + old_size = result.size(); + StrAppend(&result, "To output a char by ASCII/numeric value, use +: ", + '!' + 0); + EXPECT_EQ(result.substr(old_size), + "To output a char by ASCII/numeric value, use +: 33"); + + float f = 100000.5; + old_size = result.size(); + StrAppend(&result, "A hundred K and a half is ", f); + EXPECT_EQ(result.substr(old_size), "A hundred K and a half is 100000.5"); + + double d = f; + d *= d; + old_size = result.size(); + StrAppend(&result, "A hundred K and a half squared is ", d); + EXPECT_EQ(result.substr(old_size), + "A hundred K and a half squared is 10000100000.25"); + + // Test 9 arguments, the old maximum + old_size = result.size(); + StrAppend(&result, 1, 22, 333, 4444, 55555, 666666, 7777777, 88888888, 9); + EXPECT_EQ(result.substr(old_size), "1223334444555556666667777777888888889"); + + // No limit thanks to C++11's variadic templates + old_size = result.size(); + StrAppend(&result, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, "a", "b", "c", "d", "e", + "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", + "s", "t", "u", "v", "w", "x", "y", "z", "A", "B", "C", "D", "E", + "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", + "S", "T", "U", "V", "W", "X", "Y", "Z", + "No limit thanks to C++11's variadic templates"); + EXPECT_EQ(result.substr(old_size), + "12345678910abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" + "No limit thanks to C++11's variadic templates"); +} + +TEST(StrAppend, Death) { + string s = "self"; + EXPECT_DEBUG_DEATH(StrAppend(&s, s.c_str() + 1), "Check failed:"); + EXPECT_DEBUG_DEATH(StrAppend(&s, s), "Check failed:"); +} + +static void CheckHex64(uint64 v) { + using tensorflow::strings::Hex; + string actual = StrCat(Hex(v, tensorflow::strings::ZERO_PAD_16)); + string expected = Printf("%016llx", static_cast<unsigned long long>(v)); + EXPECT_EQ(expected, actual) << " decimal value " << v; + + actual = StrCat(Hex(v, tensorflow::strings::ZERO_PAD_8)); + expected = Printf("%08llx", static_cast<unsigned long long>(v)); + EXPECT_EQ(expected, actual) << " decimal value " << v; + + actual = StrCat(Hex(v)); + expected = Printf("%llx", static_cast<unsigned long long>(v)); + EXPECT_EQ(expected, actual) << " decimal value " << v; +} + +static void CheckHex32(uint32 v) { + using tensorflow::strings::Hex; + string actual = StrCat(Hex(v, tensorflow::strings::ZERO_PAD_8)); + string expected = Printf("%08x", v); + EXPECT_EQ(expected, actual) << " decimal value " << v; + + actual = StrCat(Hex(v)); + expected = Printf("%x", v); + EXPECT_EQ(expected, actual) << " decimal value " << v; +} + +static void CheckHexSigned32(int32 v) { + using tensorflow::strings::Hex; + string actual = StrCat(Hex(v, tensorflow::strings::ZERO_PAD_8)); + string expected = Printf("%08x", v); + EXPECT_EQ(expected, actual) << " decimal value " << v; + + actual = StrCat(Hex(v)); + expected = Printf("%x", v); + EXPECT_EQ(expected, actual) << " decimal value " << v; +} + +static void TestFastPrints() { + using tensorflow::strings::Hex; + + // Test min int to make sure that works + for (int i = 0; i < 10000; i++) { + CheckHex64(i); + CheckHex32(i); + CheckHexSigned32(i); + CheckHexSigned32(-i); + } + CheckHex64(0x123456789abcdef0ull); + CheckHex32(0x12345678); + + int8 minus_one_8bit = -1; + EXPECT_EQ("ff", StrCat(Hex(minus_one_8bit))); + + int16 minus_one_16bit = -1; + EXPECT_EQ("ffff", StrCat(Hex(minus_one_16bit))); +} + +TEST(Numbers, TestFunctionsMovedOverFromNumbersMain) { TestFastPrints(); } + +} // namespace strings +} // namespace tensorflow diff --git a/tensorflow/core/lib/strings/stringprintf.cc b/tensorflow/core/lib/strings/stringprintf.cc new file mode 100644 index 0000000000..b354706cbd --- /dev/null +++ b/tensorflow/core/lib/strings/stringprintf.cc @@ -0,0 +1,85 @@ +#include "tensorflow/core/lib/strings/stringprintf.h" + +#include <errno.h> +#include <stdarg.h> // For va_list and related operations +#include <stdio.h> // MSVC requires this for _vsnprintf +#include <vector> + +namespace tensorflow { +namespace strings { + +#ifdef COMPILER_MSVC +enum { IS_COMPILER_MSVC = 1 }; +#else +enum { IS_COMPILER_MSVC = 0 }; +#endif + +void Appendv(string* dst, const char* format, va_list ap) { + // First try with a small fixed size buffer + static const int kSpaceLength = 1024; + char space[kSpaceLength]; + + // It's possible for methods that use a va_list to invalidate + // the data in it upon use. The fix is to make a copy + // of the structure before using it and use that copy instead. + va_list backup_ap; + va_copy(backup_ap, ap); + int result = vsnprintf(space, kSpaceLength, format, backup_ap); + va_end(backup_ap); + + if (result < kSpaceLength) { + if (result >= 0) { + // Normal case -- everything fit. + dst->append(space, result); + return; + } + + if (IS_COMPILER_MSVC) { + // Error or MSVC running out of space. MSVC 8.0 and higher + // can be asked about space needed with the special idiom below: + va_copy(backup_ap, ap); + result = vsnprintf(NULL, 0, format, backup_ap); + va_end(backup_ap); + } + + if (result < 0) { + // Just an error. + return; + } + } + + // Increase the buffer size to the size requested by vsnprintf, + // plus one for the closing \0. + int length = result + 1; + char* buf = new char[length]; + + // Restore the va_list before we use it again + va_copy(backup_ap, ap); + result = vsnprintf(buf, length, format, backup_ap); + va_end(backup_ap); + + if (result >= 0 && result < length) { + // It fit + dst->append(buf, result); + } + delete[] buf; +} + +string Printf(const char* format, ...) { + va_list ap; + va_start(ap, format); + string result; + Appendv(&result, format, ap); + va_end(ap); + return result; +} + +void Appendf(string* dst, const char* format, ...) { + va_list ap; + va_start(ap, format); + Appendv(dst, format, ap); + va_end(ap); +} + +} // namespace strings +} // namespace tensorflow diff --git a/tensorflow/core/lib/strings/stringprintf.h b/tensorflow/core/lib/strings/stringprintf.h new file mode 100644 index 0000000000..23ca2583ca --- /dev/null +++ b/tensorflow/core/lib/strings/stringprintf.h @@ -0,0 +1,37 @@ +// Printf variants that place their output in a C++ string. +// +// Usage: +// string result = strings::Printf("%d %s\n", 10, "hello"); +// strings::SPrintf(&result, "%d %s\n", 10, "hello"); +// strings::Appendf(&result, "%d %s\n", 20, "there"); + +#ifndef TENSORFLOW_LIB_STRINGS_STRINGPRINTF_H_ +#define TENSORFLOW_LIB_STRINGS_STRINGPRINTF_H_ + +#include <stdarg.h> +#include <string> +#include <vector> + +#include "tensorflow/core/platform/port.h" + +namespace tensorflow { +namespace strings { + +// Return a C++ string +extern string Printf(const char* format, ...) + // Tell the compiler to do printf format string checking. + TF_PRINTF_ATTRIBUTE(1, 2); + +// Append result to a supplied string +extern void Appendf(string* dst, const char* format, ...) + // Tell the compiler to do printf format string checking. + TF_PRINTF_ATTRIBUTE(2, 3); + +// Lower-level routine that takes a va_list and appends to a specified +// string. All other routines are just convenience wrappers around it. +extern void Appendv(string* dst, const char* format, va_list ap); + +} // namespace strings +} // namespace tensorflow + +#endif // TENSORFLOW_LIB_STRINGS_STRINGPRINTF_H_ diff --git a/tensorflow/core/lib/strings/stringprintf_test.cc b/tensorflow/core/lib/strings/stringprintf_test.cc new file mode 100644 index 0000000000..737ed5c0e0 --- /dev/null +++ b/tensorflow/core/lib/strings/stringprintf_test.cc @@ -0,0 +1,113 @@ +#include "tensorflow/core/lib/strings/stringprintf.h" + +#include <string> + +#include <gtest/gtest.h> + +namespace tensorflow { +namespace strings { +namespace { + +TEST(PrintfTest, Empty) { + EXPECT_EQ("", Printf("%s", string().c_str())); + EXPECT_EQ("", Printf("%s", "")); +} + +TEST(PrintfTest, Misc) { +// MSVC does not support $ format specifier. +#if !defined(COMPILER_MSVC) + EXPECT_EQ("123hello w", Printf("%3$d%2$s %1$c", 'w', "hello", 123)); +#endif // !COMPILER_MSVC +} + +TEST(AppendfTest, Empty) { + string value("Hello"); + const char* empty = ""; + Appendf(&value, "%s", empty); + EXPECT_EQ("Hello", value); +} + +TEST(AppendfTest, EmptyString) { + string value("Hello"); + Appendf(&value, "%s", ""); + EXPECT_EQ("Hello", value); +} + +TEST(AppendfTest, String) { + string value("Hello"); + Appendf(&value, " %s", "World"); + EXPECT_EQ("Hello World", value); +} + +TEST(AppendfTest, Int) { + string value("Hello"); + Appendf(&value, " %d", 123); + EXPECT_EQ("Hello 123", value); +} + +TEST(PrintfTest, Multibyte) { + // If we are in multibyte mode and feed invalid multibyte sequence, + // Printf should return an empty string instead of running + // out of memory while trying to determine destination buffer size. + // see b/4194543. + + char* old_locale = setlocale(LC_CTYPE, NULL); + // Push locale with multibyte mode + setlocale(LC_CTYPE, "en_US.utf8"); + + const char kInvalidCodePoint[] = "\375\067s"; + string value = Printf("%.*s", 3, kInvalidCodePoint); + + // In some versions of glibc (e.g. eglibc-2.11.1, aka GRTEv2), snprintf + // returns error given an invalid codepoint. Other versions + // (e.g. eglibc-2.15, aka pre-GRTEv3) emit the codepoint verbatim. + // We test that the output is one of the above. + EXPECT_TRUE(value.empty() || value == kInvalidCodePoint); + + // Repeat with longer string, to make sure that the dynamically + // allocated path in StringAppendV is handled correctly. + int n = 2048; + char* buf = new char[n + 1]; + memset(buf, ' ', n - 3); + memcpy(buf + n - 3, kInvalidCodePoint, 4); + value = Printf("%.*s", n, buf); + // See GRTEv2 vs. GRTEv3 comment above. + EXPECT_TRUE(value.empty() || value == buf); + delete[] buf; + + setlocale(LC_CTYPE, old_locale); +} + +TEST(PrintfTest, NoMultibyte) { + // No multibyte handling, but the string contains funny chars. + char* old_locale = setlocale(LC_CTYPE, NULL); + setlocale(LC_CTYPE, "POSIX"); + string value = Printf("%.*s", 3, "\375\067s"); + setlocale(LC_CTYPE, old_locale); + EXPECT_EQ("\375\067s", value); +} + +TEST(PrintfTest, DontOverwriteErrno) { + // Check that errno isn't overwritten unless we're printing + // something significantly larger than what people are normally + // printing in their badly written PLOG() statements. + errno = ECHILD; + string value = Printf("Hello, %s!", "World"); + EXPECT_EQ(ECHILD, errno); +} + +TEST(PrintfTest, LargeBuf) { + // Check that the large buffer is handled correctly. + int n = 2048; + char* buf = new char[n + 1]; + memset(buf, ' ', n); + buf[n] = 0; + string value = Printf("%s", buf); + EXPECT_EQ(buf, value); + delete[] buf; +} + +} // namespace + +} // namespace strings +} // namespace tensorflow |