diff options
author | Manjunath Kudlur <keveman@gmail.com> | 2015-11-06 16:27:58 -0800 |
---|---|---|
committer | Manjunath Kudlur <keveman@gmail.com> | 2015-11-06 16:27:58 -0800 |
commit | f41959ccb2d9d4c722fe8fc3351401d53bcf4900 (patch) | |
tree | ef0ca22cb2a5ac4bdec9d080d8e0788a53ed496d /tensorflow/core/lib/strings/str_util.cc |
TensorFlow: Initial commit of TensorFlow library.
TensorFlow is an open source software library for numerical computation
using data flow graphs.
Base CL: 107276108
Diffstat (limited to 'tensorflow/core/lib/strings/str_util.cc')
-rw-r--r-- | tensorflow/core/lib/strings/str_util.cc | 312 |
1 files changed, 312 insertions, 0 deletions
diff --git a/tensorflow/core/lib/strings/str_util.cc b/tensorflow/core/lib/strings/str_util.cc new file mode 100644 index 0000000000..cccd50c7ff --- /dev/null +++ b/tensorflow/core/lib/strings/str_util.cc @@ -0,0 +1,312 @@ +#include "tensorflow/core/lib/strings/str_util.h" +#include <ctype.h> + +namespace tensorflow { +namespace str_util { + +static char hex_char[] = "0123456789abcdef"; + +string CEscape(const string& src) { + string dest; + + for (unsigned char c : src) { + switch (c) { + case '\n': + dest.append("\\n"); + break; + case '\r': + dest.append("\\r"); + break; + case '\t': + dest.append("\\t"); + break; + case '\"': + dest.append("\\\""); + break; + case '\'': + dest.append("\\'"); + break; + case '\\': + dest.append("\\\\"); + break; + default: + // Note that if we emit \xNN and the src character after that is a hex + // digit then that digit must be escaped too to prevent it being + // interpreted as part of the character code by C. + if ((c >= 0x80) || !isprint(c)) { + dest.append("\\"); + dest.push_back(hex_char[c / 64]); + dest.push_back(hex_char[(c % 64) / 8]); + dest.push_back(hex_char[c % 8]); + } else { + dest.push_back(c); + break; + } + } + } + + return dest; +} + +namespace { // Private helpers for CUnescape(). + +inline bool is_octal_digit(unsigned char c) { return c >= '0' && c <= '7'; } + +inline bool ascii_isxdigit(unsigned char c) { + return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || + (c >= 'A' && c <= 'F'); +} + +inline int hex_digit_to_int(char c) { + int x = static_cast<unsigned char>(c); + if (x > '9') { + x += 9; + } + return x & 0xf; +} + +bool CUnescapeInternal(StringPiece source, char* dest, int* dest_len, + string* error) { + char* d = dest; + const char* p = source.data(); + const char* end = source.end(); + const char* last_byte = end - 1; + + // Small optimization for case where source = dest and there's no escaping + while (p == d && p < end && *p != '\\') p++, d++; + + while (p < end) { + if (*p != '\\') { + *d++ = *p++; + } else { + if (++p > last_byte) { // skip past the '\\' + if (error) *error = "String cannot end with \\"; + return false; + } + switch (*p) { + case 'a': + *d++ = '\a'; + break; + case 'b': + *d++ = '\b'; + break; + case 'f': + *d++ = '\f'; + break; + case 'n': + *d++ = '\n'; + break; + case 'r': + *d++ = '\r'; + break; + case 't': + *d++ = '\t'; + break; + case 'v': + *d++ = '\v'; + break; + case '\\': + *d++ = '\\'; + break; + case '?': + *d++ = '\?'; + break; // \? Who knew? + case '\'': + *d++ = '\''; + break; + case '"': + *d++ = '\"'; + break; + case '0': + case '1': + case '2': + case '3': // octal digit: 1 to 3 digits + case '4': + case '5': + case '6': + case '7': { + const char* octal_start = p; + unsigned int ch = *p - '0'; + if (p < last_byte && is_octal_digit(p[1])) ch = ch * 8 + *++p - '0'; + if (p < last_byte && is_octal_digit(p[1])) + ch = ch * 8 + *++p - '0'; // now points at last digit + if (ch > 0xff) { + if (error) { + *error = "Value of \\" + + string(octal_start, p + 1 - octal_start) + + " exceeds 0xff"; + } + return false; + } + *d++ = ch; + break; + } + case 'x': + case 'X': { + if (p >= last_byte) { + if (error) *error = "String cannot end with \\x"; + return false; + } else if (!ascii_isxdigit(p[1])) { + if (error) *error = "\\x cannot be followed by a non-hex digit"; + return false; + } + unsigned int ch = 0; + const char* hex_start = p; + while (p < last_byte && ascii_isxdigit(p[1])) + // Arbitrarily many hex digits + ch = (ch << 4) + hex_digit_to_int(*++p); + if (ch > 0xFF) { + if (error) { + *error = "Value of \\" + string(hex_start, p + 1 - hex_start) + + " exceeds 0xff"; + } + return false; + } + *d++ = ch; + break; + } + default: { + if (error) *error = string("Unknown escape sequence: \\") + *p; + return false; + } + } + p++; // read past letter we escaped + } + } + *dest_len = d - dest; + return true; +} + +} // namespace + +bool CUnescape(StringPiece source, string* dest, string* error) { + dest->resize(source.size()); + int dest_size; + if (!CUnescapeInternal(source, const_cast<char*>(dest->data()), &dest_size, + error)) { + return false; + } + dest->erase(dest_size); + return true; +} + +bool NumericParse32(const string& text, int32* val) { + // Slow, but this code is not performance critical, and this + // doesn't bring in any new dependencies + char junk; + if (sscanf(text.c_str(), "%d%c", val, &junk) == 1) { + return true; + } else { + return false; + } +} + +void StripTrailingWhitespace(string* s) { + string::size_type i; + for (i = s->size(); i > 0 && isspace((*s)[i - 1]); --i) { + } + s->resize(i); +} + +// Return lower-cased version of s. +string Lowercase(StringPiece s) { + string result(s.data(), s.size()); + for (char& c : result) { + c = tolower(c); + } + return result; +} + +// Return upper-cased version of s. +string Uppercase(StringPiece s) { + string result(s.data(), s.size()); + for (char& c : result) { + c = toupper(c); + } + return result; +} + +void TitlecaseString(string* s, StringPiece delimiters) { + bool upper = true; + for (string::iterator ss = s->begin(); ss != s->end(); ++ss) { + if (upper) { + *ss = toupper(*ss); + } + upper = (delimiters.find(*ss) != StringPiece::npos); + } +} + +size_t RemoveLeadingWhitespace(StringPiece* text) { + size_t count = 0; + const char* ptr = text->data(); + while (count < text->size() && isspace(*ptr)) { + count++; + ptr++; + } + text->remove_prefix(count); + return count; +} + +size_t RemoveTrailingWhitespace(StringPiece* text) { + size_t count = 0; + const char* ptr = text->data() + text->size() - 1; + while (count < text->size() && isspace(*ptr)) { + ++count; + --ptr; + } + text->remove_suffix(count); + return count; +} + +size_t RemoveWhitespaceContext(StringPiece* text) { + // use RemoveLeadingWhitespace() and RemoveTrailingWhitespace() to do the job + return (RemoveLeadingWhitespace(text) + RemoveTrailingWhitespace(text)); +} + +bool ConsumePrefix(StringPiece* s, StringPiece expected) { + if (s->starts_with(expected)) { + s->remove_prefix(expected.size()); + return true; + } + return false; +} + +bool ConsumeLeadingDigits(StringPiece* s, uint64* val) { + const char* p = s->data(); + const char* limit = p + s->size(); + uint64 v = 0; + while (p < limit) { + const char c = *p; + if (c < '0' || c > '9') break; + uint64 new_v = (v * 10) + (c - '0'); + if (new_v < v) { + // Overflow occurred + return false; + } + v = new_v; + p++; + } + if (p > s->data()) { + // Consume some digits + s->remove_prefix(p - s->data()); + *val = v; + return true; + } else { + return false; + } +} + +bool SplitAndParseAsInts(StringPiece text, char delim, + std::vector<int32>* result) { + result->clear(); + std::vector<string> num_strings = Split(text, delim); + for (const auto& s : num_strings) { + int32 num; + if (!NumericParse32(s, &num)) return false; + result->push_back(num); + } + return true; +} + +} // namespace str_util +} // namespace tensorflow |