diff options
author | misterg <misterg@google.com> | 2017-09-19 16:54:40 -0400 |
---|---|---|
committer | misterg <misterg@google.com> | 2017-09-19 16:54:40 -0400 |
commit | c2e754829628d1e9b7a16b3389cfdace76950fdf (patch) | |
tree | 5a7f056f44e27c30e10025113b644f0b3b5801fc /absl/strings/str_split.h |
Initial Commit
Diffstat (limited to 'absl/strings/str_split.h')
-rw-r--r-- | absl/strings/str_split.h | 511 |
1 files changed, 511 insertions, 0 deletions
diff --git a/absl/strings/str_split.h b/absl/strings/str_split.h new file mode 100644 index 00000000..a7b48b18 --- /dev/null +++ b/absl/strings/str_split.h @@ -0,0 +1,511 @@ +// +// Copyright 2017 The Abseil Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// ----------------------------------------------------------------------------- +// File: str_split.h +// ----------------------------------------------------------------------------- +// +// This file contains functions for splitting strings. It defines the main +// `StrSplit()` function, several delimiters for determining the boundaries on +// which to split the std::string, and predicates for filtering delimited results. +// `StrSplit()` adapts the returned collection to the type specified by the +// caller. +// +// Example: +// +// // Splits the given std::string on commas. Returns the results in a +// // vector of strings. +// std::vector<std::string> v = absl::StrSplit("a,b,c", ','); +// // Can also use "," +// // v[0] == "a", v[1] == "b", v[2] == "c" +// +// See StrSplit() below for more information. 
+#ifndef ABSL_STRINGS_STR_SPLIT_H_ +#define ABSL_STRINGS_STR_SPLIT_H_ + +#include <algorithm> +#include <cstddef> +#include <map> +#include <set> +#include <string> +#include <utility> +#include <vector> + +#include "absl/base/internal/raw_logging.h" +#include "absl/strings/internal/str_split_internal.h" +#include "absl/strings/string_view.h" +#include "absl/strings/strip.h" + +namespace absl { + +//------------------------------------------------------------------------------ +// Delimiters +//------------------------------------------------------------------------------ +// +// `StrSplit()` uses delimiters to define the boundaries between elements in the +// provided input. Several `Delimiter` types are defined below. If a std::string +// (`const char*`, `std::string`, or `absl::string_view`) is passed in place of +// an explicit `Delimiter` object, `StrSplit()` treats it the same way as if it +// were passed a `ByString` delimiter. +// +// A `Delimiter` is an object with a `Find()` function that knows how to find +// the first occurrence of itself in a given `absl::string_view`. +// +// The following `Delimiter` types are available for use within `StrSplit()`: +// +// - `ByString` (default for std::string arguments) +// - `ByChar` (default for a char argument) +// - `ByAnyChar` +// - `ByLength` +// - `MaxSplits` +// +// +// A Delimiter's Find() member function will be passed the input text that is to +// be split and the position to begin searching for the next delimiter in the +// input text. The returned absl::string_view should refer to the next +// occurrence (after pos) of the represented delimiter; this returned +// absl::string_view represents the next location where the input std::string should +// be broken. The returned absl::string_view may be zero-length if the Delimiter +// does not represent a part of the std::string (e.g., a fixed-length delimiter). 
// If no delimiter is found in the given text, a zero-length absl::string_view
// referring to text.end() should be returned (e.g.,
// absl::string_view(text.end(), 0)). It is important that the returned
// absl::string_view always be within the bounds of input text given as an
// argument--it must not refer to a string that is physically located outside
// of the given string.
//
// The following example is a simple Delimiter object that is created with a
// single char and will look for that char in the text passed to the Find()
// function:
//
//   struct SimpleDelimiter {
//     const char c_;
//     explicit SimpleDelimiter(char c) : c_(c) {}
//     absl::string_view Find(absl::string_view text, size_t pos) {
//       auto found = text.find(c_, pos);
//       if (found == absl::string_view::npos)
//         return absl::string_view(text.end(), 0);
//
//       return text.substr(found, 1);
//     }
//   };

// ByString
//
// A substring delimiter. If `StrSplit()` is passed a string in place of a
// `Delimiter` object, the string will be implicitly converted into a
// `ByString` delimiter.
//
// Example:
//
//   // Because a string literal is converted to an `absl::ByString`,
//   // the following two splits are equivalent.
//
//   std::vector<std::string> v1 = absl::StrSplit("a, b, c", ", ");
//
//   using absl::ByString;
//   std::vector<std::string> v2 = absl::StrSplit("a, b, c",
//                                                ByString(", "));
//   // v1 and v2 both hold: [0] == "a", [1] == "b", [2] == "c"
class ByString {
 public:
  // Both members are declared here only; no definition appears in this header.
  explicit ByString(absl::string_view sp);
  absl::string_view Find(absl::string_view text, size_t pos) const;

 private:
  const std::string delimiter_;
};

// ByChar
//
// A single character delimiter. `ByChar` is functionally equivalent to a
// 1-char string within a `ByString` delimiter, but slightly more
// efficient.
//
// Example:
//
//   // Because a char literal is converted to an absl::ByChar,
//   // the following two splits are equivalent.
//   std::vector<std::string> v1 = absl::StrSplit("a,b,c", ',');
//   using absl::ByChar;
//   std::vector<std::string> v2 = absl::StrSplit("a,b,c", ByChar(','));
//   // v1 and v2 both hold: [0] == "a", [1] == "b", [2] == "c"
//
// `ByChar` is also the default delimiter if a single character is given
// as the delimiter to `StrSplit()`. For example, the following calls are
// equivalent:
//
//   std::vector<std::string> v = absl::StrSplit("a-b", '-');
//
//   using absl::ByChar;
//   std::vector<std::string> v = absl::StrSplit("a-b", ByChar('-'));
//
class ByChar {
 public:
  // Stores the delimiter character; `Find()` is declared here only.
  explicit ByChar(char c) : c_(c) {}
  absl::string_view Find(absl::string_view text, size_t pos) const;

 private:
  char c_;
};

// ByAnyChar
//
// A delimiter that will match any of the given byte-sized characters within
// its provided string.
//
// Note: this delimiter works with single-byte string data, but does not work
// with variable-width encodings, such as UTF-8.
//
// Example:
//
//   using absl::ByAnyChar;
//   std::vector<std::string> v = absl::StrSplit("a,b=c", ByAnyChar(",="));
//   // v[0] == "a", v[1] == "b", v[2] == "c"
//
// If `ByAnyChar` is given the empty string, it behaves exactly like
// `ByString` and matches each individual character in the input string.
//
class ByAnyChar {
 public:
  // Both members are declared here only; no definition appears in this header.
  explicit ByAnyChar(absl::string_view sp);
  absl::string_view Find(absl::string_view text, size_t pos) const;

 private:
  const std::string delimiters_;
};

// ByLength
//
// A delimiter for splitting into equal-length strings. The length argument to
// the constructor must be greater than 0.
//
// Note: this delimiter works with single-byte string data, but does not work
// with variable-width encodings, such as UTF-8.
//
// Example:
//
//   using absl::ByLength;
//   std::vector<std::string> v = absl::StrSplit("123456789", ByLength(3));
//
//   // v[0] == "123", v[1] == "456", v[2] == "789"
//
// Note that the string's length does not have to be a multiple of the fixed
// split length. In such a case, the last substring will be shorter.
//
//   using absl::ByLength;
//   std::vector<std::string> v = absl::StrSplit("12345", ByLength(2));
//
//   // v[0] == "12", v[1] == "34", v[2] == "5"
class ByLength {
 public:
  // Both members are declared here only; no definition appears in this header.
  explicit ByLength(ptrdiff_t length);
  absl::string_view Find(absl::string_view text, size_t pos) const;

 private:
  const ptrdiff_t length_;
};

namespace strings_internal {

// A traits-like metafunction for selecting the default Delimiter object type
// for a particular Delimiter type. The base case simply exposes type Delimiter
// itself as the delimiter's Type. However, there are specializations that map
// `char` to the ByChar delimiter object and string-like types (`char*`,
// `const char*`, `absl::string_view`, `std::string`) to the ByString delimiter
// object. This allows functions like absl::StrSplit() and absl::MaxSplits() to
// accept a char (e.g., ',') or a string-like object (e.g., ", ") as the
// delimiter argument and treat it as if the corresponding ByChar or ByString
// delimiter was given.
template <typename Delimiter>
struct SelectDelimiter {
  using type = Delimiter;
};

template <>
struct SelectDelimiter<char> {
  using type = ByChar;
};
template <>
struct SelectDelimiter<char*> {
  using type = ByString;
};
template <>
struct SelectDelimiter<const char*> {
  using type = ByString;
};
template <>
struct SelectDelimiter<absl::string_view> {
  using type = ByString;
};
template <>
struct SelectDelimiter<std::string> {
  using type = ByString;
};

// Wraps another delimiter and sets a max number of matches for that delimiter.
+template <typename Delimiter> +class MaxSplitsImpl { + public: + MaxSplitsImpl(Delimiter delimiter, int limit) + : delimiter_(delimiter), limit_(limit), count_(0) {} + absl::string_view Find(absl::string_view text, size_t pos) { + if (count_++ == limit_) { + return absl::string_view(text.end(), 0); // No more matches. + } + return delimiter_.Find(text, pos); + } + + private: + Delimiter delimiter_; + const int limit_; + int count_; +}; + +} // namespace strings_internal + +// MaxSplits() +// +// A delimiter that limits the number of matches which can occur to the passed +// `limit`. The last element in the returned collection will contain all +// remaining unsplit pieces, which may contain instances of the delimiter. +// The collection will contain at most `limit` + 1 elements. +// Example: +// +// using absl::MaxSplits; +// std::vector<std::string> v = absl::StrSplit("a,b,c", MaxSplits(',', 1)); +// +// // v[0] == "a", v[1] == "b,c" +template <typename Delimiter> +inline strings_internal::MaxSplitsImpl< + typename strings_internal::SelectDelimiter<Delimiter>::type> +MaxSplits(Delimiter delimiter, int limit) { + typedef + typename strings_internal::SelectDelimiter<Delimiter>::type DelimiterType; + return strings_internal::MaxSplitsImpl<DelimiterType>( + DelimiterType(delimiter), limit); +} + +//------------------------------------------------------------------------------ +// Predicates +//------------------------------------------------------------------------------ +// +// Predicates filter the results of a `StrSplit()` by determining whether or not +// a resultant element is included in the result set. A predicate may be passed +// as an optional third argument to the `StrSplit()` function. +// +// Predicates are unary functions (or functors) that take a single +// `absl::string_view` argument and return a bool indicating whether the +// argument should be included (`true`) or excluded (`false`). 
+// +// Predicates are useful when filtering out empty substrings. By default, empty +// substrings may be returned by `StrSplit()`, which is similar to the way split +// functions work in other programming languages. + +// AllowEmpty() +// +// Always returns `true`, indicating that all strings--including empty +// strings--should be included in the split output. This predicate is not +// strictly needed because this is the default behavior of `StrSplit()`; +// however, it might be useful at some call sites to make the intent explicit. +// +// Example: +// +// std::vector<std::string> v = absl::StrSplit(" a , ,,b,", ',', AllowEmpty()); +// +// // v[0] == " a ", v[1] == " ", v[2] == "", v[3] = "b", v[4] == "" +struct AllowEmpty { + bool operator()(absl::string_view) const { return true; } +}; + +// SkipEmpty() +// +// Returns `false` if the given `absl::string_view` is empty, indicating that +// `StrSplit()` should omit the empty std::string. +// +// Example: +// +// std::vector<std::string> v = absl::StrSplit(",a,,b,", ',', SkipEmpty()); +// +// // v[0] == "a", v[1] == "b" +// +// Note: `SkipEmpty()` does not consider a std::string containing only whitespace +// to be empty. To skip such whitespace as well, use the `SkipWhitespace()` +// predicate. +struct SkipEmpty { + bool operator()(absl::string_view sp) const { return !sp.empty(); } +}; + +// SkipWhitespace() +// +// Returns `false` if the given `absl::string_view` is empty *or* contains only +// whitespace, indicating that `StrSplit()` should omit the std::string. 
+// +// Example: +// +// std::vector<std::string> v = absl::StrSplit(" a , ,,b,", +// ',', SkipWhitespace()); +// // v[0] == " a ", v[1] == "b" +// +// // SkipEmpty() would return whitespace elements +// std::vector<std::string> v = absl::StrSplit(" a , ,,b,", ',', SkipEmpty()); +// // v[0] == " a ", v[1] == " ", v[2] == "b" +struct SkipWhitespace { + bool operator()(absl::string_view sp) const { + sp = absl::StripAsciiWhitespace(sp); + return !sp.empty(); + } +}; + +//------------------------------------------------------------------------------ +// StrSplit() +//------------------------------------------------------------------------------ + +// StrSplit() +// +// Splits a given `std::string` based on the provided `Delimiter` object, +// returning the elements within the type specified by the caller. Optionally, +// you may also pass a `Predicate` to `StrSplit()` indicating whether to include +// or exclude the resulting element within the final result set. (See the +// overviews for Delimiters and Predicates above.) +// +// Example: +// +// std::vector<std::string> v = absl::StrSplit("a,b,c,d", ','); +// // v[0] == "a", v[1] == "b", v[2] == "c", v[3] == "d" +// +// You can also provide an explicit `Delimiter` object: +// +// Example: +// +// using absl::ByAnyChar; +// std::vector<std::string> v = absl::StrSplit("a,b=c", ByAnyChar(",=")); +// // v[0] == "a", v[1] == "b", v[3] == "c" +// +// See above for more information on delimiters. +// +// By default, empty strings are included in the result set. You can optionally +// include a third `Predicate` argument to apply a test for whether the +// resultant element should be included in the result set: +// +// Example: +// +// std::vector<std::string> v = absl::StrSplit(" a , ,,b,", +// ',', SkipWhitespace()); +// // v[0] == "a", v[1] == "b" +// +// See above for more information on predicates. 
+// +//------------------------------------------------------------------------------ +// StrSplit() Return Types +//------------------------------------------------------------------------------ +// +// The `StrSplit()` function adapts the returned collection to the collection +// specified by the caller (e.g. `std::vector` above). The returned collections +// may contain `std::string`, `absl::string_view` (in which case the caller must +// ensure that the original string being split outlives the collection), or any +// object that can be explicitly created from an `absl::string_view`. This +// behavior works for: +// +// 1) All standard STL containers including `std::vector`, `std::list`, +// `std::deque`, `std::set`, `std::multiset`, `std::map`, and `std::multimap` +// 2) `std::pair` (which is not actually a container). See below. +// +// Example: +// +// // The results are returned as `absl::string_view` objects. Note that we +// // have to ensure that the input string outlives any results. +// std::vector<absl::string_view> v = absl::StrSplit("a,b,c", ','); +// +// // Stores results in a std::set<std::string>, which also performs +// // de-duplication and orders the elements in ascending order. +// std::set<std::string> a = absl::StrSplit("b,a,c,a,b", ','); +// // a == {"a", "b", "c"} +// +// // `StrSplit()` can be used within a range-based for loop, in which case +// // each element will be of type `absl::string_view`. +// std::vector<std::string> v; +// for (const auto sv : absl::StrSplit("a,b,c", ',')) { +// if (sv != "b") v.emplace_back(sv); +// } +// // v[0] == "a", v[1] == "c" +// +// // Stores results in a map. The map implementation assumes that the input +// // is provided as a series of key/value pairs. For example, the 0th element +// // resulting from the split will be stored as a key to the 1st element. 
If +// // an odd number of elements is resolved, the last element is paired with +// // a default-constructed value (e.g., empty std::string). +// std::map<std::string, std::string> m = absl::StrSplit("a,b,c", ','); +// // m["a"] == "b", m["c"] == "" // last component value equals "" +// +// Splitting to `std::pair` is an interesting case because it can hold only two +// elements and is not a collection type. When splitting to a `std::pair` the +// first two split strings become the `std::pair` `.first` and `.second` +// members, respectively. The remaining split substrings are discarded. If there +// are fewer than two split substrings, an empty string is used for the +// corresponding `std::pair` member. +// +// Example: +// +// // Stores first two split strings as the members in a std::pair. +// std::pair<std::string, std::string> p = absl::StrSplit("a,b,c", ','); +// // p.first == "a", p.second == "b" // "c" is omitted. +// +// The `StrSplit()` function can be used multiple times to perform more +// complicated splitting logic, such as intelligently parsing key-value pairs. +// +// Example: +// +// // The input string "a=b=c,d=e,f=,g" becomes +// // { "a" => "b=c", "d" => "e", "f" => "", "g" => "" } +// std::map<std::string, std::string> m; +// for (absl::string_view sp : absl::StrSplit("a=b=c,d=e,f=,g", ',')) { +// m.insert(absl::StrSplit(sp, absl::MaxSplits('=', 1))); +// } +// EXPECT_EQ("b=c", m.find("a")->second); +// EXPECT_EQ("e", m.find("d")->second); +// EXPECT_EQ("", m.find("f")->second); +// EXPECT_EQ("", m.find("g")->second); +// +// WARNING: Due to a legacy bug that is maintained for backward compatibility, +// splitting the following empty string_views produces different results: +// +// absl::StrSplit(absl::string_view(""), '-'); // {""} +// absl::StrSplit(absl::string_view(), '-'); // {}, but should be {""} +// +// Try not to depend on this distinction because the bug may one day be fixed. 
+template <typename Delimiter> +strings_internal::Splitter< + typename strings_internal::SelectDelimiter<Delimiter>::type, AllowEmpty> +StrSplit(strings_internal::ConvertibleToStringView text, Delimiter d) { + using DelimiterType = + typename strings_internal::SelectDelimiter<Delimiter>::type; + return strings_internal::Splitter<DelimiterType, AllowEmpty>( + std::move(text), DelimiterType(d), AllowEmpty()); +} + +template <typename Delimiter, typename Predicate> +strings_internal::Splitter< + typename strings_internal::SelectDelimiter<Delimiter>::type, Predicate> +StrSplit(strings_internal::ConvertibleToStringView text, Delimiter d, + Predicate p) { + using DelimiterType = + typename strings_internal::SelectDelimiter<Delimiter>::type; + return strings_internal::Splitter<DelimiterType, Predicate>( + std::move(text), DelimiterType(d), std::move(p)); +} + +} // namespace absl + +#endif // ABSL_STRINGS_STR_SPLIT_H_ |