From a8b108619083c2088269ea8071c7958f277ed41f Mon Sep 17 00:00:00 2001 From: Benjamin Barenblat Date: Mon, 27 Jul 2015 17:03:43 -0400 Subject: Rework to use the C++11 regex library Switch to using the C++11 regex library for better portability and ease of use. As an added bonus, this should make it easier to implement regex substitution. --- Makefile.am | 3 +- configure.ac | 4 - src/regex__FFI.cc | 223 +++++++++++++++++++++++++----------------------------- src/regex__FFI.h | 9 +-- 4 files changed, 106 insertions(+), 133 deletions(-) diff --git a/Makefile.am b/Makefile.am index b3926a2..3cc46e9 100644 --- a/Makefile.am +++ b/Makefile.am @@ -29,7 +29,8 @@ CXXFLAGS = \ CXXFLAGS += \ -Weverything \ -Wno-c++98-compat \ - -Wno-padded + -Wno-padded \ + -Wno-switch-enum LDFLAGS = \ -Wl,-Bsymbolic-functions \ diff --git a/configure.ac b/configure.ac index a9c1c74..9140509 100644 --- a/configure.ac +++ b/configure.ac @@ -24,10 +24,6 @@ AC_CONFIG_HEADERS([config.h]) AC_PROG_CXX([clang++]) AX_CXX_COMPILE_STDCXX_11([noext], [mandatory]) -AC_CHECK_HEADERS([regex.h], - [], - [AC_MSG_FAILURE([a POSIX-compatible regex.h is required])]) - AC_CHECK_HEADERS([urweb/urweb_cpp.h], [], [AC_MSG_FAILURE([Ur/Web headers are required])]) diff --git a/src/regex__FFI.cc b/src/regex__FFI.cc index 403171f..0ea3455 100644 --- a/src/regex__FFI.cc +++ b/src/regex__FFI.cc @@ -14,10 +14,11 @@ #include "regex__FFI.h" -#include #include #include +#include +#include extern "C" { #include @@ -27,9 +28,6 @@ extern "C" { namespace { -using Regex = uw_Regex__FFI_regex; -using Match = uw_Regex__FFI_match; - // Asserts a condition without crashing or releasing information about where the // error occurred. This function is essential for web programming, where an // attacker should not be able to bring down the app by causing an assertion @@ -46,147 +44,130 @@ void Assert(uw_context* const context, Assert(context, condition, FATAL, message); } -void FinalizeRegex(void* regex, [[gnu::unused]] const int _will_retry) { - regfree(reinterpret_cast(regex)); +void DeleteRegex(void* regex, [[gnu::unused]] const int _will_retry) { + delete reinterpret_cast(regex); } -void DeleteRegex(void* regex, [[gnu::unused]] const int _will_retry) { - delete reinterpret_cast(regex); +void DeleteMatchResults(void* match_result, + [[gnu::unused]] const int _will_retry) { + delete reinterpret_cast(match_result); +} + +// Bounds-checked numeric type conversion +template +U Number(uw_context* const context, const T input) { + Assert(context, input <= std::numeric_limits::max(), + "regex: detected overflow during numeric conversion"); + if (std::numeric_limits::is_signed == std::numeric_limits::is_signed) { + Assert(context, std::numeric_limits::lowest() <= input, + "regex: detected underflow during numeric conversion"); + } else if (std::numeric_limits::is_signed) { + Assert(context, 0 <= input, + "regex: detected underflow during numeric conversion"); + } + return static_cast(input); } } // namespace -uw_Basis_bool uw_Regex__FFI_succeeded( - [[gnu::unused]] struct uw_context* _context, - const Match match) { - return match.succeeded ? uw_Basis_True : uw_Basis_False; +uw_Basis_bool uw_Regex__FFI_succeeded([[gnu::unused]] uw_context* const context, + const uw_Regex__FFI_match match) { + if (reinterpret_cast(match.result)->empty()) { + return uw_Basis_False; + } else { + return uw_Basis_True; + } } uw_Basis_int uw_Regex__FFI_n_subexpression_matches( - [[gnu::unused]] struct uw_context* _context, - const Match match) { - return match.n_matches; + uw_context* const context, + const uw_Regex__FFI_match match) { + const std::cmatch::size_type n_matches = + reinterpret_cast(match.result)->size(); + if (n_matches == 0) { + // Nothing got matched. + return 0; + } else { + // At least one match occurred. Compute the number of parenthesized + // subexpressions that got matched, and return it. + return Number(context, n_matches) - 1; + } } uw_Basis_string uw_Regex__FFI_subexpression_match( - struct uw_context* context, - const Match match, - const uw_Basis_int match_index) { - Assert(context, match.matches[match_index].rm_so != -1, + uw_context* const context, + const uw_Regex__FFI_match match, + const uw_Basis_int match_index_signed) { + const std::cmatch* const match_result = + reinterpret_cast(match.result); + const std::size_t match_index = + Number(context, match_index_signed); + Assert(context, match_index < match_result->size(), "regex: match does not exist"); - // Locate the substring in the string to match aginst. - const char* const substring_start = - match.haystack + match.matches[match_index].rm_so; - // Copy it into its own buffer so we can properly null-terminate it. - const std::size_t substring_length = - static_cast(match.matches[match_index].rm_eo - - match.matches[match_index].rm_so); - uw_Basis_string result = reinterpret_cast( - uw_malloc(context, substring_length + 1)); - std::memcpy(result, substring_start, substring_length); - result[substring_length] = '\0'; + const auto matched_substring = (*match_result)[match_index + 1]; + // Save the matched substring. + const std::size_t result_length = + Number( + context, + matched_substring.length()); + uw_Basis_string result = + reinterpret_cast( + uw_malloc(context, result_length + 1)); + std::strcpy(result, matched_substring.str().c_str()); return result; } -Regex uw_Regex__FFI_compile(uw_context* const context, - const uw_Basis_bool case_sensitive, - const uw_Basis_string input) { - Regex result; - result.text = input; - // We'd like to stack-allocate the compiled field of the Regex struct--or, at - // least, to allocate it with uw_malloc. Unfortunately, neither of those will - // work, because we need to be able to run a finalizer on it, and Ur - // finalizers can only reference addresses that are not managed by Ur. - result.compiled = new regex_t; +uw_Regex__FFI_regex uw_Regex__FFI_compile(uw_context* const context, + const uw_Basis_bool case_sensitive, + const uw_Basis_string input) { + // We'd like to stack-allocate the result--or, at least, to allocate it with + // uw_malloc. Unfortunately, neither of those will work, because we need to + // run a finalizer on it, and Ur finalizers can only reference addresses that + // are not managed by Ur. + auto* result = new std::regex; Assert(context, - uw_register_transactional(context, result.compiled, + uw_register_transactional(context, result, nullptr, nullptr, DeleteRegex) == 0, "regex: could not register DeleteRegex finalizer"); - // Compile the regex. - const auto flags = REG_EXTENDED | (case_sensitive ? 0 : REG_ICASE); - switch (const auto regcomp_error = regcomp(result.compiled, input, flags)) { - case 0: - // Everything worked perfectly. - break; - case REG_ESPACE: - // We ran out of memory. - uw_error(context, BOUNDED_RETRY, "regex: could not allocate"); - default: - // Something else happened. Generate a nice message for the user. - const auto message_size = - regerror(regcomp_error, result.compiled, nullptr, 0); - char* const message = - reinterpret_cast(uw_malloc(context, message_size)); - Assert(context, - regerror(regcomp_error, result.compiled, message, - message_size) == message_size, - "regex: compilation failed, but error message could not be" - " generated"); - uw_error(context, FATAL, "regex: compilation failed: %s", message); + auto flags = std::regex_constants::extended; + if (!case_sensitive) { + flags |= std::regex_constants::icase; + } + try { + result->assign(input, flags); + } catch (const std::regex_error& e) { + switch (e.code()) { + case std::regex_constants::error_space: + case std::regex_constants::error_stack: + // We ran out of memory. + uw_error(context, BOUNDED_RETRY, + "regex: compilation failed: %s", e.what()); + default: + uw_error(context, FATAL, + "regex: compilation failed: %s", e.what()); + } } - Assert(context, - uw_register_transactional(context, result.compiled, - nullptr, nullptr, FinalizeRegex) == 0, - "regex: could not register FinalizeRegex finalizer"); - // Give the caller the regex. return result; } -Match uw_Regex__FFI_do_match(uw_context* const context, const Regex needle, - const uw_Basis_string haystack) { - Match result; - // Make a duplicate of the string to match against, so if it goes out of scope - // in the calling Ur code, we still have it. TODO(bbaren): Is this necessary? +uw_Regex__FFI_match uw_Regex__FFI_do_match(uw_context* const context, + const uw_Regex__FFI_regex needle, + const uw_Basis_string haystack) { + uw_Regex__FFI_match result; + // Make a duplicate of the string to match against, so if it goes out of + // scope in the calling Ur code, we still have it. result.haystack = - reinterpret_cast( - uw_malloc(context, std::strlen(haystack))); + reinterpret_cast(uw_malloc(context, std::strlen(haystack))); std::strcpy(result.haystack, haystack); - // Figure out how many groups we could have so we can allocate enough space to - // store the match information. - result.n_matches = 0; - for (std::size_t i = 0; i < std::strlen(needle.text); i++) { - switch (needle.text[i]) { - case '\\': - // The next character is escaped, so it can't possibly be the - // metacharacter '('. Skip it. - i++; - break; - case '(': - // That's our metacharacter. - result.n_matches++; - break; - default: - // Nothing interesting. - break; - } - } - // Allocate to store the match information. Allocate one more slot than we - // need, because the regex engine puts information about the entire match in - // the first slot. - result.matches = - reinterpret_cast( - uw_malloc(context, (result.n_matches + 1) * sizeof(regmatch_t))); - // Execute the regex. - switch (regexec(needle.compiled, haystack, result.n_matches + 1, - result.matches, 0)) { - case 0: - // A match occurred. - result.succeeded = 1; - // Bump the matches array to skip information about the entire match. - result.matches++; - break; - case REG_NOMATCH: - // No match occurred. - result.succeeded = 0; - result.n_matches = 0; - result.matches = nullptr; - break; - case REG_ESPACE: - // We ran out of memory. - uw_error(context, BOUNDED_RETRY, "regex: could not allocate"); - default: - // Some unknown error occurred. - uw_error(context, FATAL, "regex: could not execute regular expression"); - } + // Allocate to store the match information. + auto* match_results = new std::cmatch; + Assert(context, + uw_register_transactional(context, match_results, + nullptr, nullptr, DeleteMatchResults) == 0, + "regex: could not register DeleteMatchResults finalizer"); + result.result = match_results; + // Execute the regex on the saved haystack, not the original one. + std::regex_search(result.haystack, *match_results, + *reinterpret_cast(needle)); return result; } diff --git a/src/regex__FFI.h b/src/regex__FFI.h index 6fa9482..9d6a2c2 100644 --- a/src/regex__FFI.h +++ b/src/regex__FFI.h @@ -22,16 +22,11 @@ extern "C" { #include -typedef struct { - char* text; - void* compiled; -} uw_Regex__FFI_regex; +typedef void* uw_Regex__FFI_regex; typedef struct { char* haystack; - int succeeded; - unsigned n_matches; - regmatch_t* matches; + void* result; } uw_Regex__FFI_match; uw_Basis_bool uw_Regex__FFI_succeeded(struct uw_context*, -- cgit v1.2.3