From 0bfe054e0e93cf0c0a19f63eb2cfb6b4afd88ef7 Mon Sep 17 00:00:00 2001 From: Benjamin Barenblat Date: Fri, 3 Jul 2015 15:52:18 -0400 Subject: Initial commit of the regex matcher MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wrap glibc’s regex engine to allow matching and group capture in POSIX extended regular expressions. It might be worth rewriting this in terms of the C++11 regex engine; it’s more featureful and more pleasant to use, although it would require more casting. (C can’t represent the std::regex type, so I’d need to use some void pointers.) --- src/regex__FFI.cc | 192 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 192 insertions(+) create mode 100644 src/regex__FFI.cc (limited to 'src/regex__FFI.cc') diff --git a/src/regex__FFI.cc b/src/regex__FFI.cc new file mode 100644 index 0000000..403171f --- /dev/null +++ b/src/regex__FFI.cc @@ -0,0 +1,192 @@ +// Copyright (C) 2015 the Massachusetts Institute of Technology +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not +// use this file except in compliance with the License. You may obtain a copy +// of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations under +// the License. + +#include "regex__FFI.h" + +#include +#include + +#include + +extern "C" { +#include +} + +#include "config.h" + +namespace { + +using Regex = uw_Regex__FFI_regex; +using Match = uw_Regex__FFI_match; + +// Asserts a condition without crashing or releasing information about where the +// error occurred. This function is essential for web programming, where an +// attacker should not be able to bring down the app by causing an assertion +// failure. +void Assert(uw_context* const context, const bool condition, + const failure_kind action, const char* const message) { + if (!condition) { + uw_error(context, action, message); + } +} + +void Assert(uw_context* const context, + const bool condition, const char* const message) { + Assert(context, condition, FATAL, message); +} + +void FinalizeRegex(void* regex, [[gnu::unused]] const int _will_retry) { + regfree(reinterpret_cast(regex)); +} + +void DeleteRegex(void* regex, [[gnu::unused]] const int _will_retry) { + delete reinterpret_cast(regex); +} + +} // namespace + +uw_Basis_bool uw_Regex__FFI_succeeded( + [[gnu::unused]] struct uw_context* _context, + const Match match) { + return match.succeeded ? uw_Basis_True : uw_Basis_False; +} + +uw_Basis_int uw_Regex__FFI_n_subexpression_matches( + [[gnu::unused]] struct uw_context* _context, + const Match match) { + return match.n_matches; +} + +uw_Basis_string uw_Regex__FFI_subexpression_match( + struct uw_context* context, + const Match match, + const uw_Basis_int match_index) { + Assert(context, match.matches[match_index].rm_so != -1, + "regex: match does not exist"); + // Locate the substring in the string to match aginst. + const char* const substring_start = + match.haystack + match.matches[match_index].rm_so; + // Copy it into its own buffer so we can properly null-terminate it. + const std::size_t substring_length = + static_cast(match.matches[match_index].rm_eo + - match.matches[match_index].rm_so); + uw_Basis_string result = reinterpret_cast( + uw_malloc(context, substring_length + 1)); + std::memcpy(result, substring_start, substring_length); + result[substring_length] = '\0'; + return result; +} + +Regex uw_Regex__FFI_compile(uw_context* const context, + const uw_Basis_bool case_sensitive, + const uw_Basis_string input) { + Regex result; + result.text = input; + // We'd like to stack-allocate the compiled field of the Regex struct--or, at + // least, to allocate it with uw_malloc. Unfortunately, neither of those will + // work, because we need to be able to run a finalizer on it, and Ur + // finalizers can only reference addresses that are not managed by Ur. + result.compiled = new regex_t; + Assert(context, + uw_register_transactional(context, result.compiled, + nullptr, nullptr, DeleteRegex) == 0, + "regex: could not register DeleteRegex finalizer"); + // Compile the regex. + const auto flags = REG_EXTENDED | (case_sensitive ? 0 : REG_ICASE); + switch (const auto regcomp_error = regcomp(result.compiled, input, flags)) { + case 0: + // Everything worked perfectly. + break; + case REG_ESPACE: + // We ran out of memory. + uw_error(context, BOUNDED_RETRY, "regex: could not allocate"); + default: + // Something else happened. Generate a nice message for the user. + const auto message_size = + regerror(regcomp_error, result.compiled, nullptr, 0); + char* const message = + reinterpret_cast(uw_malloc(context, message_size)); + Assert(context, + regerror(regcomp_error, result.compiled, message, + message_size) == message_size, + "regex: compilation failed, but error message could not be" + " generated"); + uw_error(context, FATAL, "regex: compilation failed: %s", message); + } + Assert(context, + uw_register_transactional(context, result.compiled, + nullptr, nullptr, FinalizeRegex) == 0, + "regex: could not register FinalizeRegex finalizer"); + // Give the caller the regex. + return result; +} + +Match uw_Regex__FFI_do_match(uw_context* const context, const Regex needle, + const uw_Basis_string haystack) { + Match result; + // Make a duplicate of the string to match against, so if it goes out of scope + // in the calling Ur code, we still have it. TODO(bbaren): Is this necessary? + result.haystack = + reinterpret_cast( + uw_malloc(context, std::strlen(haystack))); + std::strcpy(result.haystack, haystack); + // Figure out how many groups we could have so we can allocate enough space to + // store the match information. + result.n_matches = 0; + for (std::size_t i = 0; i < std::strlen(needle.text); i++) { + switch (needle.text[i]) { + case '\\': + // The next character is escaped, so it can't possibly be the + // metacharacter '('. Skip it. + i++; + break; + case '(': + // That's our metacharacter. + result.n_matches++; + break; + default: + // Nothing interesting. + break; + } + } + // Allocate to store the match information. Allocate one more slot than we + // need, because the regex engine puts information about the entire match in + // the first slot. + result.matches = + reinterpret_cast( + uw_malloc(context, (result.n_matches + 1) * sizeof(regmatch_t))); + // Execute the regex. + switch (regexec(needle.compiled, haystack, result.n_matches + 1, + result.matches, 0)) { + case 0: + // A match occurred. + result.succeeded = 1; + // Bump the matches array to skip information about the entire match. + result.matches++; + break; + case REG_NOMATCH: + // No match occurred. + result.succeeded = 0; + result.n_matches = 0; + result.matches = nullptr; + break; + case REG_ESPACE: + // We ran out of memory. + uw_error(context, BOUNDED_RETRY, "regex: could not allocate"); + default: + // Some unknown error occurred. + uw_error(context, FATAL, "regex: could not execute regular expression"); + } + return result; +} -- cgit v1.2.3