From 0bfe054e0e93cf0c0a19f63eb2cfb6b4afd88ef7 Mon Sep 17 00:00:00 2001 From: Benjamin Barenblat Date: Fri, 3 Jul 2015 15:52:18 -0400 Subject: Initial commit of the regex matcher MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wrap glibc’s regex engine to allow matching and group capture in POSIX extended regular expressions. It might be worth rewriting this in terms of the C++11 regex engine; it’s more featureful and more pleasant to use, although it would require more casting. (C can’t represent the std::regex type, so I’d need to use some void pointers.) --- src/lib.urp | 5 ++ src/regex.ur | 44 ++++++++++++ src/regex.urs | 77 +++++++++++++++++++++ src/regex__FFI.cc | 192 +++++++++++++++++++++++++++++++++++++++++++++++++++++ src/regex__FFI.h | 60 +++++++++++++++++ src/regex__FFI.urs | 34 ++++++++++ 6 files changed, 412 insertions(+) create mode 100644 src/lib.urp create mode 100644 src/regex.ur create mode 100644 src/regex.urs create mode 100644 src/regex__FFI.cc create mode 100644 src/regex__FFI.h create mode 100644 src/regex__FFI.urs (limited to 'src') diff --git a/src/lib.urp b/src/lib.urp new file mode 100644 index 0000000..9f95450 --- /dev/null +++ b/src/lib.urp @@ -0,0 +1,5 @@ +ffi regex__FFI +include regex__FFI.h +link -lurweb_regex + +regex \ No newline at end of file diff --git a/src/regex.ur b/src/regex.ur new file mode 100644 index 0000000..ddc7793 --- /dev/null +++ b/src/regex.ur @@ -0,0 +1,44 @@ +(* Copyright 2015 the Massachusetts Institute of Technology + +Licensed under the Apache License, Version 2.0 (the "License"); you may not use +this file except in compliance with the License. You may obtain a copy of the +License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed +under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +CONDITIONS OF ANY KIND, either express or implied. See the License for the +specific language governing permissions and limitations under the License. *) + +structure FFI = Regex__FFI + +type t = FFI.regex + +val compile = FFI.compile True + +val compile_case_insensitive = FFI.compile False + +fun match regex input = + (* Perform the match. *) + let + val result = FFI.do_match regex input + in + if not (FFI.succeeded result) + then + (* No match occurred. *) + None + else + (* Get the subexpressions. We must do this iteratively, as the Regex__FFI + API can't return a list of matches. *) + let + fun loop i = + if i = FFI.n_subexpression_matches result + then + (* We've got all the subexpressions. *) + [] + else FFI.subexpression_match result i :: loop (i + 1) + in + Some (loop 0) + end + end diff --git a/src/regex.urs b/src/regex.urs new file mode 100644 index 0000000..15ce216 --- /dev/null +++ b/src/regex.urs @@ -0,0 +1,77 @@ +(* Copyright 2015 the Massachusetts Institute of Technology + +Licensed under the Apache License, Version 2.0 (the "License"); you may not use +this file except in compliance with the License. You may obtain a copy of the +License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed +under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +CONDITIONS OF ANY KIND, either express or implied. See the License for the +specific language governing permissions and limitations under the License. *) + +(* Regular expression matching + +This library implements POSIX extended regular expressions, which most closely +match what 'normal' people think about when they hear 'regular expressions'. +Here's a brief syntax reminder: + + .[]^$()\*{}?+| are metacharacters and must be backslash-escaped if you want to + use them. (Remember, in Ur/Web, backslash is also the string escape + character, so if you want to match a literal open brace, you need to specify + "\\{", if you want to match a literal backslash, you need to specify "\\\\", + etc.) + + . matches any character + x? matches 'x' zero or one time + x* matches 'x' zero or more times + x+ matches 'x' one or more times + x{3,5} matches 'xxx', 'xxxx', and 'xxxxx' + + ^ matches the start of a line + $ matches the end of a line + + [abcx-z] matches 'a', 'b', 'c', 'x', 'y', or 'z' + [^a-z] matches any single character not equal to 'a', 'b', ..., or 'z' + + (abc) matches the string 'abc' and saves it as a marked subexpression + \3 matches the 3rd marked subexpression + + Character classes may be used inside bracket expressions: + [:alnum:] [A-Za-z0-9] alphanumeric characters + [:alpha:] [A-Za-z] alphabetic characters + [:blank:] [ \t] space and tab + [:cntrl:] [\x00-\x1F\x7F] control characters + [:digit:] [0-9] digits + [:graph:] [\x21-\x7E] visible characters + [:lower:] [a-z] lowercase letters + [:print:] [\x20-\x7E] visible characters and the space character + [:punct:] [][!"#$%&'()*+,./:;<=>?@\^_`{|}~-] punctuation characters + [:space:] [ \t\r\n\v\f] whitespace characters + [:upper:] [A-Z] uppercase letters + [:xdigit:] [A-Fa-f0-9] Hexadecimal digits + So if you want to match all duodecimal digits, you can specify + '[[:digit:]A-Ba-b]'. If you simply want all decimal digits, you need + '[[:digit:]]'. *) + + +(* Creating *) + +(* A compiled regular expression. *) +type t + +(* Compiles a regular expression from a POSIX extended regular expression +string. *) +val compile : string -> t + +(* Compiles a case-insensitive regular expression from a POSIX extended regular expression string. *) +val compile_case_insensitive : string -> t + + +(* Searching *) + +(* Matches a regular expression against any part of a string. Returns 'Some +strs', where 'strs' is a list of subexpression matches, if a match succeeds, and +'None' otherwise. *) +val match : t -> string -> option (list string) diff --git a/src/regex__FFI.cc b/src/regex__FFI.cc new file mode 100644 index 0000000..403171f --- /dev/null +++ b/src/regex__FFI.cc @@ -0,0 +1,192 @@ +// Copyright (C) 2015 the Massachusetts Institute of Technology +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not +// use this file except in compliance with the License. You may obtain a copy +// of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations under +// the License. + +#include "regex__FFI.h" + +#include +#include + +#include + +extern "C" { +#include +} + +#include "config.h" + +namespace { + +using Regex = uw_Regex__FFI_regex; +using Match = uw_Regex__FFI_match; + +// Asserts a condition without crashing or releasing information about where the +// error occurred. This function is essential for web programming, where an +// attacker should not be able to bring down the app by causing an assertion +// failure. +void Assert(uw_context* const context, const bool condition, + const failure_kind action, const char* const message) { + if (!condition) { + uw_error(context, action, message); + } +} + +void Assert(uw_context* const context, + const bool condition, const char* const message) { + Assert(context, condition, FATAL, message); +} + +void FinalizeRegex(void* regex, [[gnu::unused]] const int _will_retry) { + regfree(reinterpret_cast(regex)); +} + +void DeleteRegex(void* regex, [[gnu::unused]] const int _will_retry) { + delete reinterpret_cast(regex); +} + +} // namespace + +uw_Basis_bool uw_Regex__FFI_succeeded( + [[gnu::unused]] struct uw_context* _context, + const Match match) { + return match.succeeded ? uw_Basis_True : uw_Basis_False; +} + +uw_Basis_int uw_Regex__FFI_n_subexpression_matches( + [[gnu::unused]] struct uw_context* _context, + const Match match) { + return match.n_matches; +} + +uw_Basis_string uw_Regex__FFI_subexpression_match( + struct uw_context* context, + const Match match, + const uw_Basis_int match_index) { + Assert(context, match.matches[match_index].rm_so != -1, + "regex: match does not exist"); + // Locate the substring in the string to match aginst. + const char* const substring_start = + match.haystack + match.matches[match_index].rm_so; + // Copy it into its own buffer so we can properly null-terminate it. + const std::size_t substring_length = + static_cast(match.matches[match_index].rm_eo + - match.matches[match_index].rm_so); + uw_Basis_string result = reinterpret_cast( + uw_malloc(context, substring_length + 1)); + std::memcpy(result, substring_start, substring_length); + result[substring_length] = '\0'; + return result; +} + +Regex uw_Regex__FFI_compile(uw_context* const context, + const uw_Basis_bool case_sensitive, + const uw_Basis_string input) { + Regex result; + result.text = input; + // We'd like to stack-allocate the compiled field of the Regex struct--or, at + // least, to allocate it with uw_malloc. Unfortunately, neither of those will + // work, because we need to be able to run a finalizer on it, and Ur + // finalizers can only reference addresses that are not managed by Ur. + result.compiled = new regex_t; + Assert(context, + uw_register_transactional(context, result.compiled, + nullptr, nullptr, DeleteRegex) == 0, + "regex: could not register DeleteRegex finalizer"); + // Compile the regex. + const auto flags = REG_EXTENDED | (case_sensitive ? 0 : REG_ICASE); + switch (const auto regcomp_error = regcomp(result.compiled, input, flags)) { + case 0: + // Everything worked perfectly. + break; + case REG_ESPACE: + // We ran out of memory. + uw_error(context, BOUNDED_RETRY, "regex: could not allocate"); + default: + // Something else happened. Generate a nice message for the user. + const auto message_size = + regerror(regcomp_error, result.compiled, nullptr, 0); + char* const message = + reinterpret_cast(uw_malloc(context, message_size)); + Assert(context, + regerror(regcomp_error, result.compiled, message, + message_size) == message_size, + "regex: compilation failed, but error message could not be" + " generated"); + uw_error(context, FATAL, "regex: compilation failed: %s", message); + } + Assert(context, + uw_register_transactional(context, result.compiled, + nullptr, nullptr, FinalizeRegex) == 0, + "regex: could not register FinalizeRegex finalizer"); + // Give the caller the regex. + return result; +} + +Match uw_Regex__FFI_do_match(uw_context* const context, const Regex needle, + const uw_Basis_string haystack) { + Match result; + // Make a duplicate of the string to match against, so if it goes out of scope + // in the calling Ur code, we still have it. TODO(bbaren): Is this necessary? + result.haystack = + reinterpret_cast( + uw_malloc(context, std::strlen(haystack))); + std::strcpy(result.haystack, haystack); + // Figure out how many groups we could have so we can allocate enough space to + // store the match information. + result.n_matches = 0; + for (std::size_t i = 0; i < std::strlen(needle.text); i++) { + switch (needle.text[i]) { + case '\\': + // The next character is escaped, so it can't possibly be the + // metacharacter '('. Skip it. + i++; + break; + case '(': + // That's our metacharacter. + result.n_matches++; + break; + default: + // Nothing interesting. + break; + } + } + // Allocate to store the match information. Allocate one more slot than we + // need, because the regex engine puts information about the entire match in + // the first slot. + result.matches = + reinterpret_cast( + uw_malloc(context, (result.n_matches + 1) * sizeof(regmatch_t))); + // Execute the regex. + switch (regexec(needle.compiled, haystack, result.n_matches + 1, + result.matches, 0)) { + case 0: + // A match occurred. + result.succeeded = 1; + // Bump the matches array to skip information about the entire match. + result.matches++; + break; + case REG_NOMATCH: + // No match occurred. + result.succeeded = 0; + result.n_matches = 0; + result.matches = nullptr; + break; + case REG_ESPACE: + // We ran out of memory. + uw_error(context, BOUNDED_RETRY, "regex: could not allocate"); + default: + // Some unknown error occurred. + uw_error(context, FATAL, "regex: could not execute regular expression"); + } + return result; +} diff --git a/src/regex__FFI.h b/src/regex__FFI.h new file mode 100644 index 0000000..ff2f13d --- /dev/null +++ b/src/regex__FFI.h @@ -0,0 +1,60 @@ +/* Copyright (C) 2015 the Massachusetts Institute of Technology + +Licensed under the Apache License, Version 2.0 (the "License"); you may not use +this file except in compliance with the License. You may obtain a copy of the +License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed +under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +CONDITIONS OF ANY KIND, either express or implied. See the License for the +specific language governing permissions and limitations under the License. */ + +#ifndef URWEB_REGEX__FFI_H +#define URWEB_REGEX__FFI_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +#include +#include + +typedef struct { + char* text; + regex_t* compiled; +} uw_Regex__FFI_regex; + +typedef struct { + char* haystack; + int succeeded; + unsigned n_matches; + regmatch_t* matches; +} uw_Regex__FFI_match; + +uw_Basis_bool uw_Regex__FFI_succeeded(struct uw_context*, + const uw_Regex__FFI_match); + +uw_Basis_int uw_Regex__FFI_n_subexpression_matches(struct uw_context*, + const uw_Regex__FFI_match); + +uw_Basis_string uw_Regex__FFI_subexpression_match(struct uw_context*, + const uw_Regex__FFI_match, + const uw_Basis_int); + +uw_Regex__FFI_regex uw_Regex__FFI_compile(struct uw_context*, + const uw_Basis_bool, + const uw_Basis_string); + +uw_Regex__FFI_match uw_Regex__FFI_do_match(struct uw_context*, + const uw_Regex__FFI_regex, + const uw_Basis_string); + +#ifdef __cplusplus +} +#endif + +#endif // URWEB_REGEX__FFI_H diff --git a/src/regex__FFI.urs b/src/regex__FFI.urs new file mode 100644 index 0000000..690ca1d --- /dev/null +++ b/src/regex__FFI.urs @@ -0,0 +1,34 @@ +(* Copyright 2015 the Massachusetts Institute of Technology + +Licensed under the Apache License, Version 2.0 (the "License"); you may not use +this file except in compliance with the License. You may obtain a copy of the +License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed +under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +CONDITIONS OF ANY KIND, either express or implied. See the License for the +specific language governing permissions and limitations under the License. *) + +(* This is an internal module. You should use the high-level API in Regex +instead. *) + + +(* A compiled regular expression. *) +type regex + +(* Data about a match. There is no function which returns all subexpression +matches, as we can't build an Ur list in C. *) +type match +val succeeded : match -> bool +val n_subexpression_matches : match -> int +val subexpression_match : match -> int -> string + + +(* Compiles a regular expression from a POSIX extended regular expression +string. *) +val compile : bool (* case sensitive? *) -> string -> regex + +(* Matches a regular expression against any part of a string. *) +val do_match : regex -> string -> match -- cgit v1.2.3