From 0bfe054e0e93cf0c0a19f63eb2cfb6b4afd88ef7 Mon Sep 17 00:00:00 2001 From: Benjamin Barenblat Date: Fri, 3 Jul 2015 15:52:18 -0400 Subject: Initial commit of the regex matcher MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wrap glibc’s regex engine to allow matching and group capture in POSIX extended regular expressions. It might be worth rewriting this in terms of the C++11 regex engine; it’s more featureful and more pleasant to use, although it would require more casting. (C can’t represent the std::regex type, so I’d need to use some void pointers.) --- .gitignore | 50 ++++++++++ LICENSE | 202 +++++++++++++++++++++++++++++++++++++++++ Makefile.am | 61 +++++++++++++ configure.ac | 48 ++++++++++ m4/ax_cxx_compile_stdcxx_11.m4 | 142 +++++++++++++++++++++++++++++ src/lib.urp | 5 + src/regex.ur | 44 +++++++++ src/regex.urs | 77 ++++++++++++++++ src/regex__FFI.cc | 192 +++++++++++++++++++++++++++++++++++++++ src/regex__FFI.h | 60 ++++++++++++ src/regex__FFI.urs | 34 +++++++ 11 files changed, 915 insertions(+) create mode 100644 .gitignore create mode 100644 LICENSE create mode 100644 Makefile.am create mode 100644 configure.ac create mode 100644 m4/ax_cxx_compile_stdcxx_11.m4 create mode 100644 src/lib.urp create mode 100644 src/regex.ur create mode 100644 src/regex.urs create mode 100644 src/regex__FFI.cc create mode 100644 src/regex__FFI.h create mode 100644 src/regex__FFI.urs diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..7cc3631 --- /dev/null +++ b/.gitignore @@ -0,0 +1,50 @@ +# -*- conf -*- + +# Editor backup files +*~ +\#* +.\#* + +# aclocal +aclocal.m4 +autom4te.cache/ + +# libtoolize +ltmain.sh +m4/libtool.m4 +m4/ltoptions.m4 +m4/ltsugar.m4 +m4/ltversion.m4 +m4/lt~obsolete.m4 + +# autoconf +configure + +# autoheader +config.h.in + +# automake +Makefile.in +compile +config.guess +config.sub +depcomp +install-sh +missing + +# configure +.deps/ +Makefile +config.h +config.log +config.status 
+libtool +stamp-* + +# make +.dirstamp +*.la +*.lo +*.o + +.libs/ diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..d645695 --- /dev/null +++ b/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. 
Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative 
Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/Makefile.am b/Makefile.am new file mode 100644 index 0000000..b3926a2 --- /dev/null +++ b/Makefile.am @@ -0,0 +1,61 @@ +# Copyright 2015 the Massachusetts Institute of Technology +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. You may obtain a copy of +# the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations under +# the License. 
+ +ACLOCAL_AMFLAGS = -I m4 + +CPPFLAGS = \ + -Wall \ + -D_FORTIFY_SOURCE=2 + +CXXFLAGS = \ + -std=c++11 \ + -ftrapv \ + -fstack-protector-strong --param=ssp-buffer-size=4 \ + -fPIE \ + -O2 \ + -ffunction-sections -fdata-sections + +CXXFLAGS += \ + -Weverything \ + -Wno-c++98-compat \ + -Wno-padded + +LDFLAGS = \ + -Wl,-Bsymbolic-functions \ + -fPIE -pie \ + -Wl,-z,now \ + -Wl,-z,relro \ + -Wl,--hash-style=gnu \ + -Wl,--no-copy-dt-needed-entries \ + -Wl,--as-needed + +lib_LTLIBRARIES = liburweb_regex.la + +liburweb_regex_la_SOURCES = src/regex__FFI.cc +liburweb_regex_la_DATA = \ + src/lib.urp \ + src/regex__FFI.h \ + src/regex__FFI.urs \ + src/regex.urs \ + src/regex.ur + +liburweb_regex_la_LIBADD = -lurweb +liburweb_regex_la_LDFLAGS = \ + -export-symbols-regex '^uw_Regex_' + +liburweb_regex_ladir = $(datadir)/urweb/ur/regex + +EXTRA_DIST = \ + LICENSE \ + $(liburweb_regex_la_DATA) diff --git a/configure.ac b/configure.ac new file mode 100644 index 0000000..a9c1c74 --- /dev/null +++ b/configure.ac @@ -0,0 +1,48 @@ +# Copyright 2015 the Massachusetts Institute of Technology +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. You may obtain a copy of +# the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations under +# the License. 
+ +AC_INIT([urweb-regex], [1.0.0], [bbaren@mit.edu]) +AM_INIT_AUTOMAKE([no-dist-gzip dist-xz foreign subdir-objects]) +LT_INIT + +AC_CONFIG_MACRO_DIR([m4]) + +AC_CONFIG_SRCDIR([config.h.in]) +AC_CONFIG_HEADERS([config.h]) + +AC_PROG_CXX([clang++]) +AX_CXX_COMPILE_STDCXX_11([noext], [mandatory]) + +AC_CHECK_HEADERS([regex.h], + [], + [AC_MSG_FAILURE([a POSIX-compatible regex.h is required])]) + +AC_CHECK_HEADERS([urweb/urweb_cpp.h], + [], + [AC_MSG_FAILURE([Ur/Web headers are required])]) +save_LIBS=$LIBS +LIBS="$LIBS -lurweb" +AC_LINK_IFELSE([AC_LANG_CALL( + [int uw_init_client_data; + int uw_copy_client_data; + int uw_global_custom; + int uw_do_expunge; + int uw_post_expunge; + int uw_free_client_data;], + [uw_error])], + [], + [AC_MSG_FAILURE([could not link with -lurweb])]) +LIBS=$save_LIBS + +AC_OUTPUT([Makefile]) diff --git a/m4/ax_cxx_compile_stdcxx_11.m4 b/m4/ax_cxx_compile_stdcxx_11.m4 new file mode 100644 index 0000000..163a4c6 --- /dev/null +++ b/m4/ax_cxx_compile_stdcxx_11.m4 @@ -0,0 +1,142 @@ +# ============================================================================ +# http://www.gnu.org/software/autoconf-archive/ax_cxx_compile_stdcxx_11.html +# ============================================================================ +# +# SYNOPSIS +# +# AX_CXX_COMPILE_STDCXX_11([ext|noext],[mandatory|optional]) +# +# DESCRIPTION +# +# Check for baseline language coverage in the compiler for the C++11 +# standard; if necessary, add switches to CXXFLAGS to enable support. +# +# The first argument, if specified, indicates whether you insist on an +# extended mode (e.g. -std=gnu++11) or a strict conformance mode (e.g. +# -std=c++11). If neither is specified, you get whatever works, with +# preference for an extended mode. +# +# The second argument, if specified 'mandatory' or if left unspecified, +# indicates that baseline C++11 support is required and that the macro +# should error out if no mode with that support is found. 
If specified
+#   'optional', then configuration proceeds regardless, after defining
+#   HAVE_CXX11 if and only if a supporting mode is found.
+#
+# LICENSE
+#
+#   Copyright (c) 2008 Benjamin Kosnik
+#   Copyright (c) 2012 Zack Weinberg
+#   Copyright (c) 2013 Roy Stogner
+#   Copyright (c) 2014 Alexey Sokolov
+#
+#   Copying and distribution of this file, with or without modification, are
+#   permitted in any medium without royalty provided the copyright notice
+#   and this notice are preserved. This file is offered as-is, without any
+#   warranty.
+
+#serial 4
+
+m4_define([_AX_CXX_COMPILE_STDCXX_11_testbody], [[
+  template <typename T>
+    struct check
+    {
+      static_assert(sizeof(int) <= sizeof(T), "not big enough");
+    };
+
+    struct Base {
+    virtual void f() {}
+    };
+    struct Child : public Base {
+    virtual void f() override {}
+    };
+
+    typedef check<check<bool>> right_angle_brackets;
+
+    int a;
+    decltype(a) b;
+
+    typedef check<int> check_type;
+    check_type c;
+    check_type&& cr = static_cast<check_type&&>(c);
+
+    auto d = a;
+    auto l = [](){};
+]])
+
+AC_DEFUN([AX_CXX_COMPILE_STDCXX_11], [dnl
+  m4_if([$1], [], [],
+        [$1], [ext], [],
+        [$1], [noext], [],
+        [m4_fatal([invalid argument `$1' to AX_CXX_COMPILE_STDCXX_11])])dnl
+  m4_if([$2], [], [ax_cxx_compile_cxx11_required=true],
+        [$2], [mandatory], [ax_cxx_compile_cxx11_required=true],
+        [$2], [optional], [ax_cxx_compile_cxx11_required=false],
+        [m4_fatal([invalid second argument `$2' to AX_CXX_COMPILE_STDCXX_11])])
+  AC_LANG_PUSH([C++])dnl
+  ac_success=no
+  AC_CACHE_CHECK(whether $CXX supports C++11 features by default,
+  ax_cv_cxx_compile_cxx11,
+  [AC_COMPILE_IFELSE([AC_LANG_SOURCE([_AX_CXX_COMPILE_STDCXX_11_testbody])],
+    [ax_cv_cxx_compile_cxx11=yes],
+    [ax_cv_cxx_compile_cxx11=no])])
+  if test x$ax_cv_cxx_compile_cxx11 = xyes; then
+    ac_success=yes
+  fi
+
+  m4_if([$1], [noext], [], [dnl
+  if test x$ac_success = xno; then
+    for switch in -std=gnu++11 -std=gnu++0x; do
+      cachevar=AS_TR_SH([ax_cv_cxx_compile_cxx11_$switch])
+      AC_CACHE_CHECK(whether $CXX supports C++11
features with $switch, + $cachevar, + [ac_save_CXXFLAGS="$CXXFLAGS" + CXXFLAGS="$CXXFLAGS $switch" + AC_COMPILE_IFELSE([AC_LANG_SOURCE([_AX_CXX_COMPILE_STDCXX_11_testbody])], + [eval $cachevar=yes], + [eval $cachevar=no]) + CXXFLAGS="$ac_save_CXXFLAGS"]) + if eval test x\$$cachevar = xyes; then + CXXFLAGS="$CXXFLAGS $switch" + ac_success=yes + break + fi + done + fi]) + + m4_if([$1], [ext], [], [dnl + if test x$ac_success = xno; then + for switch in -std=c++11 -std=c++0x; do + cachevar=AS_TR_SH([ax_cv_cxx_compile_cxx11_$switch]) + AC_CACHE_CHECK(whether $CXX supports C++11 features with $switch, + $cachevar, + [ac_save_CXXFLAGS="$CXXFLAGS" + CXXFLAGS="$CXXFLAGS $switch" + AC_COMPILE_IFELSE([AC_LANG_SOURCE([_AX_CXX_COMPILE_STDCXX_11_testbody])], + [eval $cachevar=yes], + [eval $cachevar=no]) + CXXFLAGS="$ac_save_CXXFLAGS"]) + if eval test x\$$cachevar = xyes; then + CXXFLAGS="$CXXFLAGS $switch" + ac_success=yes + break + fi + done + fi]) + AC_LANG_POP([C++]) + if test x$ax_cxx_compile_cxx11_required = xtrue; then + if test x$ac_success = xno; then + AC_MSG_ERROR([*** A compiler with support for C++11 language features is required.]) + fi + else + if test x$ac_success = xno; then + HAVE_CXX11=0 + AC_MSG_NOTICE([No compiler with C++11 support was found]) + else + HAVE_CXX11=1 + AC_DEFINE(HAVE_CXX11,1, + [define if the compiler supports basic C++11 syntax]) + fi + + AC_SUBST(HAVE_CXX11) + fi +]) diff --git a/src/lib.urp b/src/lib.urp new file mode 100644 index 0000000..9f95450 --- /dev/null +++ b/src/lib.urp @@ -0,0 +1,5 @@ +ffi regex__FFI +include regex__FFI.h +link -lurweb_regex + +regex \ No newline at end of file diff --git a/src/regex.ur b/src/regex.ur new file mode 100644 index 0000000..ddc7793 --- /dev/null +++ b/src/regex.ur @@ -0,0 +1,44 @@ +(* Copyright 2015 the Massachusetts Institute of Technology + +Licensed under the Apache License, Version 2.0 (the "License"); you may not use +this file except in compliance with the License. 
You may obtain a copy of the +License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed +under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +CONDITIONS OF ANY KIND, either express or implied. See the License for the +specific language governing permissions and limitations under the License. *) + +structure FFI = Regex__FFI + +type t = FFI.regex + +val compile = FFI.compile True + +val compile_case_insensitive = FFI.compile False + +fun match regex input = + (* Perform the match. *) + let + val result = FFI.do_match regex input + in + if not (FFI.succeeded result) + then + (* No match occurred. *) + None + else + (* Get the subexpressions. We must do this iteratively, as the Regex__FFI + API can't return a list of matches. *) + let + fun loop i = + if i = FFI.n_subexpression_matches result + then + (* We've got all the subexpressions. *) + [] + else FFI.subexpression_match result i :: loop (i + 1) + in + Some (loop 0) + end + end diff --git a/src/regex.urs b/src/regex.urs new file mode 100644 index 0000000..15ce216 --- /dev/null +++ b/src/regex.urs @@ -0,0 +1,77 @@ +(* Copyright 2015 the Massachusetts Institute of Technology + +Licensed under the Apache License, Version 2.0 (the "License"); you may not use +this file except in compliance with the License. You may obtain a copy of the +License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed +under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +CONDITIONS OF ANY KIND, either express or implied. See the License for the +specific language governing permissions and limitations under the License. *) + +(* Regular expression matching + +This library implements POSIX extended regular expressions, which most closely +match what 'normal' people think about when they hear 'regular expressions'. 
+Here's a brief syntax reminder:
+
+  .[]^$()\*{}?+| are metacharacters and must be backslash-escaped if you want to
+  use them. (Remember, in Ur/Web, backslash is also the string escape
+  character, so if you want to match a literal open brace, you need to specify
+  "\\{", if you want to match a literal backslash, you need to specify "\\\\",
+  etc.)
+
+  . matches any character
+  x? matches 'x' zero or one time
+  x* matches 'x' zero or more times
+  x+ matches 'x' one or more times
+  x{3,5} matches 'xxx', 'xxxx', and 'xxxxx'
+
+  ^ matches the start of the input string (not of each line within it)
+  $ matches the end of the input string (not of each line within it)
+
+  [abcx-z] matches 'a', 'b', 'c', 'x', 'y', or 'z'
+  [^a-z] matches any single character not equal to 'a', 'b', ..., or 'z'
+
+  (abc) matches the string 'abc' and saves it as a marked subexpression
+  \3 matches the 3rd marked subexpression
+
+  Character classes may be used inside bracket expressions:
+    [:alnum:]  [A-Za-z0-9]      alphanumeric characters
+    [:alpha:]  [A-Za-z]         alphabetic characters
+    [:blank:]  [ \t]            space and tab
+    [:cntrl:]  [\x00-\x1F\x7F]  control characters
+    [:digit:]  [0-9]            digits
+    [:graph:]  [\x21-\x7E]      visible characters
+    [:lower:]  [a-z]            lowercase letters
+    [:print:]  [\x20-\x7E]      visible characters and the space character
+    [:punct:]  [][!"#$%&'()*+,./:;<=>?@\^_`{|}~-]  punctuation characters
+    [:space:]  [ \t\r\n\v\f]    whitespace characters
+    [:upper:]  [A-Z]            uppercase letters
+    [:xdigit:] [A-Fa-f0-9]      hexadecimal digits
+  So if you want to match all duodecimal digits, you can specify
+  '[[:digit:]A-Ba-b]'. If you simply want all decimal digits, you need
+  '[[:digit:]]'. *)
+
+
+(* Creating *)
+
+(* A compiled regular expression. *)
+type t
+
+(* Compiles a regular expression from a POSIX extended regular expression
+string. *)
+val compile : string -> t
+
+(* Compiles a case-insensitive regular expression from a POSIX extended regular expression string.
*) +val compile_case_insensitive : string -> t + + +(* Searching *) + +(* Matches a regular expression against any part of a string. Returns 'Some +strs', where 'strs' is a list of subexpression matches, if a match succeeds, and +'None' otherwise. *) +val match : t -> string -> option (list string) diff --git a/src/regex__FFI.cc b/src/regex__FFI.cc new file mode 100644 index 0000000..403171f --- /dev/null +++ b/src/regex__FFI.cc @@ -0,0 +1,192 @@ +// Copyright (C) 2015 the Massachusetts Institute of Technology +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not +// use this file except in compliance with the License. You may obtain a copy +// of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations under +// the License. + +#include "regex__FFI.h" + +#include +#include + +#include + +extern "C" { +#include +} + +#include "config.h" + +namespace { + +using Regex = uw_Regex__FFI_regex; +using Match = uw_Regex__FFI_match; + +// Asserts a condition without crashing or releasing information about where the +// error occurred. This function is essential for web programming, where an +// attacker should not be able to bring down the app by causing an assertion +// failure. 
+void Assert(uw_context* const context, const bool condition, + const failure_kind action, const char* const message) { + if (!condition) { + uw_error(context, action, message); + } +} + +void Assert(uw_context* const context, + const bool condition, const char* const message) { + Assert(context, condition, FATAL, message); +} + +void FinalizeRegex(void* regex, [[gnu::unused]] const int _will_retry) { + regfree(reinterpret_cast(regex)); +} + +void DeleteRegex(void* regex, [[gnu::unused]] const int _will_retry) { + delete reinterpret_cast(regex); +} + +} // namespace + +uw_Basis_bool uw_Regex__FFI_succeeded( + [[gnu::unused]] struct uw_context* _context, + const Match match) { + return match.succeeded ? uw_Basis_True : uw_Basis_False; +} + +uw_Basis_int uw_Regex__FFI_n_subexpression_matches( + [[gnu::unused]] struct uw_context* _context, + const Match match) { + return match.n_matches; +} + +uw_Basis_string uw_Regex__FFI_subexpression_match( + struct uw_context* context, + const Match match, + const uw_Basis_int match_index) { + Assert(context, match.matches[match_index].rm_so != -1, + "regex: match does not exist"); + // Locate the substring in the string to match aginst. + const char* const substring_start = + match.haystack + match.matches[match_index].rm_so; + // Copy it into its own buffer so we can properly null-terminate it. + const std::size_t substring_length = + static_cast(match.matches[match_index].rm_eo + - match.matches[match_index].rm_so); + uw_Basis_string result = reinterpret_cast( + uw_malloc(context, substring_length + 1)); + std::memcpy(result, substring_start, substring_length); + result[substring_length] = '\0'; + return result; +} + +Regex uw_Regex__FFI_compile(uw_context* const context, + const uw_Basis_bool case_sensitive, + const uw_Basis_string input) { + Regex result; + result.text = input; + // We'd like to stack-allocate the compiled field of the Regex struct--or, at + // least, to allocate it with uw_malloc. 
Unfortunately, neither of those will + // work, because we need to be able to run a finalizer on it, and Ur + // finalizers can only reference addresses that are not managed by Ur. + result.compiled = new regex_t; + Assert(context, + uw_register_transactional(context, result.compiled, + nullptr, nullptr, DeleteRegex) == 0, + "regex: could not register DeleteRegex finalizer"); + // Compile the regex. + const auto flags = REG_EXTENDED | (case_sensitive ? 0 : REG_ICASE); + switch (const auto regcomp_error = regcomp(result.compiled, input, flags)) { + case 0: + // Everything worked perfectly. + break; + case REG_ESPACE: + // We ran out of memory. + uw_error(context, BOUNDED_RETRY, "regex: could not allocate"); + default: + // Something else happened. Generate a nice message for the user. + const auto message_size = + regerror(regcomp_error, result.compiled, nullptr, 0); + char* const message = + reinterpret_cast(uw_malloc(context, message_size)); + Assert(context, + regerror(regcomp_error, result.compiled, message, + message_size) == message_size, + "regex: compilation failed, but error message could not be" + " generated"); + uw_error(context, FATAL, "regex: compilation failed: %s", message); + } + Assert(context, + uw_register_transactional(context, result.compiled, + nullptr, nullptr, FinalizeRegex) == 0, + "regex: could not register FinalizeRegex finalizer"); + // Give the caller the regex. + return result; +} + +Match uw_Regex__FFI_do_match(uw_context* const context, const Regex needle, + const uw_Basis_string haystack) { + Match result; + // Make a duplicate of the string to match against, so if it goes out of scope + // in the calling Ur code, we still have it. TODO(bbaren): Is this necessary? + result.haystack = + reinterpret_cast( + uw_malloc(context, std::strlen(haystack))); + std::strcpy(result.haystack, haystack); + // Figure out how many groups we could have so we can allocate enough space to + // store the match information. 
+ result.n_matches = 0; + for (std::size_t i = 0; i < std::strlen(needle.text); i++) { + switch (needle.text[i]) { + case '\\': + // The next character is escaped, so it can't possibly be the + // metacharacter '('. Skip it. + i++; + break; + case '(': + // That's our metacharacter. + result.n_matches++; + break; + default: + // Nothing interesting. + break; + } + } + // Allocate to store the match information. Allocate one more slot than we + // need, because the regex engine puts information about the entire match in + // the first slot. + result.matches = + reinterpret_cast( + uw_malloc(context, (result.n_matches + 1) * sizeof(regmatch_t))); + // Execute the regex. + switch (regexec(needle.compiled, haystack, result.n_matches + 1, + result.matches, 0)) { + case 0: + // A match occurred. + result.succeeded = 1; + // Bump the matches array to skip information about the entire match. + result.matches++; + break; + case REG_NOMATCH: + // No match occurred. + result.succeeded = 0; + result.n_matches = 0; + result.matches = nullptr; + break; + case REG_ESPACE: + // We ran out of memory. + uw_error(context, BOUNDED_RETRY, "regex: could not allocate"); + default: + // Some unknown error occurred. + uw_error(context, FATAL, "regex: could not execute regular expression"); + } + return result; +} diff --git a/src/regex__FFI.h b/src/regex__FFI.h new file mode 100644 index 0000000..ff2f13d --- /dev/null +++ b/src/regex__FFI.h @@ -0,0 +1,60 @@ +/* Copyright (C) 2015 the Massachusetts Institute of Technology + +Licensed under the Apache License, Version 2.0 (the "License"); you may not use +this file except in compliance with the License. You may obtain a copy of the +License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed +under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +CONDITIONS OF ANY KIND, either express or implied. 
See the License for the +specific language governing permissions and limitations under the License. */ + +#ifndef URWEB_REGEX__FFI_H +#define URWEB_REGEX__FFI_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +#include +#include + +typedef struct { + char* text; + regex_t* compiled; +} uw_Regex__FFI_regex; + +typedef struct { + char* haystack; + int succeeded; + unsigned n_matches; + regmatch_t* matches; +} uw_Regex__FFI_match; + +uw_Basis_bool uw_Regex__FFI_succeeded(struct uw_context*, + const uw_Regex__FFI_match); + +uw_Basis_int uw_Regex__FFI_n_subexpression_matches(struct uw_context*, + const uw_Regex__FFI_match); + +uw_Basis_string uw_Regex__FFI_subexpression_match(struct uw_context*, + const uw_Regex__FFI_match, + const uw_Basis_int); + +uw_Regex__FFI_regex uw_Regex__FFI_compile(struct uw_context*, + const uw_Basis_bool, + const uw_Basis_string); + +uw_Regex__FFI_match uw_Regex__FFI_do_match(struct uw_context*, + const uw_Regex__FFI_regex, + const uw_Basis_string); + +#ifdef __cplusplus +} +#endif + +#endif // URWEB_REGEX__FFI_H diff --git a/src/regex__FFI.urs b/src/regex__FFI.urs new file mode 100644 index 0000000..690ca1d --- /dev/null +++ b/src/regex__FFI.urs @@ -0,0 +1,34 @@ +(* Copyright 2015 the Massachusetts Institute of Technology + +Licensed under the Apache License, Version 2.0 (the "License"); you may not use +this file except in compliance with the License. You may obtain a copy of the +License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed +under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +CONDITIONS OF ANY KIND, either express or implied. See the License for the +specific language governing permissions and limitations under the License. *) + +(* This is an internal module. You should use the high-level API in Regex +instead. *) + + +(* A compiled regular expression. *) +type regex + +(* Data about a match. 
There is no function which returns all subexpression +matches, as we can't build an Ur list in C. *) +type match +val succeeded : match -> bool +val n_subexpression_matches : match -> int +val subexpression_match : match -> int -> string + + +(* Compiles a regular expression from a POSIX extended regular expression +string. *) +val compile : bool (* case sensitive? *) -> string -> regex + +(* Matches a regular expression against any part of a string. *) +val do_match : regex -> string -> match -- cgit v1.2.3