Rework replacement API to rely on transformation

Redesign library API around highly general regex-based transformations. Instead of specifying a string to substitute for each match, you now execute an entire function over the match (and over nonmatching regions as well). The resulting C++ code is much simpler, with more functionality pushed into Ur, and the engine now supports certain types of regex transformations needed to mimic Perl.
author: Benjamin Barenblat <bbaren@mit.edu> 2015-08-26 17:29:32 -0400
committer: Benjamin Barenblat <bbaren@mit.edu> 2015-08-26 17:29:32 -0400
commit: bc6b0bee8fe4120642029daaa8ce6c069ef667b8 (patch)
tree: fab27db4556318e4f6a4b7c60e6b47c7ab58e59a
parent: 75ff1a7a1979466a77dcc3acbbb88e897213027f (diff)
7 files changed, 287 insertions, 184 deletions
diff --git a/src/lib.urp b/src/lib.urp
index 75bcd48..7fcdc82 100644
--- a/src/lib.urp
+++ b/src/lib.urp
@@ -1,12 +1,15 @@
 ffi regex__FFI
 include regex__FFI.h
 link -lurweb_regex
-jsFunc Regex__FFI.succeeded=UrWeb.Regex.succeeded
-jsFunc Regex__FFI.n_subexpression_matches=UrWeb.Regex.nSubexpressionMatches
-jsFunc Regex__FFI.subexpression_match=UrWeb.Regex.subexpressionMatch
+jsFunc Regex__FFI.substring_start=UrWeb.Regex.Substring.start
+jsFunc Regex__FFI.substring_length=UrWeb.Regex.Substring.length
+jsFunc Regex__FFI.substring_list_length=UrWeb.Regex.Substring.List.length
+jsFunc Regex__FFI.substring_list_get=UrWeb.Regex.Substring.List.get
 jsFunc Regex__FFI.do_match=UrWeb.Regex.doMatch
-jsFunc Regex__FFI.replace=UrWeb.Regex.replace
 file /cgGvSqBi.js regex__FFI.js
 script /cgGvSqBi.js
 
-regex
-\ No newline at end of file
+$/list
+$/option
+$/string
+regex
diff --git a/src/regex.ur b/src/regex.ur
index 8d099ca..0024561 100644
--- a/src/regex.ur
+++ b/src/regex.ur
@@ -12,30 +12,118 @@ under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 CONDITIONS OF ANY KIND, either express or implied.  See the License for the
 specific language governing permissions and limitations under the License. *)
 
+(* Utility *)
+
+fun concat (strings : list string) : string =
+  List.foldr String.append "" strings
+
+
+(* Substrings and matches *)
+
+type substring = {Start : int, Len : int}
+type match = {Whole : substring, Groups : list substring}
+
+fun substring_offset (delta : int) (substring : substring) : substring =
+  {Start = substring.Start + delta, Len = substring.Len}
+
+fun match_offset (delta : int) (match : match) : match =
+  {Whole = substring_offset delta match.Whole,
+   Groups = List.mp (substring_offset delta) match.Groups}
+
+(* Returns the index of the character just _after_ a match. *)
+fun after_match (match : match) : int =
+  match.Whole.Start + match.Whole.Len
+
+
+(* Unmarshaling FFI types *)
+
 structure FFI = Regex__FFI
 
-fun match regex input =
-  (* Perform the match. *)
-  let
-    val result = FFI.do_match regex input
-  in
-    if not (FFI.succeeded result)
-    then
-      (* No match occurred. *)
-      None
-    else
-      (* Get the subexpressions.  We must do this iteratively, as the Regex__FFI
-      API can't return a list of matches. *)
+(* Unmarshals an 'FFI.substring_t' from C into Ur. *)
+fun unmarshal_substring (substring : FFI.substring_t) : substring =
+  {Start = FFI.substring_start substring,
+   Len = FFI.substring_length substring}
+
+(* Unmarshals an 'FFI.substring_list_t' from C into Ur. *)
+fun unmarshal_substring_list (substrings : FFI.substring_list_t)
+    : option match =
+  case FFI.substring_list_length substrings of
+      0 => None
+    | n_groups =>
       let
-        fun loop i =
-          if i = FFI.n_subexpression_matches result
-          then
-            (* We've got all the subexpressions. *)
-            []
-          else FFI.subexpression_match result i :: loop (i + 1)
+        fun loop n =
+          if n_groups <= n
+          then []
+          else unmarshal_substring (FFI.substring_list_get substrings n)
+                 :: loop (n + 1)
       in
-        Some (loop 0)
+        Some {Whole = unmarshal_substring (FFI.substring_list_get substrings 0),
+              Groups = loop 1}
       end
+
+
+(* Regular expressions *)
+
+(* Ensures that a regex is not going to cause problems later down the line. *)
+fun validate (regex : string) : string =
+  if String.lengthGe regex 1
+  then regex
+  else error <xml>regex: Empty regex</xml>
+
+fun match needle haystack =
+  unmarshal_substring_list (FFI.do_match (validate needle) haystack)
+
+fun all_matches needle haystack =
+  case match needle haystack of
+      None => []
+    | Some match =>
+      let
+        val remaining_start = after_match match
+      in
+        match
+          :: List.mp (match_offset remaining_start)
+                     (all_matches needle
+                                  (String.suffix haystack remaining_start))
+      end
+
+fun transform needle f_nomatch f_match haystack =
+  let
+    val haystack_length = String.length haystack
+    val matches = all_matches needle haystack
+  in
+    (* Handle the first nonmatching region. *)
+    f_nomatch {Start = 0,
+               Len = case matches of
+                         [] => haystack_length
+                       | first_match :: _ => first_match.Whole.Start}
+    ^
+    (* Handle the remaining regions. *)
+    concat
+      (List.mapi
+         (fn match_number match =>
+            (* Handle the matching region. *)
+            f_match match
+            ^
+            (* Handle the nonmatching region. *)
+            let
+              val start = match.Whole.Start + match.Whole.Len
+            in
+              f_nomatch {Start = start,
+                         Len =
+                           case List.nth matches (match_number + 1) of
+                               None =>
+                               (* We’re on the last matching region in the
+                               string, so the nonmatching region lasts until the
+                               end of the string. *)
+                               haystack_length - start
+                             | Some next_match =>
+                               next_match.Whole.Start - start}
+            end)
+         matches)
   end
 
-val replace = FFI.replace
+fun transform_matches needle f_match haystack =
+  transform needle (String.substring haystack) f_match haystack
+
+fun replace needle haystack replacement =
+  transform_matches needle (fn _ => replacement) haystack
diff --git a/src/regex.urs b/src/regex.urs
index bd7696b..6e7bdcc 100644
--- a/src/regex.urs
+++ b/src/regex.urs
@@ -16,18 +16,65 @@ specific language governing permissions and limitations under the License. *)
 
 This library implements ECMAScript regular expressions. *)
 
+type substring = {Start : int, Len : int}
+type match = {Whole : substring, Groups : list substring}
+
 (* Searching *)
 
-(* Matches a regular expression against any part of a string.  Returns 'Some
-strs', where 'strs' is a list of subexpression matches, if a match succeeds, and
-'None' otherwise. *)
+(* Matches a regular expression against any part of a string.  Returns
+'Some match' if a match succeeds and 'None' otherwise. *)
 val match : string (* needle *)
-         -> string (* haystack *)
-         -> option (list string)
+            -> string (* haystack *)
+            -> option match
+
+(* Finds _all_ matches for a regular expression in a string. *)
+val all_matches : string (* needle *)
+                  -> string (* haystack *)
+                  -> list match
+
+(* Replacement *)
 
 (* Replaces all substrings in 'haystack' that match 'needle' with the string
 'replacement.' *)
 val replace : string (* needle *)
-           -> string (* replacement *)
-           -> string (* haystack *)
-           -> string
+              -> string (* replacement *)
+              -> string (* haystack *)
+              -> string
+
+(* Transforms a string by applying a function to replace every match in the
+string. *)
+val transform_matches : string (* needle *)
+                        -> (match -> string) (* transformation *)
+                        -> string (* haystack *)
+                        -> string
+
+(* Executes a general regex-guided transformation over a string.  Matches
+'needle' against any part of 'haystack', splitting 'haystack' into matching and
+nonmatching regions.  Then, runs the provided transformation functions over the
+regions and concatenates the results.
+
+The number of nonmatching regions is always exactly one more than the number of
+matching regions.  If two matching regions abut or a matching region adjoins the
+edge of a string, this function will insert an empty nonmatching region as
+appropriate.
+
+An example may make this a bit clearer:
+
+    let
+      val haystack "axbxax"
+    in
+      transform "x"
+                (fn nm => "_" ^ String.substring haystack nm ^ "_")
+                (fn  m => "*" ^ String.substring haystack  m ^ "_")
+                haystack
+    end
+
+evaluates to
+
+    "_a_*x*_b_*x*__"
+*)
+val transform : string (* needle *)
+                -> (substring -> string) (* non-matching transformation *)
+                -> (match -> string) (* matching transformation *)
+                -> string (* haystack *)
+                -> string
diff --git a/src/regex__FFI.cc b/src/regex__FFI.cc
index e3ff601..619ea44 100644
--- a/src/regex__FFI.cc
+++ b/src/regex__FFI.cc
@@ -15,10 +15,9 @@
 
 #include "src/regex__FFI.h"
 
-#include <cstdio>
-#include <cstring>
-#include <string>
 #include <regex>  // NOLINT(build/c++11)
+#include <stdexcept>
+#include <vector>
 
 #include <boost/numeric/conversion/cast.hpp>  // NOLINT(build/include_order)
 extern "C" {
@@ -29,6 +28,10 @@ extern "C" {
 
 namespace {
 
+using Int = uw_Basis_int;
+using Substring = uw_Regex__FFI_substring_t;
+using SubstringList = uw_Regex__FFI_substring_list_t;
+
 // Asserts a condition without crashing or releasing information about where the
 // error occurred.  This function is essential for web programming, where an
 // attacker should not be able to bring down the app by causing an assertion
@@ -45,11 +48,6 @@ void Assert(uw_context* const context, const bool condition,
   Assert(context, condition, FATAL, message);
 }
 
-void DeleteMatchResults(void* match_result,
-                        [[gnu::unused]] const int _will_retry) {
-  delete reinterpret_cast<std::cmatch*>(match_result);
-}
-
 // Bounds-checked numeric type conversion
 template <typename Target, typename Source>
 Target Number(uw_context* const context, Source arg) {
@@ -79,101 +77,56 @@ std::regex Compile(uw_context* const context, const char needle_string[]) {
   return needle;
 }
 
+// Treats 'list' as a 'std::vector<Substring>*' and 'delete's it.
+void DeleteGroupList(void* list, [[gnu::unused]] const int will_retry) {
+  delete reinterpret_cast<std::vector<Substring>*>(list);
+}
+
 }  // namespace
 
-uw_Basis_bool uw_Regex__FFI_succeeded([[gnu::unused]] uw_context* const context,
-                                      const uw_Regex__FFI_match match) {
-  if (reinterpret_cast<std::cmatch*>(match.result)->empty()) {
-    return uw_Basis_False;
-  } else {
-    return uw_Basis_True;
-  }
+Int uw_Regex__FFI_substring_start(uw_context* const context,
+                                  const Substring substring) {
+  return Number<Int>(context, substring.start);
 }
 
-uw_Basis_int uw_Regex__FFI_n_subexpression_matches(
-    uw_context* const context, const uw_Regex__FFI_match match) {
-  const std::cmatch::size_type n_matches =
-      reinterpret_cast<std::cmatch*>(match.result)->size();
-  if (n_matches == 0) {
-    // Nothing got matched.
-    return 0;
-  } else {
-    // At least one match occurred.  Compute the number of parenthesized
-    // subexpressions that got matched, and return it.
-    return Number<uw_Basis_int>(context, n_matches) - 1;
-  }
+Int uw_Regex__FFI_substring_length(uw_context* const context,
+                                   const Substring substring) {
+  return Number<Int>(context, substring.length);
 }
 
-uw_Basis_string uw_Regex__FFI_subexpression_match(
-    uw_context* const context, const uw_Regex__FFI_match match,
-    const uw_Basis_int match_index_signed) {
-  const std::cmatch* const match_result =
-      reinterpret_cast<std::cmatch*>(match.result);
-  const std::size_t match_index =
-      Number<std::size_t>(context, match_index_signed);
-  Assert(context, match_index < match_result->size(),
-         "regex: match does not exist");
-  const auto matched_substring = (*match_result)[match_index + 1];
-  // Save the matched substring.
-  const std::size_t result_length =
-      Number<std::size_t>(context, matched_substring.length());
-  uw_Basis_string result =
-      reinterpret_cast<uw_Basis_string>(uw_malloc(context, result_length + 1));
-  Assert(context, std::snprintf(result, result_length + 1, "%s",
-                                matched_substring.str().c_str()) >= 0,
-         "regex: snprintf failed during match");
-  return result;
+Int uw_Regex__FFI_substring_list_length(uw_context* const context,
+                                        const SubstringList list) {
+  return Number<Int>(
+      context, reinterpret_cast<const std::vector<Substring>*>(list)->size());
 }
 
-uw_Regex__FFI_match uw_Regex__FFI_do_match(uw_context* const context,
-                                           const uw_Basis_string needle_string,
-                                           const uw_Basis_string haystack) {
-  std::regex needle = Compile(context, needle_string);
-  uw_Regex__FFI_match result;
-  // Make a duplicate of the string to match against, so if it goes out of
-  // scope in the calling Ur code, we still have it.
-  const auto haystack_length = std::strlen(haystack);
-  result.haystack =
-      reinterpret_cast<char*>(uw_malloc(context, haystack_length + 1));
-  Assert(context, std::snprintf(result.haystack, haystack_length + 1, "%s",
-                                haystack) >= 0,
-         "regex: snprintf failed during match");
-  // Allocate to store the match information.
-  auto* match_results = new std::cmatch;
-  Assert(context, uw_register_transactional(context, match_results, nullptr,
-                                            nullptr, DeleteMatchResults) == 0,
-         "regex: could not register DeleteMatchResults finalizer");
-  result.result = match_results;
-  // Execute the regex on the saved haystack, not the original one.
-  std::regex_search(result.haystack, *match_results, needle);
-  return result;
+Substring uw_Regex__FFI_substring_list_get(uw_context* const context,
+                                           const SubstringList list,
+                                           const Int index_int) {
+  const auto index = Number<std::size_t>(context, index_int);
+  try {
+    return reinterpret_cast<const std::vector<Substring>*>(list)->at(index);
+  } catch (const std::out_of_range& e) {
+    uw_error(context, FATAL, "regex: index out of range", e.what());
+  }
 }
 
-uw_Basis_string uw_Regex__FFI_replace(uw_context* const context,
-                                      const uw_Basis_string needle_string,
-                                      const uw_Basis_string replacement,
-                                      const uw_Basis_string haystack) {
-  std::regex needle = Compile(context, needle_string);
-  // Perform the replacement.
-  std::string result;
-  try {
-    result = std::regex_replace(haystack, needle, replacement);
-  } catch (const std::regex_error& e) {
-    switch (e.code()) {
-      case std::regex_constants::error_space:
-      case std::regex_constants::error_stack:
-        // We ran out of memory.
-        uw_error(context, BOUNDED_RETRY, "regex: replacement failed: %s",
-                 e.what());
-      default:
-        uw_error(context, FATAL, "regex: replacement failed: %s", e.what());
-    }
+SubstringList uw_Regex__FFI_do_match(uw_context* const context,
+                                     const uw_Basis_string needle,
+                                     const uw_Basis_string haystack) {
+  // Perform the match.
+  std::cmatch match_results;
+  std::regex_search(haystack, match_results, Compile(context, needle));
+  Assert(context, match_results.ready(), "regex: search failed");
+  // Marshal the results into the form Ur expects.
+  auto* const result = new std::vector<Substring>;
+  Assert(context, uw_register_transactional(context, result, nullptr, nullptr,
+                                            DeleteGroupList) == 0,
+         "regex: could not register DeleteGroupList finalizer");
+  for (std::size_t i = 0; i < match_results.size(); i++) {
+    result->emplace_back(
+        Substring{Number<long>(context, match_results.position(i)),
+                  Number<long>(context, match_results.length(i))});
   }
-  // Save the result string.
-  char* const result_string =
-      reinterpret_cast<char*>(uw_malloc(context, result.length() + 1));
-  Assert(context, std::snprintf(result_string, result.length() + 1, "%s",
-                                result.c_str()) >= 0,
-         "regex: snprintf failed during replace");
-  return result_string;
+  return result;
 }
diff --git a/src/regex__FFI.h b/src/regex__FFI.h
index 84b81a9..7399005 100644
--- a/src/regex__FFI.h
+++ b/src/regex__FFI.h
@@ -26,27 +26,28 @@ extern "C" {
 #include <urweb/urweb_cpp.h>
 
 typedef struct {
-  char* haystack;
-  void* result;
-} uw_Regex__FFI_match;
+  long start;
+  long length;
+} uw_Regex__FFI_substring_t;
 
-uw_Basis_bool uw_Regex__FFI_succeeded(struct uw_context*,
-                                      const uw_Regex__FFI_match);
+typedef void* uw_Regex__FFI_substring_list_t;
 
-uw_Basis_int uw_Regex__FFI_n_subexpression_matches(struct uw_context*,
-                                                   const uw_Regex__FFI_match);
+uw_Basis_int uw_Regex__FFI_substring_start(struct uw_context*,
+                                           const uw_Regex__FFI_substring_t);
 
-uw_Basis_string uw_Regex__FFI_subexpression_match(struct uw_context*,
-                                                  const uw_Regex__FFI_match,
-                                                  const uw_Basis_int);
+uw_Basis_int uw_Regex__FFI_substring_length(struct uw_context*,
+                                            const uw_Regex__FFI_substring_t);
 
-uw_Regex__FFI_match uw_Regex__FFI_do_match(struct uw_context*,
-                                           const uw_Basis_string,
-                                           const uw_Basis_string);
+uw_Basis_int uw_Regex__FFI_substring_list_length(
+    struct uw_context*, const uw_Regex__FFI_substring_list_t);
 
-uw_Basis_string uw_Regex__FFI_replace(struct uw_context*, const uw_Basis_string,
-                                      const uw_Basis_string,
-                                      const uw_Basis_string);
+uw_Regex__FFI_substring_t uw_Regex__FFI_substring_list_get(
+    struct uw_context*, const uw_Regex__FFI_substring_list_t,
+    const uw_Basis_int);
+
+uw_Regex__FFI_substring_list_t uw_Regex__FFI_do_match(struct uw_context*,
+                                                      const uw_Basis_string,
+                                                      const uw_Basis_string);
 
 #ifdef __cplusplus
 }
diff --git a/src/regex__FFI.js b/src/regex__FFI.js
index d69369b..1062520 100644
--- a/src/regex__FFI.js
+++ b/src/regex__FFI.js
@@ -14,37 +14,42 @@
 
 var UrWeb = { Regex: {
 
-_compile: function(needle_string) {
-  var needle;
+Substring: {
+  start: function(substring) {
+    return substring.start;
+  },
+
+  length: function(substring) {
+    return substring.length;
+  },
+
+  List: {
+    length: function(list) {
+      return list.length;
+    },
+
+    get: function(list, n) {
+      return list[n];
+    },
+  },
+},
+
+doMatch: function(needle_string, haystack) {
   try {
-    needle = new RegExp(needle_string, "g");
+    var needle = new RegExp(needle_string);
   } catch (e) {
     er("regex: compilation failed");
   }
-  return needle;
-},
-
-succeeded: function(match) {
-  return !!match;
-},
-
-nSubexpressionMatches: function(match) {
-  return match.length - 1;
-},
-
-subexpressionMatch: function(match, n) {
-  if (match.length - 1 <= n) {
-    er("regex: match does not exist");
+  var result = needle.exec(haystack);
+  if (result) {
+    for (var i = 0; i < result.length; i++) {
+      result[i] = {start: haystack.indexOf(result[i]),
+                   length: result[i].length};
+    }
+  } else {
+    result = []
   }
-  return match[n + 1];
-},
-
-doMatch: function(needle, haystack) {
-  return haystack.match(UrWeb.Regex._compile(needle));
-},
-
-replace: function(needle, replacement, haystack) {
-  return haystack.replace(UrWeb.Regex._compile(needle), replacement);
+  return result;
 },
 
 }};  // UrWeb.Regex
diff --git a/src/regex__FFI.urs b/src/regex__FFI.urs
index 0f10052..02e3880 100644
--- a/src/regex__FFI.urs
+++ b/src/regex__FFI.urs
@@ -15,23 +15,29 @@ specific language governing permissions and limitations under the License. *)
 (* This is an internal module.  You should use the high-level API in Regex
 instead. *)
 
+(* Ideally, these types would be declared in a nice module hierarchy.
+Unfortunately, Ur/Web bug #207 makes that impossible. *)
+type substring_t
+val substring_start : substring_t -> int
+val substring_length : substring_t -> int
 
-(* Data about a match.  There is no function which returns all subexpression
-matches, as we can't build an Ur list in C. *)
-type match
-val succeeded : match -> bool
-val n_subexpression_matches : match -> int
-val subexpression_match : match -> int -> string
+type substring_list_t
+val substring_list_length : substring_list_t -> int
+val substring_list_get : substring_list_t -> int -> substring_t
 
+(* Matches a regular expression against any part of a string.  Returns a list of
+groups.  The zeroth element of each match represents the match as a whole.
+Thus, matching /a(b*c)d/ against
 
-(* Matches a regular expression against any part of a string. *)
+              1    1
+    0    5    0    5
+    __acd__abbbbcd__
+
+will yield
+
+    [(2,3), (3, 1)]
+
+where (x,y) is a substring with start x and length y. *)
 val do_match : string (* needle *)
             -> string (* haystack *)
-            -> match
-
-(* Replaces all substrings in 'haystack' that match 'needle' with the string
-'replacement.' *)
-val replace : string (* needle *)
-           -> string (* replacement *)
-           -> string (* haystack *)
-           -> string
+            -> substring_list_t
author	Benjamin Barenblat <bbaren@mit.edu>	2015-08-26 17:29:32 -0400
committer	Benjamin Barenblat <bbaren@mit.edu>	2015-08-26 17:29:32 -0400
commit	bc6b0bee8fe4120642029daaa8ce6c069ef667b8 (patch)
tree	fab27db4556318e4f6a4b7c60e6b47c7ab58e59a
parent	75ff1a7a1979466a77dcc3acbbb88e897213027f (diff)