From 0bfe054e0e93cf0c0a19f63eb2cfb6b4afd88ef7 Mon Sep 17 00:00:00 2001
From: Benjamin Barenblat <bbaren@mit.edu>
Date: Fri, 3 Jul 2015 15:52:18 -0400
Subject: Initial commit of the regex matcher
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Wrap glibc’s regex engine to allow matching and group capture in POSIX
extended regular expressions.

It might be worth rewriting this in terms of the C++11 regex engine;
it’s more featureful and more pleasant to use, although it would require
more casting.  (C can’t represent the std::regex type, so I’d need to
use some void pointers.)
---
 src/regex.urs | 77 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 77 insertions(+)
 create mode 100644 src/regex.urs

(limited to 'src/regex.urs')

diff --git a/src/regex.urs b/src/regex.urs
new file mode 100644
index 0000000..15ce216
--- /dev/null
+++ b/src/regex.urs
@@ -0,0 +1,77 @@
+(* Copyright 2015 the Massachusetts Institute of Technology
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use
+this file except in compliance with the License.  You may obtain a copy of the
+License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed
+under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+CONDITIONS OF ANY KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations under the License. *)
+
+(* Regular expression matching
+
+This library implements POSIX extended regular expressions, which most closely
+match what 'normal' people think about when they hear 'regular expressions'.
+Here's a brief syntax reminder:
+
+  .[]^$()\*{}?+| are metacharacters and must be backslash-escaped if you want to
+  match them literally.  (Remember, in Ur/Web, backslash is also the string
+  escape character, so if you want to match a literal open brace, you need to
+  specify "\\{", if you want to match a literal backslash, you need to specify
+  "\\\\", etc.)
+
+  . matches any character
+  x? matches 'x' zero or one time
+  x* matches 'x' zero or more times
+  x+ matches 'x' one or more times
+  x{3,5} matches 'x' three to five times ('xxx', 'xxxx', or 'xxxxx')
+
+  ^ matches the start of a line
+  $ matches the end of a line
+
+  [abcx-z] matches 'a', 'b', 'c', 'x', 'y', or 'z'
+  [^a-z] matches any single character not equal to 'a', 'b', ..., or 'z'
+
+  (abc) matches the string 'abc' and saves it as a marked subexpression
+  \3 matches the 3rd marked subexpression
+
+  Character classes may be used inside bracket expressions:
+  [:alnum:]   [A-Za-z0-9]                         alphanumeric characters
+  [:alpha:]   [A-Za-z]                            alphabetic characters
+  [:blank:]   [ \t]                               space and tab
+  [:cntrl:]   [\x00-\x1F\x7F]                     control characters
+  [:digit:]   [0-9]                               digits
+  [:graph:]   [\x21-\x7E]                         visible characters
+  [:lower:]   [a-z]                               lowercase letters
+  [:print:]   [\x20-\x7E]                         visible characters and the space character
+  [:punct:]   [][!"#$%&'()*+,./:;<=>?@\^_`{|}~-]  punctuation characters
+  [:space:]   [ \t\r\n\v\f]                       whitespace characters
+  [:upper:]   [A-Z]                               uppercase letters
+  [:xdigit:]  [A-Fa-f0-9]                         hexadecimal digits
+  For example, to match any duodecimal digit, you can specify
+  '[[:digit:]A-Ba-b]'.  To match only decimal digits, you still need the outer
+  brackets: '[[:digit:]]'. *)
+
+
+(* Creating *)
+
+(* An opaque compiled regular expression; see the 'compile' functions below. *)
+type t
+
+(* Compiles a POSIX extended regular expression string into a 't'.
+NOTE(review): the result for a malformed pattern is not documented here. *)
+val compile : string -> t
+
+(* Like 'compile', but the resulting expression ignores case when matching. *)
+val compile_case_insensitive : string -> t
+
+
+(* Searching *)
+
+(* Searches a string for a regular expression; the match may occur anywhere in
+the string.  Returns 'Some strs' on success, where 'strs' lists the marked
+subexpression matches, and 'None' if the expression does not match. *)
+val match : t -> string -> option (list string)
-- 
cgit v1.2.3