From 0bfe054e0e93cf0c0a19f63eb2cfb6b4afd88ef7 Mon Sep 17 00:00:00 2001 From: Benjamin Barenblat Date: Fri, 3 Jul 2015 15:52:18 -0400 Subject: Initial commit of the regex matcher MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wrap glibc’s regex engine to allow matching and group capture in POSIX extended regular expressions. It might be worth rewriting this in terms of the C++11 regex engine; it’s more featureful and more pleasant to use, although it would require more casting. (C can’t represent the std::regex type, so I’d need to use some void pointers.) --- src/regex.urs | 77 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 src/regex.urs (limited to 'src/regex.urs') diff --git a/src/regex.urs b/src/regex.urs new file mode 100644 index 0000000..15ce216 --- /dev/null +++ b/src/regex.urs @@ -0,0 +1,77 @@ +(* Copyright 2015 the Massachusetts Institute of Technology + +Licensed under the Apache License, Version 2.0 (the "License"); you may not use +this file except in compliance with the License. You may obtain a copy of the +License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed +under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +CONDITIONS OF ANY KIND, either express or implied. See the License for the +specific language governing permissions and limitations under the License. *) + +(* Regular expression matching + +This library implements POSIX extended regular expressions, which most closely +match what 'normal' people think about when they hear 'regular expressions'. +Here's a brief syntax reminder: + + .[]^$()\*{}?+| are metacharacters and must be backslash-escaped if you want to + match them literally. 
(Remember, in Ur/Web, backslash is also the string escape + character, so if you want to match a literal open brace, you need to specify + "\\{"; if you want to match a literal backslash, you need to specify "\\\\", + etc.) + + . matches any character + x? matches 'x' zero or one time + x* matches 'x' zero or more times + x+ matches 'x' one or more times + x{3,5} matches 'xxx', 'xxxx', and 'xxxxx' + + ^ matches the start of a line + $ matches the end of a line + + [abcx-z] matches 'a', 'b', 'c', 'x', 'y', or 'z' + [^a-z] matches any single character not equal to 'a', 'b', ..., or 'z' + + (abc) matches the string 'abc' and saves it as a marked subexpression + \3 matches the 3rd marked subexpression (NOTE(review): POSIX does not define back-references for extended regular expressions, so this presumably relies on a glibc extension; confirm) + + Character classes may be used inside bracket expressions: + [:alnum:] [A-Za-z0-9] alphanumeric characters + [:alpha:] [A-Za-z] alphabetic characters + [:blank:] [ \t] space and tab + [:cntrl:] [\x00-\x1F\x7F] control characters + [:digit:] [0-9] digits + [:graph:] [\x21-\x7E] visible characters + [:lower:] [a-z] lowercase letters + [:print:] [\x20-\x7E] visible characters and the space character + [:punct:] [][!"#$%&'()*+,./:;<=>?@\^_`{|}~-] punctuation characters + [:space:] [ \t\r\n\v\f] whitespace characters + [:upper:] [A-Z] uppercase letters + [:xdigit:] [A-Fa-f0-9] hexadecimal digits + So if you want to match all duodecimal digits, you can specify + '[[:digit:]A-Ba-b]'. If you simply want all decimal digits, you need + '[[:digit:]]'. *) + + +(* Creating *) + +(* A compiled regular expression. *) +type t + +(* Compiles a regular expression from a POSIX extended regular expression +string. NOTE(review): the result type 't' has no failure case, so how a malformed pattern is reported is not visible from this signature; confirm against the implementation. *) +val compile : string -> t + +(* Compiles a case-insensitive regular expression from a POSIX extended regular expression string. *) +val compile_case_insensitive : string -> t + + +(* Searching *) + +(* Matches a regular expression against any part of a string. Returns 'Some +strs', where 'strs' is a list of subexpression matches, if a match succeeds, and +'None' otherwise. NOTE(review): whether 'strs' also includes the text of the overall match is not stated here; confirm against the implementation. 
*) +val match : t -> string -> option (list string) -- cgit v1.2.3