From 62301a30a8b6e9b71d3549178f89d62c8c3c7d48 Mon Sep 17 00:00:00 2001
From: thomasvl
Date: Wed, 30 Jan 2008 18:42:33 +0000
Subject: fold in GTMRegex and ignore the build dir

---
 Foundation/GTMRegex.h | 338 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 338 insertions(+)
 create mode 100644 Foundation/GTMRegex.h

(limited to 'Foundation/GTMRegex.h')

diff --git a/Foundation/GTMRegex.h b/Foundation/GTMRegex.h
new file mode 100644
index 0000000..8e0f492
--- /dev/null
+++ b/Foundation/GTMRegex.h
@@ -0,0 +1,338 @@
+//
+// GTMRegex.h
+//
+// Copyright 2007-2008 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations under
+// the License.
+//
+
+#import <Foundation/Foundation.h>
+#import <regex.h>
+
+/// Options for controlling the behavior of the matches
+typedef enum {
+
+  kGTMRegexOptionIgnoreCase = 0x01,
+    // Ignore case in matching, ie: 'a' matches 'a' or 'A'
+
+  kGTMRegexOptionSupressNewlineSupport = 0x02,
+    // By default (without this option), regular expressions are implicitly
+    // processed on a line by line basis, where "lines" are delimited by newline
+    // characters. In this mode '.' (dot) does NOT match newline characters, and
+    // '^' and '$' match at the beginning and end of the string as well as
+    // around newline characters. This behavior matches the default behavior for
+    // regular expressions in other languages including Perl and Python. For
+    // example,
+    //     foo.*bar
+    // would match
+    //     fooAAAbar
+    // but would NOT match
+    //     fooAAA\nbar
+    // With the kGTMRegexOptionSupressNewlineSupport option, newlines are treated
+    // just like any other character which means that '.' will match them. In
+    // this mode, ^ and $ only match the beginning and end of the input string
+    // and do NOT match around the newline characters. For example,
+    //     foo.*bar
+    // would match
+    //     fooAAAbar
+    // and would also match
+    //     fooAAA\nbar
+
+} GTMRegexOptions;
+
+/// Class for doing Extended Regex operations w/ libregex (see re_format(7)).
+//
+// NOTE: the docs for regcomp/regexec make *no* claims about i18n. All work
+// within this class is done w/ UTF-8 so Unicode should move through it safely,
+// however, the character classes described in re_format(7) might not really
+// be unicode "savvy", so use them and this class w/ that in mind.
+//
+// Example usage:
+//
+//    NSArray *inputArrayOfStrings = ...
+//    NSEnumerator *enumerator = [inputArrayOfStrings objectEnumerator];
+//    NSString *curStr = nil;
+//    NSMutableArray *matches = [NSMutableArray array];
+//
+//    GTMRegex *regex = [GTMRegex regexWithPattern:@"foo.*bar"];
+//    while ((curStr = [enumerator nextObject]) != nil) {
+//      if ([regex matchesString:curStr])
+//        [matches addObject:curStr];
+//    }
+//    ....
+//
+// -------------
+//
+// If you need to include something dynamic in a pattern:
+//
+//    NSString *pattern =
+//      [NSString stringWithFormat:@"^foo:%@bar",
+//        [GTMRegex escapedPatternForString:inputStr]];
+//    GTMRegex *regex = [GTMRegex regexWithPattern:pattern];
+//    ....
+//
+// -------------
+//
+//    GTMRegex *regex = [GTMRegex regexWithPattern:@"(foo+)(bar)"];
+//    NSString *highlighted =
+//      [regex stringByReplacingMatchesInString:inputString
+//                              withReplacement:@"<i>\\1</i><b>\\2</b>"];
+//    ....
+//
+@interface GTMRegex : NSObject {
+ @private
+  NSString *pattern_;
+  GTMRegexOptions options_;
+  regex_t regexData_;
+}
+
+/// Create a new, autoreleased object w/ the given regex pattern with the default options
++ (id)regexWithPattern:(NSString *)pattern;
+
+/// Create a new, autoreleased object w/ the given regex pattern and specify the matching options
++ (id)regexWithPattern:(NSString *)pattern options:(GTMRegexOptions)options;
+
+/// Returns a new, autoreleased copy of |str| w/ any pattern chars in it escaped so they have no meaning when used w/in a pattern.
++ (NSString *)escapedPatternForString:(NSString *)str;
+
+/// Initialize a new object w/ the given regex pattern with the default options
+- (id)initWithPattern:(NSString *)pattern;
+
+/// Initialize a new object w/ the given regex pattern and specify the matching options
+- (id)initWithPattern:(NSString *)pattern options:(GTMRegexOptions)options;
+
+/// Returns the number of sub patterns in the pattern
+//
+// Sub Patterns are basically the number of parenthesis blocks w/in the pattern.
+// ie: The pattern "foo((bar)|(baz))" has 3 sub patterns.
+//
+- (int)subPatternCount;
+
+/// Returns YES if the whole string |str| matches the pattern.
+- (BOOL)matchesString:(NSString *)str;
+
+/// Returns a new, autoreleased array of strings that contain the subpattern matches for the string.
+//
+// If the whole string does not match the pattern, nil is returned.
+//
+// The api follows the conventions of most regex engines, and index 0 (zero) is
+// the full match, then the subpatterns are index 1, 2, ... going left to right.
+// If the pattern has optional subpatterns, then anything that didn't match
+// will have NSNull at that index.
+// ie: The pattern "(fo(o+))((bar)|(baz))" has five subpatterns, and when
+// applied to the string "foooooobaz" you'd get an array of:
+//   0: "foooooobaz"
+//   1: "foooooo"
+//   2: "ooooo"
+//   3: "baz"
+//   4: NSNull
+//   5: "baz"
+//
+- (NSArray *)subPatternsOfString:(NSString *)str;
+
+/// Returns a new, autoreleased enumerator that will walk segments (GTMRegexStringSegment) of |str| based on the pattern.
+//
+// This will split the string into "segments" using the given pattern. You get
+// both the matches and parts that are in between matches. ie-the entire string
+// will eventually be returned.
+//
+// See GTMRegexStringSegment for more information and examples.
+//
+- (NSEnumerator *)segmentEnumeratorForString:(NSString *)str;
+
+/// Returns a new, autoreleased enumerator that will walk only the matching segments (GTMRegexStringSegment) of |str| based on the pattern.
+//
+// This extracts the "segments" of the string that used the pattern. So it can
+// be used to collect all of the matching substrings from within a string.
+//
+// See GTMRegexStringSegment for more information and examples.
+//
+- (NSEnumerator *)matchSegmentEnumeratorForString:(NSString *)str;
+
+/// Returns a new, autoreleased string with all matches of the pattern in |str| replaced with |replacementPattern|.
+//
+// Replacement uses the SED substitution like syntax w/in |replacementPattern|
+// to allow the use of matches in the replacement. The replacement pattern can
+// make use of any number of match references by using a backslash followed by
+// the match subexpression number (ie-"\2", "\0", ...), see subPatternsOfString:
+// for details on the subexpression indexing.
+//
+// REMINDER: you need to double-slash since the slash has meaning to the
+// compiler/preprocessor. ie: "\\0"
+//
+- (NSString *)stringByReplacingMatchesInString:(NSString *)str
+                               withReplacement:(NSString *)replacementPattern;
+
+@end
+
+/// Class returned by the nextObject for the enumerators from GTMRegex
+//
+// The two enumerators from GTMRegex return objects of this type. This object
+// represents a "piece" of the string the enumerator is walking. The apis
+// on this object allow you to figure out why each segment was returned and to
+// act on it.
+//
+// The easiest way to understand how the enumerators and this class work
+// is through an example:
+//    Pattern: "foo+"
+//     String: "fo bar foobar foofooo baz"
+// If you walk this w/ -segmentEnumeratorForString you'll get:
+//    # nextObjects Calls   -isMatch   -string
+//           1                NO       "fo bar "
+//           2                YES      "foo"
+//           3                NO       "bar "
+//           4                YES      "foo"
+//           5                YES      "fooo"
+//           6                NO       " baz"
+// And if you walk this w/ -matchSegmentEnumeratorForString you'll get:
+//    # nextObjects Calls   -isMatch   -string
+//           1                YES      "foo"
+//           2                YES      "foo"
+//           3                YES      "fooo"
+// (see the comments on subPatternString for how it works)
+//
+// Example usage:
+//
+//    NSMutableString *processedStr = [NSMutableString string];
+//    NSEnumerator *enumerator =
+//      [inputStr gtm_segmentEnumeratorForPattern:@"foo+((ba+r)|(ba+z))"];
+//    GTMRegexStringSegment *segment = nil;
+//    while ((segment = [enumerator nextObject]) != nil) {
+//      if ([segment isMatch]) {
+//        if ([segment subPatternString:2] != nil) {
+//          // matched: "(ba+r)"
+//          [processedStr appendFormat:@"<b>%@</b>", [segment string]];
+//        } else {
+//          // matched: "(ba+z)"
+//          [processedStr appendFormat:@"<i>%@</i>", [segment string]];
+//        }
+//      } else {
+//        [processedStr appendString:[segment string]];
+//      }
+//    }
+//    // processedStr now has all the versions of foobar wrapped in bold tags,
+//    // and all the versions of foobaz in italics tags.
+//    // ie: " fooobar foobaaz " ==> " <b>fooobar</b> <i>foobaaz</i> "
+//
+@interface GTMRegexStringSegment : NSObject {
+ @private
+  NSData *utf8StrBuf_;
+  regmatch_t *regMatches_;  // STRONG: ie-we call free
+  int numRegMatches_;
+  BOOL isMatch_;
+}
+
+/// Returns YES if this segment is from a match of the regex, NO if it was a segment between matches.
+//
+// Use -isMatch to see if the segment is from a match of the pattern or if the
+// segment is some text between matches. (NOTE: isMatch is always YES for
+// matchSegmentEnumeratorForString)
+//
+- (BOOL)isMatch;
+
+/// Returns a new, autoreleased string w/ the full text segment from the original string.
+- (NSString *)string;
+
+/// Returns a new, autoreleased string w/ the |index| sub pattern from this segment of the original string.
+//
+// This api follows the conventions of most regex engines, and index 0 (zero) is
+// the full match, then the subpatterns are index 1, 2, ... going left to right.
+// If the pattern has optional subpatterns, then anything that didn't match
+// will return nil.
+// ie: When using the pattern "(fo(o+))((bar)|(baz))" the following indexes
+// fetch these values for a segment where -string is @"foooooobaz":
+//   0: "foooooobaz"
+//   1: "foooooo"
+//   2: "ooooo"
+//   3: "baz"
+//   4: nil
+//   5: "baz"
+//
+- (NSString *)subPatternString:(int)index;
+
+@end
+
+/// Some helpers to streamline usage of GTMRegex
+//
+// Example usage:
+//
+//    if ([inputStr gtm_matchesPattern:@"foo.*bar"]) {
+//      // act on match
+//      ....
+//    }
+//
+// -------------
+//
+//    NSString *subStr = [inputStr gtm_firstSubStringMatchedByPattern:@"^foo:.*$"];
+//    if (subStr != nil) {
+//      // act on subStr
+//      ....
+//    }
+//
+// -------------
+//
+//    NSArray *headingList =
+//      [inputStr gtm_allSubstringsMatchedByPattern:@"^Heading:.*$"];
+//    // act on the list of headings
+//    ....
+//
+// -------------
+//
+//    NSString *highlightedString =
+//      [inputString gtm_stringByReplacingMatchesOfPattern:@"(foo+)(bar)"
+//                                         withReplacement:@"<i>\\1</i><b>\\2</b>"];
+//    ....
+//
+@interface NSString (GTMRegexAdditions)
+
+/// Returns YES if the full string matches regex |pattern| using the default match options
+- (BOOL)gtm_matchesPattern:(NSString *)pattern;
+
+/// Returns a new, autoreleased array of strings that contain the subpattern matches of |pattern| using the default match options
+//
+// See [GTMRegex subPatternsOfString:] for information about the returned array.
+//
+- (NSArray *)gtm_subPatternsOfPattern:(NSString *)pattern;
+
+/// Returns a new, autoreleased string w/ the first substring that matched the regex |pattern| using the default match options
+- (NSString *)gtm_firstSubStringMatchedByPattern:(NSString *)pattern;
+
+/// Returns a new, autoreleased array of substrings in the string that match the regex |pattern| using the default match options
+//
+// Note: if the string has no matches, you get an empty array.
+- (NSArray *)gtm_allSubstringsMatchedByPattern:(NSString *)pattern;
+
+/// Returns a new, autoreleased segment enumerator that will break the string using pattern w/ the default match options
+//
+// The enumerator returns GTMRegexStringSegment objects, see that class for more
+// details and examples.
+//
+- (NSEnumerator *)gtm_segmentEnumeratorForPattern:(NSString *)pattern;
+
+/// Returns a new, autoreleased segment enumerator that will only return matching segments from the string using pattern w/ the default match options
+//
+// The enumerator returns GTMRegexStringSegment objects, see that class for more
+// details and examples.
+//
+- (NSEnumerator *)gtm_matchSegmentEnumeratorForPattern:(NSString *)pattern;
+
+/// Returns a new, autoreleased string with all matches for pattern |pattern| replaced w/ |replacementPattern|. Uses the default match options.
+//
+// |replacementPattern| has support for using any subExpression that matched,
+// see [GTMRegex stringByReplacingMatchesInString:withReplacement:] above
+// for details.
+//
+- (NSString *)gtm_stringByReplacingMatchesOfPattern:(NSString *)pattern
+                                    withReplacement:(NSString *)replacementPattern;
+
+@end
--
cgit v1.2.3