//
//  GTMRegex.h
//
//  Copyright 2007-2008 Google Inc.
//
//  Licensed under the Apache License, Version 2.0 (the "License"); you may not
//  use this file except in compliance with the License.  You may obtain a copy
//  of the License at
//
//  http://www.apache.org/licenses/LICENSE-2.0
//
//  Unless required by applicable law or agreed to in writing, software
//  distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
//  WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
//  License for the specific language governing permissions and limitations under
//  the License.
//

// Foundation supplies NSObject/NSString/NSUInteger; <regex.h> supplies the
// regex_t and regmatch_t types used by the ivars declared in this header.
#import <Foundation/Foundation.h>
#import <regex.h>
#import "GTMDefines.h"

/// Options for controlling the behavior of the matches
enum {

  kGTMRegexOptionIgnoreCase            = 0x01,
    // Ignore case in matching, ie: 'a' matches 'a' or 'A'

  kGTMRegexOptionSupressNewlineSupport = 0x02,
    // By default (without this option), regular expressions are implicitly
    // processed on a line by line basis, where "lines" are delimited by newline
    // characters. In this mode '.' (dot) does NOT match newline characters, and
    // '^' and '$' match at the beginning and end of the string as well as
    // around newline characters. This behavior matches the default behavior for
    // regular expressions in other languages including Perl and Python. For
    // example,
    //     foo.*bar
    // would match
    //     fooAAAbar
    // but would NOT match
    //     fooAAA\nbar
    // With the kGTMRegexOptionSupressNewlineSupport option, newlines are treated
    // just like any other character which means that '.' will match them. In
    // this mode, ^ and $ only match the beginning and end of the input string
    // and do NOT match around the newline characters. For example,
    //     foo.*bar
    // would match
    //     fooAAAbar
    // and would also match
    //     fooAAA\nbar

};
// Bitmask built by OR-ing the kGTMRegexOption* values above.
typedef NSUInteger GTMRegexOptions;

/// Global constants needed for errors from consuming patterns

// Ignore the "Macro name is a reserved identifier" warning in this section
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wreserved-id-macro"

// When GTMREGEX_DEFINE_GLOBALS is non-zero the globals below are *defined*
// here (no storage qualifier, and _INITIALIZE_AS(x) supplies the "=x"
// initializer); otherwise they are only *declared* via GTM_EXTERN with no
// initializer. NOTE(review): presumably only the implementation file sets
// GTMREGEX_DEFINE_GLOBALS before importing this header -- confirm.
#undef _EXTERN
#undef _INITIALIZE_AS
#if GTMREGEX_DEFINE_GLOBALS
#define _EXTERN
#define _INITIALIZE_AS(x) =x
#else
#define _EXTERN GTM_EXTERN
#define _INITIALIZE_AS(x)
#endif

#pragma clang diagnostic pop

_EXTERN NSString* kGTMRegexErrorDomain _INITIALIZE_AS(@"com.google.mactoolbox.RegexDomain");

enum {
  kGTMRegexPatternParseFailedError = -100
};

// Keys for the userInfo from a kGTMRegexErrorDomain/kGTMRegexPatternParseFailedError error
_EXTERN NSString* kGTMRegexPatternErrorPattern _INITIALIZE_AS(@"pattern");
_EXTERN NSString* kGTMRegexPatternErrorErrorString _INITIALIZE_AS(@"patternError");

/// Class for doing Extended Regex operations w/ libregex (see re_format(7)).
//
// NOTE: the docs for regcomp/regexec make *no* claims about i18n.  All work
// within this class is done w/ UTF-8 so Unicode should move through it safely,
// however, the character classes described in re_format(7) might not really
// be unicode "savvy", so use them and this class w/ that in mind.
//
// Example usage:
//
//   NSArray *inputArrayOfStrings = ...
//   NSMutableArray *matches = [NSMutableArray array];
//
//   GTMRegex *regex = [GTMRegex regexWithPattern:@"foo.*bar"];
//   for (NSString *curStr in inputArrayOfStrings) {
//     if ([regex matchesString:curStr])
//       [matches addObject:curStr];
//   }
//   ....
//
// -------------
//
//   If you need to include something dynamic in a pattern:
//
//   NSString *pattern =
//     [NSString stringWithFormat:@"^foo:%@bar",
//       [GTMRegex escapedPatternForString:inputStr]];
//   GTMRegex *regex = [GTMRegex regexWithPattern:pattern];
//   ....
//
// -------------
//
//   GTMRegex *regex = [GTMRegex regexWithPattern:@"(foo+)(bar)"];
//   NSString *highlighted =
//     [regex stringByReplacingMatchesInString:inputString
//                             withReplacement:@"<i>\\1</i><b>\\2</b>"];
//   ....
//
// Use NSRegularExpression instead
NS_DEPRECATED(10_0, 10_7, 1_0, 4_0)
@interface GTMRegex : NSObject {
 @private
  NSString *pattern_;
  GTMRegexOptions options_;
  regex_t regexData_;
}

/// Create a new, autoreleased object w/ the given regex pattern with the default options
+ (id)regexWithPattern:(NSString *)pattern;

/// Create a new, autoreleased object w/ the given regex pattern and specify the matching options
+ (id)regexWithPattern:(NSString *)pattern options:(GTMRegexOptions)options;

/// Create a new, autoreleased object w/ the given regex pattern, specify the matching options and receive any error consuming the pattern.
+ (id)regexWithPattern:(NSString *)pattern
               options:(GTMRegexOptions)options
             withError:(NSError **)outErrorOrNULL;

/// Returns a new, autoreleased copy of |str| w/ any pattern chars in it escaped so they have no meaning when used w/in a pattern.
+ (NSString *)escapedPatternForString:(NSString *)str;

/// Initialize a new object w/ the given regex pattern with the default options
- (id)initWithPattern:(NSString *)pattern;

/// Initialize a new object w/ the given regex pattern and specify the matching options
- (id)initWithPattern:(NSString *)pattern options:(GTMRegexOptions)options;

/// Initialize a new object w/ the given regex pattern, specify the matching options, and receive any error consuming the pattern.
- (id)initWithPattern:(NSString *)pattern
              options:(GTMRegexOptions)options
            withError:(NSError **)outErrorOrNULL;

/// Returns the number of sub patterns in the pattern
//
// Sub Patterns are basically the number of parenthesis blocks w/in the pattern.
// ie: The pattern "foo((bar)|(baz))" has 3 sub patterns.
//
- (NSUInteger)subPatternCount;

/// Returns YES if the whole string |str| matches the pattern.
- (BOOL)matchesString:(NSString *)str;

/// Returns a new, autoreleased array of strings that contain the subpattern matches for the string.
//
// If the whole string does not match the pattern, nil is returned.
//
// The api follows the conventions of most regex engines, and index 0 (zero) is
// the full match, then the subpatterns are index 1, 2, ... going left to right.
// If the pattern has optional subpatterns, then anything that didn't match
// will have NSNull at that index.
// ie: The pattern "(fo(o+))((bar)|(baz))" has five subpatterns, and when
//     applied to the string "foooooobaz" you'd get an array of:
//       0: "foooooobaz"
//       1: "foooooo"
//       2: "ooooo"
//       3: "baz"
//       4: NSNull
//       5: "baz"
//
- (NSArray *)subPatternsOfString:(NSString *)str;

/// Returns the first match for this pattern in |str|.
- (NSString *)firstSubStringMatchedInString:(NSString *)str;

/// Returns YES if this pattern matches some substring of |str|.
- (BOOL)matchesSubStringInString:(NSString *)str;

/// Returns a new, autoreleased enumerator that will walk segments (GTMRegexStringSegment) of |str| based on the pattern.
//
// This will split the string into "segments" using the given pattern.  You get
// both the matches and the parts that are in between matches.  ie-the entire
// string will eventually be returned.
//
// See GTMRegexStringSegment for more information and examples.
//
- (NSEnumerator *)segmentEnumeratorForString:(NSString *)str;

/// Returns a new, autoreleased enumerator that will walk only the matching segments (GTMRegexStringSegment) of |str| based on the pattern.
//
// This extracts the "segments" of the string that used the pattern.  So it can
// be used to collect all of the matching substrings from within a string.
//
// See GTMRegexStringSegment for more information and examples.
//
- (NSEnumerator *)matchSegmentEnumeratorForString:(NSString *)str;

/// Returns a new, autoreleased string with all matches of the pattern in |str| replaced with |replacementPattern|.
//
// Replacement uses the SED substitution like syntax w/in |replacementPattern|
// to allow the use of matches in the replacement.  The replacement pattern can
// make use of any number of match references by using a backslash followed by
// the match subexpression number (ie-"\2", "\0", ...), see subPatternsOfString:
// for details on the subexpression indexing.
//
// REMINDER: you need to double-slash since the slash has meaning to the
// compiler/preprocessor.  ie: "\\0"
//
- (NSString *)stringByReplacingMatchesInString:(NSString *)str
                               withReplacement:(NSString *)replacementPattern;

@end

/// Class returned by the nextObject for the enumerators from GTMRegex
//
// The two enumerators from GTMRegex return objects of this type.  This object
// represents a "piece" of the string the enumerator is walking.  The apis
// on this object allow you to figure out why each segment was returned and to
// act on it.
//
// The easiest way to understand how the enumerators and this class work
// is through an example:
//    Pattern: "foo+"
//     String: "fo bar foobar foofooo baz"
//  If you walk this w/ -segmentEnumeratorForString you'll get:
//    # nextObjects Calls  -isMatch       -string
//             1             NO           "fo bar "
//             2             YES          "foo"
//             3             NO           "bar "
//             4             YES          "foo"
//             5             YES          "fooo"
//             6             NO           " baz"
//  And if you walk this w/ -matchSegmentEnumeratorForString you'll get:
//    # nextObjects Calls  -isMatch       -string
//             1             YES          "foo"
//             2             YES          "foo"
//             3             YES          "fooo"
// (see the comments on subPatternString for how it works)
//
// Example usage:
//
//   NSMutableString *processedStr = [NSMutableString string];
//   NSEnumerator *enumerator =
//     [inputStr gtm_segmentEnumeratorForPattern:@"foo+((ba+r)|(ba+z))"];
//   GTMRegexStringSegment *segment = nil;
//   while ((segment = [enumerator nextObject]) != nil) {
//     if ([segment isMatch]) {
//       if ([segment subPatternString:2] != nil) {
//         // matched: "(ba+r)"
//         [processedStr appendFormat:@"<b>%@</b>", [segment string]];
//       } else {
//         // matched: "(ba+z)"
//         [processedStr
//             appendFormat:@"<i>%@</i>", [segment string]];
//       }
//     } else {
//       [processedStr appendString:[segment string]];
//     }
//   }
//   // processedStr now has all the versions of foobar wrapped in bold tags,
//   // and all the versions of foobaz in italics tags.
//   // ie: " fooobar foobaaz " ==> " <b>fooobar</b> <i>foobaaz</i> "
//
@interface GTMRegexStringSegment : NSObject {
 @private
  NSData *utf8StrBuf_;
  regmatch_t *regMatches_;  // STRONG: ie-we call free
  NSUInteger numRegMatches_;
  BOOL isMatch_;
}

/// Returns YES if this segment came from a match of the regex, NO if it was a segment between matches.
//
// Use -isMatch to see if the segment came from a match of the pattern or if the
// segment is some text between matches.  (NOTE: isMatch is always YES for
// matchSegmentEnumeratorForString)
//
- (BOOL)isMatch;

/// Returns a new, autoreleased string w/ the full text segment from the original string.
- (NSString *)string;

/// Returns a new, autoreleased string w/ the |index| sub pattern from this segment of the original string.
//
// This api follows the conventions of most regex engines, and index 0 (zero) is
// the full match, then the subpatterns are index 1, 2, ... going left to right.
// If the pattern has optional subpatterns, then anything that didn't match
// will return nil.
// ie: When using the pattern "(fo(o+))((bar)|(baz))" the following indexes
//     fetch these values for a segment where -string is @"foooooobaz":
//       0: "foooooobaz"
//       1: "foooooo"
//       2: "ooooo"
//       3: "baz"
//       4: nil
//       5: "baz"
//
- (NSString *)subPatternString:(NSUInteger)index;

@end

/// Some helpers to streamline usage of GTMRegex
//
// Example usage:
//
//   if ([inputStr gtm_matchesPattern:@"foo.*bar"]) {
//     // act on match
//     ....
//   }
//
// -------------
//
//   NSString *subStr = [inputStr gtm_firstSubStringMatchedByPattern:@"^foo:.*$"];
//   if (subStr != nil) {
//     // act on subStr
//     ....
//   }
//
// -------------
//
//   NSArray *headingList =
//     [inputStr gtm_allSubstringsMatchedByPattern:@"^Heading:.*$"];
//   // act on the list of headings
//   ....
//
// -------------
//
//   NSString *highlightedString =
//     [inputString gtm_stringByReplacingMatchesOfPattern:@"(foo+)(bar)"
//                                        withReplacement:@"<i>\\1</i><b>\\2</b>"];
//   ....
//
@interface NSString (GTMRegexAdditions)

/// Returns YES if the full string matches regex |pattern| using the default match options
- (BOOL)gtm_matchesPattern:(NSString *)pattern;

/// Returns a new, autoreleased array of strings that contain the subpattern matches of |pattern| using the default match options
//
// See [GTMRegex subPatternsOfString:] for information about the returned array.
//
- (NSArray *)gtm_subPatternsOfPattern:(NSString *)pattern;

/// Returns a new, autoreleased string w/ the first substring that matched the regex |pattern| using the default match options
- (NSString *)gtm_firstSubStringMatchedByPattern:(NSString *)pattern;

/// Returns YES if a substring of the string matches regex |pattern| using the default match options
- (BOOL)gtm_subStringMatchesPattern:(NSString *)pattern;

/// Returns a new, autoreleased array of substrings in the string that match the regex |pattern| using the default match options
//
// Note: if the string has no matches, you get an empty array.
- (NSArray *)gtm_allSubstringsMatchedByPattern:(NSString *)pattern;

/// Returns a new, autoreleased segment enumerator that will break the string using pattern w/ the default match options
//
// The enumerator returns GTMRegexStringSegment objects, see that class for more
// details and examples.
//
- (NSEnumerator *)gtm_segmentEnumeratorForPattern:(NSString *)pattern;

/// Returns a new, autoreleased segment enumerator that will only return matching segments from the string using pattern w/ the default match options
//
// The enumerator returns GTMRegexStringSegment objects, see that class for more
// details and examples.
//
- (NSEnumerator *)gtm_matchSegmentEnumeratorForPattern:(NSString *)pattern;

/// Returns a new, autoreleased string with all matches of |pattern| replaced w/ |replacementPattern|.  Uses the default match options.
//
// |replacementPattern| has support for using any subExpression that matched,
// see [GTMRegex stringByReplacingMatchesInString:withReplacement:] above
// for details.
//
- (NSString *)gtm_stringByReplacingMatchesOfPattern:(NSString *)pattern
                                    withReplacement:(NSString *)replacementPattern;

@end